In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from lifelines import KaplanMeierFitter
import seaborn as sns
from datetime import timedelta
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_distances

In [None]:
# @title Data_preparation
def analyze_disease_time(cancer_code, df, significant_relationships):
    """
    Analyze the time between the first occurrence of a cancer diagnosis and
    the first occurrence of each statistically significant related disease.

    For every significant (cancer, disease) pair, each patient's first
    admission time with the cancer code is compared with their first admission
    time with the related code. Patients are split by which diagnosis came
    first and one Kaplan-Meier curve per direction is drawn on a shared figure
    (every patient is treated as an observed event).

    Parameters:
        cancer_code (str): Cancer ICD code (e.g., 'C787')
        df (DataFrame): Patient data; must contain the columns 'subject_id',
            'icd_code' and 'admittime'. The frame is not modified.
        significant_relationships (DataFrame): Disease relationships; must
            contain the columns 'cancer', 'disease', 'cooccurrence',
            'odds_ratio' and 'p_value_adj'.

    Returns:
        list[dict] | None: One entry per plotted pair with the keys 'cancer',
        'non_cancer', 'forward_count', 'backward_count', 'forward_kmf',
        'backward_kmf' and 'data'. Pairs for which no patient has both
        diagnoses are skipped. Returns None when no relationship with
        p_value_adj < 0.05 exists for `cancer_code`.
    """
    # Filter only statistically significant relationships (p < 0.05)
    sig_adj = significant_relationships[
        (significant_relationships['p_value_adj'] < 0.05) &
        (significant_relationships['cancer'] == cancer_code)
    ].copy()

    if sig_adj.empty:
        print(f"No significant relationships found for cancer: {cancer_code}")
        return

    print(f"Found {len(sig_adj)} significant relationships for {cancer_code}")

    # Work on a private copy of the three columns we need so that parsing the
    # timestamps does not overwrite the caller's 'admittime' column.
    events = df[['subject_id', 'icd_code', 'admittime']].copy()
    events['admittime'] = pd.to_datetime(events['admittime'])

    # Patients with the specified cancer and their first cancer admission.
    # This is identical for every related disease, so compute it once.
    cancer_rows = events[events['icd_code'] == cancer_code]
    cancer_patients = cancer_rows[['subject_id']].drop_duplicates()
    first_cancer = (
        cancer_rows.groupby('subject_id')['admittime'].min()
        .reset_index()
        .rename(columns={'admittime': 'first_cancer_time'})
    )
    cancer_onsets = cancer_patients.merge(first_cancer, on='subject_id', how='left')

    results = []

    # Process each significant relationship
    for _, row in sig_adj.iterrows():
        non_cancer = row['disease']
        cooccurrence = row['cooccurrence']
        odds_ratio = row['odds_ratio']
        p_value_adj = row['p_value_adj']

        print(f"\nAnalyzing pair: {cancer_code} → {non_cancer}")
        print(f"Co-occurrence: {cooccurrence}, Odds ratio: {odds_ratio:.2f}, Adj. p-value: {p_value_adj:.4e}")

        # First occurrence of the related disease per patient
        non_cancer_rows = events[events['icd_code'] == non_cancer][['subject_id', 'admittime']]
        first_non_cancer = (
            non_cancer_rows.groupby('subject_id')['admittime'].min()
            .reset_index()
            .rename(columns={'admittime': 'first_non_cancer_time'})
        )

        # Keep only cancer patients who also have the related disease
        temp_df = cancer_onsets.merge(first_non_cancer, on='subject_id', how='left')
        temp_df = temp_df.dropna(subset=['first_non_cancer_time'])

        if temp_df.empty:
            print(f"  No patients with both {cancer_code} and {non_cancer}")
            continue

        # Time between events in whole days (positive: cancer came first)
        temp_df['time_diff'] = (temp_df['first_non_cancer_time'] - temp_df['first_cancer_time']).dt.days
        # Note: a difference of exactly 0 days is counted in the backward group.
        temp_df['event'] = (temp_df['time_diff'] > 0).astype(int)

        # Separate data by direction. Explicit copies avoid
        # SettingWithCopyWarning when the 'abs_time' column is added below.
        forward_df = temp_df[temp_df['event'] == 1].copy()   # cancer → non_cancer
        backward_df = temp_df[temp_df['event'] == 0].copy()  # non_cancer → cancer

        # Non-negative durations for the survival fit
        forward_df['abs_time'] = forward_df['time_diff']
        backward_df['abs_time'] = -backward_df['time_diff']

        print(f"  Patients with {cancer_code} → {non_cancer}: {len(forward_df)}")
        print(f"  Patients with {non_cancer} → {cancer_code}: {len(backward_df)}")

        # temp_df is non-empty here and every row has event 0 or 1, so at
        # least one direction always has data to plot.
        fig = plt.figure(figsize=(12, 8))
        sns.set_style("whitegrid")
        ax = fig.gca()

        kmf_forward = None
        kmf_backward = None

        # Cancer → Non-cancer direction
        if not forward_df.empty:
            kmf_forward = KaplanMeierFitter()
            kmf_forward.fit(
                forward_df['abs_time'],
                event_observed=np.ones(len(forward_df)),
                label=f"{cancer_code} → {non_cancer}"
            )
            kmf_forward.plot_survival_function(ax=ax, ci_show=False, linewidth=2.5)

        # Non-cancer → Cancer direction
        if not backward_df.empty:
            kmf_backward = KaplanMeierFitter()
            kmf_backward.fit(
                backward_df['abs_time'],
                event_observed=np.ones(len(backward_df)),
                label=f"{non_cancer} → {cancer_code}"
            )
            kmf_backward.plot_survival_function(ax=ax, ci_show=False, linewidth=2.5)

        # Add vertical line at time zero (x in data units, y in axes fraction)
        ax.axvline(x=0, color='gray', linestyle='--', alpha=0.7)
        ax.text(0.01, 0.5, f'First {cancer_code} diagnosis',
                transform=ax.get_xaxis_transform(),
                verticalalignment='center')

        # Format plot
        ax.set_title(f'Time between diagnoses: {cancer_code} and {non_cancer}\n'
                     f'Odds ratio: {odds_ratio:.2f}, Adj. p-value: {p_value_adj:.2e}', fontsize=14)
        ax.set_xlabel('Days relative to first cancer diagnosis', fontsize=12)
        ax.set_ylabel('Probability without second diagnosis', fontsize=12)
        ax.legend(title="Direction", fontsize=11)

        # Add statistics box
        stats_text = (f"Total patients: {len(temp_df)}\n"
                      f"{cancer_code} first: {len(forward_df)}\n"
                      f"{non_cancer} first: {len(backward_df)}")
        ax.annotate(stats_text, xy=(0.75, 0.15), xycoords='axes fraction',
                    bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))

        fig.tight_layout()
        plt.show()
        # One figure is created per relationship; release it once shown so
        # figures do not accumulate across the loop.
        plt.close(fig)

        # Save results
        results.append({
            'cancer': cancer_code,
            'non_cancer': non_cancer,
            'forward_count': len(forward_df),
            'backward_count': len(backward_df),
            'forward_kmf': kmf_forward,
            'backward_kmf': kmf_backward,
            'data': temp_df
        })

    return results

# Example usage
if __name__ == "__main__":
    # Load the relationship table and keep only adjusted p-values below 0.05
    significant_relationships = pd.read_csv('significant_cancer_relationships.csv')
    significant_relationships = significant_relationships[significant_relationships['p_value_adj'] < 0.05]
    # Load the diagnoses table here so this cell runs on a fresh kernel:
    # `df` is otherwise only assigned in a later cell, which made this call
    # fail with NameError under Restart & Run All.
    # NOTE(review): assumes this is the same per-admission diagnoses table the
    # later transformation cell reads (it must provide 'subject_id',
    # 'icd_code' and 'admittime') — confirm the path.
    df = pd.read_csv('./cooc_matrix_for_icd10_time_ordered.csv')
    cancer_code = 'C7B'
    results = analyze_disease_time(cancer_code, df, significant_relationships)

In [None]:
def assign_priorities(group):
    """
    Build the diagnosis priorities for one group of rows.

    Rows are numbered 1..N in their current order. If any row's diagnosis
    starts with a 3-character code listed in the module-level `cancer_icd3`,
    the earliest such row trades its number with the first row, so that the
    first cancer diagnosis ends up with priority 1.

    Returns a Series of priorities indexed like `group`.
    """
    size = len(group)
    # Default ranking: position in the group, starting at 1
    ranks = [pos + 1 for pos in range(size)]

    # Flag rows whose 3-character ICD prefix is a known cancer code
    cancer_flags = group['diagnosis'].str[:3].isin(cancer_icd3).tolist()

    # Position of the earliest cancer row, or None when the group has none
    first_cancer_pos = next(
        (pos for pos, hit in enumerate(cancer_flags) if hit),
        None,
    )

    if first_cancer_pos is not None:
        # Exchange the ranks of the first row and the first cancer row
        # (a no-op when the cancer row is already first)
        ranks[0], ranks[first_cancer_pos] = ranks[first_cancer_pos], ranks[0]

    # Keep the original row labels so the result aligns on assignment
    return pd.Series(ranks, index=group.index)



In [None]:
# 1. Load the list of 3-char cancer ICD codes and the diagnoses table
cancer_icd3 = pd.read_excel('./hosp/Results/cancer_icd3_codes.xlsx')['icd3'].astype(str).tolist()
# `read_csv` has no `index` keyword (that one belongs to `to_csv`); passing
# `index=True` raised a TypeError before any data was read.
df = pd.read_csv('./cooc_matrix_for_icd10_time_ordered.csv')

# 2.-3. Build a new DataFrame without touching the original df:
#       sort, keep the needed columns, and rename them to the target names.
new_df = (
    df
    # Sort only this view so the "seen earlier" POA check below follows
    # patient, admission time and sequence number
    .sort_values(['subject_id', 'admittime', 'seq_num'])
    .loc[:, ['subject_id', 'admittime', 'dischtime', 'icd_code']]
    .rename(columns={
        'subject_id': 'patient_sk',
        'admittime': 'admit_dt_tm',
        'dischtime': 'discharge_dt_tm',
        'icd_code':  'diagnosis',
    })
    .copy()
)

# 4. Compute POA: 'Y' if the same (patient, diagnosis) pair already appeared
#    in an earlier row of the sorted frame, else 'N'
new_df['poa'] = (
    new_df
    .duplicated(subset=['patient_sk', 'diagnosis'], keep='first')
    .map({True: 'Y', False: 'N'})
)

# 5. Assign priorities within each (patient, admission) group.
#    Only 'diagnosis' is handed to assign_priorities: it is the only column
#    the function reads, and selecting it explicitly keeps the grouping
#    columns out of the applied frame (avoids the pandas deprecation warning
#    about apply operating on grouping columns).
new_df['diagnosis_priority'] = (
    new_df
    .groupby(['patient_sk', 'admit_dt_tm'], group_keys=False)[['diagnosis']]
    .apply(assign_priorities)
    .astype(int)
)

# 6. third_party_ind = 0 for all
new_df['third_party_ind'] = 0

# 7. Select and reorder final columns
new_df = new_df[[
    'patient_sk',
    'admit_dt_tm',
    'discharge_dt_tm',
    'diagnosis',
    'poa',
    'diagnosis_priority',
    'third_party_ind'
]]

In [None]:
# 8. Write the transformed diagnoses table to disk, leaving out the row index
new_df.to_csv(path_or_buf='hosp/Results/transformed_diagnoses.csv', index=False)

In [None]:
# @title Clinical_Bert
# Embed the long title of every diagnosis code with Bio_ClinicalBERT (CLS
# token of the last hidden layer), then save the embeddings and their pairwise
# cosine distances as CSV files.
icd_df = pd.read_csv('d_icd_diagnoses.csv', dtype={'icd_code': str})
code_to_desc = dict(zip(icd_df['icd_code'], icd_df['long_title']))
# NOTE(review): the notebook writes the transformed table to
# 'hosp/Results/transformed_diagnoses.csv', but this reads
# 'transformed_diagnoses.csv' from the working directory — confirm the path.
df = pd.read_csv('transformed_diagnoses.csv', dtype={'diagnosis': str})
codes = df['diagnosis'].unique().tolist()
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model     = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model.eval()  # inference mode: disables dropout


embeddings = []
valid_codes = []
for code in codes:
    desc = code_to_desc.get(code)
    # Skip codes without a usable description. A missing long_title is read by
    # pandas as float NaN, which is truthy, so a plain `if not desc` lets it
    # through and the tokenizer then fails on a non-string input.
    if not isinstance(desc, str) or not desc:
        continue
    inputs = tokenizer(desc, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        # First token (CLS) of the single sequence in the batch
        cls_emb = outputs.last_hidden_state[:, 0, :].squeeze(0).numpy()
    embeddings.append(cls_emb)
    valid_codes.append(code)

# np.vstack raises an opaque error on an empty list; fail with a clear message.
if not embeddings:
    raise ValueError(
        "No diagnosis code in 'transformed_diagnoses.csv' has a description "
        "in 'd_icd_diagnoses.csv'; nothing to embed."
    )
embeddings = np.vstack(embeddings)  # [n_codes, hidden_dim]


embedding_df = pd.DataFrame(embeddings, index=valid_codes)
embedding_df.index.name = 'icd_code'
embedding_df.to_csv('icd_medbert_embeddings.csv')

dist_matrix = cosine_distances(embeddings)  # [n_codes, n_codes]
dist_df = pd.DataFrame(dist_matrix, index=valid_codes, columns=valid_codes)
dist_df.to_csv('icd_medbert_cosine_distances.csv')

print("✅ Embeddings and distance matrix saved as CSV.")
