In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, log_loss, brier_score_loss

In [None]:
# NOTE(review): within the visible notebook, `df` is first assigned in the later
# results-loading cell (via `pd.read_csv`), so this cell raises NameError under
# Restart Kernel -> Run All. Consider moving it below that cell or removing it.
df

In [None]:
# Experiment selector. Alternative setting kept for reference:
#   task = 'mortality'; days = '90days'
task = 'readmission'
days = '15days'

# Rows to report, in display order. The legacy (non-E2E) fusion names
# 'Fusion - Sum', 'Fusion - Weighted Sum', 'Fusion - Attn Masked' are currently disabled.
modality_order = [
    # unimodal
    'Tabular',
    'Lab',
    'Note',
    # end-to-end fusion variants
    'E2E Fusion - Sum',
    'E2E Fusion - Weighted Sum',
    'E2E Fusion - Attn Masked',
    # aggregate over all modalities
    'Simple Average',
]

# Metric columns kept from our results file (plus the 'Modality' key).
ours_columns = ['Modality', 'ROC-AUC', 'PR-AUC', 'NLL Loss', 'Brier Loss']

# Results file is named after the capitalised task and the prediction horizon.
results_path = Path('Results') / f"Whole_{task.capitalize()}_{days}.csv"
df = pd.read_csv(results_path)

# Select the requested modalities in `modality_order`, then keep only the metric columns.
ours_by_modality = df.set_index('Modality')
ours_scores = ours_by_modality.loc[modality_order].reset_index()[ours_columns]
ours_scores

In [None]:
def load_keys(adm_ids_path):
    """Load the test-split admission ids of each modality from a pickle file.

    Parameters
    ----------
    adm_ids_path : str or path-like
        Pickle file holding a mapping with the keys ``'test_code_ids'``,
        ``'test_lab_ids'`` and ``'test_discharge_ids'``; each value must be
        convertible to an integer numpy array.

    Returns
    -------
    tuple of (set, set, set)
        ``(code_ids, lab_ids, note_ids)`` as sets of integer ids.

    Raises
    ------
    KeyError
        If one of the three expected keys is missing from the pickled mapping.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    # The context manager closes the handle (the previous bare `open` leaked it).
    with open(adm_ids_path, 'rb') as handle:
        adm_ids = pickle.load(handle)

    code_ids, lab_ids, note_ids = (
        set(np.array(adm_ids[key]).astype(int))
        for key in ('test_code_ids', 'test_lab_ids', 'test_discharge_ids')
    )
    return code_ids, lab_ids, note_ids

def get_score(df, ids):
    """Evaluate the predictions in `df` restricted to the given ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``pid``, ``label`` and ``score``.
    ids : collection
        Ids to keep; rows are selected where ``pid`` is in `ids`.

    Returns
    -------
    tuple
        ``(roc_auc, pr_auc, n_rows, nll_loss, brier_loss)`` computed on the
        selected rows, where ``n_rows`` is the number of selected rows.
    """
    # `@ids` resolves to the local `ids` argument inside the query expression.
    subset = df.query('pid in @ids')
    y_true, y_prob = subset['label'].values, subset['score'].values

    auroc = roc_auc_score(y_true, y_prob)

    # PR-AUC: area under the precision-recall curve, with recall on the x axis.
    prec, rec, _ = precision_recall_curve(y_true, y_prob)
    auprc = auc(rec, prec)

    nll = log_loss(y_true, y_prob)
    brier = brier_score_loss(y_true, y_prob)

    return auroc, auprc, len(y_true), nll, brier

# Get MUSE Score

In [None]:
# Location of the MUSE baseline outputs and of the cached summary table for the
# current `task` / `days` setting.
muse_dir = Path('previous_study/MUSE/Results/MUSE/output')
muse_score_path = muse_dir / f'MUSE_{task}_{days}_scores.csv'
# NOTE(review): absolute, machine-specific path -- consider moving it to a config cell.
data_dir = Path('/home/data/2025_MIMICIV_processed/mimic4/')

if muse_score_path.exists():
    # Reuse the cached summary. NOTE(review): the cache is not invalidated when
    # `modality_order` changes; delete the CSV to force a recomputation.
    muse_scores = pd.read_csv(muse_score_path)

else:
    suffix = '-finetune'
    # NOTE(review): the pattern filters on `task` only, not on `days` -- confirm the
    # run directories cannot mix prediction horizons.
    output_dirs = sorted(muse_dir.glob(f'*{task}*{suffix}'))
    if not output_dirs:
        # Fail early with a clear message instead of an IndexError further down.
        raise FileNotFoundError(
            f"No MUSE output directory matching '*{task}*{suffix}' under {muse_dir}"
        )

    total_output = []
    for output_dir in output_dirs:
        prediction_path = output_dir / 'predictions.txt'
        prediction = pd.read_csv(prediction_path, header=None)
        prediction.columns = ['pid', 'label', 'score']

        # The seed is the second-to-last '-'-separated token of the run directory name.
        # `.name` (not `.stem`) is used so a '.' inside the name cannot truncate it.
        seed = int(output_dir.name.split('-')[-2])

        adm_ids_path = data_dir / f"task:{task}_{days}/admission_ids_seed_{seed}.pkl"
        code_ids, lab_ids, note_ids = load_keys(adm_ids_path)

        # Admission-id subsets to evaluate, keyed by the reported modality name.
        # NOTE(review): 'Tab + Lab' and 'Tab + Note' are not intersected with
        # `code_ids`; presumably every admission has tabular codes -- confirm.
        subsets = {
            'Tabular': code_ids,
            'Lab': lab_ids,
            'Note': note_ids,
            # Present only in Tabular: the other two modalities are subtracted.
            'Tabular Only': code_ids - lab_ids - note_ids,
            # Lab available, Note missing.
            'Tab + Lab': lab_ids - note_ids,
            # Note available, Lab missing.
            'Tab + Note': note_ids - lab_ids,
            # All three modalities available.
            'Multimodal': code_ids & lab_ids & note_ids,
            # Any modality available.
            'Total': code_ids | lab_ids | note_ids,
        }
        for modality, subset_ids in subsets.items():
            keys = np.array(list(subset_ids))
            total_output.append([seed, modality, *get_score(prediction, keys)])

    total_df = pd.DataFrame(
        total_output,
        columns=['seed', 'Modality', 'AUROC', 'AUPRC', 'N', 'NLL', 'Brier'],
    )

    # Average every metric over seeds; AUROC/AUPRC are reported in percent.
    total_df_avg = total_df.groupby(['Modality']).mean().reset_index().drop(columns=['seed'])
    total_df_avg['N'] = total_df_avg['N'].astype(int)
    total_df_avg[['AUROC', 'AUPRC']] = total_df_avg[['AUROC', 'AUPRC']].mul(100).round(2)
    total_df_avg[['NLL', 'Brier']] = total_df_avg[['NLL', 'Brier']].round(4)

    # MUSE yields a single fused prediction, so its 'Total' row is replicated under
    # each aggregate/fusion name we report. The row is selected by name (not by
    # position), and both the legacy and the 'E2E' fusion names are provided so that
    # `.loc[modality_order]` below does not raise KeyError for either naming.
    # NOTE(review): assumes the E2E fusion rows should mirror 'Total' like the
    # legacy fusion rows did -- confirm.
    total_aliases = [
        'Simple Average',
        'Fusion - Sum', 'Fusion - Weighted Sum', 'Fusion - Attn Masked',
        'E2E Fusion - Sum', 'E2E Fusion - Weighted Sum', 'E2E Fusion - Attn Masked',
    ]
    total_row = total_df_avg[total_df_avg['Modality'] == 'Total']
    total_df_avg = pd.concat(
        [total_df_avg, *(total_row.assign(Modality=alias) for alias in total_aliases)],
        ignore_index=True,
    )

    muse_scores = total_df_avg.set_index('Modality').loc[modality_order].reset_index()
    muse_scores.to_csv(muse_score_path, index=False)
muse_scores

# Merge

In [None]:
# Metric to compare, expressed in the MUSE column naming
# ('AUROC', 'AUPRC', 'NLL' or 'Brier').
score = 'AUROC'

# `ours_scores` uses different headers ('ROC-AUC', 'PR-AUC', 'NLL Loss', 'Brier Loss'),
# so selecting `score` from it directly raises KeyError. Rename to the MUSE names first;
# `rename` ignores keys that are absent, so an already-aligned frame is left untouched.
ours_to_muse_columns = {
    'ROC-AUC': 'AUROC',
    'PR-AUC': 'AUPRC',
    'NLL Loss': 'NLL',
    'Brier Loss': 'Brier',
}
ours_aligned = ours_scores.rename(columns=ours_to_muse_columns)

# NOTE(review): freshly computed MUSE AUROC/AUPRC are in percent (x100, 2 decimals);
# confirm our results file uses the same scale before interpreting the delta.
merged = pd.merge(
    ours_aligned[['Modality', score]],
    muse_scores[['Modality', score]],
    on='Modality',
    suffixes=('_ours', '_muse'),
)
merged[f'Delta {score} (ours - muse)'] = merged[f'{score}_ours'] - merged[f'{score}_muse']

# Restore the reporting order of the modalities.
merged = merged.set_index('Modality').loc[modality_order].reset_index()
merged