In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import os
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import warnings

def preprocess_data(file, scaler=True):

    if type(file) == pd.DataFrame:
        df = file
    else:
        df = pd.read_csv(file)
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        df.replace('#NAME?', np.nan, inplace=True)
        df['swt_d_3_energy_entropy'] = df['swt_d_3_energy_entropy'].astype(float)
        df['swt_d_4_energy_entropy'] = df['swt_d_4_energy_entropy'].astype(float)

        features = df[['age', 'sex', 'heart_rate_min', 't_wave_multiscale_permutation_entropy_std', 'heart_rate_max', 't_wave_multiscale_permutation_entropy_median',
                    'rs_time_std', 'p_wave_corr_coeff_median', 'rri_median', 'heart_rate_mean', 'rri_cluster_ssd_3', 'rri_fisher_info', 'pnn60',
                    'swt_d_4_energy_entropy', 'rri_cluster_ssd_2', 'heart_rate_activity', 'diff_rri_min', 't_wave_permutation_entropy_std',
                    'p_wave_sample_entropy_std', 'swt_d_3_energy_entropy', 'p_wave_approximate_entropy_median', 'rpeak_approximate_entropy']]

        y = df[['270492004', '164889003', '164890007', '713426002', '445118002', '39732003', '164909002', '251146004', '284470004',
                        '47665007', '59118001', '427393009', '426177001', '426783006', '427084000', '164934002', '59931005']]

        X = features.fillna(features.median()).replace([np.inf, -np.inf], 0)

        if scaler:
            scaler = MinMaxScaler()
            X = scaler.fit_transform(X)

        X_df = pd.DataFrame(X, columns=features.columns)

        return X_df, y
    
# Feature path
feature_path = 'data/features/'

dfs = []
for source in os.listdir(feature_path):
    if source.endswith('.csv'):
        df = pd.read_csv(os.path.join(feature_path, source))
        df['source'] = source[:-10]
        dfs.append(df)

df_all = pd.concat(dfs, ignore_index=True)

## 5-Fold CV

In [2]:
kfold_path = 'data/split_csvs/5FCV'

kfold_results = {}
for i in range(1,6):
    macro_auc_scores = []
    micro_auc_scores = []
    
    # 5-Fold CV with predetermined folds
    for j in range(1,6):
        train_split = pd.read_csv(os.path.join(kfold_path, f'train_split_{i}_{j}.csv'))
        for idx, row in train_split.iterrows():
            train_split.loc[idx, 'id'] = row['path'].split('/')[-1].split('_')[0]
        
        val_split = pd.read_csv(os.path.join(kfold_path, f'val_split_{i}_{j}.csv'))
        for idx, row in val_split.iterrows():
            val_split.loc[idx, 'id'] = row['path'].split('/')[-1].split('_')[0]
        
        df_train = df_all[df_all['id'].isin(train_split['id'])]
        df_val = df_all[df_all['id'].isin(val_split['id'])]

        X_train, y_train = preprocess_data(file=df_train)
        X_val, y_val = preprocess_data(file=df_val)

        unavailable_diagnoses_train = y_train.sum()[y_train.sum() < 1].index
        unavailable_diagnoses_val = y_val.sum()[y_val.sum() < 1].index
        unavailable_diagnoses = unavailable_diagnoses_train.union(unavailable_diagnoses_val)

        if len(unavailable_diagnoses) > 0:
            y_train = y_train.drop(unavailable_diagnoses, axis=1)
            y_val = y_val.drop(unavailable_diagnoses, axis=1)

        model = LogisticRegression(max_iter=2000)
        model = OneVsRestClassifier(model)
        model.fit(X_train, y_train)
        
        y_pred = model.predict_proba(X_val)
        macro_auc_cv = round(roc_auc_score(y_val, y_pred, average='macro'), 4)
        macro_auc_scores.append(macro_auc_cv)
        micro_auc_cv = round(roc_auc_score(y_val, y_pred, average='micro'), 4)
        micro_auc_scores.append(micro_auc_cv)

        kfold_results[f'split_{i}_{j}'] = {
            'Macro AUC': macro_auc_cv,
            'Micro AUC': micro_auc_cv
        }

    print(np.mean(micro_auc_scores))
    print(np.mean(macro_auc_scores))

print('5-Fold CV:')
for source, score in kfold_results.items():
    print(source, score)

0.8587400000000001
0.7713400000000001
0.9539
0.83666
0.92546
0.7815599999999999
0.92792
0.8306999999999999
0.93392
0.8013199999999999
5-Fold CV:
split_1_1 {'Macro AUC': 0.7698, 'Micro AUC': 0.8585}
split_1_2 {'Macro AUC': 0.7776, 'Micro AUC': 0.8649}
split_1_3 {'Macro AUC': 0.768, 'Micro AUC': 0.8566}
split_1_4 {'Macro AUC': 0.7605, 'Micro AUC': 0.852}
split_1_5 {'Macro AUC': 0.7808, 'Micro AUC': 0.8617}
split_2_1 {'Macro AUC': 0.8302, 'Micro AUC': 0.9521}
split_2_2 {'Macro AUC': 0.836, 'Micro AUC': 0.9522}
split_2_3 {'Macro AUC': 0.8313, 'Micro AUC': 0.9439}
split_2_4 {'Macro AUC': 0.844, 'Micro AUC': 0.9631}
split_2_5 {'Macro AUC': 0.8418, 'Micro AUC': 0.9582}
split_3_1 {'Macro AUC': 0.7672, 'Micro AUC': 0.9201}
split_3_2 {'Macro AUC': 0.7965, 'Micro AUC': 0.925}
split_3_3 {'Macro AUC': 0.8011, 'Micro AUC': 0.93}
split_3_4 {'Macro AUC': 0.7562, 'Micro AUC': 0.9274}
split_3_5 {'Macro AUC': 0.7868, 'Micro AUC': 0.9248}
split_4_1 {'Macro AUC': 0.833, 'Micro AUC': 0.9324}
split_4_2 {'Mac

## 4-Fold CV

In [4]:
kfold_path = 'data/split_csvs/4FCV'

kfold_results = {}
lso_test_results = {}
kfold_results = {}
for i in range(1,6):
    macro_auc_scores = []
    micro_auc_scores = []
    
    # 4-Fold CV with predetermined folds
    for j in range(1,5):
        
        train_split = pd.read_csv(os.path.join(kfold_path, f'train_split_{i}_{j}.csv'))
        for idx, row in train_split.iterrows():
            train_split.loc[idx, 'id'] = row['path'].split('/')[-1].split('_')[0]
        
        val_split = pd.read_csv(os.path.join(kfold_path, f'val_split_{i}_{j}.csv'))
        for idx, row in val_split.iterrows():
            val_split.loc[idx, 'id'] = row['path'].split('/')[-1].split('_')[0]
        df_train = df_all[df_all['id'].isin(train_split['id'])]
        df_val = df_all[df_all['id'].isin(val_split['id'])]

        X_train, y_train = preprocess_data(file=df_train)
        X_val, y_val = preprocess_data(file=df_val)

        unavailable_diagnoses_train = y_train.sum()[y_train.sum() < 1].index
        unavailable_diagnoses_val = y_val.sum()[y_val.sum() < 1].index
        unavailable_diagnoses = unavailable_diagnoses_train.union(unavailable_diagnoses_val)

        if len(unavailable_diagnoses) > 0:
            y_train = y_train.drop(unavailable_diagnoses, axis=1)
            y_val = y_val.drop(unavailable_diagnoses, axis=1)
        
        model = LogisticRegression(max_iter=2000)
        model = OneVsRestClassifier(model)
        model.fit(X_train, y_train)
    
        y_pred = model.predict_proba(X_val)
        macro_auc_cv = round(roc_auc_score(y_val, y_pred, average='macro'), 4)
        macro_auc_scores.append(macro_auc_cv)
        micro_auc_cv = round(roc_auc_score(y_val, y_pred, average='micro'), 4)
        micro_auc_scores.append(micro_auc_cv)

        kfold_results[f'split_{i}_{j}'] = {
            'Macro AUC': macro_auc_cv,
            'Micro AUC': micro_auc_cv
        }

    print(np.mean(micro_auc_scores))
    print(np.mean(macro_auc_scores))

print('4-Fold CV:')
for source, score in kfold_results.items():
    print(source, score)

0.90895
0.814875
0.8855
0.80055
0.896475
0.8182499999999999
0.89445
0.8027
0.89515
0.8196
4-Fold CV:
split_1_1 {'Macro AUC': 0.8159, 'Micro AUC': 0.9182}
split_1_2 {'Macro AUC': 0.8161, 'Micro AUC': 0.9194}
split_1_3 {'Macro AUC': 0.8125, 'Micro AUC': 0.8839}
split_1_4 {'Macro AUC': 0.815, 'Micro AUC': 0.9143}
split_2_1 {'Macro AUC': 0.8017, 'Micro AUC': 0.8323}
split_2_2 {'Macro AUC': 0.7995, 'Micro AUC': 0.906}
split_2_3 {'Macro AUC': 0.8005, 'Micro AUC': 0.8974}
split_2_4 {'Macro AUC': 0.8005, 'Micro AUC': 0.9063}
split_3_1 {'Macro AUC': 0.8149, 'Micro AUC': 0.8458}
split_3_2 {'Macro AUC': 0.818, 'Micro AUC': 0.9065}
split_3_3 {'Macro AUC': 0.8196, 'Micro AUC': 0.9129}
split_3_4 {'Macro AUC': 0.8205, 'Micro AUC': 0.9207}
split_4_1 {'Macro AUC': 0.8036, 'Micro AUC': 0.9023}
split_4_2 {'Macro AUC': 0.8075, 'Micro AUC': 0.9036}
split_4_3 {'Macro AUC': 0.8016, 'Micro AUC': 0.9024}
split_4_4 {'Macro AUC': 0.7981, 'Micro AUC': 0.8695}
split_5_1 {'Macro AUC': 0.8198, 'Micro AUC': 0.9171}
s

## OvO

In [7]:
feature_path = 'data/features'
sources = ['G12EC', 'SPH', 'CPSC_CPSC-Extra', 'ChapmanShaoxing_Ningbo', 'PTB_PTBXL']
OvO_results = {}

for train_source in sources:

    X_train, y_train = preprocess_data(file=os.path.join(feature_path, f'{train_source}_feats.csv'))

    unavailable_diagnoses_train = y_train.sum()[y_train.sum() < 1].index
    y_train = y_train.drop(unavailable_diagnoses_train, axis=1)

    model = LogisticRegression(max_iter=2000, C=1)
    model = OneVsRestClassifier(model)
    model.fit(X_train, y_train)

    # One source vs one source
    OvO_results[train_source] = {}    
    for test_source in sources:
        X_test, y_test = preprocess_data(file=os.path.join(feature_path, f'{test_source}_feats.csv'))

        y_test = y_test.drop(unavailable_diagnoses_train, axis=1)
        available_test = (y_test.sum() >= 1)
        unavailable_diagnoses_test = y_test.sum()[y_test.sum() < 1].index
        y_test = y_test.drop(unavailable_diagnoses_test, axis=1)

        y_pred = model.predict_proba(X_test)[:,available_test]

        score = {'Macro AUC': round(roc_auc_score(y_test, y_pred, average='macro'), 4),
                 'Micro AUC': round(roc_auc_score(y_test, y_pred, average='micro'), 4)
        }
        
        OvO_results[train_source][test_source] = score
        
avg_macro = 0
avg_micro = 0
for k in OvO_results.keys():
    for k2 in OvO_results[k].keys():
        if k != k2:
            avg_macro += OvO_results[k][k2]['Macro AUC']
            avg_micro += OvO_results[k][k2]['Micro AUC']
avg_macro /= len(OvO_results.keys())*(len(OvO_results[k].keys()) - 1)
avg_micro /= len(OvO_results.keys())*(len(OvO_results[k].keys()) - 1)

df_macro = pd.DataFrame()
df_micro = pd.DataFrame()

for outer_key, inner_dict in OvO_results.items():
    for inner_key, metrics in inner_dict.items():
        df_macro.loc[inner_key, outer_key] = metrics.get('Macro AUC', None)
        df_micro.loc[inner_key, outer_key] = metrics.get('Micro AUC', None)

print("Macro AUC:")
display(df_macro)

print("\nMicro AUC:")
display(df_micro)

Macro AUC:


Unnamed: 0,G12EC,SPH,CPSC_CPSC-Extra,ChapmanShaoxing_Ningbo,PTB_PTBXL
G12EC,0.7827,0.7399,0.7105,0.7458,0.7493
SPH,0.8119,0.8454,0.7603,0.8131,0.809
CPSC_CPSC-Extra,0.7483,0.7067,0.836,0.7456,0.7109
ChapmanShaoxing_Ningbo,0.7879,0.7882,0.7204,0.836,0.7952
PTB_PTBXL,0.7705,0.7738,0.7209,0.7712,0.8083



Micro AUC:


Unnamed: 0,G12EC,SPH,CPSC_CPSC-Extra,ChapmanShaoxing_Ningbo,PTB_PTBXL
G12EC,0.8721,0.7841,0.6402,0.7887,0.7699
SPH,0.8465,0.965,0.8039,0.7012,0.9489
CPSC_CPSC-Extra,0.8086,0.7901,0.9366,0.5844,0.7958
ChapmanShaoxing_Ningbo,0.8302,0.854,0.6475,0.9375,0.8046
PTB_PTBXL,0.7964,0.8849,0.7831,0.6845,0.9421


## LSO

In [8]:
feature_path = 'data/features'
sources = ['G12EC', 'SPH', 'CPSC_CPSC-Extra', 'ChapmanShaoxing_Ningbo', 'PTB_PTBXL']

kfold_results = {}
lso_train_results = {}
lso_test_results = {}
for test_source in sources:
    lso_train_results[test_source] = {}
    train_sources = [s for s in sources if s != test_source]

    # LSO test
    X_train, y_train = preprocess_data(df_all[~df_all['source'].isin([test_source])])
    X_test, y_test = preprocess_data(file=os.path.join(feature_path, f'{test_source}_feats.csv'))

    unavailable_diagnoses_train = y_train.sum()[y_test.sum() < 1].index
    unavailable_diagnoses_test = y_test.sum()[y_test.sum() < 1].index
    unavailable_diagnoses = unavailable_diagnoses_train.union(unavailable_diagnoses_test)

    if len(unavailable_diagnoses) > 0:
        # print(unavailable_diagnoses)
        y_test = y_test.drop(unavailable_diagnoses, axis=1)
        y_train = y_train.drop(unavailable_diagnoses, axis=1)

    model = LogisticRegression(max_iter=2000)
    model = OneVsRestClassifier(model)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)
 
    score = {
        'Macro AUC': round(roc_auc_score(y_test, y_pred, average='macro'), 4),
        'Micro AUC': round(roc_auc_score(y_test, y_pred, average='micro'), 4)
    }
    lso_test_results[test_source] = score

    print(f'Test set {test_source}:\n{score}')

avg_macro = 0
avg_micro = 0
for k in lso_test_results.keys():
    avg_macro += lso_test_results[k]['Macro AUC']
    avg_micro += lso_test_results[k]['Micro AUC']
avg_macro /= len(sources)
avg_micro /= len(sources)

print(f'\nTest Macro/Micro: {avg_macro:.2f} / {avg_micro:.2f}')

lso_test_results

Test set G12EC:
{'Macro AUC': 0.7261, 'Micro AUC': 0.5831}
Test set SPH:
{'Macro AUC': 0.7956, 'Micro AUC': 0.843}
Test set CPSC_CPSC-Extra:
{'Macro AUC': 0.735, 'Micro AUC': 0.7036}
Test set ChapmanShaoxing_Ningbo:
{'Macro AUC': 0.7966, 'Micro AUC': 0.7288}
Test set PTB_PTBXL:
{'Macro AUC': 0.7687, 'Micro AUC': 0.7221}

Test Macro/Micro: 0.76 / 0.72


{'G12EC': {'Macro AUC': 0.7261, 'Micro AUC': 0.5831},
 'SPH': {'Macro AUC': 0.7956, 'Micro AUC': 0.843},
 'CPSC_CPSC-Extra': {'Macro AUC': 0.735, 'Micro AUC': 0.7036},
 'ChapmanShaoxing_Ningbo': {'Macro AUC': 0.7966, 'Micro AUC': 0.7288},
 'PTB_PTBXL': {'Macro AUC': 0.7687, 'Micro AUC': 0.7221}}

# XGBoost

## 5-Fold CV

In [9]:
kfold_path = 'data/split_csvs/5FCV'
kfold_results = {}
for i in range(1,6):
    macro_auc_scores = []
    micro_auc_scores = []
    
    # 5-Fold CV with predetermined folds
    for j in range(1,6):
        train_split = pd.read_csv(os.path.join(kfold_path, f'train_split_{i}_{j}.csv'))
        for idx, row in train_split.iterrows():
            train_split.loc[idx, 'id'] = row['path'].split('/')[-1].split('_')[0]
        
        val_split = pd.read_csv(os.path.join(kfold_path, f'val_split_{i}_{j}.csv'))
        for idx, row in val_split.iterrows():
            val_split.loc[idx, 'id'] = row['path'].split('/')[-1].split('_')[0]

        X_train, y_train = preprocess_data(file=df_train)
        X_val, y_val = preprocess_data(file=df_val)

        unavailable_diagnoses_train = y_train.sum()[y_train.sum() < 1].index
        unavailable_diagnoses_val = y_val.sum()[y_val.sum() < 1].index
        unavailable_diagnoses = unavailable_diagnoses_train.union(unavailable_diagnoses_val)

        if len(unavailable_diagnoses) > 0:
            y_train = y_train.drop(unavailable_diagnoses, axis=1)
            y_val = y_val.drop(unavailable_diagnoses, axis=1)

        model = xgb.XGBClassifier()
        model.fit(X_train, y_train)
        
        y_pred = model.predict_proba(X_val)
        macro_auc_cv = round(roc_auc_score(y_val, y_pred, average='macro'), 4)
        macro_auc_scores.append(macro_auc_cv)
        micro_auc_cv = round(roc_auc_score(y_val, y_pred, average='micro'), 4)
        micro_auc_scores.append(micro_auc_cv)

        kfold_results[f'split_{i}_{j}'] = {
            'Macro AUC': macro_auc_cv,
            'Micro AUC': micro_auc_cv
        }

    print(np.mean(micro_auc_scores))
    print(np.mean(macro_auc_scores))

print('5-Fold CV:')
for source, score in kfold_results.items():
    print(source, score)

0.8786000000000002
0.7659
0.8786000000000002
0.7659
0.8786000000000002
0.7659
0.8786000000000002
0.7659
0.8786000000000002
0.7659
5-Fold CV:
split_1_1 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_1_2 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_1_3 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_1_4 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_1_5 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_2_1 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_2_2 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_2_3 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_2_4 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_2_5 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_3_1 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_3_2 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_3_3 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_3_4 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_3_5 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_4_1 {'Macro AUC': 0.7659, 'Micro AUC': 0.8786}
split_4_2 {

## 4-Fold CV

In [10]:
kfold_path = 'data/split_csvs/4FCV'
kfold_results = {}
for i in range(1,6):
    macro_auc_scores = []
    micro_auc_scores = []
    
    # 4-Fold CV with predetermined folds
    for j in range(1,5):
        
        train_split = pd.read_csv(os.path.join(kfold_path, f'train_split_{i}_{j}.csv'))
        for idx, row in train_split.iterrows():
            train_split.loc[idx, 'id'] = row['path'].split('/')[-1].split('_')[0]
        
        val_split = pd.read_csv(os.path.join(kfold_path, f'val_split_{i}_{j}.csv'))
        for idx, row in val_split.iterrows():
            val_split.loc[idx, 'id'] = row['path'].split('/')[-1].split('_')[0]
        df_train = df_all[df_all['id'].isin(train_split['id'])]
        df_val = df_all[df_all['id'].isin(val_split['id'])]

        X_train, y_train = preprocess_data(file=df_train)
        X_val, y_val = preprocess_data(file=df_val)

        unavailable_diagnoses_train = y_train.sum()[y_train.sum() < 1].index
        unavailable_diagnoses_val = y_val.sum()[y_val.sum() < 1].index
        unavailable_diagnoses = unavailable_diagnoses_train.union(unavailable_diagnoses_val)

        if len(unavailable_diagnoses) > 0:
            y_train = y_train.drop(unavailable_diagnoses, axis=1)
            y_val = y_val.drop(unavailable_diagnoses, axis=1)
        
        model = xgb.XGBClassifier()
        model.fit(X_train, y_train)
    
        y_pred = model.predict_proba(X_val)
        macro_auc_cv = round(roc_auc_score(y_val, y_pred, average='macro'), 4)
        macro_auc_scores.append(macro_auc_cv)
        micro_auc_cv = round(roc_auc_score(y_val, y_pred, average='micro'), 4)
        micro_auc_scores.append(micro_auc_cv)

        kfold_results[f'split_{i}_{j}'] = {
            'Macro AUC': macro_auc_cv,
            'Micro AUC': micro_auc_cv
        }

    print(np.mean(micro_auc_scores))
    print(np.mean(macro_auc_scores))

print('4-Fold CV:')
for source, score in kfold_results.items():
    print(source, score)

0.9208500000000001
0.8266
0.8728750000000001
0.7641249999999999
0.86755
0.7618
0.81605
0.7530000000000001
0.882925
0.7785
4-Fold CV:
split_1_1 {'Macro AUC': 0.8179, 'Micro AUC': 0.9249}
split_1_2 {'Macro AUC': 0.8309, 'Micro AUC': 0.9336}
split_1_3 {'Macro AUC': 0.8331, 'Micro AUC': 0.9295}
split_1_4 {'Macro AUC': 0.8245, 'Micro AUC': 0.8954}
split_2_1 {'Macro AUC': 0.7621, 'Micro AUC': 0.8677}
split_2_2 {'Macro AUC': 0.753, 'Micro AUC': 0.8811}
split_2_3 {'Macro AUC': 0.7686, 'Micro AUC': 0.8558}
split_2_4 {'Macro AUC': 0.7728, 'Micro AUC': 0.8869}
split_3_1 {'Macro AUC': 0.7763, 'Micro AUC': 0.8889}
split_3_2 {'Macro AUC': 0.7504, 'Micro AUC': 0.8433}
split_3_3 {'Macro AUC': 0.7347, 'Micro AUC': 0.8428}
split_3_4 {'Macro AUC': 0.7858, 'Micro AUC': 0.8952}
split_4_1 {'Macro AUC': 0.7503, 'Micro AUC': 0.8069}
split_4_2 {'Macro AUC': 0.7531, 'Micro AUC': 0.7961}
split_4_3 {'Macro AUC': 0.7377, 'Micro AUC': 0.8009}
split_4_4 {'Macro AUC': 0.7709, 'Micro AUC': 0.8603}
split_5_1 {'Macro AU

## OvO

In [11]:
feature_path = 'data/features'
sources = ['G12EC', 'SPH', 'CPSC_CPSC-Extra', 'ChapmanShaoxing_Ningbo', 'PTB_PTBXL']
OvO_results = {}

for train_source in sources:

    X_train, y_train = preprocess_data(file=os.path.join(feature_path, f'{train_source}_feats.csv'))

    unavailable_diagnoses_train = y_train.sum()[y_train.sum() < 1].index
    y_train = y_train.drop(unavailable_diagnoses_train, axis=1)

    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # One source vs one source
    OvO_results[train_source] = {}    
    for test_source in sources:
        X_test, y_test = preprocess_data(file=os.path.join(feature_path, f'{test_source}_feats.csv'))

        y_test = y_test.drop(unavailable_diagnoses_train, axis=1)
        available_test = (y_test.sum() >= 1)
        unavailable_diagnoses_test = y_test.sum()[y_test.sum() < 1].index
        y_test = y_test.drop(unavailable_diagnoses_test, axis=1)

        y_pred = model.predict_proba(X_test)[:,available_test]

        score = {'Macro AUC': round(roc_auc_score(y_test, y_pred, average='macro'), 4),
                 'Micro AUC': round(roc_auc_score(y_test, y_pred, average='micro'), 4)
        }
        
        OvO_results[train_source][test_source] = score
        
avg_macro = 0
avg_micro = 0
for k in OvO_results.keys():
    for k2 in OvO_results[k].keys():
        if k != k2:
            avg_macro += OvO_results[k][k2]['Macro AUC']
            avg_micro += OvO_results[k][k2]['Micro AUC']
avg_macro /= len(OvO_results.keys())*(len(OvO_results[k].keys()) - 1)
avg_micro /= len(OvO_results.keys())*(len(OvO_results[k].keys()) - 1)

df_macro = pd.DataFrame()
df_micro = pd.DataFrame()

for outer_key, inner_dict in OvO_results.items():
    for inner_key, metrics in inner_dict.items():
        df_macro.loc[inner_key, outer_key] = metrics.get('Macro AUC', None)
        df_micro.loc[inner_key, outer_key] = metrics.get('Micro AUC', None)

print("Macro AUC:")
display(df_macro)

print("\nMicro AUC:")
display(df_micro)

Macro AUC:


Unnamed: 0,G12EC,SPH,CPSC_CPSC-Extra,ChapmanShaoxing_Ningbo,PTB_PTBXL
G12EC,0.9997,0.7193,0.6826,0.7464,0.6815
SPH,0.7316,0.9992,0.7397,0.7398,0.7672
CPSC_CPSC-Extra,0.6915,0.7358,0.9998,0.7104,0.7215
ChapmanShaoxing_Ningbo,0.7557,0.7287,0.6985,0.9918,0.7264
PTB_PTBXL,0.6737,0.7325,0.7142,0.7085,0.9971



Micro AUC:


Unnamed: 0,G12EC,SPH,CPSC_CPSC-Extra,ChapmanShaoxing_Ningbo,PTB_PTBXL
G12EC,0.9999,0.786,0.6372,0.8313,0.7495
SPH,0.8203,0.9996,0.7993,0.8831,0.9415
CPSC_CPSC-Extra,0.7448,0.7932,0.9999,0.643,0.8096
ChapmanShaoxing_Ningbo,0.7811,0.8439,0.6764,0.9972,0.787
PTB_PTBXL,0.709,0.8492,0.7675,0.7747,0.9988


## LSO

In [12]:
feature_path = 'data/features'
sources = ['G12EC', 'SPH', 'CPSC_CPSC-Extra', 'ChapmanShaoxing_Ningbo', 'PTB_PTBXL']

kfold_results = {}
lso_train_results = {}
lso_test_results = {}
for test_source in sources:
    lso_train_results[test_source] = {}
    train_sources = [s for s in sources if s != test_source]

    # LSO test
    X_train, y_train = preprocess_data(df_all[~df_all['source'].isin([test_source])])
    X_test, y_test = preprocess_data(file=os.path.join(feature_path, f'{test_source}_feats.csv'))

    unavailable_diagnoses_train = y_train.sum()[y_test.sum() < 1].index
    unavailable_diagnoses_test = y_test.sum()[y_test.sum() < 1].index
    unavailable_diagnoses = unavailable_diagnoses_train.union(unavailable_diagnoses_test)

    if len(unavailable_diagnoses) > 0:
        # print(unavailable_diagnoses)
        y_test = y_test.drop(unavailable_diagnoses, axis=1)
        y_train = y_train.drop(unavailable_diagnoses, axis=1)

    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)
 
    score = {
        'Macro AUC': round(roc_auc_score(y_test, y_pred, average='macro'), 4),
        'Micro AUC': round(roc_auc_score(y_test, y_pred, average='micro'), 4)
    }
    lso_test_results[test_source] = score

    print(f'Test set {test_source}:\n{score}')

avg_macro = 0
avg_micro = 0
for k in lso_test_results.keys():
    avg_macro += lso_test_results[k]['Macro AUC']
    avg_micro += lso_test_results[k]['Micro AUC']
avg_macro /= len(sources)
avg_micro /= len(sources)

print(f'\nTest Macro/Micro: {avg_macro:.2f} / {avg_micro:.2f}')

lso_test_results

Test set G12EC:
{'Macro AUC': 0.681, 'Micro AUC': 0.7181}
Test set SPH:
{'Macro AUC': 0.7074, 'Micro AUC': 0.878}
Test set CPSC_CPSC-Extra:
{'Macro AUC': 0.6818, 'Micro AUC': 0.7448}
Test set ChapmanShaoxing_Ningbo:
{'Macro AUC': 0.7192, 'Micro AUC': 0.7865}
Test set PTB_PTBXL:
{'Macro AUC': 0.7061, 'Micro AUC': 0.7758}

Test Macro/Micro: 0.70 / 0.78


{'G12EC': {'Macro AUC': 0.681, 'Micro AUC': 0.7181},
 'SPH': {'Macro AUC': 0.7074, 'Micro AUC': 0.878},
 'CPSC_CPSC-Extra': {'Macro AUC': 0.6818, 'Micro AUC': 0.7448},
 'ChapmanShaoxing_Ningbo': {'Macro AUC': 0.7192, 'Micro AUC': 0.7865},
 'PTB_PTBXL': {'Macro AUC': 0.7061, 'Micro AUC': 0.7758}}