In [3]:
import ast
import glob
import json
import os

import pandas as pd

Load all MIMIC domain files for basic EDA.

In [4]:
# Directory scanned for the MIMIC domain .csv files.
# NOTE(review): absolute, environment-specific path — confirm it exists wherever this notebook is run.
mimic_domain_path = '/pvc/data/continualLearning/mimic_domains/'
def load_single_df(all_files: str) -> pd.DataFrame:
    """Load a single .csv file into a DataFrame.

    If the file has a ``labels`` column, each cell is parsed from its string
    form into the Python literal it spells (for example a list of codes).
    A column named ``TEXT`` or ``notes`` is renamed to ``text``.

    Args:
        all_files: Path to ONE .csv file (the parameter name is historical).

    Returns:
        The loaded DataFrame with the text column renamed to ``text``.

    Raises:
        ValueError, SyntaxError: If a ``labels`` cell is not a valid Python
            literal.
    """
    # literal_eval accepts only Python literals, so a crafted cell in the CSV
    # cannot execute arbitrary code the way the built-in eval() would.
    converter = {'labels': ast.literal_eval}
    df = pd.read_csv(all_files, converters=converter)

    # Return a renamed copy instead of mutating in place.
    return df.rename(columns={'TEXT': 'text', 'notes': 'text'})

def load_multiple_dfs(path) -> dict:
    """Load every .csv file directly inside ``path``.

    Each file is read with ``load_single_df``.

    Args:
        path: Directory to scan (non-recursive) for ``*.csv`` files.

    Returns:
        A dict mapping each file's base name (without its directory) to the
        DataFrame loaded from it. Empty if no file matches.
    """
    # get all files in the directory
    files = glob.glob(os.path.join(path, "*.csv"))

    # load all files into the dictionary, keyed by file name without the path;
    # os.path.basename handles whichever separator the platform uses
    frames = {}
    for file_path in files:
        frames[os.path.basename(file_path)] = load_single_df(file_path)

    return frames

# Maps each .csv file name found in mimic_domain_path to its loaded DataFrame.
mimic_domain_files = load_multiple_dfs(mimic_domain_path)

In [5]:
# Iterating a dict yields its keys, so sorting the dict sorts the file names.
keys = sorted(mimic_domain_files)
print(f'{len(keys)} files loaded: {keys}')

63 files loaded: ['mimic_iv_domain_Certain_infectious_and_parasitic_diseases.csv_fold_1_dev.csv', 'mimic_iv_domain_Certain_infectious_and_parasitic_diseases.csv_fold_1_test.csv', 'mimic_iv_domain_Certain_infectious_and_parasitic_diseases.csv_fold_1_train.csv', 'mimic_iv_domain_Congenital_malformations_deformations_and_chromosomal_abnormalities.csv_fold_1_dev.csv', 'mimic_iv_domain_Congenital_malformations_deformations_and_chromosomal_abnormalities.csv_fold_1_test.csv', 'mimic_iv_domain_Congenital_malformations_deformations_and_chromosomal_abnormalities.csv_fold_1_train.csv', 'mimic_iv_domain_Diseases_of_the_blood_and_blood-forming_organs_and_certain_disorders_involving_the_immune_mechanism.csv_fold_1_dev.csv', 'mimic_iv_domain_Diseases_of_the_blood_and_blood-forming_organs_and_certain_disorders_involving_the_immune_mechanism.csv_fold_1_test.csv', 'mimic_iv_domain_Diseases_of_the_blood_and_blood-forming_organs_and_certain_disorders_involving_the_immune_mechanism.csv_fold_1_train.csv', '

In [6]:
# File names that contain 'train'.
train_only = [name for name in keys if 'train' in name]
# Pair each of those files with the row count of its DataFrame.
result_list = [(name, len(mimic_domain_files[name])) for name in train_only]
# Largest first; list.sort is stable, exactly like sorted().
sorted_by_size = list(result_list)
sorted_by_size.sort(key=lambda pair: pair[1], reverse=True)
for i, item in enumerate(sorted_by_size):
    print(i, item)

0 ('mimic_iv_domain_Diseases_of_the_circulatory_system.csv_fold_1_train.csv', 50976)
1 ('mimic_iv_domain_Factors_influencing_health_status_and_contact_with_health_services.csv_fold_1_train.csv', 42992)
2 ('mimic_iv_domain_Endocrine_nutritional_and_metabolic_diseases.csv_fold_1_train.csv', 28337)
3 ('mimic_iv_domain_Diseases_of_the_digestive_system.csv_fold_1_train.csv', 20577)
4 ('mimic_iv_domain_Mental_and_behavioural_disorders.csv_fold_1_train.csv', 14029)
5 ('mimic_iv_domain_Symptoms_signs_and_abnormal_clinical_and_laboratory_findings_not_elsewhere_classified.csv_fold_1_train.csv', 12008)
6 ('mimic_iv_domain_Diseases_of_the_musculoskeletal_system_and_connective_tissue.csv_fold_1_train.csv', 7455)
7 ('mimic_iv_domain_Injury_poisoning_and_certain_other_consequences_of_external_causes.csv_fold_1_train.csv', 6048)
8 ('mimic_iv_domain_Diseases_of_the_genitourinary_system.csv_fold_1_train.csv', 5501)
9 ('mimic_iv_domain_Neoplasms.csv_fold_1_train.csv', 5446)
10 ('mimic_iv_domain_External_

In [10]:
# Display the file names that contain 'train' (in sorted-key order).
train_only

['mimic_iv_domain_Certain_infectious_and_parasitic_diseases.csv_fold_1_train.csv',
 'mimic_iv_domain_Congenital_malformations_deformations_and_chromosomal_abnormalities.csv_fold_1_train.csv',
 'mimic_iv_domain_Diseases_of_the_blood_and_blood-forming_organs_and_certain_disorders_involving_the_immune_mechanism.csv_fold_1_train.csv',
 'mimic_iv_domain_Diseases_of_the_circulatory_system.csv_fold_1_train.csv',
 'mimic_iv_domain_Diseases_of_the_digestive_system.csv_fold_1_train.csv',
 'mimic_iv_domain_Diseases_of_the_ear_and_mastoid_process.csv_fold_1_train.csv',
 'mimic_iv_domain_Diseases_of_the_eye_and_adnexa.csv_fold_1_train.csv',
 'mimic_iv_domain_Diseases_of_the_genitourinary_system.csv_fold_1_train.csv',
 'mimic_iv_domain_Diseases_of_the_musculoskeletal_system_and_connective_tissue.csv_fold_1_train.csv',
 'mimic_iv_domain_Diseases_of_the_nervous_system.csv_fold_1_train.csv',
 'mimic_iv_domain_Diseases_of_the_respiratory_system.csv_fold_1_train.csv',
 'mimic_iv_domain_Diseases_of_the_sk

In [14]:
# Alias kept so `results` stays bound exactly as before.
results = mimic_domain_files

# The dev/train/test files of one domain share a file-name stem and differ only
# in the split suffix, so build the three keys from a single stem instead of
# pasting the long name three times.
symptoms_stem = 'mimic_iv_domain_Symptoms_signs_and_abnormal_clinical_and_laboratory_findings_not_elsewhere_classified.csv_fold_1_'
dev, train, test = (
    results[f'{symptoms_stem}{split}.csv'] for split in ('dev', 'train', 'test')
)

In [16]:
# Shape of the de-duplicated `text` column across the three splits combined.
(
    pd.concat([train, dev, test])
    .reset_index(drop=True)["text"]
    .drop_duplicates()
    .shape
)

(18035,)

In [17]:
# Distinct subject_id values per split, used for the overlap checks.
set_dev, set_train, set_test = (
    set(split["subject_id"].tolist()) for split in (dev, train, test)
)

In [18]:
# Number of subject_id values present in both dev and train.
len(set_dev & set_train)

476

In [19]:
# Number of subject_id values present in both dev and test.
len(set_dev & set_test)

155

In [20]:
# Number of subject_id values present in both train and test.
len(set_train & set_test)

504

In [21]:
# Show full cell contents when DataFrames are displayed (no column-width truncation).
pd.set_option('display.max_colwidth', None)
# Directory holding the result .csv files on the remote machine.
path_remote = '/pvc/optuna_best_results/'
# NOTE(review): user-specific absolute path, not referenced in the visible cells — confirm whether it is still needed.
path_local = '/Users/lsacy/lightning_output/'

def load_multiple_dfs(ending, path=None) -> dict:
    """Load every file in ``path`` whose name matches the glob pattern ``ending``.

    Each matching file is read with ``load_single_df``.

    This definition replaces the earlier one-argument
    ``load_multiple_dfs(path)`` in the kernel. So that re-running a cell
    written for that older form still works, a single-argument call is
    treated as ``load_multiple_dfs("*.csv", path)``.

    Args:
        ending: Glob pattern for the file names, e.g. ``'*.csv'``. When
            ``path`` is omitted this argument is taken to be the directory.
        path: Directory to scan (non-recursive).

    Returns:
        A dict mapping each file's base name (without its directory) to the
        DataFrame loaded from it. Empty if no file matches.
    """
    if path is None:
        # Legacy call form load_multiple_dfs(directory).
        ending, path = "*.csv", ending

    # get all matching files in the directory
    files = glob.glob(os.path.join(path, ending))

    # key each frame by file name without the path; os.path.basename handles
    # whichever separator the platform uses
    return {os.path.basename(file_path): load_single_df(file_path) for file_path in files}

# Load every result .csv from path_remote, keyed by file name.
# NOTE: this rebinds `results`, which an earlier cell set to mimic_domain_files.
results = load_multiple_dfs('*.csv', path_remote)

In [22]:
# File names of the loaded result frames, in dict insertion order.
list(results)

['20230721-185725_reverse_domain_order_test_results.csv',
 '20230721-182054_reverse_domain_order_test_results.csv',
 '20230721-150511_Factors_influencing_health_status_results.csv',
 '20230721-092808_domains_remote_results.csv',
 '20230726-103229_domains_hpo_all_entries_reverse_top6_results.csv',
 '20230721-035255_domains_hpo_results.csv',
 '20230721-124316_domains_remote_results.csv',
 'ray_tun_hpo_5_langs.csv',
 '20230726-155107_domains_2-3_results.csv',
 '20230720-164436_domains_remote_results.csv',
 '20230721-153408_Factors_influencing_health_status_results.csv',
 '20230720-152700_test_results.csv',
 '20230713-081757_results.csv',
 '20230727-164238_language-remotetest_results.csv',
 '20230721-153324_Factors_influencing_health_status_results.csv',
 '20230721-153122_Factors_influencing_health_status_results.csv']

In [23]:
# Sorting the dict sorts its keys (the result file names).
sorted_results = sorted(results)
sorted_results

['20230713-081757_results.csv',
 '20230720-152700_test_results.csv',
 '20230720-164436_domains_remote_results.csv',
 '20230721-035255_domains_hpo_results.csv',
 '20230721-092808_domains_remote_results.csv',
 '20230721-124316_domains_remote_results.csv',
 '20230721-150511_Factors_influencing_health_status_results.csv',
 '20230721-153122_Factors_influencing_health_status_results.csv',
 '20230721-153324_Factors_influencing_health_status_results.csv',
 '20230721-153408_Factors_influencing_health_status_results.csv',
 '20230721-182054_reverse_domain_order_test_results.csv',
 '20230721-185725_reverse_domain_order_test_results.csv',
 '20230726-103229_domains_hpo_all_entries_reverse_top6_results.csv',
 '20230726-155107_domains_2-3_results.csv',
 '20230727-164238_language-remotetest_results.csv',
 'ray_tun_hpo_5_langs.csv']

In [24]:
# Show one results frame, chosen by position in the sorted file-name list.
# NOTE(review): a positional index selects a different file whenever the directory listing changes — confirm which file is intended and consider indexing by name.
results[sorted_results[-2]]

Unnamed: 0.1,Unnamed: 0,brazilian,mimic
0,brazilian,[{'torchmetrics.auroc': 0.49723201990127563}],
1,mimic,[{'torchmetrics.auroc': 0.5645577311515808}],[{'torchmetrics.auroc': 0.6042832732200623}]


In [34]:
# Same expression as an earlier cell, yet the recorded output shows different columns.
# NOTE(review): presumably `results` / `sorted_results` were rebuilt from a different directory listing in between — confirm and pin the intended file name.
results[sorted_results[-2]]

Unnamed: 0.1,Unnamed: 0,Symptoms_signs_and_abnormal,Mental_and_behavioural_disorders
0,Symptoms_signs_and_abnormal,[{'torchmetrics.auroc': 0.7643149495124817}],
1,Mental_and_behavioural_disorders,[{'torchmetrics.auroc': 0.5938756465911865}],[{'torchmetrics.auroc': 0.6512281894683838}]


In [40]:
# Show the results frame seventh from the end of the sorted file-name list.
# NOTE(review): positional lookup; the execution count here is out of order with the neighbouring cells — verify under Restart & Run All.
results[sorted_results[-7]]

Unnamed: 0.1,Unnamed: 0,Factors_influencing_health_status
0,Factors_influencing_health_status,[{'torchmetrics.auroc': 0.6909234523773193}]


In [36]:
# Show the results frame fourth from the end of the sorted file-name list.
# NOTE(review): positional lookup — the file selected depends on the directory contents at load time.
results[sorted_results[-4]]

Unnamed: 0.1,Unnamed: 0,Symptoms_signs_and_abnormal
0,Symptoms_signs_and_abnormal,[{'torchmetrics.auroc': 0.7102967500686646}]


In [35]:
# Show the results frame fifth from the end of the sorted file-name list.
# NOTE(review): positional lookup — the file selected depends on the directory contents at load time.
results[sorted_results[-5]]

Unnamed: 0.1,Unnamed: 0,Diseases_of_the_circulatory_system
0,Diseases_of_the_circulatory_system,[{'torchmetrics.auroc': 0.6293169856071472}]
