In [1]:
import pandas as pd
import numpy as np
import time
from pycaret.classification import *

In [2]:
def check_class_ratio(dataset):
    class_ratio = round(np.mean(dataset.label), 2)
    return class_ratio

def random_split_stay(df, train_ratio, Threshold, n_trial):
    data = df.copy()
    
    search_time = time.time()
    
    for T in range(n_trial):
        array = data.subject_id.unique()
        
        # seed = np.random.randint(0, 10000, 1)
        seed = 3582
        np.random.seed(seed) 
        np.random.shuffle(array)


        split_point = int(train_ratio * len(array))
        stay_for_train, stay_for_test = np.split(array, [split_point])

        

        condition_train = data.subject_id.isin(stay_for_train)
        holdout_train = data[condition_train]

        condition_test = data.subject_id.isin(stay_for_test)
        holdout_test = data[condition_test]
        
        # holdout_test = holdout_test.sort_values(by=['subject_id',  'stay_id', 'Time_since_ICU_admission']) 
        # holdout_train = holdout_train.sort_values(by=['subject_id',  'stay_id', 'Time_since_ICU_admission']) 

        train_class_ratio  = check_class_ratio(holdout_train)
        test_class_ratio  = check_class_ratio(holdout_test)
                
            
        if (train_class_ratio - test_class_ratio) >= Threshold:
            
            break  # 클래스 비율이 모두 threshold 이상일 경우 반복문 종료
        
        if T % 100 == 0:
            print('Trial: ', T)
            
        if T % 10000 == 0:
        
            Threshold = Threshold + 0.05
            print('Threshold 조정 + 0.05, 현재 한계값: {}'.format(Threshold))
        
        if T == 9999:
            print('최대 Trial 달성, 분할 불가')
        
    train = holdout_train.copy()
    test = holdout_test.copy()
    search_time_end = time.time()
    
    trn_class1 = train.label.value_counts()[0]
    trn_class2 = train.label.value_counts()[1]
    
    tes_class1 = test.label.value_counts()[0]
    tes_class2 = test.label.value_counts()[1]

    
    
    print('train set : test set = {} : {}'.format(train_ratio, 1-train_ratio))
    print('Train set class: ', train.label.value_counts().sort_index())
    print('Test set class: ', test.label.value_counts().sort_index())
    print('-'*20)
    print('Train class ratio: {}:{}'.format((trn_class1)/(trn_class1+trn_class2), (trn_class2)/(trn_class1+trn_class2)))
    print('Test class ratio: {}:{}'.format((tes_class1)/(tes_class1+tes_class2), (tes_class2)/(tes_class1+tes_class2)))
    print('-'*20)
    print('Number of trainset patient:', len(train.subject_id.unique()))
    print('Number of testset patient:', len(test.subject_id.unique()))
    print('Number of trainset stay:', len(train.stay_id.unique()))
    print('Number of testset stay:', len(test.stay_id.unique()))
    print('-'*20)
    print('Split seed: ',seed)
    print('train ratio:', train_ratio)
    print('Threshold:', Threshold)
    print('-'*20)
    print('총 소요 시간(초):{}'.format(search_time_end - search_time))
    print('시도한 trial 수: ', T)
    
    return train, test, stay_for_train, stay_for_test

In [3]:
dataset = pd.read_csv('/Users/DAHS/MIMIC-IV-Data-Pipeline/MIMIC_pipeline/Case Labeling/mimic_df.csv.gz', index_col = 0, compression='gzip')

In [4]:
embset = pd.read_csv('/Users/DAHS/MIMIC-IV-Data-Pipeline/MIMIC_pipeline/supervised_contrastive_learning/embedding_data.csv.gz', index_col = 0, compression='gzip')

In [5]:
total_dataset = pd.concat([dataset.reset_index(drop=True), embset.drop(['subject_id', 'stay_id', 'hadm_id', '43'], axis = 1).reset_index(drop=True)], axis = 1)

In [6]:
import gc
gc.collect()

20

## Mortality

In [20]:
mortality = pd.read_csv('/Users/DAHS/Desktop/circ_mimic_preprocessing_1day/data/cohort/cohort_icu_mortality_0_.csv.gz', index_col = 0, compression='gzip')

In [21]:
mort_dict = dict(zip(mortality[['stay_id', 'label']].stay_id, mortality[['stay_id', 'label']].label))
total_dataset['label'] = total_dataset['stay_id'].copy()
total_dataset['label'] = total_dataset['label'].map(mort_dict)

In [39]:
total_dataset.label.value_counts()

0    1741624
1     298419
Name: label, dtype: int64

In [56]:
feature_list = ['septic_shock', 'SoFa_score', 'mortor_response', 'GCS_score', 'Eye_Opening', 'sepsis', 'suspected_infection', 'PAPs', 'Peak_insp_P', 'None_Invasive_Ventilation_within48',
           'Unnamed: 0', 'subject_id', 'stay_id', 'hadm_id','Annotation','classes', 'CIRC_next_12h', 'label', 'PaO2/FiO2', 'Cumulative_Norepinephrine', 'INR', 'MAP', 'spinal_cord_injury', 'Sofa_GCS', 'Invasive_Ventilation_within48', 'Age',
           'ECMO', 'Cumulative_Digoxin', 'ECMO_within48', 'height_fillna', 'height', 'IABP', 'IABP_within48', 'Impella', 'Impella_within48']

In [57]:
mimic_train_ori, mimic_valid_ori, trn_patient, val_patient = random_split_stay(total_dataset[feature_list], 0.7, Threshold=0.05, n_trial=1) 

Trial:  0
Threshold 조정 + 0.05, 현재 한계값: 0.1
train set : test set = 0.7 : 0.30000000000000004
Train set class:  0    1209558
1     204287
Name: label, dtype: int64
Test set class:  0    532066
1     94132
Name: label, dtype: int64
--------------------
Train class ratio: 0.8555096209273294:0.14449037907267062
Test class ratio: 0.8496769392428593:0.15032306075714072
--------------------
Number of trainset patient: 12918
Number of testset patient: 5537
Number of trainset stay: 14337
Number of testset stay: 6212
--------------------
Split seed:  3582
train ratio: 0.7
Threshold: 0.1
--------------------
총 소요 시간(초):0.2560770511627197
시도한 trial 수:  0


In [62]:
# original space, circulatory prediction

input_space_clf_setting = setup(data = mimic_train_ori.drop(['Unnamed: 0', 'subject_id', 'stay_id', 'hadm_id','Annotation','classes', 'CIRC_next_12h'], axis = 1), target = 'label',
                                test_data = mimic_valid_ori.drop(['Unnamed: 0', 'subject_id', 'stay_id', 'hadm_id','Annotation','classes', 'CIRC_next_12h'], axis = 1), normalize=True, normalize_method='minmax',
                                index=False, session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Binary
3,Original data shape,"(2040043, 28)"
4,Transformed data shape,"(2040043, 28)"
5,Transformed train set shape,"(1413845, 28)"
6,Transformed test set shape,"(626198, 28)"
7,Numeric features,27
8,Preprocess,True
9,Imputation type,simple


In [63]:
from sklearn.metrics import average_precision_score
add_metric('auprc', 'AUPRC', average_precision_score, target = 'pred_proba')

Name                                                             AUPRC
Display Name                                                     AUPRC
Score Function       <pycaret.internal.metrics.EncodedDecodedLabels...
Scorer               make_scorer(average_precision_score, needs_pro...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: auprc, dtype: object

In [64]:
best_model = compare_models(cross_validation=False, include=['lightgbm', 'xgboost', 'svm', 'nb', 'lr'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.872,0.8447,0.346,0.6365,0.4483,0.383,0.4058,0.5311,1.56
lr,Logistic Regression,0.8685,0.8202,0.2945,0.6348,0.4024,0.3394,0.3712,0.5079,9.44
svm,SVM - Linear Kernel,0.867,0.586,0.1841,0.7269,0.2938,0.2482,0.321,0.2565,2.18
xgboost,Extreme Gradient Boosting,0.864,0.8279,0.3505,0.5784,0.4365,0.3644,0.3792,0.4988,2.29
nb,Naive Bayes,0.7278,0.7715,0.6716,0.3118,0.4258,0.2775,0.3126,0.3877,1.34


In [35]:
best_model = compare_models(cross_validation=False, include=['lightgbm', 'xgboost', 'svm', 'nb', 'lr'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8773,0.8704,0.3837,0.6574,0.4846,0.4204,0.4401,0.5902,32.59
lr,Logistic Regression,0.8753,0.8503,0.3693,0.6503,0.471,0.4064,0.4274,0.5682,186.36
xgboost,Extreme Gradient Boosting,0.872,0.8497,0.3914,0.6171,0.479,0.4101,0.424,0.5452,19.88
svm,SVM - Linear Kernel,0.871,0.605,0.2246,0.7311,0.3436,0.2937,0.3576,0.2808,24.09
nb,Naive Bayes,0.8355,0.7579,0.4154,0.4491,0.4316,0.3356,0.336,0.3688,16.18


In [28]:
total_dataset.head()

Unnamed: 0.1,Unnamed: 0,Time_since_ICU_admission,Phenylephrine,NaCl 0.9%,Dextrose_5%,Norepinephrine,PO Intake,NaCl 0.45%,LR,D5 1/2NS,...,34,35,36,37,38,39,40,41,42,label
0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.00074,0.067597,4.653528,2.20289,2.901496,2.864099,2.618783,0.087353,3.096479,0
1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.000639,0.047668,3.728165,1.5986,2.192191,2.168457,1.949224,0.059626,2.341767,0
2,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.000423,0.051923,2.583844,1.276105,1.700567,1.666697,1.52652,0.063449,1.81368,0
3,3,3,0.0,941.299999,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.000693,0.046165,0.422545,2.706169,2.693073,2.628306,2.679519,0.068963,2.690938,0
4,4,4,0.0,941.299999,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0006,0.025036,0.314166,1.989198,1.934439,1.872924,1.936209,0.04555,1.912561,0


In [7]:
readmission = pd.read_csv('/Users/DAHS/Desktop/circ_mimic_preprocessing_1day/data/cohort/cohort_icu_readmission_30_.csv.gz', index_col = 0, compression='gzip')

readm_dict = dict(zip(readmission[['stay_id', 'label']].stay_id, readmission[['stay_id', 'label']].label))
total_dataset['label'] = total_dataset['stay_id'].copy()
total_dataset['label'] = total_dataset['label'].map(readm_dict)

mimic_train_ori, mimic_valid_ori, trn_patient, val_patient = random_split_stay(total_dataset.dropna(), 0.7, Threshold=0.05, n_trial=1) 

Trial:  0
Threshold 조정 + 0.05, 현재 한계값: 0.1
train set : test set = 0.7 : 0.30000000000000004
Train set class:  0.0    967376
1.0    246181
Name: label, dtype: int64
Test set class:  0.0    428308
1.0     99300
Name: label, dtype: int64
--------------------
Train class ratio: 0.7971409665965422:0.20285903340345776
Test class ratio: 0.8117920880653818:0.18820791193461814
--------------------
Number of trainset patient: 11365
Number of testset patient: 4872
Number of trainset stay: 12552
Number of testset stay: 5406
--------------------
Split seed:  3582
train ratio: 0.7
Threshold: 0.1
--------------------
총 소요 시간(초):1.4943499565124512
시도한 trial 수:  0


In [8]:
input_space_clf_setting = setup(data = mimic_train_ori.drop(['Unnamed: 0', 'subject_id', 'stay_id', 'hadm_id','Annotation','classes', 'CIRC_next_12h'], axis = 1), target = 'label',
                                test_data = mimic_valid_ori.drop(['Unnamed: 0', 'subject_id', 'stay_id', 'hadm_id','Annotation','classes', 'CIRC_next_12h'], axis = 1), normalize=True,
                                index=False, session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Binary
3,Original data shape,"(1741165, 255)"
4,Transformed data shape,"(1741165, 255)"
5,Transformed train set shape,"(1213557, 255)"
6,Transformed test set shape,"(527608, 255)"
7,Numeric features,254
8,Preprocess,True
9,Imputation type,simple


In [9]:
import gc
gc.collect()
from sklearn.metrics import average_precision_score
add_metric('auprc', 'AUPRC', average_precision_score, target = 'pred_proba')
best_model = compare_models(cross_validation=False, include=['lightgbm', 'xgboost', 'nb', 'lr'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8044,0.6322,0.0319,0.3103,0.0579,0.0236,0.0439,0.2658,17.59
xgboost,Extreme Gradient Boosting,0.7858,0.5805,0.0688,0.2494,0.1079,0.0288,0.0367,0.2329,17.27
nb,Naive Bayes,0.203,0.5089,0.9748,0.188,0.3152,-0.0004,-0.0029,0.1911,14.76


Processing:   0%|          | 0/21 [00:00<?, ?it/s]

KeyboardInterrupt: 