In [1]:
import pandas as pd
import numpy as np
import time
from pycaret.classification import *

In [2]:
def check_class_ratio(dataset):
    class_ratio = round(np.mean(dataset.label), 2)
    return class_ratio

def random_split_stay(df, train_ratio, Threshold, n_trial):
    data = df.copy()
    
    search_time = time.time()
    
    for T in range(n_trial):
        array = data.subject_id.unique()
        
        # seed = np.random.randint(0, 10000, 1)
        seed = 3582
        np.random.seed(seed) 
        np.random.shuffle(array)


        split_point = int(train_ratio * len(array))
        stay_for_train, stay_for_test = np.split(array, [split_point])

        

        condition_train = data.subject_id.isin(stay_for_train)
        holdout_train = data[condition_train]

        condition_test = data.subject_id.isin(stay_for_test)
        holdout_test = data[condition_test]
        
        # holdout_test = holdout_test.sort_values(by=['subject_id',  'stay_id', 'Time_since_ICU_admission']) 
        # holdout_train = holdout_train.sort_values(by=['subject_id',  'stay_id', 'Time_since_ICU_admission']) 

        train_class_ratio  = check_class_ratio(holdout_train)
        test_class_ratio  = check_class_ratio(holdout_test)
                
            
        if (train_class_ratio - test_class_ratio) >= Threshold:
            
            break  # 클래스 비율이 모두 threshold 이상일 경우 반복문 종료
        
        if T % 100 == 0:
            print('Trial: ', T)
            
        if T % 10000 == 0:
        
            Threshold = Threshold + 0.05
            print('Threshold 조정 + 0.05, 현재 한계값: {}'.format(Threshold))
        
        if T == 9999:
            print('최대 Trial 달성, 분할 불가')
        
    train = holdout_train.copy()
    test = holdout_test.copy()
    search_time_end = time.time()
    
    trn_class1 = train.label.value_counts()[0]
    trn_class2 = train.label.value_counts()[1]
    
    tes_class1 = test.label.value_counts()[0]
    tes_class2 = test.label.value_counts()[1]

    
    
    print('train set : test set = {} : {}'.format(train_ratio, 1-train_ratio))
    print('Train set class: ', train.label.value_counts().sort_index())
    print('Test set class: ', test.label.value_counts().sort_index())
    print('-'*20)
    print('Train class ratio: {}:{}'.format((trn_class1)/(trn_class1+trn_class2), (trn_class2)/(trn_class1+trn_class2)))
    print('Test class ratio: {}:{}'.format((tes_class1)/(tes_class1+tes_class2), (tes_class2)/(tes_class1+tes_class2)))
    print('-'*20)
    print('Number of trainset patient:', len(train.subject_id.unique()))
    print('Number of testset patient:', len(test.subject_id.unique()))
    print('Number of trainset stay:', len(train.stay_id.unique()))
    print('Number of testset stay:', len(test.stay_id.unique()))
    print('-'*20)
    print('Split seed: ',seed)
    print('train ratio:', train_ratio)
    print('Threshold:', Threshold)
    print('-'*20)
    print('총 소요 시간(초):{}'.format(search_time_end - search_time))
    print('시도한 trial 수: ', T)
    
    return train, test, stay_for_train, stay_for_test

In [3]:
dataset = pd.read_csv('/Users/DAHS/MIMIC-IV-Data-Pipeline/MIMIC_pipeline/Case Labeling/mimic_df.csv.gz', index_col = 0, compression='gzip')

In [4]:
import gc
gc.collect()

20

## Mortality

In [5]:
mortality = pd.read_csv('/Users/DAHS/Desktop/circ_mimic_preprocessing_1day/data/cohort/cohort_icu_mortality_0_.csv.gz', index_col = 0, compression='gzip')

In [6]:
mort_dict = dict(zip(mortality[['stay_id', 'label']].stay_id, mortality[['stay_id', 'label']].label))
dataset['label'] = dataset['stay_id'].copy()
dataset['label'] = dataset['label'].map(mort_dict)

In [7]:
dataset.label.value_counts()

0    1741624
1     298419
Name: label, dtype: int64

In [8]:
mimic_train_ori, mimic_valid_ori, trn_patient, val_patient = random_split_stay(dataset, 0.7, Threshold=0.05, n_trial=1) 

Trial:  0
Threshold 조정 + 0.05, 현재 한계값: 0.1
train set : test set = 0.7 : 0.30000000000000004
Train set class:  0    1209558
1     204287
Name: label, dtype: int64
Test set class:  0    532066
1     94132
Name: label, dtype: int64
--------------------
Train class ratio: 0.8555096209273294:0.14449037907267062
Test class ratio: 0.8496769392428593:0.15032306075714072
--------------------
Number of trainset patient: 12918
Number of testset patient: 5537
Number of trainset stay: 14337
Number of testset stay: 6212
--------------------
Split seed:  3582
train ratio: 0.7
Threshold: 0.1
--------------------
총 소요 시간(초):1.3889904022216797
시도한 trial 수:  0


In [9]:
# original space, circulatory prediction

input_space_clf_setting = setup(data = mimic_train_ori.drop(['Unnamed: 0', 'subject_id', 'stay_id', 'hadm_id','Annotation','classes', 'CIRC_next_12h'], axis = 1), target = 'label',
                                test_data = mimic_valid_ori.drop(['Unnamed: 0', 'subject_id', 'stay_id', 'hadm_id','Annotation','classes', 'CIRC_next_12h'], axis = 1), normalize=True,
                                index=False, session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Binary
3,Original data shape,"(2040043, 215)"
4,Transformed data shape,"(2040043, 215)"
5,Transformed train set shape,"(1413845, 215)"
6,Transformed test set shape,"(626198, 215)"
7,Numeric features,214
8,Preprocess,True
9,Imputation type,simple


In [10]:
from sklearn.metrics import average_precision_score
add_metric('auprc', 'AUPRC', average_precision_score, target = 'pred_proba')

Name                                                             AUPRC
Display Name                                                     AUPRC
Score Function       <pycaret.internal.metrics.EncodedDecodedLabels...
Scorer               make_scorer(average_precision_score, needs_pro...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: auprc, dtype: object

In [11]:
best_model = compare_models(cross_validation=False, include=['lightgbm', 'xgboost', 'nb', 'lr'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8777,0.8698,0.3816,0.6613,0.484,0.4202,0.4407,0.5928,34.02
lr,Logistic Regression,0.8759,0.8497,0.3678,0.6552,0.4712,0.4071,0.4289,0.5691,70.42
xgboost,Extreme Gradient Boosting,0.8743,0.8566,0.3964,0.6302,0.4867,0.4192,0.4339,0.5699,22.67
nb,Naive Bayes,0.8355,0.7697,0.4015,0.4474,0.4232,0.3276,0.3283,0.3745,14.22


## Readmission

In [4]:
readmission = pd.read_csv('/Users/DAHS/Desktop/circ_mimic_preprocessing_1day/data/cohort/cohort_icu_readmission_30_.csv.gz', index_col = 0, compression='gzip')

readm_dict = dict(zip(readmission[['stay_id', 'label']].stay_id, readmission[['stay_id', 'label']].label))
dataset['label'] = dataset['stay_id'].copy()
dataset['label'] = dataset['label'].map(readm_dict)

mimic_train_ori, mimic_valid_ori, trn_patient, val_patient = random_split_stay(dataset.dropna(), 0.7, Threshold=0.05, n_trial=1) 

Trial:  0
Threshold 조정 + 0.05, 현재 한계값: 0.1
train set : test set = 0.7 : 0.30000000000000004
Train set class:  0.0    967376
1.0    246181
Name: label, dtype: int64
Test set class:  0.0    428308
1.0     99300
Name: label, dtype: int64
--------------------
Train class ratio: 0.7971409665965422:0.20285903340345776
Test class ratio: 0.8117920880653818:0.18820791193461814
--------------------
Number of trainset patient: 11365
Number of testset patient: 4872
Number of trainset stay: 12552
Number of testset stay: 5406
--------------------
Split seed:  3582
train ratio: 0.7
Threshold: 0.1
--------------------
총 소요 시간(초):1.168750524520874
시도한 trial 수:  0


In [5]:
# original space, readmission prediction

input_space_clf_setting = setup(data = mimic_train_ori.drop(['Unnamed: 0', 'subject_id', 'stay_id', 'hadm_id','Annotation','classes', 'CIRC_next_12h'], axis = 1), target = 'label',
                                test_data = mimic_valid_ori.drop(['Unnamed: 0', 'subject_id', 'stay_id', 'hadm_id','Annotation','classes', 'CIRC_next_12h'], axis = 1), normalize=True,
                                index=False, session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Binary
3,Original data shape,"(1741165, 215)"
4,Transformed data shape,"(1741165, 215)"
5,Transformed train set shape,"(1213557, 215)"
6,Transformed test set shape,"(527608, 215)"
7,Numeric features,214
8,Preprocess,True
9,Imputation type,simple


In [6]:
from sklearn.metrics import average_precision_score
add_metric('auprc', 'AUPRC', average_precision_score, target = 'pred_proba')
best_model = compare_models(cross_validation=False, include=['lightgbm', 'xgboost', 'nb', 'lr'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC,TT (Sec)
lr,Logistic Regression,0.8081,0.6367,0.0422,0.4057,0.0764,0.0425,0.0786,0.2852,90.51
lightgbm,Light Gradient Boosting Machine,0.8022,0.6216,0.0298,0.2692,0.0537,0.0168,0.0302,0.2566,22.89
xgboost,Extreme Gradient Boosting,0.7903,0.59,0.0816,0.2942,0.1277,0.0501,0.0636,0.242,22.69
nb,Naive Bayes,0.2025,0.5071,0.9747,0.1879,0.3151,-0.0007,-0.0046,0.1905,19.83
