In [1]:
import pandas as pd
import numpy as np
import time
from imblearn.over_sampling import SMOTE

In [2]:
def check_class_ratio(dataset):
    class_ratio = round(np.mean(dataset.classes), 2)
    return class_ratio

def random_split_stay(df, train_ratio, Threshold, n_trial):
    data = df.copy()
    
    search_time = time.time()
    
    for T in range(n_trial):
        array = data.subject_id.unique()
        
        # seed = np.random.randint(0, 10000, 1)
        seed = 9756
        np.random.seed(seed) 
        np.random.shuffle(array)


        split_point = int(train_ratio * len(array))
        stay_for_train, stay_for_test = np.split(array, [split_point])

        

        condition_train = data.subject_id.isin(stay_for_train)
        holdout_train = data[condition_train]

        condition_test = data.subject_id.isin(stay_for_test)
        holdout_test = data[condition_test]
        
        # holdout_test = holdout_test.sort_values(by=['subject_id',  'stay_id', 'Time_since_ICU_admission']) 
        # holdout_train = holdout_train.sort_values(by=['subject_id',  'stay_id', 'Time_since_ICU_admission']) 

        train_class_ratio  = check_class_ratio(holdout_train)
        test_class_ratio  = check_class_ratio(holdout_test)
                
            
        if (train_class_ratio - test_class_ratio) >= Threshold:
            
            break  # 클래스 비율이 모두 threshold 이상일 경우 반복문 종료
        
        if T % 100 == 0:
            print('Trial: ', T)
            
        if T % 10000 == 0:
        
            Threshold = Threshold + 0.05
            print('Threshold 조정 + 0.05, 현재 한계값: {}'.format(Threshold))
        
        if T == 9999:
            print('최대 Trial 달성, 분할 불가')
        
    train = holdout_train.copy()
    test = holdout_test.copy()
    search_time_end = time.time()
    
    trn_class1 = train.classes.value_counts()[0]
    trn_class2 = train.classes.value_counts()[1]
    
    tes_class1 = test.classes.value_counts()[0]
    tes_class2 = test.classes.value_counts()[1]

    
    
    print('train set : test set = {} : {}'.format(train_ratio, 1-train_ratio))
    print('Train set class: ', train.classes.value_counts().sort_index())
    print('Test set class: ', test.classes.value_counts().sort_index())
    print('-'*20)
    print('Train class ratio: {}:{}'.format((trn_class1)/(trn_class1+trn_class2), (trn_class2)/(trn_class1+trn_class2)))
    print('Test class ratio: {}:{}'.format((tes_class1)/(tes_class1+tes_class2), (tes_class2)/(tes_class1+tes_class2)))
    print('-'*20)
    print('Number of trainset patient:', len(train.subject_id.unique()))
    print('Number of testset patient:', len(test.subject_id.unique()))
    print('Number of trainset stay:', len(train.stay_id.unique()))
    print('Number of testset stay:', len(test.stay_id.unique()))
    print('-'*20)
    print('Split seed: ',seed)
    print('train ratio:', train_ratio)
    print('Threshold:', Threshold)
    print('-'*20)
    print('총 소요 시간(초):{}'.format(search_time_end - search_time))
    print('시도한 trial 수: ', T)
    
    return train, test, stay_for_train, stay_for_test

In [3]:
dataset = pd.read_csv('/Users/DAHS/MIMIC-IV-Data-Pipeline/MIMIC_pipeline/mimic_df.csv', index_col = 0, compression='gzip')

In [4]:
dataset.replace([np.inf, -np.inf], np.nan, inplace=True)

dataset['pao2/fio2_fillna']=0
idx = dataset[dataset['pao2/fio2'].isnull()].index
dataset['pao2/fio2'].loc[idx]=0
dataset['pao2/fio2_fillna'].loc[idx]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['pao2/fio2'].loc[idx]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['pao2/fio2_fillna'].loc[idx]=1


In [21]:
len(dataset)

1325997

In [5]:
# dataset = pd.read_csv('./Data/final/mimic-v_circ12h.csv', index_col = 0)
early_prediction = dataset[(dataset['classes']==0)|(dataset['classes']==1)]
mimic_train_ori, mimic_valid_ori, trn_patient, val_patient = random_split_stay(early_prediction, 0.7, Threshold=0.05, n_trial=1) 
mimic_train_ori = mimic_train_ori.reset_index(drop=True)
mimic_valid_ori = mimic_valid_ori.reset_index(drop=True)

Trial:  0
Threshold 조정 + 0.05, 현재 한계값: 0.1
train set : test set = 0.7 : 0.30000000000000004
Train set class:  0    753345
1     16786
Name: classes, dtype: int64
Test set class:  0    324233
1      6822
Name: classes, dtype: int64
--------------------
Train class ratio: 0.9782037082002932:0.021796291799706803
Test class ratio: 0.9793931521952546:0.020606847804745435
--------------------
Number of trainset patient: 5525
Number of testset patient: 2368
Number of trainset stay: 6065
Number of testset stay: 2621
--------------------
Split seed:  9756
train ratio: 0.7
Threshold: 0.1
--------------------
총 소요 시간(초):0.5920193195343018
시도한 trial 수:  0


In [7]:
len(dataset)

818286

In [8]:
embset = pd.read_csv('/Users/DAHS/MIMIC-IV-Data-Pipeline/MIMIC_pipeline/supervised_contrastive_learning/embedding_data.csv', index_col = 0)
embset = embset.rename(columns ={'184':'classes'})

eep_emb = embset[(embset['classes']==0)|(embset['classes']==1)]

mimic_train_emb, mimic_valid_emb, trn_patient, val_patient = random_split_stay(eep_emb, 0.7, Threshold=0.05, n_trial=1) 
mimic_train_emb = mimic_train_emb.reset_index(drop=True)
mimic_valid_emb = mimic_valid_emb.reset_index(drop=True)

for_feature = mimic_train_ori.drop(['subject_id', 'stay_id', 'hadm_id','Annotation','circ_next_12h', 'classes'], axis = 1)
for_feature_v = mimic_valid_ori.drop(['subject_id', 'stay_id', 'hadm_id','Annotation','circ_next_12h', 'classes'], axis = 1)

mimic_train_feature = pd.concat([for_feature, mimic_train_emb], axis = 1)
mimic_valid_feature = pd.concat([for_feature_v, mimic_valid_emb], axis = 1)

Trial:  0
Threshold 조정 + 0.05, 현재 한계값: 0.1
train set : test set = 0.7 : 0.30000000000000004
Train set class:  0    393779
1     16667
Name: classes, dtype: int64
Test set class:  0    175301
1      6946
Name: classes, dtype: int64
--------------------
Train class ratio: 0.9593929530315802:0.04060704696841972
Test class ratio: 0.9618868897704763:0.03811311022952367
--------------------
Number of trainset patient: 3525
Number of testset patient: 1512
Number of trainset stay: 3749
Number of testset stay: 1617
--------------------
Split seed:  49
train ratio: 0.7
Threshold: 0.1
--------------------
총 소요 시간(초):0.6470751762390137
시도한 trial 수:  0


In [6]:
smote = SMOTE(random_state=42)

X_train_original = mimic_train_ori.drop(['subject_id', 'stay_id', 'hadm_id','Annotation','circ_next_12h', 'classes'], axis = 1)
y_train_original = mimic_train_ori[['classes']].copy()

X_train_over, y_train_over = smote.fit_resample(X_train_original, y_train_original)
print("SMOTE 적용 전 학습용 피처/레이블 데이터 세트 : ", X_train_original.shape, y_train_original.shape)
print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트 :', X_train_over.shape, y_train_over.shape)
print('SMOTE 적용 후 값의 분포 :\n', y_train_over.value_counts() )

SMOTE 적용 전 학습용 피처/레이블 데이터 세트 :  (770131, 172) (770131, 1)
SMOTE 적용 후 학습용 피처/레이블 데이터 세트 : (1506690, 172) (1506690, 1)
SMOTE 적용 후 값의 분포 :
 classes
0          753345
1          753345
dtype: int64


In [None]:
# 데이터 프레임 저장
# emb set에 대해서도 smote 수행 및 csv 저장 하기
# UMAP 찍기(돌려놧엇음)

In [7]:
augmented_train_df = pd.concat([X_train_over.drop('Unnamed: 0', axis=1), y_train_over], axis = 1)
augmented_train_df.to_csv('augmented_train_df(input_space).csv', compression='gzip')

In [13]:
smote = SMOTE(random_state=42)

X_train_feature = mimic_train_feature.drop(['classes'], axis = 1)
y_train_feature = mimic_train_feature[['classes']].copy()

X_train_over_feat, y_train_over_feat = smote.fit_resample(X_train_feature, y_train_feature)
print("SMOTE 적용 전 학습용 피처/레이블 데이터 세트 : ", X_train_feature.shape, y_train_feature.shape)
print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트 :', X_train_over_feat.shape, y_train_over_feat.shape)
print('SMOTE 적용 후 값의 분포 :\n', X_train_over_feat.value_counts())

SMOTE 적용 전 학습용 피처/레이블 데이터 세트 :  (410446, 407) (410446, 1)
SMOTE 적용 후 학습용 피처/레이블 데이터 세트 : (787558, 407) (787558, 1)
SMOTE 적용 후 값의 분포 :
 Unnamed: 0  Time_since_ICU_admission  Epinephrine  Nitroglycerin  Dopamine  Vasopressin  Dextrose_5%  HeparinSodium  Norepinephrine  Potassium_Chloride  Phenylephrine  Dobutamine  Phenylephrine (50/250)  Cisatracurium  Esmolol  Milrinone  Diltiazem  Labetalol  metoprolol  Vecuronium  Nitroprusside  Tirofiban  Rocuronium  Phenylephrine (200/250)  Nesiritide  Digoxin  Aminophylline  Epinephrine_rate  Nitroglycerin_rate  Dopamine_rate  Vasopressin_rate  Dextrose_5%_rate  HeparinSodium_rate  Norepinephrine_rate  Phenylephrine_rate  Dobutamine_rate  Phenylephrine (50/250)_rate  Cisatracurium_rate  Esmolol_rate  Milrinone_rate  Diltiazem_rate  Labetalol_rate  Vecuronium_rate  Nitroprusside_rate  Tirofiban_rate  Rocuronium_rate  Phenylephrine (200/250)_rate  Invasive_Ventilation  blood_cultured  CXR  None_Invasive_Ventilation  EKG  MRI  Urine_Output_out  verba

In [16]:
augmented_train_df_feat = pd.concat([X_train_over_feat.drop('Unnamed: 0', axis=1), y_train_over_feat], axis = 1)
augmented_train_df_feat.to_csv('augmented_train_df(feature_space).csv', compression='gzip')