In [1]:
import os
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
import itertools
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [2]:
raw = pd.read_csv("./moss_plos_one_data.csv")  # (2217958, 62)


In [3]:
def get_na_rate(dataframe):
    na_count = dataframe.isnull().sum()
    na_rate = na_count / len(dataframe)
    df_na = pd.concat([na_count, na_rate], axis=1, keys=['count', 'percent'])
    df_na = df_na[df_na['percent']>0]
    df_na = df_na.sort_values(['percent'], ascending=False)
    return df_na

df_na = get_na_rate(raw)
df_na


Unnamed: 0,count,percent
Sgy,2211055,0.996888
MET,2061720,0.929558
eid,1974558,0.890259
tte,1974558,0.890259
death,1974558,0.890259
direct,1974558,0.890259


In [4]:
all_data = raw.drop(['Sgy', 'MET','eid','tte','death','direct','n_evts','LOS', 'ICU_Pt_Days', 'Mort', 'age', 'race', 'svc'], axis=1)  # (2217958, 56)

In [5]:
all_data["num"] = all_data.groupby("id").cumcount()

In [6]:
all_data = all_data[all_data.num % 4 == 0]
all_data.drop(["num"], axis=1,inplace=True)

In [7]:
all_data.shape

(557555, 49)

### ??? Some columns (Resp) have zeros

In [8]:
id_list = list(set(all_data['id']))  # 8105
id_true = list(set(all_data[all_data['y'] == True].id))       # 367
id_false = [id for id in id_list if id not in id_true]        # 7738


In [9]:
np.random.shuffle(id_true)
np.random.shuffle(id_false)


In [35]:
id_true_1 = id_true[0:73]
id_true_2 = id_true[73:147]
id_true_3 = id_true[147:221]
id_true_4 = id_true[221:294]
id_true_5 = id_true[294:]
id_false_1 = id_false[0:1547]
id_false_2 = id_false[1547:3094]
id_false_3 = id_false[3094:4641]
id_false_4 = id_false[4641:6188]
id_false_5 = id_false[6188:]
log_vs_id = []
log_lab_id = []
log_ecg_id = []
svm_vs_id = []
svm_lab_id = []
svm_ecg_id = []
rf_vs_id = []
rf_lab_id = []
rf_ecg_id = []
#=================
log_vs_roc = []
log_lab_roc = []
log_ecg_roc = []
rf_vs_roc = []
rf_lab_roc = []
rf_ecg_roc = []
svm_vs_roc = []
svm_lab_roc = []
svm_ecg_roc = []


In [36]:
for i in range(5):
    vs_unique = []
    lab_unique = []
    ecg_unique = []
    print()
    print("new loop starts============")
    if i ==0:
        id_true_0 = id_true_1
        id_false_sample = id_false_1
    if i ==1:
        id_true_0 = id_true_2
        id_false_sample = id_false_2
    if i ==2:
        id_true_0 = id_true_3
        id_false_sample = id_false_3
    if i ==3:
        id_true_0 = id_true_4
        id_false_sample = id_false_4
    if i ==4:
        id_true_0 = id_true_5
        id_false_sample = id_false_5
    #5 folds cross validation
    id_false_train =[id for id in id_false if id not in id_false_sample]
    true_train_1 = [id for id in id_true if id not in id_true_0]
    df_train1_true = all_data[all_data['id'].isin(true_train_1)]      
    df_train1_false = all_data[all_data['id'].isin(id_false_train)] 
    df_train_1 = pd.concat([df_train1_true, df_train1_false], ignore_index=True, axis=0)  # (411272, 56)
    print ('true shape: %d  false shape: %d'%(df_train1_true.shape[0], df_train1_false.shape[0]))
    df_test1_true = all_data[all_data['id'].isin(id_true_0)]
    df_test1_false = all_data[all_data['id'].isin(id_false_sample)]
    df_test_1 = pd.concat([df_test1_true, df_test1_false], axis=0)
    print ('true shape: %d  false shape: %d'%(df_test1_true.shape[0], df_test1_false.shape[0]))
    y_train_1 = df_train_1.y
    x_train_1 = df_train_1.drop(['y'], axis=1)
    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_sample(x_train_1, y_train_1)
    print ('After SMOTE "True":%d "False": %d' % (sum(y_train == 1), sum(y_train == 0)))
    y_test_1 = df_test_1.y
    patient_id = df_test_1.id
    x_test_1 = df_test_1.drop(['y'], axis=1)
    id_array = patient_id.values
    
    #=========
    #logictic regression
    #==========
    lr = LogisticRegression()
    #  Predict with all VS variables
    cols = ["Pulse","O2.Flow","Resp","SpO2","SBP","Glasgow.Coma.Scale.Total"]
    X_train_1 = x_train_1[cols]
    X_test_1 = x_test_1[cols]
    lr = lr.fit(X_train_1, y_train_1)
    pred = lr.predict_proba(X_test_1)[:, 1]
    new_id = list(set(id_array[pred.argsort()[-len(pred)//100:][::-1]]))
    true_patient = [i for i in new_id if i in id_true]
    false_patient = [i for i in new_id if i in id_false]
    print("patient found by log vs:",len(true_patient))
    print("non_event patient found by log vs:",len(false_patient))
    log_vs_roc.append(roc_auc_score(y_test_1, pred))
    log_vs_id += true_patient
    
    #  Predict with all lab results
    cols = ["WHITE.BLOOD.CELL.COUNT","BLOOD.UREA.NITROGEN","AST.GOT",
        "PLATELET.COUNT","GLUCOSE","PCO2","POTASSIUM","SODIUM","CO2"]
    X_train_1 = x_train_1[cols]
    X_test_1 = x_test_1[cols]
    lr = lr.fit(X_train_1, y_train_1)
    pred = lr.predict_proba(X_test_1)[:, 1]
    new_id = list(set(id_array[pred.argsort()[-len(pred)//100:][::-1]]))
    true_patient = [i for i in new_id if i in id_true]
    false_patient = [i for i in new_id if i in id_false]
    log_lab_roc.append(roc_auc_score(y_test_1, pred))
    print("patient found by log lab:",len(true_patient))
    print("non_event patient found by log lab:",len(false_patient))
    log_lab_id += true_patient
    #===========================
    #  Predict with ECG results
    cols = ['hr', 's2.hr', 's8.hr', 's24.hr', 'n.edrk',
           'edrk', 's2.edrk', 's8.edrk', 's24.edrk', 'srr', 'dfa', 'cosen', 'lds',
           'af', 'AF']
    X_train_1 = x_train_1[cols]
    X_test_1 = x_test_1[cols]
    lr = lr.fit(X_train_1, y_train_1)
    pred = lr.predict_proba(X_test_1)[:, 1]
    log_ecg_roc.append(roc_auc_score(y_test_1, pred))
    new_id = list(set(id_array[pred.argsort()[-len(pred)//100:][::-1]]))
    true_patient = [i for i in new_id if i in id_true]
    false_patient = [i for i in new_id if i in id_false]
    print("patient found by log ecg:",len(true_patient))
    print("non_event patient found by log ecg:",len(false_patient))
    log_ecg_id += true_patient
#     #=========
#     #Randomforest
#     #=========
    def c_stat(model, x_train_1, x_test_1, y_train_1, y_test_1, cols):
        X_train_1 = x_train_1[cols]
        X_test_1 = x_test_1[cols]
        model = model.fit(X_train_1, y_train_1)
        pred = model.predict_proba(X_test_1)[:, 1]
        return pred, roc_auc_score(y_test_1, pred)
    #  Predict with all VS variables
    forest = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state=0)
    cols = ["Pulse","O2.Flow","Resp","SpO2","SBP","Glasgow.Coma.Scale.Total"]
    pred, roc = c_stat(forest, x_train_1, x_test_1, y_train_1, y_test_1, cols)
    rf_vs_roc.append(roc)
    new_id = list(set(id_array[pred.argsort()[-len(pred)//100:][::-1]]))
    true_patient = [i for i in new_id if i in id_true]
    false_patient = [i for i in new_id if i in id_false]
    print("patient found by rf vs:",len(true_patient))
    print("non_event patient found by rf vs:",len(false_patient))
    rf_vs_id += true_patient
    
    #  Predict with all lab results
    cols = ["WHITE.BLOOD.CELL.COUNT","BLOOD.UREA.NITROGEN","AST.GOT",
            "PLATELET.COUNT","GLUCOSE","PCO2","POTASSIUM","SODIUM","CO2"]
    pred, roc = c_stat(forest, x_train_1, x_test_1, y_train_1, y_test_1, cols)
    rf_lab_roc.append(roc)
    new_id = list(set(id_array[pred.argsort()[-len(pred)//100:][::-1]]))
    true_patient = [i for i in new_id if i in id_true]
    false_patient = [i for i in new_id if i in id_false]
    print("patient found by rf lab:",len(true_patient))
    print("non_event patient found by rf lab:",len(false_patient))
    rf_lab_id += true_patient
    
    #  Predict with all ECG results
    cols = ['hr', 's2.hr', 's8.hr', 's24.hr', 'n.edrk',
           'edrk', 's2.edrk', 's8.edrk', 's24.edrk', 'srr', 'dfa', 'cosen', 'lds',
           'af', 'AF']
    pred, roc = c_stat(forest, x_train_1, x_test_1, y_train_1, y_test_1, cols)
    rf_ecg_roc.append(roc)
    new_id = list(set(id_array[pred.argsort()[-len(pred)//100:][::-1]]))
    true_patient = [i for i in new_id if i in id_true]
    false_patient = [i for i in new_id if i in id_false]
    print("patient found by rf ecg:",len(true_patient))
    print("non_event patient found by rf ecg:",len(false_patient))
    rf_ecg_id += true_patient
    
    #=======
    #naive bayes
    #=======
    #  Predict with all VS variables
    svm = GaussianNB()
    cols = ["Pulse","O2.Flow","Resp","SpO2","SBP","Glasgow.Coma.Scale.Total"]
    pred, roc = c_stat(svm, x_train_1, x_test_1, y_train_1, y_test_1, cols)
    svm_vs_roc.append(roc)
    
    new_id = list(set(id_array[pred.argsort()[-len(pred)//100:][::-1]]))
    true_patient = [i for i in new_id if i in id_true]
    false_patient = [i for i in new_id if i in id_false]
    print("patient found by nb vs:",len(true_patient))
    print("non_event patient found by nb vs:",len(false_patient))
    svm_vs_id += true_patient
    
    #  Predict with all lab variables
    cols = ["WHITE.BLOOD.CELL.COUNT","BLOOD.UREA.NITROGEN","AST.GOT",
            "PLATELET.COUNT","GLUCOSE","PCO2","POTASSIUM","SODIUM","CO2"]
    pred, roc = c_stat(svm, x_train_1, x_test_1, y_train_1, y_test_1, cols)
    svm_lab_roc.append(roc)
    new_id = list(set(id_array[pred.argsort()[-len(pred)//100:][::-1]]))
    true_patient = [i for i in new_id if i in id_true]
    false_patient = [i for i in new_id if i in id_false]
    print("patient found by nb labs:",len(true_patient))
    print("non_event patient found by nb labs:",len(false_patient))
    svm_lab_id += true_patient

    #  Predict with all ECG results
    cols = ['hr', 's2.hr', 's8.hr', 's24.hr', 'n.edrk',
           'edrk', 's2.edrk', 's8.edrk', 's24.edrk', 'srr', 'dfa', 'cosen', 'lds',
           'af', 'AF']
    pred, roc = c_stat(svm, x_train_1, x_test_1, y_train_1, y_test_1, cols)
    svm_ecg_roc.append(roc)
    new_id = list(set(id_array[pred.argsort()[-len(pred)//100:][::-1]]))
    true_patient = [i for i in new_id if i in id_true]
    false_patient = [i for i in new_id if i in id_false]
    print("patient found by nb ecg:",len(true_patient))
    print("non_event patient found by nb ecg:",len(false_patient))
    svm_ecg_id += true_patient


true shape: 37635  false shape: 407666
true shape: 9905  false shape: 102349
After SMOTE "True":440620 "False": 440620
patient found by log vs: 27
non_event patient found by log vs: 92
patient found by log lab: 11
non_event patient found by log lab: 30
patient found by log ecg: 29
non_event patient found by log ecg: 194
patient found by rf vs: 31
non_event patient found by rf vs: 218
patient found by rf lab: 10
non_event patient found by rf lab: 67
patient found by rf ecg: 34
non_event patient found by rf ecg: 361
patient found by nb vs: 23
non_event patient found by nb vs: 91
patient found by nb labs: 9
non_event patient found by nb labs: 24
patient found by nb ecg: 32
non_event patient found by nb ecg: 326

true shape: 38711  false shape: 410334
true shape: 8829  false shape: 99681
After SMOTE "True":444168 "False": 444168
patient found by log vs: 28
non_event patient found by log vs: 104
patient found by log lab: 6
non_event patient found by log lab: 22
patient found by log ecg: 22

array([68862, 75385, 23823, ...,  3121, 53487, 95156])

In [37]:
vs =log_vs_id + svm_vs_id  +rf_vs_id
len(list(set(vs)))

188

In [38]:
labs =log_lab_id + svm_lab_id  +rf_lab_id
len(list(set(labs)))

92

In [39]:
ecg =log_ecg_id + svm_ecg_id  +rf_ecg_id
len(list(set(ecg)))

220

In [40]:
logist = log_vs_id + log_lab_id +log_ecg_id
len(list(set(logist)))

199

In [41]:
rondonfor= rf_vs_id + rf_lab_id +rf_ecg_id
len(list(set(rondonfor)))

227

In [42]:
nb = svm_lab_id + svm_lab_id + svm_ecg_id
len(list(set(nb)))

163

In [43]:
print(np.mean(log_ecg_roc),np.mean(log_lab_roc),np.mean(log_vs_roc))

0.6406928058828167 0.6181528968577595 0.6834974898360079


In [44]:
print(np.mean(rf_ecg_roc),np.mean(rf_lab_roc),np.mean(rf_vs_roc))

0.5771238820882061 0.6070220329653255 0.5832416491216695


In [45]:
print(np.mean(svm_ecg_roc),np.mean(svm_lab_roc),np.mean(svm_vs_roc))

0.6360463598717382 0.6151463615287016 0.681679470451214
