In [1]:
import os
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
import itertools
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler


In [2]:
# Working directory for the notebook. The original author's local path is kept
# as the default so existing runs behave the same; set CAPSTONE_CODE_DIR to
# run the notebook on another machine without editing this cell.
CODE_DIR = os.environ.get('CAPSTONE_CODE_DIR', '/Users/chloe/Desktop/UVa/Capstone/Code')
os.chdir(CODE_DIR)
# The data path is resolved relative to CODE_DIR.
raw = pd.read_csv("../Data/moss_plos_one_data.csv")  # author-recorded shape: (2217958, 62)

In [3]:
def get_na_rate(dataframe):
    """Summarise missing values per column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null value, indexed by the
        column name, with two columns:
        'count'   - number of null entries in that column
        'percent' - null entries divided by len(dataframe) (a 0-1 fraction)
        Rows are ordered by 'percent', largest first.
    """
    missing = dataframe.isnull().sum()
    summary = pd.concat(
        [missing, missing / len(dataframe)],
        axis=1,
        keys=['count', 'percent'],
    )
    # Keep only the columns that actually contain nulls.
    has_missing = summary['percent'] > 0
    return summary[has_missing].sort_values(['percent'], ascending=False)

# Missing-value summary of the raw frame; the bare name on the last line
# makes the notebook render the table.
df_na = get_na_rate(dataframe=raw)
df_na

Unnamed: 0,count,percent
Sgy,2211055,0.996888
MET,2061720,0.929558
eid,1974558,0.890259
tte,1974558,0.890259
death,1974558,0.890259
direct,1974558,0.890259


In [4]:
# Remove the columns that are not used as model inputs. The first six are the
# ones listed in the missing-value table above; the rest are dropped by the
# author's choice. Author-recorded shape after the drop: (2217958, 56) --
# NOTE(review): not re-verified here.
all_data = raw.drop(
    labels=[
        'Sgy', 'MET', 'eid', 'tte', 'death', 'direct',
        'n_evts', 'LOS', 'ICU_Pt_Days', 'Mort',
        'age', 'race', 'svc',
    ],
    axis=1,
)

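### TODO: Some columns (e.g. Resp) contain zeros — still unresolved; check whether these are genuine readings or placeholders for missing values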

In [5]:
# Distinct ids in order of first appearance. `unique()` gives a deterministic
# order, whereas iterating a `set` can vary between runs for string ids.
id_list = all_data['id'].unique().tolist()                              # author-recorded count: 8105
# Ids that have at least one row with y == True.
id_true = all_data.loc[all_data['y'] == True, 'id'].unique().tolist()   # author-recorded count: 367
# Every remaining id. Membership is tested against a set (constant time per
# lookup) and the loop variable no longer shadows the builtin `id`.
id_true_lookup = set(id_true)
id_false = [pid for pid in id_list if pid not in id_true_lookup]        # author-recorded count: 7738

In [6]:
# Shuffle both id lists in place with a dedicated, seeded generator so the
# fold assignment below is repeatable across kernel restarts (given the lists
# arrive in the same order) and the global NumPy random state is left alone.
SHUFFLE_SEED = 21
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
shuffle_rng.shuffle(id_true)
shuffle_rng.shuffle(id_false)

In [7]:
# Cut the shuffled positive ids into five consecutive folds. The boundaries
# are fixed positions in the list; the last fold takes whatever remains.
fold_edges = [0, 73, 147, 221, 294, None]
id_true_1, id_true_2, id_true_3, id_true_4, id_true_5 = [
    id_true[start:stop] for start, stop in zip(fold_edges[:-1], fold_edges[1:])
]

In [22]:
# Seed Python's `random` module, then draw two disjoint groups of negative ids:
# 140 for the test side and 1000 (from the remaining ids) for training.
random.seed(21)
id_false_sample = random.sample(id_false, 140)
held_out_ids = set(id_false_sample)
id_false_train = random.sample(
    [pid for pid in id_false if pid not in held_out_ids],
    1000,
)

In [23]:
# Fold 1 is the held-out fold: train on positive ids from every other fold.
true_train_1 = [pid for pid in id_true if pid not in id_true_1]
# All rows belonging to the training positives / sampled training negatives.
df_train1_true = all_data.loc[all_data['id'].isin(true_train_1)]
df_train1_false = all_data.loc[all_data['id'].isin(id_false_train)]

In [24]:
# Stack the positive-id and negative-id training rows into one frame with a
# fresh 0..n-1 index, then report how many rows each part contributed.
df_train_1 = pd.concat(
    [df_train1_true, df_train1_false],
    axis=0,
    ignore_index=True,
)
print ('true shape: %d  false shape: %d' % (len(df_train1_true), len(df_train1_false)))

true shape: 145796  false shape: 256380


In [25]:
# Test rows for fold 1: the held-out positive ids plus the sampled negative ids.
df_test1_true = all_data.loc[all_data['id'].isin(id_true_1)]
df_test1_false = all_data.loc[all_data['id'].isin(id_false_sample)]
# The original row index is kept here (no ignore_index).
df_test_1 = pd.concat([df_test1_true, df_test1_false], axis=0)
print ('true shape: %d  false shape: %d' % (len(df_test1_true), len(df_test1_false)))

true shape: 43788  false shape: 40317


In [26]:
# Split the fold-1 training frame into target and features.
y_train_1 = df_train_1.y
x_train_1 = df_train_1.drop(['y'], axis=1)

# Oversample with SMOTE. Current imbalanced-learn releases expose
# `fit_resample`; the older `fit_sample` name is used only as a fallback so
# the cell runs on both old and new installations.
sm = SMOTE(random_state=42)
smote_resample = sm.fit_resample if hasattr(sm, 'fit_resample') else sm.fit_sample
X_train, y_train = smote_resample(x_train_1, y_train_1)
# NOTE(review): x_train_1 still contains the 'id' column, so SMOTE treats it as a feature.
# NOTE(review): the model cells visible below fit on x_train_1 / y_train_1, not on
# the resampled X_train / y_train -- confirm whether that is intended.
print ('After SMOTE "True":%d "False": %d' % ((y_train == 1).sum(), (y_train == 0).sum()))

After SMOTE "True":382831 "False": 382831


In [27]:
# Target and feature frame for the fold-1 test set.
y_test_1 = df_test_1['y']
x_test_1 = df_test_1.drop(labels='y', axis=1)

### Logistic Regression

In [28]:
# Logistic regression with library-default settings; the same estimator object
# is re-fitted in each of the following cells on a different feature group.
lr = LogisticRegression()

In [29]:
#  Logistic regression on the vital-sign (VS) feature group
cols = ["Pulse", "O2.Flow", "Resp", "SpO2", "SBP", "Glasgow.Coma.Scale.Total"]
X_train_1, X_test_1 = x_train_1[cols], x_test_1[cols]
# fit() returns the estimator itself, so `lr` stays bound to the fitted model.
pred = lr.fit(X_train_1, y_train_1).predict_proba(X_test_1)[:, 1]
# The last expression is displayed: ROC AUC on the fold-1 test set.
roc_auc_score(y_test_1, pred)



0.567035977204615

In [30]:
#  Logistic regression on the lab-result feature group
cols = [
    "WHITE.BLOOD.CELL.COUNT", "BLOOD.UREA.NITROGEN", "AST.GOT",
    "PLATELET.COUNT", "GLUCOSE", "PCO2", "POTASSIUM", "SODIUM", "CO2",
]
X_train_1, X_test_1 = x_train_1[cols], x_test_1[cols]
# fit() returns the estimator itself, so `lr` stays bound to the fitted model.
pred = lr.fit(X_train_1, y_train_1).predict_proba(X_test_1)[:, 1]
# The last expression is displayed: ROC AUC on the fold-1 test set.
roc_auc_score(y_test_1, pred)



0.5266850271578871

In [31]:
#  Logistic regression on the ECG feature group
cols = [
    'hr', 's2.hr', 's8.hr', 's24.hr',
    'n.edrk', 'edrk', 's2.edrk', 's8.edrk', 's24.edrk',
    'srr', 'dfa', 'cosen', 'lds', 'af', 'AF',
]
X_train_1, X_test_1 = x_train_1[cols], x_test_1[cols]
# fit() returns the estimator itself, so `lr` stays bound to the fitted model.
pred = lr.fit(X_train_1, y_train_1).predict_proba(X_test_1)[:, 1]
# The last expression is displayed: ROC AUC on the fold-1 test set.
roc_auc_score(y_test_1, pred)



0.5901092732333542

### Random Forest

In [32]:
def c_stat(model, x_train_1, x_test_1, y_train_1, y_test_1, cols):
    """Fit `model` on a column subset and return its test-set ROC AUC.

    Parameters
    ----------
    model : estimator providing `fit` and `predict_proba`
        Fitted in place by this call.
    x_train_1, x_test_1 : indexable by `cols` (e.g. pandas.DataFrame)
        Training and test features.
    y_train_1, y_test_1 : array-like
        Training and test labels.
    cols : list
        Column labels selecting the features to use.

    Returns
    -------
    The value of `roc_auc_score(y_test_1, scores)`, where `scores` is column
    index 1 of `model.predict_proba` on the selected test columns.
    """
    fitted = model.fit(x_train_1[cols], y_train_1)
    scores = fitted.predict_proba(x_test_1[cols])[:, 1]
    return roc_auc_score(y_test_1, scores)

In [33]:
#  Random forest on the vital-sign (VS) feature group
cols = ["Pulse", "O2.Flow", "Resp", "SpO2", "SBP", "Glasgow.Coma.Scale.Total"]
# 100 trees, all CPU cores, fixed seed; reused by the following forest cells.
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
c_stat(
    model=forest,
    x_train_1=x_train_1,
    x_test_1=x_test_1,
    y_train_1=y_train_1,
    y_test_1=y_test_1,
    cols=cols,
)

0.5262476717322511

In [34]:
#  Random forest on the lab-result feature group
cols = [
    "WHITE.BLOOD.CELL.COUNT", "BLOOD.UREA.NITROGEN", "AST.GOT",
    "PLATELET.COUNT", "GLUCOSE", "PCO2", "POTASSIUM", "SODIUM", "CO2",
]
c_stat(
    model=forest,
    x_train_1=x_train_1,
    x_test_1=x_test_1,
    y_train_1=y_train_1,
    y_test_1=y_test_1,
    cols=cols,
)

0.552940568946593

In [35]:
#  Random forest on the ECG feature group
cols = [
    'hr', 's2.hr', 's8.hr', 's24.hr',
    'n.edrk', 'edrk', 's2.edrk', 's8.edrk', 's24.edrk',
    'srr', 'dfa', 'cosen', 'lds', 'af', 'AF',
]
c_stat(
    model=forest,
    x_train_1=x_train_1,
    x_test_1=x_test_1,
    y_train_1=y_train_1,
    y_test_1=y_test_1,
    cols=cols,
)

0.5607918832795461

### SVM

In [None]:
#  SVM on the vital-sign (VS) feature group
cols = ["Pulse", "O2.Flow", "Resp", "SpO2", "SBP", "Glasgow.Coma.Scale.Total"]
# Standardise only the columns the model uses; the scaler is fitted on the
# training rows and re-applied to the test rows. StandardScaler returns a plain
# array, so the result is wrapped back into a DataFrame: c_stat selects
# features by column label.
sc = StandardScaler()
X_train_sample = pd.DataFrame(
    sc.fit_transform(x_train_1[cols]), columns=cols, index=x_train_1.index
)
X_test_sample = pd.DataFrame(
    sc.transform(x_test_1[cols]), columns=cols, index=x_test_1.index
)
# probability=True is required because c_stat calls predict_proba.
# NOTE(review): SVC may be very slow on a training frame of this size -- confirm feasibility.
svm = SVC(kernel='linear', probability=True, random_state=0)
c_stat(svm, X_train_sample, X_test_sample, y_train_1, y_test_1, cols)

In [None]:
#  SVM on the lab-result feature group
cols = ["WHITE.BLOOD.CELL.COUNT","BLOOD.UREA.NITROGEN","AST.GOT",
        "PLATELET.COUNT","GLUCOSE","PCO2","POTASSIUM","SODIUM","CO2"]
# Build the inputs from the full feature frames (X_train_1 only holds the
# columns of the last logistic-regression cell) and standardise train and test
# with one scaler fitted on the training rows. The scaled arrays are wrapped
# back into DataFrames because c_stat selects features by column label.
sc_lab = StandardScaler()
X_train_lab = pd.DataFrame(
    sc_lab.fit_transform(x_train_1[cols]), columns=cols, index=x_train_1.index
)
X_test_lab = pd.DataFrame(
    sc_lab.transform(x_test_1[cols]), columns=cols, index=x_test_1.index
)
c_stat(svm, X_train_lab, X_test_lab, y_train_1, y_test_1, cols)

In [None]:
#  SVM on the ECG feature group
cols = ['hr', 's2.hr', 's8.hr', 's24.hr', 'n.edrk',
       'edrk', 's2.edrk', 's8.edrk', 's24.edrk', 'srr', 'dfa', 'cosen', 'lds',
       'af', 'AF']
# Standardise train and test with one scaler fitted on the training rows; the
# scaled arrays are wrapped back into DataFrames because c_stat selects
# features by column label.
sc_ecg = StandardScaler()
X_train_ecg = pd.DataFrame(
    sc_ecg.fit_transform(x_train_1[cols]), columns=cols, index=x_train_1.index
)
X_test_ecg = pd.DataFrame(
    sc_ecg.transform(x_test_1[cols]), columns=cols, index=x_test_1.index
)
# Evaluate the SVM here; the previous version of this cell passed `forest`,
# which only repeated the random-forest ECG run.
c_stat(svm, X_train_ecg, X_test_ecg, y_train_1, y_test_1, cols)