In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import time
from sklearn.naive_bayes import GaussianNB, ComplementNB
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, roc_curve, auc
%matplotlib inline


## Load Data

In [55]:
data = pd.read_csv('data_pre_processed.csv')

In [59]:
data.first_careunit.min()

-1.7389533620324669

In [75]:
newdata = data.copy()
for i in newdata.columns:
    newdata[i] = newdata[i]+(newdata[i].min()*-1)

In [76]:
newdata.head()

Unnamed: 0,first_careunit,last_careunit,age,age_category,gender,marital_status,insurance,urea_n_min,urea_n_max,urea_n_mean,...,spo2_max_mv,spo2_mean_mv,vent_mv,rrt_mv,urineoutput_mv,oasis_mv,lods_mv,sirs_mv,thirty_days,one_year
0,2.801143,2.834418,1.142646,6.0,2.013841,4.793453,4.921145,1.126847,1.636261,1.412541,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.400571,2.125813,0.946329,5.0,0.0,4.793453,4.921145,1.20197,2.855437,2.777256,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.801143,2.834418,0.498466,3.0,2.013841,1.597818,2.460572,1.20197,0.99459,1.125983,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,1.400571,2.125813,1.065603,6.0,2.013841,4.793453,4.921145,2.103448,1.251259,1.628341,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,0.700286,1.417209,0.462784,2.0,0.0,1.597818,2.460572,0.976601,0.641671,0.781508,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Separate Data into train test sets*

In [98]:
K = 5
eval_size = int(np.round(1./K))
skf = StratifiedKFold(n_splits=K)
train, test = train_test_split(newdata,test_size=0.2)
X_train = train.drop(['thirty_days','one_year'],axis=1).values
y_30day_train = train.thirty_days.values
y_1yr_train = train.one_year.values
X_test = test.drop(['thirty_days','one_year'],axis=1).values
y_30day_test = test.thirty_days.values
y_1yr_test = test.one_year.values

In [105]:
for x,y in skf.split(newdata.drop(['thirty_days','one_year'],axis=1).values, newdata.thirty_days.values):
    print(x)
    print(y)

[10949 10950 10953 ... 57993 57994 57995]
[    0     1     2 ... 11732 11733 11734]
[    0     1     2 ... 57993 57994 57995]
[10949 10950 10953 ... 23419 23420 23421]
[    0     1     2 ... 57993 57994 57995]
[21912 21925 21945 ... 35042 35043 35044]
[    0     1     2 ... 57993 57994 57995]
[33183 33203 33215 ... 46534 46535 46536]
[    0     1     2 ... 46534 46535 46536]
[45394 45395 45396 ... 57993 57994 57995]


# Naive Bayes

### Generate Gaussian Model and Fit Data

In [81]:
NBmodel30 = GaussianNB()
NBmodel1yr = GaussianNB()
NBmodel30.fit(X_train,y_30day_train)
NBmodel1yr.fit(X_train,y_1yr_train)

GaussianNB(priors=None, var_smoothing=1e-09)

### Generate Complement Model and Fit Data

In [94]:
NBCmodel30 = ComplementNB(alpha = 3)
NBCmodel1yr = ComplementNB(alpha= 3)
NBCmodel30.fit(X_train,y_30day_train)
NBCmodel1yr.fit(X_train,y_1yr_train)

ComplementNB(alpha=3, class_prior=None, fit_prior=True, norm=False)

### Test Performance

In [95]:
print(NBmodel30.score(X_test,y_30day_test))
print(NBmodel1yr.score(X_test,y_1yr_test))

0.756896551724138
0.7424137931034482


In [96]:
print(NBCmodel30.score(X_test,y_30day_test))
print(NBCmodel1yr.score(X_test,y_1yr_test))

0.7757758620689655
0.7311206896551724


In [111]:
NBmodel30.predict_proba(X_test)

array([[1.00000000e+00, 2.82296731e-16],
       [9.74888813e-01, 2.51111871e-02],
       [1.00000000e+00, 3.73147843e-20],
       ...,
       [1.00000000e+00, 1.01875828e-11],
       [9.99998681e-01, 1.31912373e-06],
       [4.68841937e-03, 9.95311581e-01]])

# kNN

### Generate Model and Fit Data

In [112]:
KNmodel30 = KNeighborsRegressor()
KNmodel1yr =  KNeighborsRegressor()
KNmodel1yr.fit(X_train,y_1yr_train)
KNmodel30.fit(X_train,y_30day_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

### Test Performance

In [89]:
output = KNmodel30.predict_proba(X_test)

In [103]:
pd.DataFrame(output)[1].value_counts()

0.0    8828
0.2    1175
1.0     634
0.4     429
0.6     322
0.8     212
Name: 1, dtype: int64

In [78]:
KNmodel30.score(X_test,y_30day_test)

0.9216379310344828

In [79]:
KNmodel1yr.score(X_test,y_1yr_test)

0.8496551724137931

# XGboost