In [None]:
import pandas as pd

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import os
# Directory that holds the input data files. The default is the shared location
# used so far; set the ICU_DATADIR environment variable to point the notebook at
# a different copy of the data without editing this cell.
DATADIR = os.environ.get("ICU_DATADIR", "/home/shared")

In [None]:
# Read the ICU measurements into a DataFrame. Cells holding -1 or the text "nan"
# are parsed as missing values (NaN).
df_icu_2012 = pd.read_csv(
    os.path.join(DATADIR, "icu_2012.txt"),
    na_values=[-1, "nan"],
)
df_icu_2012.head()


In [None]:
# Load the outcomes table and discard the columns that are not used below.
# NOTE(review): the remaining columns are presumably RecordID and
# In-hospital_death -- confirm against the file.
df_outcome = (
    pd.read_csv(os.path.join(DATADIR, "outcomes-a.txt"))
    .drop(columns=['SAPS-I', 'SOFA', 'Length_of_stay', 'Survival'])
)

# An inner join of the two tables on 'RecordID' was tried here and then disabled:
#df_icu_deaths = pd.merge(left=df_icu_2012, right=df_outcome, left_on='RecordID', right_on='RecordID')
#df_icu_deaths = df_icu_deaths.drop(columns=['In-hospital_death'])

# The ICU table is used as the feature set as-is. Because no join is performed,
# the later split pairs rows of df_icu_deaths and df_outcome purely by position.
# NOTE(review): confirm both files list the same records in the same order.
df_icu_deaths = df_icu_2012

In [None]:
# GaussianNB cannot handle NaN, so missing values must be imputed first.
from sklearn.impute import SimpleImputer

# Each missing value is replaced by its column mean; the median might be a more
# robust choice.
# NOTE(review): the imputer is fitted on all rows, before the train/test split
# performed later in the notebook, so test rows influence the fill values.
fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_values = fill_NaN.fit_transform(df_icu_deaths)

# fit_transform returns a bare ndarray, which would turn the column names into
# anonymous integers. With default settings SimpleImputer also drops columns
# that were entirely missing; those are the ones whose entry in `statistics_`
# is NaN. Re-attach the surviving column names and the original index so the
# features stay identifiable downstream.
kept_columns = df_icu_deaths.columns[~np.isnan(fill_NaN.statistics_)]
df_icu_deaths = pd.DataFrame(
    imputed_values,
    columns=kept_columns,
    index=df_icu_deaths.index,
)

In [None]:
# Split features and labels into train and test sets, holding out 25% of the
# rows for testing. The random seed is fixed so the split is reproducible.
# Only the second column of df_outcome (position 1) is used as the label.
(
    df_icu_deaths_train,
    df_icu_deaths_test,
    df_outcome_train,
    df_outcome_test,
) = train_test_split(
    df_icu_deaths,
    df_outcome.iloc[:, 1],
    test_size=0.25,
    random_state=42,
)

In [None]:
# Display the training feature set produced by the split.
df_icu_deaths_train

In [None]:
# Gaussian naive Bayes is the classifier chosen here; other NB variants exist.
# fit() returns the fitted estimator, so it can be bound to `model` directly;
# the trailing expression keeps the estimator displayed as the cell output.
model = GaussianNB().fit(df_icu_deaths_train, df_outcome_train)
model

In [None]:
# Predict a label for every row of the held-out test features.
df_outcome_predict = model.predict(X=df_icu_deaths_test)
# Uncomment the next line to inspect the raw predictions.
#df_outcome_predict

In [None]:
from sklearn.metrics import accuracy_score

# Percentage of test rows whose predicted label matches the true label.
# NOTE(review): accuracy alone can look good on imbalanced labels;
# roc_auc_score is imported in the first cell but unused in the cells shown.
accuracy = 100 * accuracy_score(df_outcome_test, df_outcome_predict)

print(f"The accuracy is: {accuracy!s}%")


In [None]:
# It would be useful to know which features contribute most to the predictions.
# From there we could check which of those features are correlated with each other,
# possibly using PCA to eliminate the highly correlated ones.
# Removing correlated features would make the naive (feature-independence) assumption more reasonable.
# After that, we could do more feature engineering.

## Additional Things we can try/think about

- Dealing with unbalanced data (`ComplementNB`)
- Do we need to [rescale](https://scikit-learn.org/stable/modules/preprocessing.html#standardization-or-mean-removal-and-variance-scaling) our data?

In [None]:
from sklearn import preprocessing

# Scaler that linearly rescales every feature into the [0, 1] range. The range
# is spelled out explicitly; it is also scikit-learn's default.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [None]:
# Learn each column's minimum and maximum from the training features, then
# rescale those same training features with the fitted scaler.
df_icu_deaths_train_sc = min_max_scaler.fit(df_icu_deaths_train).transform(df_icu_deaths_train)

In [None]:
# Complement naive Bayes, tried here as an option for imbalanced classes.
# ComplementNB rejects negative feature values when fitting, so it is trained
# on the min-max scaled training features rather than on the raw values.
model2 = ComplementNB()
model2.fit(df_icu_deaths_train_sc, df_outcome_train)

# Scale the test features with the scaler fitted on the training data, then
# predict with model2 itself (not the GaussianNB `model` from the earlier cell).
# This overwrites df_outcome_predict, which until now held the GaussianNB
# predictions.
df_icu_deaths_test_sc = min_max_scaler.transform(df_icu_deaths_test)
df_outcome_predict = model2.predict(df_icu_deaths_test_sc)


In [None]:
# Display the (unscaled) training feature set again.
df_icu_deaths_train