<a href="https://colab.research.google.com/github/MIT-LCP/bidmc-datathon/blob/master/05_mortality_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# eICU Collaborative Research Database

# Notebook 5: Mortality prediction

This notebook explores how a logistic regression model can be trained to predict the in-hospital mortality of patients.


## Load libraries and connect to the database

In [0]:
# Import libraries
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

# model building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import metrics
from sklearn import impute

# Make pandas dataframes prettier
from IPython.display import display, HTML

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [0]:
# Authenticate the current user for this Colab session
# (needed before the BigQuery query below can run — presumably prompts for a Google login).
auth.authenticate_user()

In [0]:
# Project used for the BigQuery queries in this notebook.
project_id = 'bidmc-datathon'

# Export the project through the environment
# (presumably read by the BigQuery client as its default project).
os.environ['GOOGLE_CLOUD_PROJECT'] = project_id

## Load the patient cohort

In this example, we will load selected columns from the `patient` table and link them to the `apachepatientresult` table (APACHE IVa results) to provide richer summary information.

In [0]:
%%bigquery cohort
-- The %%bigquery cell magic must be the first line of the cell, so the
-- explanatory notes live here as SQL comments.
--
-- Link the patient and apachepatientresult tables on patientunitstayid
-- using an inner join, keeping only rows for APACHE version IVa.
-- The result is stored in the `cohort` variable.

SELECT p.unitadmitsource, p.gender, p.age, p.admissionweight,
    p.unittype, p.unitstaytype, a.acutephysiologyscore,
    a.apachescore, a.actualhospitalmortality
FROM `physionet-data.eicu_crd_demo.patient` p
INNER JOIN `physionet-data.eicu_crd_demo.apachepatientresult` a
ON p.patientunitstayid = a.patientunitstayid
WHERE a.apacheversion = 'IVa'

In [0]:
# Preview the first few rows of the cohort returned by the query
cohort.head()

## Prepare the data for analysis

In [0]:
# Review the dataset: column names, dtypes and non-null counts.
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly rather than wrapped in print() (which would also print "None").
cohort.info()

In [0]:
# Encode the categorical data.
# LabelEncoder maps the sorted distinct values of a column to 0..n-1, which
# only makes sense for categorical variables and for the outcome label.
encoder = preprocessing.LabelEncoder()
cohort['gender_code'] = encoder.fit_transform(cohort['gender'])
cohort['unittype_code'] = encoder.fit_transform(cohort['unittype'])

# admissionweight and apachescore are numeric measurements, not categories.
# Label-encoding them would replace each value with a rank-like code, which
# discards magnitude and can leave the pipeline's mean imputer with no missing
# values to fill. Keep them numeric instead. The '_code' suffix is retained
# only so that the predictor list used later keeps working.
cohort['admissionweight_code'] = pd.to_numeric(cohort['admissionweight'], errors='coerce')
cohort['apachescore_code'] = pd.to_numeric(cohort['apachescore'], errors='coerce')

# Encode the outcome last so that `encoder.classes_` describes the outcome labels.
cohort['actualhospitalmortality_code'] = encoder.fit_transform(cohort['actualhospitalmortality'])

In [0]:
# Handle the deidentified ages.
# Assumes deidentified ages are stored as the string '> 89' — confirm against
# the eICU patient table documentation. Coercing that string straight to NaN
# would make the pipeline impute the cohort mean age for the oldest patients,
# so it is mapped to 90 first. Any other non-numeric value still becomes NaN
# and is left for the imputer.
age_text = cohort['age'].replace('> 89', '90')
cohort['agenum'] = pd.to_numeric(age_text, downcast='integer', errors='coerce')

In [0]:
# Show the raw gender values next to their integer codes
gender_columns = ['gender', 'gender_code']
cohort.loc[:, gender_columns].head()

In [0]:
# List the distinct codes present in the encoded outcome column
cohort.loc[:, 'actualhospitalmortality_code'].unique()

## Create our train and test sets

In [0]:
# Label column and the feature columns fed to the model
outcome = 'actualhospitalmortality_code'
predictors = [
    'gender_code',
    'agenum',
    'apachescore_code',
    'unittype_code',
    'admissionweight_code',
]

# Split the cohort into the feature matrix (X) and the label vector (y)
X = cohort.loc[:, predictors]
y = cohort.loc[:, outcome]

In [0]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [0]:
# Report how many cases ended up in each split
for split_name, split_features in (("Train", X_train), ("Test", X_test)):
    print("{} data: {}".format(split_name, len(split_features)))


## Build the model

In [0]:
# Logistic regression classifier, explicitly selecting the 'lbfgs' solver
model = LogisticRegression(
    solver='lbfgs',
)


In [0]:
# Chain preprocessing and the classifier so they are fitted together:
# fill missing values with the column mean, standardise, then classify.
pipeline_steps = [
    ("imputer", impute.SimpleImputer(missing_values=np.nan, strategy="mean")),
    ("scaler", preprocessing.StandardScaler()),
    ("logistic_regression", model),
]
estimator = Pipeline(steps=pipeline_steps)


In [0]:
# Train every pipeline step (imputer, scaler, classifier) on the training split
estimator.fit(X=X_train, y=y_train)

## Testing

In [0]:
# Predict labels for the held-out cases and report the accuracy on that split
y_pred = estimator.predict(X_test)
test_accuracy = estimator.score(X_test, y_test)
print('Accuracy of logistic regression classifier on the test set: {:.2f}'.format(test_accuracy))

In [0]:
# Text summary of the per-class metrics for the test-set predictions
report = metrics.classification_report(y_test, y_pred)
print(report)

In [0]:
# Score the test set once with the predicted probability of the positive class
# and use those scores for BOTH the AUC and the ROC curve. Computing the AUC
# from hard 0/1 predictions (estimator.predict) would collapse the curve to a
# single operating point, so the area in the legend would not match the plot.
y_score = estimator.predict_proba(X_test)[:, 1]
logit_roc_auc = metrics.roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

# The axes show rates (fpr / tpr from roc_curve), not raw counts
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show() so the figure is written while it is still current
plt.savefig('Log_ROC')
plt.show()