<a href="https://colab.research.google.com/github/MIT-LCP/bidmc-datathon/blob/master/eicu_crrt_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# eICU Collaborative Research Database

# Notebook 5: Prediction with decision trees

This notebook explores how decision trees can be trained to predict in-hospital mortality of patients.


## Load libraries and connect to the database

In [0]:
# Import libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# model building
from sklearn import ensemble, impute, metrics, preprocessing, tree
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline

# Make pandas dataframes prettier
from IPython.display import display, HTML, Image
plt.rcParams.update({'font.size': 20})
%matplotlib inline
plt.style.use('ggplot')

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [0]:
# Authenticate the current Colab user via google.colab's auth helper;
# this must succeed before the BigQuery cells below are run.
auth.authenticate_user()

In [0]:
# Record the Google Cloud project and expose it to client libraries through
# the GOOGLE_CLOUD_PROJECT environment variable.
project_id = 'bidmc-datathon'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

To make our lives easier, we'll also install and import a set of helper functions from the `glowyr` package (imported as `dtn`). We will be using the following functions from the package:
- `plot_model_pred_2d()`: to visualize our data, helping to display the class split assigned by a tree vs. the true class.
- `run_query()`: to run an SQL query against our BigQuery database and assign the results to a dataframe.


In [0]:
!pip install pip --upgrade
!pip install glowyr
!pip install pandas_profiling --upgrade

In [0]:
import glowyr as dtn
import pydotplus
from tableone import TableOne
from pandas_profiling import ProfileReport

In this notebook we'll be looking at tree models, so we'll now install a package for visualizing these models.

In [0]:
!apt-get install graphviz -y

## Load the CRRT data

Let's extract the dataset for classifying need for CRRT. This will take a while!

In [0]:
%%bigquery df

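-- ------------------------------------------------------------------
-- Title: Hourly CRRT dataset
-- Description: Each output row represents one hour index of an ICU stay,
--        from offset 0 up to the first matching dialysis treatment
--        (see the crrt CTE). The icustay_detail CTE summarises patient
--        demographics for each ICU stay; these are repeated on every
--        hourly row alongside averaged labs, blood gases and vitals.
-- ------------------------------------------------------------------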

-- (Optional) Define which schema to work on
-- SET search_path TO eicu_crd;

WITH icustay_detail AS
(
    -- Stay-level details taken from the patient table, with the hospital
    -- region and the APACHE IV score joined on.
    SELECT
        pat.uniquepid,
        pat.patienthealthsystemstayid,
        pat.patientunitstayid,
        pat.unitvisitnumber,
        pat.hospitalid,
        hosp.region,
        pat.unittype,
        pat.hospitaladmitoffset,
        pat.hospitaldischargeoffset,
        0 AS unitadmitoffset,
        pat.unitdischargeoffset,
        apr.apachescore AS apache_score,
        pat.hospitaldischargeyear,
        pat.age,
        -- 0 when the discharge status contains 'alive', 1 when it contains
        -- 'expired', NULL otherwise
        CASE
            WHEN LOWER(pat.hospitaldischargestatus) LIKE '%alive%' THEN 0
            WHEN LOWER(pat.hospitaldischargestatus) LIKE '%expired%' THEN 1
            ELSE NULL
        END AS hosp_mort,
        -- 0 = female, 1 = male, NULL otherwise. 'female' has to be tested
        -- first because the '%male%' pattern also matches it.
        CASE
            WHEN LOWER(pat.gender) LIKE '%female%' THEN 0
            WHEN LOWER(pat.gender) LIKE '%male%' THEN 1
            ELSE NULL
        END AS gender,
        pat.ethnicity,
        pat.admissionheight,
        pat.admissionweight,
        pat.dischargeweight,
        -- unit discharge offset divided by 60, rounded to whole hours
        ROUND(pat.unitdischargeoffset/60) AS icu_los_hours
    FROM `physionet-data.eicu_crd.patient` AS pat
    LEFT JOIN `physionet-data.eicu_crd.hospital` AS hosp
        ON pat.hospitalid = hosp.hospitalid
    -- only APACHE result rows with version 'IV' are joined
    LEFT JOIN `physionet-data.eicu_crd.apachepatientresult` AS apr
        ON pat.patientunitstayid = apr.patientunitstayid
        AND apr.apacheversion = 'IV'
)
, crrt AS
(
    -- Earliest treatment offset, per ICU stay, among the listed dialysis
    -- treatment strings. Stays without a matching treatment row do not
    -- appear in this CTE.
    SELECT
        tr.patientunitstayid,
        MIN(tr.treatmentoffset) AS crrtoffset
    FROM `physionet-data.eicu_crd.treatment` AS tr
    WHERE tr.treatmentstring IN
    (
        'renal|dialysis|C A V H D',
        'renal|dialysis|C V V H',
        'renal|dialysis|C V V H D',
        'renal|dialysis|SLED'
    )
    GROUP BY tr.patientunitstayid
)
, all_hours AS
(
    -- For every stay in `crrt`, build the array of hour indices running from
    -- 0 through the hour that contains the first CRRT offset.
    SELECT
        patientunitstayid,
        -- constant base offset; the hour indices are added to it downstream
        0 AS endoffset,
        crrtoffset,
        -- hrs = [0, 1, ..., ceil(crrtoffset / 60)]: 0 is offset 0, 1 is one
        -- hour later, and so on up to the first CRRT offset. If the upper
        -- bound is negative the array is empty.
        GENERATE_ARRAY(0, CAST(CEIL(crrtoffset/60.0) AS INT64)) AS hrs
    FROM crrt
)
, hourly AS
(
    -- Expand each stay's `hrs` array into one row per (patientunitstayid, hr).
    -- NOTE(review): startoffset is copied from all_hours.endoffset, which is
    -- the constant 0, so every row's window is [0, hr * 60] (cumulative from
    -- offset 0) rather than a single 60-minute bin -- confirm this is the
    -- intended windowing.
    SELECT
        ah.patientunitstayid,
        CAST(hour_idx AS INT64) AS hr,
        ah.endoffset AS startoffset,
        ah.endoffset + hour_idx*60 AS endoffset
    FROM all_hours AS ah
    CROSS JOIN UNNEST(ah.hrs) AS hour_idx
)
-- Final result set. Rows come from `hourly` (one per patientunitstayid and
-- hr), enriched with stay-level demographics, the first CRRT offset, and the
-- averaged vitals, blood gases and labs computed by the sub-queries below.
SELECT
  h.patientunitstayid,
  h.hr,
  h.startoffset,
  h.endoffset,
  c.crrtoffset,
  -- demographics
  id.ethnicity,
  id.admissionheight,
  id.admissionweight,
  id.dischargeweight,
  id.apache_score,
  id.age,
  id.gender,
  id.hosp_mort,
  -- vitals
  heartrate,
  respiratoryrate,
  spo2,
  nibp_diastolic,
  nibp_systolic,
  nibp_mean,
  temperature,
  ibp_systolic,
  ibp_diastolic,
  ibp_mean,
  -- blood gases
  fio2,
  pao2,
  paco2,
  pH,
  aniongap,
  basedeficit,
  baseexcess,
  peep,
  -- labs
  albumin,
  bilirubin,
  BUN,
  calcium,
  creatinine,
  glucose,
  bicarbonate,
  TotalCO2,
  hematocrit,
  hemoglobin,
  INR,
  lactate,
  platelets,
  potassium,
  ptt,
  sodium,
  wbc,
  bands,
  alt,
  ast,
  alp
FROM hourly h
-- inner joins: rows without a match in icustay_detail or crrt are dropped
INNER JOIN icustay_detail id
  ON h.patientunitstayid = id.patientunitstayid
INNER JOIN crrt c
  ON h.patientunitstayid = c.patientunitstayid
-- labs
LEFT JOIN
(
    -- Average each lab value over the pivoted_lab rows whose chartoffset lies
    -- within the row's [startoffset, endoffset] window (BETWEEN is inclusive
    -- at both ends). The inner LEFT JOIN keeps hours with no lab rows; their
    -- averages are NULL.
    SELECT h.patientunitstayid,
    h.hr,
    AVG(albumin) AS albumin,
    AVG(bilirubin) AS bilirubin,
    AVG(BUN) AS BUN,
    AVG(calcium) AS calcium,
    AVG(creatinine) AS creatinine,
    AVG(glucose) AS glucose,
    AVG(bicarbonate) AS bicarbonate,
    AVG(TotalCO2) AS TotalCO2,
    AVG(hematocrit) AS hematocrit,
    AVG(hemoglobin) AS hemoglobin,
    AVG(INR) AS INR,
    AVG(lactate) AS lactate,
    AVG(platelets) AS platelets,
    AVG(potassium) AS potassium,
    AVG(ptt) AS ptt,
    AVG(sodium) AS sodium,
    AVG(wbc) AS wbc,
    AVG(bands) AS bands,
    AVG(alt) AS alt,
    AVG(ast) AS ast,
    AVG(alp) AS alp
    FROM hourly h
    LEFT JOIN `physionet-data.eicu_crd_derived.pivoted_lab` la
    ON h.patientunitstayid = la.patientunitstayid
    AND la.chartoffset BETWEEN h.startoffset AND h.endoffset
    GROUP BY h.patientunitstayid, h.hr
) la
  ON h.patientunitstayid = la.patientunitstayid
  AND la.hr = h.hr
-- blood gases
LEFT JOIN
(
    -- Same windowed averaging as the labs sub-query, applied to pivoted_bg.
    SELECT h.patientunitstayid,
    h.hr,
    AVG(fio2) AS fio2,
    AVG(pao2) AS pao2,
    AVG(paco2) AS paco2,
    AVG(pH) AS pH,
    AVG(aniongap) AS aniongap,
    AVG(basedeficit) AS basedeficit,
    AVG(baseexcess) AS baseexcess,
    AVG(peep) AS peep
    FROM hourly h
    LEFT JOIN `physionet-data.eicu_crd_derived.pivoted_bg` bg
    ON h.patientunitstayid = bg.patientunitstayid
    AND bg.chartoffset BETWEEN h.startoffset AND h.endoffset
    GROUP BY h.patientunitstayid, h.hr
) bg
  ON h.patientunitstayid = bg.patientunitstayid
  AND bg.hr = h.hr
-- vitals
LEFT JOIN
(
    -- Same windowed averaging as the labs sub-query, applied to pivoted_vital.
    SELECT h.patientunitstayid
    , h.hr
    , AVG(heartrate) AS heartrate
    , AVG(respiratoryrate) AS respiratoryrate
    , AVG(spo2) AS spo2
    , AVG(nibp_diastolic) AS nibp_diastolic
    , AVG(nibp_systolic) AS nibp_systolic
    , AVG(nibp_mean) AS nibp_mean
    , AVG(temperature) AS temperature
    , AVG(ibp_systolic) AS ibp_systolic
    , AVG(ibp_diastolic) AS ibp_diastolic
    , AVG(ibp_mean) AS ibp_mean
    FROM hourly h
    LEFT JOIN `physionet-data.eicu_crd_derived.pivoted_vital` vi
    ON h.patientunitstayid = vi.patientunitstayid
    AND vi.chartoffset BETWEEN h.startoffset AND h.endoffset
    GROUP BY h.patientunitstayid, h.hr
) vi
  ON h.patientunitstayid = vi.patientunitstayid
  AND vi.hr = h.hr
-- sort by stay, then by hour index within the stay
ORDER BY patientunitstayid, hr

In [0]:
# Preview the first five rows of the query result
df.head(n=5)