# Cohort selection

- Inclusion criteria: > 17y, ischemic stroke, inpatient/non-transferred, not refusing to participate
- Exclusion criteria: ICU length of stay < 12h, hospitalisation > 7d after stroke onset


Requirements:
* MIMIC-III in a PostgreSQL database


In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2

# below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML

%matplotlib inline
plt.style.use('ggplot')

# Settings that identify the PostgreSQL copy of MIMIC-III to work against
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

# Prefix prepended to every query so that unqualified table names are
# looked up in the MIMIC-III schema
query_schema = 'set search_path to {};'.format(schema_name)

# Open the database connection that all later cells reuse
con = psycopg2.connect(user=sqluser, dbname=dbname)

Cohort selection will begin with three tables: *patients*, *admissions*, and *icustays*:

* *patients*: information about a patient that does not change - e.g. date of birth, genotypical sex
* *admissions*: information recorded on hospital admission - admission type (elective, emergency), time of admission
* *icustays*: information recorded on intensive care unit admission - primarily admission and discharge time

As MIMIC-III is primarily an intensive care unit (ICU) database, the focus will be on patients admitted to and discharged from the ICU. That is, rather than selecting our cohort based off the individual patient (identified by `subject_id` in the database), we will usually want to select our cohort based off the ICU stay (identified by `icustay_id`).

# Exclude based on length of stay, first ICU stay, age

In [None]:
# Preview query. The `co` CTE joins icustays to patients and derives, for each
# ICU stay: the length of stay in hours (outtime - intime), the age in years
# at ICU admission (intime - dob) and the position of the stay among the
# patient's ICU stays ordered by intime. The outer SELECT turns these into
# 0/1 exclusion flags (1 = exclude): stay shorter than 12 h, age below 17,
# not the patient's first ICU stay.
# LIMIT 100 keeps the result small; there is no ORDER BY, so which 100 rows
# are returned is left to the database.
# NOTE(review): the stated inclusion criterion is "> 17y" while exclusion_age
# only flags age < 17 - confirm the intended boundary.
query = query_schema + """
WITH co AS
(
SELECT icu.subject_id, icu.hadm_id, icu.icustay_id
, EXTRACT(EPOCH FROM outtime - intime)/60.0/60.0 as icu_length_of_stay_h
, EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 as age
, RANK() OVER (PARTITION BY icu.subject_id ORDER BY icu.intime) AS icustay_id_order
FROM icustays icu
INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
LIMIT 100
)
SELECT
  co.subject_id, co.hadm_id, co.icustay_id, co.icu_length_of_stay_h
  , co.age
  , co.icustay_id_order
  , CASE
        WHEN co.icu_length_of_stay_h < 12 then 1
    ELSE 0 END
    AS exclusion_los
  , CASE
        WHEN co.age < 17 then 1
    ELSE 0 END
    AS exclusion_age
  , CASE 
        WHEN co.icustay_id_order != 1 THEN 1
    ELSE 0 END 
    AS exclusion_first_stay
FROM co
"""
# Run the query on the open connection; the bare `df` renders the frame as
# the cell output
df = pd.read_sql_query(query, con)
df

# Get discharge diagnosis

Ischemic stroke is identified by ICD-9-CM codes 433, 434 and 436.

In [None]:
# Preview query that adds the coded diagnoses. On top of the length-of-stay,
# age and first-stay flags it joins diagnoses_icd on hadm_id and sets
# exclusion_discharge_diagnosis to 0 only when the ICD-9 code starts with
# 433, 434 or 436 (1 = exclude).
#
# The join repeats an ICU stay once per diagnosis row of its admission.
# DENSE_RANK is therefore used for icustay_id_order: repeated rows of one
# stay tie on intime, and RANK would skip ahead by the number of tied rows
# (1, 1+n, ...) instead of numbering the stays 1, 2, 3. The first stay is
# rank 1 either way, so exclusion_first_stay is unaffected.
#
# LIMIT 10 keeps the result small; there is no ORDER BY, so which rows are
# returned is left to the database.
# NOTE(review): the stated inclusion criterion is "> 17y" while exclusion_age
# only flags age < 17 - confirm the intended boundary.
query = query_schema + """
WITH co AS
(
SELECT icu.subject_id, icu.hadm_id, icu.icustay_id, dx.icd9_code
, EXTRACT(EPOCH FROM outtime - intime)/60.0/60.0 as icu_length_of_stay_h
, EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 as age
, DENSE_RANK() OVER (PARTITION BY icu.subject_id ORDER BY icu.intime) AS icustay_id_order
FROM icustays icu
INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
INNER JOIN diagnoses_icd dx
  ON icu.hadm_id = dx.hadm_id
LIMIT 10
)
SELECT
  co.subject_id, co.hadm_id, co.icustay_id, co.icu_length_of_stay_h
  , co.age
  , co.icustay_id_order
  , co.icd9_code
  , CASE
        WHEN co.icu_length_of_stay_h < 12 then 1
    ELSE 0 END
    AS exclusion_los
  , CASE
        WHEN co.age < 17 then 1
    ELSE 0 END
    AS exclusion_age
  , CASE
        WHEN co.icustay_id_order != 1 THEN 1
    ELSE 0 END
    AS exclusion_first_stay
  , CASE
        WHEN co.icd9_code LIKE '433%' OR co.icd9_code LIKE '434%' OR co.icd9_code LIKE '436%'  THEN 0
    ELSE 1 END
    AS exclusion_discharge_diagnosis
FROM co
"""
# Run the query on the open connection; the bare `df` renders the frame as
# the cell output
df = pd.read_sql_query(query, con)
df

# Get admission diagnosis

In [None]:
# Full (un-limited) query that also pulls the admission type and the free-text
# admission diagnosis from the admissions table. Its result `df` carries the
# four exclusion_* flags (1 = exclude): ICU stay shorter than 12 h, age below
# 17, not the patient's first ICU stay, and ICD-9 code not starting with
# 433, 434 or 436.
#
# The join to diagnoses_icd repeats an ICU stay once per diagnosis row of its
# admission, so rows here are (ICU stay, diagnosis code) pairs rather than
# ICU stays. DENSE_RANK is therefore used for icustay_id_order: repeated rows
# of one stay tie on intime, and RANK would skip ahead by the number of tied
# rows (1, 1+n, ...) instead of numbering the stays 1, 2, 3. The first stay
# is rank 1 either way, so exclusion_first_stay is unaffected.
#
# NOTE(review): the stated inclusion criterion is "> 17y" while exclusion_age
# only flags age < 17 - confirm the intended boundary.
query = query_schema + """
WITH co AS
(
SELECT icu.subject_id, icu.hadm_id, icu.icustay_id, dx.icd9_code, admissions.admission_type, admissions.diagnosis
, EXTRACT(EPOCH FROM outtime - intime)/60.0/60.0 as icu_length_of_stay_h
, EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 as age
, DENSE_RANK() OVER (PARTITION BY icu.subject_id ORDER BY icu.intime) AS icustay_id_order
FROM icustays icu
INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
INNER JOIN diagnoses_icd dx
  ON icu.hadm_id = dx.hadm_id
INNER JOIN admissions admissions
  ON icu.hadm_id = admissions.hadm_id
)
SELECT
  co.subject_id, co.hadm_id, co.icustay_id, co.icu_length_of_stay_h
  , co.age
  , co.icustay_id_order
  , co.admission_type
  , co.diagnosis
  , co.icd9_code
  , CASE
        WHEN co.icu_length_of_stay_h < 12 then 1
    ELSE 0 END
    AS exclusion_los
  , CASE
        WHEN co.age < 17 then 1
    ELSE 0 END
    AS exclusion_age
  , CASE
        WHEN co.icustay_id_order != 1 THEN 1
    ELSE 0 END
    AS exclusion_first_stay
  , CASE
        WHEN co.icd9_code LIKE '433%' OR co.icd9_code LIKE '434%' OR co.icd9_code LIKE '436%'  THEN 0
    ELSE 1 END
    AS exclusion_discharge_diagnosis
FROM co
"""
# Run the query on the open connection; the bare `df` renders the frame as
# the cell output
df = pd.read_sql_query(query, con)
df

# TODO:

- add admission type & admission diagnosis to final query
- further criteria (time from stroke) could be extracted from admission note
- time from hospital admission to ICU admission?

## Summarise exclusion criteria

In [None]:
# Report, for every exclusion_* column of df, how many rows it flags, then
# how many rows are flagged by at least one of them.
n_obs = df.shape[0]
row_fmt = '{:20s} {:5d} ({:2.2f}%)'
print('{:20s} {:5d}'.format('Observations', n_obs))

# idxExcl becomes True for a row as soon as any exclusion flag equals 1
idxExcl = np.zeros(n_obs, dtype=bool)
for col in df.columns:
    if "exclusion_" not in col:
        continue
    n_flagged = df[col].sum()
    print(row_fmt.format(col, n_flagged, n_flagged*100.0/n_obs))
    idxExcl = (idxExcl) | (df[col]==1)

# print a summary of how many were excluded in total
n_excluded = np.sum(idxExcl)
pct_excluded = n_excluded*100.0/n_obs
print('')
print(row_fmt.format('Total excluded', n_excluded, pct_excluded))
print(row_fmt.format('Total remaining', n_obs - n_excluded, 100 - pct_excluded))

# Save final selection query

In [None]:
# SQL fragment defining the `selection` CTE; it is meant to be followed by a
# SELECT ... FROM selection (and, when run from this notebook, preceded by
# query_schema). `selection` exposes subject_id, hadm_id, icustay_id,
# icu_length_of_stay_h, age, icustay_id_order, icd9_code and four 0/1
# exclusion flags (1 = exclude): exclusion_los (ICU stay < 12 h),
# exclusion_age (age < 17), exclusion_first_stay (not the patient's first ICU
# stay) and exclusion_discharge_diagnosis (ICD-9 code not starting with 433,
# 434 or 436).
#
# The join to diagnoses_icd repeats an ICU stay once per diagnosis row of its
# admission. DENSE_RANK is therefore used for icustay_id_order: repeated rows
# of one stay tie on intime, and RANK would skip ahead by the number of tied
# rows (1, 1+n, ...) instead of numbering the stays 1, 2, 3. The first stay
# is rank 1 either way, so exclusion_first_stay is unaffected.
#
# NOTE(review): the stated inclusion criterion is "> 17y" while exclusion_age
# only flags age < 17 - confirm the intended boundary.
final_selection_query = """
WITH selection AS
(
WITH co AS
(
SELECT icu.subject_id, icu.hadm_id, icu.icustay_id, dx.icd9_code
, EXTRACT(EPOCH FROM outtime - intime)/60.0/60.0 as icu_length_of_stay_h
, EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 as age
, DENSE_RANK() OVER (PARTITION BY icu.subject_id ORDER BY icu.intime) AS icustay_id_order

FROM icustays icu
INNER JOIN patients pat
  ON icu.subject_id = pat.subject_id
INNER JOIN diagnoses_icd dx
  ON icu.hadm_id = dx.hadm_id
)

SELECT
  co.subject_id, co.hadm_id, co.icustay_id, co.icu_length_of_stay_h
  , co.age
  , co.icustay_id_order
  , co.icd9_code
  , CASE
    WHEN co.icu_length_of_stay_h < 12 then 1
    ELSE 0 END
    AS exclusion_los
    , CASE
        WHEN co.age < 17 then 1
    ELSE 0 END
    AS exclusion_age
    , CASE
        WHEN co.icustay_id_order != 1 THEN 1
    ELSE 0 END
    AS exclusion_first_stay
    , CASE
        WHEN co.icd9_code LIKE '433%' OR co.icd9_code LIKE '434%' OR co.icd9_code LIKE '436%'  THEN 0
    ELSE 1 END
    AS exclusion_discharge_diagnosis
FROM co
)
"""

In [None]:
# Switch for the next cell: set to True to write final_selection_query to disk
save = False

In [None]:
# Write the selection CTE to a .sql file, but only when `save` is enabled
if save:
    save_path = 'patient_selection_query.sql'
    # Fix the encoding explicitly so the written file does not depend on the
    # platform's default text encoding
    with open(save_path, "w", encoding="utf-8") as text_file:
        text_file.write(final_selection_query)

Apply selection query

In [None]:
# Build the cohort: append a SELECT to the `selection` CTE that keeps only the
# rows for which all four exclusion flags are 0.
# NOTE(review): `selection` joins diagnoses_icd on hadm_id and icd9_code is
# selected here, so a stay with several matching codes presumably appears
# once per code - confirm whether the cohort should be de-duplicated by
# icustay_id.
query = query_schema + final_selection_query + """
SELECT selection.subject_id, selection.hadm_id, selection.icustay_id  , selection.age
  , selection.icustay_id_order
  , selection.icd9_code

FROM selection

WHERE selection.exclusion_discharge_diagnosis = 0
    AND selection.exclusion_first_stay = 0
    AND selection.exclusion_age = 0
    AND selection.exclusion_los = 0
"""
# Run the query on the open connection; the bare `df` renders the frame as
# the cell output
df = pd.read_sql_query(query, con)
df

In [None]:
# close out the database connection opened in the setup cell
con.close()

<!-- TODO: steal from hst-953 course -->