# Generate tables

Author: Lawrence Baker

Much of the code for generating these tables is from work that was previously conducted for a different project by the students of Barbara Engelhardt. This code was modified by the author, but the structure, approach and a majority of the code, including how diabetics are defined, is their creation. Additionally, concepts from the MIMIC code repository (https://github.com/MIT-LCP/mimic-code) were used.

This code queries a local MIMIC-III install using postgres (via the psycopg2 package). 

It produces seven dataframes which contain information on: admissions, inputs, charts, labs, icd9 (diagnosis codes), sofa (SOFA scores), blood_labs. These dataframes are saved together as a pickle file (insulin_adms.pkl) in a subfolder of the current directory called processed_data. This file is then used by the Glucose Analysis code.

These dataframes are extracted in several steps:

Table extraction comprises 5 steps:
1. Define needed inputs (e.g. Insulin) and retrieve the relevant ID codes from the dictionaries contained in the MIMIC ID tables.
2. Retrieve unfiltered tables and use them to define a study cohort of ICU stays. Inclusion criteria for the cohort are: presence in the MetaVision data table, age greater than or equal to 18, an ICU length of stay greater than or equal to 1 day, and at least one glucose measurement
3. Filter tables so that they only include the study cohort and the relevant inputs, Sequential Organ Failure Assessment (SOFA) scores, labs, and charts.
4. Retrieve the notes of included stays and process to establish known pre-existing diabetic status.
5. Append the diabetic information to the extracted admissions table.



## Import Packages

In [2]:
import psycopg2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime as dt
import os, sys, pickle, json, time, math, re
import argparse

## Define Querying Functions and SQL Queries

In [7]:
# Database connection settings (**change the SQL credentials as appropriate**)
dbname = 'mimic'
schema_name = 'mimiciii'
sqluser = 'postgres'
sqlpwd = "postgres"

# Open a module-level connection and point its search path at the MIMIC schema
con = psycopg2.connect(dbname=dbname, user=sqluser, password=sqlpwd)
cur = con.cursor()
cur.execute('SET search_path to {}'.format(schema_name))

def q(query):
    """Run a SQL query against the MIMIC database and return a DataFrame.

    A fresh connection is opened for the call using the module-level
    ``dbname``/``sqluser``/``sqlpwd`` settings, its search path is set to
    ``schema_name``, and the connection is closed again before returning,
    whether or not the query succeeds.

    Args:
        query: SQL text to execute.

    Returns:
        pandas.DataFrame holding the query result.
    """
    con = psycopg2.connect(dbname=dbname, user=sqluser, password=sqlpwd)
    try:
        with con.cursor() as cur:
            cur.execute('SET search_path to ' + schema_name)
        return pd.read_sql_query(query, con)
    finally:
        # psycopg2's `with con:` only ends the transaction, it does not close
        # the connection, so close it explicitly to avoid leaking one per call.
        con.close()

def d_inputs_labels(labels):
    """Look up D_ITEMS rows (itemid, label, unitname) whose label is in ``labels``.

    Each label is single-quoted and spliced into the SQL ``IN`` list as-is.
    Duplicate rows are dropped from the result.
    """
    quoted_labels = ','.join(map("'{0}'".format, labels))
    sql_template = """
SELECT itemid, label, unitname 
FROM d_items
WHERE label in ({});
"""
    items = q(sql_template.format(quoted_labels))
    return items.drop_duplicates()

def d_inputs_IDs(IDs):
    """Look up D_ITEMS rows (itemid, label, unitname) whose itemid is in ``IDs``.

    Each ID is single-quoted and spliced into the SQL ``IN`` list as-is.
    Duplicate rows are dropped from the result.
    """
    quoted_ids = ','.join(map("'{0}'".format, IDs))
    sql_template = """
SELECT itemid, label, unitname 
FROM d_items
WHERE itemid in ({});
"""
    items = q(sql_template.format(quoted_ids))
    return items.drop_duplicates()

def d_labinputs_labels(labels):
    """Look up D_LABITEMS rows (itemid, label) whose label is in ``labels``.

    Each label is single-quoted and spliced into the SQL ``IN`` list as-is.
    Duplicate rows are dropped from the result.
    """
    quoted_labels = ','.join(map("'{0}'".format, labels))
    sql_template = """
SELECT itemid, label 
FROM d_labitems
WHERE label in ({});
"""
    lab_items = q(sql_template.format(quoted_labels))
    return lab_items.drop_duplicates()

def d_labinputs_IDs(IDs):
    """Look up D_LABITEMS rows (itemid, label) whose itemid is in ``IDs``.

    Each ID is single-quoted and spliced into the SQL ``IN`` list as-is.
    Duplicate rows are dropped from the result.
    """
    quoted_ids = ','.join(map("'{0}'".format, IDs))
    sql_template = """
SELECT itemid, label 
FROM d_labitems
WHERE itemid in ({});
"""
    lab_items = q(sql_template.format(quoted_ids))
    return lab_items.drop_duplicates()


def get_admissions():
    """Return demographic and timing information for ICU stays.

    Joins ADMISSIONS, ICUSTAYS and PATIENTS, keeps one row per
    ``icustay_id`` (``DISTINCT ON``) and only admissions flagged with
    ``has_chartevents_data = 1``. Age is computed in years at hospital
    admission time; ICU and hospital lengths of stay are in days.
    """
    admissions_sql = """
SELECT DISTINCT on (ie.icustay_id)
    ad.subject_id as subject
  , ad.hadm_id as hadm
  , ie.icustay_id as icustay
  , ROUND( (CAST(EXTRACT(epoch FROM ad.admittime - pa.dob)/(60*60*24*365.242) AS numeric)), 1) AS age
  , pa.gender
  , ad.ethnicity as ethnicity
  , ad.admission_type as admittype 
  , ad.diagnosis
  , ie.first_careunit
--
  , ad.admittime as admit_time
  , ie.intime as icu_admit
  , ie.outtime as icu_discharge  
  , ad.dischtime as discharge_time
  , ROUND(CAST(ie.los AS numeric), 2) AS icu_los
  , ROUND( (CAST(EXTRACT(epoch FROM ad.dischtime - ad.admittime)/(60*60*24) AS numeric)), 2) AS adm_los
  , ad.hospital_expire_flag as h_exp
  , pa.expire_flag as exp
--
--  , ad.deathtime as death_time  
--  , ad.admission_location as adm_loc
--  , ad.discharge_location as disch_loc
--  , ad.insurance as insurance
--  , ad.language as language
--  , ad.religion as religion
--  , ad.marital_status as marital
--  , ad.edregtime as emerg_admit
--  , ad.edouttime as emerg_disch
--  , ie.dbsource as dbsource
--  , ie.last_careunit
--  , ie.first_wardid
--  , ie.last_wardid
--  , pa.dob as dob
--  , pa.dod as dod
--
FROM admissions ad
INNER JOIN icustays ie
ON ad.hadm_id = ie.hadm_id
INNER JOIN patients pa
ON ad.subject_id = pa.subject_id
WHERE ad.has_chartevents_data = 1
-- ORDER BY ad.subject_id, ad.admittime, ie.inttime
"""

    stays = q(admissions_sql)
    return stays.drop_duplicates()

def get_comorbidities():
    """Return ICD-9 diagnosis codes with their descriptions per ICU stay.

    Joins ADMISSIONS, ICUSTAYS, DIAGNOSES_ICD and D_ICD_DIAGNOSES on the
    hospital admission, so every diagnosis of an admission is listed
    against each ICU stay of that admission. Duplicate rows are dropped.
    """
    diagnoses_sql = """
SELECT  ad.subject_id as subject
  , ad.hadm_id as hadm
  , ie.icustay_id as icustay
  , di.icd9_code as code
  , did.short_title as short_desc
  , did.long_title as long_desc
FROM admissions ad
INNER JOIN icustays ie
ON ad.hadm_id = ie.hadm_id
INNER JOIN diagnoses_icd di
ON ad.hadm_id = di.hadm_id
INNER JOIN d_icd_diagnoses did
ON di.icd9_code = did.icd9_code
"""

    diagnoses = q(diagnoses_sql)
    return diagnoses.drop_duplicates()

def get_procedures():
    """Return ICD-9 procedure codes with their descriptions per ICU stay.

    Joins ADMISSIONS, ICUSTAYS, PROCEDURES_ICD and D_ICD_PROCEDURES on the
    hospital admission, so every procedure of an admission is listed
    against each ICU stay of that admission. Duplicate rows are dropped.
    """
    procedures_sql = """
SELECT  ad.subject_id as subject
  , ad.hadm_id as hadm
  , ie.icustay_id as icustay
  , pi.icd9_code as code
  , pid.short_title as short_desc
  , pid.long_title as long_desc
FROM admissions ad
INNER JOIN icustays ie
ON ad.hadm_id = ie.hadm_id
INNER JOIN procedures_icd pi
ON ad.hadm_id = pi.hadm_id
INNER JOIN d_icd_procedures pid
ON pi.icd9_code = pid.icd9_code
"""

    procedures = q(procedures_sql)
    return procedures.drop_duplicates()

def get_inputs(icuadms, inputIDs):
    """Return MetaVision input events for the given ICU stays and items.

    Reads INPUTEVENTS_MV only, so only stays recorded in the MetaVision
    system are covered.

    Args:
        icuadms: iterable of ICU stay IDs (``icustay_id``) to include.
        inputIDs: iterable of item IDs (``itemid``) to include.

    Returns:
        pandas.DataFrame with one row per distinct input event, ordered by
        subject and start time.
    """
    stay_list = ','.join("'{0}'".format(stay) for stay in icuadms)
    item_list = ','.join("'{0}'".format(item) for item in inputIDs)

    # Input events are matched to their own ICU stay via icustay_id. Joining
    # on hadm_id alone (as before) copied every event onto each ICU stay of
    # the same hospital admission.
    query = """
SELECT  ad.subject_id as subject
  , ad.hadm_id as hadm
  , ie.icustay_id as icustay
  , ad.diagnosis
  , mv.itemid as item
  , mv.ordercategoryname as ordercat
  , mv.starttime as input_start
  , mv.endtime as input_end
  , mv.amount as amount
  , mv.amountuom as unit
  , mv.rate as rate
  , mv.rateuom as rateuom
  , mv.patientweight as ptweight
  , mv.totalamount as totalamount
  , mv.totalamountuom as totalamountuom
  , mv.originalamount as originalamount
  , mv.originalrate as originalrate
  , mv.statusdescription as status
FROM admissions ad
INNER JOIN icustays ie
ON ad.hadm_id = ie.hadm_id
INNER JOIN inputevents_mv mv
ON ie.icustay_id = mv.icustay_id
WHERE ie.icustay_id in (""" + stay_list + """)
AND mv.itemid in (""" + item_list + """)
ORDER BY ad.subject_id, mv.starttime
"""

    return q(query).drop_duplicates()

def get_inputs_cv(icuadms, drugs):
    """Return CareVue input events for the given ICU stays and items (not used).

    Reads INPUTEVENTS_CV only. The output columns mirror those of
    ``get_inputs`` plus the item ``label``.

    Args:
        icuadms: iterable of ICU stay IDs (``icustay_id``) to include.
        drugs: iterable of item IDs (``itemid``) to include; despite the
            name, the values are compared against ``itemid``, not labels.

    Returns:
        pandas.DataFrame with one row per distinct input event, ordered by
        subject and chart time.
    """
    stay_list = ','.join("'{0}'".format(stay) for stay in icuadms)
    item_list = ','.join("'{0}'".format(item) for item in drugs)

    # Fixes relative to the previous query, which could not run:
    #  * the table is aliased `cv`, but `mv.*` was referenced in the join,
    #    the filter and the ORDER BY;
    #  * `it.label` was selected without joining d_items as `it`;
    #  * the diagnoses joins were unused and only multiplied rows.
    # NOTE(review): to my knowledge inputevents_cv has no patientweight or
    # statusdescription column; ptweight is returned as NULL and status is
    # taken from `stopped` -- confirm against the installed schema.
    query = """
SELECT  ad.subject_id as subject
  , ad.hadm_id as hadm
  , ie.icustay_id as icustay
  , ad.diagnosis
  , cv.itemid as item
  , it.label as label
  , cv.orderid as ordercat
  , cv.charttime as input_start
  , cv.charttime as input_end
  , cv.amount as amount
  , cv.amountuom as unit
  , cv.rate as rate
  , cv.rateuom as rateuom
  , NULL::numeric as ptweight
  , cv.originalamount as totalamount
  , cv.originalamountuom as totalamountuom
  , cv.originalamount as originalamount
  , cv.originalrate as originalrate
  , cv.stopped as status
FROM admissions ad
INNER JOIN icustays ie
ON ad.hadm_id = ie.hadm_id
INNER JOIN inputevents_cv cv
ON ie.icustay_id = cv.icustay_id
INNER JOIN d_items it
ON cv.itemid = it.itemid
WHERE ie.icustay_id in (""" + stay_list + """)
AND cv.itemid in (""" + item_list + """)
ORDER BY ad.subject_id, cv.charttime
"""

    return q(query).drop_duplicates()

def get_charts(icuadms, vits):
    """Fetch CHARTEVENTS measurements for the given ICU stays and chart items.

    ``icuadms`` is an iterable of ICU stay IDs and ``vits`` an iterable of
    chart item IDs; both are single-quoted and spliced into SQL ``IN`` lists.
    Rows are ordered by subject and chart time, duplicates are dropped.
    """
    stay_list = ','.join(map("'{0}'".format, icuadms))
    item_list = ','.join(map("'{0}'".format, vits))

    chart_sql = """
SELECT ad.subject_id as subject
  , ad.hadm_id as hadm
  , ch.icustay_id as icustay
  , ch.itemid as item
  , ch.charttime
  , ch.valuenum as value
  , ch.valueuom as unit
  , ch.warning as warning
  , ch.error as error
FROM admissions ad
INNER JOIN chartevents ch 
ON ad.hadm_id = ch.hadm_id
WHERE ch.icustay_id in (""" + stay_list + """)
AND ch.itemid in (""" + item_list + """)
ORDER BY ad.subject_id, ch.charttime
"""

    measurements = q(chart_sql)
    return measurements.drop_duplicates()

def get_labs(icuadms, labs):
    """Fetch LABEVENTS measurements for the given ICU stays and lab items.

    ``icuadms`` is an iterable of ICU stay IDs and ``labs`` an iterable of
    lab item IDs. Lab events are joined to ICU stays through the hospital
    admission (``hadm_id``). Rows are ordered by subject and chart time,
    duplicates are dropped.
    """
    stay_list = ','.join(map("'{0}'".format, icuadms))
    lab_list = ','.join(map("'{0}'".format, labs))

    lab_sql = """
SELECT ad.subject_id as subject
  , ad.hadm_id as hadm
  , ie.icustay_id as icustay
  , lb.itemid as item
  , lb.charttime
  , lb.valuenum as value
  , lb.valueuom as unit
FROM admissions ad
INNER JOIN icustays ie
ON ad.hadm_id = ie.hadm_id
INNER JOIN labevents lb 
ON ad.hadm_id = lb.hadm_id
INNER join d_labitems dl
ON lb.itemid = dl.itemid
WHERE ie.icustay_id in (""" + stay_list + """)
AND lb.itemid in (""" + lab_list + """)
ORDER BY ad.subject_id, lb.charttime
"""

    measurements = q(lab_sql)
    return measurements.drop_duplicates()

def get_notes(icuadms):
    """Fetch NOTEEVENTS rows for the given ICU stays, excluding radiology notes.

    ``icuadms`` is an iterable of ICU stay IDs. Notes are joined to ICU
    stays through the hospital admission (``hadm_id``). Rows are ordered by
    subject, duplicates are dropped.
    """
    stay_list = ','.join(map("'{0}'".format, icuadms))

    notes_sql = """
SELECT ad.subject_id as subject
  , ad.hadm_id as hadm
  , ie.icustay_id as icustay
  , ne.chartdate as note_date
  , ne.category as category
  , ne.description as description
  , ne.cgid as cgid
  , ne.iserror as iserror
  , ne.text as text
FROM admissions ad
INNER JOIN icustays ie
ON ad.hadm_id = ie.hadm_id
INNER JOIN noteevents ne
ON ad.hadm_id = ne.hadm_id
WHERE (ne.category NOT LIKE '%Radiology%')
AND ie.icustay_id in (""" + stay_list + """)
ORDER BY ad.subject_id
"""

    note_rows = q(notes_sql)
    return note_rows.drop_duplicates()

def get_caregivers():
    """Return the caregiver (cgid, label) behind each non-null chart event.

    Joins CHARTEVENTS to CAREGIVERS on ``cgid``; rows are ordered by ICU
    stay and caregiver. Duplicates are not removed.
    """
    caregiver_sql = """
select ce.icustay_id, ce.charttime, ce.cgid, cg.label
from mimiciii.chartevents ce
inner join caregivers cg
on ce.cgid = cg.cgid
where value is not null 
order by icustay_id, cgid
"""

    caregivers = q(caregiver_sql)
    return caregivers

def get_ventilation():
    """Return start and end times of procedure item 225792 per ICU stay.

    The original author identifies this PROCEDUREEVENTS_MV item as
    mechanical ventilation. Duplicate rows and rows containing any missing
    value are dropped.
    """
    ventilation_item = 225792
    ventilation_sql = """
SELECT pr.subject_id as subject
  , pr.hadm_id as hadm
  , pr.icustay_id as icu_stay
  , pr.starttime as vent_starttime
  , pr.endtime as vent_endtime
--  , extract(epoch from pr.starttime) as vent_starttime_epoch
--  , extract(epoch from pr.endtime) as vent_endtime_epoch
FROM  procedureevents_mv pr
WHERE pr.itemid = """ + str(ventilation_item) + """
ORDER BY pr.icustay_id, pr.starttime
"""

    episodes = q(ventilation_sql).drop_duplicates()
    return episodes.dropna()

def get_all_inputs_mv():
    """Return every distinct (subject, hadm, icustay, item) in INPUTEVENTS_MV.

    No filtering is applied; rows are ordered by ICU stay.
    """
    all_inputs_sql = """
SELECT  mv.subject_id as subject
  , mv.hadm_id as hadm
  , mv.icustay_id as icustay
  , mv.itemid as item
FROM inputevents_mv mv
ORDER BY mv.icustay_id
"""

    input_rows = q(all_inputs_sql)
    return input_rows.drop_duplicates()

def get_sofa(icuadms):
    """Fetch SOFA rows from the ``pivoted_sofa`` view for the given ICU stays.

    ``icuadms`` is an iterable of ICU stay IDs. Rows are ordered by ICU
    stay and start time, duplicates are dropped.
    """
    stay_list = ','.join(map("'{0}'".format, icuadms))

    sofa_sql = """
SELECT sf.icustay_id as icustay
  , sf.hr
  , sf.starttime
  , sf.endtime
  , sf.pao2fio2ratio_novent
  , sf.pao2fio2ratio_vent
  , sf.rate_epinephrine
  , sf.rate_norepinephrine
  , sf.rate_dopamine
  , sf.rate_dobutamine
  , sf.meanbp_min
  , sf.gcs_min
  , sf.urineoutput
  , sf.bilirubin_max
  , sf.creatinine_max
  , sf.platelet_min
  , sf.respiration
  , sf.coagulation
  , sf.liver
  , sf.cardiovascular
  , sf.cns
  , sf.renal
  , sf.respiration_24hours
  , sf.coagulation_24hours
  , sf.liver_24hours
  , sf.cardiovascular_24hours
  , sf.cns_24hours
  , sf.renal_24hours
  , sf.sofa_24hours
FROM pivoted_sofa sf
WHERE sf.icustay_id in (""" + stay_list + """)
ORDER BY sf.icustay_id, sf.starttime
"""

    scores = q(sofa_sql)
    return scores.drop_duplicates()

def get_blood_labs(icuadms):
    """Fetch all columns of the ``pivoted_bg`` view for the given ICU stays.

    ``icuadms`` is an iterable of ICU stay IDs. Rows are ordered by ICU
    stay and chart time, duplicates are dropped.
    """
    stay_list = ','.join(map("'{0}'".format, icuadms))

    blood_gas_sql = """
SELECT *
FROM pivoted_bg bg
WHERE bg.icustay_id in (""" + stay_list + """)
ORDER BY bg.icustay_id, bg.charttime
"""

    blood_gases = q(blood_gas_sql)
    return blood_gases.drop_duplicates()

def sepsis_status():
    """Return a sepsis flag per ICU stay based on ICD-9 diagnosis codes.

    This is the modified version of the Martin et al. query that only
    reports sepsis, not organ failure: the ``co_proc`` CTE is still built
    and joined, but none of its columns are selected. The materialized
    view statements are left commented out so the query runs as a plain
    SELECT.

    Returns:
        pandas.DataFrame with columns ``subject``, ``hadm``, ``icustay`` and
        ``sepsis``. ``sepsis`` is 1 or 0, or NULL for admissions that have
        no rows in DIAGNOSES_ICD (left join).
    """
    # Fix: the GROUP BY of co_dx previously read `hadm_idfrom`, which is not
    # a column and made the whole query fail.
    query = """
-- ICD-9 codes for sepsis as validated by Martin et al.

-- Greg S. Martin, David M. Mannino, Stephanie Eaton, and Marc Moss. The epidemiology of
-- sepsis in the united states from 1979 through 2000. N Engl J Med, 348(16):1546–1554, Apr
-- 2003. doi: 10.1056/NEJMoa022139. URL http://dx.doi.org/10.1056/NEJMoa022139.


-- DROP MATERIALIZED VIEW IF EXISTS martin_sepsis CASCADE;
-- CREATE MATERIALIZED VIEW martin_sepsis as
WITH co_dx AS
(
  SELECT subject_id, hadm_id
  , MAX(
      CASE
        -- septicemia
        WHEN substring(icd9_code,1,3) = '038' THEN 1
        -- septicemic, bacteremia, disseminated fungal infection, disseminated candida infection
        -- NOTE: the paper specifies 020.0 ... but this is bubonic plague
        -- presumably, they meant 020.2, which is septicemic plague
        WHEN substring(icd9_code,1,4) in ('0202','7907','1179','1125') THEN 1
        -- disseminated fungal endocarditis
        WHEN substring(icd9_code,1,5) = '11281' THEN 1
      ELSE 0 END
    ) AS sepsis
  FROM diagnoses_icd
  GROUP BY subject_id, hadm_id
)
-- procedure codes:
-- "96.7 - Ventilator management"
-- translated:
--    9670	Continuous invasive mechanical ventilation of unspecified duration
--    9671	Continuous invasive mechanical ventilation for less than 96 consecutive hours
--    9672	Continuous invasive mechanical ventilation for 96 consecutive hours or more
-- "39.95 - Hemodialysis"
--    3995	Hemodialysis
-- "89.14 - Electroencephalography"
--    8914	Electroencephalogram
, co_proc as
(
  SELECT subject_id, hadm_id
  , MAX(CASE WHEN substring(icd9_code,1,3) = '967' then 1 ELSE 0 END) as respiratory
  , MAX(CASE WHEN substring(icd9_code,1,4) = '3995' then 1 ELSE 0 END) as renal
  , MAX(CASE WHEN substring(icd9_code,1,4) = '8914' then 1 ELSE 0 END) as neurologic
  FROM procedures_icd
  GROUP BY subject_id, hadm_id
)
select adm.subject_id as subject
, adm.hadm_id as hadm
, ie.icustay_id as icustay
, co_dx.sepsis
FROM admissions adm
INNER JOIN icustays ie
ON adm.hadm_id = ie.hadm_id
left join co_dx
  on adm.hadm_id = co_dx.hadm_id
left join co_proc
  on adm.hadm_id = co_proc.hadm_id;
"""

    return q(query)

## Define Parameters of Interest and Cohort Inclusion/Exclusion Criteria

Here I define the parameters I am looking for in the MIMIC database. These will be translated into item ID codes, which will then be incorporated into the query.

The inclusion and exclusion criteria for the cohort are also defined. The unit of analysis is the ICU stay, so one individual may appear multiple times in the data. Inclusion criteria for the cohort are: presence in the MetaVision data table, age greater than or equal to 18, an ICU length of stay greater than or equal to 1 day, and at least one glucose measurement. This block only defines these functions; it does not call them.

In [9]:
def getParamLists():
    """Return the label lists that define the items extracted from MIMIC.

    Returns a 12-tuple, in this order: insulin inputs, nutrition inputs,
    enteral inputs, other inputs, glucose chart labels, patient chart
    labels, glucose lab labels, other chart labels, other lab labels,
    parenteral labels, enteral labels, and feeding labels (a numpy array
    holding the parenteral labels followed by the enteral labels). The
    enteral inputs and enteral labels hold the same names as two separate
    lists.
    """
    # Enteral feeding products; shared by enteral_inputs and enteral_labels
    enteral_formulas = [
        'Replete with Fiber (Full)', 'Nutren Renal (Full)',
        'Impact with Fiber (1/4)', 'Boost Glucose Control (Full)',
        'Beneprotein', 'Nutren 2.0 (Full)', 'Fibersource HN (Full)',
        'NovaSource Renal (Full)', 'Nutren Pulmonary (Full)',
        'Peptamen 1.5 (Full)', 'ProBalance (Full)', 'Impact (Full)',
        'Isosource 1.5 (Full)', 'Vivonex (Full)',
        'Peptamen Bariatric (Full)', 'Replete (Full)', 'Pulmocare (Full)',
        'Impact with Fiber (Full)', 'Replete with Fiber (1/2)',
        'Two Cal HN (Full)', 'Glucerna (Full)', 'Two Cal HN (1/2)',
        'Nepro (Full)', 'Nutren 2.0 (1/4)', 'Nutren 2.0 (1/2)',
        'Replete with Fiber (1/4)', 'Ensure (Full)', 'Replete (1/2)',
        'Impact (1/4)', 'Replete (3/4)', 'Replete with Fiber (3/4)',
        'Fibersource HN (1/2)', 'Peptamen 1.5 (1/2)',
        'Fibersource HN (1/4)', 'Peptamen 1.5 (1/4)',
        'Nutren Pulmonary (1/2)', 'Ensure Plus (Full)', 'Ensure (1/4)',
        'Boost Glucose Control (1/4)', 'Nutren Renal (1/2)',
        'Nutren Renal (3/4)', 'Replete (1/4)', 'Nutren Renal (1/4)',
        'Impact with Fiber (1/2)', 'Impact (1/2)', 'Vivonex (3/4)',
        'Nutren 2.0 (3/4)', 'Peptamen 1.5 (3/4)', 'Impact (3/4)',
        'Isosource 1.5 (1/2)', 'Isosource 1.5 (3/4)',
        'NovaSource Renal (3/4)', 'Pulmocare (1/4)',
        'NovaSource Renal (1/2)', 'Boost Glucose Control (3/4)',
        'Nepro (1/2)', 'Impact with Fiber (3/4)', 'Fibersource HN (3/4)',
        'Replete (2/3)', 'Boost Glucose Control (1/2)', 'Vivonex (1/2)',
        'Nutren 2.0 (2/3)', 'Nutren Pulmonary (3/4)',
        'Isosource 1.5 (1/4)', 'Replete with Fiber (2/3)', 'Impact (2/3)',
        'Glucerna (1/2)', 'Ensure (3/4)', 'Isosource 1.5 (2/3)',
        'Peptamen VHP (Full)',
    ]

    # --- Inputs ---
    insulin_inputs = [
        'Insulin - Regular', 'Insulin - 70/30', 'Insulin - Humalog',
        'Insulin - Humalog 75/25', 'Insulin - NPH', 'Insulin - Glargine',
    ]
    # The 'Boost Glucose Control' products are listed with the enteral
    # formulas rather than here.
    nutrition_inputs = [
        'Dextrose 5%', 'Dextrose 10%', 'Dextrose 20%', 'Dextrose 50%',
        'Dextrose PN', 'TPN w/ Lipids', 'TPN without Lipids',
        'Peripheral Parenteral Nutrition',
    ]
    enteral_inputs = list(enteral_formulas)
    # (vasopressors, betablockers, hyperkalemia treatment)
    other_inputs = [
        'Epinephrine', 'Norepinephrine', 'Phenylephrine',
        'Metoprolol', 'Labetalol', 'Esmolol', 'Calcium Gluconate',
    ]

    # --- Charts ---
    glucose_charts = [
        'Glucose finger stick', 'Glucose (whole blood)', 'Glucose (serum)',
        'Fingerstick Glucose', 'Glucose (70-105)', 'Glucose',
        'Blood Glucose', 'BloodGlucose',
    ]
    patient_charts = ["Admission Weight (Kg)", "Height"]
    other_charts = [
        'Heart Rate', 'Respiratory Rate', 'O2 Saturation Pulseoxymetry',
        'PH (Arterial)', 'Temperature Fahrenheit',
        'Arterial Blood Pressure systolic',
        'Arterial Blood Pressure diastolic',
        'Arterial Blood Pressure mean', 'Non Invasive Blood Pressure mean',
        'Admission Weight (Kg)', 'Height (cm)',
        'Non Invasive Blood Pressure systolic',
        'Non Invasive Blood Pressure diastolic', 'Potassium (serum)',
    ]

    # --- Labs ---
    glucose_labs = ['Glucose', '% Hemoglobin A1c']
    other_labs = ['White Blood Cells', 'Lactate Dehydrogenase (LD)', 'Lactate', 'Cortisol']

    # --- Feeding labels ---
    parenteral_labels = ['TPN w/ Lipids', 'TPN without Lipids', 'Peripheral Parenteral Nutrition']
    enteral_labels = list(enteral_formulas)
    feeding_labels = np.concatenate([parenteral_labels, enteral_labels])

    return (insulin_inputs, nutrition_inputs, enteral_inputs, other_inputs,
            glucose_charts, patient_charts, glucose_labs, other_charts,
            other_labs, parenteral_labels, enteral_labels, feeding_labels)


def getIDs(labels, labs=False):
    """Return the item IDs matching ``labels``.

    Looks the labels up in D_LABITEMS when ``labs`` is True and in D_ITEMS
    otherwise, and returns the ``itemid`` column as a list.
    """
    find_items = d_labinputs_labels if labs == True else d_inputs_labels
    matches = find_items(labels)
    return matches["itemid"].tolist()

def getLabels(ids, units=False, labs=False):
    """Map item IDs to their labels (and optionally their units).

    With ``labs`` True the IDs are looked up in D_LABITEMS and only the
    ``{itemid: label}`` dict is returned; ``units`` is ignored in that case.
    Otherwise the IDs are looked up in D_ITEMS and the result is either the
    label dict alone or, when ``units`` is True, the tuple
    ``(label dict, {itemid: unitname})``.
    """
    if labs == True:
        lab_lookup = d_labinputs_IDs(ids).set_index("itemid")
        return {item: lab_lookup.loc[item, "label"] for item in lab_lookup.index}

    item_lookup = d_inputs_IDs(ids).set_index("itemid")
    labels_by_id = {item: item_lookup.loc[item, "label"] for item in item_lookup.index}
    if units == True:
        units_by_id = {item: item_lookup.loc[item, "unitname"] for item in item_lookup.index}
        return labels_by_id, units_by_id
    return labels_by_id
    
def cohort_size(df, column="icustay"):
    """Return the number of distinct values in ``df[column]`` (unique stays by default)."""
    distinct_stays = df[column].unique()
    return len(distinct_stays)

def _report_exclusion(df, stay_col, size_before, *description):
    """Print how many stays an exclusion step removed and return the new size."""
    # Same count as cohort_size(df, stay_col): number of distinct stay IDs.
    size_after = len(df[stay_col].unique())
    print("\nExclusion:", *description)
    print("Removed: ", size_before - size_after)
    print("Remaining: ", size_after)
    return size_after


def get_cohort(df_original, input_df, chart_df, lab_df, 
               glucose_charts, glucose_labs, mv_stays, stay_col="icustay", first_stay_only=True, age_low=18, 
               age_high=np.inf, icu_los_low=1, icu_los_high=np.inf, cohort="glucose", sepsis_only=False):
    """Apply the cohort inclusion criteria and return the remaining stay IDs.

    The criteria are applied in order, and the number of stays removed by
    each one is printed: presence in ``mv_stays``, optionally first stay per
    subject, age range, ICU length-of-stay range, presence of glucose
    measurements or insulin inputs (depending on ``cohort``), and optionally
    a sepsis diagnosis.

    Args:
        df_original: admissions DataFrame with at least the columns
            ``subject``, ``age``, ``icu_los`` and ``stay_col``. Not modified.
        input_df: input events (columns ``item`` and ``stay_col``); only
            used when ``cohort == "insulin"``.
        chart_df: chart events (columns ``item`` and ``stay_col``); only
            used when ``cohort == "glucose"``.
        lab_df: lab events (columns ``item`` and ``stay_col``); only used
            when ``cohort == "glucose"``.
        glucose_charts: chart labels that count as glucose measurements.
        glucose_labs: lab labels that count as glucose measurements.
        mv_stays: stay IDs present in the MetaVision input table.
        stay_col: name of the stay ID column.
        first_stay_only: keep only the first stay of each subject.
        age_low, age_high: inclusive age bounds in years.
        icu_los_low, icu_los_high: inclusive ICU length-of-stay bounds in days.
        cohort: ``"glucose"`` or ``"insulin"``; any other value skips the
            measurement/input criterion.
        sepsis_only: keep only stays flagged by ``sepsis_status``.

    Returns:
        numpy array of the stay IDs that satisfy every criterion.
    """
    df = df_original.copy()
    # Sizes are counted on stay_col (previously always the "icustay" column,
    # which broke for any other stay_col).
    size = len(df[stay_col].unique())
    print("Original size:", size)

    # Drop those who are not in the MetaVision table
    df = df[df[stay_col].isin(mv_stays)]
    size = _report_exclusion(df, stay_col, size, "is in the MetaVision data table")

    # Drop visits which are not first
    if first_stay_only:
        # Order stays chronologically when the ICU admission time is available.
        # NOTE(review): falls back to the stay ID as before, but MIMIC stay IDs
        # are, to my knowledge, not assigned in chronological order.
        order_col = "icu_admit" if "icu_admit" in df.columns else stay_col
        df = df.sort_values(["subject", order_col], ascending=[True, True])
        df = df.drop_duplicates(["subject"], keep="first")
        size = _report_exclusion(df, stay_col, size, "first", stay_col, "only. ")

    # Age exclusion
    df = df.loc[(df["age"] >= age_low) & (df["age"] <= age_high)]
    size = _report_exclusion(df, stay_col, size,
                             "Age greater than", age_low, "years and less than", age_high, "years")

    # ICU length exclusion (the message now reports icu_los_high, not age_high)
    df = df.loc[(df["icu_los"] >= icu_los_low) & (df["icu_los"] <= icu_los_high)]
    size = _report_exclusion(df, stay_col, size,
                             "ICU length of stay greater than", icu_los_low,
                             "days and less than", icu_los_high, "days")

    # Has relevant inputs
    if cohort == "glucose":
        glucose_chart_ids = getIDs(glucose_charts)
        glucose_lab_ids = getIDs(glucose_labs, labs=True)
        chart_glc_adms = chart_df[chart_df.item.isin(glucose_chart_ids)][stay_col].unique()
        lab_glc_adms = lab_df[lab_df.item.isin(glucose_lab_ids)][stay_col].unique()
        glc_adms = np.unique(np.concatenate((chart_glc_adms, lab_glc_adms), axis=None))

        df = df[df[stay_col].isin(glc_adms)]
        size = _report_exclusion(df, stay_col, size, "has at least one glucose measurement")

    if cohort == "insulin":
        # Take the insulin labels from getParamLists() instead of relying on
        # a notebook-level global that only exists after a later cell has run.
        insulin_ids = getIDs(getParamLists()[0])
        insulin_adms = input_df[input_df.item.isin(insulin_ids)][stay_col].unique()

        df = df[df[stay_col].isin(insulin_adms)]
        size = _report_exclusion(df, stay_col, size, "has at least one insulin input")

    # Has sepsis
    if sepsis_only:
        sepsis_icuadms = sepsis_status().query("sepsis==1").icustay.tolist()
        df = df[df[stay_col].isin(sepsis_icuadms)]
        size = _report_exclusion(df, stay_col, size, "diagnosed with Sepsis")

    return df[stay_col].values

## Call Parameter Lists

These calls to previously defined functions create lists of IDs that will be used in the query to retrieve only relevant items.

In [10]:
# Covariates of interest, as label lists
(insulin_inputs, nutrition_inputs, enteral_inputs, other_inputs,
 glucose_charts, patient_charts, glucose_labs, other_charts, other_labs,
 parenteral_labels, enteral_labels, feeding_labels) = getParamLists()

# Everything (including vital signs)
all_inputs = np.concatenate((insulin_inputs, nutrition_inputs, enteral_inputs, other_inputs))
all_charts = np.concatenate((glucose_charts, other_charts, patient_charts))
all_labs = np.concatenate((glucose_labs, other_labs))

# Only the items relevant to blood glucose
bg_inputs = np.concatenate((insulin_inputs, nutrition_inputs))
bg_charts = np.concatenate((glucose_charts, patient_charts))
bg_labs = np.concatenate((glucose_labs,))

# Item IDs for inputs (all inputs, and insulin alone)
input_ids = getIDs(all_inputs)
insulin_ids = getIDs(insulin_inputs)
# Item IDs for the blood-glucose charts
chart_ids = getIDs(bg_charts)
# Item IDs for the blood-glucose labs
lab_ids = getIDs(bg_labs, labs=True)

# Item IDs for the glucose measurements themselves
glucose_chart_ids = getIDs(glucose_charts)
glucose_lab_ids = getIDs(glucose_labs, labs=True)

# ID -> label dictionaries for inputs, charts and labs (units are not requested)
input_labels = getLabels(input_ids)
chart_labels = getLabels(chart_ids)
lab_labels = getLabels(lab_ids, labs=True)

## Retrieve Unfiltered Tables and Define Cohort

The first step is to define the cohort, but to do so we need to know information about each ICU stay. This first call defines a cohort by retrieving a subset of columns for all records in each table. Once the cohort is created, these stay IDs can be used in later calls. The number of ICU stays dropped for each exclusion is printed.

In [3]:
# Stage 1: query the unfiltered tables and define the cohort. Results are
# cached in a pickle file; if the cache exists it is loaded instead of
# querying the database again.
print('(1) Query database')
raw_adms_file = './processed_data/raw_adms.pkl'
if os.path.isfile(raw_adms_file): 
    print('(Loading)')
    # NOTE: unpickling can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(raw_adms_file, 'rb') as cache:
        (admissions, icd9, all_inputs, inputs, cohort_admissions, charts_glc, labs_glc) = pickle.load(cache)
else:
    # Create the output folder up front so the query results are not lost
    # to a missing directory when they are pickled at the end.
    os.makedirs(os.path.dirname(raw_adms_file), exist_ok=True)

    admissions = get_admissions()
    print("Got admissions!")
    icd9 = pd.concat([get_comorbidities(), get_procedures()])
    print("Got idc9 codes!")
    all_stays = admissions.icustay.unique()
    inputs = get_inputs(all_stays, input_ids)
    print("Got inputs!")
    charts_glc = get_charts(all_stays, glucose_chart_ids)
    print("Got glucose charts!")
    labs_glc = get_labs(all_stays, glucose_lab_ids)
    print("Got glucose labs!")
    all_inputs_mv = get_all_inputs_mv()
    mv_stays = all_inputs_mv.dropna(subset=['icustay']).icustay.unique()
    print("Got all inputs!")
    cohort_admissions = get_cohort(admissions, input_df=inputs, chart_df=charts_glc, 
           lab_df=labs_glc, glucose_charts=glucose_charts, glucose_labs=glucose_labs, mv_stays= mv_stays, stay_col="icustay", first_stay_only=False, age_low=18, 
           age_high=np.inf, icu_los_low=1, icu_los_high=np.inf, cohort="glucose", sepsis_only=False)
    print("Got Cohort!")

    #add the label data back into inputs
    inputs['label'] = inputs.item.map(input_labels)

    print("Input data labeled")

    # NOTE(review): `all_inputs` here is the array of input labels defined
    # with the parameter lists, not `all_inputs_mv` -- confirm that this is
    # what should be cached.
    with open(raw_adms_file, 'wb') as cache:
        pickle.dump((admissions, icd9, all_inputs, inputs, cohort_admissions, charts_glc, labs_glc), cache)
    print("Pickled!")
    
print("Done")

(1) Query database
(Loading)
Done


## Filter Tables based on the Cohort

Now that the cohort is known, we call all the tables we need, but only retrieve ICU stay IDs that are part of the cohort.

In [4]:
# Stage 2: restrict every table to the cohort's ICU stays. Results are
# cached in a pickle file; if the cache exists it is loaded instead.
print ('(2) Filter tables for icustays')
insulin_adms_file = './processed_data/insulin_adms.pkl'
if os.path.isfile(insulin_adms_file) :
    print('(Loading)')
    # NOTE: unpickling can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(insulin_adms_file, 'rb') as cache:
        (admissions, inputs, charts, labs, icd9, sofas, blood_labs) = pickle.load(cache)
else:
    # Create the output folder up front so the query results are not lost
    # to a missing directory when they are pickled at the end.
    os.makedirs(os.path.dirname(insulin_adms_file), exist_ok=True)

    admissions = admissions[admissions.icustay.isin(cohort_admissions)]
    print("Got admissions")
    inputs = inputs[inputs.icustay.isin(cohort_admissions)]
    print("Got inputs")
    charts = get_charts(cohort_admissions, chart_ids)
    print("Got charts")
    labs = get_labs(cohort_admissions, lab_ids)
    print("Got labs")
    icd9 = icd9[icd9.icustay.isin(cohort_admissions)]
    print("Got icd9s")
    sofas = get_sofa(cohort_admissions)
    print("Got sofas")
    blood_labs= get_blood_labs(cohort_admissions)
    print("Got blood labs")

    #add the label data back into charts
    charts['label'] = charts.item.map(chart_labels)
    #add the label data back into labs
    labs['label'] = labs.item.map(lab_labels)
    print("Chart data labeled")

    with open(insulin_adms_file, 'wb') as cache:
        pickle.dump((admissions, inputs, charts, labs, icd9, sofas, blood_labs), cache)
    print("Pickled!")
    
print("Done")

(2) Filter tables for icustays
(Loading)
Done


## Get Diabetes Status from Notes

There is no definition of diabetic status in MIMIC. However, this can be inferred from doctors' notes. Here the notes are retrieved and a dummy variable for diabetic status is created for every member of the cohort.

In [5]:
# Stage 3: derive pre-existing diabetes status and glycemic-control remarks
# from the clinical notes. Results are cached in a pickle file; if the cache
# exists it is loaded instead.
print('(3) Get diabetic status from notes')
ins_notes_file = './processed_data/ins_notes.pkl'
if os.path.isfile(ins_notes_file) :
    print ('(Loading)')
    # NOTE: unpickling can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(ins_notes_file, 'rb') as cache:
        (prior_dm, gcnotes) = pickle.load(cache)
else:
    # Create the output folder up front so the results are not lost to a
    # missing directory when they are pickled at the end.
    os.makedirs(os.path.dirname(ins_notes_file), exist_ok=True)

    # Get pre-existing diabetes, glycemic control from notes:
    notes=get_notes(cohort_admissions)
    print("Got Notes")

    # Split the notes by stay once instead of rescanning the whole table for
    # every stay; row order within each stay is preserved.
    notes_by_stay = {stay: stay_notes for stay, stay_notes in notes.groupby('icustay', sort=False)}
    no_notes = notes.iloc[0:0]

    dm_terms = ["type ii", "type 2","insulin dependent", "diabetes mellitus",  'dm2', 'diabetes', 'diab', 'diabetic']
    prior_dm = {}
    for stay in cohort_admissions:
        stay_notes = notes_by_stay.get(stay, no_notes)
        summary = stay_notes[stay_notes.category=="Discharge summary"].head(1)
        if len(summary) > 0:
            # Only the text before the 'social history' section is searched.
            history = summary.text.item().lower().split('social history')[0]
            pre_existing = (any(term in history for term in dm_terms)
                            and ('insipidus' not in history))
            prior_dm[stay] = int(pre_existing)
    print("Processed glucose notes")

    gcnotes = {}
    for stay in cohort_admissions:
        entry = []
        for _, row in notes_by_stay.get(stay, no_notes).iterrows():
            # Keep the remark that follows 'glycemic control:' on the first
            # line mentioning it ('hypoglycem' could be searched the same way).
            hits = [line for line in row.text.lower().split('\n') if 'glycemic control' in line]
            if hits:
                txt = hits[0].split('glycemic control:')[-1].lstrip(' ')
            else:
                txt = ''
            entry.append([row.note_date, txt])
        gcnotes[stay] = entry
    print ("Processed glycemic control notes")

    with open(ins_notes_file, 'wb') as cache:
        pickle.dump((prior_dm, gcnotes), cache)
    print("Pickled notes!")

print("Done")

(3) Get diabetic status from notes
(Loading)
Done


## Add Diabetic Status to Dataframes

Here we write the diabetic status we found from our notes back into the tables. The final dataframes are pickled.

In [6]:
# Stage 4: write the diabetic status found in the notes back into the
# admissions table and re-pickle the final dataframes.
# print ('(4) Add diabetic status to tables')
final_adms_file = './processed_data/insulin_adms.pkl'
if os.path.isfile(final_adms_file) :
    print('(Loading)')
    # NOTE: unpickling can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(final_adms_file, 'rb') as cache:
        (admissions, inputs, charts, labs, icd9, sofa, blood_labs) = pickle.load(cache)

    # Adding column indicating previous diabetes diagnosis: the flag from
    # prior_dm for cohort stays that have one, 0 for every other stay.
    known_status = admissions.icustay.map(prior_dm)
    in_cohort = admissions.icustay.isin(cohort_admissions)
    admissions['diabetic'] = known_status.where(in_cohort).fillna(0).astype(int)
    print("Table updated")

    with open(final_adms_file, 'wb') as cache:
        pickle.dump((admissions, inputs, charts, labs, icd9, sofa, blood_labs), cache)
    print("Pickled!")
    print("Done")
else:
    print("Go back to stage 2!")


(Loading)
Table updated
Pickled!
Done
