In [None]:
# Import libraries
from __future__ import print_function


import numpy as np
import pandas as pd
import psycopg2
import os
import getpass

from collections import OrderedDict

# path to output data files to - can be relative to the current folder
data_path = 'data'

In [None]:
# Show the installed psycopg2 version string as the cell output.
psycopg2.__version__

In [None]:
# colours for prettier plots
import matplotlib
import matplotlib.pyplot as plt
import colorsys
def gg_color_hue(n):
    """Return `n` RGB colours with evenly spaced hues.

    Hues start at 15 degrees and are spread over one full turn of the colour
    wheel; saturation is fixed at 0.5 and value at 0.8. Each HSV triple is
    converted with ``colorsys.hsv_to_rgb``.

    Parameters
    ----------
    n : int
        Number of colours to generate.

    Returns
    -------
    list of tuple
        `n` ``(r, g, b)`` tuples.
    """
    # endpoint=False: 375 degrees is the same hue as 15 degrees, so including
    # the endpoint would make the first and last colours identical.
    hues = np.linspace(15, 375, n, endpoint=False)
    # Return a real list: on Python 3 `map` is a one-shot iterator that cannot
    # be indexed, measured with len() or iterated twice.
    return [colorsys.hsv_to_rgb(hue / 360.0, 0.5, 0.8) for hue in hues]

# marker codes available for plots
marker = ['v','o','d','^','s','o','+']
# line styles available for plots
# NOTE(review): 's' is a marker code rather than a line style - looks like a typo; confirm the intended entry
ls = ['-','-','-','-','-','s','--','--']

# plot settings
# render figures inline in the notebook, use the ggplot style sheet and a larger default font
%matplotlib inline
plt.style.use('ggplot')
font = {'size'   : 20}
matplotlib.rc('font', **font)

In [None]:
# helper function for generating tables
# this rolls back the cursor if it fails
def execute_query_safely(sql, con):
    """Execute `sql` on `con`, rolling the transaction back if it fails.

    Parameters
    ----------
    sql : str
        SQL text handed to ``cursor.execute``.
    con : DB-API connection
        Must provide ``cursor()`` and ``rollback()`` (a psycopg2 connection
        in this notebook).

    Returns
    -------
    None

    Raises
    ------
    BaseException
        Whatever ``cursor.execute`` raised, re-raised unchanged after the
        rollback.
    """
    cur = con.cursor()

    try:
        cur.execute(sql)
    except BaseException:
        # Roll back through the connection instead of issuing 'rollback;' on
        # the cursor that just failed: a second error raised by that cursor
        # would hide the original one. BaseException keeps the original
        # bare-except behaviour, so an interrupted query is rolled back too.
        con.rollback()
        raise
    finally:
        # release the cursor in every case; the connection itself stays open
        cur.close()

In [None]:
# database and schema holding the MIMIC tables
dbname = 'mimic'
schema_name = 'mimiciii'

# statement prefixed to the queries below so that unqualified table names
# are looked up in `public` and in the MIMIC schema
query_schema = 'SET search_path to public,{};'.format(schema_name)

# the database user is the login name returned by getpass
sqluser = getpass.getuser()
print('Using username {}'.format(sqluser))

In [None]:
# Connect to local postgres version of mimic
con = psycopg2.connect(dbname=dbname, user=sqluser)

# `server_version` is a single integer (e.g. 90503 for 9.5.3). Split it with
# integer arithmetic: on Python 3 `/` is true division, which printed
# fractional parts such as "9.5.03.3".
print('Connected to postgres {}.{}.{}!'.format(con.server_version // 10000,
                                              (con.server_version % 10000) // 100,
                                              con.server_version % 100))

In [None]:
# check if the `sepsis3` table exists ... if not we must generate it
query = """
SELECT EXISTS(SELECT 1 FROM information_schema.tables 
              WHERE table_catalog = '{}'
              AND table_schema in ('public','{}')
              AND table_name = 'sepsis3');
""".format(dbname, schema_name)
tbl_exists = pd.read_sql_query(query, con)
# the query returns a single row with a single boolean column named `exists`
tbl_exists = tbl_exists.loc[0,'exists']
if tbl_exists:
    print('Found the `sepsis3` table. Skipping generation of data in SQL.')
else:
    print('Running SQL code to generate tables. This may take some time.')

    # read through the "make-tables.sql" file in the query subfolder
    query_path = 'query'

    with open(os.path.join(query_path, 'make-tables.sql'), 'r') as fp:
        for line in fp:
            # Only include directives ("\i <file>") trigger a query; every
            # other line (blank lines, comments, ...) is echoed unchanged.
            # The backslash is escaped because '\i' is an invalid escape
            # sequence and raises a warning on modern Python.
            if not line.startswith('\\i'):
                print(line, end='')
                continue

            # lines which begin with '\i' call SQL files that generate tables;
            # strip() also removes a trailing '\r' or blanks after the file name
            query_file = os.path.join(query_path, line[3:].strip())
            print('Running {} ...'.format(query_file), end=' ')
            with open(query_file, 'r') as fp_query:
                query = fp_query.read()
            execute_query_safely(query_schema + query, con)
            print('done.')
    # NOTE(review): nothing in this cell commits the transaction, so the
    # generated tables may only last for this connection unless the SQL
    # files commit themselves - confirm.

### Extract the text

In [None]:
# query = pd.read_sql(
#     """
#     SELECT hadm_id, subject_id, chartdate, category, description, text
#     FROM noteevents;
#     """, con)



# tex = pd.read_sql_query(query,con)
# tex

In [None]:
# Load every column and row of `noteevents`, ordered by subject_id.
# NOTE(review): the whole table is pulled into memory here and only reduced to
# sepsis-3 admissions in a later cell - consider filtering in SQL if memory is a concern.
query = query_schema + """
SELECT *
FROM noteevents
ORDER BY subject_id"""
tex = pd.read_sql_query(query,con)
tex
# com.to_csv(os.path.join(data_path, 'diagnoses.csv'),sep=',',index=False)

In [None]:
# ## generate vasopressor doses

# # read through the "make-tables-vasopressor_doses.sql" file in the sql subfolder
# query_path = 'query'

# with open(os.path.join(query_path, 'make-tables-vasopressor_doses.sql'), 'r') as fp:
#     for line in fp.readlines():
#         if len(line)<2:
#             print(line,end='')
#             continue
        
#         if line[0:2] != '\i':
#             print(line,end='')
#             continue
            
#         # lines which begin with '\i' call SQL files that generate tables
#         query_file = os.path.join(query_path, line[3:].replace('\n',''))
#         print('Running {} ...'.format(query_file), end=' ')
#         with open(query_file, 'r') as fp_query:
#             query = ''.join(fp_query.readlines())
#         execute_query_safely(query_schema + query, con)
#         print('done.')

In [None]:
## aggregated to ts 
## labs_first_day, fixed
## urine_output, fixed
## vitals_first_day, fixed

## already ts
## blood_gas_first_day
## blood_gas_first_day_arterial

## vasopressors only have duration in hours, we created new tables to extract the dosage, but it needs preprocessing


## static:
## comorbidities elix
## demographics

# query = query_schema + "select * from sepsis3_cohort"
# co = pd.read_sql_query(query,con)
# co 

In [None]:
# cohort criteria (stays that fail one of these are flagged for exclusion):
#   - less than 16 years old
#   - never have any chartevents data (i.e. likely administrative error)
#   - not cardiac surgery
#   - suspected of infection
#   - first ICU stay
#   - not a CareVue patient (i.e. admitted 2008-2012)
# these exclusion criteria are created in the sepsis3_cohort table
# load the full cohort table; the `exclusion_*` columns are used by the cells below
query = query_schema + "select * from sepsis3_cohort"
co = pd.read_sql_query(query,con)

In [None]:
# Display the cohort table as the cell output.
co

# Exclusions - applied independently

In [None]:
# print out the exclusions
print('Cohort - initial size: {} ICU stays'.format(co.shape[0]))
# idxRem[i] becomes 1 once stay i is hit by any `exclusion_*` flag
idxRem = np.zeros(co.shape[0])
for c in co.columns:
    if c.startswith('exclusion_'):
        # each flag is reported on its own, so a stay can be counted under several flags
        print('  {:5g} ({:2.2f}%) - {}'.format(np.sum(co[c]),np.mean(co[c])*100.0, c))
        idxRem[co[c].values==1] = 1

# idxRem is a float array, so cast the remaining count to int; otherwise it
# prints as e.g. "11791.0 ICU stays", unlike the other summaries
print('Final cohort size: {} ICU stays ({:2.2f}%).'.format(int(co.shape[0] - np.sum(idxRem)), (1-np.mean(idxRem))*100.0))

# Exclusions - applied sequentially

In [None]:
# print out the exclusions *SEQUENTIALLY* - i.e. if already excluded, don't re-print
print('Cohort - initial size: {} ICU stays'.format(co.shape[0]))

# exclusion flags, in the order in which they are applied
COL_REM = ['exclusion_nonadult', 'exclusion_secondarystay',
           'exclusion_csurg','exclusion_carevue',
           'exclusion_early_suspicion', 'exclusion_late_suspicion', 'exclusion_bad_data']

# idxRem[i] is 1 once stay i has been removed by an earlier flag
idxRem = np.zeros(co.shape[0])
for c in COL_REM:
    # count only stays that are newly removed by this flag
    N_REM = np.sum( (idxRem == 0) & (co[c].values==1) )
    print('  {:5g} ({:2.2f}%) - {}'.format(N_REM,N_REM*100.0/co.shape[0], c))
    idxRem[co[c].values==1] = 1
        
print('Final non-suspected cohort size: {:g} ICU stays ({:2.2f}%).'.format(co.shape[0] - np.sum(idxRem), (1-np.mean(idxRem))*100.0))

print()
# define idxRem to exclude non-metavision admissions initially
print('=============================')
print('====== METAVISION ONLY ======')
print(' Patients admitted 2008-2012 ')
print('=============================')
print()

# from here on idxRem is a boolean pandas Series (True = removed), not a float array
idxRem = co['dbsource']!='metavision'
# N: metavision stays, used as the denominator below; N_IGNORE: all other stays
N = np.sum(~idxRem)
N_IGNORE = np.sum(idxRem)

# print out the exclusions *SEQUENTIALLY* - i.e. if already excluded, don't re-print
print('Cohort - initial size: {} ICU stays'.format(N))

for c in COL_REM:
    # same sequential count as above, with percentages relative to the metavision stays
    N_REM = np.sum( (idxRem == 0) & (co[c].values==1) )
    print('  {:5g} ({:2.2f}%) - {}'.format(N_REM,N_REM*100.0/N, c))
    idxRem[co[c].values==1] = True
        
print('Final non-suspected cohort size: {}/{} ICU stays ({:2.2f}%).'.format(
        np.sum(~idxRem), N, np.sum(~idxRem)*100.0/N))
print('')

# among the stays that remain, report how many have the suspected-infection flag set
print('Of these patients...')
c='suspected_of_infection_poe'
N_REM = np.sum( (~idxRem) & (co[c].values==1) )
print('  {:5g} ({:2.2f}%) - {}'.format(N_REM, N_REM*100.0/ np.sum(~idxRem), c))

# Histogram of time of suspected infection

In [None]:
# generate exclusions *except* early/late suspicion
COL_REM = ['exclusion_nonadult', 'exclusion_secondarystay',
           'exclusion_csurg','exclusion_carevue',
           #'exclusion_early_suspicion', 'exclusion_late_suspicion',
           'exclusion_bad_data']
idxRem = np.zeros(co.shape[0])
for c in COL_REM:
    idxRem[co[c].values==1] = 1

# boolean mask: True = stay removed by one of the flags above
idxRem = idxRem.astype(bool)

# distribution of time of infection
N_HR = 96
# one-hour bins from -N_HR to +N_HR
xi = np.linspace(-N_HR, N_HR, N_HR*2+1)
col = gg_color_hue(5)

pretty_dict = {'suspected_infection_time_poe_days': 'Blood culture + ABX'}
c = 'suspected_infection_time_poe_days'

# keep stays that are not excluded and have a suspicion time; convert days to hours
idxKeep = ~co[c].isnull()
tmp = co.loc[~idxRem & idxKeep, c].values * 24.0
# `tmp` holds exactly the rows selected by (~idxRem & idxKeep)
n_grp = tmp.size

# Use inclusive bounds so the count matches both the "[-24,24]" in the title
# and the "+-24hr" share in the legend (the count was previously strict).
N_firstday = np.sum( (tmp>=-24) & (tmp<=24) )

lbl_str = pretty_dict[c]
lbl_str += '\n' + '{:2.2f}% of grp >  24hr.'.format(np.sum(tmp>24)*100.0 / n_grp)
lbl_str += '\n' + '{:2.2f}% of grp  +-24hr.'.format(N_firstday*100.0 / n_grp)
lbl_str += '\n' + '{:2.2f}% of grp < -24hr.'.format(np.sum(tmp<-24)*100.0 / n_grp)

plt.figure(figsize=[14,6])
plt.hist( tmp, bins=xi, label=lbl_str)
# the title percentage is relative to all non-excluded stays, including
# those without a suspicion time
plt.title('{} ({:2.2f}%) in [-24,24]. '.format(
        N_firstday, N_firstday*100.0 / np.sum(~idxRem)))
plt.legend(loc='upper left')

plt.show()

We can see that most patients are suspected of infection either before, or at the time of their ICU admission. This motivates the decision to evaluate the performance of the scores at ICU admission.

# Load final dataset from Postgres

In [None]:
# load in final dataset - note we apply the exclusion criteria with excluded=0
query = query_schema + "select * from sepsis3 where excluded = 0"

# `df` is reused by the cells below (derived columns, sepsis-3 subset, CSV export)
df = pd.read_sql_query(query,con)
df

In [None]:
# Display the `mort_icu` column.
df.mort_icu

We have: ICU intime/outtime, suspected infection time, whether the microbiology culture was positive, some demographics, comorbidities, outcomes, and the severity scores. 

The severity scores are extracted over a [0, 24] hour window relative to ICU admission, except that labs use an extended [-6, 24] hour window (i.e. 'sofa' is extracted in this way).

## Add in useful variables

In [None]:
# composite outcome: hospital_expire_flag equal to 1, or icu_los of at least 3
df['composite_outcome'] = (
    df['hospital_expire_flag'].eq(1) | df['icu_los'].ge(3)
).astype(int)

# display label for the suspicion definition
labels = OrderedDict([('suspicion_poe', 'BC + ABX (Prescribed)')])

# boolean indicators: True where the underlying time column is populated
df['blood culture'] = df['blood_culture_time'].notnull()
df['suspicion_poe'] = df['suspected_infection_time_poe_days'].notnull()
df['abx_poe'] = df['antibiotic_time_poe'].notnull()

# sepsis-3 here means: suspicion flag set and SOFA of at least 2
sofa_at_least_2 = df['sofa'].ge(2)
df['sepsis-3'] = (df['suspicion_poe'].eq(1) & sofa_at_least_2).astype(int)
df['sofa>=2'] = sofa_at_least_2.astype(int)

# Save the data to file

The dataframes will be loaded directly from a file, rather than the database.


In [None]:
# List the column names of `df`.
df.columns

In [None]:
# Preview the first rows of `df`.
df.head()

We will keep the patients that satisfy the sepsis3 criteria and use the icustay_ids to extract the relevant data from the rest of the tables.


In [None]:
# rows of `df` flagged as sepsis-3, ordered by hadm_id and then intime
sepsis3 = df.loc[df['sepsis-3'].eq(1)].sort_values(['hadm_id', 'intime'])

#sepsis3.to_csv(os.path.join(data_path, 'sepsis-df-3.csv'),sep=',',index=False)

sepsis3

In [None]:
# Display the sepsis-3 rows that also have `sepsis_angus` equal to 1, ordered by hadm_id.
sepsis3[sepsis3['sepsis_angus'] == 1].sort_values(['hadm_id'])


In [None]:
# List the column names of the notes table.
tex.columns

In [None]:

# order the notes by hadm_id and keep only admissions present in `sepsis3`
tex = tex.sort_values(['hadm_id'])
tex = tex.loc[tex['hadm_id'].isin(sepsis3['hadm_id'])]

# write the filtered notes to the data folder
tex.to_csv(os.path.join(data_path, 'text-df-3.csv'), sep=',', index=False)


In [None]:
## urine output: read `urine_output_first_day_ts`, keep sepsis-3 ICU stays, write CSV
query = query_schema + "select * from urine_output_first_day_ts"
uo = pd.read_sql_query(query, con).sort_values(['icustay_id', 'charttime'])
uo = uo.loc[uo['icustay_id'].isin(sepsis3['icustay_id'])]
uo.to_csv(os.path.join(data_path, 'urine-output-df-3.csv'), sep=',', index=False)

In [None]:
## vitals: read `vitals_first_day_ts`, keep sepsis-3 ICU stays, write CSV
query = query_schema + "select * from vitals_first_day_ts"
vt = pd.read_sql_query(query, con).sort_values(['icustay_id', 'charttime'])
vt = vt.loc[vt['icustay_id'].isin(sepsis3['icustay_id'])]
vt.to_csv(os.path.join(data_path, 'vitals-df-3.csv'), sep=',', index=False)

In [None]:
## labs: read `labs_first_day_ts`, keep sepsis-3 ICU stays, write CSV
query = query_schema + "select * from labs_first_day_ts"
lb = pd.read_sql_query(query, con).sort_values(['icustay_id', 'charttime'])
lb = lb.loc[lb['icustay_id'].isin(sepsis3['icustay_id'])]
lb.to_csv(os.path.join(data_path, 'labs-df-3.csv'), sep=',', index=False)

In [None]:
## blood gas: read `blood_gas_first_day`, keep sepsis-3 ICU stays, write CSV
query = query_schema + "select * from blood_gas_first_day"
bg = pd.read_sql_query(query, con).sort_values(['icustay_id', 'charttime'])
bg = bg.loc[bg['icustay_id'].isin(sepsis3['icustay_id'])]
bg.to_csv(os.path.join(data_path, 'blood-gas-df-3.csv'), sep=',', index=False)

In [None]:
## arterial blood gas: read `blood_gas_first_day_arterial`, keep sepsis-3 ICU stays, write CSV
query = query_schema + "select * from blood_gas_first_day_arterial"
bga = pd.read_sql_query(query, con).sort_values(['icustay_id', 'charttime'])
bga = bga.loc[bga['icustay_id'].isin(sepsis3['icustay_id'])]
bga.to_csv(os.path.join(data_path, 'blood-gas-arterial-df-3.csv'), sep=',', index=False)

In [None]:
## ventilation: read `ventilation_first_day`, keep sepsis-3 ICU stays, write CSV
query = query_schema + "select * from ventilation_first_day"
vent = pd.read_sql_query(query, con).sort_values(['icustay_id'])
vent = vent.loc[vent['icustay_id'].isin(sepsis3['icustay_id'])]
vent.to_csv(os.path.join(data_path, 'ventilation-df-3.csv'), sep=',', index=False)

In [None]:
## gcs: read `gcs_first_day`, keep sepsis-3 ICU stays, write CSV
query = query_schema + "select * from gcs_first_day"
gcs = pd.read_sql_query(query, con).sort_values(['icustay_id'])
gcs = gcs.loc[gcs['icustay_id'].isin(sepsis3['icustay_id'])]
gcs.to_csv(os.path.join(data_path, 'gcs-df-3.csv'), sep=',', index=False)

In [None]:
## comorbidities: read `elixhauser_ahrq_v37`, keep sepsis-3 hospital admissions, write CSV
query = query_schema + "select * from elixhauser_ahrq_v37"
cmb = pd.read_sql_query(query, con).sort_values(['hadm_id'])
cmb = cmb.loc[cmb['hadm_id'].isin(sepsis3['hadm_id'])]
cmb.to_csv(os.path.join(data_path, 'comorbidities-df-3.csv'), sep=',', index=False)

In [None]:
# Display the filtered comorbidity table.
cmb

In [None]:
## sofa: read the `sofa` table, keep sepsis-3 ICU stays, write CSV
query = query_schema + "select * from sofa"
sofa = pd.read_sql_query(query, con).sort_values(['icustay_id'])
sofa = sofa.loc[sofa['icustay_id'].isin(sepsis3['icustay_id'])]
sofa.to_csv(os.path.join(data_path, 'sofa-df-3.csv'), sep=',', index=False)

In [None]:
## qsofa: read the `qsofa` table, keep sepsis-3 ICU stays, write CSV
query = query_schema + "select * from qsofa"
qsofa = pd.read_sql_query(query, con).sort_values(['icustay_id'])
qsofa = qsofa.loc[qsofa['icustay_id'].isin(sepsis3['icustay_id'])]
qsofa.to_csv(os.path.join(data_path, 'qsofa-df-3.csv'), sep=',', index=False)

In [None]:
### join vasopressors, extract only sepsis3 patients and keep only the 24first hours of their ICU admission
## extract the per-drug vasopressor dose tables
def _load_vasopressor_dose(table_name):
    """Load one vasopressor dose table restricted to sepsis-3 ICU stays.

    Reads every row of `table_name`, sorts by ``icustay_id``, keeps the stays
    present in ``sepsis3``, resets the index, renames ``vaso_amount`` to
    `table_name` and drops ``vaso_rate``.

    Parameters
    ----------
    table_name : str
        Name of the dose table; also used as the name of the renamed
        ``vaso_amount`` column (e.g. ``'dopamine_dose'``).

    Returns
    -------
    pandas.DataFrame
    """
    dose = pd.read_sql_query(query_schema + "select * from " + table_name, con)
    dose = dose.sort_values(['icustay_id'])
    dose = dose[dose.icustay_id.isin(sepsis3.icustay_id)]
    dose = dose.reset_index(drop=True)
    # each table stores its amount as `vaso_amount`; give it a drug-specific
    # name so the tables can be stacked without mixing drugs in one column
    dose = dose.rename(columns={'vaso_amount': table_name})
    return dose.drop('vaso_rate', axis=1)


# one frame per drug; the variable names are used by the cells that follow
dobutamine_dose = _load_vasopressor_dose('dobutamine_dose')
dopamine_dose = _load_vasopressor_dose('dopamine_dose')
vasopressin_dose = _load_vasopressor_dose('vasopressin_dose')
phenylephrine_dose = _load_vasopressor_dose('phenylephrine_dose')
epinephrine_dose = _load_vasopressor_dose('epinephrine_dose')
norepinephrine_dose = _load_vasopressor_dose('norepinephrine_dose')





In [None]:
# Stack the six per-drug dose tables into one frame with a fresh 0..n-1 index;
# a column that exists in only some of the tables is NaN for rows from the others.
vasopressors = pd.concat([dobutamine_dose, dopamine_dose, vasopressin_dose, phenylephrine_dose, epinephrine_dose, norepinephrine_dose], ignore_index=True)
vasopressors

In [None]:
# Cast icustay_id to int (this raises if the column contains missing values).
vasopressors.icustay_id = vasopressors.icustay_id.astype(int)
vasopressors

In [None]:
# Attach each sepsis-3 stay's `intime` to its vasopressor rows. The left join
# keeps stays that have no vasopressor rows; their vasopressor columns are NaN.
temp = sepsis3[['icustay_id', 'intime']]
vasopressors_ts = pd.merge(temp, vasopressors, on='icustay_id', how='left')
vasopressors_ts

In [None]:
## keep 24 hours for vasopressors_ts
# Convert the columns to datetime objects.
# Both columns must be read from `vasopressors_ts` itself: `df` has a different
# row index, so assigning from it pairs timestamps with the wrong rows.
vasopressors_ts['intime'] = pd.to_datetime(vasopressors_ts['intime'])
vasopressors_ts['starttime'] = pd.to_datetime(vasopressors_ts['starttime'])

# Calculate the time difference in hours
vasopressors_ts['time_difference'] = (vasopressors_ts['starttime'] - vasopressors_ts['intime']).dt.total_seconds() / 3600

# Filter rows where the time difference is at most 24 hours
# (rows without a starttime compare as False and are dropped as well)
# NOTE(review): rows whose starttime is before intime (negative difference)
# are kept - confirm this is intended.
vasopressors_ts = vasopressors_ts[vasopressors_ts['time_difference'] <= 24]

# Drop the 'time_difference' column if not needed
vasopressors_ts = vasopressors_ts.drop(columns=['time_difference'])

vasopressors_ts.to_csv(os.path.join(data_path, 'vasopressors-ts-df-3.csv'),sep=',',index=False)


In [None]:
## vasopressor durations: read `vasopressor_durations`, keep sepsis-3 ICU stays, write CSV
query = query_schema + "select * from vasopressor_durations"
vasopressor_durations = pd.read_sql_query(query, con).sort_values(['icustay_id'])
vasopressor_durations = vasopressor_durations.loc[
    vasopressor_durations['icustay_id'].isin(sepsis3['icustay_id'])
]
vasopressor_durations.to_csv(
    os.path.join(data_path, 'vasopressor-durations-df-3.csv'), sep=',', index=False
)

In [None]:
## saving all patients that satisfy sepsis according to various criteria
# writes the whole of `df` (not only the sepsis-3 subset), without the index column
df.to_csv(os.path.join(data_path, 'sepsis-df-all-criteria.csv'),sep=',',index=False)

# Dataset with no exclusions

It may be useful for others to analyze the dataset without exclusions. Here we generate an identical copy of the data, except that it covers all `icustay_id` in MIMIC-III and keeps the individual exclusion flags as columns.

In [None]:
# final dataset for every ICU stay, with the individual exclusion flags attached
query = query_schema + """
select ie.subject_id
, s.*
, co.exclusion_secondarystay
, co.exclusion_nonadult
, co.exclusion_csurg
, co.exclusion_carevue
, co.exclusion_early_suspicion
, co.exclusion_late_suspicion
, co.exclusion_bad_data
from sepsis3 s
-- add in subject_id
inner join icustays ie
  on s.icustay_id = ie.icustay_id
inner join sepsis3_cohort co
  on s.icustay_id = co.icustay_id
order by s.icustay_id
"""

# note: this replaces the `df` loaded earlier
df = pd.read_sql_query(query, con)

# composite outcome: hospital_expire_flag equal to 1, or icu_los of at least 3
df['composite_outcome'] = (
    df['hospital_expire_flag'].eq(1) | df['icu_los'].ge(3)
).astype(int)

# display label for the suspicion definition
labels = OrderedDict([('suspicion_poe', 'BC + ABX (Prescribed)')])

# boolean indicators: True where the underlying time column is populated
df['blood culture'] = df['blood_culture_time'].notnull()
df['suspicion_poe'] = df['suspected_infection_time_poe_days'].notnull()
df['abx_poe'] = df['antibiotic_time_poe'].notnull()

# sepsis-3 here means: suspicion flag set and SOFA of at least 2
df['sepsis-3'] = (df['suspicion_poe'].eq(1) & df['sofa'].ge(2)).astype(int)
df['sofa>=2'] = df['sofa'].ge(2).astype(int)

df.to_csv(os.path.join(data_path, 'sepsis-df-no-exclusions.csv'), sep=',', index=False)

In [None]:
#con.close()