# NOTEBOOK MIMIC DATA

In [None]:
################################################################################################################
################################################################################################################
### import libraries
from __future__ import print_function
from collections import OrderedDict
from functools import reduce
from datetime import datetime
import os
import platform
import copy
import sys
import pyodbc
import pymssql
import pandas as pd
import numpy as np
import functools 
import psycopg2
import socket
import sys
import getpass
import time
################################################################################################################
################################################################################################################
# Put '../src' (relative to the working directory) first on the module
# search path so that modules stored there can be imported below.
sys.path.insert(0, '../src')

# Pandas display settings: no limit on displayed columns/rows and up to
# 1000 characters per cell.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = 1000

### Configuration file to determine root directory
import conf

### check for GPU's
# torch is not imported at the top of this notebook, so it is imported here.
# When torch is not installed the notebook simply runs without a GPU.
try:
    import torch
except ImportError:
    use_gpu = False
else:
    use_gpu = torch.cuda.is_available()

### Check everything
conf.print_python_environment()

## Define MIMIC data directories

In [None]:
# Working directory for this study: the 'SEPSIS' folder below the root
# directory taken from the configuration module.
main_path = os.path.join(conf.ROOT_DIR, 'SEPSIS')

# Define the subfolder names. MIMIC_data_path (CSV output) and query_path
# (SQL files) are joined onto main_path by later cells.
data_path = 'data'
MIMIC_data_path = 'MIMIC_data'
query_path = 'MIMIC_sql'
source_path = 'MIMIC_src'

## Connect to MIMIC database

In [None]:
############################################################################
# SQL connection to the MIMIC database using psycopg2.

# Connection parameters for the local postgres version of MIMIC
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

# Statement that is prepended to every query so that unqualified table names
# are looked up in `public` first and then in the MIMIC schema.
query_schema = 'SET search_path to public,{};'.format(schema_name)

# Open the connection that all following cells share
con = psycopg2.connect(dbname=dbname, user=sqluser, password="postgres")

# Helper for statements where no result table is expected, e.g. creating views.
def execute_query_safely(sql, con):
    """Execute ``sql`` on ``con`` and roll the transaction back on failure.

    Parameters
    ----------
    sql : str
        SQL text handed to ``cursor.execute``.
    con : DB-API connection
        Open connection; it must provide ``cursor()`` and ``rollback()``.

    Returns
    -------
    None

    Raises
    ------
    Whatever ``cursor.execute`` raises. The exception is re-raised unchanged
    after the rollback.
    """
    cur = con.cursor()

    try:
        cur.execute(sql)
    except BaseException:
        # Deliberately broad (and always re-raised): an interrupted
        # long-running query must be rolled back as well. The rollback goes
        # through the connection instead of re-using the cursor that has just
        # failed, so the original error is not masked by a second failure.
        con.rollback()
        raise
    finally:
        # Release the cursor in every case; the connection stays open.
        cur.close()

    return

print('MIMIC - Using username {}'.format(sqluser))

# con.server_version is a single integer (e.g. 90605). It is split into three
# fields with integer arithmetic; true division would print the middle field
# as a float under Python 3.
print('Connected to postgres {}.{}.{}!'.format(con.server_version // 10000,
                                              con.server_version // 100 % 100,
                                              con.server_version % 100))

############################################################################
# Setting for Pandas: display up to 150 columns
pd.set_option("display.max_columns",150)

# SQL working
print("SQL connections established!\n")

## Execute make-tables to create the SEPSIS COHORT from the MIMIC Github code repository


In [None]:
# Check whether the `sepsis3` table exists ... if not we must generate it
query = """
SELECT EXISTS(SELECT 1 FROM information_schema.tables 
              WHERE table_catalog = '{}'
              AND table_schema in ('public','{}')
              AND table_name = 'sepsis3');
""".format(dbname, schema_name)
tbl_exists = pd.read_sql_query(query, con)
tbl_exists = tbl_exists.loc[0,'exists']
if tbl_exists:
    print('Found the `sepsis3` table. Skipping generation of data in SQL.')
else:
    print('Running SQL code to generate tables. This may take some time.')

    # "make-tables.sql" and the SQL files it references are read from the
    # 'query' subfolder of main_path
    query_path = 'query'

    with open(os.path.join(main_path, query_path, 'make-tables.sql'), 'r',encoding='latin-1') as fp:
        for line in fp:
            # Only lines starting with a backslash followed by 'i' reference
            # an SQL file to run. Every other line (including empty ones) is
            # just echoed. The backslash is escaped explicitly because a bare
            # '\i' is an invalid escape sequence in a Python string literal.
            if not line.startswith('\\i'):
                print(line,end='')
                continue

            # Everything after the 3-character prefix is the file name;
            # strip() also removes '\r' and stray spaces, not only '\n'.
            query_file = os.path.join(main_path, query_path, line[3:].strip())
            print('Running {} ...'.format(query_file), end=' ')
            with open(query_file, 'r') as fp_query:
                query = fp_query.read()
            execute_query_safely(query_schema + query, con)
            print('done.')
    # Commit once all referenced files have been executed
    execute_query_safely(query_schema + 'COMMIT;',con)
    print('extra commit executed')

In [None]:
# The sepsis3_cohort table holds the exclusion criteria:
#   - less than 16 years old
#   - never have any chartevents data (i.e. likely administrative error)
#   - not cardiac surgery
#   - suspected of infection
#   - first ICU stay
#   - not a CareVue patient (i.e. admitted 2008-2012)
# Load the complete table into a dataframe and show the first rows.
query = '{}select * from sepsis3_cohort'.format(query_schema)
co = pd.read_sql_query(query, con)
co.head()

# Final SEPSIS dataset from Postgres with exclusions
### Add in useful variables
We have: ICU intime/outtime, suspected infection time, whether the microbiology culture was positive, some demographics, comorbidities, outcomes, and the severity scores. 

The severity scores are extracted at a [0, 24] hour window centered around ICU admission - except labs have an extended [-6, 24] hour window (i.e. 'sofa' is extracted in this way).
### Save the data to file
The dataframes will be loaded directly from a file, rather than the database.

In [None]:
# Load the final dataset; the `excluded = 0` filter applies the exclusion criteria
query = query_schema + "select * from sepsis3 where excluded = 0"
df = pd.read_sql_query(query, con)

# Composite outcome: 1 when hospital_expire_flag equals 1 or icu_los is at
# least 3, otherwise 0
df['composite_outcome'] = (
    df['hospital_expire_flag'].eq(1) | df['icu_los'].ge(3)
).astype(int)
labels = OrderedDict([('suspicion_poe', 'BC + ABX (Prescribed)')])

# Boolean flags: True where the corresponding source column has a value
df['blood culture'] = df['blood_culture_time'].notnull()
df['suspicion_poe'] = df['suspected_infection_time_poe_days'].notnull()
df['abx_poe'] = df['antibiotic_time_poe'].notnull()

# Integer flags derived from the suspicion flag and the sofa score
df['sepsis-3'] = (df['suspicion_poe'].eq(1) & df['sofa'].ge(2)).astype(int)
df['sofa>=2'] = df['sofa'].ge(2).astype(int)

# Store the result without the row index
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'sepsis3-df.csv'),
    sep=',',
    index=False,
)

# Dataset with no exclusions

For completeness' sake, we generate the same dataset again, but this time for all `icustay_id` in MIMIC-III, i.e. without applying the exclusions.

In [None]:
# Load the final dataset for every row of sepsis3 and attach the individual
# exclusion flags from sepsis3_cohort
query = query_schema + """
select ie.subject_id
, s.*
, co.exclusion_secondarystay
, co.exclusion_nonadult
, co.exclusion_csurg
, co.exclusion_carevue
, co.exclusion_early_suspicion
, co.exclusion_late_suspicion
, co.exclusion_bad_data
from sepsis3 s
-- add in subject_id
inner join icustays ie
  on s.icustay_id = ie.icustay_id
inner join sepsis3_cohort co
  on s.icustay_id = co.icustay_id
order by s.icustay_id
"""
df = pd.read_sql_query(query, con)

# Composite outcome: 1 when hospital_expire_flag equals 1 or icu_los is at
# least 3, otherwise 0
df['composite_outcome'] = (
    df['hospital_expire_flag'].eq(1) | df['icu_los'].ge(3)
).astype(int)
labels = OrderedDict([('suspicion_poe', 'BC + ABX (Prescribed)')])

# Boolean flags: True where the corresponding source column has a value
df['blood culture'] = df['blood_culture_time'].notnull()
df['suspicion_poe'] = df['suspected_infection_time_poe_days'].notnull()
df['abx_poe'] = df['antibiotic_time_poe'].notnull()

# Integer flags derived from the suspicion flag and the sofa score
df['sepsis-3'] = (df['suspicion_poe'].eq(1) & df['sofa'].ge(2)).astype(int)
df['sofa>=2'] = df['sofa'].ge(2).astype(int)

# Store the result without the row index
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'sepsis3-df-no-exclusions.csv'),
    sep=',',
    index=False,
)

# Create RL Views based on SEPSIS3 Cohort
Start by checking whether the sepsis3 table exists (it should have been created as a materialized view above). If the sepsis3 table exists, create the materialized views for the RL dataset.

In [None]:
# Check whether the `sepsis3` table exists ... if not it was not generated;
# start again from the top of this notebook
query = """
SELECT EXISTS(SELECT 1 FROM information_schema.tables 
              WHERE table_catalog = '{}'
              AND table_schema in ('public','{}')
              AND table_name = 'sepsis3');
""".format(dbname, schema_name)
tbl_exists = pd.read_sql_query(query, con)
tbl_exists = tbl_exists.loc[0,'exists']
if tbl_exists:
    print('Found the `sepsis3` table. Will now create the RL_dataset.\n')
    # "RL_views.sql" and the SQL files it references are read from the
    # 'MIMIC_sql' subfolder of main_path
    query_path = 'MIMIC_sql'

    with open(os.path.join(main_path, query_path, 'RL_views.sql'), 'r') as fp:
        for line in fp:
            # Only lines starting with a backslash followed by 'i' reference
            # an SQL file to run. Every other line (including empty ones) is
            # just echoed. The backslash is escaped explicitly because a bare
            # '\i' is an invalid escape sequence in a Python string literal.
            if not line.startswith('\\i'):
                print(line,end='')
                continue

            # Everything after the 3-character prefix is the file name;
            # strip() also removes '\r' and stray spaces, not only '\n'.
            query_file = os.path.join(main_path, query_path, line[3:].strip())
            print('Running {} ...'.format(query_file), end=' ')
            with open(query_file, 'r') as fp_query:
                query = fp_query.read()
            execute_query_safely(query_schema + query, con)
            print('done.')
    # Commit once all referenced files have been executed
    execute_query_safely(query_schema + 'COMMIT;',con)
    print('extra commit executed')
else:
    print('sepsis3 table not found, cannot continue to create the RL cohort.')

### END OF RL VIEWS

### END OF RL VIEWS

# Create RL Sepsis cohort
    If the sepsis3 table exists, we start querying the database with these newly created materialized views and we save the output to csv

### Get Cohort

In [None]:
# Run get_cohort.sql and store the result as cohort.csv
query_file = 'get_cohort.sql'
with open(os.path.join(main_path, query_path, query_file), 'r') as fp_query:
    query = fp_query.read()
df = pd.read_sql_query(query_schema + query, con)
# index=True: the dataframe's row index is written as the first CSV column
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'cohort.csv'),
    sep=',',
    index=True,
)
print('get_cohort done.')
df.head()

## Get MIMIC vasopressor doses
    --  /* SOURCE:  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3590882/
    --    Vasopressor Norepinephrine equivalent dose =
    --	  Norepinephrine    1
    --	  Epinephrine       1
    --	  Dopamine          0.01
    --	  Vasopressin       5*
    --	  Phenylephrine     0.45
    --  * Approximate conversion of vasopressin dose in units/min to equivalent norepinephrine dose in mcg/kg/min, normalized to 100kg body weight
    --   ALSO: https://www.ncbi.nlm.nih.gov/pubmed/22407285
    -- */

In [None]:
# Run get_vassopressor_cv.sql and store the result as vassopressors_cv_cohort.csv
query_file = 'get_vassopressor_cv.sql'
with open(os.path.join(main_path, query_path, query_file), 'r') as fp_query:
    query = fp_query.read()
df = pd.read_sql_query(query_schema + query, con)
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'vassopressors_cv_cohort.csv'),
    sep=',',
    index=False,
)
print('vassopressors_cv_cohort done.')
df.head()

In [None]:
# Run get_vassopressor_mv.sql and store the result as vassopressors_mv_cohort.csv
query_file = 'get_vassopressor_mv.sql'
with open(os.path.join(main_path, query_path, query_file), 'r') as fp_query:
    query = fp_query.read()
df = pd.read_sql_query(query_schema + query, con)
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'vassopressors_mv_cohort.csv'),
    sep=',',
    index=False,
)
print('vassopressors_mv_cohort done.')
df.head()

## Get IV Fluid input

In [None]:
# Run get_inputevents_cv.sql and store the result as inputevents_cv_cohort.csv
query_file = 'get_inputevents_cv.sql'
with open(os.path.join(main_path, query_path, query_file), 'r') as fp_query:
    query = fp_query.read()
df = pd.read_sql_query(query_schema + query, con)
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'inputevents_cv_cohort.csv'),
    sep=',',
    index=False,
)
print('inputevents_cv_cohort done.')
df.head()

In [None]:
# Run get_inputevents_mv.sql and store the result as inputevents_mv_cohort.csv
query_file = 'get_inputevents_mv.sql'
with open(os.path.join(main_path, query_path, query_file), 'r') as fp_query:
    query = fp_query.read()
df = pd.read_sql_query(query_schema + query, con)
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'inputevents_mv_cohort.csv'),
    sep=',',
    index=False,
)
print('inputevents_mv_cohort done.')
df.head()

## Get Labs

In [None]:
# Run get_labs_cohort.sql and store the result as labs_cohort.csv
query_file = 'get_labs_cohort.sql'
with open(os.path.join(main_path, query_path, query_file), 'r') as fp_query:
    query = fp_query.read()
df = pd.read_sql_query(query_schema + query, con)
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'labs_cohort.csv'),
    sep=',',
    index=False,
)
print('labs_cohort done.')
df.head()

## Get Vitals

In [None]:
# Run get_vitals_cohort.sql and store the result as vitals_cohort.csv
query_file = 'get_vitals_cohort.sql'
with open(os.path.join(main_path, query_path, query_file), 'r') as fp_query:
    query = fp_query.read()
df = pd.read_sql_query(query_schema + query, con)
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'vitals_cohort.csv'),
    sep=',',
    index=False,
)
print('vitals_cohort done.')
df.head()

## Get Demographics

In [None]:
# Run get_demographics_cohort.sql and store the result as demographics_cohort.csv
query_file = 'get_demographics_cohort.sql'
with open(os.path.join(main_path, query_path, query_file), 'r') as fp_query:
    query = fp_query.read()
df = pd.read_sql_query(query_schema + query, con)
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'demographics_cohort.csv'),
    sep=',',
    index=False,
)
print('demographics_cohort done.')
# The first rows are printed as plain text here
print(df.head())

## Get Urine Output

In [None]:
# Run get_urineoutput_cohort.sql and store the result as Urineoutput_cohort.csv
query_file = 'get_urineoutput_cohort.sql'
with open(os.path.join(main_path, query_path, query_file), 'r') as fp_query:
    query = fp_query.read()
df = pd.read_sql_query(query_schema + query, con)
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'Urineoutput_cohort.csv'),
    sep=',',
    index=False,
)
print('UrineOutput_cohort done.')
df.head()

## Get FIO2 values

In [None]:
# Run get_FiO2_cohort.sql and store the result as FiO2_cohort.csv
query_file = 'get_FiO2_cohort.sql'
with open(os.path.join(main_path, query_path, query_file), 'r') as fp_query:
    query = fp_query.read()
df = pd.read_sql_query(query_schema + query, con)
df.to_csv(
    os.path.join(main_path, MIMIC_data_path, 'FiO2_cohort.csv'),
    sep=',',
    index=False,
)
print('FiO2_cohort done.')
df.head()

# End of get_MIMIC_data notebook
Safely close the connection to the database.

In [None]:
# Close the database connection that was opened at the top of the notebook
con.close()
print('connection closed')