# 1 - Load data from SQL and parse it appropriately

This script loads the data from a MIMIC-III database and parses the data for concepts required for the GOSSIS project. The script outputs the `mimic-iii-gossis-data.csv` file for later use.

In [None]:
from __future__ import print_function

import psycopg2
import numpy as np
import pandas as pd
import os 

# cursors need to be rolled back if they fail
def execute_query_safely(sql, con):
    """Execute `sql` on `con`, rolling the transaction back if it fails.

    Parameters
    ----------
    sql : str
        SQL text handed to ``cursor.execute``.
    con : DB-API connection
        Must provide ``cursor()`` and ``rollback()``.

    Returns
    -------
    None

    Raises
    ------
    Whatever ``cursor.execute`` raises; it is re-raised after the rollback.
    """
    cur = con.cursor()

    try:
        cur.execute(sql)
    except BaseException:
        # Roll back on the connection rather than by running 'rollback;' on
        # the cursor that has just failed, then let the caller see the error.
        con.rollback()
        raise
    finally:
        # the cursor (not the connection) is always closed
        cur.close()

    return

import getpass

import matplotlib.pyplot as plt

%matplotlib inline

## Connect to database

In [None]:
# prompt user for username/password
host = 'localhost'
port = 5432
print('Connecting to {} on port {} ...'.format(host, port))
sqluser = getpass.getuser()
sqlpass = getpass.getpass(prompt='Username: {}\nPassword (blank for peer auth): '.format(sqluser))

if sqlpass == '':
    # try peer authentication (no host/port/password are passed)
    con = psycopg2.connect(dbname='mimic', user=sqluser)
else:
    con = psycopg2.connect(dbname='mimic', host=host, port=port, user=sqluser, password=sqlpass)

# server_version is a single integer (e.g. 90605 for 9.6.5). Split it with
# integer arithmetic so each part prints as an int on Python 2 and Python 3;
# plain `/` is true division on Python 3 and would print a fractional minor.
server_version = con.server_version
print('Connected to postgres {}.{}.{}!'.format(server_version // 10000,
                                              (server_version // 100) % 100,
                                              server_version % 100))

# default is to write to public and read from both public and mimiciii
query_schema = "set search_path to public,mimiciii;"

## Create initial cohort

In [None]:
# load the SQL that defines the base cohort
f = 'sql/cohort.sql'
with open(f, 'r') as sql_file:
    query = sql_file.read()

# run it with the search path statement prepended
print('Generating table using {} ...'.format(f), end=' ')
execute_query_safely(query_schema + query, con)
print('done.')

In [None]:
# pull the base cohort into a dataframe
query = query_schema + """select * from gossis_cohort"""
co = pd.read_sql_query(query, con)

# report, per exclusion flag, how many ICU stays it marks
n_stays = co.shape[0]
print('Cohort - initial size: {} ICU stays'.format(n_stays))
idxRem = np.zeros(n_stays)
exclusion_cols = [col for col in co.columns if col.startswith('exclusion_')]
for col in exclusion_cols:
    flags = co[col]
    print('  {:5g} ({:2.2f}%) - {}'.format(np.sum(flags), np.mean(flags)*100.0, col))
    # a stay is flagged for removal as soon as any exclusion column equals 1
    idxRem[flags.values == 1] = 1

print('  {:5g} ({:2.2f}%) - exclusion_missing_outcome'.format(0, 0))
n_removed = np.sum(idxRem)
kept_pct = (1-np.mean(idxRem))*100.0
print('Final cohort size: {} ICU stays ({:2.2f}%).'.format(n_stays - n_removed, kept_pct))

## Create necessary materialized views

In [None]:
# get a list of all SQL files in the subfolder
query_path = './sql/'
queries = [
    f for f in os.listdir(query_path)
    # only keep the filename if it is actually a file (and not a directory)
    if os.path.isfile(os.path.join(query_path, f))
    # and only keep the filename if it is an SQL file
    and f.endswith('.sql')
    # and we do *not* want the cohort - it's generated above
    and f != 'cohort.sql'
]

# run the scripts in reverse alphabetical order
queries = sorted(queries, reverse=True)

# make sure 'apsiii.sql' is the last query run by the loop below
if 'apsiii.sql' in queries:
    queries.remove('apsiii.sql')
    queries.append('apsiii.sql')

# make sure 'data.sql' is not run by the loop below
if 'data.sql' in queries:
    queries.remove('data.sql')

# execute each SQL file to generate the materialized views
for f in queries:
    print('Executing {} ...'.format(f), end=' ')

    with open(os.path.join(query_path, f)) as fp:
        query = fp.read()

    execute_query_safely(query_schema + query, con)

    print('done.')

In [None]:
# data.sql was held back from the loop over the other SQL files; run it now
f = 'data.sql'
print('Executing {} ...'.format(f), end=' ')

data_sql_path = os.path.join(query_path, f)
with open(data_sql_path) as sql_file:
    query = sql_file.read()

execute_query_safely(query_schema + query, con)

print('done.')

# 2 - Extract all covariates and outcome measures

We now aggregate all the data from the various views into a single dataframe.

In [None]:
# Build the query: every column of `gossis`, with the search path prepended
query = query_schema + """
--FINAL QUERY
select
  g.*
from gossis g
"""

# Run it and keep the result as a dataframe
df = pd.read_sql_query(query, con)
n_rows, n_cols = df.shape
# two columns are not counted as features (presumably identifiers - TODO confirm)
print('Loaded data for {} patients and {} features.'.format(n_rows, n_cols-2))

## Load in the header

In [None]:
# the header file has no header row; its first column lists the output column names
header_table = pd.read_csv('../hdr/header.csv', header=None, sep=',')
hdr = header_table[0].values

Map the data into a consistent header which is used for all databases. Warn if data is not found in the current dataset.

In [None]:
# rebuild the data column by column, in the order given by the header
df_new = pd.DataFrame()
for c in hdr:
    if c in df.columns:
        # variable is available in the extracted data: copy it across
        df_new[c] = df[c]
        continue

    # no column of this name was extracted: warn and leave the column empty
    print('WARNING: {} not found in MIMIC-III data!'.format(c))
    df_new[c] = None

# 3 - Output the data to a csv file

In [None]:
# write the remapped dataframe to disk, omitting the index
output_file = 'mimic-iii-gossis-data.csv'
df_new.to_csv(output_file, index=False)

In [None]:
# close the database connection opened earlier in the notebook
con.close()