Get the first laboratory measurements for patients admitted to the ICU. Plot the distribution of measurements for survival and non-survival groups.

In [None]:
# Import libraries
from __future__ import print_function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import yaml
import os
from scipy.signal import medfilt
import tableone

# used to print out pretty pandas dataframes
from IPython.display import display, HTML

%matplotlib inline
# Apply the ggplot style sheet to every figure drawn in this notebook
plt.style.use('ggplot')

# Default font family and size, registered through matplotlib's rc settings
font = dict(family='DejaVu Sans', size=20)
matplotlib.rc('font', **font)

## Functions

# Load in merged GOSSIS data

In [None]:
# Read the merged, gzip-compressed CSV; its first row supplies the column names
df = pd.read_csv(
    'gossis-data.csv.gz',
    compression='gzip',
    sep=",",
    header=0,
)

# Show how many rows each value of `data_source` contributes
df['data_source'].value_counts()

# Load in the header and the data type for each column

In [None]:
# First column of the header file (the file itself has no header row), as an array
hdr = pd.read_csv('hdr/header.csv',header=None,sep=',')[0].values

# load yaml definitions
# safe_load is used instead of yaml.load(stream): calling yaml.load without an
# explicit Loader raises a TypeError on PyYAML >= 6 and can construct arbitrary
# Python objects on older releases. A parse error is deliberately left to
# propagate - printing it and carrying on only postponed the failure to a
# NameError on `varlist` a few lines further down.
with open("hdr/variable-definitions.yaml", 'r') as stream:
    varlist = yaml.safe_load(stream)

# convert to dataframe: one row per top-level YAML key
# (the code below relies on each entry providing at least a 'category' field)
df_var = pd.DataFrame.from_dict(varlist, orient='index')
df_var['varname'] = df_var.index

# specify the order of the categories - data is output in this order
# (categories missing from this mapping get NaN and therefore sort last)
category_order = {'identifier': 1,
                  'demographic': 2,
                  'APACHE covariate': 3,
                  'vitals': 4,
                  'labs': 5,
                  'labs blood gas': 6,
                  'APACHE prediction': 10}
df_var['category_order'] = df_var['category'].map(category_order)

# sort df by the category, then by the variable name
# (reassignment rather than inplace=True, so the cell builds df_var in one pass)
df_var = df_var.sort_values(['category_order','varname'])

In [None]:
# Number of defined variables in each category
df_var['category'].value_counts()

In [None]:
# Number of defined variables of each declared data type
df_var['dataType'].value_counts()

# Data Processing

In [None]:
# Which sources contain ages above 150?
age_over_150 = df['age'] > 150
print(df.loc[age_over_150, 'data_source'].value_counts())

# For rows from the 'mimic' source only, overwrite ages above 150 with 91.4
# NOTE(review): 91.4 is presumably a stand-in for masked ages of very old patients - TODO confirm
is_mimic = df['data_source'] == 'mimic'
df.loc[age_over_150 & is_mimic, 'age'] = 91.4

# After the correction, list the sources that still report ages above 100
age_over_100 = df['age'] > 100
print(df.loc[age_over_100, 'data_source'].value_counts())

In [None]:
# Count rows for every (ICU admit source, data source) pair;
# reset_index() turns the admit source from the index into an ordinary column
pd.crosstab(df['icu_admit_source'],df['data_source']).reset_index()

# Comparisons

For comparisons, we are interested in assessing each variable across the databases.

For numeric/integer variables:

* Compare the mean, median, standard deviation, 5th and 95th percentiles
* Plot the distribution using histograms

For string/binary (categorical) variables:

* Compare the categories to see if they overlap
* Compare the frequency of each category (cross-tab)

## Compare APACHE-III score in eicu and anzics patients

In [None]:
# Overlay the APACHE III-j score distributions of the eicu and anzics rows.
# 101 edges from 0 to 200 -> 100 equal-width bins, each 2 score points wide
bins = np.linspace(0, 200, 101)
plt.figure(figsize=[16,10])
# density=True replaces the `normed` keyword, which was deprecated in
# matplotlib 2.1 and removed in 3.1. Each histogram is scaled to unit area so
# that sources with different numbers of rows can be compared directly.
plt.hist(df.loc[df['data_source']=='eicu','apache_3j_score'].dropna().values, bins,
         density=True, alpha=0.5, label='eicu', color='blue')
plt.hist(df.loc[df['data_source']=='anzics','apache_3j_score'].dropna().values, bins,
         density=True, alpha=0.5, label='anzics', color='black')
plt.legend(loc='upper right')
plt.xlabel('APACHE III-j score')
# NOTE(review): with density scaling the bar heights are proportions per score
# point (bins are 2 wide), not raw proportions - consider relabelling the axis.
plt.ylabel('Proportion of patients')
plt.show()

As we can see, the distributions are very similar, though ANZICS seems to have lower acuity.

In [None]:
# Overlay the APACHE III-j score distributions of eicu, anzics and mimic,
# using matplotlib's default colour cycle.
# 101 edges from 0 to 200 -> 100 equal-width bins, each 2 score points wide
bins = np.linspace(0, 200, 101)
plt.figure(figsize=[16,10])
plt.rcParams.update({'font.size': 20})
# One histogram per source, drawn in this order so colours and legend entries
# match the three separate calls this loop replaces.
for source in ['eicu', 'anzics', 'mimic']:
    scores = df.loc[df['data_source']==source,'apache_3j_score'].dropna().values
    # density=True replaces the `normed` keyword, which was deprecated in
    # matplotlib 2.1 and removed in 3.1; each histogram is scaled to unit area.
    plt.hist(scores, bins, density=True, alpha=0.5, label=source)
plt.legend(loc='upper right')
plt.show()

Note that we don't have any data for MIMIC for the APACHE-III score.

In [None]:
# Boolean mask over the rows of df: True where data_source is 'eicu'
idxE = df['data_source']=='eicu'

In [None]:
# Count rows for every (hospital admit source, data source) pair
pd.crosstab(df['hospital_admit_source'],df['data_source'])

# Summarize dataset

In [None]:
# Columns summarised in the table; every one of them is treated as categorical
categorical = [
    'country',
    'elective_surgery',
    'ethnicity',
    'gender',
    'hospital_death',
    'icu_death',
    'pregnant',
    'smoking_status',
    'teaching_hospital',
    'arf_apache',
    'gcs_eyes_apache',
    'gcs_motor_apache',
    'gcs_unable_apache',
    'gcs_verbal_apache',
    'intubated_apache',
    'ventilated_apache',
]

# Summary table grouped by source database
t1 = tableone.TableOne(
    df,
    columns=categorical,
    categorical=categorical,
    groupby='data_source',
)
t1

## Cross-tabulate all categorical data

In [None]:
# Variables whose declared type is not numeric and which are not identifiers
non_numeric = df_var['dataType'] != 'numeric'
non_identifier = df_var['category'] != 'identifier'
fac_list = df_var.loc[non_numeric & non_identifier, 'varname']

# Variables left out of the cross-tabulation
excluded_vars = ['hospital_death', 'bmi', 'apache_3j_score', 'icu_id',
                 'hospital_bed_size_numeric', 'icu_admit_type', 'apsiii']

for varname in fac_list:
    if varname in excluded_vars:
        continue
    # counts of each level per data source, plus 'All' margins
    counts = pd.crosstab(df[varname], df['data_source'], margins=True)
    # divide every cell by its column total (the 'All' row) to get proportions
    column_totals = counts.xs('All', axis=0)
    proportions = counts.div(column_totals)
    # blank out NaN cells in the rendered table
    rendered = proportions.to_html().replace('NaN', '')
    display(HTML(rendered))

# Completion table

In [None]:
# Completion table: for every variable, the percentage of non-missing values in
# each source database, shown as one HTML table per variable category.
index = list(df_var['category'].astype('category').cat.categories)
header=['anzics','eicu','mimic','nicst','satiq']
# variables that are never reported in the completion tables
skip_columns = ['data_source','hospital_bed_size_numeric','icu_admit_type']

for i in index:
    display(HTML('<h2>'+i+'</h2>'))
    category_vars = [c for c in df_var.loc[df_var['category']==i,'varname']
                     if c not in skip_columns]

    # Variables defined in the YAML but absent from the data are reported and
    # skipped. Previously the resulting KeyError was swallowed by a bare
    # `except`, which silently dropped the table for the whole category.
    absent = [c for c in category_vars if c not in df.columns]
    if absent:
        print('Not present in the data, skipped: ' + ', '.join(str(c) for c in absent))
    columns = [c for c in category_vars if c in df.columns]

    comp_dict = {}
    for dataname in header:
        rows = df.loc[df['data_source']==dataname, columns]
        N = len(rows)
        if N>0:
            comp_dict[dataname] = rows.count()*100.0/N
        else:
            # A source without any rows still gets a (blank) column. Skipping
            # it made the per-variable lists shorter than `header`, so building
            # the table raised a ValueError for every category.
            comp_dict[dataname] = pd.Series(np.nan, index=columns)

    # rows are variables, columns are sources
    tb = pd.DataFrame(comp_dict, columns=header)
    # show 0% completion as an empty cell
    tb = tb.replace(to_replace=float(0), value='')
    display(HTML(tb.to_html(na_rep='')))
           

In [None]:
# Completion table restricted to variables that have data in anzics: for every
# such variable, the percentage of non-missing values in each source database,
# shown as one HTML table per variable category.
index = list(df_var['category'].astype('category').cat.categories)
header=['anzics','eicu','mimic','nicst','satiq']
# variables that are never reported in the completion tables
skip_columns = ['data_source','hospital_bed_size_numeric','icu_admit_type']

for i in index:
    display(HTML('<h2>'+i+'</h2>'))
    category_vars = [c for c in df_var.loc[df_var['category']==i,'varname']
                     if c not in skip_columns]

    # Variables defined in the YAML but absent from the data are reported and
    # skipped. Previously the resulting KeyError was swallowed by a bare
    # `except`, which silently dropped the table for the whole category.
    absent = [c for c in category_vars if c not in df.columns]
    if absent:
        print('Not present in the data, skipped: ' + ', '.join(str(c) for c in absent))
    columns = [c for c in category_vars if c in df.columns]

    comp_dict = {}
    for dataname in header:
        rows = df.loc[df['data_source']==dataname, columns]
        N = len(rows)
        if N>0:
            comp_dict[dataname] = rows.count()*100.0/N
        else:
            # A source without any rows still gets a (blank) column. Skipping
            # it made the per-variable lists shorter than `header`, so building
            # the table raised a ValueError for every category.
            comp_dict[dataname] = pd.Series(np.nan, index=columns)

    # rows are variables, columns are sources
    tb = pd.DataFrame(comp_dict, columns=header)
    # only display data if it is present in anzics
    # (filtered on the numeric values, before zeros are blanked out below)
    tb = tb.loc[ tb['anzics']>0, : ]
    # show 0% completion as an empty cell
    tb = tb.replace(to_replace=float(0), value='')
    display(HTML(tb.to_html(na_rep='')))
           

In [None]:
# Number of rows for each hospital type
df['hospital_type'].value_counts()

In [None]:
# Completion table: for every variable, the percentage of non-missing values in
# each source database. One HTML table is displayed per variable category and
# the same table is written to '<category>-completion.csv'.
index = list(df_var['category'].astype('category').cat.categories)
header=['anzics','eicu','mimic','orchestra']
# variables that are never reported in the completion tables
skip_columns = ['data_source','hospital_bed_size_numeric','icu_admit_type']

for i in index:
    display(HTML('<h2>'+i+'</h2>'))
    category_vars = [c for c in df_var.loc[df_var['category']==i,'varname']
                     if c not in skip_columns]

    # Variables defined in the YAML but absent from the data are reported and
    # skipped instead of aborting the whole cell with a KeyError.
    absent = [c for c in category_vars if c not in df.columns]
    if absent:
        print('Not present in the data, skipped: ' + ', '.join(str(c) for c in absent))
    columns = [c for c in category_vars if c in df.columns]

    comp_dict = {}
    for dataname in header:
        rows = df.loc[df['data_source']==dataname, columns]
        N = len(rows)
        if N>0:
            comp_dict[dataname] = rows.count()*100.0/N
        else:
            # A source without any rows still gets a (blank) column. Skipping
            # it made the per-variable lists shorter than `header`, so building
            # the table raised a ValueError.
            comp_dict[dataname] = pd.Series(np.nan, index=columns)

    # rows are variables, columns are sources
    tb = pd.DataFrame(comp_dict, columns=header)
    # show 0% completion as an empty cell
    tb = tb.replace(to_replace=float(0), value='')
    display(HTML(tb.to_html(na_rep='')))
    tb.to_csv(i+'-completion.csv',index=True)

# Ventilated vs Not

# Compare Hospital Death vs No Hospital Death

In [None]:
# Variables whose declared type is not numeric and which are not identifiers
non_numeric = df_var['dataType'] != 'numeric'
non_identifier = df_var['category'] != 'identifier'
fac_list = df_var.loc[non_numeric & non_identifier, 'varname']

# Variables left out of the comparison
excluded_vars = ['hospital_death', 'bmi', 'apache_3j_score', 'icu_id',
                 'hospital_bed_size_numeric', 'icu_admit_type', 'apsiii']

for varname in fac_list:
    if varname in excluded_vars:
        continue
    # counts of each level for every (data source, hospital death) pair, plus 'All' margins
    outcome_groups = [df['data_source'], df['hospital_death']]
    counts = pd.crosstab(df[varname], outcome_groups, margins=True)
    # express every cell as a percentage of its column total (the 'All' row)
    column_totals = counts.xs('All', axis=0)
    percentages = (counts * 100.0).div(column_totals)
    display(HTML(percentages.to_html()))

# Day 1 vs Hour 1

# About Databases (Demographics)

In [None]:
# Columns summarised as categorical variables
categorical = [
    'elective_surgery',
    'gender',
    'hospital_death',
    'icu_death',
    'pregnant',
    'readmission_status',
    'smoking_status',
    'teaching_hospital',
]
# Columns summarised as continuous variables
continuous = [
    'age',
    'height',
    'hospital_los_days',
    'icu_los_days',
    'pre_icu_los_days',
    'weight',
]

# Summary table grouped by source database, without p-values
t1 = tableone.TableOne(
    df,
    columns=categorical + continuous,
    categorical=categorical,
    groupby='data_source',
    pval=False,
)
t1