# Laboratory values vs outcome

Get the first laboratory measurements for patients admitted to the ICU. Plot the distribution of measurements for survival and non-survival groups.

In [None]:
# Import libraries
from __future__ import print_function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import yaml
import os

# used to print out pretty pandas dataframes
from IPython.display import display, HTML

%matplotlib inline
plt.style.use('ggplot')

font = {'family' : 'normal',
        'size'   : 20}

matplotlib.rc('font', **font)

# Load in merged GOSSIS data

In [None]:
# Read the merged GOSSIS table; row 0 of the CSV supplies the column names.
df = pd.read_csv('gossis-data.csv', sep=",", header=0)

# Tally the rows contributed by each value of `data_source`.
df['data_source'].value_counts()

# Load in the header and the data type for each column

In [None]:
# First column of hdr/header.csv as an array
# (presumably the column names of the merged data set - TODO confirm).
hdr = pd.read_csv('hdr/header.csv',header=None,sep=',')[0].values

# load yaml definitions
# yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and
# raises a TypeError from PyYAML 6; safe_load() parses plain YAML and never
# constructs arbitrary Python objects.
# A YAMLError is deliberately left to propagate: printing it and carrying on
# would leave `varlist` undefined and fail on the next statement with an
# unrelated NameError.
with open("hdr/variable-definitions.yaml", 'r') as stream:
    varlist = yaml.safe_load(stream)

# convert to dataframe: one row per top-level key of the YAML file
df_var = pd.DataFrame.from_dict(varlist, orient='index')
df_var['varname'] = df_var.index

# specify the order of the categories - data is output in this order
# (categories missing from this mapping map to NaN and therefore sort last)
category_order = {'identifier': 1,
                  'demographic': 2,
                  'APACHE covariate': 3,
                  'vitals': 4,
                  'labs': 5,
                  'labs blood gas': 6,
                  'APACHE prediction': 10}
df_var['category_order'] = df_var['category'].map(category_order)

# sort df by the category, then by the variable name
# (reassignment instead of inplace=True keeps the cell free of hidden mutation)
df_var = df_var.sort_values(['category_order','varname'])

In [None]:
# Number of variables listed under each category of the definitions table.
df_var.loc[:, 'category'].value_counts()

In [None]:
# Number of variables declared with each dataType in the definitions table.
df_var.loc[:, 'dataType'].value_counts()

# Comparisons

For comparisons, we are interested in assessing each variable across the databases.

For numeric/integer variables:

* Compare the mean, median, standard deviation, 5th and 95th percentiles
* Plot the distribution using histograms

For string/binary (categorical) variables:

* Compare the categories to see if they overlap
* Compare the frequency of each category (cross-tab)

## Compare categorical variables

In [None]:
# Categorical (factor) variables that are cross-tabulated against the source database.
fac_list = [
    'elective_surgery', 'gender', 'hospital_death', 'icu_death', 'pregnant',
    'readmission_status', 'smoking_status',
    'gcs_unable_apache', 'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_verbal_apache',
    'intubated_apache', 'ventilated_apache',
    'country', 'ethnicity', 'hospital_admit_source', 'hospital_disch_location',
    'icu_admit_source', 'icu_admit_type', 'icu_disch_location', 'icu_stay_type',
    'icu_type', 'hospital_bed_size',
]

# note: bed size is a factor on eicu, but not on the other sources

for fac in fac_list:
    # counts of every level per data source, including 'All' row/column totals
    tbl = pd.crosstab(df[fac], df['data_source'], margins=True)
    counts_html = tbl.to_html().replace('NaN', '')
    display(HTML(counts_html))

    # scale each column by its 'All' total to obtain within-source fractions
    column_totals = tbl.loc['All']
    tbl_perc = tbl.div(column_totals, axis='columns')
    display(HTML(tbl_perc.to_html()))

## Compare APACHE-III score in eicu and anzics patients

In [None]:
# Overlay the APACHE III-j score distributions of the eicu and anzics rows.
bins = np.linspace(0, 200, 101)  # 100 equal-width bins spanning 0-200
plt.figure(figsize=[16,10])
for source in ['eicu', 'anzics']:
    scores = df.loc[df['data_source']==source,'apache_3j_score'].dropna().values
    # `density=True` replaces the `normed=True` keyword, which was deprecated
    # in matplotlib 2.1 and removed in 3.1 (passing it now raises an error).
    plt.hist(scores, bins,
             density=True, alpha=0.5, label=source)
plt.legend(loc='upper right')
plt.xlabel('APACHE III-j score')
plt.ylabel('Proportion of patients')
plt.show()

As we can see the distributions are very similar, though ANZICS seems to have lower acuity.

In [None]:
# Same APACHE III-j histogram, this time including the mimic rows as well.
bins = np.linspace(0, 200, 101)  # 100 equal-width bins spanning 0-200
plt.figure(figsize=[16,10])
plt.rcParams.update({'font.size': 20})
for source in ['eicu', 'anzics', 'mimic']:
    scores = df.loc[df['data_source']==source,'apache_3j_score'].dropna().values
    # `density=True` replaces the `normed=True` keyword, which was deprecated
    # in matplotlib 2.1 and removed in 3.1 (passing it now raises an error).
    plt.hist(scores, bins,
             density=True, alpha=0.5, label=source)
plt.legend(loc='upper right')
plt.show()

Note that we don't have any data for MIMIC for the APACHE-III score.

In [None]:
from scipy.signal import medfilt

#hospital_los_days has '#REF!' under orchestra
cont = ['age','bmi','height','icu_los_days','pre_icu_los_days','weight',
       'albumin_apache','arf_apache','bilirubin_apache','bun_apache','creatinine_apache','fio2_apache','glucose_apache',
       'heart_rate_apache','hematocrit_apache','map_apache','paco2_apache','paco2_for_ph_apache','pao2_apache', 'ph_apache',
       'resprate_apache','sodium_apache','temp_apache','urineoutput_apache','wbc_apache','d1_diasbp_invasive_max',
       'd1_diasbp_invasive_min','d1_diasbp_max','d1_diasbp_min','d1_diasbp_noninvasive_max','d1_diasbp_noninvasive_min',
       'd1_heartrate_max','d1_heartrate_min','d1_mbp_invasive_max','d1_mbp_invasive_min','d1_mbp_max','d1_mbp_min',
       'd1_mbp_noninvasive_max','d1_mbp_noninvasive_min','d1_resprate_max','d1_resprate_min','d1_spo2_max','d1_spo2_min',
       'd1_sysbp_invasive_max','d1_sysbp_invasive_min','d1_sysbp_max','d1_sysbp_min','d1_sysbp_noninvasive_max',
       'd1_sysbp_noninvasive_min','d1_temp_max','d1_temp_min','h1_diasbp_invasive_max','h1_diasbp_invasive_min',
       'h1_diasbp_max','h1_diasbp_min','h1_diasbp_noninvasive_max','h1_diasbp_noninvasive_min','h1_heartrate_max',
       'h1_heartrate_min','h1_mbp_invasive_max','h1_mbp_invasive_min','h1_mbp_max','h1_mbp_min','h1_mbp_noninvasive_max',
       'h1_mbp_noninvasive_min','h1_resprate_max','h1_resprate_min','h1_spo2_max','h1_spo2_min','h1_sysbp_invasive_max',
       'h1_sysbp_invasive_min','h1_sysbp_max','h1_sysbp_min','h1_sysbp_noninvasive_max','h1_sysbp_noninvasive_min',
       'h1_temp_max','h1_temp_min','d1_albumin_max','d1_albumin_min','d1_bilirubin_max','d1_bilirubin_min','d1_bun_max',
       'd1_bun_min','d1_calcium_max','d1_calcium_min','d1_creatinine_max','d1_creatinine_min','d1_glucose_max','d1_glucose_min',
       'd1_hco3_max','d1_hco3_min','d1_hemaglobin_max','d1_hemaglobin_min','d1_hematocrit_max','d1_hematocrit_min',
       'd1_inr_max','d1_inr_min','d1_lactate_max','d1_lactate_min','d1_platelets_max','d1_platelets_min','d1_potassium_max',
       'd1_potassium_min','d1_sodium_max','d1_sodium_min','d1_wbc_max','d1_wbc_min','h1_albumin_max','h1_albumin_min',
       'h1_bilirubin_max','h1_bilirubin_min','h1_bun_max','h1_bun_min','h1_calcium_max','h1_calcium_min','h1_creatinine_max',
       'h1_creatinine_min','h1_glucose_max','h1_glucose_min','h1_hco3_max','h1_hco3_min','h1_hemaglobin_max','h1_hemaglobin_min',
       'h1_hematocrit_max','h1_hematocrit_min','h1_inr_max','h1_inr_min','h1_lactate_max','h1_lactate_min','h1_platelets_max',
       'h1_platelets_min','h1_potassium_max','h1_potassium_min','h1_sodium_max','h1_sodium_min','h1_wbc_max','h1_wbc_min',
       'd1_arterial_pco2_max','d1_arterial_pco2_min','d1_arterial_ph_max','d1_arterial_ph_min','d1_arterial_po2_max',
       'd1_arterial_po2_min','d1_pao2fio2ratio_min','h1_arterial_pco2_max','h1_arterial_pco2_min','h1_arterial_ph_max',
        'h1_arterial_ph_min','h1_arterial_po2_max','h1_arterial_po2_min','h1_pao2fio2ratio_max','h1_pao2fio2ratio_min',
       'apache_3j_score']

# column order of the summary table
tb_header = ['anzics','eicu','mimic','orchestra']
# order in which the sources are drawn in every figure
plot_order = ['eicu','anzics','mimic','orchestra']

# savefig does not create missing folders, so make sure they exist up front
hist_dir = os.path.join('results','hist')
if not os.path.isdir(hist_dir):
    os.makedirs(hist_dir)  # creates 'results' as well when it is missing


def _stat(func, values):
    """Return func(values), or NaN when `values` is empty.

    Short-circuiting on empty input gives the same NaN that np.mean/np.std/
    np.median would produce, without their RuntimeWarnings.
    """
    return func(values) if len(values) > 0 else np.nan


d = {}
for c in cont:
    # non-missing values of this variable, one array per source database
    vals = {}
    for src in tb_header:
        vals[src] = df.loc[df['data_source']==src,c].dropna().values

    # summary statistics, listed in tb_header order
    d[c+'_avg'] = [_stat(np.mean, vals[src]) for src in tb_header]
    d[c+'_std'] = [_stat(np.std, vals[src]) for src in tb_header]
    d[c+'_med'] = [_stat(np.median, vals[src]) for src in tb_header]

    # shared bin edges: 50 equal-width bins between the pooled 1st and 99th percentiles
    pooled = df.loc[:,c].dropna()
    bins = np.linspace( pooled.quantile(0.01), pooled.quantile(0.99), 51 )
    # x positions for the line plot: left edge of every bin plus half a bin width
    centers = bins[0:-1] + (bins[1]-bins[0])/2.0

    # create a histogram of the data
    plt.figure(figsize=[16,10])
    plt.rcParams.update({'font.size': 20})
    binned = {}  # per-source bin heights, only for sources that have data
    for src in plot_order:
        # The original test `avg is np.nan` could never be True: np.mean
        # returns a fresh np.float64, which is not the np.nan object itself.
        # Checking the array length is the reliable "no data" test.
        if len(vals[src]) == 0:
            # no data: add a legend-only entry (an empty series draws nothing
            # but still advances the colour cycle, keeping colours stable)
            plt.plot([], [], 'o', label=src+' (no data)')
        else:
            # density=True replaces normed=True (removed in matplotlib 3.1)
            binned[src] = plt.hist(vals[src],
                                   bins=bins, histtype='bar',density=True, lw=2, alpha=0.5,
                                   label=src)[0]
    plt.legend(loc='upper right')
    plt.savefig(os.path.join(hist_dir,c+'.png'))
    plt.close()

    # just plot a line plot of the data (bin heights smoothed by a 3-point median filter)
    plt.figure(figsize=[16,10])
    plt.rcParams.update({'font.size': 20})
    for src in plot_order:
        if src in binned:
            plt.plot(centers, medfilt(binned[src],3), '-', lw=2, alpha=0.5, label=src)
        else:
            # no data: legend-only entry.  The original list placeholders
            # would have raised a TypeError here (list + float).
            plt.plot([], [], 'o', label=src+' (no data)')
    plt.legend(loc='upper right')
    plt.savefig(os.path.join('results',c+'.png'))
    plt.close()

# one row per statistic (<variable>_avg/_std/_med), one column per source
tb = pd.DataFrame(d, index=tb_header)
tb=tb.transpose()
display(HTML(tb.to_html().replace('NaN', '')))
        

In [None]:
# Write the summary table to disk, keeping its row labels as the first CSV column.
tb.to_csv(path_or_buf='gossis-stats.csv', index=True)

In [None]:
# Overlay the d1_creatinine_max distributions of the mimic, eicu and anzics rows.
# NOTE(review): these bin edges (0-200 in steps of 2) are the same as in the
# APACHE score cells - confirm they suit the units creatinine is stored in.
bins = np.linspace(0, 200, 101)
plt.figure(figsize=[16,10])
for source in ['mimic', 'eicu', 'anzics']:
    creat = df.loc[df['data_source']==source,'d1_creatinine_max'].dropna().values
    # `density=True` replaces the `normed=True` keyword, which was deprecated
    # in matplotlib 2.1 and removed in 3.1 (passing it now raises an error).
    plt.hist(creat, bins,
             density=True, alpha=0.5, label=source)
plt.legend(loc='upper right')
plt.show()