In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
import yaml
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# read the GOSSIS extract of each contributing database; every file is
# comma-separated and its first row holds the column names
df_eicu = pd.read_csv('eicu/eicu-gossis-data.csv', sep=',', header=0)
df_anzics = pd.read_csv('anzics/anzics-gossis-data.csv', sep=',', header=0)
df_mimic = pd.read_csv('mimic-iii/mimic-iii-gossis-data.csv', sep=',', header=0)
df_orchestra = pd.read_csv('orchestra/orchestra-gossis-data.csv', sep=',', header=0)
df_nicst = pd.read_csv('nicst/nicst-gossis-data.csv', sep=',', header=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# read the column names: header.csv has no header row of its own, and the
# names are taken from its first column as a numpy array
hdr = (
    pd.read_csv('hdr/header.csv', sep=',', header=None)
    .iloc[:, 0]
    .values
)

In [4]:
# For every column named in the header file, print one tab-separated row
# showing which datasets contain no values at all for that column.
for line in ('Is the data available for the given dataset?',
             'A blank indicates it is available. N/A specifies not available.',
             ''):
    print(line)

# table heading: a 30-character label column followed by the dataset names
print('{:30s}'.format('dataset'), end='\t')
data_source = ['anzics','eicu','mCV','mMV','orch','nicst']
print('\t'.join(data_source))

# MIMIC-III is reported as two datasets: the rows labelled 'carevue' (mCV)
# and all remaining rows (mMV)
idxCV = df_mimic['data_source']=='carevue'
df_list = [df_anzics, df_eicu, df_mimic[idxCV], df_mimic[~idxCV], df_orchestra, df_nicst]
print_list = [''] * len(df_list)

for c in hdr:
    print('{:30s}'.format(c),end="\t")

    # 'N/A' when every value of the column is null in that dataset, blank otherwise
    for i, d in enumerate(df_list):
        print_list[i] = 'N/A' if np.all(d[c].isnull()) else ''

    print('\t'.join(print_list))

Is the data available for the given dataset?
A blank indicates it is available. N/A specifies not available.

dataset                       	anzics	eicu	mCV	mMV	orch	nicst
encounter_id                  						
patient_id                    						
data_source                   						
country                       						
hospital_id                   						N/A
teaching_hospital             					N/A	N/A
hospital_bed_size             	N/A					N/A
hospital_bed_size_numeric     	N/A	N/A				N/A
hospital_type                 		N/A	N/A	N/A	N/A	N/A
icu_id                        						N/A
icu_type                      	N/A					N/A
icu_stay_type                 					N/A	N/A
age                           						N/A
gender                        						
weight                        						
height                        						
bmi                           						
ethnicity                     					N/A	N/A
pregnant                      		N/A	N/A		N/A	N/A
smoking_status                		N/A			N/A	N/A
hosp

In [5]:
# from here on MIMIC-III is treated as a single dataset (no carevue/metavision
# split), so the dataframe list, its labels and the per-dataset flags are rebuilt
df_list = [df_anzics, df_eicu, df_mimic, df_orchestra, df_nicst]
data_source = ['anzics','eicu','mimic','orch', 'nicst']
print_list = [''] * len(df_list)

In [6]:
# display the array of column names read from hdr/header.csv
hdr

array(['encounter_id', 'patient_id', 'data_source', 'country',
       'hospital_id', 'teaching_hospital', 'hospital_bed_size',
       'hospital_bed_size_numeric', 'hospital_type', 'icu_id', 'icu_type',
       'icu_stay_type', 'age', 'gender', 'weight', 'height', 'bmi',
       'ethnicity', 'pregnant', 'smoking_status', 'hospital_admit_source',
       'hospital_disch_location', 'hospital_los_days', 'hospital_death',
       'icu_admit_source', 'icu_disch_location', 'pre_icu_los_days',
       'icu_los_days', 'icu_death', 'elective_surgery',
       'readmission_status', 'd1_heartrate_min', 'd1_heartrate_max',
       'd1_resprate_min', 'd1_resprate_max', 'd1_spo2_min', 'd1_spo2_max',
       'd1_temp_min', 'd1_temp_max', 'd1_sysbp_invasive_min',
       'd1_sysbp_invasive_max', 'd1_diasbp_invasive_min',
       'd1_diasbp_invasive_max', 'd1_mbp_invasive_min',
       'd1_mbp_invasive_max', 'd1_sysbp_noninvasive_min',
       'd1_sysbp_noninvasive_max', 'd1_diasbp_noninvasive_min',
       'd1_dias

In [7]:
# Write the per-dataset availability table to GOSSIS_VARIABLE_COMPLETION.csv,
# adding each variable's category, description and unit of measure.

# load yaml definitions
# safe_load only builds plain YAML types; calling yaml.load() without an
# explicit Loader is deprecated since PyYAML 5.1 and an error from 6.0
with open("hdr/variable-definitions.yaml", 'r') as stream:
    try:
        varlist = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # show the parser message, then stop: everything below needs varlist,
        # and continuing could silently reuse a stale value from an earlier run
        print(exc)
        raise

# convert to dataframe: one row per variable, indexed by the variable name
df_var = pd.DataFrame.from_dict(varlist, orient='index')
df_var['varname'] = df_var.index

# we only keep definitions which are in the header file
varRemove = [v for v in df_var.index if v not in hdr]
df_var = df_var.drop(varRemove, axis=0)

# specify the order of the categories - data is output in this order
# (a category missing from this mapping maps to NaN and therefore sorts last)
category_order = {'identifier': 1,
                  'demographic': 2,
                  'APACHE covariate': 3,
                  'vitals': 4,
                  'labs': 5,
                  'labs blood gas': 6,
                  'APACHE prediction': 10}
df_var['category_order'] = df_var['category'].map(category_order)

# sort df by the category, then by the variable name
df_var = df_var.sort_values(['category_order','varname'])

with open("GOSSIS_VARIABLE_COMPLETION.csv","w") as fp:
    # header row: the trailing comma separates 'unitofmeasure' from the first
    # dataset name so the header has as many columns as every data row
    fp.write('variables,category,description,unitofmeasure,')
    fp.write(','.join(data_source))
    fp.write('\n')

    for c in df_var.index:
        # 'N/A' when every value of the column is null in that dataset, blank otherwise
        print_list = ['N/A' if np.all(d[c].isnull()) else '' for d in df_list]

        # the category/description/unit are wrapped in double quotes because
        # they may contain commas
        # NOTE(review): double quotes embedded in these texts are not escaped
        fields = [c,
                  '"' + df_var.loc[c, 'category'] + '"',
                  '"' + df_var.loc[c, 'description'] + '"',
                  '"' + df_var.loc[c, 'unitofmeasure'] + '"']

        # variable name, quoted metadata, then one availability flag per dataset
        fp.write(','.join(fields + print_list))
        fp.write('\n')
        

In [8]:
# stack all five sources into a single dataframe with a fresh row index,
# then show how many rows carry each data_source label
df = pd.concat(
    [df_eicu, df_anzics, df_mimic, df_orchestra, df_nicst],
    ignore_index=True
)
df['data_source'].value_counts()

anzics        266136
eicu          122893
orchestra      59693
carevue        22097
metavision     15964
nicst           3419
both              78
Name: data_source, dtype: int64

In [9]:
# for the purposes of data comparison, combine carevue/metavision/both
# under the single label 'mimic' (this overwrites df['data_source'] in place).
# Series.isin builds the same boolean row mask as np.in1d, which NumPy 2.0
# deprecates.
df.loc[df['data_source'].isin(['carevue','metavision','both']),'data_source'] = 'mimic'
df['data_source'].value_counts()

anzics       266136
eicu         122893
orchestra     59693
mimic         38139
nicst          3419
Name: data_source, dtype: int64

In [10]:
# write the merged dataframe to gossis-data.csv, leaving out the row index
df.to_csv('gossis-data.csv',index=False)