In [8]:
from __future__ import print_function

import numpy as np
import pandas as pd
import yaml
import matplotlib.pyplot as plt

from collections import OrderedDict

%matplotlib inline

In [10]:
# Map each data source name to the CSV file holding its extract.
# Insertion order matters: it determines the order of the frames in `df_list`.
datasets = OrderedDict((
    ('eicu', 'eicu/eicu-gossis-data.csv.gz'),
    ('anzics', 'anzics/anzics-gossis-data.csv.gz'),
    ('mimic-iii', 'mimic-iii/mimic-iii-gossis-data.csv'),
    ('orchestra', 'orchestra/orchestra-gossis-data.csv'),
    ('nicst', 'nicst/nicst-gossis-data.csv'),
    ('satiq', 'satiq/satiq-gossis-data.csv'),
))

# Read every file into its own DataFrame, reporting progress as we go.
df_list = []
for source_name, csv_path in datasets.items():
    print('Loading {}...'.format(source_name), end=' ')
    df_list.append(pd.read_csv(csv_path, header=0, sep=','))
    print('done.')

Loading eicu... done.
Loading anzics... done.
Loading mimic-iii... done.
Loading orchestra... done.
Loading nicst... done.
Loading satiq... done.


In [11]:
# Read the header file (it has no header row of its own) and keep column 0
# as an array; these values are used as column names further down.
header_table = pd.read_csv('hdr/header.csv', header=None, sep=',')
hdr = header_table[0].values

In [18]:
# Report, for every column named in the header, which datasets hold no data for it
print('Is the data available for the given dataset?')
print('A blank indicates it is available. N/A specifies not available.')
print('')

# title row: a padded label followed by the first four characters of each dataset name
title_cells = ['{:30s}'.format('dataset')] + [name[0:4] for name in datasets]
print('\t'.join(title_cells), end='\t\n')

# one flag per dataset, in the same order as df_list
print_list = ['']*len(df_list)
for column in hdr:
    print('{:30s}'.format(column), end="\t")

    # a column is flagged N/A only when every one of its values is null
    print_list = ['N/A' if np.all(frame[column].isnull()) else ''
                  for frame in df_list]

    print('\t'.join(print_list))

Is the data available for the given dataset?
A blank indicates it is available. N/A specifies not available.

dataset                       	eicu	anzi	mimi	orch	nics	sati	
encounter_id                  						
patient_id                    						
data_source                   						
country                       						
hospital_id                   					N/A	N/A
teaching_hospital             				N/A	N/A	N/A
hospital_bed_size             		N/A			N/A	N/A
hospital_bed_size_numeric     	N/A	N/A			N/A	N/A
hospital_type                 	N/A		N/A	N/A	N/A	N/A
icu_id                        					N/A	N/A
icu_type                      		N/A			N/A	N/A
icu_stay_type                 				N/A	N/A	N/A
age                           					N/A	
gender                        						
weight                        						
height                        						N/A
bmi                           						N/A
ethnicity                     				N/A	N/A	N/A
pregnant                      	N/A			N/A	N/A	N/A
smoking_status      

In [20]:
# names of the data sources, in the same order as the frames in df_list
data_source = list(datasets.keys())

# Write the same availability table as above to a CSV file, this time with
# each variable's category, description and unit of measure.

# Load the YAML variable definitions.
# safe_load only constructs plain Python objects; a bare yaml.load(stream) can
# instantiate arbitrary objects and is rejected by PyYAML >= 6, which requires
# an explicit Loader. A parse error is allowed to propagate: printing it and
# carrying on would leave `varlist` undefined (or stale from an earlier run).
with open("hdr/variable-definitions.yaml", 'r') as stream:
    varlist = yaml.safe_load(stream)

# convert to dataframe: one row per variable, indexed by variable name
df_var = pd.DataFrame.from_dict(varlist, orient='index')
df_var['varname'] = df_var.index

# we only keep definitions which are in the header file
df_var = df_var.loc[df_var.index.isin(hdr)].copy()

# specify the order of the categories - data is output in this order
# (a category missing from this mapping maps to NaN and is sorted last)
category_order = {'identifier': 1,
                  'demographic': 2,
                  'APACHE covariate': 3,
                  'vitals': 4,
                  'labs': 5,
                  'labs blood gas': 6,
                  'APACHE prediction': 10}
df_var['category_order'] = df_var['category'].map(category_order)

# sort df by the category, then by the variable name
df_var = df_var.sort_values(['category_order', 'varname'])

with open("GOSSIS_VARIABLE_COMPLETION.csv", "w") as fp:
    # Header row. The trailing comma after 'unitofmeasure' is required:
    # without it the first dataset name is fused onto that column name and
    # the header has one column fewer than the data rows.
    fp.write('variables,category,description,unitofmeasure,')
    fp.write(','.join(data_source))
    fp.write('\n')

    for c in df_var.index:
        # Built fresh for every row so this cell does not depend on a
        # `print_list` left behind by an earlier cell.
        # A dataset is flagged N/A only when every value of the column is null.
        print_list = ['N/A' if np.all(d[c].isnull()) else '' for d in df_list]

        fp.write(c + ',')

        # write the category/description/unit of the column as quoted fields;
        # embedded double quotes are doubled so the row stays valid CSV
        for field in ('category', 'description', 'unitofmeasure'):
            fp.write('"' + df_var.loc[c, field].replace('"', '""') + '",')

        # write whether data is available
        fp.write(','.join(print_list))
        fp.write('\n')
        

In [21]:
# stack all per-source frames into a single frame with a fresh row index
df = pd.concat(objs=df_list, ignore_index=True)

# row count per data_source label in the merged frame
df['data_source'].value_counts()

anzics        266136
eicu          122893
orchestra      59693
carevue        22097
metavision     15964
nicst           3419
satiq            579
both              78
Name: data_source, dtype: int64

In [22]:
# For the purposes of data comparison, relabel rows whose data_source is
# 'carevue', 'metavision' or 'both' with the single label 'mimic'.
# Series.isin is used instead of np.in1d, which is deprecated as of NumPy 2.0.
is_mimic = df['data_source'].isin(['carevue', 'metavision', 'both'])
df.loc[is_mimic, 'data_source'] = 'mimic'

# row count per data_source label after the relabelling
df['data_source'].value_counts()

anzics       266136
eicu         122893
orchestra     59693
mimic         38139
nicst          3419
satiq           579
Name: data_source, dtype: int64

In [23]:
# write the merged frame to a single CSV file, leaving out the row index
output_path = 'gossis-data.csv'
df.to_csv(output_path, index=False)