In [None]:
from __future__ import print_function

import numpy as np
import pandas as pd
import yaml
import matplotlib.pyplot as plt

from collections import OrderedDict

%matplotlib inline

In [None]:
# Map each dataset name to the path of its gzip-compressed CSV.
# Insertion order is the order in which the frames are loaded into df_list.
datasets = OrderedDict((
    ('eicu', 'eicu/eicu-gossis-data.csv.gz'),
    ('anzics', 'anzics/anzics-gossis-data.csv.gz'),
    ('mimic-iii', 'mimic-iii/mimic-iii-gossis-data.csv.gz'),
    # ('orchestra', 'orchestra/orchestra-gossis-data.csv.gz'),  # disabled
    ('nicst', 'nicst/nicst-gossis-data.csv.gz'),
    # ('satiq', 'satiq/satiq-gossis-data.csv.gz'),  # disabled
))

# Read one DataFrame per dataset; the diagnosis columns are forced to str.
df_list = []
for name, path in datasets.items():
    print('Loading {}...'.format(name), end=' ')
    frame = pd.read_csv(
        path,
        header=0,
        sep=',',
        dtype={'apache_3j_diagnosis': str, 'apache_2_diagnosis': str},
    )
    df_list.append(frame)
    print('done.')

In [None]:
# Read the header file (no header row of its own) and keep the values of its
# first column as the list of expected column names.
hdr_table = pd.read_csv('hdr/header.csv', sep=',', header=None)
hdr = hdr_table[0].values

In [None]:
# For every header column, report which datasets hold no values for it.
print('Is the data available for the given dataset?')
print('A blank indicates it is available. N/A specifies not available.')
print('')

# Title row: a padded label, then the first four characters of each dataset name.
title_cells = ['{:30s}'.format('dataset')] + [name[0:4] for name in datasets]
print('\t'.join(title_cells), end='\t\n')

# print_list holds one availability flag per loaded dataset.
print_list = [''] * len(df_list)
for column in hdr:
    print('{:30s}'.format(column), end="\t")

    # 'N/A' when every value of the column is null in that dataset, else blank.
    print_list = ['N/A' if frame[column].isnull().all() else ''
                  for frame in df_list]

    print('\t'.join(print_list))

In [None]:
# Write the availability table to GOSSIS_VARIABLE_COMPLETION.csv, together with
# the category, description and unit of measure of each variable.
data_source = list(datasets.keys())

# load yaml definitions
# safe_load only builds plain Python objects; yaml.load() without an explicit
# Loader is unsafe and is rejected outright by PyYAML >= 6. A parse error is
# allowed to propagate: continuing would leave varlist undefined (or stale
# from an earlier run of this cell).
with open("hdr/variable-definitions.yaml", 'r') as stream:
    varlist = yaml.safe_load(stream)

# convert to dataframe
df_var = pd.DataFrame.from_dict(varlist, orient='index')
df_var['varname'] = df_var.index

# we only keep definitions which are in the header file
varRemove = [name for name in df_var.index if name not in hdr]
df_var.drop(varRemove, axis=0, inplace=True)

# specify the order of the categories - data is output in this order
# (categories missing from this mapping get NaN and therefore sort last)
category_order = {'identifier': 1,
                  'demographic': 2,
                  'APACHE covariate': 3,
                  'vitals': 4,
                  'labs': 5,
                  'labs blood gas': 6,
                  'APACHE prediction': 10}
df_var['category_order'] = df_var['category'].map(category_order)

# sort df by the category, then by the variable name
df_var.sort_values(['category_order', 'varname'], inplace=True)


def _csv_quote(value):
    """Return value as a double-quoted CSV field.

    Missing values (None/NaN) become an empty quoted field, and embedded
    double quotes are doubled so the field stays valid CSV.
    """
    if pd.isnull(value):
        return '""'
    return '"' + str(value).replace('"', '""') + '"'


with open("GOSSIS_VARIABLE_COMPLETION.csv", "w") as fp:
    # Joining the fixed columns and the dataset names in one list guarantees a
    # separator between 'unitofmeasure' and the first dataset name.
    header_cells = ['variables', 'category', 'description', 'unitofmeasure']
    fp.write(','.join(header_cells + data_source))
    fp.write('\n')

    for c in df_var.index:
        # 'N/A' when every value of the column is null in that dataset.
        # Built here so the cell does not depend on a list from another cell.
        print_list = ['N/A' if d[c].isnull().all() else '' for d in df_list]

        # variable name, then its category/description/unit, then availability
        row_cells = [c,
                     _csv_quote(df_var.loc[c, 'category']),
                     _csv_quote(df_var.loc[c, 'description']),
                     _csv_quote(df_var.loc[c, 'unitofmeasure'])]
        fp.write(','.join(row_cells + print_list))
        fp.write('\n')
        

In [None]:
# Stack the per-dataset frames row-wise into one frame with a fresh index,
# then show how many rows each data_source value contributes.
df = pd.concat(df_list, axis=0, ignore_index=True)
df['data_source'].value_counts()

In [None]:
# for the purposes of data comparison, combine carevue/metavision/both
# under the single label 'mimic'
# Series.isin replaces np.in1d, which is deprecated since NumPy 2.0.
mimic_labels = ['carevue', 'metavision', 'both']
df.loc[df['data_source'].isin(mimic_labels), 'data_source'] = 'mimic'
df['data_source'].value_counts()

In [None]:
# add in the apache 3 body system
# Read the mapping file, keeping both diagnosis code columns as strings.
ap3_map = pd.read_csv(
    'etc/apache3-to-apache2.csv',
    sep=',',
    dtype={'apache_3j_diagnosis': str, 'apache_2_diagnosis': str},
)

# Discard the mapping columns that are not carried into the merged data.
ap3_map = ap3_map.drop(
    ['apache_2_diagnosis', 'ANZICS Added', 'apache_3j_name'],
    axis=1,
)
# create a column containing only the digits before '.'
# this is the apache3 diagnosis
def get_ap3_code(x):
    """Return the part of a diagnosis code string before its first '.'.

    Parameters
    ----------
    x : object
        A single diagnosis value. Strings are processed; anything else
        (None, NaN, numbers, ...) is treated as missing.

    Returns
    -------
    str or None
        The text before the first '.', or the whole string when it has no
        '.'; None when x is not a string.
    """
    # isinstance is an exact type test; matching 'str' inside the type's repr
    # also accepted unrelated types whose name merely contains "str".
    if not isinstance(x, str):
        return None
    # split() yields the whole string as its only piece when there is no '.'
    return x.split('.', 1)[0]
    
# Temporary merge key: the diagnosis code reduced by get_ap3_code.
ap3_key = df['apache_3j_diagnosis'].map(get_ap3_code)
df['apache3dx'] = ap3_key

# Left join keeps every row of df; columns of ap3_map whose names clash with
# df receive the '_ap' suffix.
df = df.merge(
    ap3_map,
    left_on='apache3dx',
    right_on='apache_3j_diagnosis',
    how='left',
    suffixes=('', '_ap'),
)

# Remove the temporary key, the suffixed key column brought in from ap3_map,
# and the apache_3j_operative column.
df = df.drop(
    ['apache_3j_diagnosis_ap', 'apache3dx', 'apache_3j_operative'],
    axis=1,
)

In [None]:
# add in the apache 2 body system
# Read the definitions file, keeping the diagnosis code as a string.
ap2_map = pd.read_csv(
    'etc/apache2-definitions.csv',
    sep=',',
    dtype={'apache_2_diagnosis': str},
)

# Discard the name and coefficient columns.
ap2_map = ap2_map.drop(['apache_2_name', 'apache_2_coefficient'], axis=1)

# Recode the operative column as 1 for 'Post-operative' and 0 for anything else.
is_post_operative = ap2_map['apache_2_operative'] == 'Post-operative'
ap2_map['apache_2_operative'] = is_post_operative.astype(int)

In [None]:
# Left join ap2_map onto df, matching on the diagnosis code together with the
# operative flag (apache_post_operative in df, apache_2_operative in ap2_map).
df = df.merge(
    ap2_map,
    left_on=['apache_2_diagnosis', 'apache_post_operative'],
    right_on=['apache_2_diagnosis', 'apache_2_operative'],
    how='left',
    suffixes=('', '_ap'),
)

# The right-hand key column is only needed for the join.
df = df.drop('apache_2_operative', axis=1)

In [None]:
# Write the combined dataframe to a gzip-compressed CSV, without the row index.
df.to_csv(
    'gossis-data.csv.gz',
    compression='gzip',
    index=False,
)