# Load data from XLSX, parse it appropriately

This script loads the data from an XLSX file and parses the data for concepts required for the GOSSIS project. The script outputs the `nicst-gossis-data.csv` file for later use.

Requirements:

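* `SA_dataset_2017417.xlsx` (note: the load cell below reads `sa_dataset_2017417.csv`, presumably a CSV export of this workbook)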

In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt

from collections import OrderedDict

%matplotlib inline

## Load in the data

In [2]:
# Read the raw NICST file; its first row supplies the column names.
df = pd.read_csv('sa_dataset_2017417.csv', sep=',', header=0)
# Alternative input kept from the original notebook:
#df = pd.read_csv('anzics_mini.csv',header=0,sep=',')

# Lower-case every column name so later cells can refer to them in lower case.
df.columns = list(map(str.lower, df.columns))

# Attach constant provenance fields to every row.
for field, value in (('data_source', 'nicst'), ('country', 'South Asia')):
    df[field] = value

In [3]:
# Report the cohort size before and after each exclusion criterion, then keep
# only the ICU stays that pass every criterion.
# NOTE(review): the message below says "age > 18"; confirm whether the
# study-design exclusion was actually for patients *under* 18.
print('Exclusions which already existed during study design: age > 18.')
n_initial = df.shape[0]
print('Initial cohort: {} ICU stays.'.format(n_initial))

# Each entry is (label printed in the report, boolean mask of rows to remove).
# Every mask is computed on the unfiltered frame, so the percentages are
# relative to the initial cohort and one stay can be counted under several
# reasons.
exclusions = [
    ('missing outcome', df['mortality'].isnull()),
    # placeholder: no readmission information is used, so nothing is removed
    ('readmissions', pd.Series(False, index=df.index)),
    # written as ~(score > 0) so that missing (NaN) scores are removed as well
    ('patients missing apache prediction', ~(df['a_score'] > 0)),
    # only heart rate is checked here, although the label says "missing data"
    ('missing data', df['heart_rate_lowest'].isnull() | df['heart_rate_highest'].isnull()),
]

idxKeep = pd.Series(True, index=df.index)
for reason, idxRem in exclusions:
    n_removed = np.sum(idxRem)
    print('\t{} ({:2.2f}%) - {}.'.format(n_removed, n_removed*100.0/n_initial, reason))
    idxKeep = idxKeep & ~idxRem

# .copy() makes the filtered cohort an independent frame, so the in-place
# `df.loc[...] = ...` fixes in later cells do not raise SettingWithCopyWarning.
df = df.loc[idxKeep, :].copy()

print('Final cohort: {} ICU stays.'.format(df.shape[0]))
print('Final cohort size: {}'.format(df.shape[0]))

Exclusions which already existed during study design: age > 18.
Initial cohort: 3855 ICU stays.
	234 (6.07%) - missing outcome.
	0 (0.00%) - readmissions.
	0 (0.00%) - patients missing apache prediction.
	264 (6.85%) - missing data.
Final cohort: 3419 ICU stays.
Final cohort size: 3419


## Fix some data

Things we can fix:

* `height` of "NA###" are weights accidentally placed next to an NA height

In [4]:
# Rows whose `height` text contains "NA" hold a weight that was entered next
# to a missing height: move the remaining text into `weight`, then mark the
# height as missing.
idxFix = df['height'].map(lambda value: 'NA' in str(value)).tolist()
misplaced_weights = df.loc[idxFix, 'height'].map(lambda text: text.replace('NA', '').strip(' '))
df.loc[idxFix, 'weight'] = misplaced_weights.tolist()
df.loc[idxFix, 'height'] = 'na'

## Create the mapping from NICST variables to GOSSIS variables

First we define functions/dictionaries necessary for mapping any coded data into a general format.

In [5]:
# Lookup tables used to translate coded NICST values.

# Site code -> country name. Trailing numbers are the observation counts
# noted by the original author.
dict_site = {
    1: 'Bangladesh',  # 430
    2: 'Nepal',       # 325
    3: 'India',       # number of obs: 2283
    4: 'Sri Lanka',   # 817
}

# Raw sex value -> 'M' / 'F'; placeholder and missing values map to None.
dict_gender = {
    'Male': 'M', 'male': 'M',
    'Female': 'F', 'female': 'F',
    '.': None, np.nan: None,
}

def fixWeight(x):
    """Convert one raw weight entry to a number.

    Parameters
    ----------
    x : float, int or str
        Raw cell value. Strings may carry a "kg" suffix in any letter case
        (e.g. ``"70 kg"``, ``"70KG"``).

    Returns
    -------
    float
        Floats are returned unchanged, integers are converted to float, and
        strings are parsed after removing the unit. ``np.nan`` is returned
        for unparseable strings and for any other type (e.g. ``None``).
    """
    if isinstance(x, (float, np.floating)):
        return x
    # integers are valid weights too; bool is excluded because it subclasses int
    if isinstance(x, (int, np.integer)) and not isinstance(x, bool):
        return float(x)
    if isinstance(x, str):
        try:
            # lower-case first so "KG"/"Kg" are stripped like "kg"
            return float(x.lower().replace('kg', '').strip(' '))
        except ValueError:
            # leftover text that is not a number
            return np.nan
    return np.nan
    
    
def fixHeight(x):
    """Convert one raw height entry to a number of centimetres.

    Parameters
    ----------
    x : float, int or str
        Raw cell value. Strings may carry an "inch"/"inches" or "cm" unit
        (any letter case) or a stray "na" marker.

    Returns
    -------
    float
        Floats are returned unchanged and integers are converted to float
        (no unit conversion is applied to numeric input). Strings in inches
        are multiplied by 2.54; strings in cm, or without a unit, are parsed
        as-is. ``np.nan`` is returned for unparseable strings and for any
        other type (e.g. ``None``).
    """
    if isinstance(x, (float, np.floating)):
        return x
    # integers are valid heights too; bool is excluded because it subclasses int
    if isinstance(x, (int, np.integer)) and not isinstance(x, bool):
        return float(x)
    if not isinstance(x, str):
        return np.nan

    z = x.lower()
    try:
        if 'inch' in z:
            # strip "inches" before "inch" so the plural does not leave "es" behind
            return float(z.replace('inches', '').replace('inch', '').strip(' ')) * 2.54
        if 'cm' in z:
            return float(z.replace('cm', '').strip(' '))
        # no unit: drop a stray "na" marker and parse what remains
        return float(z.replace('na', '').strip(' '))
    except ValueError:
        # leftover text that is not a number (including a bare "na")
        return np.nan
    
    
def fixAge(x):
    """Convert one raw age entry to a number of years.

    Parameters
    ----------
    x : float, int or str
        Raw cell value. Strings may carry a "month(s)", "yr(s)" or "year(s)"
        unit (any letter case) or a stray "na" marker.

    Returns
    -------
    float
        Floats are returned unchanged and integers are converted to float.
        Strings in months are divided by 12; strings in years are parsed
        as-is. A unit-less string above 150 is treated as bad data and
        becomes ``np.nan``. ``np.nan`` is also returned for unparseable
        strings and for any other type (e.g. ``None``).

    Notes
    -----
    The > 150 plausibility check is applied only to unit-less strings, as in
    the original implementation; numeric input and strings with a unit are
    not range-checked.
    """
    if isinstance(x, (float, np.floating)):
        return x
    # integers are valid ages too; bool is excluded because it subclasses int
    if isinstance(x, (int, np.integer)) and not isinstance(x, bool):
        return float(x)
    if not isinstance(x, str):
        return np.nan

    z = x.lower()
    try:
        # convert to years; plural is stripped before singular so no "s" is left
        if 'month' in z:
            return float(z.replace('months', '').replace('month', '').strip(' ')) / 12.0
        if 'yr' in z:
            return float(z.replace('yrs', '').replace('yr', '').strip(' '))
        if 'year' in z:
            return float(z.replace('years', '').replace('year', '').strip(' '))
        # no unit: drop a stray "na" marker and parse what remains
        years = float(z.replace('na', '').strip(' '))
    except ValueError:
        # leftover text that is not a number
        return np.nan

    # bad data: nobody is older than 150 years
    return np.nan if years > 150 else years
    
    
def ensurePercentage(x):
    """Return `x` when it lies within [0, 100]; otherwise return NaN.

    A NaN input fails both range comparisons and is therefore passed
    through unchanged.
    """
    out_of_range = x > 100 or x < 0
    if out_of_range:
        return np.nan
    return x

Now we define a dictionary which maps from the GOSSIS variable name (the key) to the NICST data (the value) - where the latter is either:
* a direct copy-paste of the data (in which case the value is the string name of the column in the NICST data)
* a function of the data (usually involves calling the dictionary to map from coded values to the general form)

In [6]:
# Mapping from each GOSSIS output variable (key) to its NICST source (value).
# A value is one of:
#   - None      : the variable is not available in the NICST data
#   - a string  : name of the NICST column to copy unchanged
#   - a callable: receives the whole NICST DataFrame and returns the column
#
# Every callable must accept the DataFrame, because that is how the mapping is
# invoked when the output table is built. Scalar helpers such as fixAge are
# therefore wrapped with Series.map instead of being listed directly: passing
# the DataFrame straight to fixAge would make it return a single NaN.

def encFcn(x):
    """Build the identifier 'nicst_<id>' from the `id` column of DataFrame `x`."""
    return 'nicst_' + x['id'].astype(str)

field_map = OrderedDict([
['data_source', 'data_source']
, ['encounter_id', encFcn]
# the same identifier is used for patient and encounter
, ['patient_id', encFcn]
, ['country', lambda x: x['site'].map(dict_site)]
, ['hospital_id', None]
, ['teaching_hospital', None]
, ['hospital_bed_size', None]
, ['hospital_type', None]
, ['icu_id', None]
, ['icu_type', None]
, ['icu_stay_type', None]
# assumes the raw age column is named 'age' - TODO confirm against the data
, ['age', lambda x: x['age'].map(fixAge)]
, ['gender', lambda x: x['sex'].map(dict_gender)]
, ['weight', lambda x: x['weight'].map(fixWeight)]
, ['height', lambda x: x['height'].map(fixHeight)]
# weight divided by (height / 100) squared; fixHeight yields cm for strings with units
, ['bmi', lambda x: x['weight'].map(fixWeight) / ( (x['height'].map(fixHeight)/100) ** 2)]
, ['ethnicity', None]
, ['pregnant', None]
, ['smoking_status', None]
, ['hospital_admit_source', None]
, ['hospital_disch_location', None]
, ['hospital_los_days', None]
, ['hospital_death', None]
, ['icu_admit_source', None]
, ['icu_admit_type', None]
, ['icu_disch_location', None]
, ['pre_icu_los_days', None]
, ['icu_los_days', 'lengthofstay']
, ['icu_death', 'mortality']
, ['elective_surgery', None]
, ['readmission_status', None]
# === VITALS === #
, ['d1_heartrate_min', 'heart_rate_lowest']
, ['d1_heartrate_max', 'heart_rate_highest']
, ['d1_resprate_min', 'resp_rate_lowest']
, ['d1_resprate_max', 'resp_rate_highest']
, ['d1_spo2_min', lambda x: x['saturate_lowest'].map(ensurePercentage)]
, ['d1_spo2_max', lambda x: x['saturate_highest'].map(ensurePercentage)]
, ['d1_temp_min', 'temp_lowest']
, ['d1_temp_max', 'temp_highest']
, ['d1_sysbp_invasive_min', None]
, ['d1_sysbp_invasive_max', None]
, ['d1_diasbp_invasive_min', None]
, ['d1_diasbp_invasive_max', None]
, ['d1_mbp_invasive_min', None]
, ['d1_mbp_invasive_max', None]
, ['d1_sysbp_noninvasive_min', None]
, ['d1_sysbp_noninvasive_max', None]
, ['d1_diasbp_noninvasive_min', None]
, ['d1_diasbp_noninvasive_max', None]
, ['d1_mbp_noninvasive_min', None]
, ['d1_mbp_noninvasive_max', None]
, ['d1_sysbp_min', 'sys_bp_lowest']
, ['d1_sysbp_max', 'sys_bp_highest']
, ['d1_diasbp_min', None]
, ['d1_diasbp_max', None]
, ['d1_mbp_min', 'map_lowest']
, ['d1_mbp_max', 'map_highest']
, ['d1_pasys_invasive_min', None]
, ['d1_pasys_invasive_max', None]
, ['d1_padias_invasive_min', None]
, ['d1_padias_invasive_max', None]
, ['d1_pamean_invasive_min', None]
, ['d1_pamean_invasive_max', None]
# hourly
, ['h1_heartrate_min', None]
, ['h1_heartrate_max', None]
, ['h1_resprate_min', None]
, ['h1_resprate_max', None]
, ['h1_spo2_min', None]
, ['h1_spo2_max', None]
, ['h1_temp_min', None]
, ['h1_temp_max', None]
, ['h1_sysbp_invasive_min', None]
, ['h1_sysbp_invasive_max', None]
, ['h1_diasbp_invasive_min', None]
, ['h1_diasbp_invasive_max', None]
, ['h1_mbp_invasive_min', None]
, ['h1_mbp_invasive_max', None]
, ['h1_sysbp_noninvasive_min', None]
, ['h1_sysbp_noninvasive_max', None]
, ['h1_diasbp_noninvasive_min', None]
, ['h1_diasbp_noninvasive_max', None]
, ['h1_mbp_noninvasive_min', None]
, ['h1_mbp_noninvasive_max', None]
, ['h1_sysbp_min', None]
, ['h1_sysbp_max', None]
, ['h1_diasbp_min', None]
, ['h1_diasbp_max', None]
, ['h1_mbp_min', None]
, ['h1_mbp_max', None]
, ['h1_pasys_invasive_min', None]
, ['h1_pasys_invasive_max', None]
, ['h1_padias_invasive_min', None]
, ['h1_padias_invasive_max', None]
, ['h1_pamean_invasive_min', None]
, ['h1_pamean_invasive_max', None]
# === LABS/BLOOD GASES === #
, ['d1_albumin_min', None]
, ['d1_albumin_max', None]
, ['d1_bilirubin_min', None]
, ['d1_bilirubin_max', None]
, ['d1_bun_min', 'bun_lowest']
, ['d1_bun_max', 'bun_highest']
, ['d1_calcium_min', None]
, ['d1_calcium_max', None]
, ['d1_creatinine_min', 'creatinine_lowest'] # mg/dL
, ['d1_creatinine_max', 'creatinine_highest'] # mg/dL
, ['d1_glucose_min', None]
, ['d1_glucose_max', None]
, ['d1_inr_min', None]
, ['d1_inr_max', None]
, ['d1_hco3_min', None]
, ['d1_hco3_max', None]
, ['d1_hematocrit_min', lambda x: x['pcv_lowest'].map(ensurePercentage)]
, ['d1_hematocrit_max', lambda x: x['pcv_highest'].map(ensurePercentage)]
, ['d1_hemaglobin_min', 'hb_lowest'] # g/dL
, ['d1_hemaglobin_max', 'hb_highest'] # g/dL
, ['d1_lactate_min', None]
, ['d1_lactate_max', None]
, ['d1_platelets_min', None]
, ['d1_platelets_max', None]
# 'pottasium' is spelled this way in the mapping's source column names
, ['d1_potassium_min', 'pottasium_lowest'] # mmol/L == mEq/L
, ['d1_potassium_max', 'pottasium_highest'] # mmol/L == mEq/L
, ['d1_sodium_min', 'sodium_lowest'] # mmol/L == mEq/L
, ['d1_sodium_max', 'sodium_highest'] # mmol/L == mEq/L
, ['d1_wbc_min', 'wbc_lowest']
, ['d1_wbc_max', 'wbc_highest']
, ['d1_arterial_ph_min', 'ph_lowest']
, ['d1_arterial_ph_max', 'ph_highest']
, ['d1_arterial_po2_min', 'pao2_lowest']
, ['d1_arterial_po2_max', 'pao2_highest']
, ['d1_arterial_pco2_min', None]
, ['d1_arterial_pco2_max', None]
, ['d1_pao2fio2ratio_min', None]
, ['d1_pao2fio2ratio_max', None]
# hourly
, ['h1_albumin_min', None]
, ['h1_albumin_max', None]
, ['h1_bilirubin_min', None]
, ['h1_bilirubin_max', None]
, ['h1_bun_min', None]
, ['h1_bun_max', None]
, ['h1_calcium_min', None]
, ['h1_calcium_max', None]
, ['h1_creatinine_min', None]
, ['h1_creatinine_max', None]
, ['h1_glucose_min', None]
, ['h1_glucose_max', None]
, ['h1_inr_min', None]
, ['h1_inr_max', None]
, ['h1_hco3_min', None]
, ['h1_hco3_max', None]
, ['h1_hematocrit_min', None]
, ['h1_hematocrit_max', None]
, ['h1_hemaglobin_min', None]
, ['h1_hemaglobin_max', None]
, ['h1_lactate_min', None]
, ['h1_lactate_max', None]
, ['h1_platelets_min', None]
, ['h1_platelets_max', None]
, ['h1_potassium_min', None]
, ['h1_potassium_max', None]
, ['h1_sodium_min', None]
, ['h1_sodium_max', None]
, ['h1_wbc_min', None]
, ['h1_wbc_max', None]
, ['h1_arterial_ph_min', None]
, ['h1_arterial_ph_max', None]
, ['h1_arterial_po2_min', None]
, ['h1_arterial_po2_max', None]
, ['h1_arterial_pco2_min', None]
, ['h1_arterial_pco2_max', None]
, ['h1_pao2fio2ratio_min', None]
, ['h1_pao2fio2ratio_max', None]
# === COMORBIDITIES === #
#, [None, 'diabetes']
#, [None, 'diabetes_type']
#, [None, 'aids']
#, [None, 'tuberculosis']
#, [None, 'past_tb']
#, [None, 'hepatitis_b']
#, [None, 'other_chronic']
#, [None, 'other_chronic_specify']
#, [None, 'antidiabetic']
#, [None, 'antidiabetic_specify']
#, [None, 'stayhr']

# === APACHE III VARIABLES === #
, ['albumin_apache', None]
, ['bilirubin_apache', None]
, ['creatinine_apache', None]
, ['glucose_apache', None]
, ['hematocrit_apache', None]
, ['heart_rate_apache', None]
, ['map_apache', None]
, ['sodium_apache', None]
# aps iii oxygenation blood gas
, ['fio2_apache', None]
, ['paco2_apache', None]
, ['pao2_apache', None]
# aps iii acid-base components
, ['ph_apache', None]
, ['paco2_for_ph_apache', None]
, ['resprate_apache', None]
, ['temp_apache', None]
, ['bun_apache', None]
, ['urineoutput_apache', None]
, ['wbc_apache', None]
, ['gcs_eyes_apache', None]
, ['gcs_motor_apache', None]
, ['gcs_verbal_apache', None]
, ['gcs_apache', 'gcs_lowest']
, ['gcs_unable_apache', None]
, ['arf_apache', None]
, ['ventilated_apache', 'mechanical_first_24h']
# === TREATMENTS === #
#, [None, 'vasoactive_first_h']
#, [None, 'antibiotic_drugs']
# === OUTPUT OF SCORING SYSTEMS === #
#, [None, 'a_score'] # probably apache ii
#, [None, 'prob'] # probably apache ii
, ['apsiii', None]
, ['apache_3j_score', None]
, ['apache_3j_hospital_death_prob', None]
, ['apache_4a_icu_death_prob', None]
, ['apache_4a_hospital_death_prob', None]
])
        
# === OTHER UNUSED VARIABLES === #
#, [None, 'apachecode']
#, [None, 'diagnosis']
#, [None, 'temp_onadm'],
#, [None, 'map_onadm']
#, [None, 'heart_rate_onadm']
#, [None, 'resp_rate_onadm']
#, [None, 'pao2_onadm']
#, [None, 'ph_onadm'],
#, [None, 'sodium_onadm'],
#, [None, 'pottasium_onadm'],
#, [None, 'creatinine_onadm'],
#, [None, 'pcv_onadm']
#, [None, 'wbc_onadm']
#, [None, 'gcs_highest']
#, [None, 'gcs_onadm']
#, [None, 'hb_onadm'],
#, [None, 'sys_bp_onadm'],
#, [None, 'saturate_onadm'],
#, [None, 'fio2_highest']
#, [None, 'fio2_lowest']
#, [None, 'fio2_onadm']

# Load in the header

In [7]:
# header.csv has no header row of its own; its first column is taken as the
# array of output variable names.
header_table = pd.read_csv('../hdr/header.csv', sep=',', header=None)
hdr = header_table[0].values

In [8]:
# Build the output table with exactly one column per variable named in `hdr`.
# Columns are collected first and the frame is created in a single step, which
# avoids inserting hundreds of columns one at a time into a DataFrame.
new_columns = OrderedDict()
for c in hdr:
    mapping = field_map.get(c)
    # did not find a mapping for the given variable
    if c not in field_map:
        print('WARNING: {} not found in field mapping for NICST data!'.format(c))
        new_columns[c] = None
    # there is a mapping, but it indicates that we don't have any data
    elif mapping is None:
        # plug in missing data into final dataframe
        #print('WARNING: {} not available in NICST data!'.format(c))
        new_columns[c] = None
    # there is a mapping, and the NICST definition matches the GOSSIS definition
    elif isinstance(mapping, str):
        # column names in `df` were lower-cased on load, so use the lower-cased
        # name for both the existence check and the lookup
        source_col = mapping.lower()
        if source_col in df.columns.values:
            # data exists, copy it over
            new_columns[c] = df[source_col]
        else:
            print('WARNING: {} equivalent not found in NICST data! (Looked for "{}".)'.format(c, source_col))
            # still emit an empty column so the output keeps every header variable
            new_columns[c] = None
    # there is a mapping, and it's a function (usually a dictionary)
    else:
        # call the mapping with the whole NICST frame
        new_columns[c] = mapping(df)

# index=df.index lets the None placeholders broadcast to one value per stay
df_new = pd.DataFrame(new_columns, index=df.index)



# 3 - Output the data to a csv file

In [9]:
# Write the mapped table to disk, leaving out the DataFrame index column.
output_csv = 'nicst-gossis-data.csv'
df_new.to_csv(output_csv, index=False)