# Load data from CSV, parse it appropriately

This script loads the data from a CSV file and parses the data for concepts required for the GOSISS project. The script outputs the `anzics-gosiss-data.csv` file for later use.

Requirements:

* `anzics_2008_2016_deidentified.csv`

In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt

from collections import OrderedDict

%matplotlib inline

## Load in the data

In [2]:
df = pd.read_csv('anzics_2008_2016_deidentified.csv',header=0,sep=',')
#df = pd.read_csv('anzics_mini.csv',header=0,sep=',')

# convert columns to lower case
df.columns = [c.lower() for c in df.columns]

# the first column, site_id, has some weird text that we strip
c = df.columns[0]
if 'site_id' in c:
    df.columns = ['site_id' if (i==0) & ('site_id' in c) else c for i, c in enumerate(df.columns)]
else:
    print('WARNING: The first column may have changed. Check it for bad characters e.g. \xef\xbb\xbf')
    
# hard code the data source as a field
df['data_source'] = 'anzics'

  interactivity=interactivity, compiler=compiler, result=result)


We have decided to only look at data from 2014 onward - so we will subselect these patients from ANZICS.

In [3]:
# convert hospital discharge time to a timestamp
df['hosp_ds_dtm'] = pd.to_datetime(df['hosp_ds_dtm'])

# filter down ANZICS data to 2014-2015 anchored by their hospital discharge date
idx = df['hosp_ds_dtm'].map(lambda x: (x.year >= 2014) & (x.year <= 2015))
print('Removing {} admissions outside of [2014,2015].'.format(np.sum(~idx)))
df = df.loc[idx, :]

# remove readmissions
idx = df['readmitted']==1
print('Removing {} readmissions.'.format(np.sum(~idx)))
df = df.loc[idx, :]

# remove children
idx = df['age']>=16
print('Removing {} patients under 16 years old.'.format(np.sum(~idx)))
df = df.loc[idx, :]

Removing 863627 admissions outside of [2014,2015].
Removing 285807 readmissions.
Removing 212 patients under 16 years old.


## Create the mapping from ANZICS variables to GOSSIS variables

First we define functions/dictionaries necessary for mapping any coded data into a general format.

In [4]:
# dictionaries

# yes=1, no=2 dictionaries
dict_hospdied = {1: 1, 2: 0}
dict_icudied = {1: 1, 2: 0}
dict_elect = {1: 1, 2: 0}
dict_readmit = {1: 1, 2: 0}
dict_intub = {1: 1, 2: 0}
dict_vent = {1: 1, 2: 0}
dict_arf = {1: 1, 2: 0}
dict_preg = {1: 1, 2: 0}

# more specific dictionaries
dict_hosptype = {
    1: 'Rural/Regional',
    2: 'Metropolitan',
    3: 'Tertiary/Teaching',
    4: 'Private'}

dict_caretype = {
    1: 'ICU',
    2: 'HDU'
}

dict_hospadmit = {
    1: 'Home',
    2: 'Other Acute Hospital',
    3: 'Chronic Care Hospital (including nursing homes)',
    4: 'Other hospital ICU'
}

dict_hospoutcome = {
    2: 'Died in Hospital',
    3: 'Discharged home',
    4: 'Transferred to Chronic Care/Rehabilitation Hospital',
    5: 'Transferred to Other Hospital ICU',
    6: 'Transferred to Other Acute Care Hospital'
}

dict_icuoutcome = {
    2: 'Died in ICU',
    3: 'Survived ICU',
    5: 'Transferred to another ICU',
    6: 'Transferred to another hospital'
}

dict_indig = {
    1: 'Indigenous',
    0: 'Australian',
    -1:'Unknown'
}

dict_smokingstatus = {
    1: 'Current Smoker',
    2: 'Ex-Smoker',
    3: 'Never Smoked',
    4: 'Unknown'
}

dict_icuadmit = {
    1: 'OT/Recovery',
    2: 'Accident & Emergency',
    3: 'Ward',
    4: 'Other ICU, same Hospital',
    5: 'Other Hospital',
    6: 'Other Hospital ICU'}

Now we define a dictionary which maps from the GOSISS variable name (the key) to the ANZICS data (the value) - where the latter is either:
* a direct copy-paste of the data (in which case it the value is the string name of the column in the ANZICS data)
* a function of the data (usually involves calling the dictionary to map from coded values to the general form)

In [5]:
# encounter id is a concatenation of of:
#   - siteid, patientid, admepisode (incrementing integer for each ICU stay)
encFcn = lambda x: 'anzics_' + x['site_id'].astype(str) + '_' + \
(x['patientid']).astype(str) + '_' + (x['admepisode']).astype(str)

field_map = OrderedDict([
['data_source', 'data_source']
, ['encounter_id', encFcn]
, ['patient_id', 'patientid']
#, [None, 'countrycode']
, ['country', 'country']
, ['hospital_id', 'site_id']
, ['teaching_hospital', lambda x: x['cicmlevel'] == '3/PICU']
, ['hospital_bed_size', None]
, ['hospital_type', 'hospitalclassification']
#, [None, 'locationcode']
#, [None, 'jurisdictionid']
#, [None, 'jurisdictionname']
#, [None, 'publicprivateid']
#, [None, 'publicprivate']
#, [None, 'cicmlevel']
#, [None, 'cicmlevelid']
, ['icu_id', 'site_id'] # anzics doesn't identify individual ICUs
, ['icu_type', None]
, ['icu_stay_type', lambda x: x['caretype'].map(dict_caretype)]
, ['age', 'age']
, ['gender', 'sex']
, ['weight', 'weight']
, ['height', 'height']
, ['bmi', lambda x: x['weight'] / ( (x['height']/100) ** 2)]
, ['ethnicity', lambda x: x['indigenous'].map(dict_indig)]
#, [None, 'postcode']
, ['pregnant', lambda x: x['preg_stat'].map(dict_preg)]
, ['smoking_status', lambda x: x['smokingstatus'].map(dict_smokingstatus)]
#, [None, 'smokingintensity']
#, [None, 'icuadmissioncount']
#, [None, 'hospitaladmissioncount']
#, [None, 'readmissioncount']
#, [None, 'prior_icu_ad_dtm']
#, [None, 'prior_icu_ds_dtm']
#, [None, 'hosp_ad_dtm']
#, [None, 'hosp_ds_dtm']
, ['hospital_admit_source', lambda x: x['hosp_srce'].map(dict_hospadmit)]
, ['hospital_disch_location', lambda x: x['hosp_outcm'].map(dict_hospoutcome)]
, ['hospital_los_days', lambda x: x['hosp_hrs']/24.0]
, ['hospital_death', lambda x: x['died_hosp'].map(dict_hospdied)]
, ['icu_admit_source', lambda x: x['icu_srce'].map(dict_icuadmit)]
, ['icu_admit_type', None]
, ['icu_disch_location', lambda x: x['icu_outcm'].map(dict_icuoutcome)]
, ['pre_icu_los_days', lambda x: x['pre_icu_hrs']/24.0]
, ['icu_los_days', lambda x: x['icu_hrs']/24.0]
, ['icu_death', lambda x: x['died_icu'].map(dict_icudied)]
#, [None, 'admepisode']
#, [None, 'icuadmitfinyr']
#, [None, 'icuadmityyyymm']
#, [None, 'icuadmityyyy']
#, [None, 'icu_ad_dtm']
#, [None, 'icu_ds_dtm']
, ['elective_surgery', lambda x: x['elect'].map(dict_elect)]
, ['readmission_status', lambda x: x['readmitted'].map(dict_readmit)]
#, [None, 'icu_ds_dec_dtm']
#, [None, 'discharge_delay_hrs']
#, [None, 'readmission_lag_hrs']
#, [None, 'icu_outcm']
#, [None, 'emg_rsp_adm']
# === VITALS === #
, ['d1_heartrate_min', 'hrlo']
, ['d1_heartrate_max', 'hrhi']
, ['d1_resprate_min', 'rrlo']
, ['d1_resprate_max', 'rrhi']
, ['d1_spo2_min', None]
, ['d1_spo2_max', None]
, ['d1_temp_min', 'templo']
, ['d1_temp_max', 'temphi']
, ['d1_sysbp_invasive_min', None]
, ['d1_sysbp_invasive_max', None]
, ['d1_diasbp_invasive_min', None]
, ['d1_diasbp_invasive_max', None]
, ['d1_mbp_invasive_min', None]
, ['d1_mbp_invasive_max', None]
, ['d1_sysbp_noninvasive_min', None]
, ['d1_sysbp_noninvasive_max', None]
, ['d1_diasbp_noninvasive_min', None]
, ['d1_diasbp_noninvasive_max', None]
, ['d1_mbp_noninvasive_min', None]
, ['d1_mbp_noninvasive_max', None]
, ['d1_sysbp_min', 'systoliclo']
, ['d1_sysbp_max', 'systolichi']
, ['d1_diasbp_min', 'diastoliclo']
, ['d1_diasbp_max', 'diastolichi']
, ['d1_mbp_min', 'maplo']
, ['d1_mbp_max', 'maphi']
, ['d1_pasys_invasive_min', None]
, ['d1_pasys_invasive_max', None]
, ['d1_padias_invasive_min', None]
, ['d1_padias_invasive_max', None]
, ['d1_pamean_invasive_min', None]
, ['d1_pamean_invasive_max', None]
, ['h1_heartrate_min', None]
, ['h1_heartrate_max', None]
, ['h1_resprate_min', None]
, ['h1_resprate_max', None]
, ['h1_spo2_min', None]
, ['h1_spo2_max', None]
, ['h1_temp_min', None]
, ['h1_temp_max', None]
, ['h1_sysbp_invasive_min', None]
, ['h1_sysbp_invasive_max', None]
, ['h1_diasbp_invasive_min', None]
, ['h1_diasbp_invasive_max', None]
, ['h1_mbp_invasive_min', None]
, ['h1_mbp_invasive_max', None]
, ['h1_sysbp_noninvasive_min', None]
, ['h1_sysbp_noninvasive_max', None]
, ['h1_diasbp_noninvasive_min', None]
, ['h1_diasbp_noninvasive_max', None]
, ['h1_mbp_noninvasive_min', None]
, ['h1_mbp_noninvasive_max', None]
, ['h1_sysbp_min', None]
, ['h1_sysbp_max', None]
, ['h1_diasbp_min', None]
, ['h1_diasbp_max', None]
, ['h1_mbp_min', None]
, ['h1_mbp_max', None]
, ['h1_pasys_invasive_min', None]
, ['h1_pasys_invasive_max', None]
, ['h1_padias_invasive_min', None]
, ['h1_padias_invasive_max', None]
, ['h1_pamean_invasive_min', None]
, ['h1_pamean_invasive_max', None]
# === LABS/BLOOD GASES === #
, ['d1_albumin_min', None]
, ['d1_albumin_max', None]
, ['d1_bilirubin_min', None]
, ['d1_bilirubin_max', None]
, ['d1_bun_min', None]
, ['d1_bun_max', None]
, ['d1_calcium_min', None]
, ['d1_calcium_max', None]
, ['d1_creatinine_min', lambda x: x['creatlo']/88.42] # mmol/L -> mg/dL
, ['d1_creatinine_max', lambda x: x['creathi']/88.42] # mmol/L -> mg/dL
, ['d1_glucose_min', lambda x: x['gluclo']/18.018018] # mmol/L -> mg/dL
, ['d1_glucose_max', lambda x: x['gluchi']/18.018018] # mmol/L -> mg/dL
, ['d1_inr_min', None]
, ['d1_inr_max', None]
, ['d1_hco3_min', lambda x: x['hco3lo']] # mmol/L == mEq/L for bicarb
, ['d1_hco3_max', lambda x: x['hco3hi']] # mmol/L == mEq/L for bicarb
, ['d1_hematocrit_min', 'hctlo']
, ['d1_hematocrit_max', 'hcthi']
, ['d1_hemaglobin_min', 'hmgnlo'] # g/dL
, ['d1_hemaglobin_max', 'hmgnhi'] # g/dL
, ['d1_lactate_min', None]
, ['d1_lactate_max', None]
, ['d1_platelets_min', 'platlo']
, ['d1_platelets_max', 'plathi']
, ['d1_potassium_min', lambda x: x['klo']] # mmol/L == mEq/L
, ['d1_potassium_max', lambda x: x['khi']] # mmol/L == mEq/L
, ['d1_sodium_min', lambda x: x['nalo']] # mmol/L == mEq/L
, ['d1_sodium_max', lambda x: x['nahi']] # mmol/L == mEq/L
, ['d1_wbc_min', 'wcclo']
, ['d1_wbc_max', 'wcchi']
, ['d1_arterial_ph_min', None]
, ['d1_arterial_ph_max', None]
, ['d1_arterial_po2_min', None]
, ['d1_arterial_po2_max', None]
, ['d1_arterial_pco2_min', None]
, ['d1_arterial_pco2_max', None]
, ['d1_pao2fio2ratio_min', None]
, ['d1_pao2fio2ratio_max', None]
, ['h1_albumin_min', None]
, ['h1_albumin_max', None]
, ['h1_bilirubin_min', None]
, ['h1_bilirubin_max', None]
, ['h1_bun_min', None]
, ['h1_bun_max', None]
, ['h1_calcium_min', None]
, ['h1_calcium_max', None]
, ['h1_creatinine_min', None]
, ['h1_creatinine_max', None]
, ['h1_glucose_min', None]
, ['h1_glucose_max', None]
, ['h1_inr_min', None]
, ['h1_inr_max', None]
, ['h1_hco3_min', None]
, ['h1_hco3_max', None]
, ['h1_hematocrit_min', None]
, ['h1_hematocrit_max', None]
, ['h1_hemaglobin_min', None]
, ['h1_hemaglobin_max', None]
, ['h1_lactate_min', None]
, ['h1_lactate_max', None]
, ['h1_platelets_min', None]
, ['h1_platelets_max', None]
, ['h1_potassium_min', None]
, ['h1_potassium_max', None]
, ['h1_sodium_min', None]
, ['h1_sodium_max', None]
, ['h1_wbc_min', None]
, ['h1_wbc_max', None]
, ['h1_arterial_ph_min', None]
, ['h1_arterial_ph_max', None]
, ['h1_arterial_po2_min', None]
, ['h1_arterial_po2_max', None]
, ['h1_arterial_pco2_min', None]
, ['h1_arterial_pco2_max', None]
, ['h1_pao2fio2ratio_min', None]
, ['h1_pao2fio2ratio_max', None]
# === COMORBIDITIES === #
#, [None, 'cardarrest']
#, [None, 'resparrest']
#, [None, 'cabg_graft']
#, [None, 'cabg_redo']
#, [None, 'thromb_therapy']
#, [None, 'chr_resp']
#, [None, 'chr_liv']
#, [None, 'chr_ren'] # APACHE-II chronic renal failure == use of dialysis
#, [None, 'thrombpro']
#, [None, 'iddm']
#, [None, 'immundis']
#, [None, 'immunrx']
#, [None, 'chr_cvs']
#, [None, 'cirrhos']
#, [None, 'hepfail']
#, [None, 'lymphoma']
#, [None, 'leukaem']
#, [None, 'metast']
#, [None, 'immunsup']
#, [None, 'aids']
#, [None, 'ap2diag']
#, [None, 'ap3diag']
#, [None, 'ap3_subcode']

# === APACHE III VARIABLES === #
, ['albumin_apache', lambda x: x['albumin_anz']*0.1] # g/L -> g/dL
, ['bilirubin_apache', lambda x: x['bili_anz']/17.1] # micromol/L -> mg/dL
, ['creatinine_apache', lambda x: x['creat_anz']/88.42] # mmol/L -> mg/dL
, ['glucose_apache', lambda x: x['glucose_anz']/18.018018] # mmol/L -> mg/dL
, ['hematocrit_apache', 'hct_anz']
, ['heart_rate_apache', 'hr_anz']
, ['map_apache', 'map_anz']
, ['sodium_apache', lambda x: x['na_anz']] # mmol/L == mEq/L
# aps iii oxygenation blood gas
, ['fio2_apache', 'fio2_anz']
, ['paco2_apache', 'paco2_anz']
, ['pao2_apache', 'pao2_anz']
# aps iii acid-base components
, ['ph_apache', 'ph_anz']
, ['paco2_for_ph_apache', 'ap3co2p']
, ['resprate_apache', 'rr_anz']
, ['temp_apache', 'temp_anz']
, ['bun_apache', lambda x: x['urea_anz']/0.3571]  # mmol/L -> mg/dL
, ['urineoutput_apache', 'urineop']
, ['wbc_apache', 'wcc_anz']
, ['gcs_eyes_apache', 'gcseye']
, ['gcs_motor_apache', 'gcsmotor']
, ['gcs_verbal_apache', 'gcsverb']
, ['gcs_apache', 'gcs']
, ['gcs_unable_apache', None]
, ['arf_apache', lambda x: x['arf'].map(dict_arf)]
, ['intubated_apache', lambda x: x['intubated'].map(dict_intub)]
, ['ventilated_apache', lambda x: x['ventilated'].map(dict_vent)]
# === APACHE II VARIABLES === #
#, [None, 'temp_ap2']
#, [None, 'map_ap2']
#, [None, 'hr_ap2']
#, [None, 'rr_ap2']
#, [None, 'fio2_ap2']
#, [None, 'pao2_ap2']
#, [None, 'paco2_ap2']
#, [None, 'ph_ap2']
#, [None, 'hco3_ap2']
#, [None, 'na_ap2']
#, [None, 'k_ap2']
#, [None, 'creat_ap2']
#, [None, 'hct_ap2']
#, [None, 'wcc_ap2']
# === OUTPUT OF SCORING SYSTEMS === #
#, [None, 'APIIScore']
, ['apache_3j_score', 'apache3score']
, ['apache_3j_hospital_death_prob', 'apache3riskofdeath']
#, [None, 'apache3isincluded']
#, [None, 'apache3issmr']
, ['apache_4a_icu_death_prob', None]
, ['apache_4a_hospital_death_prob', None]
#, [None, 'anzrodriskofdeath']
#, [None, 'anzrodisincluded']
#, [None, 'anzrodissmr']
#, [None, 'agescore']
#, [None, 'albuminscore']
#, [None, 'bilirubinscore']
#, [None, 'creatininescore']
#, [None, 'glucosescore']
#, [None, 'haematocritscore']
#, [None, 'heartratescore']
#, [None, 'meanarterialpressurescore']
#, [None, 'sodiumscore']
#, [None, 'neurologicalscore']
#, [None, 'oxygenationscore']
#, [None, 'phscore']
#, [None, 'respiratoryratescore']
#, [None, 'temperaturescore']
#, [None, 'ureascore']
#, [None, 'urineoutputscore']
#, [None, 'whitecellcountscore']
#, [None, 'riskmodelversionid']
])

# Load in the header

In [6]:
hdr = pd.read_csv('../hdr/header.csv',header=None,sep=',')[0].values

In [7]:
df_new = pd.DataFrame()
for c in hdr:
    # did not find a mapping for the given variable
    if c not in field_map:
        print('WARNING: {} not found in field mapping for ANZICS data!'.format(c))
        df_new[c] = None
    # there is a mapping, but it indicates that we don't have any data
    elif field_map[c] is None:
        # plug in missing data into final dataframe
        print('WARNING: {} not available in ANZICS data!'.format(c))
        df_new[c] = None
    # there is a mapping, and the anzics definition matches the GOSISS definition
    elif type(field_map[c]) == str:
        # check the mapping refers to a column available in ANZICS data
        if field_map[c].lower() in df.columns.values:
            # data exists, copy it over
            df_new[c] = df[field_map[c]]
        else:
            print('WARNING: {} equivalent not found in ANZICS data! (Looked for "{}".)'.format(c, field_map[c].lower()))
    # there is a mapping, and it's a function (usually a dictionary)
    else:
        # call the mapping
        df_new[c] = field_map[c](df)



In [8]:
# add in the APS-III
if 'apsiii' in df_new.columns:
    if np.sum(df_new['apsiii'].isnull()) == df_new.shape[0]:
        # calculate the APS-III using the components
        print('TODO: calculate APS-III')
    else:
        print('APS-III already populated!')

# 3 - Output the data to a csv file

In [9]:
df_new.to_csv('anzics-gosiss-data.csv',index=False)