# Load data from CSV, parse it appropriately

This script loads the data from a CSV file and parses the data for concepts required for the GOSSIS project. The script outputs the `orchestra-gossis-data.csv` file for later use.

Requirements:

* `orchestra_study_all_centers_excl_readm_mimic_project.csv`

In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import math

from collections import OrderedDict

%matplotlib inline

# Load in the data

In [2]:
# The study data were exported from the original Excel workbook to CSV
# (via LibreOffice) because reading the workbook directly was slow:
#   pd.read_excel('orchestra_study_all_centers_excl_readm_mimic_project.xlsx',
#                 sheetname=0, header=0)
df = pd.read_csv(
    'orchestra_study_all_centers_excl_readm_mimic_project.csv',
    sep=',',
    header=0,
)

# Use lower-case column names throughout
df.columns = df.columns.str.lower()

# Record the originating database as a constant column
df['data_source'] = 'orchestra'

  interactivity=interactivity, compiler=compiler, result=result)


## Create the mapping from ORCHESTRA variables to GOSSIS variables

First we define functions/dictionaries necessary for mapping any coded data into a general format.

In [3]:
# dictionaries
# Hospital discharge destination: (ORCHESTRA label, GOSSIS category) pairs.
# The labels are kept exactly as written here (including 'Unkown'),
# presumably to match the raw values in the source data.
_HOSP_DESTINATION_PAIRS = (
    ('Death', 'Died in Hospital'),
    ('Home-care', 'Discharged home'),
    ('Residence', 'Transferred to Chronic Care/Rehabilitation Hospital'),
    ('Hospice / Casa de apoio', 'Transferred to Other Hospital ICU'),
    ('Other hospital', 'Transferred to Other Acute Care Hospital'),
    ('Other / Unkown', 'Transferred to Other Acute Care Hospital'),
)
dict_hosp_destination = dict(_HOSP_DESTINATION_PAIRS)

# ICU (unit) discharge destination: ORCHESTRA label -> GOSSIS category.
# Several labels collapse onto the same category, so the repeated targets
# are named once.
_SURVIVED_ICU = 'Survived ICU'
_TO_ANOTHER_ICU = 'Transferred to another ICU'
dict_icu_destination = {
    'WardFloor': _SURVIVED_ICU,
    'OtherICUSameHosp': _TO_ANOTHER_ICU,
    'Death': 'Died in ICU',
    'Residence': _SURVIVED_ICU,
    'IntermediateUnit': _TO_ANOTHER_ICU,
    'Other hospital': 'Transferred to another hospital',
    'Home-care': _SURVIVED_ICU,
    'Other / Unkown': _SURVIVED_ICU,
    'Hospice / Casa de apoio': _SURVIVED_ICU,
}

# ICU admission source: ORCHESTRA label -> GOSSIS category.
# A target of None marks a label that is passed through unchanged.
dict_icuadmit = {
    source: (source if target is None else target)
    for source, target in (
        ('Emergency', 'Accident & Emergency'),
        ('Operating room', 'OT/Recovery'),
        ('Ward / Floor', 'Ward'),
        ('Transfer from other hospital', 'Other Hospital'),
        ('Intervention room', None),
        ('Other ICU at your hosp', 'Other ICU, same Hospital'),
        ('Intermediate care unit', None),
        ('Other unkown', None),
        ('Home-care', None),
    )
}

# Admission type -> elective-surgery flag: only 'Scheduled Surgery' maps to 1.
dict_elect = {
    admission_type: int(admission_type == 'Scheduled Surgery')
    for admission_type in ('Medical', 'Scheduled Surgery', 'Urgent Surgery')
}

# Gender codes: 'M' and 'F' are kept as-is; blank, 'I' and NaN become None.
dict_gender = {code: code for code in ('M', 'F')}
dict_gender.update(dict.fromkeys((' ', 'I', float('nan'))))

def bedCollapse(x):
    """Collapse a numeric hospital bed count into a bed-size category.

    Parameters
    ----------
    x : number or None
        Number of hospital beds.

    Returns
    -------
    str or None
        One of '<100', '100 - 249', '250 - 499' or '>= 500'. None is
        returned when the count is missing (None or NaN) or negative.
    """
    # `x != x` is only true for NaN, which must not reach the comparisons
    # below (every comparison with NaN is False).
    if x is None or x != x or x < 0:
        return None
    if x < 100:
        return '<100'
    if x < 250:
        return '100 - 249'
    # The upper bound is 500 (exclusive) so that exactly 499 beds falls in
    # '250 - 499' and exactly 500 beds falls in '>= 500'.
    if x < 500:
        return '250 - 499'
    return '>= 500'

Now we define a dictionary which maps from the GOSSIS variable name (the key) to the ORCHESTRA data (the value) - where the latter is either:
* a direct copy-paste of the data (in which case the value is the string name of the column in the ORCHESTRA data)
* a function of the data (usually involves calling the dictionary to map from coded values to the general form)

In [4]:
def encFcn(x):
    """Build the encounter id for every row of the ORCHESTRA frame `x`.

    The id is the literal prefix 'orchestra' followed by the string forms of
    the hosp_code, medicalrecord and admissionrecordid columns, all joined
    with underscores.
    """
    hosp_code, medical_record, admission_record = (
        x[column].astype(str)
        for column in ('hosp_code', 'medicalrecord', 'admissionrecordid')
    )
    return 'orchestra_' + hosp_code + '_' + medical_record + '_' + admission_record

# Ordered mapping: GOSSIS variable name -> ORCHESTRA source, where the source
# is one of
#   * a string: name of the ORCHESTRA column to copy as-is
#   * None: no equivalent is available in the ORCHESTRA data
#   * a callable: receives the full ORCHESTRA frame and returns the values
# Commented-out entries list ORCHESTRA columns that currently have no GOSSIS
# target.
field_map = OrderedDict([
    ('data_source', 'data_source'),
    ('encounter_id', encFcn),
    ('patient_id', 'medicalrecord'),
    ('country', lambda x: 'brazil'),
    ('hospital_id', 'hosp_code'),
    # (None, 'hosptype'),  # public or private
    # (None, 'trainprogrcritcare'),  # TODO: ???
    ('teaching_hospital', None),
    ('hospital_bed_size', lambda x: x['hospbeds'].map(bedCollapse)),
    ('hospital_bed_size_numeric', 'hospbeds'),
    ('hospital_type', None),
    ('icu_id', 'icu_code'),
    ('icu_type', 'icu_type'),  # TODO: dictionary, this is coded
    # (None, 'icubeds'),
    ('icu_stay_type', None),
    ('age', 'age'),
    ('gender', lambda x: x['gender'].map(dict_gender)),
    ('weight', 'weight'),
    ('height', 'height'),
    ('bmi', 'bmi'),
    ('ethnicity', None),
    ('pregnant', None),
    ('smoking_status', None),
    ('hospital_admit_source', None),
    ('hospital_disch_location',
     lambda x: x['hospitaldestinationname'].map(dict_hosp_destination)),
    ('hospital_los_days', 'hospitallengthstay'),
    ('hospital_death', 'hospdeath'),
    ('icu_admit_source', lambda x: x['admissionsourcename'].map(dict_icuadmit)),
    ('icu_admit_type', None),
    ('icu_disch_location',
     lambda x: x['unitdestinationname'].map(dict_icu_destination)),
    ('pre_icu_los_days', 'lengthhospitalstaypriorunitadmission'),
    ('icu_los_days', 'unitlengthstay'),
    ('icu_death', 'icudeath'),
    # (None, 'unitdischargename'),
    # (None, 'admissionreasonname'),
    # (None, 'admissionmaindiagnosisname'),
    ('elective_surgery', lambda x: x['admissiontypename'].map(dict_elect)),
    ('readmission_status', lambda x: 0),  # orchestra study has no readmissions
    # (None, 'month year'),
    # (None, 'infection at admission'),
    # (None, 'readmission_lag_hrs'),

    # === VITALS === #
    ('h1_heartrate_min', None),
    ('h1_heartrate_max', 'highest heart rate1h'),
    ('h1_resprate_min', None),
    ('h1_resprate_max', 'highest respiratory rate1h'),
    ('h1_spo2_min', None),
    ('h1_spo2_max', None),
    ('h1_temp_min', None),
    ('h1_temp_max', 'highest temperature1h'),
    ('h1_sysbp_invasive_min', None),
    ('h1_sysbp_invasive_max', None),
    ('h1_diasbp_invasive_min', None),
    ('h1_diasbp_invasive_max', None),
    ('h1_mbp_invasive_min', None),
    ('h1_mbp_invasive_max', None),
    ('h1_sysbp_noninvasive_min', None),
    ('h1_sysbp_noninvasive_max', None),
    ('h1_diasbp_noninvasive_min', None),
    ('h1_diasbp_noninvasive_max', None),
    ('h1_mbp_noninvasive_min', None),
    ('h1_mbp_noninvasive_max', None),
    ('h1_sysbp_min', 'lowest systolic blood pressure1h'),
    ('h1_sysbp_max', None),
    ('h1_diasbp_min', 'lowest diastolic blood pressure1h'),
    ('h1_diasbp_max', None),
    ('h1_mbp_min', 'lowest mean arterial pressure1h'),
    ('h1_mbp_max', None),
    ('h1_pasys_invasive_min', None),
    ('h1_pasys_invasive_max', None),
    ('h1_padias_invasive_min', None),
    ('h1_padias_invasive_max', None),
    ('h1_pamean_invasive_min', None),
    ('h1_pamean_invasive_max', None),
    # (None, 'lowest glasgow coma scale1h'),

    # === LABS/BLOOD GASES === #
    ('h1_albumin_min', None),
    ('h1_albumin_max', None),
    ('h1_bilirubin_min', None),
    ('h1_bilirubin_max', 'highest bilirubin1h'),
    ('h1_bun_min', None),
    ('h1_bun_max', None),
    ('h1_calcium_min', None),
    ('h1_calcium_max', None),
    ('h1_creatinine_min', None),
    ('h1_creatinine_max', 'highest creatinine1h'),
    ('h1_glucose_min', None),
    ('h1_glucose_max', None),
    ('h1_inr_min', None),
    ('h1_inr_max', None),
    ('h1_hco3_min', None),
    ('h1_hco3_max', None),
    ('h1_hematocrit_min', None),
    ('h1_hematocrit_max', None),
    ('h1_hemaglobin_min', None),
    ('h1_hemaglobin_max', None),
    ('h1_lactate_min', None),
    ('h1_lactate_max', 'highest arterial lactate1h'),
    ('h1_platelets_min', 'lowest platelets count1h'),
    ('h1_platelets_max', None),
    ('h1_potassium_min', None),
    ('h1_potassium_max', None),
    ('h1_sodium_min', None),
    ('h1_sodium_max', None),
    ('h1_wbc_min', None),
    ('h1_wbc_max', 'highest leukocyte count1h'),
    ('h1_arterial_ph_min', 'lowest ph1h'),
    ('h1_arterial_ph_max', 'highest ph1h'),
    ('h1_arterial_po2_min', 'lowest pa o21h'),
    ('h1_arterial_po2_max', 'highest pa o21h'),
    ('h1_arterial_pco2_min', 'lowest pa co21h'),
    ('h1_arterial_pco2_max', 'highest pa co21h'),
    # (None, 'highest fi o21h'),
    # (None, 'lowest fi o21h'),
    ('h1_pao2fio2ratio_min', 'lowest pa o2fi o21h'),
    ('h1_pao2fio2ratio_max', 'highest pa o2fi o21h'),

    # === COMORBIDITIES APACHE OR OTHERWISE === #
    # (None, 'chfnyhaclass23'),
    # (None, 'chfnyhaclass4'),
    # (None, 'crfnodialysis'),
    # (None, 'crfdialysis'),
    # (None, 'cirrhosischildab'),
    # (None, 'cirrhosischildc'),
    # (None, 'hepaticfailure'),
    # (None, 'solidtumorlocoregional'),
    # (None, 'solidtumormetastatic'),
    # (None, 'hematologicalmalignancy'),
    # (None, 'immunossupression'),
    # (None, 'severecopd'),
    # (None, 'steroidsuse'),
    # (None, 'aids'),
    # (None, 'arterialhypertension'),
    # (None, 'asthma'),
    # (None, 'diabetesuncomplicated'),
    # (None, 'diabetescomplicated'),
    # (None, 'angina'),
    # (None, 'previousmi'),
    # (None, 'cardiacarrhythmia'),
    # (None, 'dvt'),
    # (None, 'peripheralarterydisease'),
    # (None, 'chronicatrialfibrilation'),
    # (None, 'rheumaticdisease'),
    # (None, 'strokesequelae'),
    # (None, 'strokenosequelae'),
    # (None, 'dementia'),
    # (None, 'tobaccoconsumption'),
    # (None, 'alcoholism'),
    # (None, 'psychiatricdisease'),
    # (None, 'morbidobesity'),
    # (None, 'malnourishment'),
    # (None, 'pepticdisease'),
    # (None, 'chronichealthstatuscode(ps)'),

    # === TREATMENTS IN FIRST HOUR === #
    # (None, 'mechvent_1h'),
    # (None, 'nivent_1h'),
    # (None, 'vaspressor_1h'),
    # (None, 'card_arrhythmias_1h'),
    # (None, 'cardiopulmonaryarrest_1h'),
    # (None, 'asystole_1h'),
    # (None, 'pulselesselectricalactivity_1h'),
    # (None, 'vent_tachyc_1h'),
    # (None, 'gibleeding_1h'),
    # (None, 'intracranialmasseffect_1h'),
    # (None, 'neutropenia_1h'),
    # (None, 'rrt_1h'),

    # === TREATMENTS IN FIRST DAY === #
    ('ventilated_apache', 'mechvent_24h'),
    # (None, 'nivent_24h'),
    # (None, 'vaspressor_24h'),
    # (None, 'card_arrhythmias_24h'),
    # (None, 'cardiopulmonaryarrest_24h'),
    # (None, 'asystole_24h'),
    # (None, 'pulselesselectricalactivity_24h'),
    # (None, 'vent_tachyc_24h'),
    # (None, 'gibleeding_24h'),
    # (None, 'intracranialmasseffect_24h'),
    # (None, 'neutropenia_24h'),
    ('dialysis_apache', 'rrt_24h'),

    # === OUTPUT OF SCORING SYSTEMS === #
    ('apache_3j_score', None),
    ('apache_3j_hospital_death_prob', None),
    ('apache_4a_icu_death_prob', None),
    ('apache_4a_hospital_death_prob', None),
    # (None, 'charlson'),
    # (None, 'saps3points'),
    # (None, 'saps3deathprobabilitystandardequation'),
    # (None, 'saps3deathprobabilitycustomizedequation'),
    # (None, 'sofapt'),
])

# Load in the header

In [5]:
# Read the header file without a header row and keep its first column as an
# array of output column names.
header_table = pd.read_csv('../hdr/header.csv', sep=',', header=None)
hdr = header_table[0].values

In [8]:
# Build the output frame with one column per entry of `hdr`, in header order.
# The frame is created on the source index so that scalar results (None, or a
# constant returned by a mapping function) are broadcast to every row even
# when they are the first column assigned.
df_new = pd.DataFrame(index=df.index)
available_columns = set(df.columns)

for c in hdr:
    mapping = field_map.get(c)

    # did not find a mapping for the given variable
    if c not in field_map:
        print('WARNING: {} not found in field mapping for ORCHESTRA data!'.format(c))
        df_new[c] = None
    # there is a mapping, but it indicates that we don't have any data
    elif mapping is None:
        # plug in missing data into final dataframe
        df_new[c] = None
    # there is a mapping, and it names an ORCHESTRA column to copy as-is
    elif isinstance(mapping, str):
        # df's column names were lower-cased on load, so compare and index
        # with the lower-cased name
        source_column = mapping.lower()
        if source_column in available_columns:
            # data exists, copy it over
            df_new[c] = df[source_column]
        else:
            print('WARNING: {} equivalent not found in ORCHESTRA data! (Looked for "{}".)'.format(c, source_column))
            # keep the column (empty) so the output still matches the header
            df_new[c] = None
    # there is a mapping, and it's a function (usually a dictionary lookup)
    else:
        # call the mapping on the full ORCHESTRA frame
        df_new[c] = mapping(df)



# 3 - Output the data to a csv file

In [9]:
# Write the mapped table to disk; the row index is not written.
output_path = 'orchestra-gossis-data.csv'
df_new.to_csv(output_path, index=False)