# Load data from SQL, parse it appropriately

This script loads the data from a MIMIC-III database and parses the data for concepts required for the GOSSIS project. The script outputs the `mimic-iii-gossis-data.csv` file for later use.

In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
import os

import sqlite3

## Connect to database

In [2]:
con = sqlite3.connect('SATIQ.sqlite')

In [3]:
df = pd.read_sql_query("""
select

'satiq' as data_source
, fva.tipodni || fva.dni || fva.fecing as encounter_id
, NULL as hospital_id
, fva.tipodni || fva.dni as patient_id
, fva.edad as age
, fp.imc as bmi
, 'argentina' as country
, case when fva.patologia = 3 then 1 else 0 end as elective_surgery
, NULL as ethnicity
, fva.sexo as gender
, case when p.altura = 0 then null else p.altura end as height
, null as hospital_admit_source
, null as hospital_bed_size
, case when fva.resultadoegresoh = 'Fallece' then 1 else 0 end as hospital_death
, fva.resultadoegresoh as hospital_disch_location
, fva.fecegrh - fva.fecingh as hospital_los_days
, NULL as hospital_type
, fva.PROCEDENCIA as icu_admit_source
, fva.PATOLOGIA as icu_admit_type
, case when fva.RESTRAT = 5 then 1 else 0 end as icu_death
, fva.RESTRAT as icu_disch_location
, NULL as icu_id
, fva.DIAS as icu_los_days
, NULL as icu_stay_type
, NULL as icu_type
, fva.FECHAING - FECINGH as pre_icu_los_days
, NULL as pregnant
, fva.REINGRESO as readmission_status
, NULL as smoking_status
, NULL as teaching_hospital
, p.PESO as weight
, NULL as albumin_apache
, fva.IRA as arf_apache
, NULL as bilirubin_apache
, fva.NA as bun_apache
, fva.CREATIN as creatinine_apache
, fva.VALFIO2 as fio2_apache
, NULL as gcs_eyes_apache
, NULL as gcs_motor_apache
, NULL as gcs_unable_apache
, NULL as gcs_verbal_apache
, NULL as glucose_apache
, fva.FRECCARD as heart_rate_apache
, fva.HEMATROC as hematocrit_apache
, NULL as intubated_apache
, fva.TENSART as map_apache
, fva.PCO2 as paco2_apache
, NULL as paco2_for_ph_apache
, fva.PAO2 as pao2_apache
, fva.PHART as ph_apache
, fva.FRECRESP as resprate_apache
, fva.K as sodium_apache
, fva.TEMPERATURA as temp_apache
, NULL as urineoutput_apache
, fva.FRECRESP = 0 as ventilated_apache
, fva.LEUCOC as wbc_apache
, NULL as d1_diasbp_invasive_max
, NULL as d1_diasbp_invasive_min
, NULL as d1_diasbp_max
, NULL as d1_diasbp_min
, NULL as d1_diasbp_noninvasive_max
, NULL as d1_diasbp_noninvasive_min
, NULL as d1_heartrate_max
, NULL as d1_heartrate_min
, NULL as d1_mbp_invasive_max
, NULL as d1_mbp_invasive_min
, NULL as d1_mbp_max
, NULL as d1_mbp_min
, NULL as d1_mbp_noninvasive_max
, NULL as d1_mbp_noninvasive_min
, NULL as d1_padias_invasive_max
, NULL as d1_padias_invasive_min
, NULL as d1_pamean_invasive_max
, NULL as d1_pamean_invasive_min
, NULL as d1_pasys_invasive_max
, NULL as d1_pasys_invasive_min
, NULL as d1_resprate_max
, NULL as d1_resprate_min
, NULL as d1_spo2_max
, NULL as d1_spo2_min
, NULL as d1_sysbp_invasive_max
, NULL as d1_sysbp_invasive_min
, NULL as d1_sysbp_max
, NULL as d1_sysbp_min
, NULL as d1_sysbp_noninvasive_max
, NULL as d1_sysbp_noninvasive_min
, NULL as d1_temp_max
, NULL as d1_temp_min
, NULL as h1_diasbp_invasive_max
, NULL as h1_diasbp_invasive_min
, NULL as h1_diasbp_max
, NULL as h1_diasbp_min
, NULL as h1_diasbp_noninvasive_max
, NULL as h1_diasbp_noninvasive_min
, NULL as h1_heartrate_max
, NULL as h1_heartrate_min
, NULL as h1_mbp_invasive_max
, NULL as h1_mbp_invasive_min
, NULL as h1_mbp_max
, NULL as h1_mbp_min
, NULL as h1_mbp_noninvasive_max
, NULL as h1_mbp_noninvasive_min
, NULL as h1_padias_invasive_max
, NULL as h1_padias_invasive_min
, NULL as h1_pamean_invasive_max
, NULL as h1_pamean_invasive_min
, NULL as h1_pasys_invasive_max
, NULL as h1_pasys_invasive_min
, NULL as h1_resprate_max
, NULL as h1_resprate_min
, NULL as h1_spo2_max
, NULL as h1_spo2_min
, NULL as h1_sysbp_invasive_max
, NULL as h1_sysbp_invasive_min
, NULL as h1_sysbp_max
, NULL as h1_sysbp_min
, NULL as h1_sysbp_noninvasive_max
, NULL as h1_sysbp_noninvasive_min
, NULL as h1_temp_max
, NULL as h1_temp_min
, NULL as d1_albumin_max
, NULL as d1_albumin_min
, NULL as d1_bilirubin_max
, NULL as d1_bilirubin_min
, NULL as d1_bun_max
, NULL as d1_bun_min
, NULL as d1_calcium_max
, NULL as d1_calcium_min
, NULL as d1_creatinine_max
, NULL as d1_creatinine_min
, NULL as d1_glucose_max
, NULL as d1_glucose_min
, NULL as d1_hco3_max
, NULL as d1_hco3_min
, NULL as d1_hemaglobin_max
, NULL as d1_hemaglobin_min
, NULL as d1_hematocrit_max
, NULL as d1_hematocrit_min
, NULL as d1_inr_max
, NULL as d1_inr_min
, NULL as d1_lactate_max
, NULL as d1_lactate_min
, NULL as d1_platelets_max
, NULL as d1_platelets_min
, NULL as d1_potassium_max
, NULL as d1_potassium_min
, NULL as d1_sodium_max
, NULL as d1_sodium_min
, NULL as d1_wbc_max
, NULL as d1_wbc_min
, NULL as h1_albumin_max
, NULL as h1_albumin_min
, NULL as h1_bilirubin_max
, NULL as h1_bilirubin_min
, NULL as h1_bun_max
, NULL as h1_bun_min
, NULL as h1_calcium_max
, NULL as h1_calcium_min
, NULL as h1_creatinine_max
, NULL as h1_creatinine_min
, NULL as h1_glucose_max
, NULL as h1_glucose_min
, NULL as h1_hco3_max
, NULL as h1_hco3_min
, NULL as h1_hemaglobin_max
, NULL as h1_hemaglobin_min
, NULL as h1_hematocrit_max
, NULL as h1_hematocrit_min
, NULL as h1_inr_max
, NULL as h1_inr_min
, NULL as h1_lactate_max
, NULL as h1_lactate_min
, NULL as h1_platelets_max
, NULL as h1_platelets_min
, NULL as h1_potassium_max
, NULL as h1_potassium_min
, NULL as h1_sodium_max
, NULL as h1_sodium_min
, NULL as h1_wbc_max
, NULL as h1_wbc_min
, NULL as d1_arterial_pco2_max
, NULL as d1_arterial_pco2_min
, NULL as d1_arterial_ph_max
, NULL as d1_arterial_ph_min
, NULL as d1_arterial_po2_max
, NULL as d1_arterial_po2_min
, NULL as d1_pao2fio2ratio_max
, NULL as d1_pao2fio2ratio_min
, NULL as h1_arterial_pco2_max
, NULL as h1_arterial_pco2_min
, NULL as h1_arterial_ph_max
, NULL as h1_arterial_ph_min
, NULL as h1_arterial_po2_max
, NULL as h1_arterial_po2_min
, NULL as h1_pao2fio2ratio_max
, NULL as h1_pao2fio2ratio_min
, NULL as apache_3j_hospital_death_prob
, NULL as apache_3j_score
, NULL as apache_4a_hospital_death_prob
, NULL as apache_4a_icu_death_prob
from fivarapa fva
left join fipeso fp
on fva.tipodni = fp.tipodni
and fva.dni = fp.dni
and fva.fecing = fp.fecing
left join pacientes p
on fva.tipodni = p.dnitipo
and fva.dni = p.dni
left join FiMotingD fmd
on fva.tipodni = fp.tipodni
and fva.dni = fp.dni
and fva.fecing = fp.fecing
""", con)

print('Loaded data for {} patients and {} features.'.format(df.shape[0],df.shape[1]-2))

Loaded data for 579 patients and 198 features.


## Load in the header

In [4]:
hdr = pd.read_csv('../hdr/header.csv',header=None,sep=',')[0].values

Map the data into a consistent header which is used for all databases. Warn if data is not found in the current dataset.

In [15]:
def encode_text(x):
    if x:
        return x.encode('utf-8')
    else:
        return x
    

df_new = pd.DataFrame()
for c in hdr:
    # did not find a mapping for the given variable
    if c not in df.columns:
        print('WARNING: {} not found in SATIQ data!'.format(c))
        df_new[c] = None
    else:
        # call the mapping
        if df.dtypes[c] == 'O':
            df_new[c] = df[c].map(encode_text)
        else:
            df_new[c] = df[c]



# 3 - Output the data to a csv file

In [16]:
df_new.to_csv('satiq-gossis-data.csv',index=False)

In [17]:
con.close()