In [225]:
import numpy as np
import pandas as pd
import h5py
import scipy.sparse as sps
from datetime import timedelta
# Notebook-wide display settings: show every column of wide frames and
# render floats with exactly two decimal places.
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda value: "%.2f" % value)


In [226]:
# Path (relative to the notebook) of the HDF5 store read by the cells below.
filename = "../data/all_hourly_data.h5"

In [227]:
# Open the store read-only just long enough to list its top-level keys.
with h5py.File(filename, mode="r") as store:
    print(f"Keys: {store.keys()}")


Keys: <KeysViewHDF5 ['codes', 'interventions', 'patients', 'vitals_labs', 'vitals_labs_mean']>


In [228]:
patients = pd.read_hdf(filename, key="patients")

# One-hot encode the categorical admission attributes and attach them to the table.
categorical_cols = ["gender", "admission_type", "first_careunit"]
one_hot = pd.get_dummies(patients[categorical_cols])
patients = patients.join(one_hot)

# Hospital length of stay as a timedelta; los_icu is in days, so convert it to hours.
patients["los"] = patients["dischtime"] - patients["admittime"]
los_icu_hours = patients["los_icu"] * 24

# Keep a patient when either the hospital stay or the ICU stay lies strictly
# between 6h and 600h.
hospital_stay_ok = (patients["los"] > timedelta(hours=6)) & (patients["los"] < timedelta(hours=600))
icu_stay_ok = (los_icu_hours > 6) & (los_icu_hours < 600)
patients = patients[hospital_stay_ok | icu_stay_ok]

# Keep only patients whose age is strictly greater than 18.
patients = patients[patients["age"] > 18]

# Static feature set. One dummy per category is left out on purpose (common
# practice); the omitted ones are gender_M, admission_type_ELECTIVE and
# first_careunit_CCU.
feature_cols = [
    "age",
    "gender_F",
    "admission_type_EMERGENCY",
    "admission_type_URGENT",
    "first_careunit_CSRU",
    "first_careunit_MICU",
    "first_careunit_SICU",
    "first_careunit_TSICU",
]
patients = patients[feature_cols]
print(patients.shape)
display(patients.head())


(37543, 8)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,gender_F,admission_type_EMERGENCY,admission_type_URGENT,first_careunit_CSRU,first_careunit_MICU,first_careunit_SICU,first_careunit_TSICU
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,145834,211552,76.53,0,1,0,0,1,0,0
4,185777,294638,47.85,1,1,0,0,1,0,0
6,107064,228232,65.94,1,0,0,0,0,1,0
9,150750,220597,41.79,0,1,0,0,1,0,0
11,194540,229441,50.15,1,1,0,0,0,1,0


In [229]:
# Load the hourly intervention flags and collapse the two vasopressor-related
# columns into a single 0/1 indicator that is 1 when either flag is set.
interventions = pd.read_hdf(filename, key="interventions")
either_flag_set = interventions[["vaso", "vasopressin"]].any(axis=1)
interventions = interventions.assign(vasopressor=either_flag_set.astype(int))[["vasopressor"]]
print(interventions.shape)
display(interventions.head())

(3183638, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,vasopressor
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_1
3,145834,211552,0,0
3,145834,211552,1,1
3,145834,211552,2,1
3,145834,211552,3,1
3,145834,211552,4,1


In [230]:
# exclude patients that get vasopressor in the first 6h
excluded_subject_ids = interventions[interventions.index.get_level_values("hours_in").isin(range(6)) & interventions["vasopressor"] == 1]
excluded_subject_ids = excluded_subject_ids.index.get_level_values("subject_id").unique()
excluded_subject_ids.shape

(8406,)

In [231]:
# Outcome per stay: 1 if the vasopressor flag is set in any recorded hour of
# the stay (maximum of the 0/1 indicator), otherwise 0.
stay_keys = ["subject_id", "hadm_id", "icustay_id"]
outcome = interventions.groupby(stay_keys)["vasopressor"].max().to_frame()
print(outcome.shape)
display(outcome.head())


(37543, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,vasopressor
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1
3,145834,211552,1
4,185777,294638,0
6,107064,228232,0
9,150750,220597,1
11,194540,229441,0


In [232]:
vitals_labs_mean = pd.read_hdf(filename, key="vitals_labs_mean")

# Paper name -> df name
# map (mean arterial pressure) -> pulmonary artery pressure mean
# spontaneousrr (spontaneous respiratory rate) -> respiratory rate
# urine (urine output) -> not found! :/
# alt (alanine transaminase) -> alanine aminotransferase
# ast (aspartate aminotransferase) -> asparate aminotransferase
# inr (international normalised ratio) -> prothrombin time inr
#
# NOTE(review): pulmonary artery pressure is not the same measurement as
# systemic mean arterial pressure; check whether the table offers a closer
# match (e.g. a "mean blood pressure" column) -- TODO confirm.
# NOTE(review): `isin` silently ignores names that are absent from the table,
# so "urine output" below selects nothing (28 names, 27 columns selected).
selected_features = [
    "diastolic blood pressure",
    "fraction inspired oxygen",
    "glascow coma scale total",
    "heart rate",
    "pulmonary artery pressure mean",
    "systolic blood pressure",
    "respiratory rate",
    "oxygen saturation",
    "temperature",
    "urine output",
    "blood urea nitrogen",
    "magnesium",
    "platelets",
    "sodium",
    "alanine aminotransferase",
    "hematocrit",
    "partial pressure of oxygen",
    "asparate aminotransferase",
    "potassium",
    "white blood cell count",
    "bicarbonate",
    "creatinine",
    "lactate",
    "partial pressure of carbon dioxide",
    "glucose",
    "prothrombin time inr",
    "hemoglobin",
    "bilirubin",
]

# Boolean mask over the columns, matched on the "LEVEL2" level of the column index.
mask = vitals_labs_mean.columns.get_level_values("LEVEL2").isin(selected_features)

vitals_labs_mean = vitals_labs_mean.loc[:, mask]
print(vitals_labs_mean.shape)
display(vitals_labs_mean.head())


(3183638, 27)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,asparate aminotransferase,bicarbonate,bilirubin,blood urea nitrogen,creatinine,diastolic blood pressure,fraction inspired oxygen,glascow coma scale total,glucose,heart rate,hematocrit,hemoglobin,lactate,magnesium,oxygen saturation,partial pressure of carbon dioxide,partial pressure of oxygen,platelets,potassium,prothrombin time inr,pulmonary artery pressure mean,respiratory rate,sodium,systolic blood pressure,temperature,white blood cell count
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2
3,145834,211552,0,25.0,69.0,16.33,0.8,44.2,2.6,39.67,,,198.56,143.0,27.03,8.72,7.03,2.92,74.0,32.0,224.67,233.6,4.16,1.62,,16.0,142.67,95.17,,14.84
3,145834,211552,1,,,,,,,44.12,,,,153.75,,,,,,,,,,,,15.5,,81.0,,
3,145834,211552,2,,,,,,,47.33,,,267.0,137.75,30.0,10.1,5.0,,98.5,33.0,287.0,,3.9,,,7.0,139.0,90.67,,
3,145834,211552,3,,,,,,,64.5,,,,129.25,,,,,99.75,,,,,,14.0,5.25,,117.0,,
3,145834,211552,4,,,,,,,63.0,,,,146.0,,,,,98.0,,,,,,30.0,13.67,,102.0,,


In [239]:
# expect (N x V x T)
#
# Build a dense tensor of the first T hours of every stay:
#   N = number of stays, V = number of selected variables, T = number of hours.
# The hourly frame holds one row per (stay, hour) for the whole stay, so it
# cannot be reshaped directly: it must first be cut down to hours 0..T-1, and
# the row-major layout of the frame is (stay, hour, variable), not
# (stay, variable, hour).
T = 6

# use first 6 hours
hours_in = vitals_labs_mean.index.get_level_values("hours_in")
first_hours = vitals_labs_mean[hours_in < T]

# Number each stay 0..N-1 in order of first appearance in the frame.
stay_codes, stays = first_hours.index.droplevel("hours_in").factorize()
hour_codes = first_hours.index.get_level_values("hours_in").to_numpy()

N = len(stays)
V = first_hours.shape[1]

# Scatter the rows into an (N, T, V) array; hours a stay does not have (stays
# shorter than T hours) remain NaN. Then move the hour axis last -> (N, V, T).
vitals_labs_mean1 = np.full((N, T, V), np.nan)
vitals_labs_mean1[stay_codes, hour_codes, :] = first_hours.to_numpy()
vitals_labs_mean1 = vitals_labs_mean1.transpose(0, 2, 1)

assert vitals_labs_mean1.shape == (N, V, T)

29137
27


In [233]:
# Observation mask: True where a value is present for that variable and hour,
# False where it is missing.
indicators = vitals_labs_mean.notna()
print(indicators.shape)
display(indicators.head())


(3183638, 27)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,asparate aminotransferase,bicarbonate,bilirubin,blood urea nitrogen,creatinine,diastolic blood pressure,fraction inspired oxygen,glascow coma scale total,glucose,heart rate,hematocrit,hemoglobin,lactate,magnesium,oxygen saturation,partial pressure of carbon dioxide,partial pressure of oxygen,platelets,potassium,prothrombin time inr,pulmonary artery pressure mean,respiratory rate,sodium,systolic blood pressure,temperature,white blood cell count
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2
3,145834,211552,0,True,True,True,True,True,True,True,False,False,True,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True,False,True
3,145834,211552,1,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False
3,145834,211552,2,False,False,False,False,False,False,True,False,False,True,True,True,True,True,False,True,True,True,False,True,False,False,True,True,True,False,False
3,145834,211552,3,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,False,False
3,145834,211552,4,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,False,False


In [234]:
# Replace missing measurements with 0.
# Rebind the name instead of using inplace=True: the frame comes from a column
# selection of the loaded table, and in-place modification of such a derived
# frame can trigger chained-assignment warnings.
vitals_labs_mean = vitals_labs_mean.fillna(0)
print(vitals_labs_mean.shape)
display(vitals_labs_mean.head())


(3183638, 27)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,asparate aminotransferase,bicarbonate,bilirubin,blood urea nitrogen,creatinine,diastolic blood pressure,fraction inspired oxygen,glascow coma scale total,glucose,heart rate,hematocrit,hemoglobin,lactate,magnesium,oxygen saturation,partial pressure of carbon dioxide,partial pressure of oxygen,platelets,potassium,prothrombin time inr,pulmonary artery pressure mean,respiratory rate,sodium,systolic blood pressure,temperature,white blood cell count
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2
3,145834,211552,0,25.0,69.0,16.33,0.8,44.2,2.6,39.67,0.0,0.0,198.56,143.0,27.03,8.72,7.03,2.92,74.0,32.0,224.67,233.6,4.16,1.62,0.0,16.0,142.67,95.17,0.0,14.84
3,145834,211552,1,0.0,0.0,0.0,0.0,0.0,0.0,44.12,0.0,0.0,0.0,153.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.5,0.0,81.0,0.0,0.0
3,145834,211552,2,0.0,0.0,0.0,0.0,0.0,0.0,47.33,0.0,0.0,267.0,137.75,30.0,10.1,5.0,0.0,98.5,33.0,287.0,0.0,3.9,0.0,0.0,7.0,139.0,90.67,0.0,0.0
3,145834,211552,3,0.0,0.0,0.0,0.0,0.0,0.0,64.5,0.0,0.0,0.0,129.25,0.0,0.0,0.0,0.0,99.75,0.0,0.0,0.0,0.0,0.0,14.0,5.25,0.0,117.0,0.0,0.0
3,145834,211552,4,0.0,0.0,0.0,0.0,0.0,0.0,63.0,0.0,0.0,0.0,146.0,0.0,0.0,0.0,0.0,98.0,0.0,0.0,0.0,0.0,0.0,30.0,13.67,0.0,102.0,0.0,0.0


In [235]:
def _without_excluded(frame):
    """Return the rows of `frame` whose subject_id index level is not in excluded_subject_ids."""
    subject_ids = frame.index.get_level_values("subject_id")
    return frame[~subject_ids.isin(excluded_subject_ids)]


print(f"Exclude {excluded_subject_ids.size} patients...")

# Apply the same exclusion to every table and report each resulting shape.
patients = _without_excluded(patients)
print(patients.shape)

outcome = _without_excluded(outcome)
print(outcome.shape)

vitals_labs_mean = _without_excluded(vitals_labs_mean)
print(vitals_labs_mean.shape)

indicators = _without_excluded(indicators)
print(indicators.shape)

Exclude 8406 patients...
(29137, 8)
(29137, 1)
(2368063, 27)
(2368063, 27)
