In [1]:
# NOTE(review): absolute, cluster-specific paths; adjust before running elsewhere.
# Directory holding the raw eICU CSV tables (patient.csv is read from here).
eicu_path = '/scratch/wiensj_root/wiensj/shared_data/datasets/eicu-2.0/'
# Project directory; label CSVs are written under its `labels/` subfolder.
data_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/'
# Directory with the extracted tables (pickle/parquet) read below; icustays.csv is written here.
save_path = '/scratch/wiensj_root/wiensj/shared_data/FIDDLE_project/extracted/'

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the eICU patient table and report how many rows it has.
df_P = pd.read_csv(eicu_path + 'patient.csv')
print('Total', df_P.shape[0])

Total 200859


In [4]:
# Identifier table: keep the id/demographic columns and give the three
# identifier columns project-wide names.
df_ID = (
    df_P[['uniquepid', 'patienthealthsystemstayid', 'patientunitstayid', 'hospitalid', 'gender', 'age']]
    .rename(columns={
        'uniquepid': 'PatientID',
        'patienthealthsystemstayid': 'AdmissionID',
        'patientunitstayid': 'ICUStayID',
    })
)

# Ages recorded as the text '> 89' are replaced by the sentinel 300 so that
# the whole column can be parsed as numbers.
age_over_89 = df_ID['age'] == '> 89'
df_ID.loc[age_over_89, 'age'] = 300
df_ID['age'] = pd.to_numeric(df_ID['age'])

In [7]:
## Create train-val-test split
# The split is made over unique patients, not ICU stays, so every stay of a
# given patient lands in the same partition.
from collections import Counter
from sklearn.model_selection import ShuffleSplit

patients = df_ID['PatientID'].unique()

# First cut: 70% train vs. 30% held out; second cut: held-out half/half into val/test.
sss1 = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
sss2 = ShuffleSplit(n_splits=1, test_size=0.5, random_state=0)

y = patients
train_idx, val_test_idx = next(sss1.split(y, y))
y_val_test = y[val_test_idx]
# The second split returns positions within y_val_test; map them back to
# positions within `patients`.
val_local, test_local = next(sss2.split(y_val_test, y_val_test))
val_idx = val_test_idx[val_local]
test_idx = val_test_idx[test_local]

# 'null' stays only on rows whose patient is in none of the three index sets.
df_ID['partition'] = 'null'
for part_name, part_idx in (('train', train_idx), ('val', val_idx), ('test', test_idx)):
    in_part = df_ID['PatientID'].isin(patients[part_idx])
    df_ID.loc[in_part, 'partition'] = part_name

print('Patient splits:', len(train_idx), len(val_idx), len(test_idx), '/', len(patients))
c = Counter(df_ID['partition'])
print('ICU stay splits:', c['train'], c['val'], c['test'], '/', len(df_ID))


Patient splits: 97556 20905 20906 / 139367
ICU stay splits: 140646 30137 30076 / 200859


In [9]:
# Write the ICU-stay index (ids, hospitalid, gender, age, partition) to disk.
# quoting=2 is csv.QUOTE_NONNUMERIC: every non-numeric field is written in quotes.
df_ID.to_csv(save_path + 'icustays.csv', index=False, quoting=2)

## mortality

In [11]:
# Count the rows whose hospital discharge status is missing.
print('Mortality status unknown', df_P['hospitaldischargestatus'].isna().sum())

Mortality status unknown 1751


In [12]:
# Mortality label table: one row per row of df_P, keyed by the unit stay id.
df_out = (
    df_P[['patientunitstayid', 'unitdischargeoffset', 'hospitaldischargestatus']]
    .rename(columns={
        'patientunitstayid': 'ID',
        'unitdischargeoffset': 'ICUDischargeTime',
    })
)
discharge_status = df_out['hospitaldischargestatus']
assert discharge_status.nunique() == 2 # Alive or Expired
# 1 for 'Expired', 0 otherwise; nullable Int64 so that unknown status can be left missing.
df_out['mortality_LABEL'] = discharge_status.eq('Expired').astype(int).astype('Int64')
df_out.loc[discharge_status.isna(), 'mortality_LABEL'] = np.nan # unknown mortality

In [13]:
# Write the mortality labels (ID, ICUDischargeTime, hospitaldischargestatus, mortality_LABEL).
df_out.to_csv(data_path + 'labels/mortality.csv', index=False)

## ARF
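ARF onset is taken as the earliest of the following events for each ID:
- the value of `ventStartOffset` (itself a timestamp column) in the respiratoryCare table
- the time of a `peepLimit` record in the respiratoryCare table
- the time of a recorded `PEEP` or `PEEP/CPAP` in the respiratoryCharting table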

In [15]:
# Load the extracted respiratory tables. The cells below use their columns
# ID, t, variable_name and variable_value.
# NOTE(review): unpickling can execute arbitrary code; only load files this project produced.
df_R = pd.read_pickle(save_path + 'respiratoryCare.pickle')
df_RC = pd.read_pickle(save_path + 'respiratoryCharting.pickle')

In [16]:
# PEEP-related records from both tables. Only the fact and time of the record
# matter here, so the recorded value is dropped.
is_peep_charting = df_RC['variable_name'].isin(['PEEP', 'PEEP/CPAP'])
df_PEEP_RC = df_RC.loc[is_peep_charting].drop(columns=['variable_value'])

is_peep_limit = df_R['variable_name'].isin(['peeplimit'])
df_PEEP_R = df_R.loc[is_peep_limit].drop(columns=['variable_value'])

In [19]:
# For ventstartoffset rows the event time is the recorded value itself, so the
# row's own `t` is discarded and `variable_value` takes its place.
is_vent_start = df_R['variable_name'].isin(['ventstartoffset'])
df_vent_R = (
    df_R.loc[is_vent_start]
    .drop(columns=['t'])
    .rename(columns={'variable_value': 't'})
)

In [28]:
# Stack the three kinds of ARF-indicating events; sort=True orders the columns alphabetically.
arf_event_frames = [df_PEEP_RC, df_PEEP_R, df_vent_R]
df_ARF = pd.concat(arf_event_frames, sort=True)

In [29]:
# Order events chronologically within each ID; ties at the same time are
# broken by variable name so the ordering is deterministic.
arf_sort_keys = ['ID', 't', 'variable_name']
df_ARF = df_ARF.sort_values(by=arf_sort_keys)

In [30]:
# Display-only preview of the first sorted ARF event rows.
df_ARF.head()

Unnamed: 0,ID,t,variable_name
0,141168,1566,ventstartoffset
402141,141168,1926,PEEP
535859,141168,2046,PEEP
387074,141168,2136,PEEP
587362,141168,2166,PEEP


In [31]:
# Write all sorted ARF events, before they are reduced to one row per ID.
df_ARF.to_csv(data_path + 'labels/ARF_raw.csv', index=False)

In [32]:
# The frame is already sorted by (ID, t, variable_name), so keeping the first
# row per ID keeps that ID's earliest event.
df_ARF = (
    df_ARF
    .drop_duplicates(subset=['ID'], keep='first')
    .reset_index(drop=True)
)

In [33]:
# Every remaining row is a positive case: its time becomes the onset time.
df_ARF = (
    df_ARF
    .rename(columns={'t': 'ARF_ONSET_TIME'})
    .assign(ARF_LABEL=1)
)

In [34]:
# Write the ARF labels: one row per ID that has at least one ARF event.
df_ARF.to_csv(data_path + 'labels/ARF.csv', index=False)

## Shock

Shock onset is the earliest record, in either the infusionDrug or the medication table, of any drug name containing one of:
- 'norepinephrine', 'levophed',
- 'epinephrine',
- 'dopamine',
- 'vasopressin',
- 'phenylephrine', 'neo-synephrine', 'neosynephrine'

In [3]:
# Load the extracted drug tables. The cells below use their columns
# ID, t, variable_name (drug name) and variable_value.
# NOTE(review): unpickling can execute arbitrary code; only load files this project produced.
df_I = pd.read_pickle(save_path + 'infusionDrug.pickle')
df_M = pd.read_parquet(save_path + 'medication.parquet')

In [4]:
# Lower-case substrings that identify a vasopressor in a drug name.
# Matching is by substring, so 'epinephrine' also matches 'norepinephrine'.
vasopressors = [
    'norepinephrine',
    'levophed',          # grouped with norepinephrine
    'epinephrine',
    'dopamine',
    'vasopressin',
    'phenylephrine',
    'neo-synephrine',    # grouped with phenylephrine
    'neosynephrine',     # same name, written without the hyphen
]

### infusion table

In [25]:
# Distinct infusion drug names (as strings), sorted in place.
df_drug = df_I['variable_name'].astype(str).unique()
df_drug.sort()

# Keep each name that contains any vasopressor keyword, ignoring case.
vaso = [
    drug_name
    for drug_name in df_drug
    if any(keyword in drug_name.lower() for keyword in vasopressors)
]

# Copy, because later cells modify this subset in place.
df_vaso_I = df_I.loc[df_I['variable_name'].isin(vaso)].copy()

In [26]:
# rows with text (non-numeric) values are ignored
# List the distinct values that do not parse as a finite number. The column is
# parsed in one vectorized call rather than element by element.
vaso_I_numeric = pd.to_numeric(df_vaso_I['variable_value'], errors='coerce')
df_vaso_I.loc[~np.isfinite(vaso_I_numeric), 'variable_value'].unique()

array(['UD', 'ERROR', nan, 'Date\\Time Correction', 'OFF\\.br\\\\.br\\',
       'OFF', 'OFF\\.br\\', '30\\.br\\', '50 mcg/min', '50mcg/min\\.br\\',
       'Documentation undone'], dtype=object)

#### fixing some errors

In [30]:
# Show the record(s) whose value is the literal text 30\.br\
df_vaso_I.loc[df_vaso_I['variable_value'].eq('30\\.br\\')]

Unnamed: 0,ID,t,variable_name,variable_value
4691347,3207324,34,Epinephrine (mcg/min),30\.br\


In [31]:
# Manual correction: keep the leading number and discard the trailing markup.
is_30_br = df_vaso_I['variable_value'].eq('30\\.br\\')
df_vaso_I.loc[is_30_br, 'variable_value'] = 30

In [32]:
# Show the record(s) whose value is the literal text "50 mcg/min"
df_vaso_I.loc[df_vaso_I['variable_value'].eq('50 mcg/min')]

Unnamed: 0,ID,t,variable_name,variable_value
4702167,3211784,3661,Epinephrine (mcg/min),50 mcg/min


In [34]:
# Manual correction: the unit is already part of the drug name, so keep only the number.
is_50_with_unit = df_vaso_I['variable_value'].eq('50 mcg/min')
df_vaso_I.loc[is_50_with_unit, 'variable_value'] = 50

In [35]:
# Show the record(s) whose value is the literal text 50mcg/min\.br\
df_vaso_I.loc[df_vaso_I['variable_value'].eq('50mcg/min\\.br\\')]

Unnamed: 0,ID,t,variable_name,variable_value
4702192,3211784,3901,Epinephrine (mcg/min),50mcg/min\.br\


In [36]:
# Manual correction: keep the leading number and discard the unit and trailing markup.
is_50_br = df_vaso_I['variable_value'].eq('50mcg/min\\.br\\')
df_vaso_I.loc[is_50_br, 'variable_value'] = 50

#### continue processing...

In [37]:
# convert non-numeric drug values to nan
# (one vectorized call over the column instead of a Python-level call per row)
df_vaso_I['variable_value'] = pd.to_numeric(df_vaso_I['variable_value'], errors='coerce')
# remove nan drug values
# NOTE: dropna looks at every column, so a row missing ID, t or variable_name is dropped too.
df_vaso_I = df_vaso_I.dropna(axis=0)
len(df_vaso_I) # -> 1000825 rows

1000825

In [38]:
# Keep only records with a strictly positive value; after that the value
# column is no longer needed.
has_positive_value = df_vaso_I['variable_value'] > 0 # positive drug values
df_vaso_I = df_vaso_I.loc[has_positive_value].drop(columns=['variable_value'])
len(df_vaso_I) # -> 904398 rows

904398

### medication table

In [39]:
# Distinct medication drug names (as strings), sorted in place.
df_drug_M = df_M['variable_name'].astype(str).unique()
df_drug_M.sort()

# Keep each name that contains any vasopressor keyword, ignoring case.
vaso_M = [
    drug_name
    for drug_name in df_drug_M
    if any(keyword in drug_name.lower() for keyword in vasopressors)
]

# Copy so that later changes do not touch df_M.
df_vaso_M = df_M.loc[df_M['variable_name'].isin(vaso_M)].copy()

In [40]:
# values are messy but should not be ignored
# List the distinct values that do not parse as a finite number. The column is
# parsed in one vectorized call rather than element by element.
vaso_M_numeric = pd.to_numeric(df_vaso_M['variable_value'], errors='coerce')
df_vaso_M.loc[~np.isfinite(vaso_M_numeric), 'variable_value'].unique()

array(['SUBQ', 'ONDEM', '0.5 MG', 'IM', 'ONCE', 'MG', '0.4 MG', '8 mg',
       'IV', '40 mg', 'Manual Charge', 'MISC', 'Once X1', 'PYXIS',
       'Pyxis', '100 mcg', 'as directed', '0.1 mg', 'q5min H4', '0.3 mg',
       'Subcut', '10 MG', '20 MG', '10 mg', 'Intravenous',
       '5-20 mcg/kg/min', 'IntraVENOUS', '1 MG', '0 MG', 'SUBCUT',
       '.STK-MED', '2 spray', 'Each Nostril', 'Every 6 hours PRN', '2 MG',
       'IntraMUSCULAR', 'Once as needed', 'Nasal', 'TITRATE', '1 drop',
       '1 spray', '4 MG', 'Titrated', '8 MG', '16 MG', '50 MG', '250 MG',
       'INFUSE', 'IV CONT', 'Each Nare', 'X1 M659', 'X1 M1079', 'X1 M899',
       '5 mcg/kg/min', '250 ML', 'INTRAVENOU', 'INTRAMUSCU', 'ONETIME',
       '60 MG', 'SUBCUTAN', '.ROUTE', 'X', '0.2 MG', '3 MG',
       'PER PROTOCOL', '40 MG', '4 VL', 'Continuous', 'Continuous PRN',
       'PRN', 'AS-DIR', 'H8', 'IVPB', 'X1', 'H1', '16 mg', 'H24',
       '200 mcg', 'Once', 'q5min', '50 mcg', '1 mg', '0.2 mg', '250 mcg',
       '150 mcg', '1

In [42]:
# Unlike the infusion table, medication rows are not filtered on their value:
# every matching record is kept and the value column is simply discarded.
df_vaso_M = df_vaso_M.drop(columns=['variable_value'])
len(df_vaso_M) # -> 105631 rows

105631

### all vasopressors

In [51]:
# Stack the vasopressor records from the infusion and medication tables.
# Each frame keeps its own index labels (ignore_index is not set).
df_vaso = pd.concat([df_vaso_I, df_vaso_M])

In [52]:
# Write every vasopressor record, before reducing to the first one per ID.
# This must use df_vaso: df_shock is only created in the next cell, so
# referencing it here fails on a fresh kernel (or silently writes the
# de-duplicated frame when stale state exists). Rows are sorted the same way
# as ARF_raw.csv.
df_vaso.sort_values(by=['ID', 't', 'variable_name']) \
    .to_csv(data_path + 'labels/Shock_raw.csv', index=False)

In [53]:
# Shock onset per ID: sort chronologically, keep each ID's first vasopressor
# record, then reduce it to (ID, onset time, label).
df_shock = (
    df_vaso
    .sort_values(by=['ID', 't', 'variable_name'])
    .drop_duplicates(subset=['ID'], keep='first')
    .reset_index(drop=True)
    .drop(columns=['variable_name'])
    .rename(columns={'t': 'Shock_ONSET_TIME'})
    .assign(Shock_LABEL=1)
)

In [54]:
# Write the shock labels: one row per ID with at least one vasopressor record.
df_shock.to_csv(data_path + 'labels/Shock.csv', index=False)

In [55]:
# Record which drug names were matched as vasopressors: infusion-table names
# first, then medication-table names, in a single `name` column.
matched_drug_names = np.concatenate([vaso, vaso_M])
pd.DataFrame({'name': matched_drug_names}) \
    .to_csv(data_path + 'labels/Shock_drugs.csv', index=False)