## Import modules and data

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os

In [19]:
# Path to the HDF5 store, relative to the notebook's working directory.
filename = '../data/original/all_hourly_data.h5'
# Read the table stored under the 'patients' key of that file.
df_pats = pd.read_hdf(filename, key='patients')

In [20]:
# Lookup table read from CSV and indexed by the
# (subject_id, hadm_id, icustay_id) triple.
multi_to_patid = (
    pd.read_csv('../data/processed/multi_to_patid.csv')
    .set_index(keys=['subject_id', 'hadm_id', 'icustay_id'])
)

## Basic Exploration

In [21]:
# Preview the first five rows of the patients table.
df_pats.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,ethnicity,age,insurance,admittime,diagnosis_at_admission,dischtime,discharge_location,fullcode_first,dnr_first,...,outtime,los_icu,admission_type,first_careunit,mort_icu,mort_hosp,hospital_expire_flag,hospstay_seq,readmission_30,max_hours
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,145834,211552,M,WHITE,76.526792,Medicare,2101-10-20 19:08:00,HYPOTENSION,2101-10-31 13:58:00,SNF,1.0,0.0,...,2101-10-26 20:43:09,6.06456,EMERGENCY,MICU,0,0,0,1,0,145
4,185777,294638,F,WHITE,47.845047,Private,2191-03-16 00:28:00,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-23 18:41:00,HOME WITH HOME IV PROVIDR,1.0,0.0,...,2191-03-17 16:46:31,1.678472,EMERGENCY,MICU,0,0,0,1,0,40
6,107064,228232,F,WHITE,65.942297,Medicare,2175-05-30 07:15:00,CHRONIC RENAL FAILURE/SDA,2175-06-15 16:00:00,HOME HEALTH CARE,1.0,0.0,...,2175-06-03 13:39:54,3.672917,ELECTIVE,SICU,0,0,0,1,0,88
9,150750,220597,M,UNKNOWN/NOT SPECIFIED,41.790228,Medicaid,2149-11-09 13:06:00,HEMORRHAGIC CVA,2149-11-14 10:15:00,DEAD/EXPIRED,1.0,0.0,...,2149-11-14 20:52:14,5.323056,EMERGENCY,MICU,1,1,1,1,0,127
11,194540,229441,F,WHITE,50.148295,Private,2178-04-16 06:18:00,BRAIN MASS,2178-05-11 19:00:00,HOME HEALTH CARE,1.0,0.0,...,2178-04-17 20:21:05,1.58441,EMERGENCY,SICU,0,0,0,1,0,38


In [22]:
# Print a summary of the index, column dtypes and non-null counts.
df_pats.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 34472 entries, (3, 145834, 211552) to (99999, 113369, 246512)
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   gender                  34472 non-null  category      
 1   ethnicity               34472 non-null  category      
 2   age                     34472 non-null  float64       
 3   insurance               34472 non-null  object        
 4   admittime               34472 non-null  datetime64[ns]
 5   diagnosis_at_admission  34471 non-null  object        
 6   dischtime               34472 non-null  datetime64[ns]
 7   discharge_location      34472 non-null  object        
 8   fullcode_first          28162 non-null  float64       
 9   dnr_first               28162 non-null  float64       
 10  fullcode                28162 non-null  float64       
 11  dnr                     28162 non-null  float64       
 12  dnr_first_

In [23]:
# Number of rows in the patients table; used later as the denominator
# when computing each ethnicity's share.
total_pats = len(df_pats)

## Convert to a single-level index

In [24]:
# Attach the columns of `multi_to_patid` (aligned on the shared index), then
# make the `pat_id` column the new index of the frame.
df_pats = df_pats.join(multi_to_patid).set_index('pat_id')

## Filter data

In [25]:
# Columns kept for the rest of the notebook; every other column is dropped.
rel_cols = [
    "gender",
    "ethnicity",
    "age",
    "insurance",
    "admission_type",
    "first_careunit",
]
df_pats = df_pats.loc[:, rel_cols]

## Clean Data
#### Handle Categorical Data

Prepare gender

In [26]:
# Encode gender as a boolean column: True where the value is 'M', False otherwise.
# A vectorized comparison replaces the per-element lambda and already yields a
# bool column, so no extra cast is needed; missing values compare as False.
df_pats['gender'] = df_pats['gender'].eq('M')

Prepare ethnicity

In [27]:
# Build the raw-label -> cleaned-label mapping. Categories covering less than
# 1% of all patients are pooled under 'OTHER'; the rest keep their own label.
ethnicity_counts = df_pats.ethnicity.value_counts()
ethnicities = {
    raw_label: (raw_label if (n_pats / total_pats) >= 0.01 else 'OTHER')
    for raw_label, n_pats in ethnicity_counts.items()
}

# Manual overrides, applied after the share-based rule above.
ethnicities.update({
    # Non-answers become missing values; with get_dummies' default settings
    # missing values get no indicator column (all-zero row).
    'UNKNOWN/NOT SPECIFIED': None,
    'UNABLE TO OBTAIN': None,
    'PATIENT DECLINED TO ANSWER': None,
    # Shorter names for two of the categories.
    'BLACK/AFRICAN AMERICAN': 'BLACK',
    'HISPANIC OR LATINO': 'HISPANIC',
})

# Apply the mapping, one-hot encode into 'ethnicity_*' columns, and replace
# the source column with the indicator columns.
df_pats['ethnicity'] = df_pats['ethnicity'].map(ethnicities)
df_eth = pd.get_dummies(df_pats['ethnicity'], prefix='ethnicity')
df_pats = df_pats.join(df_eth).drop(columns='ethnicity')

Prepare insurance

In [28]:
# One-hot encode insurance into 'insurance_*' indicator columns and replace
# the source column with them.
df_ins = pd.get_dummies(df_pats['insurance'], prefix='insurance')
df_pats = df_pats.join(df_ins).drop(columns='insurance')

Prepare Admission Type

In [29]:
# Ordinal encoding of the admission type (ELECTIVE < URGENT < EMERGENCY).
admit_type_order = {'ELECTIVE': 0, 'URGENT': 1, 'EMERGENCY': 2}
admission_codes = df_pats['admission_type'].map(admit_type_order)
# Values missing from the mapping become NaN, which the integer cast rejects.
df_pats['admission_type'] = admission_codes.astype(int)

Prepare First Care Unit

In [30]:
# One-hot encode the first care unit into 'fcu_*' indicator columns and
# replace the source column with them.
df_fcu = pd.get_dummies(df_pats['first_careunit'], prefix='fcu')
df_pats = df_pats.join(df_fcu).drop(columns='first_careunit')

### Scale Numerical Data

In [31]:
# Standardize age with a scaler fitted on the TRAINING patients only.
# Fitting on every row (as fit_transform over the whole column does) lets the
# test set's mean/variance leak into the preprocessing. The ids come from the
# same train_idxs.npy file that drives the train/test split further down, and
# are used as index labels of df_pats.
age_fit_idxs = np.load('../data/processed/train_idxs.npy')

ss_age = StandardScaler()
ss_age.fit(df_pats.loc[age_fit_idxs, 'age'].values.reshape(-1, 1))

# Apply the train-fitted transform to every row (train and test alike);
# ravel() turns the (n, 1) result back into a flat column.
df_pats['age'] = ss_age.transform(df_pats['age'].values.reshape(-1, 1)).ravel()

## Split & Save results

In [32]:
# File name used for the CSV written into each split's directory.
csv_filename = 'demographic.csv'
# Directory holding the split index files and the per-split output folders.
processed_dir = '../data/processed/'

In [33]:
# Load the saved arrays that define which index labels belong to each split.
train_idxs_path = os.path.join(processed_dir, 'train_idxs.npy')
test_idxs_path = os.path.join(processed_dir, 'test_idxs.npy')

train_idxs = np.load(train_idxs_path)
test_idxs = np.load(test_idxs_path)

In [34]:
# Select the rows of each split by index label; .copy() gives every split
# its own independent frame.
df_train, df_test = (
    df_pats.loc[split_idxs].copy()
    for split_idxs in (train_idxs, test_idxs)
)

In [36]:
# Write each split to <processed_dir>/<split>/<csv_filename>.
# The split directory is created first (no-op if it already exists), because
# to_csv does not create missing parent directories and would raise otherwise.
for split_subdir, df_split in (('train/', df_train), ('test/', df_test)):
    split_dir = os.path.join(processed_dir, split_subdir)
    os.makedirs(split_dir, exist_ok=True)
    df_split.to_csv(os.path.join(split_dir, csv_filename))