In [1]:
# This notebook reads in the discretised input data and then preprocesses the model features
# Firstly, values deemed excessively high/low are capped
# Binary features are centred, and normally/log-normally distributed features are standardised accordingly
# Training and test sets are split - 70% train, 10% validation, 20% test
# Resulting datasets are saved to file.

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame
from tqdm import tqdm

In [3]:
# Read the discretised input data into a DataFrame
input_csv_path = "../data/discretised_input_data.csv"
disc_inp_data = pd.read_csv(input_csv_path)

In [4]:
# Sparse rewards only for now; reward shaping is handled in a separate script.
#   died in hospital     -> reward 0 at the final timestep
#   survived in hospital -> reward 1 at the final timestep
# Every other timestep keeps the default reward of 0.
disc_inp_data['terminal_reward'] = 0
# True only on the last row seen for each icustayid (relies on the row order of the file)
m = ~disc_inp_data.duplicated(subset='icustayid', keep='last')
survival_reward = 1 - disc_inp_data.loc[m, 'died_in_hosp']
disc_inp_data.loc[m, 'terminal_reward'] = survival_reward

In [5]:
# Preview the first 20 rows, including the new terminal_reward column
disc_inp_data.head(n=20)

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,died_within_48h_of_out_time,mortality_90d,...,output_total,output_4hourly,cumulated_balance,SOFA,SIRS,vaso_input,iv_input,vaso_input_NEW,iv_input_NEW,terminal_reward
0,1,3,7245486000,0,17639.826435,0,0,0,0,1,...,13617.0,520.0,-7090.0,4,1,0,2,0,1,0
1,2,3,7245500400,0,17639.826435,0,0,0,0,1,...,13982.0,365.0,-7405.0,7,2,0,2,0,1,0
2,3,3,7245514800,0,17639.826435,0,0,0,0,1,...,14262.0,280.0,-7635.0,5,2,0,2,0,1,0
3,4,3,7245529200,0,17639.826435,0,0,0,0,1,...,14602.0,340.0,-7925.0,5,1,0,2,0,1,0
4,5,3,7245543600,0,17639.826435,0,0,0,0,1,...,15142.0,540.0,-8415.0,5,2,0,2,0,1,0
5,6,3,7245558000,0,17639.826435,0,0,0,0,1,...,15442.0,300.0,-8695.0,5,2,0,1,0,1,0
6,9,3,7245601200,0,17639.826435,0,0,0,0,1,...,15442.0,0.0,-8695.0,13,0,0,0,0,0,1
7,1,11,6898241400,1,30766.069028,6,1,0,0,0,...,0.0,0.0,0.0,12,0,0,0,0,0,0
8,2,11,6898255800,1,30766.069028,6,1,0,0,0,...,460.0,460.0,-460.0,10,0,0,0,0,0,0
9,3,11,6898270200,1,30766.069028,6,1,0,0,0,...,1020.0,560.0,-1020.0,10,1,0,0,0,0,0


In [6]:
# now split into train/validation/test sets
# The split is made over icustayids rather than rows, so all timesteps of one
# stay end up in the same set.
import random
random.seed(42)  # fixed seed keeps the shuffle (and therefore the split) reproducible
unique_ids = disc_inp_data['icustayid'].unique()
random.shuffle(unique_ids)  # shuffles the id array in place

# Split fractions; the test set receives whatever remains after train + validation
train_sample = 0.7
val_sample = 0.1
test_sample = 0.2
assert abs(train_sample + val_sample + test_sample - 1.0) < 1e-9, "split fractions must sum to 1"

# Boundaries are derived from the fractions above (previously the literals were
# repeated here, so changing the fractions had no effect).
train_num = int(len(unique_ids) * train_sample)
val_num = int(len(unique_ids) * val_sample) + train_num  # end index of the validation slice
train_ids = unique_ids[:train_num]
val_ids = unique_ids[train_num:val_num]
test_ids = unique_ids[val_num:]

In [7]:
# Materialise one independent DataFrame per split by matching each row's icustayid
# against the id arrays; .copy() detaches the result from disc_inp_data so later
# in-place column updates do not touch the source frame.
stay_id_column = disc_inp_data['icustayid']
train_set = disc_inp_data[stay_id_column.isin(train_ids)].copy()
val_set = disc_inp_data[stay_id_column.isin(val_ids)].copy()
test_set = disc_inp_data[stay_id_column.isin(test_ids)].copy()

In [8]:
# Column groups, one per preprocessing treatment applied further down.

# Shifted by -0.5
binary_fields = [
    'gender', 'mechvent', 're_admission',
]

# Standardised directly: (x - mean) / std
norm_fields = [
    'age', 'Weight_kg', 'GCS', 'HR', 'SysBP', 'MeanBP', 'DiaBP', 'RR', 'Temp_C', 'FiO2_1',
    'Potassium', 'Sodium', 'Chloride', 'Glucose', 'Magnesium', 'Calcium',
    'Hb', 'WBC_count', 'Platelets_count', 'PTT', 'PT', 'Arterial_pH', 'paO2', 'paCO2',
    'Arterial_BE', 'HCO3', 'Arterial_lactate', 'SOFA', 'SIRS', 'Shock_Index',
    'PaO2_FiO2', 'cumulated_balance', 'elixhauser', 'Albumin', u'CO2_mEqL', 'Ionised_Ca',
]

# Log-transformed first, then standardised
log_fields = [
    'max_dose_vaso', 'SpO2', 'BUN', 'Creatinine', 'SGOT', 'SGPT', 'Total_bili', 'INR',
    'input_total', 'input_4hourly', 'output_total', 'output_4hourly', 'bloc',
]

In [9]:
# Centre the binary fields by subtracting 0.5 in every split
# (presumably they are 0/1 coded, giving -0.5/+0.5 -- confirm against the input data)
for split in (train_set, val_set, test_set):
    split[binary_fields] = split[binary_fields] - 0.5

In [10]:
# Normally distributed fields: z-score each column.
# The mean and std come from the training set only and are reused unchanged for
# the validation and test sets; they are recorded in mean_stds per column.
mean_stds = {}
for field in norm_fields:
    mu, sigma = train_set[field].mean(), train_set[field].std()
    for split in (train_set, val_set, test_set):
        split[field] = (split[field] - mu) / sigma
    mean_stds[field] = (mu, sigma)

In [11]:
# Log-normal fields: log-transform every split, then z-score with training-set statistics
mean_stds_log = {}
# The 0.1 offset maps a raw value of zero to log(0.1) instead of -inf
for split in (train_set, val_set, test_set):
    split[log_fields] = np.log(0.1 + split[log_fields])
for field in log_fields:
    # Statistics are taken on the already log-transformed training column
    mu = train_set[field].mean()
    sigma = train_set[field].std()
    for split in (train_set, val_set, test_set):
        split[field] = (split[field] - mu) / sigma
    mean_stds_log[field] = (mu, sigma)

In [12]:
# Inspect the first five training rows after centring/standardisation
train_set.head(n=5)

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,died_within_48h_of_out_time,mortality_90d,...,output_total,output_4hourly,cumulated_balance,SOFA,SIRS,vaso_input,iv_input,vaso_input_NEW,iv_input_NEW,terminal_reward
0,-2.293667,3,7245486000,-0.5,-0.986319,-1.854621,-0.5,0,0,1,...,0.783147,0.703927,-0.621371,-0.667738,-0.585288,0,2,0,1,0
1,-1.466238,3,7245500400,-0.5,-0.986319,-1.854621,-0.5,0,0,1,...,0.790666,0.59752,-0.643613,0.191735,0.37122,0,2,0,1,0
2,-0.967876,3,7245514800,-0.5,-0.986319,-1.854621,-0.5,0,0,1,...,0.796302,0.517825,-0.659853,-0.381247,0.37122,0,2,0,1,0
3,-0.610117,3,7245529200,-0.5,-0.986319,-1.854621,-0.5,0,0,1,...,0.802999,0.576191,-0.68033,-0.381247,-0.585288,0,2,0,1,0
4,-0.330838,3,7245543600,-0.5,-0.986319,-1.854621,-0.5,0,0,1,...,0.813321,0.715274,-0.714928,-0.381247,0.37122,0,2,0,1,0


In [13]:
import pickle
# Persist the training-set (mean, std) pairs under the keys 'norm' and 'lognorm'.
# The 'lognorm' statistics describe the log-transformed values.
# The context manager guarantees the file is flushed and closed; the previous
# bare open() left the handle to be closed by garbage collection.
with open('normalization_params.p', 'wb') as params_file:
    pickle.dump({'norm': mean_stds, 'lognorm': mean_stds_log}, params_file)

In [14]:
# Write the centred/standardised splits to disk. "unscaled" here means the
# [0,1] min-max scaling has not been applied yet.
unscaled_outputs = (
    ('../data/rl_train_set_unscaled.csv', train_set),
    ('../data/rl_val_set_unscaled.csv', val_set),
    ('../data/rl_test_set_unscaled.csv', test_set),
)
for out_path, split in unscaled_outputs:
    split.to_csv(out_path, index=False)

In [15]:
# scale features to [0,1] in train set, similar in val and test
# The min and max come from the training set only, so validation/test values can
# fall outside [0,1] when they exceed the training range.
import copy  # no longer needed by this cell; kept in case a later cell relies on the name
scalable_fields = binary_fields + norm_fields + log_fields  # new list; the originals are not modified
for col in scalable_fields:
    # Series.min()/.max() are vectorised and skip NaN. The builtin min()/max()
    # iterate in Python and give order-dependent results when NaN is present.
    minimum = train_set[col].min()
    maximum = train_set[col].max()
    value_range = maximum - minimum  # NOTE: a constant training column gives 0 here and NaN/inf below
    train_set[col] = (train_set[col] - minimum) / value_range
    val_set[col] = (val_set[col] - minimum) / value_range
    test_set[col] = (test_set[col] - minimum) / value_range

In [16]:
# Inspect the first five training rows after min-max scaling
train_set.head(n=5)

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,died_within_48h_of_out_time,mortality_90d,...,output_total,output_4hourly,cumulated_balance,SOFA,SIRS,vaso_input,iv_input,vaso_input_NEW,iv_input_NEW,terminal_reward
0,0.0,3,7245486000,0.0,0.412568,0.0,0.0,0,0,1,...,0.787595,0.717533,0.322651,0.166667,0.25,0,2,0,1,0
1,0.22256,3,7245500400,0.0,0.412568,0.0,0.0,0,0,1,...,0.789357,0.68786,0.322334,0.291667,0.5,0,2,0,1,0
2,0.356608,3,7245514800,0.0,0.412568,0.0,0.0,0,0,1,...,0.790678,0.665636,0.322103,0.208333,0.5,0,2,0,1,0
3,0.452837,3,7245529200,0.0,0.412568,0.0,0.0,0,0,1,...,0.792248,0.681912,0.321812,0.208333,0.25,0,2,0,1,0
4,0.527957,3,7245543600,0.0,0.412568,0.0,0.0,0,0,1,...,0.794667,0.720697,0.321319,0.208333,0.5,0,2,0,1,0


In [17]:
# Write the min-max scaled splits to disk
scaled_outputs = (
    ('../data/rl_train_set_scaled.csv', train_set),
    ('../data/rl_val_set_scaled.csv', val_set),
    ('../data/rl_test_set_scaled.csv', test_set),
)
for out_path, split in scaled_outputs:
    split.to_csv(out_path, index=False)