In [None]:
############################################################################
### import libraries
import os
import platform
import copy
import sys
import pyodbc
import pymssql
import pandas as pd
import numpy as np
import functools
import sklearn as sk
import joblib
from fancyimpute import KNN    
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction import DictVectorizer
from functools import reduce
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

################################################################################################################
################################################################################################################
# automatically reload python fiels (util.py and conf.py) when they are changed.
%reload_ext autoreload
%autoreload 2

# import from parent directory with a little help from sys.path.insert()
sys.path.insert(0, '..') 

### from util.py (file which once contained all classes and functions):
from util import * 

### Configuration file to determine root directory 
import conf

# from configuration file set working directory
os.chdir(os.path.join(conf.ROOT_DIR, 'SEPSIS'))

# Define the subfolders paths
data_path = '\data\\'

############################################################################
# Settings for Pandas to display more then the default amount of collumns
pd.set_option("display.max_columns",150)

### Check everything
conf.print_python_environment()

# Load final ICV and MIMIC datasets

In [None]:
# Read the final ICV and MIMIC tables from the configured data directory.
icv_csv_path = os.path.join(conf.DATA_DIR, 'final_ICV.csv')
mimic_csv_path = os.path.join(conf.DATA_DIR, 'final_MIMIC.csv')

ICV_data = pd.read_csv(icv_csv_path, sep=',')
MIMIC_data = pd.read_csv(mimic_csv_path, sep=',')

# CREATE EXPERIMENT

In [None]:
################################################################################################################
################################################################################################################
### Experiment name
exp_name = 'FINAL'
exp_comment = 'Main Experiment - training a model on half of the ICV data and testing on the full MIMIC dataset'

### Experiment directory and its subdirectories.
# os.makedirs creates missing parents (so exp_dir itself is created as well) and
# exist_ok=True makes the cell safe to re-run without a separate existence check.
exp_dir = os.path.join(conf.EXP_DIR, exp_name)
for subdir in ('data', 'figures', 'models', 'performance', 'results', 'KNN', 'FQI'):
    os.makedirs(os.path.join(exp_dir, subdir), exist_ok=True)

# SET EXPERIMENT DATA CONFIGURATION

In [None]:
################################################################################################################
################################################################################################################
# Settings that define how the data is split; stored next to the data for traceability.
config = dict(
    random_SEED=42,               # fixed seed so the patient shuffle is reproducible
    train_sample=0.7,             # fraction (0 to 1) of patients reserved for training (remainder used for validation)
    comment=str(exp_comment),
)

# One-row table: each config key becomes a column.
config_df = pd.DataFrame(config, index=[0])
dataconfig_path = os.path.join(exp_dir, 'data/' + exp_name + '_dataconfig.csv')
config_df.to_csv(dataconfig_path, index=False)

# Split data (the MIMIC split is active; the ICV split is commented out in the cell below)

In [None]:
# Seed Python's RNG so the patient shuffle (and therefore the split) is reproducible.
import random
random.seed(config['random_SEED'])

####### MIMIC SPLIT
# Split on patient level: all rows of one PatientID end up in the same set.
# NOTE(review): exp_comment describes training on ICV and testing on MIMIC, while this
# active split trains/validates on MIMIC and tests on ICV - confirm which one is intended.
unique_ids = MIMIC_data['PatientID'].unique()

random.shuffle(unique_ids)
train_sample = config['train_sample']
train_num = int(len(unique_ids) * train_sample)
train_ids = unique_ids[:train_num]
val_ids = unique_ids[train_num:]

# Create datasets.
# Explicit copies: the following cells transform these frames in place, which must not
# leak back into MIMIC_data / ICV_data.
train_set = MIMIC_data.loc[MIMIC_data['PatientID'].isin(train_ids)].copy()
val_set = MIMIC_data.loc[MIMIC_data['PatientID'].isin(val_ids)].copy()
test_set = ICV_data.copy()

####### ICV SPLIT (alternative, disabled)
# NOTE(review): as written this alternative does not recompute train_ids / val_ids from
# the ICV patient ids; that has to be added before it can be used.
# unique_ids = ICV_data['PatientID'].unique()

# # Create datasets
# train_set = ICV_data.loc[ICV_data['PatientID'].isin(train_ids)]
# val_set = ICV_data.loc[ICV_data['PatientID'].isin(val_ids)]
# test_set = MIMIC_data

# Keep a raw data copy.
# These must be real copies: a plain assignment would only alias the frames above, so the
# capping/normalisation/imputation steps would also change the "raw" data that is saved later.
train_rawdata = train_set.copy()
val_rawdata = val_set.copy()
test_rawdata = test_set.copy()

print(train_rawdata.shape)
print(val_rawdata.shape)
print(test_rawdata.shape)

# Cap values in datasets

In [None]:
# Table with one row per parameter and its allowed range (columns: Parameter, minval, maxval).
caps = pd.read_csv(os.path.join(conf.DATA_DIR, 'capping_values.csv'), sep=',',decimal='.')
pd.reset_option('mode.chained_assignment')
with pd.option_context('mode.chained_assignment', None):
    for i in caps.index:
        param = caps.loc[i,'Parameter']
        maxval = caps.loc[i,'maxval']
        minval = caps.loc[i,'minval']
        print(param,minval,maxval)
        # Clamp every dataset to [minval, maxval].
        # A single .loc[mask, column] assignment writes into the frame itself; the chained
        # form frame[column][mask] = value is not guaranteed to write through.
        for frame in (train_set, val_set, test_set):
            frame.loc[frame[param] >= maxval, param] = maxval
            frame.loc[frame[param] <= minval, param] = minval

# Transform data

### as Raghu et al 2017:
    binary_fields = ['gender','mechvent','re_admission']
    
    norm_fields= ['age','Weight_kg','GCS','HR','SysBP','MeanBP','DiaBP','RR','Temp_C','FiO2_1',
        'Potassium','Sodium','Chloride','Glucose','Magnesium','Calcium',
        'Hb','WBC_count','Platelets_count','PTT','PT','Arterial_pH','paO2','paCO2',
        'Arterial_BE','HCO3','Arterial_lactate','SOFA','SIRS','Shock_Index',
        'PaO2_FiO2','cumulated_balance_tev', 'elixhauser', 'Albumin', u'CO2_mEqL', 'Ionised_Ca']
        
    log_fields = ['max_dose_vaso','SpO2','BUN','Creatinine','SGOT','SGPT','Total_bili','INR',
                  'input_total_tev','input_4hourly_tev','output_total','output_4hourly', 'bloc']

In [None]:
# Columns that are only shifted by -0.5 in the binary normalisation step.
binary_fields = ['Gender', 'Ventilator']

# Columns that are standardised directly (mean / standard deviation of the training set).
norm_fields = [
    'Age', 'Weight', 'HeartRate', 'SYS', 'MAP', 'DIA', 'RespRate', 'Temp', 'FiO2',
    'Kalium', 'Natrium', 'Chloride', 'Glucose', 'Magnesium', 'Calcium', 'ANION_GAP',
    'HB', 'LEU', 'Trombo', 'APTT', 'Art_PH', 'PaO2', 'PaCO2', 'Height',
    'Art_BE', 'Bicarbonaat', 'Lactate', 'Sofa_score', 'Sirs_score', 'Shock_Index',
    'PF_ratio', 'Albumine', 'Ion_Ca',
]

# Columns that are log-transformed before being standardised.
log_fields = [
    'max_VP_prev', 'SpO2', 'Ureum', 'Creat', 'ALAT', 'ASAT', 'Bili', 'INR',
    'Running_total_IV', 'total_IV_prev', 'Running_total_UP', 'total_UP',
]

# Columns that are not part of any of the three feature groups above.
not_used = [
    'PatientID', 'interval_start_time', 'interval_end_time', 'Reward', 'Discharge',
    'discrete_action', 'discrete_action_original', 'total_IV', 'max_VP',
]

# Sanity check on column counts: ICV columns minus the unused ones should equal the
# number of listed feature columns (this compares counts only, not the names).
n_feature_columns = len(ICV_data.columns) - len(not_used)
n_listed_fields = len(binary_fields) + len(norm_fields) + len(log_fields)
print("All collumns accounted for excluding: " + str(not_used) + " == " + str(n_feature_columns == n_listed_fields))

In [None]:
# Shift the binary columns by -0.5 in all three datasets
# (presumably they are 0/1 coded, which would centre them around zero - TODO confirm).
pd.reset_option('mode.chained_assignment')
with pd.option_context('mode.chained_assignment', None):
    for frame in (train_set, val_set, test_set):
        frame[binary_fields] = frame[binary_fields] - 0.5
    print("done")

In [None]:
# Standardise the normally distributed fields.
# Mean and standard deviation come from the training set only and are then applied
# unchanged to the validation and test sets.
pd.reset_option('mode.chained_assignment')
with pd.option_context('mode.chained_assignment', None):
    for item in norm_fields:
        train_column = train_set[item]
        av, std = train_column.mean(), train_column.std()
        for frame in (train_set, val_set, test_set):
            frame[item] = (frame[item] - av) / std
        print(item,av.round(3),std.round(3))

In [None]:
pd.reset_option('mode.chained_assignment')
with pd.option_context('mode.chained_assignment', None):
    # Log-transform the log-normal fields first; the 0.1 offset keeps zero values finite.
    for frame in (train_set, val_set, test_set):
        frame[log_fields] = np.log(0.1 + frame[log_fields])

    # Then standardise with the statistics of the (log-transformed) training set.
    for item in log_fields:
        train_column = train_set[item]
        av, std = train_column.mean(), train_column.std()
        for frame in (train_set, val_set, test_set):
            frame[item] = (frame[item] - av) / std
        print(item,av.round(3),std.round(3))

In [None]:
# All feature columns: binary, normal and log-normal fields, in that order.
scalable_fields = binary_fields + norm_fields + log_fields

# Min-max normalisation: the range is taken from the training set (ignoring NaN)
# and applied to all three datasets.
pd.reset_option('mode.chained_assignment')
with pd.option_context('mode.chained_assignment', None):
    for col in scalable_fields:
        minimum = np.nanmin(train_set[col])
        maximum = np.nanmax(train_set[col])
        print(col,minimum,maximum)
        value_range = maximum - minimum
        for frame in (train_set, val_set, test_set):
            frame[col] = (frame[col] - minimum) / value_range

# Impute data

### Training

In [None]:
# Impute missing feature values per patient: KNN (k=3) is fitted on the rows of one
# patient at a time and the filled values are written back into train_set.
impute_columns = binary_fields+norm_fields+log_fields
train_unique_ids = train_set['PatientID'].unique()

# Snapshot for before/after comparison. It has to be a copy: head() alone may share
# memory with train_set, and train_set is overwritten in place below.
train_before_impute = train_set.head(30).copy()

pd.reset_option('mode.chained_assignment')
with pd.option_context('mode.chained_assignment', None):
    for unique_id in train_unique_ids:
        patient_rows = train_set['PatientID']==unique_id
        X_incomplete = train_set.loc[patient_rows, impute_columns]
        train_set.loc[patient_rows, impute_columns] = KNN(k=3,verbose=False).fit_transform(X_incomplete) # XX_filled_knn
print("done")

train_set_after_impute = train_set.head(30).copy()
# Alias matching the val_after_impute / test_after_impute naming of the sibling cells.
train_after_impute = train_set_after_impute

### Validation

In [None]:
# Impute missing feature values per patient: KNN (k=3) is fitted on the rows of one
# patient at a time and the filled values are written back into val_set.
impute_columns = binary_fields+norm_fields+log_fields
val_unique_ids = val_set['PatientID'].unique()

# Snapshot for before/after comparison. It has to be a copy: head() alone may share
# memory with val_set, and val_set is overwritten in place below.
val_before_impute = val_set.head(30).copy()

pd.reset_option('mode.chained_assignment')
with pd.option_context('mode.chained_assignment', None):
    for unique_id in val_unique_ids:
        patient_rows = val_set['PatientID']==unique_id
        X_incomplete = val_set.loc[patient_rows, impute_columns]
        val_set.loc[patient_rows, impute_columns] = KNN(k=3,verbose=False).fit_transform(X_incomplete) # XX_filled_knn
print("done")

val_after_impute = val_set.head(30).copy()

### Test

In [None]:
# Impute missing feature values per patient: KNN (k=3) is fitted on the rows of one
# patient at a time and the filled values are written back into test_set.
impute_columns = binary_fields+norm_fields+log_fields
test_unique_ids = test_set['PatientID'].unique()

# Snapshot for before/after comparison. It has to be a copy: head() alone may share
# memory with test_set, and test_set is overwritten in place below.
test_before_impute = test_set.head(30).copy()

pd.reset_option('mode.chained_assignment')
with pd.option_context('mode.chained_assignment', None):
    for unique_id in test_unique_ids:
        patient_rows = test_set['PatientID']==unique_id
        X_incomplete = test_set.loc[patient_rows, impute_columns]
        test_set.loc[patient_rows, impute_columns] = KNN(k=3,verbose=False).fit_transform(X_incomplete) # XX_filled_knn
print("done")

test_after_impute = test_set.head(30).copy()

# Add row IDs

In [None]:
def _add_row_ids(frame):
    """Re-index `frame` 0..n-1 in place and add the columns `row_id` and `row_id_next`.

    `row_id` is the new positional index. `row_id_next` is the row id of the following
    row of the same patient; the last row of each patient points to itself.

    The comparison uses shift(-1), i.e. the PatientID of the *following* row. Comparing
    with the previous row (shift(+1)) would make the first row of every patient point to
    itself and the last row of every patient point into the next patient's first row.

    Assumes the rows of one patient are contiguous and in time order - TODO confirm
    against the CSV export.
    """
    frame.index = pd.RangeIndex(len(frame.index))
    frame['row_id'] = frame.index
    following_patient = frame['PatientID'].shift(-1)   # NaN on the very last row
    is_last_row_of_patient = following_patient != frame['PatientID']
    frame['row_id_next'] = np.where(is_last_row_of_patient,
                                    frame['row_id'],
                                    frame['row_id'] + 1).astype(int)


pd.reset_option('mode.chained_assignment')
with pd.option_context('mode.chained_assignment', None):
    for frame in (train_set, val_set, test_set):
        _add_row_ids(frame)
print("done")

# Convert to dictionary

In [None]:
# Feature matrices: select the feature columns and vectorise them row by row.
feature_names = binary_fields+norm_fields+log_fields
feature_df_train = train_set[feature_names]
feature_df_val = val_set[feature_names]
feature_df_test = test_set[feature_names]

v = DictVectorizer(sparse = False)
feature_dict_train = feature_df_train.to_dict('records')
feature_dict_val = feature_df_val.to_dict('records')
feature_dict_test = feature_df_test.to_dict('records')

print(np.sort(np.array(feature_df_train.columns)))

# The vectoriser is fitted on the training records only and re-used for val/test.
X_train = v.fit_transform(feature_dict_train)
X_val = v.transform(feature_dict_val)
X_test = v.transform(feature_dict_test)

reward_train = train_set.Reward.values
reward_val = val_set.Reward.values
reward_test = test_set.Reward.values

action_train = train_set.discrete_action.values
action_val = val_set.discrete_action.values
action_test = test_set.discrete_action.values

# Row ids as plain Python ints.
state_row_id_train       = train_set.row_id.astype(int).tolist()
next_state_row_id_train  = train_set.row_id_next.astype(int).tolist()

state_row_id_val         = val_set.row_id.astype(int).tolist()
next_state_row_id_val    = val_set.row_id_next.astype(int).tolist()

state_row_id_test        = test_set.row_id.astype(int).tolist()
next_state_row_id_test   = test_set.row_id_next.astype(int).tolist()

output_dict = {'train' : {
                    'X' : X_train,
                    'action' : action_train,
                    'reward' : reward_train,
                    'state_id' : state_row_id_train,
                    'next_state_id' : next_state_row_id_train
                },
                'val' : {
                    'X' : X_val,
                    'action' : action_val,
                    'reward' : reward_val,
                    'state_id' : state_row_id_val,
                    'next_state_id' : next_state_row_id_val
                },
              'test' : {
                    'X' : X_test,
                    'action' : action_test,
                    'reward' : reward_test,
                    'state_id' : state_row_id_test,
                    'next_state_id' : next_state_row_id_test
                },
               'v' : v,
               # Sorted feature names. Taken from feature_names (the keys of every record)
               # instead of indexing record 1, which fails when there are fewer than two rows.
               'featurenames': np.sort(np.array(feature_names))
         }

print(len(feature_dict_train))
print(len(output_dict['train']['next_state_id']))
print(len(output_dict['val']['next_state_id']))
print(len(output_dict['test']['next_state_id']))

# Save to Pickle and csv

In [None]:
# CSV exports, written in this order: first the raw data, then the processed data.
csv_outputs = [
    ('data/train_rawdata.csv', train_rawdata),
    ('data/val_rawdata.csv', val_rawdata),
    ('data/test_rawdata.csv', test_rawdata),
    ('data/train_data.csv', train_set),
    ('data/val_data.csv', val_set),
    ('data/test_data.csv', test_set),
]
for relative_path, frame in csv_outputs:
    frame.to_csv(os.path.join(exp_dir, relative_path), index=False)

# Pickle with the feature matrices, actions, rewards and row ids for modelling.
pickle_path = os.path.join(exp_dir, 'data/FINAL_data_dict.pkl')
joblib.dump(output_dict, pickle_path)

print("\nFinished at: " + str(datetime.now()) + "\n")

# CHECK PICKLE DIMENSIONS

In [None]:
# Compare the number of training states in a previous pickle (if one exists) with the
# pickle that was just written. Both files are local outputs of this notebook.
old_pickle_path = os.path.join(exp_dir, 'data/FINAL_data_dict_old.pkl')
new_pickle_path = os.path.join(exp_dir, 'data/FINAL_data_dict.pkl')

# The old pickle is optional: on a fresh experiment directory it does not exist, and
# loading it unconditionally would stop a full notebook run with FileNotFoundError.
if os.path.exists(old_pickle_path):
    old = len(joblib.load(old_pickle_path)['train']['state_id'])
else:
    old = None

data_dict = joblib.load(new_pickle_path)
new = len(data_dict['train']['state_id'])
print("old = {0}\nnew = {1}".format(old,new))