In [None]:
############################################################################
### import libraries
import os
import platform
import copy
import sys
import pyodbc
import pymssql
import pandas as pd
import numpy as np
import functools
import sklearn as sk
import joblib
from fancyimpute import KNN    
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction import DictVectorizer
from functools import reduce
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

################################################################################################################
################################################################################################################
# automatically reload python files (util.py and conf.py) when they are changed.
%reload_ext autoreload
%autoreload 2

# import from parent directory with a little help from sys.path.insert()
sys.path.insert(0, '..') 

### from util.py (file which once contained all classes and functions):
from util import * 

### Configuration file to determine root directory 
import conf

# Set the working directory to the 'SEPSIS' folder below the root directory
# defined in the configuration file.
os.chdir(os.path.join(conf.ROOT_DIR, 'SEPSIS'))

# Define the subfolder paths.
# Both backslashes are escaped explicitly: '\d' is not a valid escape sequence
# and newer Python versions emit a warning for it. The string value itself
# (backslash + 'data' + backslash) is unchanged.
data_path = '\\data\\'

############################################################################
# Settings for Pandas to display more than the default number of columns
pd.set_option("display.max_columns",150)

### Check everything
# NOTE(review): presumably prints interpreter/package information; the helper
# lives in conf.py and is not visible here.
conf.print_python_environment()

# Import data from MIMIC and ICV

        # This notebook reads in the input data and then preprocesses the model features
        # Firstly, patients without sufficient data are dropped
        # Secondly, data is split into train/validation/test
        # Then, training, validation and test sets are split - FINAL SETUP: Train = 70% of MIMIC, Validation = 30% of MIMIC, Test = 100% of ICV
        # Finally, relevant binary features and normally/log-normally features are standardised accordingly
        # Resulting datasets are saved together in a pickle file that has no patient identifying information and can be uploaded to GitHub for transparency's sake

In [None]:
# Import the comma-separated csv files from the configured data directory.
ICV_data = pd.read_csv(os.path.join(conf.DATA_DIR, 'ICV_data.csv'), sep=',')
MIMIC_data = pd.read_csv(os.path.join(conf.DATA_DIR, 'MIMIC_MVdata.csv'), sep=',')

# Check that the MIMIC and ICV datasets have the same columns (names and order)
# and the same dtype for every column.
# - `dtypes.index` is just the column index, so the dtypes themselves have to be
#   compared explicitly.
# - `Index.equals` returns False when the number of columns differs, whereas an
#   element-wise `==` between two indexes of different length raises.
same_columns = MIMIC_data.columns.equals(ICV_data.columns)
# Only compare dtypes when the columns line up; otherwise the pairing is meaningless.
same_dtypes = same_columns and MIMIC_data.dtypes.tolist() == ICV_data.dtypes.tolist()
same_columns and same_dtypes

## Fix action space (remove empty actions "5,10,15,20")

In [None]:
# Lookup table for the compacted action space: the position in this list is the
# new action id, the value stored there is the old action id. The old ids
# 5, 10, 15 and 20 are left out, so the 25 old ids collapse to 21 new ones.
unused_actions = (5, 10, 15, 20)
mapping = [action for action in range(25) if action not in unused_actions]

# Preview only (no data is modified here): for the ICV dataset first and the
# MIMIC dataset second, print the action ids that occur, followed by the ids
# they would get after remapping.
for dataset in (ICV_data, MIMIC_data):
    observed_actions = dataset['discrete_action']
    print(sorted(observed_actions.unique()))
    remapped_actions = observed_actions.apply(lambda action: mapping.index(action))
    print(sorted(remapped_actions.unique()))

In [None]:
# Remap `discrete_action` to the compacted action space while keeping the
# original ids in `discrete_action_original`.
#
# The cell is safe to re-run: the backup column is created only once, and the
# remapped column is always derived from that backup. Re-applying the mapping
# to already-remapped ids would otherwise overwrite the backup and either
# shift the ids a second time or raise a ValueError.
for dataset in (MIMIC_data, ICV_data):
    # keep original action space (only on the first run)
    if 'discrete_action_original' not in dataset.columns:
        dataset['discrete_action_original'] = dataset['discrete_action']
    # apply mapping: new id = position of the original id in `mapping`
    dataset['discrete_action'] = dataset['discrete_action_original'].apply(lambda x: mapping.index(x))

## ICV: Exclude patients with insufficient data

In [None]:
# Add a new column to the ICV dataset holding, per row, the % of missing values.
# NaNs are counted over all columns of the row and divided by 47
# (not 54, because we only have 47 'real' features).
ICV_data['ICV_percent_missing'] = ICV_data.isna().sum(axis=1)/47*100
total_patients = len(ICV_data.PatientID.unique())
print("total amount of patients: " + str(total_patients))

# MINIMAL % OF MISSING DATA PATIENTS
# For every patient keep the row with the lowest % missing; if even that row is
# above 20%, every row of the patient is above 20%.
min_percent_missing_ptlist = ICV_data.loc[ICV_data.groupby('PatientID')['ICV_percent_missing'].idxmin()][['PatientID','ICV_percent_missing']]
list_of_min_percentage_missing_patientIDs = min_percent_missing_ptlist[min_percent_missing_ptlist['ICV_percent_missing']>20].PatientID.tolist()
print("Unique patients to be excluded because they always (any time period) have >20% missing data: " + str(len(list_of_min_percentage_missing_patientIDs)))

# MEAN % OF MISSING DATA PATIENTS
# Series indexed by PatientID with the mean % missing over all rows of the patient.
mean_percent_missing_ptlist = ICV_data.groupby('PatientID')['ICV_percent_missing'].mean()
list_of_mean_percentage_missing_patientIDs = mean_percent_missing_ptlist.index[mean_percent_missing_ptlist>20].tolist()
print("Amount of patients to exclude based on mean percentage (20%) of missing data: " + str(len(list_of_mean_percentage_missing_patientIDs)))

# total amount of patients to exclude:
# The concatenated list may contain a patient twice (once per criterion), so the
# number of excluded patients is counted on the data, not on the list.
exclude_ptid_list = list_of_min_percentage_missing_patientIDs+list_of_mean_percentage_missing_patientIDs
is_excluded = ICV_data['PatientID'].isin(exclude_ptid_list)
# No `~` on the ID array: bitwise NOT does not change its length and fails for
# non-integer IDs.
n_excluded = ICV_data.loc[is_excluded, 'PatientID'].nunique()
print("total amount of patients to be exclude based on MIN and MEAN missing data >20%: " + str(n_excluded))

# plot % of missing data (both histograms explicitly on the same axes)
ax = mean_percent_missing_ptlist.plot.hist(bins=50,ylim=(0,1000),color='b', alpha=0.5) # blue = mean
min_percent_missing_ptlist.ICV_percent_missing.plot.hist(ax=ax,bins=50,ylim=(0,1000),color='r', alpha=0.5) # red = min
ax.set_xlabel('% missing data per patient (blue = mean, red = min)')

# FINAL DATASET ICV
print("Final ICV patient count: " + str(total_patients - n_excluded))
filt_ICV = ICV_data[~is_excluded]
filt_ICV.head()

## MIMIC: Exclude patients with insufficient data

In [None]:
# Add a new column to the MIMIC dataset holding, per row, the % of missing values.
# NaNs are counted over all columns of the row and divided by 47
# (not 54, because we only have 47 'real' features).
MIMIC_data['MIMIC_percent_missing'] = MIMIC_data.isna().sum(axis=1)/47*100
total_patients = len(MIMIC_data.PatientID.unique())
print("total amount of patients: " + str(total_patients))

# MINIMAL % OF MISSING DATA PATIENTS
# For every patient keep the row with the lowest % missing; if even that row is
# above 20%, every row of the patient is above 20%.
min_percent_missing_ptlist = MIMIC_data.loc[MIMIC_data.groupby('PatientID')['MIMIC_percent_missing'].idxmin()][['PatientID','MIMIC_percent_missing']]
list_of_min_percentage_missing_patientIDs = min_percent_missing_ptlist[min_percent_missing_ptlist['MIMIC_percent_missing']>20].PatientID.tolist()
print("Amount of excluded patients based on minimum percentage (20%) of missing data: " + str(len(list_of_min_percentage_missing_patientIDs)))


# MEAN % OF MISSING DATA PATIENTS
# Series indexed by PatientID with the mean % missing over all rows of the patient.
# NOTE(review): the mean threshold is 75% here while the minimum threshold is 20%;
# confirm that this difference is intended.
mean_percent_missing_ptlist = MIMIC_data.groupby('PatientID')['MIMIC_percent_missing'].mean()
# The groupby index already holds each PatientID once, so no de-duplication is needed.
list_of_mean_percentage_missing_patientIDs = mean_percent_missing_ptlist.index[mean_percent_missing_ptlist>75].tolist()
print("Amount of excluded patients based on mean percentage (75%) of missing data: " + str(len(list_of_mean_percentage_missing_patientIDs)))

# FULL SET OF PATIENTS TO EXCLUDE:
# The concatenated list may contain a patient twice (once per criterion), so the
# number of excluded patients is counted on the data, not on the list.
exclude_ptid_list = list_of_min_percentage_missing_patientIDs+list_of_mean_percentage_missing_patientIDs
is_excluded = MIMIC_data['PatientID'].isin(exclude_ptid_list)
# No `~` on the ID array: bitwise NOT does not change its length and fails for
# non-integer IDs.
n_excluded = MIMIC_data.loc[is_excluded, 'PatientID'].nunique()
# The message states the thresholds that are actually applied above (20% / 75%).
print("total amount of patients to be exclude based on MIN (red) >20% and MEAN (blue) >75% missing data: " + str(n_excluded))

# plot % of missing data (both histograms explicitly on the same axes)
ax = mean_percent_missing_ptlist.plot.hist(bins=50,ylim=(0,1500),color='b', alpha=0.5) # blue = mean
min_percent_missing_ptlist.MIMIC_percent_missing.plot.hist(ax=ax,bins=50,ylim=(0,1500),color='r', alpha=0.5) # red = min
ax.set_xlabel('% missing data per patient (blue = mean, red = min)')

# FINAL DATASET MIMIC
print("Final MIMIC patient count: " + str(total_patients - n_excluded))
filt_MIMIC = MIMIC_data[~is_excluded]
filt_MIMIC.head()

## Save final ICV and MIMIC datasets

In [None]:
# Remove the helper columns that were only needed for the exclusion step and
# write the filtered datasets to csv files (without the row index).
# errors='ignore' keeps the cell re-runnable: after the first run the names
# refer to frames that no longer contain the helper column, and a plain drop
# would raise a KeyError.
filt_MIMIC = filt_MIMIC.drop(columns=['MIMIC_percent_missing'], errors='ignore')
filt_MIMIC.to_csv(os.path.join(conf.DATA_DIR, 'final_MIMIC.csv'), index=False)
filt_ICV = filt_ICV.drop(columns=['ICV_percent_missing'], errors='ignore')
filt_ICV.to_csv(os.path.join(conf.DATA_DIR, 'final_ICV.csv'), index=False)