In [None]:
##############################################################################$#################################
################################################################################################################
### import generally useful libraries
import os
import platform
import copy
import sys
import inspect
import time
import collections
import math
import random
import joblib
from datetime import datetime
from tqdm import tqdm

### import specific libraries for this project
import pickle as pkl
import pandas as pd
import numpy as np

### import visualisation libraries
import matplotlib.patches as mpatches
import seaborn as sns
import matplotlib.pyplot as plt

################################################################################################################
################################################################################################################
# import from parent directory with a little help from sys.path.insert()
sys.path.insert(0, '..') 

### from util.py (file which once contained all classes and functions):
%reload_ext autoreload
%autoreload 2
from util import * # automatically reload python (e.g. util.py) file when they are changed.

### Configuration file to determine root directory 
import conf

### check for GPUs
# NOTE(review): `torch` is not imported explicitly in this notebook; presumably it is
# re-exported by `from util import *` -- confirm, otherwise this line raises NameError.
# `use_gpu` is not referenced again in the visible part of this notebook.
use_gpu = torch.cuda.is_available()

### Check everything
# Helper defined in the project's `conf` module (not visible here); judging by its
# name it prints details of the Python environment -- TODO confirm.
conf.print_python_environment()

# Select experiment

In [None]:
### Experiment name
# Name of the sub-directory of conf.EXP_DIR that holds this experiment; the
# "Load data" cell joins it onto conf.EXP_DIR and raises if that path is missing.
exp_name = 'FINAL'

# Load data

In [None]:
################################################################################################################
################################################################################################################
# Resolve the experiment directory and load the pre-built data dictionary.
# Specific exception types are raised (both are subclasses of Exception) and the
# original error is chained, so the real cause stays visible in the traceback.
exp_dir = os.path.join(conf.EXP_DIR, exp_name)
if not os.path.isdir(exp_dir):
    raise FileNotFoundError('Cannot find experiment directory, run create_exp_dataset prior to running this file')

############################################
# load data files
# NOTE(review): the file name hard-codes 'FINAL' instead of using exp_name -- confirm
# whether other experiments are expected to use a different file name.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code; only
# load data files that were produced by a trusted run of create_exp_dataset.
data_dict_path = os.path.join(exp_dir, 'data/FINAL_data_dict.pkl')
try:
    data_dict = joblib.load(data_dict_path)
except Exception as err:
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
    raise RuntimeError('Cannot load dataset, run the create_exp_dataset Notebook to create new data pickle files ') from err


# Print the two-level key structure of the data dictionary. The top-level entries
# 'v' and 'featurenames' are skipped ('featurenames' is used below as a flat list
# of column names; 'v' is presumably not a nested mapping either -- TODO confirm).
print("Visual inspection of data dictionary structure:")
for k, v in data_dict.items():
    if k in ('v', 'featurenames'):
        continue
    for k1, _ in v.items():
        print(k, k1)

# Get model-ready data: the 'train' split is bound to the mimic_* names and the
# 'test' split to the icv_* names.
mimic_normalised = data_dict['train']['X']
icv_normalised = data_dict['test']['X']
mimic_n = pd.DataFrame(mimic_normalised, columns=data_dict['featurenames'])
icv_n = pd.DataFrame(icv_normalised, columns=data_dict['featurenames'])

# Get feature names
features = data_dict['featurenames']
print(features)

# import ORIGINAL csv files
ICV_data = pd.read_csv(os.path.join(conf.DATA_DIR, 'final_ICV.csv'), sep=',')
MIMIC_data = pd.read_csv(os.path.join(conf.DATA_DIR, 'final_MIMIC.csv'), sep=',')

# .copy() turns the column selections into independent frames, so adding the
# 'dataset' column below does not raise pandas' SettingWithCopyWarning.
icv = ICV_data[features].copy()
mimic = MIMIC_data[features].copy()

# Tag every row with its source and stack both datasets (MIMIC rows first).
mimic['dataset'] = 'mimic'
icv['dataset'] = 'icv'
data = pd.concat([mimic, icv])

def ignore_outliers(a, p=0.01):
    """Return the non-missing values of `a` strictly between its p and 1-p quantiles.

    Parameters
    ----------
    a : pandas.Series
        Values to trim. Missing values are dropped before the quantiles are taken.
    p : float, default 0.01
        Quantile level of the lower bound; the upper bound is taken at 1 - p.

    Returns
    -------
    pandas.Series
        The values v with quantile(p) < v < quantile(1 - p). Both comparisons are
        strict, so values equal to either bound are removed as well.

    Raises
    ------
    AssertionError
        If the lower bound is not greater than -inf or the upper bound is not
        smaller than +inf.
    """
    a = a.dropna()
    lower = np.quantile(a, p)
    upper = np.quantile(a, 1 - p)
    # np.inf instead of np.Infinity: that alias was removed in NumPy 2.0.
    assert lower > -np.inf
    assert upper < np.inf
    return a[(a > lower) & (a < upper)]

# state space features dimension
# (number of columns of the 'train' matrix; being the last expression of the cell,
# its value is shown as the cell output)
data_dict['train']['X'].shape[1]

# Mortality in datasets

In [None]:
# import ORIGINAL csv files (each file is read exactly once)
ICV_data = pd.read_csv(os.path.join(conf.DATA_DIR, 'final_ICV.csv'), sep=',')
icv = ICV_data[features]
MIMIC_data = pd.read_csv(os.path.join(conf.DATA_DIR, 'final_MIMIC.csv'), sep=',')
mimic = MIMIC_data[features]

# Number of distinct PatientID values per dataset: MIMIC first, then ICV.
# dropna=False counts a missing PatientID as a value of its own, exactly like
# len(series.unique()) does.
print(MIMIC_data['PatientID'].nunique(dropna=False))
print(ICV_data['PatientID'].nunique(dropna=False))

In [None]:
# ICV MORTALITY
# Distinct patients with at least one row where Discharge > 0, followed by
# distinct patients with at least one row where Discharge == 0.
print(ICV_data.loc[ICV_data['Discharge'] > 0, 'PatientID'].nunique(dropna=False))
print(ICV_data.loc[ICV_data['Discharge'] == 0, 'PatientID'].nunique(dropna=False))
# NOTE(review): the percentage below uses hard-coded figures, presumably copied from
# an earlier run's output -- confirm they still match the counts for the current data.
927/4047*100

In [None]:
# MIMIC MORTALITY
# Distinct patients with at least one row where Discharge > 0, followed by
# distinct patients with at least one row where Discharge == 0.
print(MIMIC_data.loc[MIMIC_data['Discharge'] > 0, 'PatientID'].nunique(dropna=False))
print(MIMIC_data.loc[MIMIC_data['Discharge'] == 0, 'PatientID'].nunique(dropna=False))
# NOTE(review): the percentage below uses hard-coded figures, presumably copied from
# an earlier run's output -- confirm they still match the counts for the current data.
931/7320*100

# Feature inspection

In [None]:
# Get model-ready data: the 'train' split is bound to the mimic_* names and the
# 'test' split to the icv_* names.
mimic_normalised = data_dict['train']['X']
icv_normalised = data_dict['test']['X']
mimic_n = pd.DataFrame(mimic_normalised, columns=data_dict['featurenames'])
icv_n = pd.DataFrame(icv_normalised, columns=data_dict['featurenames'])

# Get feature names
features = data_dict['featurenames']
print(features)

# import ORIGINAL csv files
# `mimic` and `icv` are rebuilt from the CSV files here, so the in-place
# transformations in the loop further down this cell start from the raw values
# every time the cell is run.
ICV_data = pd.read_csv(os.path.join(conf.DATA_DIR, 'final_ICV.csv'), sep=',')
MIMIC_data = pd.read_csv(os.path.join(conf.DATA_DIR, 'final_MIMIC.csv'), sep=',')

# .copy() turns the column selections into independent frames, so the column
# assignments below (and in the loop) do not raise SettingWithCopyWarning.
icv = ICV_data[features].copy()
mimic = MIMIC_data[features].copy()

# Tag every row with its source and stack both datasets (MIMIC rows first).
mimic['dataset'] = 'mimic'
icv['dataset'] = 'icv'
data = pd.concat([mimic, icv])

# Duplicate of the ignore_outliers defined in the "Load data" cell; this later
# definition is the one in effect for the plotting loop below.
def ignore_outliers(a, p=0.01):
    """Return the non-missing values of `a` strictly between its p and 1-p quantiles.

    Parameters
    ----------
    a : pandas.Series
        Values to trim. Missing values are dropped before the quantiles are taken.
    p : float, default 0.01
        Quantile level of the lower bound; the upper bound is taken at 1 - p.

    Returns
    -------
    pandas.Series
        The values v with quantile(p) < v < quantile(1 - p). Both comparisons are
        strict, so values equal to either bound are removed as well.

    Raises
    ------
    AssertionError
        If the lower bound is not greater than -inf or the upper bound is not
        smaller than +inf.
    """
    a = a.dropna()
    lower = np.quantile(a, p)
    upper = np.quantile(a, 1 - p)
    # np.inf instead of np.Infinity: that alias was removed in NumPy 2.0.
    assert lower > -np.inf
    assert upper < np.inf
    return a[(a > lower) & (a < upper)]

# Features whose distributions are plotted by the loop below.
# NOTE: this rebinds `features`, replacing the list taken from
# data_dict['featurenames'] earlier in this cell.
features = ['ALAT', 'ASAT', 'Bili', 'APTT', 'Age', 'Albumine',
            'Bicarbonaat', 'Calcium', 'Chloride', 'Creat', 'DIA', 
            'Glucose', 'HB', 'HeartRate', 'INR', 'Kalium', 'LEU',
            'Lactate', 'MAP', 'Magnesium', 'Natrium', 'PaCO2', 'PaO2',
            'RespRate', 'SYS', 'Weight']

# Fields that the loop below shifts by -0.5. Neither name occurs in `features`
# above, so that branch is not reached for the current selection.
binary_fields = ['Gender','Ventilator']

# Fields that the loop below standardises per dataset: (x - mean) / std.
norm_fields= ['Age','Weight','HeartRate','SYS','MAP','DIA','RespRate','Temp','FiO2',
    'Kalium','Natrium','Chloride','Glucose','Magnesium','Calcium','ANION_GAP',
    'HB','LEU','Trombo','APTT','Art_PH','PaO2','PaCO2','Height',
    'Art_BE','Bicarbonaat','Lactate','Sofa_score','Sirs_score','Shock_Index',
    'PF_ratio','Albumine', 'Ion_Ca']

# Fields that the loop below first maps through log(0.1 + x) and then standardises
# per dataset.
log_fields = ['max_VP_prev','SpO2','Ureum','Creat','ALAT','ASAT','Bili','INR',
              'Running_total_IV','total_IV_prev','Running_total_UP','total_UP']

def _standardise(column):
    """Return `column` minus its mean, divided by its standard deviation."""
    return (column - column.mean()) / column.std()


def _min_max_scale(column):
    """Return `column` rescaled linearly using its NaN-ignoring minimum and maximum."""
    lowest = np.nanmin(column)
    highest = np.nanmax(column)
    return (column - lowest) / (highest - lowest)


# For every selected feature: snapshot the trimmed distribution of both datasets at
# three stages (raw, normalised, min-max normalised) and draw them side by side.
# The columns of `mimic` and `icv` are overwritten in place at each stage.
for feature in features:

    # Stage 1: raw values
    raw_icv = ignore_outliers(icv[feature], 0.01)
    raw_mimic = ignore_outliers(mimic[feature], 0.01)

    # Stage 2: normalisation chosen by the field list the feature belongs to
    # (checked in the order binary, gaussian, log; features in none stay as they are)
    if feature in binary_fields:
        mimic[feature] = mimic[feature] - 0.5
        icv[feature] = icv[feature] - 0.5
    elif feature in norm_fields:
        mimic[feature] = _standardise(mimic[feature])
        icv[feature] = _standardise(icv[feature])
    elif feature in log_fields:
        mimic[feature] = _standardise(np.log(0.1 + mimic[feature]))
        icv[feature] = _standardise(np.log(0.1 + icv[feature]))

    normalised_icv = ignore_outliers(icv[feature], 0.01)
    normalised_mimic = ignore_outliers(mimic[feature], 0.01)

    # Stage 3: min-max normalisation, each dataset scaled by its own extremes
    mimic[feature] = _min_max_scale(mimic[feature])
    icv[feature] = _min_max_scale(icv[feature])

    scaled_icv = ignore_outliers(icv[feature], 0.01)
    scaled_mimic = ignore_outliers(mimic[feature], 0.01)

    # One figure per feature with one panel per stage; AmsterdamUMCdb is drawn
    # first and MIMIC on top of it.
    panels = (
        ("raw {}", raw_icv, raw_mimic),
        ("Normalised {}", normalised_icv, normalised_mimic),
        ("min max normalised {}", scaled_icv, scaled_mimic),
    )
    plt.figure(figsize=(18, 4))
    for position, (title_template, icv_values, mimic_values) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        plt.hist(icv_values, label='AmsterdamUMCdb', bins=100, alpha=0.5, density=True)
        plt.hist(mimic_values, label='MIMIC', bins=100, alpha=0.5, density=True)
        plt.title(title_template.format(feature))
        plt.legend()
    plt.show()

# Whole dataset exploratory analysis

In [None]:
# Figures directory
FIG_DIR = os.path.join(conf.ROOT_DIR, 'SEPSIS', 'figures')
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(FIG_DIR, exist_ok=True)


def _sofa_at_interval(frame, n):
    """Return the Sofa_score of every patient's n-th (0-based) time interval.

    Rows are ordered by PatientID and then by interval_start_time with a single
    sort, instead of sorting every group inside groupby.apply: applying over the
    grouping column is deprecated in pandas 2.2, and once that column is excluded
    from the applied frames the second groupby can no longer find 'PatientID'.
    Patients with fewer than n + 1 rows do not appear in the result.
    """
    ordered = frame.sort_values(['PatientID', 'interval_start_time']).reset_index(drop=True)
    return ordered.groupby("PatientID").nth(n).Sofa_score


# data preprocessing
# NOTE(review): MIMIC uses interval index 7 while ICV uses index 1; the reason is not
# visible in this notebook -- confirm the two indices are meant to differ.
mimic_sofa = _sofa_at_interval(MIMIC_data, 7)
icv_sofa = _sofa_at_interval(ICV_data, 1)

# SOFA scores

In [None]:
# Overlaid SOFA-score histograms of both datasets, drawn through the explicit
# figure/axes interface and saved to FIG_DIR.
fig, ax = plt.subplots(figsize=(16, 10))
fig.suptitle('SOFA scores', fontsize=40)

# 24 unit-wide bins centred on the integers 1..24; scores of 0 lie outside the
# histogram range and are therefore not drawn.
shared_hist_options = dict(alpha=0.5, bins=24, range=(0.5, 24.5))
ax.hist(mimic_sofa, color='red', label='MIMIC', **shared_hist_options)
ax.hist(icv_sofa, color='blue', label='AmsterdamUMCdb', **shared_hist_options)

ax.set_xlabel("SOFA score")
ax.set_ylabel("Patient Count")
ax.set_xticks(np.arange(0, 25, step=1))
ax.legend(fontsize='xx-large', frameon=False)
# y slightly above 1 lifts the subtitle a little off the axes
ax.set_title('Degree of dysfunction of six organ systems', fontsize=20, y=1.005)

fig.savefig(os.path.join(FIG_DIR, 'SOFA_datasets.png'), dpi=400, transparent=False)
plt.show()

# TSNE data

In [None]:
# Get TSNE data: restore the cached 2-D embedding if both CSV files can be read,
# otherwise compute it from a random sample of both datasets and cache the result.
TSNE_MIMIC_ROWS = 130000   # maximum number of MIMIC rows fed to t-SNE
TSNE_ICV_ROWS = 59000      # maximum number of ICV rows fed to t-SNE
TSNE_SEED = 0              # fixed seed so the sample and the embedding are repeatable

tsne_mimic_path = os.path.join(conf.DATA_DIR, 'TSNE_MIMIC_data.csv')
tsne_icv_path = os.path.join(conf.DATA_DIR, 'TSNE_ICV_data.csv')

try:
    x = np.array(pd.read_csv(tsne_mimic_path, sep=','))
    y = np.array(pd.read_csv(tsne_icv_path, sep=','))
    print("Restoring old TSNE datasets")
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # Only a missing or unreadable cache triggers the recomputation; any other error
    # (including KeyboardInterrupt) propagates instead of starting an 8+ hour job.
    print("Starting new TSNE analysis, takes 8+ hours, be patient")
    from sklearn.manifold import TSNE

    # Sample rows through shuffled indices. This leaves mimic_normalised and
    # icv_normalised (the same objects as data_dict['train']['X'] and
    # data_dict['test']['X']) untouched, unlike an in-place np.random.shuffle.
    rng = np.random.default_rng(TSNE_SEED)
    mimic_rows = np.asarray(mimic_normalised)
    icv_rows = np.asarray(icv_normalised)
    mimic_sample = mimic_rows[rng.permutation(len(mimic_rows))[:TSNE_MIMIC_ROWS]]
    icv_sample = icv_rows[rng.permutation(len(icv_rows))[:TSNE_ICV_ROWS]]
    both = np.concatenate([mimic_sample, icv_sample])

    # Dimensionality reduction into 2 (X and Y-axis) components.
    # This is going to take 8+ hours, have patience.
    tsne = TSNE(random_state=TSNE_SEED)
    c = tsne.fit_transform(both)

    # Split at the actual MIMIC sample size, so the split stays correct when the
    # dataset has fewer rows than TSNE_MIMIC_ROWS.
    n_mimic = len(mimic_sample)
    mimic_c, icv_c = c[:n_mimic], c[n_mimic:]

    # Save the embedding so this does not need to be run again.
    pd.DataFrame(mimic_c).to_csv(tsne_mimic_path, index=False)
    pd.DataFrame(icv_c).to_csv(tsne_icv_path, index=False)

    x = mimic_c
    y = icv_c
print("done")

## TSNE plot

In [None]:
# TSNE PLOT
# Legend entries come from dedicated patches (alpha 0.5) rather than from the
# scatter artists themselves, which are drawn with alpha 0.05.
legend_handles = [
    mpatches.Patch(color='red', label='MIMIC', alpha=0.5),
    mpatches.Patch(color='blue', label='AmsterdamUMCdb', alpha=0.5),
]

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(x[:, 0], x[:, 1], c='red', alpha=0.05, label='MIMIC')
ax.scatter(y[:, 0], y[:, 1], c='blue', alpha=0.05, label='AmsterdamUMCdb')

# y just below 1 nudges the main title down a little
fig.suptitle('State space distribution', fontsize=40, y=0.99)
ax.set_title('t-Distributed Stochastic Neighbor Embedded State space', fontsize=15)
ax.legend(handles=legend_handles, frameon=False)

fig.savefig(os.path.join(FIG_DIR, 'TSNE_datasets.png'), dpi=400, transparent=False)
plt.show()