In [None]:
################################################################################################################
################################################################################################################
### import generally useful libraries
import os
import platform
import copy
import sys
import inspect
import time
import collections
import math
import random
from datetime import datetime
from multiprocessing import cpu_count

### import specific libraries for this project
import pickle as pkl
import pandas as pd
import numpy as np
import sklearn as sk
import torch
import joblib

### import KNN related libraries
from sklearn.neighbors import NearestNeighbors as KNN
from sklearn.model_selection import train_test_split

################################################################################################################
################################################################################################################
# Prepend the parent directory to sys.path so that modules located there can be
# imported by the statements below.
sys.path.insert(0, '..') 

### from util.py (file which once contained all classes and functions):
# NOTE(review): the star import makes it impossible to tell from this notebook which
# names come from util; consider importing the required names explicitly.
from util import *
# Automatically reload python files (e.g. util.py) when they are changed.
%reload_ext autoreload
%autoreload 2

### Configuration file to determine root directory
import conf

# Set the working directory to the SEPSIS folder below the configured root directory.
# Relative paths used later in the notebook are resolved against this directory.
os.chdir(os.path.join(conf.ROOT_DIR, 'SEPSIS'))

### Check everything
# presumably prints details about the active Python environment - see conf.py to confirm
conf.print_python_environment()

# Select Experiment

In [None]:
################################################################################################################
### Experiment name
exp_name = 'FINAL'

############################################
# define experiment directory; it must already exist (see the error message below)
exp_dir = os.path.join(conf.EXP_DIR, exp_name)
if not os.path.exists(exp_dir):
    # FileNotFoundError is a subclass of Exception, so existing `except Exception` handlers still work.
    raise FileNotFoundError('Cannot find experiment directory, run create_exp_dataset prior to running this file')

############################################
# add a KNN interim data subdirectory if needed
# (exist_ok avoids the check-then-create race of a separate os.path.exists test)
os.makedirs(os.path.join(exp_dir, 'KNN'), exist_ok=True)

############################################
# load dataset
# NOTE: joblib.load unpickles the file, which can execute arbitrary code - only load trusted files.
try:
    data_dict = joblib.load(os.path.join(exp_dir, 'data/FINAL_data_dict.pkl'))
except Exception as err:
    # Catch Exception (not a bare except) so Ctrl-C still interrupts, and chain the
    # original error so the real cause (missing file, corrupt pickle, ...) stays visible.
    raise RuntimeError('Cannot load dataset, rerun create_exp_dataset!') from err

# Print the two-level key structure of the data dictionary for a quick visual check.
print("Visual inspection of data dictionary structure:")
for k, v in data_dict.items():
    if k == 'v':
        # this entry is skipped entirely
        continue
    if k == 'featurenames':
        # the feature names are printed as a whole instead of being iterated
        print(v)
        continue
    for k1, _ in v.items():
        print(k, k1)

############################################
### features:
feature_names = data_dict['featurenames']
print('features: ', feature_names)
print("\nExperiment loaded")

From paper https://arxiv.org/abs/1811.09602:
*We define similarity of patient states using a ‘physiological distance kernel’, which is based on Euclidean distance and
upweights certain informative features of the patient’s state. Informative features were the patient’s:*
- *SOFA score*
- *Lactate levels*
- *fluid output*
- *mean blood pressure (MAP)*
- *diastolic blood pressure*
- *PaO2/FiO2 ratio*
- *chloride levels*
- *weight*
- *age*

In [None]:
########################################################################################
# Upweighted features for KNN distance metric
# NOTE(review): the list quoted from the paper above mentions chloride levels, whereas this
# list contains 'total_IV' and no chloride feature - confirm that this deviation is intended.
upweighted_features = ['Sofa_score', # SOFA score
                       'Lactate',    # Lactate levels
                       'total_UP',   # fluid output of current state
                       'total_IV',   # iv fluid input of current state
                       'MAP',        # mean blood pressure (MAP)
                       'DIA',        # diastolic blood pressure
                       'PF_ratio',   # PaO2/FiO2 ratio
                       'Weight',     # weight
                       'Age',        # age
                      ]

# Check that every upweighted feature is actually present in the dataset
assert set(upweighted_features).issubset(feature_names), "Can't upweight non-existent feature."

# Weight vector aligned with feature_names: 2.0 for upweighted features, 1.0 for all others.
# The builtin `float` replaces `np.float`, an alias deprecated in NumPy 1.20 and removed in 1.24.
feature_weights = 1 + np.array([f in upweighted_features
                                for f in feature_names], dtype=float)

# Set experiment data configuration (after defining feature weights)

In [None]:
################################################################################################################
################################################################################################################
# Settings for the nearest-neighbour search; they are read again when the model is built
# and are written to a CSV file in the experiment's KNN directory.
# NOTE(review): recent scikit-learn releases deprecate (1.1) and remove (1.3) the 'wminkowski'
# metric in favour of 'minkowski' with metric_params={'w': ...}, which applies the weights
# after raising to the power p - verify against the installed version before migrating.
config = dict(
    metric='wminkowski',                  # weighted Minkowski distance
    feature_weights=[feature_weights],    # wrapped in a list so the array fills a single DataFrame row
    algorithm='auto',                     # neighbour search algorithm
    Minkowski_Power_parameter=2,          # passed as `p` to the neighbour search
    n_neighbors=300,                      # number of neighbours retrieved per state
)

# Single-row frame (index 0) holding the settings, saved without the index column.
config_df = pd.DataFrame(config, index=[0])
config_csv_path = os.path.join(exp_dir, 'KNN/{}_KNNconfig.csv'.format(exp_name))
config_df.to_csv(config_csv_path, index=False)

### Upweighting some of the features for KNN

### For TRAIN/VAL/TEST: create a probability distribution over the action space for each state and save the results as `KNN/KNN_pi_behavior_<split>data.pkl`

In [None]:
########################################################################################
def _split_elapsed(seconds_elapsed):
    """Split a duration in seconds into (hours, minutes, seconds) for '%d' formatting."""
    hours = seconds_elapsed//3600
    temp = seconds_elapsed - 3600*hours
    minutes = temp//60
    seconds = temp - 60*minutes
    return hours, minutes, seconds


# Start of the whole experiment; needed for the total runtime report after the loop
# (previously `total_since` was read without ever being defined -> NameError).
total_since = time.time()

# cpu_count()-1 is 0 on a single-core machine, and n_jobs=0 is rejected; use at least one worker.
n_jobs = max(1, cpu_count() - 1)

eval_types = ['train','val','test']
for eval_type in eval_types:
    eval_since = time.time()
    try:
        # Mapping state_id -> next_state_id for this split (not used further in this cell).
        transition_dict = dict(zip(data_dict[eval_type]['state_id'], data_dict[eval_type]['next_state_id']))
    except KeyError as err:
        # Fail here with the explanatory message instead of printing it and carrying on.
        raise KeyError("Error using evaluate_model: Incorrect eval type. It should be 'test', 'val' or 'train'") from err
    print("Physician model for: " + str(eval_type))

    # Get model-ready data
    state_space = data_dict[eval_type]['X']
    # Replace infinite values by 0 (not by the mean). No copy is taken, so this also
    # modifies the array stored in data_dict.
    state_space[np.isinf(state_space)] = 0
    n_states = state_space.shape[0]

    # K-Nearest-Neighbor
    # config['feature_weights'] is a one-element list (needed for the config DataFrame);
    # np.ravel turns it into the flat per-feature weight vector.
    knn = KNN(n_neighbors=config['n_neighbors'],
              metric=config['metric'],
              p=config['Minkowski_Power_parameter'],
              metric_params={'w': np.ravel(config['feature_weights'])},
              algorithm=config['algorithm'],
              n_jobs=n_jobs)

    knn = knn.fit(state_space)
    print(str(eval_type) + " Knn fitted")

    ################################################################################################################
    # Query the neighbours in chunks of `step_size` states.
    counter = 0
    step_size = 50
    total_steps = n_states // step_size
    final_step = n_states % step_size
    print('state space', n_states)
    print('step size (states in a step):', step_size)
    print('final step state space count:', final_step)
    print('total steps:', total_steps+1)

    # Interim results are written roughly every 10% of the chunks. max(1, ...) keeps the
    # interval positive for fewer than 10 full chunks (previously a modulo by zero).
    save_every = max(1, total_steps // 10) * step_size

    ################################################################################################################
    # Chunks are collected in lists and concatenated when needed; growing arrays with
    # np.append on every iteration re-copies all previous results each time.
    dist_chunks, ind_chunks = [], []
    for i in range(0, n_states, step_size):
        # size of this chunk: step_size, except for a shorter final chunk
        step = min(step_size, n_states - i)

        counter += 1
        if counter % 10 == 0 or i == 0:
            print('step ' + str(counter) + ' out of ' + str(total_steps+1) + '. Start state: ' + str(i) + '. End state: ' + str(i+step*10))

        #### get distances and indices for this subset of the state space
        dist_subset, ind_subset = knn.kneighbors(state_space[i:i+step])
        dist_chunks.append(dist_subset)
        ind_chunks.append(ind_subset)

        ### save interim results every slightly less than 10%
        if i % save_every == 0 and i > 0:
            saving_step = math.ceil((i / n_states)*100)
            print('fitted AND SAVED all data up to ' + str(saving_step) + "%")
            hours, minutes, seconds = _split_elapsed(time.time() - eval_since)
            print('KNN INTERIM STEP completed in %d hours, %d minutes and %d seconds' %(hours,minutes,seconds))
            dist_df = pd.DataFrame(np.concatenate(dist_chunks, axis=0))
            ind_df = pd.DataFrame(np.concatenate(ind_chunks, axis=0))
            dist_df.to_csv(os.path.join(exp_dir, 'KNN/KNN_pi_behavior_interim_step_' + str(i) + '_' +str(eval_type) + '_dist.csv'), index=False)
            ind_df.to_csv(os.path.join(exp_dir, 'KNN/KNN_pi_behavior_interim_step_' + str(i) + '_' +str(eval_type) + '_ind.csv'), index=False)

    # Full distance / neighbour-index matrices: one row per state.
    dist = np.concatenate(dist_chunks, axis=0)
    ind = np.concatenate(ind_chunks, axis=0)
    print(str(eval_type) + " state space processed")

    ###########################################
    # Loop invariants hoisted out of the per-state loop: the action vector and its sorted
    # unique values (previously recomputed with np.unique for every single state).
    actions = data_dict[eval_type]['action']
    unique_actions = np.unique(actions)
    n_actions = len(unique_actions)

    all_states_action_probabilities = np.zeros([n_states, n_actions])
    # For each state, estimate the probability of every action from the action taken in that
    # state plus the actions taken in its nearest-neighbour states.
    # NOTE(review): the queried states are the same points the model was fitted on, so each
    # state is presumably among its own neighbours and its action is counted twice - confirm
    # that this is intended.
    for i in range(n_states):
        if i % 5000 == 0:
            print(str(eval_type) + " APPENDING step " + str(i) + " out of " + str(n_states) + ".")

        # the performed action followed by the actions of the similar states
        this_state_action = actions[i]
        similar_actions_for_this_state = actions[ind[i]]
        all_state_actions = np.append(this_state_action, similar_actions_for_this_state)

        # Count how often each possible action occurs. Every value is a member of the sorted
        # unique_actions, so searchsorted returns its column index and bincount tallies them.
        action_columns = np.searchsorted(unique_actions, all_state_actions)
        all_action_count = np.bincount(action_columns, minlength=n_actions)

        # relative frequencies, rounded to 3 decimals
        all_action_probs = np.around(all_action_count / all_action_count.sum(), 3)

        # add to results matrix
        all_states_action_probabilities[i,:] = all_action_probs

    # One row per state, one column 'A<action>' per unique action.
    results_df = pd.DataFrame(all_states_action_probabilities,
                              columns=['A' + str(a) for a in unique_actions])

    # save to pickle
    results_df.to_pickle(os.path.join(exp_dir, 'KNN/KNN_pi_behavior_' + str(eval_type) +'data.pkl'))

    ################################################################################################################
    hours, minutes, seconds = _split_elapsed(time.time() - eval_since)
    print('KNN model complete in %d hours, %d minutes and %d seconds' %(hours,minutes,seconds))
    print("Finished Physician model for " + str(eval_type) + " at: " + str(datetime.now()))

################################################################################################################
hours, minutes, seconds = _split_elapsed(time.time() - total_since)
print('Total KNN experiment complete in %d hours, %d minutes and %d seconds' %(hours,minutes,seconds))
print("Finished experiment at: " + str(datetime.now()))