In [None]:
################################################################################################################
################################################################################################################
### import overall useful libraries
import os
import platform
import copy
import sys
import inspect
import time
import collections
import math
import random
import joblib
from datetime import datetime
from tqdm import tqdm

### import specific libraries for this project
import matplotlib.pyplot as plt
import pickle as pkl
import pandas as pd
import numpy as np
import sklearn as sk
import torch
from multiprocessing import cpu_count

### import KNN related libraries
from sklearn.neighbors import NearestNeighbors as KNN
from sklearn.model_selection import train_test_split

### import FQI-RandomForest related libraries
from sklearn.ensemble import RandomForestRegressor
import statistics

################################################################################################################
################################################################################################################
# import from parent directory with a little help from sys.path.insert()
# Position 0 puts '..' at the front of the module search path, so the `util` and `conf`
# imports below are looked up in the parent directory first.
sys.path.insert(0, '..') 

### from util.py (file which once contained all classes and functions):
# The two magics below enable IPython's autoreload extension in mode 2, so imported
# modules (e.g. util.py) are reloaded automatically when they change on disk.
%reload_ext autoreload
%autoreload 2
# NOTE(review): the wildcard import hides which names used later in this notebook come from util.
from util import * # automatically reload python (e.g. util.py) file when they are changed.

### Configuration file to determine root directory 
import conf

# from configuration file set working directory
# NOTE: this changes the process-wide working directory to <conf.ROOT_DIR>/SEPSIS for all later cells.
os.chdir(os.path.join(conf.ROOT_DIR, 'SEPSIS'))

### check for GPU's
# NOTE(review): use_gpu is not referenced again in the visible cells of this notebook.
use_gpu = torch.cuda.is_available()

### Check everything
# Defined in conf; presumably prints interpreter/package information - confirm in conf.py.
conf.print_python_environment()

# Select Experiment

In [None]:
################################################################################################################
################################################################################################################
### Experiment name
exp_name = 'FINAL'

# Fail fast with a specific exception type when the experiment folder is missing.
# FileNotFoundError is an Exception subclass, so any broad handler keeps working.
if not os.path.exists(os.path.join(conf.EXP_DIR, exp_name)):
    raise FileNotFoundError('Cannot find experiment directory, run create_exp_dataset prior to running this file')

# Bind exp_dir only after the existence check so a failed run never leaves a bogus path in the kernel.
exp_dir = os.path.join(conf.EXP_DIR, exp_name)

############################################
# load dataset
# NOTE(review): joblib.load unpickles arbitrary objects; only load files produced by our own pipeline.
try:
    data_dict = joblib.load(os.path.join(exp_dir, 'data/FINAL_data_dict.pkl'))
except Exception as load_error:
    # `except Exception` (instead of a bare except) lets KeyboardInterrupt/SystemExit propagate,
    # and `from load_error` keeps the original traceback attached for debugging.
    raise RuntimeError('Cannot load dataset, rerun create_exp_dataset!') from load_error

# Print the two-level key structure of the data dictionary for a quick visual check.
print("Visual inspection of data dictionary structure:")
for k, v in data_dict.items():
    if k == 'v':
        # the 'v' entry is skipped entirely
        continue
    if k == 'featurenames':
        # the feature names are printed as a whole instead of being expanded
        print(v)
        continue
    # every other entry is expected to be a mapping; list its keys next to the parent key
    for k1, _ in v.items():
        print(k, k1)
print("\nExperiment loaded")

# SET EXPERIMENT DATA CONFIGURATION

In [None]:
################################################################################################################
################################################################################################################
# Hyper-parameters of the Fitted Q-Iteration (FQI) run; a copy is written next to the results.
config = dict(
    FQI_SEED=42,            # God does not play dice
    gamma=0.9,
    FQI_iterations=100,     # Amount of iterations to perform the FQI - S A R S' A'
    max_depth=5,
    n_estimators=80,
)

# One-row frame (index 0) with one column per hyper-parameter, stored without the index column.
config_df = pd.DataFrame(config, index=[0])
config_df.to_csv(os.path.join(exp_dir, f'FQI/{exp_name}_FQIconfig.csv'), index=False)

# Select evaluation type

In [None]:
############################################
# Before any further steps, decide the dataset to be evaluated
# (used below as the key into data_dict and cont_performance_dict, and inside the output file names)
eval_type = 'val'

# and select how often to train?
# NOTE(review): the FQI loop in this notebook reads config['FQI_iterations'] directly;
# this alias is not referenced in the other visible cells.
FQI_iterations = config['FQI_iterations']

## Build a continuous, feature-based state space for the Random Forest (FQI) model to predict R + γ·Q(s,a)

In [None]:
############################################
# Map every state id to its successor id (used to detect the end of a trajectory)
eval_split = data_dict[eval_type]
transition_dict = dict(zip(eval_split['state_id'], eval_split['next_state_id']))

# State ids of the batch and, position by position, the ids of their successors
batch_ids = eval_split['state_id']
next_state_batch_ids = [transition_dict[state_id] for state_id in batch_ids]

############################################
# Gather the FQI inputs: features of s, taken action, reward and features of s'
state_features = eval_split['X'][(batch_ids)]
state_features[np.isinf(state_features)] = 0            # infinite entries are overwritten with 0 (not with a mean)
actions = eval_split['action'][(batch_ids)]
reward = eval_split['reward'][(batch_ids)]
next_state_features = eval_split['X'][(next_state_batch_ids)]
next_state_features[np.isinf(next_state_features)] = 0  # same treatment for the successor features

# Reward mask: False where a state is its own successor (end of trajectory), True otherwise.
# Multiplying max-Q by this mask removes the future discounted reward after a final state.
reward_mask = [
    not (current_id == following_id)
    for current_id, following_id in zip(batch_ids, next_state_batch_ids)
]

# visual inspection of the shape of the state features and the actions
print('Check the dimensions of the state and action space:')

# current state features
print("\nState features:", state_features.shape, sep="\n")

# next state features
print("\nNext State features:", next_state_features.shape, sep="\n")

# action dimensions
print("\nUnique Actions:", np.unique(eval_split['action']), sep="\n")

In [None]:
### Build a dataset with dimensions: all_states * all_possible_actions
# One frame per candidate action is collected and concatenated once at the end:
#  - DataFrame.append no longer exists in pandas >= 2.0,
#  - growing a frame inside the loop copies all previous rows on every pass,
#  - the result no longer depends on an action with value 0 being present.
unique_actions = np.unique(data_dict[eval_type]['action'])

# column vector holding the id of the *current* state of each transition;
# it is the key used later to group the candidate actions of one transition together
state_id_column = np.transpose(np.array([batch_ids]))

next_state_frames = []
for i in tqdm(unique_actions):
    # column vector with the same candidate action for every next state
    action_column = np.transpose([np.repeat(i, next_state_features.shape[0])])
    # layout per row: [next-state features..., action, state_id]
    next_state_frames.append(
        pd.DataFrame(np.concatenate((next_state_features, action_column, state_id_column), axis=1))
    )

# Default concat keeps each frame's own 0..n-1 index, exactly like the former chained append.
all_next_states_all_possible_actions = pd.concat(next_state_frames)

# Add column names for continuous model features
feature_names = data_dict['featurenames'].tolist()

# Add action to feature space
feature_and_action_names = feature_names + ['action']

# Add state ID to dataframe 
column_names = feature_and_action_names + ['state_id']

# add the column names to the dataframe
all_next_states_all_possible_actions.columns = column_names

# visual inspection of the shape of the state features and the actions
# (the Qvalue column mentioned in the message is only added in a later cell)
print('\nCheck the dimensions of the new state and action space (should be original_state_length * unique_amount_of_actions, feature length + 3 (1 for action, 1 for state_ID, 1 for Qvalue):')
print(all_next_states_all_possible_actions.shape)

### visual check
print("\nVisual check column names of this dataframe:")
print(all_next_states_all_possible_actions.columns)

## Create action dummies for all_next_states_all_possible_actions dataset and concatenate with the feature dataset 

In [None]:
# One-hot encode the taken actions and keep the result as a plain NumPy matrix
action_dummies = pd.get_dummies(actions).to_numpy()

## Create Random Forest Fitted Q-Iteration input (X) and training (feat) dataset
### Create the FQI Random Forest in "i" iterations while learning output = reward + future_discounted_reward given a sum over actions.
This algorithm is trained in a form equal to the objective used in expected S A R S' A'.

In [None]:
# initial X (set of states and taken actions at time t from 0 to T):
# state features followed by the one-hot encoded taken action
X = np.concatenate([state_features, action_dummies], axis=1)

# initialize the Q_minus_1 function, set at 0 for N==0.
all_next_states_all_possible_actions["Qvalue"] = 0
# per state_id (in order of first appearance, sort=False) the maximum Qvalue over all candidate actions
cont_Q_minus_1 = all_next_states_all_possible_actions.groupby(['state_id'], sort=False)['Qvalue'].max()

# construct the output for the regressor function ~ reward mask is False if state is terminal,
# in which case there is no future discounted reward, only the current state reward
gamma = config['gamma']
output = reward + gamma * (cont_Q_minus_1 * reward_mask)

# prediction matrix for every (next state, candidate action) row, laid out like X:
# feature columns first, then the one-hot encoded action
feat = np.concatenate([all_next_states_all_possible_actions[feature_names], pd.get_dummies(all_next_states_all_possible_actions['action'])], axis=1)

############################################
# Create initial RF regressor and set hyperparamters for FQI (gamma for learning, the amount of trees (n_estimators), etc...)
# Leave one core free, but never request 0 workers: on a single-core machine cpu_count() - 1 == 0,
# which joblib rejects as a meaningless n_jobs value.
cont_regr = RandomForestRegressor(
    max_depth=config['max_depth'],
    random_state=config['FQI_SEED'],
    n_estimators=config['n_estimators'],
    n_jobs=max(1, cpu_count() - 1),
)

In [None]:
# Initialize first RF regressor (thus, only truly fitting between state_action_pair and immediate reward)
# Every Qvalue is still 0 at this point, so the discounted term in `output` vanishes and
# this first fit regresses (state features + action dummies) onto the immediate reward.
cont_regr.fit(X, output)

# Before FQI loop: one independent metric history per data split.
# Every tracked quantity starts as its own empty list and is appended to once per FQI iteration.
cont_performance_dict = {
    split_name: {metric_name: [] for metric_name in ('iteration', 'loss', 'meanQ', 'Qvar')}
    for split_name in ('train', 'val', 'test')
}

## FQI SARSA

In [None]:
# loop additional times to improve maxQ_n_minus_1(s,a) ~ Function approximator algorithm by FQI: from "2005 - Tree-Based Batch Mode Reinforcement Learning"
# Each pass: (1) score every (next state, candidate action) row with the current forest,
# (2) take the maximum per state_id, (3) rebuild the regression target, (4) refit the forest on (X, target).
for i in tqdm(range(config['FQI_iterations'])):
    # assign new Q values to dataframe
    all_next_states_all_possible_actions['Qvalue'] = cont_regr.predict(feat)
    
    # get max Q value for each state ID # https://stackoverflow.com/questions/15705630/python-getting-the-row-which-has-the-max-value-in-groups-using-groupby
    # sort=False keeps the groups in order of first appearance, so the result lines up positionally with reward / reward_mask
    cont_Q_minus_1 = all_next_states_all_possible_actions.groupby(['state_id'], sort=False)['Qvalue'].max()

    # construct the output for the regressor function
    # (reward_mask zeroes the discounted term wherever a state is its own successor)
    output = reward + gamma * (cont_Q_minus_1 * reward_mask)
    
    # Build the next iteration of the RF regressor
    cont_regr.fit(X, output)
    
    # metric calculation for each loop
    # NOTE(review): `loss` is the mean of (target - max next-state Q), not a residual of the
    # regressor's own prediction on X - confirm this is the intended convergence metric.
    loss = (output - cont_Q_minus_1).mean()
    meanQ = cont_Q_minus_1.mean()
    # sample variance of the per-state maximum Q values
    Qvar = statistics.variance(cont_Q_minus_1)
    
    # In main loop of the FQI algorithm: Update the performance_dict
    cont_performance_dict[eval_type]['iteration'].append(i)
    cont_performance_dict[eval_type]['loss'].append(loss)
    cont_performance_dict[eval_type]['meanQ'].append(meanQ)
    cont_performance_dict[eval_type]['Qvar'].append(Qvar)
    
    # Keep track of performance: every iteration below 25, afterwards every 10th iteration
    # NOTE(review): '{:4f}' sets a minimum width of 4 with the default 6 decimals; '{:.4f}' may have been intended.
    if (i % 10 == 0 and i > 0) or i <25:
        print('Iteration: {}, {} Loss: {:4f}, meanQ: {:4f}, Qvar: {:4f}'.format(i, eval_type, loss, meanQ, Qvar))
 

## Visual inspection

In [None]:
# Create multiplot: three metric curves plus a histogram of the latest per-state max Q values
fig, axes = plt.subplots(1, 4, figsize=(20, 3))

# The x-axis follows the configured number of FQI iterations instead of a hard-coded 100,
# so the curves are neither cut off nor squeezed when config['FQI_iterations'] changes.
n_iterations = config['FQI_iterations']
history = cont_performance_dict[eval_type]

# (metric key, panel title) in display order: loss, Q variance, mean Q
metric_panels = [
    ('loss', "Iterations step (x) versus loss (y)"),
    ('Qvar', "Iterations step (x) versus Qvalues variance (y)"),
    ('meanQ', "Iterations step (x) versus mean Qvalues (y)"),
]
for ax, (metric_key, panel_title) in zip(axes, metric_panels):
    ax.plot(history['iteration'], history[metric_key])
    ax.set_xlim(0, n_iterations)
    ax.set_title(panel_title)

# Histogram of the max Q values computed in the last FQI iteration
pd.Series(cont_Q_minus_1).hist(bins=50, ax=axes[3])
axes[3].set_title("Qvalue histogram for current state values")

# save before showing, so the written file contains the rendered figure
fig.savefig((os.path.join(exp_dir, 'figures/FQI_QValues_continuous_' + str(eval_type) + '.tiff')), dpi=200, transparent=False)
plt.show()

# find range for visualisation
print("Minimal Q Value: " + str(cont_Q_minus_1.min()))
print("Maximum Q Value: " + str(cont_Q_minus_1.max()))

## Save the FQI continuous state space Q-values for each action (the max of these for each state is the V-values)

In [None]:
### Build a dataset with dimensions: all_states * all_possible_actions
# One frame per candidate action is collected and concatenated once at the end:
#  - DataFrame.append no longer exists in pandas >= 2.0,
#  - growing a frame inside the loop copies all previous rows on every pass,
#  - the result no longer depends on an action with value 0 being present.
unique_actions = np.unique(data_dict[eval_type]['action'])

# column vector with the id of every state in the batch
state_id_column = np.transpose(np.array([batch_ids]))

current_state_frames = []
rows_so_far = 0
for i in unique_actions:
    # column vector with the same candidate action for every state
    action_column = np.transpose([np.repeat(i, state_features.shape[0])])
    # layout per row: [state features..., action, state_id]
    action_frame = pd.DataFrame(np.concatenate((state_features, action_column, state_id_column), axis=1))
    current_state_frames.append(action_frame)

    # progress report: same running shape the incremental append used to print
    rows_so_far += action_frame.shape[0]
    if len(current_state_frames) == 1:
        print("\nDimensions of the dataset that is being appended with it's base length for each action:")
    print((rows_so_far, action_frame.shape[1]))

# Default concat keeps each frame's own 0..n-1 index, exactly like the former chained append.
all_current_states_all_possible_actions = pd.concat(current_state_frames)

# add the column names to the dataframe (column_names comes from the next-state cell above)
all_current_states_all_possible_actions.columns = column_names

# visual inspection of the shape of the state features and the actions
print('\nCheck the dimensions of the new state and action space (should be original_state_length * unique_amount_of_actions, feature length + 3 (1 for action, 1 for state_ID, 1 for Qvalue):')
print(all_current_states_all_possible_actions.shape)

### visual check
print("\nVisual check column names of this dataframe:")
print(all_current_states_all_possible_actions.columns)

## prediction matrix laid out like the training input: feature columns first, then one-hot encoded action
current_state_all_possible_actions_dummies = np.concatenate([all_current_states_all_possible_actions[feature_names], pd.get_dummies(all_current_states_all_possible_actions['action'])], axis=1)

# assign the Q values of the final regressor to the dataframe
# (no zero-initialised placeholder column is needed; the column is created right here)
all_current_states_all_possible_actions['Qvalue'] = cont_regr.predict(current_state_all_possible_actions_dummies)

# create an output dataframe with one row per state and one Q-value column per action for the 'eval_type' dataset
cont_results_df = all_current_states_all_possible_actions.pivot_table(values='Qvalue', index='state_id', columns='action')

# name the columns Q<action>; pivot_table orders its columns by action value, matching np.unique's sorted order
cont_results_df.columns = ['Q' + str(action_value) for action_value in unique_actions]

# round to 3 decimals and save to pickle
cont_results_df = np.around(cont_results_df, decimals=3)
cont_results_df.to_pickle(os.path.join(exp_dir, 'FQI/FQI_QValues_' + str(eval_type) + 'data.pkl'))