# Charts for paper - Random Forest Classifier

## Purpose and Context

This notebook creates the classifier charts and data used in the final paper.

## Setup

Import libraries

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import tqdm.notebook
tqdm.notebook.tqdm_notebook.pandas()

import utils
import labels
import train

### Set Styles and colors

In [6]:
# Global seaborn theme shared by every chart in this notebook.
# NOTE: this cell used to call `sns.despine(left = True)` right after
# `set_theme`. With no axes created yet, that call only opened an empty figure
# (the stray "<Figure size 640x480 with 0 Axes>" output) and styled nothing,
# so it has been removed. To strip spines from a chart, call
# `sns.despine(ax = ..., left = True)` on that chart's axes after plotting.
sns.set_theme(style = "whitegrid", font_scale = 1.1, font = 'Calibri')

# Four-entry palette (two oranges, two purples) installed as the default color cycle.
colors = ['#e66101', '#fdb863', '#b2abd2', '#5e3c99']
sns.set_palette(sns.color_palette(colors))

# Shared figure settings.
# NOTE(review): neither value is referenced by the cells visible in this notebook - confirm they are still needed.
figureSize = (4, 3)
padInches = 0.05

<Figure size 640x480 with 0 Axes>

## Training results

### Load Data

In [7]:
# Load the two dataset splits through the project helper.
development = utils.LoadDataFromOutput('dataset-development')
validation = utils.LoadDataFromOutput('dataset-validation')
print(f'Developement Dataset Count: {len(development)}')
print(f'Validate Dataset Count: {len(validation)}')

# Stack both splits into one frame with a fresh 0..n-1 index.
data = pd.concat([development, validation], ignore_index = True)
print(f'Total Count: {len(data)}')

# NOTE(review): this counts every column of `development`; confirm whether
# label or identifier columns should be excluded from the "features" figure.
print(f'Number of Training Features: {len(development.columns)}')
development.head(1)

FileNotFoundError: [Errno 2] No such file or directory: '../output\\dataset-development.gzip.parquet'

In [None]:
import ast

# Table of previously trained models.
# NOTE(review): this path is relative to the notebook's working directory,
# whereas the chart below is written under '../output/' - confirm the location.
trainedModelsPath = 'trainedModels-RandomForestClassifier.gzip.parquet'
models = pd.read_parquet(trainedModelsPath)

# Parse the literal stored in 'Model' into a Python object and keep it in a
# separate 'Model Params' column; the original 'Model' text is left as is.
models = models.assign(**{'Model Params': models['Model'].apply(ast.literal_eval)})
models.head(1)

### Classifier Results

#### Functions

In [None]:
import sklearn.metrics as skm

def ComputeAllowsLowAccuracy(threshold, data):
    """Accuracy of a baseline that answers 'low ε' for every row of `data`.

    The reference labels are the second value returned by
    `train.GetXandY(data)`, passed together with `threshold` to
    `train.ComputeLabel`. The baseline prediction holds the constant string
    'low ε' once per entry of `data['Epsilon']`.

    Returns whatever `sklearn.metrics.accuracy_score` yields for the two.
    """
    _, targets = train.GetXandY(data)
    trueLabels = train.ComputeLabel(targets, threshold)

    # Same index and length as the 'Epsilon' column; the values themselves are ignored.
    alwaysLowPrediction = data['Epsilon'].apply(lambda _epsilon: 'low ε')

    return skm.accuracy_score(trueLabels, alwaysLowPrediction)

#### Best Model with the least amount of information

In [None]:
# Work on a copy so `models` keeps its loaded contents.
temp = models.copy()

# Turn every 'Model Params' entry into a row of object-typed columns and
# attach those columns next to the existing ones.
expandedParams = temp['Model Params'].apply(lambda params: pd.Series(params, dtype = 'object'))
temp = temp.join(expandedParams)

# Round accuracy to two decimals; later cells merge on this rounded value.
temp['Accuracy'] = temp['Accuracy'].apply(lambda accuracy: np.round(accuracy, 2))

Computing the amount of information encoded

In [None]:
# The column count of the development dataset is passed to the project helper
# together with each model row.
developmentColumnCount = len(development.columns)
temp['Total Information Used'] = temp.apply(
    lambda modelRow: train.ComputeTotalInfomationUsed(modelRow, developmentColumnCount),
    axis = 1,
)

Selecting the models with the highest accuracy and the least information used

In [None]:
# Highest (rounded) accuracy reached for each threshold.
# The column is selected explicitly before aggregating. The previous form,
# `.max('Accuracy')`, passed the column name positionally into the
# `numeric_only` parameter and only worked because a non-empty string is truthy.
maxAccuracy = temp.groupby(['Threshold'])['Accuracy'].max().reset_index()

# Among the models that reach that accuracy, the smallest amount of
# information used per threshold. 'Accuracy' is constant within each group
# after the merge, so taking its minimum simply carries it through.
totalInformationUsed = (
    temp.merge(maxAccuracy, on = ['Threshold', 'Accuracy'])
        .groupby(['Threshold'])[['Accuracy', 'Total Information Used']]
        .min()
        .reset_index()
)

Selecting the models with the highest precision

In [None]:
# Columns that identify the candidates kept so far for each threshold.
selectionKeys = ['Threshold', 'Accuracy', 'Total Information Used']

# Highest 'Precision (High ε)' among the remaining candidates per threshold.
# The aggregated columns are selected explicitly. The previous form,
# `.max('Precision (High ε)')`, passed the column name positionally into
# `numeric_only` and took an unrelated per-column maximum of every numeric column.
mostAccurateAndPreciseWithTheLeastAmountOfInformation = (
    temp.merge(totalInformationUsed, on = selectionKeys)
        .groupby(['Threshold'])[['Accuracy', 'Total Information Used', 'Precision (High ε)']]
        .max()
        .reset_index()
)

# Keep only the model rows that match the winning combination for their threshold.
bestModels = temp.merge(
    mostAccurateAndPreciseWithTheLeastAmountOfInformation[selectionKeys + ['Precision (High ε)']],
    on = selectionKeys + ['Precision (High ε)'],
)

If several models remain for a threshold, take the first one, since they are identical from our point of view.

In [None]:
# Keep the first remaining row for every threshold, ordered by threshold.
# `GroupBy.first()` is avoided on purpose: it returns the first non-null value
# of each column separately, so a row with a missing entry (for example a
# parameter whose value is None) would be patched with values from another
# model. Sorting stably and dropping duplicates keeps one intact row per group.
bestModels = (
    bestModels.dropna(subset = ['Threshold'])
              .sort_values('Threshold', kind = 'stable')
              .drop_duplicates(subset = 'Threshold', keep = 'first')
              .reset_index(drop = True)
)

# Keep 'Threshold' as the leading column, as the grouped result had it.
bestModels = bestModels[['Threshold'] + [column for column in bestModels.columns if column != 'Threshold']]

In [None]:
# Replace the stored 'Model' entry with a classifier trained on the development
# split, using each row's parameters and threshold (with a progress bar).
bestModels['Model'] = bestModels.progress_apply(
    lambda modelRow: train.TrainRandomForestClassifier(modelRow['Model Params'], modelRow['Threshold'], development),
    axis = 1,
)

# Output column -> dataset it is scored on, in the order the columns are added.
classifierColumns = [
    ('Random Forest Classifier(Development)', development),
    ('Random Forest Classifier(Validation)', validation),
]
baselineColumns = [
    ('Always Low ε(Development)', development),
    ('Always Low ε(Validation)', validation),
]

# Accuracy of the trained classifier on each split.
for columnName, split in classifierColumns:
    bestModels[columnName] = bestModels.apply(
        lambda modelRow: train.ComputeClassifierAccuracy(modelRow['Model'], modelRow['Threshold'], split),
        axis = 1,
    )

# Accuracy of the constant 'low ε' baseline on each split.
for columnName, split in baselineColumns:
    bestModels[columnName] = bestModels.apply(
        lambda modelRow: ComputeAllowsLowAccuracy(modelRow['Threshold'], split),
        axis = 1,
    )

bestModels.head(1)

from sklearn.utils import resample

# NOTE(review): position 1 picks the second row of `bestModels`; confirm this is
# the threshold intended for the experiments.
classifierModelUsed = bestModels.iloc[1]
selectedThreshold = classifierModelUsed['Threshold']
selectedParams = classifierModelUsed['Model Params']

# Split the combined data at the selected threshold.
low = data[data['Epsilon'] < selectedThreshold].reset_index(drop = True).copy()
high = data[data['Epsilon'] >= selectedThreshold].reset_index(drop = True).copy()
total = len(low) + len(high)

# For each target share of low-ε rows (0.50, 0.52, ..., 0.98), resample the
# high-ε rows so that the low rows make up that share, then split the result.
percentageLowRuns = []
for percentStep in range(50, 100, 2):
    lowPercent = percentStep / 100
    numberOfHighsNeeded = int(len(low) / lowPercent - len(low))
    # Fixed seed so every run draws the same sample.
    newHighs = resample(high, n_samples = numberOfHighsNeeded, random_state = 82219)
    rebalanced = pd.concat([low, newHighs], ignore_index = True)
    lowEpsilonDevelopment, lowEpsilonValidation = train.SplitData(rebalanced)

    percentageLowRuns.append([lowPercent, lowEpsilonDevelopment, lowEpsilonValidation])

percentageLowRuns = pd.DataFrame(percentageLowRuns, columns = ['Percentage Low ε', 'Development', 'Validation'])

# Train one classifier per rebalanced development split.
percentageLowRuns['Model'] = percentageLowRuns.progress_apply(
    lambda run: train.TrainRandomForestClassifier(selectedParams, selectedThreshold, run['Development']),
    axis = 1,
)

# Output column -> name of the column holding the split it is scored on.
classifierScoreColumns = [
    ('Random Forest Classifier(Development)', 'Development'),
    ('Random Forest Classifier(Validation)', 'Validation'),
]
baselineScoreColumns = [
    ('Always Low ε(Development)', 'Development'),
    ('Always Low ε(Validation)', 'Validation'),
]

for columnName, splitColumn in classifierScoreColumns:
    percentageLowRuns[columnName] = percentageLowRuns.apply(
        lambda run: train.ComputeClassifierAccuracy(run['Model'], selectedThreshold, run[splitColumn]),
        axis = 1,
    )

for columnName, splitColumn in baselineScoreColumns:
    percentageLowRuns[columnName] = percentageLowRuns.apply(
        lambda run: ComputeAllowsLowAccuracy(selectedThreshold, run[splitColumn]),
        axis = 1,
    )

In [None]:
# Two side-by-side panels: accuracy by threshold (left) and accuracy by share
# of low-ε rows in the dataset (right).
fig, axes = plt.subplots(ncols = 2, figsize = (11, 4), constrained_layout = True)
thresholdAxis, percentageAxis = axes

# Left panel: one line per model/dataset column, threshold divided by 1000.
accuracyColumns = [
    'Random Forest Classifier(Development)',
    'Random Forest Classifier(Validation)',
    'Always Low ε(Development)',
    'Always Low ε(Validation)',
]
graphData = (
    bestModels[['Threshold'] + accuracyColumns]
        .assign(Threshold = lambda frame: frame['Threshold'] / 1000)
        .melt(id_vars = ['Threshold'])
)

g = sns.lineplot(data = graphData, x = 'Threshold', y = 'value', hue = 'variable', ax = thresholdAxis)
g.set(ylim = (.9, 1), ylabel = 'Accuracy', xlabel = labels.EpsilonFull);
g.legend_.set_title('Model (Dataset)')

# Right panel: the same accuracy columns from the rebalancing runs; the
# non-numeric helper columns are dropped before reshaping.
graphData = (
    percentageLowRuns
        .drop(columns = ['Development', 'Validation', 'Model'])
        .melt(id_vars = 'Percentage Low ε')
)

g = sns.lineplot(data = graphData, x = 'Percentage Low ε', y = 'value', hue = 'variable', ax = percentageAxis)
g.set(ylabel = 'Accuracy', xlabel = 'Percentage Low ε in Dataset');
# NOTE(review): this legend title has no space before the parenthesis, unlike
# the left panel's - confirm which spelling the paper should use.
g.legend_.set_title('Model(Dataset)')

fig.savefig('../output/chart-overall-RandomForestClassifier.png', bbox_inches = 'tight', dpi = 600)

In [None]:
# Report which threshold and parameter set the experiments above used.
selectedModelThreshold = classifierModelUsed['Threshold']
print('Classifier model used in experiments:')
print(f'Threshold of {selectedModelThreshold!s}')
display(classifierModelUsed['Model Params'])