# Validation vs test set performance plot and related numbers

In [1]:
import pandas as pd 
import numpy as np
from pathlib import Path
from ODD.analysis.hyperparameters import select_peak_performance, select_best_average_performance, calculate_best_average_performance
from ODD.analysis.result_processing import average_performance_per_method, average_aligned_ranks_with_versions, average_ranks_with_versions, average_ranks_with_versions_and_nemenyi
from ODD.analysis.result_analysis_charts import *
from ODD.analysis.dataset_selection import get_datasets_to_use
from ODD.analysis.validation_set import  get_data_df
from tqdm import tqdm
import altair as alt 
alt.data_transformers.disable_max_rows()
# alt.renderers.enable('png')
from collections import defaultdict


In [2]:
# IPython magics: load the autoreload extension and set it to mode 2.
# NOTE(review): per the IPython docs, mode 2 re-imports all modules before each cell runs — confirm.
%load_ext autoreload
%autoreload 2

# Config

In [3]:
# Notebook-wide configuration.
# NOTE(review): VERSION is not referenced anywhere in the visible cells — confirm it is still needed.
VERSION = 1

# Algorithm names; each one is expected to have a matching `<name>.csv` result file.
algorithms = ['CBLOF', 'HBOS', 'IForest', 'KNN', 'LOF', 'OCSVM']

# Version number per algorithm; any algorithm not listed explicitly falls back to 1.
# (presumably the hyperparameter-grid version to analyse — verify against the ODD package)
grid_versions_to_use = defaultdict(
    lambda: 1,
    {'HBOS': 2, 'CBLOF': 2, 'IForest': 2, 'OCSVM': 3},
)

# Input locations, relative to the current working directory.
result_path = Path('results')
processed_path = Path('processed_results_v5')

In [4]:
def big_chart(chart): 
    """Return `chart` with enlarged axis, legend and title styling applied.

    The chart is passed through `configure_axis`, `configure_legend` and
    `configure_title` (in that order) and the result of the last call is
    returned. All font sizes are set to 25; label/title limits are set to 0
    (presumably "no truncation" — confirm against the Vega-Lite config docs).
    """
    axis_options = dict(
        labelFontSize=25,
        titleFontSize=25,
        labelLimit=0,
    )
    legend_options = dict(
        titleFontSize=25,
        labelFontSize=25,
        strokeColor="black",
        strokeWidth=3,
        padding=10,
        cornerRadius=0,
        symbolSize=300,
        symbolStrokeWidth=4,
        labelLimit=0,
        titleLimit=0,
        orient="right",
    )
    title_options = dict(
        fontSize=25,
        subtitleFontSize=25,
    )

    styled = chart.configure_axis(**axis_options)
    styled = styled.configure_legend(**legend_options)
    return styled.configure_title(**title_options)

## Helper functions

In [5]:
# Pickled table of the datasets included in the analysis. Its index is matched
# against (dataset_id, anomaly_fraction) pairs when filtering results below.
# NOTE: unpickling executes arbitrary code — only load pickles produced by this project.
datasets_to_use = pd.read_pickle(
    Path('used_datasets') / 'used_all_datasets.pkl'
)


In [6]:
def read_validation_set_performance_df(path, datasets_to_use): 
    """Load the per-algorithm result CSVs under `path` into one DataFrame.

    For every name in the module-level `algorithms` list, `path/<name>.csv`
    is read. The frames are concatenated, reduced to the score columns, and
    restricted to the (dataset_id, anomaly_fraction) groups whose key occurs
    in `datasets_to_use.index`.

    Returns a DataFrame indexed by (algo_name, dataset_id) with the columns
    validation_auc, test_auc, test_ap, full_auc and full_ap.
    """
    kept_columns = [
        'algo_name', 'dataset_id', 'anomaly_fraction',
        'validation_auc', 'test_auc', 'test_ap', 'full_auc', 'full_ap',
    ]

    # one frame per algorithm, stacked row-wise with a fresh integer index
    per_algo_frames = [pd.read_csv(path/f'{algo}.csv') for algo in algorithms]
    combined_df = pd.concat(per_algo_frames, axis = 0, ignore_index = True)[kept_columns]

    def is_selected(group):
        # group.name is the (dataset_id, anomaly_fraction) key of the group
        return group.name in datasets_to_use.index

    selected_df = (
        combined_df
        .groupby(['dataset_id', 'anomaly_fraction'])
        .filter(is_selected)
        .reset_index()
    )

    # 'index' is the old row index materialised by reset_index above
    trimmed_df = selected_df.drop(columns = ['index', 'anomaly_fraction'])
    return trimmed_df.set_index(['algo_name', 'dataset_id'])

In [7]:
def add_best_validation_set_performer(result_df): 
    """Flag, per dataset, the rows that reach the highest validation AUC.

    Returns a new DataFrame equal to `result_df` plus a boolean column
    `best_on_validation_set`, which is True where the row's `validation_auc`
    equals the maximum `validation_auc` of its `dataset_id`. Ties are all
    flagged. The input frame is not modified.
    """
    # highest validation AUC per dataset, named so it can be joined as a column
    best_auc = (
        result_df.groupby(['dataset_id'])['validation_auc']
        .max()
        .rename('best_validation_auc')
    )
    joined_df = result_df.join(best_auc, on = 'dataset_id')
    is_best = joined_df['best_validation_auc'] == joined_df['validation_auc']

    # the helper column is only needed for the comparison above
    return (
        joined_df
        .drop(columns = 'best_validation_auc')
        .assign(best_on_validation_set = is_best)
    )

In [8]:
def test_auc_barplot_per_dataset(result_df, title = None): 
    """Bar chart of test AUC per algorithm, one facet column per dataset.

    `result_df` is reset to plain columns and must then provide `algo_name`,
    `test_auc`, `best_on_validation_set` and `dataset_id`. Bars are coloured
    by `best_on_validation_set`; each facet gets its own x axis. `title` is
    set as the chart title.
    """
    plot_df = result_df.reset_index()
    bars = alt.Chart(plot_df).mark_bar().encode(
        x = alt.X('algo_name:N'),
        y = alt.Y('test_auc:Q', title = 'AUC on the test set'),
        color = alt.Color('best_on_validation_set:N', title = 'Is best on the validation set'),
    )
    faceted = bars.facet(column = 'dataset_id').resolve_axis(x = 'independent')
    return faceted.properties(title = title)

def validation_and_test_auc_barplot_per_dataset(result_df, title = None): 
    """Unfinished placeholder: takes the same arguments as the test-AUC bar plot
    but builds nothing and returns None.
    """
    # TODO: not implemented yet; both arguments are currently ignored
    return None

In [9]:
def get_data_and_plot_test_auc_barplot(path, datasets_to_use):
    """Read the results under `path` and return the faceted test-AUC bar chart.

    Chains `read_validation_set_performance_df`,
    `add_best_validation_set_performer` and `test_auc_barplot_per_dataset`;
    the chart title is fixed to 'validation set 10% of dataset'.
    """
    performance_df = read_validation_set_performance_df(path, datasets_to_use)
    flagged_df = add_best_validation_set_performer(performance_df)
    return test_auc_barplot_per_dataset(flagged_df, title = 'validation set 10% of dataset')

In [10]:
def get_data_and_plot_difference_between_test_and_validation_performance(path, datasets_to_use): 
    """Plot validation AUC against test AUC per dataset, one facet per algorithm.

    The results of a single run under `path` are loaded in long format via
    `get_validation_test_set_performance_df_single_run` (one row per
    algorithm, dataset and score type, with the validation AUC repeated in
    the `sort` column).

    The chart draws one line per score type (distinguished by stroke dash),
    with datasets on the x axis ordered by the `sort` field and the AUC on
    the y axis. Facets are laid out three per row, each with its own x scale
    and a shared y axis. The result is passed through `big_chart` before it
    is returned.
    """
    # Shared loader: the same pipeline was previously duplicated inline here.
    result_df = get_validation_test_set_performance_df_single_run(path, datasets_to_use)

    chart = alt.Chart(result_df, height = 200, width = 430).mark_line().encode(
        # order the datasets by their validation AUC so the validation line is monotone
        x = alt.X('dataset_id:N',sort = alt.EncodingSortField(field='sort'), title = None), 
        # do not force the AUC axis to start at zero
        y = alt.Y('value:Q', scale = alt.Scale(zero = False), title = 'AUC'), 
        # colour legend is suppressed; the facet headers already name the algorithm
        color = alt.Color('algo_name:N', title = 'Algorithm', legend = None), 
        strokeDash = alt.StrokeDash('score_type:N', title = 'Performance'),
        facet = alt.Facet('algo_name', columns = 3, header = alt.Header(labelFontSize=30), title = None)
    ).resolve_scale(x = 'independent').resolve_axis(y = 'shared')
    return big_chart(chart)


In [23]:
def get_validation_test_set_performance_df_single_run(path, datasets_to_use): 
    """Return the validation and test AUC of one run in long format.

    The result has one row per (algo_name, dataset_id, score_type) with the
    columns:
      - score_type: 'AUC on Validation' or 'AUC on Test'
      - value: the corresponding AUC
      - sort: the validation AUC of that (algo_name, dataset_id) pair
    Algorithm name 'IF' is renamed to 'IForest'.
    """
    wide_df = read_validation_set_performance_df(path, datasets_to_use)[['validation_auc', 'test_auc']]

    # wide -> long: the stacked column level becomes 'score_type'
    long_df = (
        wide_df
        .stack()
        .reset_index()
        .rename(columns = {'level_2':'score_type', 0:'value'})
    )

    # attach the validation AUC to every row of the same (algo, dataset) pair
    validation_rows = long_df.loc[long_df.score_type == 'validation_auc', ['algo_name', 'dataset_id', 'value']]
    sort_keys = validation_rows.rename(columns = {'value':'sort'})
    merged_df = long_df.merge(sort_keys, on = ['algo_name', 'dataset_id'])

    readable_score_types = merged_df.score_type.replace({'test_auc': 'AUC on Test', 'validation_auc': 'AUC on Validation'})
    readable_algo_names = merged_df.algo_name.replace({'IF':'IForest'})
    return merged_df.assign(score_type = readable_score_types, algo_name = readable_algo_names)
def get_average_validation_test_set_performance_df_multiple_runs(path, datasets_to_use): 
    """Average the long-format validation/test AUC over all runs under `path`.

    Every entry of `path` matching 'run*' is loaded with
    `get_validation_test_set_performance_df_single_run`; the frames are
    concatenated and the remaining columns are averaged per
    (algo_name, dataset_id, score_type).

    Returns a DataFrame with those three keys as ordinary columns.

    Raises:
        ValueError: if `path` contains no 'run*' entries.
    """
    # sorted so the processing order does not depend on the filesystem listing order
    run_paths = sorted(path.glob('run*'))
    if not run_paths:
        # pd.concat would also raise ValueError here, but without naming the directory
        raise ValueError(f"No 'run*' entries found in {path}; cannot average performance over runs")

    dfs = [
        get_validation_test_set_performance_df_single_run(run_path, datasets_to_use)
        for run_path in run_paths
    ]
    return pd.concat(dfs, axis = 0).groupby(['algo_name', 'dataset_id', 'score_type']).mean().reset_index()
        

## RMSE between validation and test set performance for size = 100

In [29]:
# Per-algorithm RMSE between the run-averaged validation AUC and test AUC (validation set size = 100).
result_df = get_average_validation_test_set_performance_df_multiple_runs(processed_path/'absolute_validation_set_multiple_10runs_max_25'/'size=100', datasets_to_use)

from sklearn.metrics import mean_squared_error
# The `squared=False` flag of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6, so the root is taken explicitly; this works on every version.
average_error_df = (
        # one row per (algo_name, dataset_id), one column per score type
        result_df.pivot_table(index = ['algo_name', 'dataset_id'], columns = ['score_type'], values ='value')
        .groupby(['algo_name']).apply(lambda x: np.sqrt(mean_squared_error(x['AUC on Validation'], x['AUC on Test'])))
        .sort_values()
)
average_error_df.to_frame('RMSE')

Unnamed: 0_level_0,RMSE
algo_name,Unnamed: 1_level_1
KNN,0.066734
HBOS,0.073751
IForest,0.108758
LOF,0.111395
CBLOF,0.142509
OCSVM,0.195313


## MAE between validation and test set performance for statistical size = 0.05

In [34]:
# Per-algorithm MAE between the run-averaged validation AUC and test AUC (statistical size = 0.05).
from sklearn.metrics import mean_squared_error, mean_absolute_error

result_df = get_average_validation_test_set_performance_df_multiple_runs(
    processed_path / 'statistical_validation_set_multiple_10runs' / 'size=0.05',
    datasets_to_use,
)

average_error_df = (
    result_df
    # one row per (algo_name, dataset_id), one column per score type
    .pivot_table(values = 'value', index = ['algo_name', 'dataset_id'], columns = ['score_type'])
    .groupby(['algo_name'])
    .apply(lambda scores: mean_absolute_error(scores['AUC on Validation'], scores['AUC on Test']))
    .sort_values()
)
average_error_df.to_frame('MAE')

Unnamed: 0_level_0,MAE
algo_name,Unnamed: 1_level_1
HBOS,0.032749
KNN,0.040848
IForest,0.061095
LOF,0.079238
CBLOF,0.108719
OCSVM,0.164332


## The plot of the paper

In [49]:
# Render the validation-vs-test AUC figure from run1 of the statistical size = 0.05 experiment.
get_data_and_plot_difference_between_test_and_validation_performance(
    processed_path / 'statistical_validation_set_multiple_10runs' / 'size=0.05' / 'run1',
    datasets_to_use,
)