# Plots for influence of the validation set size
This notebook probably still contains quite a few helper functions that are no longer relevant.

In [3]:
import pandas as pd 
import numpy as np
from pathlib import Path
from ODD.analysis.hyperparameters import select_peak_performance, select_best_average_performance, calculate_best_average_performance, calculate_validation_set_performances
from ODD.analysis.result_processing import average_performance_per_method, average_aligned_ranks_with_versions, average_ranks_with_versions, average_ranks_with_versions_and_nemenyi
from ODD.analysis.result_analysis_charts import *
from ODD.analysis.dataset_selection import get_datasets_to_use
from ODD.analysis.validation_set import  get_data_df
from tqdm import tqdm
import altair as alt 
alt.data_transformers.disable_max_rows()
# alt.renderers.enable('png')
from collections import defaultdict


In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Config

In [5]:
# Version tag of this analysis.
VERSION = 1

# Algorithms that every helper below loads and compares.
algorithms = ['CBLOF', 'HBOS', 'IForest', 'KNN', 'LOF', 'OCSVM']

# Grid version per algorithm (presumably the hyperparameter-grid version -- confirm
# against ODD.analysis.hyperparameters); algorithms not listed fall back to 1.
grid_versions_to_use = defaultdict(lambda: 1)
grid_versions_to_use.update({'HBOS': 2, 'IForest': 2, 'CBLOF': 2, 'OCSVM': 3})

# Input/output directories, relative to the current working directory.
result_path = Path() / 'results'
processed_path = Path() / 'processed_results_v5'

In [6]:
def big_chart(chart):
    """Return `chart` restyled with large fonts and a boxed legend on the right.

    Calls `configure_axis`, `configure_legend` and `configure_title` on the
    chart, in that order, and returns the result of the last call.
    """
    axis_options = dict(labelFontSize=25, titleFontSize=25)
    legend_options = dict(
        titleFontSize=25,
        labelFontSize=23,
        strokeColor="black",
        strokeWidth=3,
        padding=10,
        cornerRadius=0,
        symbolSize=300,
        symbolStrokeWidth=4,
        # a limit of 0 is passed so labels/titles are not truncated -- confirm in the Vega-Lite docs
        labelLimit=0,
        titleLimit=0,
        orient="right",
    )
    title_options = dict(fontSize=25, subtitleFontSize=25)

    with_axis = chart.configure_axis(**axis_options)
    with_legend = with_axis.configure_legend(**legend_options)
    return with_legend.configure_title(**title_options)

## Helper functions to read results

Some of these helpers are probably no longer relevant.

In [7]:
# Experiment-setting columns that are kept as grouping keys when they occur in a result frame.
COLUMNS_TO_KEEP = ['significance_p', 'size', 'anom_multiplier', 'run_idx']


def read_all_algos_from_dir(path, datasets_to_use):
    """Load `<algo>.pkl` for every entry of `algorithms` from `path` and stack them.

    The frames are concatenated with an extra `algorithm_name` index level. Only
    the (dataset_id, anomaly_fraction) groups whose key occurs in
    `datasets_to_use.index` are kept. The result is returned with its index
    reset to columns.
    """
    per_algorithm = []
    for algo in algorithms:
        per_algorithm.append(pd.read_pickle(path/f"{algo}.pkl"))

    stacked = pd.concat(per_algorithm, axis=0, keys=algorithms, names=['algorithm_name'])

    # keep a group only when its (dataset_id, anomaly_fraction) key is a selected dataset
    by_dataset = stacked.groupby(['dataset_id', 'anomaly_fraction'])
    selected = by_dataset.filter(lambda group: group.name in datasets_to_use.index)
    return selected.reset_index()
    
def get_validation_performance_df(experiment_name,datasets_to_use, reference_validation_df):
    """Average test performance per algorithm for every setting of one experiment.

    Every non-hidden sub-directory of `processed_path/experiment_name` is treated
    as one setting. For each setting the per-algorithm results are loaded, the
    stored `test_auc`/`test_ap`/`validation_auc` columns are dropped and
    recomputed through `calculate_validation_set_performances` with
    `reference_validation_df`, and the scores are averaged over the datasets.

    Parameters
    ----------
    experiment_name : str
        Name of the experiment directory below `processed_path`.
    datasets_to_use : pandas.DataFrame
        Frame whose index selects the datasets (see `read_all_algos_from_dir`).
    reference_validation_df : pandas.DataFrame
        Reference validation sets handed to `calculate_validation_set_performances`.

    Returns
    -------
    pandas.DataFrame
        One row per (algorithm, setting) with mean `test_auc`/`test_ap` and a
        constant `reference` column equal to 'tuned'.
    """
    experiment_path = processed_path/experiment_name

    # Hidden directories (e.g. '.ipynb_checkpoints') are skipped. Sorting makes the
    # row order of the result reproducible, because iterdir() order is arbitrary.
    result_directories = sorted(
        directory for directory in experiment_path.iterdir()
        if directory.is_dir() and not directory.name.startswith('.')
    )

    dfs = []
    for setting_path in result_directories:
        result_df = read_all_algos_from_dir(setting_path, datasets_to_use)
        # only the setting columns that exist for this experiment can be grouping keys
        columns_to_keep = [column for column in COLUMNS_TO_KEEP if column in result_df.columns]
        result_df = (
            result_df
            # drop the stored scores ...
            .drop(columns = ['test_auc', 'test_ap', 'validation_auc'])
            # ... and recalculate test_auc and test_ap using the standard validation set
            .pipe(calculate_validation_set_performances, reference_validation_df)
            # average performances over the datasets
            .groupby(['algorithm_name']+columns_to_keep)[['test_auc', 'test_ap']].mean()
            .reset_index()
        )
        dfs.append(result_df)

    validation_performance_df = pd.concat(dfs, axis = 0, ignore_index = True)
    validation_performance_df['reference'] = 'tuned'
    return validation_performance_df

def get_reference_performance_df(references, datasets_to_use, reference_validation_df):
    """Average test performance per algorithm for each reference configuration.

    For every name in `references` the results under `processed_path/<name>` are
    loaded, scored through `calculate_validation_set_performances` and averaged
    over the datasets. The per-reference frames are stacked under a `reference`
    key, and the folder names are replaced by their display names.
    """
    display_names = {'default_performance': 'out-of-the-box', 'peak_performance':'peak', 'best_average_performance':'best-default'}

    per_reference = []
    for reference in references:
        raw_df = read_all_algos_from_dir(processed_path/reference, datasets_to_use)
        # calculate test_ap and test_auc
        scored_df = calculate_validation_set_performances(raw_df, reference_validation_df)
        # average performance over the datasets
        averaged_df = scored_df.groupby('algorithm_name')[['test_auc', 'test_ap']].mean()
        per_reference.append(averaged_df.reset_index())

    # the outer index level carries the reference name; reset_index turns it into a column
    stacked = pd.concat(per_reference, axis = 0, keys = references, names = ['reference']).reset_index()
    return stacked.assign(reference = lambda x: x.reference.replace(display_names))
     

In [8]:
def get_validation_and_reference_dfs(validation_experiment, references, datasets_to_use):
    """Build the (validation, reference) performance frames for a single-run experiment.

    Reads `reference_validation.pkl` of the experiment, recomputes the
    best-average-performance results when that reference is requested, and then
    delegates to `get_validation_performance_df` and
    `get_reference_performance_df`.

    Returns a tuple `(full_validation_performance_df, full_reference_df)`.
    """
    print(f"using {len(datasets_to_use)} datasets in the comparison")

    # the stored column is renamed to 'validation_indices' before it is handed on
    reference_pickle = processed_path/validation_experiment/'reference_validation.pkl'
    reference_validation_df = pd.read_pickle(reference_pickle).rename(
        columns = {'reference_validation_set':'validation_indices'}
    )

    # NOTE(review): the raw-results directory is hard-coded here and duplicates
    # the `result_path` constant of the config cell.
    needs_best_average = 'best_average_performance' in references
    if needs_best_average:
        print('calculating best average performance')
        calculate_best_average_performance(
            Path()/'results',
            algorithms,
            grid_versions_to_use,
            datasets_to_use,
            processed_path/'best_average_performance',
        )

    validation_df = get_validation_performance_df(validation_experiment, datasets_to_use, reference_validation_df)
    reference_df = get_reference_performance_df(references, datasets_to_use, reference_validation_df)
    return validation_df, reference_df
        
        

In [9]:
    
def get_validation_performance_df_multiple_runs(experiment_name,datasets_to_use, reference_validation_dfs):
    """Mean and std (over runs) of the average test performance per setting.

    Every non-hidden sub-directory of `processed_path/experiment_name` is one
    setting, and `run{i}` inside it holds the results of run `i`. Run `i` is
    scored against `reference_validation_dfs[i]`.

    Parameters
    ----------
    experiment_name : str
        Name of the experiment directory below `processed_path`.
    datasets_to_use : pandas.DataFrame
        Frame whose index selects the datasets (see `read_all_algos_from_dir`).
    reference_validation_dfs : list of pandas.DataFrame
        One reference validation frame per run, ordered by run index.

    Returns
    -------
    pandas.DataFrame
        One row per (algorithm, setting), including an extra 'Average' pseudo
        algorithm, with `<metric>_mean` / `<metric>_std` columns and a constant
        `reference` column equal to 'tuned'.

    Raises
    ------
    ValueError
        If no setting directory or no run is available, or (from `list.remove`)
        if the results have no `run_idx` column.
    """
    path = processed_path/experiment_name

    # look at all experiments (hidden directories such as .ipynb_checkpoints are skipped)
    result_directories = [directory for directory in path.iterdir() if (directory.is_dir() and not directory.name.startswith('.'))]

    dfs = []
    columns_to_keep = None
    parameter_columns = None
    # for each setting calculate the average performance on the test set
    for setting_path in result_directories:
        for run_idx, reference_validation_df in enumerate(reference_validation_dfs):
            run_path = setting_path/f'run{run_idx}'
            result_df = read_all_algos_from_dir(run_path, datasets_to_use)
            if columns_to_keep is None:
                # determined once from the first result frame and reused for all others
                columns_to_keep  = [column for column in COLUMNS_TO_KEEP if column in result_df.columns]
                # the setting columns without the run index: the keys for aggregating over runs
                parameter_columns = list(columns_to_keep)
                parameter_columns.remove('run_idx')
            result_df = (
                result_df
                # drop the stored scores ...
                .drop(columns = ['test_auc', 'test_ap', 'validation_auc'])
                # ... and recalculate test_auc and test_ap using the standard validation set
                .pipe(calculate_validation_set_performances, reference_validation_df)
                # average performances over the datasets
                .groupby(['algorithm_name']+columns_to_keep)[['test_auc', 'test_ap']].mean()
            )
            dfs.append(result_df)

    if not dfs:
        # fail with an actionable message instead of pandas' "No objects to concatenate"
        raise ValueError(
            f"no results to aggregate for experiment {experiment_name!r}: "
            f"found {len(result_directories)} setting directories in {path} "
            f"and {len(reference_validation_dfs)} reference validation frames"
        )

    validation_performance_df = (
        # add all the dfs together
        pd.concat(dfs, axis = 0).reset_index()
        # hack to add the average algorithm here!
        .pipe(lambda df:
              pd.concat(
                  [df,
                  # the average performance over algorithms
                  (
                      df
                      .assign(algorithm_name = 'Average')
                      # average over algorithms
                      .groupby(['algorithm_name']+columns_to_keep).mean().reset_index()
                  )],
                  axis = 0
              )
        )
        # take average and std over runs
        .groupby(['algorithm_name'] + parameter_columns).agg(['mean', 'std'])
        .assign(
            reference = 'tuned'
        )
    )
    # flatten the multi-index in columns: ('test_auc', 'mean') -> 'test_auc_mean', ('reference', '') -> 'reference'
    validation_performance_df.columns = ['_'.join(tup) if len(tup[1])>0 else tup[0] for tup in validation_performance_df.columns]

    return validation_performance_df.reset_index()

def add_average_algorithm(df, columns_to_keep):
    """Append an 'Average' pseudo-algorithm holding the mean over all algorithms.

    The original stub built a groupby object, discarded it and returned None.
    This version returns the frame that the inline "average algorithm" hack in
    `get_validation_performance_df_multiple_runs` produces, and uses the same
    'Average' label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an `algorithm_name` column, the `columns_to_keep` columns and
        otherwise only numeric columns.
    columns_to_keep : list of str
        Setting columns within which the average is taken.

    Returns
    -------
    pandas.DataFrame
        `df` followed by one 'Average' row per combination of `columns_to_keep`.
    """
    average_df = (
        df
        .assign(
            algorithm_name = 'Average'
        )
        # every row now shares one algorithm_name, so the mean runs over the algorithms
        .groupby(['algorithm_name'] + list(columns_to_keep))
        .mean()
        .reset_index()
    )
    return pd.concat([df, average_df], axis = 0)
def get_reference_performance_df_multiple_runs(references, datasets_to_use, reference_validation_dfs):
    """Average reference performance per algorithm, averaged over all runs.

    For every reference the results under `processed_path/<reference>` are
    loaded once and scored against each frame in `reference_validation_dfs`.
    The per-run dataset averages are then averaged over the runs. An extra
    'Average' pseudo-algorithm (mean over algorithms) is appended, and the
    reference folder names are replaced by their display names.

    Raises
    ------
    ValueError
        If `references` or `reference_validation_dfs` is empty.
    """
    dfs = []
    for reference in references:
        path = processed_path/reference

        # the raw results do not depend on the run, so they are read once per reference
        result_df = read_all_algos_from_dir(path, datasets_to_use)
        for reference_validation_df in reference_validation_dfs:
            performance_df = (
                result_df
                # calculate test_ap and test_auc
                .pipe(calculate_validation_set_performances, reference_validation_df)
                # average performance over the datasets
                .groupby('algorithm_name')[['test_auc', 'test_ap']].mean()
                .assign(reference = reference)
            )
            dfs.append(performance_df.reset_index())

    if not dfs:
        # fail with an actionable message instead of pandas' "No objects to concatenate"
        raise ValueError(
            f"no reference performances to aggregate: got {len(references)} references "
            f"and {len(reference_validation_dfs)} reference validation frames"
        )

    result_df = (
        pd.concat(dfs, axis = 0)
        # average performance over runs
        .groupby(['reference', 'algorithm_name']).mean().reset_index()
        # add average of average
        .pipe(
            lambda df: pd.concat([df, df.assign(algorithm_name = 'Average').groupby(['algorithm_name', 'reference']).mean().reset_index()])
        )
        # note: this reset_index keeps the old positional index as an extra 'index' column
        .reset_index()
        .assign(
            reference = lambda x: x.reference.replace({'default_performance': 'out-of-the-box', 'peak_performance':'peak', 'best_average_performance':'best-default'})
        )
    )
    return result_df
     

In [10]:
def get_validation_and_reference_dfs_multiple_runs(validation_experiment, references, datasets_to_use):
    """Build the (validation, reference) performance frames for a multi-run experiment.

    Reads every `reference_validation_run<i>.pkl` of the experiment in
    *numeric* run order, recomputes the best-average-performance results when
    that reference is requested, and delegates to the `*_multiple_runs`
    helpers. The numeric order matters because the helpers pair the i-th
    reference frame with the `run{i}` result directory.

    Returns
    -------
    tuple of pandas.DataFrame
        `(full_validation_performance_df, full_reference_df)`.

    Raises
    ------
    ValueError
        If the experiment directory contains no reference validation files.
    """
    import re  # local import: only needed to parse the run index from file names

    print(f"using {len(datasets_to_use)} datasets in the comparison")

    def _run_order(reference_df_path):
        # Sort by the integer run index. A plain name sort would put 'run10'
        # before 'run2' and misalign references and runs once there are more
        # than 10 runs. Files without a numeric index go last, ordered by name.
        match = re.fullmatch(r'reference_validation_run(\d+)\.pkl', reference_df_path.name)
        if match is None:
            return (1, 0, reference_df_path.name)
        return (0, int(match.group(1)), reference_df_path.name)

    experiment_path = processed_path/validation_experiment
    reference_df_paths = sorted(experiment_path.glob('reference_validation_run*.pkl'), key = _run_order)
    if not reference_df_paths:
        # fail before the expensive recomputation below rather than deep inside pd.concat
        raise ValueError(f"no 'reference_validation_run*.pkl' files found in {experiment_path}")

    # get the reference_validation_dfs, one per run
    reference_validation_dfs = [pd.read_pickle(reference_df_path).rename(columns = {'reference_validation_set':'validation_indices'}) for reference_df_path in reference_df_paths]

    # calculate the best average performance using the correct datasets
    if 'best_average_performance' in references:
        print('calculating best average performance')
        calculate_best_average_performance(Path()/'results', algorithms, grid_versions_to_use, datasets_to_use, processed_path/'best_average_performance')

    # get the validation settings dataframe
    full_validation_performance_df = get_validation_performance_df_multiple_runs(
        validation_experiment,
        datasets_to_use,
        reference_validation_dfs
    )
    # get the references settings dataframe
    full_reference_df = get_reference_performance_df_multiple_runs(
        references,
        datasets_to_use,
        reference_validation_dfs
    )

    return full_validation_performance_df, full_reference_df
        
        

## Plot and process the results

Again, some of these functions are probably no longer relevant.

In [11]:
def validation_set_overview_performance(
    full_validation_performance_df,
    full_reference_df,
    reverse = False,
    format = None,
    x_axis = 'validation set size',
    x_axis_title = 'validation set size'
   ):
    """One bar-chart panel per algorithm, horizontally concatenated.

    Each panel shows `test_auc` of the tuned settings as bars over `x_axis`,
    with the reference performances drawn as horizontal rules. Only the first
    panel carries a y-axis title; the others hide the axis title and labels.
    `format` is accepted but not used.
    """
    reference_names = sorted(full_reference_df.reference.unique())

    panels = []
    for position, algo in enumerate(algorithms):
        algo_validation_df = full_validation_performance_df[full_validation_performance_df.algorithm_name == algo]
        algo_reference_df = full_reference_df[full_reference_df.algorithm_name == algo]

        # the two panel variants differ only in how the y channel is labelled
        if position == 0:
            y_options = {'title': 'average AUC'}
        else:
            y_options = {'axis': alt.Axis(title = None, labels = False)}

        bars = alt.Chart(algo_validation_df).mark_bar(clip = True).encode(
            x = alt.X(f'{x_axis}:O', scale = alt.Scale(reverse = reverse), title = x_axis_title),
            y = alt.Y('test_auc:Q', scale = alt.Scale(domain = [0.65, 0.9]), **y_options),
            color = alt.Color(
                'reference:N',
                legend = alt.Legend(title = 'Performance'),
                scale = alt.Scale(domain = ['tuned'] + reference_names),
            ),
        )
        rules = alt.Chart(algo_reference_df).mark_rule(strokeWidth = 4).encode(
            y = alt.Y('test_auc:Q', scale = alt.Scale(zero = False)),
            color = 'reference:N',
        )

        panel = (
            alt.layer(bars, rules)
            .properties(title = algo, height = HEIGHT, width = WIDTH)
            .resolve_scale(y = 'shared')
        )
        panels.append(panel)

    combined = alt.hconcat(*panels)
    return combined.resolve_scale(y = 'shared').resolve_axis(y = 'shared')
    

In [12]:
def validation_set_overview_performance_multiple_runs(
    full_validation_performance_df,
    full_reference_df,
    reverse = False,
    format = None,
    x_axis = 'validation set size',
    x_axis_title = 'validation set size'
   ):
    """One bar-chart panel with error bars per algorithm, horizontally concatenated.

    Panels follow the reverse-sorted algorithm names found in the validation
    frame. Bars show `test_auc_mean`, vertical rules span mean +/- one
    `test_auc_std`, and the reference performances are horizontal rules. Only
    the first panel carries a y-axis title. `format` is accepted but not used.
    """
    reference_names = sorted(full_reference_df.reference.unique())
    algorithm_names = sorted(full_validation_performance_df.algorithm_name.unique(), reverse = True)

    panels = []
    for algo in algorithm_names:
        # rows of this algorithm, extended with the lower/upper end of the error bar
        algo_validation_df = (
            full_validation_performance_df[full_validation_performance_df.algorithm_name == algo]
            .assign(
                min_error = lambda rows: rows.test_auc_mean - rows.test_auc_std,
                max_error = lambda rows: rows.test_auc_mean + rows.test_auc_std
            )
        )
        algo_reference_df = full_reference_df[full_reference_df.algorithm_name == algo]

        is_first_panel = len(panels) == 0
        if is_first_panel:
            y_axis = alt.Axis(title = 'average AUC')
        else:
            y_axis = alt.Axis(title = None, labels = False)

        # a single scale object is reused by all three layers of the panel
        y_scale = alt.Scale(domain = [0.65, 0.905])

        bars = alt.Chart(algo_validation_df).mark_bar(clip = True).encode(
            x = alt.X(f'{x_axis}:O', scale = alt.Scale(reverse = reverse), title = x_axis_title),
            y = alt.Y('test_auc_mean:Q', scale = y_scale, axis = y_axis),
            color = alt.Color(
                'reference:N',
                legend = alt.Legend(title = 'Performance'),
                scale = alt.Scale(domain = ['tuned'] + reference_names),
            ),
        )
        error_bars = alt.Chart(algo_validation_df).mark_rule().encode(
            x = alt.X(f'{x_axis}:O', scale = alt.Scale(reverse = reverse), title = x_axis_title),
            y = alt.Y('min_error:Q', scale = y_scale),
            y2 = alt.Y2('max_error:Q'),
        )
        reference_rules = alt.Chart(algo_reference_df).mark_rule(strokeWidth = 4).encode(
            y = alt.Y('test_auc:Q', scale = y_scale),
            color = 'reference:N',
        )

        panel = (
            alt.layer(bars, error_bars, reference_rules)
            .resolve_scale(y = 'shared')
            .properties(title = algo, height = HEIGHT, width = WIDTH)
        )
        panels.append(panel)

    combined = alt.hconcat(*panels)
    return combined.resolve_scale(y = 'shared').resolve_axis(y = 'shared')
    

In [13]:
def validation_set_overview_performance_average(
    full_validation_performance_df,
    full_reference_df,
    reverse = False,
    format = None,
    x_axis = 'validation set size',
    x_axis_title = 'test'
   ):
    """Single 'Average' panel: performance averaged over all algorithms.

    Bars show the mean `test_auc` per (`reference`, `x_axis`) value of the
    tuned settings; horizontal rules show the mean `test_auc` per reference.
    The y axis has no title or labels, because the panel is meant to be placed
    next to the per-algorithm panels. `format` is accepted but not used.
    """
    reference_names = list(sorted(full_reference_df.reference.unique()))

    # Take the average over all algorithms. numeric_only=True is required because
    # both frames still contain the string column `algorithm_name`: recent pandas
    # versions raise on it, while older ones silently dropped it (the same result
    # as numeric_only=True).
    validation_performance_df = (
        full_validation_performance_df
        .groupby(['reference', x_axis])
        .mean(numeric_only = True)
        .reset_index()
    )
    reference_df = full_reference_df.groupby('reference').mean(numeric_only = True).reset_index()

    validation_chart = alt.Chart(validation_performance_df).mark_bar(clip =  True).encode(
        x = alt.X(f'{x_axis}:O', scale = alt.Scale(reverse = reverse), title = x_axis_title),
        y = alt.Y(f'test_auc:Q', scale = alt.Scale(domain = [0.65, 0.9]), axis = alt.Axis( title = None, labels = False)),
        color = alt.Color('reference:N',legend = alt.Legend( title = 'Performance'), scale = alt.Scale(domain = ['tuned'] + reference_names))
    )

    reference_chart = alt.Chart(reference_df).mark_rule(strokeWidth = 4).encode(
        y = alt.Y(f'test_auc:Q', scale = alt.Scale(zero = False)),
        color = 'reference:N'
    )

    chart = alt.layer(validation_chart, reference_chart).resolve_scale(y = 'shared').resolve_axis(y='shared').properties(title = 'Average', height = HEIGHT, width = WIDTH)
    return chart
    

In [14]:
def compare_plot_probabilities(
    experiment_name,
    references,
    datasets_to_use
):
    """Per-algorithm panels plus an 'Average' panel with `significance_p` on the x axis.

    Loads the single-run validation and reference frames of `experiment_name`
    and places the per-algorithm chart next to the average chart. The x axis is
    reversed.
    """
    full_validation_performance_df, full_reference_df = get_validation_and_reference_dfs(experiment_name, references, datasets_to_use)
    # The original passed `format = format`, i.e. the Python builtin, because no
    # local `format` exists. The chart helpers ignore the argument, so it is omitted.
    chart1 = validation_set_overview_performance(full_validation_performance_df, full_reference_df, reverse = True, x_axis = 'significance_p', x_axis_title = 'p')
    chart2 = validation_set_overview_performance_average(full_validation_performance_df, full_reference_df, reverse = True, x_axis = 'significance_p', x_axis_title = 'p')
    return (chart1 | chart2.properties(title = 'Average')).resolve_axis(y='shared')

In [15]:
def compare_plot_fixed_size_bars(
    experiment_name,
    references,
    datasets_to_use,
):
    """Per-algorithm panels plus an 'Average' panel with `size` on the x axis.

    Loads the single-run validation and reference frames of `experiment_name`.
    Asserts that at most one distinct `anom_multiplier` value is present.
    """
    full_validation_performance_df, full_reference_df = get_validation_and_reference_dfs(experiment_name, references, datasets_to_use)
    assert 'anom_multiplier' not in full_validation_performance_df.columns or len(full_validation_performance_df.anom_multiplier.unique()) == 1, 'there should only be one multiplier here!'
    # The original passed `format = format`, i.e. the Python builtin, because no
    # local `format` exists. The chart helpers ignore the argument, so it is omitted.
    chart1 = validation_set_overview_performance(full_validation_performance_df, full_reference_df, reverse = False, x_axis = 'size', x_axis_title = 'size')
    chart2 = validation_set_overview_performance_average(full_validation_performance_df, full_reference_df, reverse = False, x_axis = 'size', x_axis_title = 'size')
    return (chart1 | chart2.properties(title = 'Average')).resolve_axis(y='shared')

In [16]:
def compare_plot_fixed_size_bars_multiple_runs(
    experiment_name,
    references,
    datasets_to_use,
):
    """Per-algorithm bar chart with error bars over `size` for a multi-run experiment.

    Loads the multi-run validation and reference frames of `experiment_name`.
    Asserts that at most one distinct `anom_multiplier` value is present.
    """
    full_validation_performance_df, full_reference_df = get_validation_and_reference_dfs_multiple_runs(experiment_name, references, datasets_to_use)
    assert 'anom_multiplier' not in full_validation_performance_df.columns or len(full_validation_performance_df.anom_multiplier.unique()) == 1, 'there should only be one multiplier here!'
    # The original passed `format = format`, i.e. the Python builtin, because no
    # local `format` exists. The chart helper ignores the argument, so it is omitted.
    chart1 = validation_set_overview_performance_multiple_runs(full_validation_performance_df, full_reference_df, reverse = False, x_axis = 'size', x_axis_title = 'max size')
    return chart1

In [17]:
def compare_plot_statistical_size_bars_multiple_runs(
    experiment_name,
    references,
    datasets_to_use,
):
    """Per-algorithm bar chart with error bars for a multi-run experiment, x axis titled 'p'.

    Loads the multi-run validation and reference frames of `experiment_name`,
    displays the validation frame sorted by algorithm, and returns the chart.
    The `size` column is plotted on a reversed x axis.
    """
    full_validation_performance_df, full_reference_df = get_validation_and_reference_dfs_multiple_runs(experiment_name, references, datasets_to_use)
    # show the underlying numbers next to the chart (`display` is provided by IPython)
    display(full_validation_performance_df.set_index('algorithm_name').sort_index())
    # The original passed `format = format`, i.e. the Python builtin, because no
    # local `format` exists. The chart helper ignores the argument, so it is omitted.
    chart1 = validation_set_overview_performance_multiple_runs(full_validation_performance_df, full_reference_df, reverse = True, x_axis = 'size', x_axis_title = 'p')
    return chart1

In [18]:
def compare_plot_fixed_size_lines(
    experiment_name,
    references,
    datasets_to_use,
):
    """Line chart of the algorithm-averaged `test_auc` over `size`, one line per `anom_multiplier`.

    Loads the single-run validation and reference frames of `experiment_name`,
    averages them over all algorithms, and draws the reference performances as
    horizontal rules.
    """
    full_validation_performance_df, full_reference_df = get_validation_and_reference_dfs(experiment_name, references, datasets_to_use)

    reference_names = list(sorted(full_reference_df.reference.unique()))

    # Take the average over all algorithms. numeric_only=True is required because
    # both frames still contain the string column `algorithm_name`: recent pandas
    # versions raise on it, while older ones silently dropped it (the same result
    # as numeric_only=True).
    validation_performance_df = (
        full_validation_performance_df
        .groupby(['reference','size', 'anom_multiplier'])
        .mean(numeric_only = True)
        .reset_index()
    )
    reference_df = full_reference_df.groupby('reference').mean(numeric_only = True).reset_index()

    validation_chart = alt.Chart(validation_performance_df).mark_line(clip =  True).encode(
        x = alt.X(f'size:O', scale = alt.Scale(reverse = False)),
        y = alt.Y(f'test_auc:Q', scale = alt.Scale(), axis = alt.Axis( title = 'average AUC')),
        color = alt.Color('reference:N',legend = alt.Legend( title = 'Performance type'), scale = alt.Scale(domain = ['tuned'] + reference_names)),
        strokeDash = alt.StrokeDash('anom_multiplier:O', title = 'Anomaly multiplier')
    )

    reference_chart = alt.Chart(reference_df).mark_rule(strokeWidth = 4).encode(
        y = alt.Y(f'test_auc:Q', scale = alt.Scale(zero = False)),
        color = 'reference:N'
    )

    chart = alt.layer(validation_chart, reference_chart).resolve_scale(y = 'shared').properties(title = 'Average', height = HEIGHT, width = WIDTH)

    return chart

In [19]:
# Height and width passed to `.properties(...)` of every chart panel.
HEIGHT, WIDTH = 250, 150

# Directory names of the reference results below `processed_path` ...
REFERENCES = ['default_performance', 'peak_performance', 'best_average_performance']
# ... and the display names they are mapped to in the charts, in the same order.
REFERENCE_NAMES = ['out-of-the-box', 'peak', 'best-default']

## Actual plots in the paper

In [20]:
# Load the table of datasets used in the comparison; its index is matched against
# the (dataset_id, anomaly_fraction) groups in `read_all_algos_from_dir`.
datasets_to_use = pd.read_pickle(Path()/'used_datasets'/'used_stat_datasets.pkl')
# last expression of the cell: render the table
datasets_to_use

Unnamed: 0_level_0,Unnamed: 1_level_0,index,dataset_version
dataset_id,anomaly_fraction,Unnamed: 2_level_1,Unnamed: 3_level_1
ALOI,3,1,
Annthyroid,7,6,
InternetAds,19,17,
KDDCup99,0,21,
Lymphography,4,25,
PageBlocks,9,30,
Stamps,9,42,
Arrhythmia,5,7,1.0
Cardiotocography,2,10,1.0
Ionosphere,10,19,1.0


# Statistical validation set multiple runs

In [22]:
# Per-algorithm bar chart (mean and std over runs) for the
# 'statistical_validation_set_multiple_10runs' experiment, compared against all REFERENCES.
chart = compare_plot_statistical_size_bars_multiple_runs('statistical_validation_set_multiple_10runs', REFERENCES, datasets_to_use)
# enlarge fonts and legend before rendering
big_chart(chart)

using 16 datasets in the comparison
calculating best average performance


Unnamed: 0_level_0,size,run_idx_mean,run_idx_std,test_auc_mean,test_auc_std,test_ap_mean,test_ap_std,reference
algorithm_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Average,0.01,4.5,3.02765,0.832034,0.006181,0.390167,0.014358,tuned
Average,0.05,4.5,3.02765,0.812741,0.00854,0.363244,0.009605,tuned
Average,0.1,4.5,3.02765,0.789778,0.008446,0.337867,0.011013,tuned
CBLOF,0.01,4.5,3.02765,0.830699,0.008855,0.425203,0.009927,tuned
CBLOF,0.05,4.5,3.02765,0.799546,0.02358,0.385028,0.025434,tuned
CBLOF,0.1,4.5,3.02765,0.770827,0.033979,0.355104,0.018511,tuned
HBOS,0.01,4.5,3.02765,0.817657,0.007617,0.362852,0.00884,tuned
HBOS,0.05,4.5,3.02765,0.814782,0.008871,0.366921,0.012363,tuned
HBOS,0.1,4.5,3.02765,0.809654,0.011981,0.364697,0.015291,tuned
IForest,0.01,4.5,3.02765,0.855703,0.007135,0.422541,0.021393,tuned


# Fixed validation set multiple runs with limit

In [23]:
# Per-algorithm bar chart (mean and std over runs) for the
# 'absolute_validation_set_multiple_10runs_max_25' experiment, compared against all REFERENCES.
chart = compare_plot_fixed_size_bars_multiple_runs('absolute_validation_set_multiple_10runs_max_25', REFERENCES, datasets_to_use)
# enlarge fonts and legend before rendering
big_chart(chart)

using 16 datasets in the comparison
calculating best average performance
