# Make comparison between different hyperparameter selection strategies

To simplify the result processing later on, we preprocess the results of the different strategies here and combine them into a single dataframe for easy comparison.

## Config and imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd 
import numpy as np
from pathlib import Path
from ODD.analysis.hyperparameters import select_peak_performance, select_best_average_performance, calculate_best_average_performance, calculate_validation_set_performances
from ODD.analysis.result_processing import average_performance_per_method, average_aligned_ranks_with_versions, average_ranks_with_versions, average_ranks_with_versions_and_nemenyi
from ODD.analysis.result_analysis_charts import *
from ODD.analysis.dataset_selection import get_datasets_to_use
from ODD.analysis.validation_set import  get_data_df
from tqdm import tqdm
import altair as alt 
alt.data_transformers.disable_max_rows()
# alt.renderers.enable('png')
from collections import defaultdict


In [None]:
VERSION = 1

# Algorithms to compare; each one is read from its own `<name>.pkl` result file.
algorithms = [
    'CBLOF',
    'HBOS',
    'IForest',
    'KNN',
    'LOF',
    'OCSVM',
]

# Reference strategies; each name is also a sub-directory of `processed_path`.
REFERENCES = [
    'default_performance',
    'peak_performance',
    'best_average_performance',
]

# Grid version per algorithm: version 1 unless explicitly overridden here.
grid_versions_to_use = defaultdict(
    lambda: 1,
    {'HBOS': 2, 'IForest': 2, 'CBLOF': 2, 'OCSVM': 3},
)

# Input / output locations, all relative to the current working directory.
result_path = Path('results')
processed_path = Path('processed_results_v5')
comparison_path = Path('comparisons')


In [None]:
# Load the table of datasets that take part in the comparison.
datasets_to_use = pd.read_pickle(Path('used_datasets', 'used_stat_datasets.pkl'))
# The trailing semicolon keeps the notebook from echoing the table.
datasets_to_use;

In [None]:
# Candidate experiment-setting columns; only the ones actually present in a
# result frame are used when grouping the results.
COLUMNS_TO_KEEP = [
    'significance_p',
    'size',
    'anom_multiplier',
    'run_idx',
]

## Helper methods

In [None]:
def read_all_algos_from_dir(path, datasets_to_use):
    """Read the result pickle of every algorithm in `path` into one dataframe.

    One `<algorithm>.pkl` file is read per entry of the module-level
    `algorithms` list. The frames are stacked with the algorithm name as an
    extra `algorithm_name` index level, and only the
    (dataset_id, anomaly_fraction) groups whose key occurs in
    `datasets_to_use.index` are kept.

    Returns the filtered dataframe with its index reset to columns.
    """
    per_algorithm = []
    for algo in algorithms:
        per_algorithm.append(pd.read_pickle(path / f"{algo}.pkl"))

    stacked = pd.concat(
        per_algorithm,
        axis=0,
        keys=algorithms,
        names=['algorithm_name'],
    )

    def _is_selected(group):
        # group.name is the (dataset_id, anomaly_fraction) key of the group
        return group.name in datasets_to_use.index

    grouped = stacked.groupby(['dataset_id', 'anomaly_fraction'])
    return grouped.filter(_is_selected).reset_index()

In [None]:
def get_validation_performance_df_multiple_runs(experiment_name, datasets_to_use, reference_validation_dfs):
    """Average the re-scored results of a validation experiment over its runs.

    Every non-hidden sub-directory of `processed_path/experiment_name` is
    treated as one experimental setting. For each setting and each run index
    `i` (one per entry of `reference_validation_dfs`) the results in
    `<setting>/run<i>` are read for all algorithms, the stored `test_auc`,
    `test_ap` and `validation_auc` columns are dropped, and the frame is passed
    to `calculate_validation_set_performances` together with the i-th
    reference validation frame. The resulting `test_auc` / `test_ap` values are
    then averaged per algorithm, dataset and setting parameters.

    Parameters
    ----------
    experiment_name : name of the experiment directory below `processed_path`.
    datasets_to_use : dataframe whose index selects the datasets to keep.
    reference_validation_dfs : sequence with one reference validation frame
        per run; position `i` is paired with directory `run<i>`.

    Returns
    -------
    DataFrame with columns `algorithm_name`, `dataset_id`, the setting
    parameter columns, `auc`, `ap` and `reference` (always 'tuned').

    Raises
    ------
    ValueError
        If no setting directory or no run is available.
    """
    path = processed_path / experiment_name

    # every non-hidden sub-directory is one experimental setting
    result_directories = [
        directory for directory in path.iterdir()
        if directory.is_dir() and not directory.name.startswith('.')
    ]

    dfs = []
    parameter_columns = None
    for setting_path in result_directories:
        for run_idx, reference_validation_df in enumerate(reference_validation_dfs):
            run_path = setting_path / f'run{run_idx}'
            result_df = read_all_algos_from_dir(run_path, datasets_to_use)
            if parameter_columns is None:
                # Setting columns present in the results. 'run_idx' is left out
                # because the results are averaged over runs; filtering (rather
                # than list.remove) also works when that column is absent.
                parameter_columns = [
                    column for column in COLUMNS_TO_KEEP
                    if column in result_df.columns and column != 'run_idx'
                ]
            result_df = (
                result_df
                # drop the stored scores ...
                .drop(columns=['test_auc', 'test_ap', 'validation_auc'])
                # ... and recalculate them using the reference validation set of this run
                .pipe(calculate_validation_set_performances, reference_validation_df)
            )
            dfs.append(result_df)

    if not dfs:
        raise ValueError(
            f"no results to aggregate for experiment {experiment_name!r}: "
            f"found {len(result_directories)} setting directories in {path} "
            f"and {len(reference_validation_dfs)} reference validation sets"
        )

    validation_performance_df = (
        # add all the dfs together
        pd.concat(dfs, axis=0).reset_index()
        # take the average over runs
        .groupby(['algorithm_name', 'dataset_id'] + parameter_columns)[['test_auc', 'test_ap']].mean()
        .rename(columns={'test_auc': 'auc', 'test_ap': 'ap'})
        .assign(
            reference='tuned'
        )
    )

    return validation_performance_df.reset_index()

In [None]:
def get_reference_performance_df_multiple_runs(references, datasets_to_use, reference_validation_dfs):
    """Average the performance of the reference strategies over all runs.

    For every name in `references` the results in `processed_path/<name>` are
    read once for all algorithms and then scored once per entry of
    `reference_validation_dfs` through `calculate_validation_set_performances`.
    The resulting `test_auc` / `test_ap` values are averaged per reference,
    algorithm and dataset.

    Parameters
    ----------
    references : iterable of reference names (sub-directories of `processed_path`).
    datasets_to_use : dataframe whose index selects the datasets to keep.
    reference_validation_dfs : iterable with one reference validation frame per run.

    Returns
    -------
    DataFrame with columns `reference`, `algorithm_name`, `dataset_id`, `auc`
    and `ap`; the reference names are mapped to their display labels
    ('out-of-the-box', 'peak', 'best-default').

    Raises
    ------
    ValueError
        If there is nothing to aggregate (no references or no runs).
    """
    dfs = []
    for reference in references:
        path = processed_path / reference

        # the stored results are the same for every run; only the scoring differs
        result_df = read_all_algos_from_dir(path, datasets_to_use)
        for reference_validation_df in reference_validation_dfs:
            performance_df = (
                result_df
                # calculate test_ap and test_auc for this run's validation set
                .pipe(calculate_validation_set_performances, reference_validation_df)
                # remember which reference strategy these rows belong to
                .assign(reference=reference)
            )
            dfs.append(performance_df.reset_index())

    if not dfs:
        raise ValueError(
            "no reference results to aggregate: "
            "`references` or `reference_validation_dfs` is empty"
        )

    result_df = (
        pd.concat(dfs, axis=0)
        # average performance over runs
        .groupby(['reference', 'algorithm_name', 'dataset_id'])[['test_auc', 'test_ap']].mean().reset_index()
        .rename(columns={'test_auc': 'auc', 'test_ap': 'ap'})
        .assign(
            reference=lambda x: x.reference.replace({'default_performance': 'out-of-the-box', 'peak_performance': 'peak', 'best_average_performance': 'best-default'})
        )
    )
    return result_df

In [None]:
def get_validation_and_reference_dfs_multiple_runs(validation_experiment, references, datasets_to_use):
    """Build the tuned and the reference performance frames for one experiment.

    Loads every `reference_validation_run*.pkl` file from
    `processed_path/validation_experiment`, optionally recomputes the
    best-average-performance reference, and then delegates to
    `get_validation_performance_df_multiple_runs` and
    `get_reference_performance_df_multiple_runs`.

    Parameters
    ----------
    validation_experiment : name of the experiment directory below `processed_path`.
    references : reference names to include; when 'best_average_performance'
        is among them it is recomputed first.
    datasets_to_use : dataframe whose index selects the datasets to keep.

    Returns
    -------
    Tuple `(full_validation_performance_df, full_reference_df)`.

    Raises
    ------
    ValueError
        If no reference validation file is found for the experiment.
    """
    # local import: only needed for the run-index sort key below
    import re

    print(f"using {len(datasets_to_use)} datasets in the comparison")

    def run_sort_key(reference_path):
        # Order by the numeric run index at the end of the file stem so that
        # run10 comes after run9. A plain name sort would put run10 before run2
        # and break the pairing of list position i with directory `run<i>`
        # downstream. Files without a trailing number sort last, by name.
        trailing_number = re.search(r'(\d+)$', reference_path.stem)
        run_number = int(trailing_number.group(1)) if trailing_number else float('inf')
        return (run_number, reference_path.name)

    reference_df_paths = sorted(
        (processed_path / validation_experiment).glob('reference_validation_run*.pkl'),
        key=run_sort_key,
    )
    if not reference_df_paths:
        # fail before the expensive recomputation below instead of deep inside pd.concat
        raise ValueError(
            f"no 'reference_validation_run*.pkl' files found in "
            f"{processed_path / validation_experiment}"
        )

    # get the reference_validation_df of every run
    reference_validation_dfs = [
        pd.read_pickle(reference_df_path).rename(columns={'reference_validation_set': 'validation_indices'})
        for reference_df_path in reference_df_paths
    ]

    # calculate the best average performance using the correct datasets
    if 'best_average_performance' in references:
        print('calculating best average performance')
        calculate_best_average_performance(Path()/'results', algorithms, grid_versions_to_use, datasets_to_use, processed_path/'best_average_performance')

    # get the validation settings dataframe
    full_validation_performance_df = get_validation_performance_df_multiple_runs(
        validation_experiment,
        datasets_to_use,
        reference_validation_dfs
    )
    # get the references settings dataframe
    full_reference_df = get_reference_performance_df_multiple_runs(
        references,
        datasets_to_use,
        reference_validation_dfs
    )

    return full_validation_performance_df, full_reference_df
        
        

In [None]:
def save_comparison(experiment_name, comparison_name):
    """Compute the comparison for one experiment and store it as a CSV file.

    The tuned and reference performance frames of `experiment_name` are
    stacked row-wise and written to
    `comparison_path/<comparison_name>/comparison.csv` (without the index).
    The target directory is created when it does not exist yet.
    """
    tuned_df, baseline_df = get_validation_and_reference_dfs_multiple_runs(
        experiment_name,
        REFERENCES,
        datasets_to_use,
    )

    target_dir = comparison_path / comparison_name
    target_dir.mkdir(parents=True, exist_ok=True)

    stacked = pd.concat([tuned_df, baseline_df], axis=0, ignore_index=True)
    stacked.to_csv(target_dir / 'comparison.csv', index=False)

## Do the actual comparison

In [None]:
# Run the comparison for the 10-run statistical validation-set experiment and
# store it under the 'statistical_validation_set_size' comparison name.
save_comparison(
    experiment_name='statistical_validation_set_multiple_10runs',
    comparison_name='statistical_validation_set_size',
)