# Calculate tuned performance

## Imports and config

In [None]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell
# is executed, so edits to imported code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd 
import numpy as np
from pathlib import Path
import altair as alt 
alt.data_transformers.disable_max_rows() 
from ODD.analysis.validation_set import (
    get_average_performance_per_dataset,
    generate_validation_sets_statistical, 
    generate_validation_sets_absolute, 
    generate_validation_sets_absolute_subset, 
    generate_validation_sets_absolute_subset_multiple_runs, 
    generate_validation_sets_statistical_subset_multiple_runs,
    get_data_df
)
from ODD.analysis.hyperparameters import select_best_validation_performance
from sklearn.model_selection import train_test_split
from numpy.random import default_rng
from sklearn.metrics import roc_auc_score
from dask.distributed import Client
from tqdm import tqdm
from numpy.random import default_rng
from collections import defaultdict

In [None]:
# Version of the grid result file (grid_<algo>_v<version>.pkl) to read for each
# algorithm; algorithms that are not listed explicitly fall back to version 1.
grid_version_to_use = defaultdict(
    lambda: 1,
    {'HBOS': 2, 'CBLOF': 2, 'IForest': 2, 'OCSVM': 3},
)
# Algorithms for which the tuned performance is calculated.
algorithms = ['CBLOF', 'HBOS', 'IForest', 'KNN', 'LOF', 'OCSVM']

In [None]:
# Input/output directories, all relative to the current working directory:
# raw grid results, configuration files and the processed output of this notebook.
result_path, config_path, processed_path = (
    Path(directory) for directory in ('results', 'config', 'processed_results_v5')
)

In [None]:
# Default for the `overwrite` argument of calculate_tuned_performance_multiple_runs:
# when False, algorithms whose result pickle already exists are skipped.
# The value is bound when that function is defined, so re-run its cell after
# changing this flag.
OVERWRITE = True

### Calculate the AUC thresholds to use for the statistical validation set size
For this we use the average default performance per dataset.

In [None]:
# Per-dataset average default performance; only the `auc` column is kept and
# exposed as a one-column frame named `auc_threshold`.
auc_thresholds = get_average_performance_per_dataset(
    processed_path / 'default_performance'
).auc.to_frame('auc_threshold')

# Show the first rows as the cell output.
auc_thresholds.head()

### Datasets to use 
If the original contamination is below 20%, use the original dataset; otherwise use one of the subsampled versions, chosen at random.

In [None]:
# Load the stored dataset selection and drop its leftover `index` column.
datasets_to_use = pd.read_pickle(
    Path('used_datasets') / 'used_stat_datasets.pkl'
).drop(columns = 'index')

# Show the selection as the cell output.
datasets_to_use

### Get the labels of the datasets we will use


In [None]:
# Label information for every dataset, taken from the HBOS grid results.
full_data_df = get_data_df(result_path / 'grid_HBOS_v2.pkl')

# Keep only the (dataset_id, anomaly_fraction) groups whose key occurs in the
# index of the selected datasets.
data_df = full_data_df.groupby(['dataset_id', 'anomaly_fraction']).filter(
    lambda group: group.name in datasets_to_use.index
)

# Show the result as the cell output.
data_df

### LaTeX code for the dataset information table in the paper

In [None]:
# Build the dataset overview table step by step.
temp_df = data_df.sort_index().drop(columns = 'dataset_version')

# Number of instances and anomalies, derived from the label vector of each dataset.
temp_df = temp_df.assign(
    nb_instances = temp_df.labels.apply(len),
    nb_anomalies = temp_df.labels.apply(np.sum),
)

# Contamination as a percentage string with one decimal.
temp_df = temp_df.assign(
    contamination = (temp_df.nb_anomalies / temp_df.nb_instances * 100).apply(lambda pct: f"{pct:.1f}%")
)

# Drop the second index level and the helper columns that should not appear in the table.
temp_df = temp_df.droplevel(level = 1).drop(columns = ['labels', 'nb_anomalies'])

print(temp_df.to_latex())

### Helper functions

In [None]:
def calculate_tuned_performance_multiple_runs(validation_dfs, experiment_name, setting_name, record, dask_client = None, overwrite = OVERWRITE):
    """
        Given a list of dataframes with validation set information, calculate the tuned performance and write to disk.

        For every run (one validation dataframe per run) the output is stored in
        processed_path/experiment_name/setting_name/run<run_idx>:
            - validation_set.csv / validation_set.pkl: the validation set that was passed in
            - <algo>.csv / <algo>.pkl: the selected performance for each entry of `algorithms`
            - dataset_info.csv: the distinct (dataset_id, anomaly_fraction, dataset_version)
              rows of the last algorithm result that was calculated in this call

        Parameters
        ----------
        validation_dfs:
            iterable of validation set dataframes, the position is used as run index
        experiment_name, setting_name:
            directory names below `processed_path`
        record:
            dict of extra constant columns that are added to every result (next to `run_idx`)
        dask_client:
            passed on to `select_best_validation_performance`
        overwrite:
            if False, algorithms for which <algo>.pkl already exists in the run directory are skipped.
            The default is the value of OVERWRITE at the moment this function is defined.
    """
    res_path = processed_path / experiment_name / setting_name

    for run_idx, validation_df in enumerate(validation_dfs):
        run_path = res_path / f"run{run_idx}"
        run_path.mkdir(parents = True, exist_ok = True)

        # for reference store the validation set you used
        # NOTE(review): this is also written when overwrite is False, so the stored validation set
        # can differ from the one behind existing (skipped) algorithm results - confirm this is intended
        validation_df.to_csv(run_path / 'validation_set.csv')
        validation_df.to_pickle(run_path / 'validation_set.pkl')

        # for each algorithm calculate best performance based on validation set
        best_performance = None
        iterator = tqdm(algorithms)
        for algo in iterator:
            iterator.set_description(algo)
            grid_version = grid_version_to_use[algo]
            if not overwrite and (run_path / f"{algo}.pkl").exists():
                continue
            # read the grid
            iterator.set_postfix({'status': 'reading grid'})
            result_df = pd.read_pickle(result_path / f'grid_{algo}_v{grid_version}.pkl')

            # select the best performance based on the validation set
            iterator.set_postfix({'status': 'selecting best validation set'})
            best_performance = select_best_validation_performance(result_df, validation_df, dask_client = dask_client)

            # assign the record information
            iterator.set_postfix({'status': 'adding additional information'})
            best_performance = best_performance.assign(run_idx = run_idx, **record)

            # save the result
            iterator.set_postfix({'status': 'saving results'})
            best_performance.to_csv(run_path / f"{algo}.csv")
            best_performance.to_pickle(run_path / f"{algo}.pkl")

        if best_performance is None:
            # every algorithm was skipped, so there is nothing to derive the dataset information from
            continue

        # only announce the save when something is actually written
        print('saving dataset information')
        dataset_info = best_performance[['dataset_id', 'anomaly_fraction', 'dataset_version']].drop_duplicates().reset_index(drop = True)
        dataset_info.to_csv(run_path / 'dataset_info.csv', index = False)

    

# Statistical size multiple runs
Performs 10 runs for each validation set size, using p = 0.01, 0.05 and 0.10.

In [None]:
def statistical_subset_validation_set_experiment_multiple_runs(data_df,auc_threshold, significance_ps, experiment_name, n_runs = 5, reasonable_threshold = 0.5, dask_client = None, seed =None): 
    """
        Generate the statistical validation sets, store the reference dataframes and
        calculate the tuned performance for every validation set size.

        The reference dataframe of each run is written to processed_path/experiment_name as
        reference_validation_run<run_idx>.csv / .pkl. The validation sets of each size are
        handed to calculate_tuned_performance_multiple_runs with setting name "size=<size>".
    """
    print('generating validation sets')
    reference_dfs, validation_dfs_per_size = generate_validation_sets_statistical_subset_multiple_runs(
        data_df,
        auc_threshold,
        significance_ps,
        n_runs,
        reasonable_threshold = reasonable_threshold,
        seed = seed,
    )

    print('saving reference dfs')
    experiment_dir = processed_path / experiment_name
    experiment_dir.mkdir(parents = True, exist_ok = True)
    for run_idx, reference_df in enumerate(reference_dfs):
        base_name = f'reference_validation_run{run_idx}'
        reference_df.to_csv(experiment_dir / f'{base_name}.csv')
        reference_df.to_pickle(experiment_dir / f'{base_name}.pkl')

    # one tuned-performance calculation per validation set size
    for size, validation_dfs in validation_dfs_per_size.items():
        print(f'calculating tuned performance for size = {size}')
        calculate_tuned_performance_multiple_runs(validation_dfs, experiment_name, f"size={size}", {'size': size}, dask_client)


In [None]:
# Run the statistical validation set experiment (10 runs) with a 40-worker dask client.
with Client(n_workers = 40, local_directory = '/cw/dtailocal/jonass/') as client:
    statistical_subset_validation_set_experiment_multiple_runs(
        data_df,
        auc_thresholds,
        [0.01, 0.05, 0.10],
        experiment_name = 'statistical_validation_set_multiple_10runs',
        n_runs = 10,
        dask_client = client,
        seed = 12341324,
    )


# Absolute size with multiple runs
A bigger validation set is always a superset of the smaller ones. Do 10 runs for sizes 50, 100, 150, 200 and 250, using at most 25% of the labeled data.


In [None]:
def absolute_subset_validation_set_experiment_multiple_runs_upper_limit(data_df,absolute_sizes, experiment_name, n_runs = 5, upper_limit = 0.25, reasonable_threshold = 0.5, dask_client = None, seed =None): 
    """
        Generate the absolute-size validation sets, store the reference dataframes and
        calculate the tuned performance for every validation set size.

        The reference dataframe of each run is written to processed_path/experiment_name as
        reference_validation_run<run_idx>.csv / .pkl. The validation sets of each size are
        handed to calculate_tuned_performance_multiple_runs with setting name "size=<size>".
    """
    print('generating validation sets')
    reference_dfs, validation_dfs_per_size = generate_validation_sets_absolute_subset_multiple_runs(
        data_df,
        absolute_sizes,
        n_runs,
        upper_limit = upper_limit,
        reasonable_threshold = reasonable_threshold,
        seed = seed,
    )

    print('saving reference dfs')
    experiment_dir = processed_path / experiment_name
    experiment_dir.mkdir(parents = True, exist_ok = True)
    for run_idx, reference_df in enumerate(reference_dfs):
        base_name = f'reference_validation_run{run_idx}'
        reference_df.to_csv(experiment_dir / f'{base_name}.csv')
        reference_df.to_pickle(experiment_dir / f'{base_name}.pkl')

    # one tuned-performance calculation per validation set size
    for size, validation_dfs in validation_dfs_per_size.items():
        print(f'calculating tuned performance for size = {size}')
        calculate_tuned_performance_multiple_runs(validation_dfs, experiment_name, f"size={size}", {'size': size}, dask_client)


In [None]:
# Run the absolute-size validation set experiment (10 runs) with a 40-worker dask client.
with Client(n_workers = 40, local_directory = '/cw/dtailocal/jonass/') as client:
    absolute_subset_validation_set_experiment_multiple_runs_upper_limit(
        data_df,
        [250, 200, 150, 100, 50],
        experiment_name = 'absolute_validation_set_multiple_10runs_max_25',
        n_runs = 10,
        dask_client = client,
        seed = 1231234234,
    )
    
