In [1]:
import pickle
import os
import itertools

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

In [2]:
def load_pickles(pickles_dir):
    """Yield ``(save_dict, filename)`` for every pickle file in ``pickles_dir``.

    A directory entry is treated as a pickle when its name contains the
    substring ``'pkl'``; every other entry is reported on stdout and skipped.
    Entries are visited in sorted name order: ``os.listdir`` gives no ordering
    guarantee, and callers pair up the results of several folders by position,
    so the sequence has to be deterministic.

    NOTE: ``pickle.load`` can execute arbitrary code while loading. Only point
    this function at directories whose contents are trusted.

    :param pickles_dir: Directory to scan (not recursive).
    :return: Generator of ``(unpickled object, file name)`` tuples; the file
        name is relative to ``pickles_dir``.
    """
    for filename in sorted(os.listdir(pickles_dir)):
        file_path = os.path.join(pickles_dir, filename)
        if 'pkl' not in filename:
            print('IGNORING', file_path)
            continue
        with open(file_path, 'rb') as f:
            save_dict = pickle.load(f)
        yield save_dict, filename

def write_pickle(save_dict, pickle_path):
    """Pickle ``save_dict`` into the file at ``pickle_path``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(pickle_path, mode='wb') as out_file:
        pickler = pickle.Pickler(out_file)
        pickler.dump(save_dict)

def compare(arr1, arr2):
    """Print the multiset difference between two collections.

    ``arr1`` is referred to as "A" and ``arr2`` as "B" in the output. Every
    element of B that has no remaining counterpart in A is reported first, in
    B's iteration order, followed by the elements of A that were not matched
    by any element of B. Duplicates are matched one-for-one.

    The elements are compared as they are: the working copy is a plain list,
    so mixed-type inputs are not coerced to a common dtype before comparison.

    :param arr1: Iterable of reference elements ("A").
    :param arr2: Iterable of elements to check against A ("B").
    :return: ``None``; the differences are only printed.
    """
    # Work on a copy so matched elements can be removed without touching arr1.
    left = list(arr1)
    for elem in arr2:
        if elem not in left:
            print(f'B contains {elem} while A does not')
        else:
            left.remove(elem)
    for elem in left:
        print(f'A contains {elem} while B does not')

def load_and_check(path, merge_datasets=False):
    """Load every pickle in ``path`` and track its datasets and detectors.

    The module-level globals ``datasets`` and ``detectors`` are read and
    updated as a side effect. They are normalised to sets on entry, so a
    caller may initialise them with ``None`` or a list as well as ``set()``.

    * ``merge_datasets=False``: the first loaded file defines ``datasets``
      (if it is still empty); every later file must list exactly the same
      datasets, otherwise the differences are printed via ``compare`` and an
      ``AssertionError`` is raised.
    * ``merge_datasets=True``: datasets not seen before are printed and added
      to ``datasets``.

    Detectors not seen before are always printed and added to ``detectors``.
    Each loaded dict additionally receives a ``'_filename'`` entry holding the
    name of the file it was read from.

    :param path: Directory containing the pickles.
    :param merge_datasets: Whether new dataset names extend the global set
        instead of being rejected.
    :return: List of the loaded dicts, in the order ``load_pickles`` yields them.
    :raises AssertionError: If ``merge_datasets`` is false and a file's
        datasets differ from the ones collected so far.
    """
    print(path)
    global datasets, detectors
    # Copy into sets so the set arithmetic below works for any initial value.
    datasets = set(datasets or ())
    detectors = set(detectors or ())
    save_dicts = []
    for save_dict, filename in load_pickles(path):
        file_datasets = set(save_dict['datasets'])
        file_detectors = set(save_dict['detectors'])

        if not merge_datasets:
            if not datasets:
                datasets = file_datasets
                print('Datasets:', *sorted(datasets), sep='\n')
            elif datasets != file_datasets:
                # Dataset collections need to be equal: show what differs, then fail.
                compare(datasets, file_datasets)
                raise AssertionError(
                    f'Datasets in {os.path.join(path, filename)} differ from the ones loaded so far')
        elif not file_datasets <= datasets:
            print('Add datasets:', *sorted(file_datasets - datasets), sep='\n')
            datasets = datasets | file_datasets

        if not file_detectors <= detectors:
            print('Add detectors: ', file_detectors - detectors)
            detectors = detectors | file_detectors

        # Remember the source file name so merged results can reuse it.
        save_dict['_filename'] = filename
        save_dicts.append(save_dict)
    return save_dicts

In [22]:
# Merge the benchmark results of four result folders for one outlier type.
# `datasets` / `detectors` are the globals that load_and_check() fills.
datasets = set()
detectors = set()

outlier_type = 'shift_1'

path_1 = os.path.join('..', f'reports/experiment_pollution/{outlier_type}/rest/0.01, 0.5')
save_dicts_1 = load_and_check(path_1)

path_2 = os.path.join('..', f'reports/experiment_pollution/{outlier_type}/LSTM-AD/0.01, 0.5')
save_dicts_2 = load_and_check(path_2, True)

path_3 = os.path.join('..', f'reports/experiment_pollution/{outlier_type}/LSTM-ED/0.01, 0.5')
save_dicts_3 = load_and_check(path_3, True)

path_4 = os.path.join('..', f'reports/experiment_pollution/{outlier_type}/alle_algorithmen_0.05, 0.1, 0.2')
save_dicts_4 = load_and_check(path_4, True)

# --- Merge results --- #

path = os.path.join('..', 'reports', 'experiment_pollution', outlier_type, 'evaluators')
os.makedirs(path, exist_ok=True)

# DAGMM-NN is excluded from the merged output. Drop it from the detector set
# once, up front; its result rows are filtered out of every merged frame below.
detectors.discard('DAGMM-NN')

# zip() pairs the folders by position and stops at the shortest list, so a
# folder with missing pickles would otherwise go unnoticed.
folder_sizes = [len(dicts) for dicts in (save_dicts_1, save_dicts_2, save_dicts_3, save_dicts_4)]
if len(set(folder_sizes)) != 1:
    print('WARNING: folders contain different numbers of pickles:', folder_sizes)

for dict1, dict2, dict3, dict4 in zip(save_dicts_1, save_dicts_2, save_dicts_3, save_dicts_4):
    # We don't need the results values so drop them
    dict1['results'] = None
    dict1['seed'] = None

    # Store copies so the merged dicts do not alias the notebook globals
    # (or each other).
    dict1['datasets'] = set(datasets)
    dict1['detectors'] = set(detectors)

    # dict1: rest/0.01, 0.5 - dict2: LSTM-AD/0.01, 0.5 - dict3: LSTM-ED/0.01, 0.5
    # dict4: alle_algorithmen_0.05, 0.1, 0.2
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    merged_results = pd.concat(
        [
            dict1['benchmark_results'],
            dict2['benchmark_results'],
            dict3['benchmark_results'],
            dict4['benchmark_results'],
        ],
        ignore_index=True,
    )
    dict1['benchmark_results'] = merged_results[merged_results.algorithm != 'DAGMM-NN']

    file_path = os.path.join(path, dict1['_filename'])
    dict1['_filename'] = None

    # Sanity check: For each ds and det there should be one entry
    print('benchmarks shape:', dict1['benchmark_results'].shape)
    for det, ds in itertools.product(detectors, datasets):
        filtered = dict1['benchmark_results']
        filtered = filtered[filtered.dataset == ds]
        filtered = filtered[filtered.algorithm == det]
        assert len(filtered) == 1, f'Length of results is {len(filtered)} for {det} and {ds}'

    # Dry run: writing is disabled; uncomment to persist the merged pickle.
#     write_pickle(dict1, file_path)

..\reports/experiment_pollution/variance_1/rest/0.01, 0.5
Datasets:
Syn Variance Outliers (pol=0.01, anom=0.2)
Syn Variance Outliers (pol=0.5, anom=0.2)
Add detectors:  {'DAGMM-NW', 'AutoEncoder', 'DAGMM-LW', 'Recurrent EBM', 'Donut'}
..\reports/experiment_pollution/variance_1/alle_algorithmen_0.05, 0.1, 0.2
Add datasets:
Syn Variance Outliers (pol=0.05, anom=0.2)
Syn Variance Outliers (pol=0.1, anom=0.2)
Syn Variance Outliers (pol=0.2, anom=0.2)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)
benchmarks shape: (25, 8)


In [15]:
# Display the benchmark table of `dict4`. The name is not defined in this cell:
# it is the loop variable left bound by a previously executed merge loop.
dict4['benchmark_results']

Unnamed: 0,F0.1-score,F1-score,accuracy,algorithm,auroc,dataset,precision,recall
0,0.161342,0.275862,0.16,AutoEncoder,0.453915,"Syn Variance Outliers (pol=0.05, anom=0.2)",0.16,1.0
1,0.383126,0.483461,0.774444,DAGMM-NW,0.765542,"Syn Variance Outliers (pol=0.05, anom=0.2)",0.381526,0.659722
2,0.187454,0.308422,0.352222,DAGMM-LW,0.494553,"Syn Variance Outliers (pol=0.05, anom=0.2)",0.18598,0.902778
3,0.161342,0.275862,0.16,Recurrent EBM,0.5202,"Syn Variance Outliers (pol=0.05, anom=0.2)",0.16,1.0
4,0.182708,0.305857,0.288889,Donut,0.550825,"Syn Variance Outliers (pol=0.05, anom=0.2)",0.181234,0.979167
5,0.352968,0.405882,0.775556,AutoEncoder,0.670745,"Syn Variance Outliers (pol=0.1, anom=0.2)",0.352041,0.479167
6,0.275688,0.356098,0.706667,DAGMM-NW,0.640506,"Syn Variance Outliers (pol=0.1, anom=0.2)",0.274436,0.506944
7,0.283377,0.369305,0.707778,DAGMM-LW,0.661284,"Syn Variance Outliers (pol=0.1, anom=0.2)",0.282051,0.534722
8,0.394943,0.405405,0.804444,Recurrent EBM,0.670736,"Syn Variance Outliers (pol=0.1, anom=0.2)",0.394737,0.416667
9,0.61921,0.553846,0.871111,Donut,0.762125,"Syn Variance Outliers (pol=0.1, anom=0.2)",0.62069,0.5


### Merge the anom=0.2 results with the other anomaly percentages

In [79]:
# Merge, per pickle file, the results of the '0.2' folder with those of the
# 'other' folder for one algorithm group and write the merged pickle.
# `datasets` / `detectors` are the globals that load_and_check() fills.
datasets = set()
detectors = set()

algorithm = 'lstmed'  # 'lstmad', 'rest'

path_1 = os.path.join('..', f'reports/experiment_pollution/extreme_1/{algorithm}/0.2')
save_dicts_1 = load_and_check(path_1)

path_2 = os.path.join('..', f'reports/experiment_pollution/extreme_1/{algorithm}/other')
save_dicts_2 = load_and_check(path_2, True)

# --- Merge results --- #

path = os.path.join('..', 'reports', 'experiment_pollution', 'extreme_1', algorithm)
os.makedirs(path, exist_ok=True)

# zip() pairs the folders by position and stops at the shortest list, so a
# folder with missing pickles would otherwise go unnoticed.
if len(save_dicts_1) != len(save_dicts_2):
    print('WARNING: folders contain different numbers of pickles:',
          [len(save_dicts_1), len(save_dicts_2)])

for dict1, dict2 in zip(save_dicts_1, save_dicts_2):
    # We don't need the results values so drop them
    dict1['results'] = None
    dict1['seed'] = None

    # Add other levels of anom
    dict1['datasets'] += dict2['datasets']
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    dict1['benchmark_results'] = pd.concat(
        [dict1['benchmark_results'], dict2['benchmark_results']],
        ignore_index=True,
    )

    file_path = os.path.join(path, dict1['_filename'])
    dict1['_filename'] = None

    # Sanity check: For each ds and det there should be one entry
    print('benchmarks shape:', dict1['benchmark_results'].shape)
    for det, ds in itertools.product(dict1['detectors'], dict1['datasets']):
        filtered = dict1['benchmark_results']
        filtered = filtered[filtered.dataset == ds]
        filtered = filtered[filtered.algorithm == det]
        assert len(filtered) == 1, f'Length of results is {len(filtered)} for {det} and {ds}'

    write_pickle(dict1, file_path)

..\reports/experiment_pollution/extreme_1/lstmed_old_ds/0.2
Datasets:
Syn Extreme Outliers (pol=0.0, anom=0.2)
Syn Extreme Outliers (pol=0.25, anom=0.2)
Syn Extreme Outliers (pol=0.5, anom=0.2)
Syn Extreme Outliers (pol=0.75, anom=0.2)
Syn Extreme Outliers (pol=1.0, anom=0.2)
Add detectors:  {'LSTMED'}
..\reports/experiment_pollution/extreme_1/lstmed_old_ds/other
Add datasets:
Syn Extreme Outliers (pol=0.0, anom=0.05)
Syn Extreme Outliers (pol=0.0, anom=0.1)
Syn Extreme Outliers (pol=0.0, anom=0.4)
Syn Extreme Outliers (pol=0.25, anom=0.05)
Syn Extreme Outliers (pol=0.25, anom=0.1)
Syn Extreme Outliers (pol=0.25, anom=0.4)
Syn Extreme Outliers (pol=0.5, anom=0.05)
Syn Extreme Outliers (pol=0.5, anom=0.1)
Syn Extreme Outliers (pol=0.5, anom=0.4)
Syn Extreme Outliers (pol=0.75, anom=0.05)
Syn Extreme Outliers (pol=0.75, anom=0.1)
Syn Extreme Outliers (pol=0.75, anom=0.4)
Syn Extreme Outliers (pol=1.0, anom=0.05)
Syn Extreme Outliers (pol=1.0, anom=0.1)
Syn Extreme Outliers (pol=1.0, anom

### Merge three folders, replace LSTMED

In [None]:
# Merge three result folders for 'trend_1': drop the old LSTMED rows from the
# first folder and add the benchmark rows of the two additional folders.
# load_and_check() does set arithmetic on these globals, so start from sets.
datasets = set()
detectors = set()

path_1 = os.path.join('..', 'reports', 'experiment_pollution', 'trend_1', 'evaluators_old')
save_dicts_1 = load_and_check(path_1)

path_2 = os.path.join('..', 'reports', 'experiment_pollution', 'trend_1', 'additional_evaluators_lstmad')
save_dicts_2 = load_and_check(path_2)

# TODO(review): path_3 points at the same folder as path_2, so its rows are
# appended twice. Two detectors ('AutoEncoder' and 'LSTM-AD') are added below,
# which suggests one of the two paths should name a different folder - confirm
# the intended folder name. The duplicate check before write_pickle() fails
# until this is resolved.
path_3 = os.path.join('..', 'reports', 'experiment_pollution', 'trend_1', 'additional_evaluators_lstmad')
save_dicts_3 = load_and_check(path_3)


# --- Merge results --- #

path = os.path.join('..', 'reports', 'experiment_pollution', 'trend_1', 'evaluators')
os.makedirs(path, exist_ok=True)
for dict1, dict2, dict3 in zip(save_dicts_1, save_dicts_2, save_dicts_3):
    # We don't need the results values so drop them
    dict1['results'] = None
    dict1['seed'] = None

    # Drop results of old algorithm
    kept_results = dict1['benchmark_results'][dict1['benchmark_results'].algorithm != 'LSTMED']

    dict1['detectors'].append('AutoEncoder')
    dict1['detectors'].append('LSTM-AD')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    dict1['benchmark_results'] = pd.concat(
        [kept_results, dict2['benchmark_results'], dict3['benchmark_results']],
        ignore_index=True,
    )

    file_path = os.path.join(path, dict1['_filename'])
    dict1['_filename'] = None

    # Sanity check: no (dataset, algorithm) pair may occur more than once,
    # otherwise the same results were merged twice.
    duplicated_rows = dict1['benchmark_results'].duplicated(subset=['dataset', 'algorithm'])
    assert not duplicated_rows.any(), \
        f'{int(duplicated_rows.sum())} duplicated (dataset, algorithm) rows for {file_path}'

    write_pickle(dict1, file_path)