In [None]:
import sys
sys.path.append('../..')

%load_ext autoreload
%autoreload 2

In [None]:
from tqdm import tqdm

# import function to load datasets
from oab.data.load_dataset import load_dataset
# import objects for evaluation
from oab.evaluation import EvaluationObject, ComparisonObject

# import anomaly detection algorithms from pyod
from pyod.models.knn import KNN # fit and decision_scores_
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.abod import ABOD
from pyod.models.auto_encoder import AutoEncoder
from ae_lof import AELOF

In [None]:
# Load every benchmark dataset once, in a fixed order. `datasets` is the
# ordered list iterated by the benchmark loop.
datasets = [
    load_dataset(dataset_key)
    for dataset_key in (
        'spambase',
        'wilt',
        'NASA_ground_data',
        'annthyroid',
        'page-blocks',
        'ionosphere',
        'boston',
    )
]

# Individual handles, unpacked in the same order as the keys above.
spambase, wilt, nasa, annthyroid, pageblocks, ionosphere, boston = datasets

In [None]:
# Single mapping from display name to detector class. Insertion order fixes
# the order in which the algorithms are benchmarked.
names_to_algorithms = {
    'kNN': KNN,
    'LOF': LOF,
    'IForest': IForest,
    'ABOD': ABOD,
    'AE': AutoEncoder,
    'AELOF': AELOF,
}

# Parallel list views of the mapping (names and classes, same order).
algorithm_names = list(names_to_algorithms.keys())
algorithms = list(names_to_algorithms.values())

# Display name -> factory function; populated by the registration statements
# that follow each initializer definition.
names_to_init = {}

In [None]:
# kNN
knn_factor = 0.05
knn_minimum = 10
def kNN_initialize(n: int):
    """Create a new kNN detector whose neighbourhood size scales with ``n``.

    Args:
        n: sample size used to scale the neighbourhood (the benchmark loop
            passes the dataset's sampling size).

    Returns:
        A ``KNN`` instance with
        ``n_neighbors = int(max(knn_factor * n, knn_minimum))``.
    """
    scaled_neighbors = knn_factor * n
    # Never drop below the configured floor.
    neighbor_count = max(scaled_neighbors, knn_minimum)
    return KNN(n_neighbors=int(neighbor_count))

# Register the kNN factory under the key the benchmark loop uses to look it up.
names_to_init['kNN'] = kNN_initialize


# LOF
lof_factor = 0.1
lof_minimum = 10
def LOF_initalize(n: int):
    """Create a new LOF detector whose neighbourhood size scales with ``n``.

    Args:
        n: sample size used to scale the neighbourhood (the benchmark loop
            passes the dataset's sampling size).

    Returns:
        A ``LOF`` instance with
        ``n_neighbors = int(max(lof_factor * n, lof_minimum))``.
    """
    k = int(max(lof_factor * n, lof_minimum))
    # Build the LOF detector (not KNN) so the "LOF" entry of the benchmark
    # really evaluates Local Outlier Factor.
    return LOF(n_neighbors=k)

# Register the LOF factory under the key the benchmark loop uses to look it up
# (the function name is spelled "LOF_initalize" where it is defined).
names_to_init['LOF'] = LOF_initalize


# ABOD
abod_factor = 0.01
abod_minimum = 10
def ABOD_initialize(n: int):
    """Create a new ABOD detector whose neighbourhood size scales with ``n``.

    Args:
        n: sample size used to scale the neighbourhood (the benchmark loop
            passes the dataset's sampling size).

    Returns:
        An ``ABOD`` instance with
        ``n_neighbors = int(max(abod_factor * n, abod_minimum))``.
    """
    scaled_neighbors = abod_factor * n
    # Same selection as max(scaled_neighbors, abod_minimum): the floor wins
    # only when it is strictly larger.
    chosen = abod_minimum if abod_minimum > scaled_neighbors else scaled_neighbors
    return ABOD(n_neighbors=int(chosen))

# Register the ABOD factory under the key the benchmark loop uses to look it up.
names_to_init['ABOD'] = ABOD_initialize


# IForest
def IForest_initialize(*args, **kwargs):
    """Create a new Isolation Forest detector with a fixed random seed.

    All positional and keyword arguments are accepted and ignored, so the
    factory can be called with the same arguments as the size-dependent ones.

    Returns:
        An ``IForest`` instance constructed with ``random_state=42``.
    """
    detector = IForest(random_state=42)
    return detector

# Register the IForest factory under the key the benchmark loop uses to look it up.
names_to_init['IForest'] = IForest_initialize


# AE
import tensorflow as tf
import numpy as np
def AE_initialize(*args, **kwargs):
    """Create a new AutoEncoder detector after re-seeding TensorFlow and NumPy.

    All positional and keyword arguments are accepted and ignored, so the
    factory can be called with the same arguments as the size-dependent ones.

    Returns:
        An ``AutoEncoder`` built with ``verbose=0``,
        ``hidden_neurons=[6, 3, 3, 6]`` and ``random_state=42``.
    """
    seed = 42
    # Reset both global RNGs (TensorFlow first, then NumPy) on every call,
    # before the model object is constructed.
    tf.random.set_seed(seed)
    np.random.seed(seed)
    autoencoder_settings = dict(verbose=0, hidden_neurons=[6, 3, 3, 6], random_state=seed)
    return AutoEncoder(**autoencoder_settings)

# Register the AutoEncoder factory under the key the benchmark loop uses to look it up.
names_to_init['AE'] = AE_initialize


# AE LOF
# The LOF scaling constants are bound again here with the values used for
# the neighbourhood size of the LOF stage.
lof_factor = 0.1
lof_minimum = 10
# Keyword arguments forwarded to AELOF as its AE_parameters.
AE_params = {
    'verbose': 0,
    'hidden_neurons': [6, 3, 3, 6],
    'random_state': 42,
}
def AELOF_initialize(n: int):
    """Create a new AELOF detector whose LOF neighbourhood scales with ``n``.

    Args:
        n: sample size used to scale the neighbourhood (the benchmark loop
            passes the dataset's sampling size).

    Returns:
        An ``AELOF`` instance receiving the shared ``AE_params`` dict,
        ``LOF_parameters={'n_neighbors': int(max(lof_factor * n, lof_minimum))}``
        and ``random_state=42``.
    """
    neighbor_count = int(max(lof_factor * n, lof_minimum))
    lof_settings = {'n_neighbors': neighbor_count}
    return AELOF(AE_parameters=AE_params, LOF_parameters=lof_settings, random_state=42)

# Register the AELOF factory under the key the benchmark loop uses to look it up.
names_to_init['AELOF'] = AELOF_initialize

In [None]:
# Comparison object that collects one evaluation per (dataset, algorithm) pair.
co = ComparisonObject()

# Benchmark every algorithm on every dataset.
for dataset in datasets:
    sampling_parameters = dataset.get_sampling_parameters(contamination_rate=0.05, downscaling_factor=0.9)
    sampling_size = sampling_parameters['n']
    for algorithm_name in algorithm_names:
        print(f"-- Dataset name {dataset.classification_dataset.name}, size {sampling_size}, algorithm {algorithm_name}")

        # Accumulates ground truths and prediction scores for each sample drawn below.
        eval_obj = EvaluationObject(algorithm_name=algorithm_name)
        initializer = names_to_init[algorithm_name]

        samples = dataset.sample_from_yaml(type='unsupervised_multiple_benchmark')
        for sample_index, ((x, y), sample_config) in enumerate(samples):
            # One dot per sample as a progress marker; the sample shape is
            # echoed once, right after the first dot.
            print('.', end='')
            if sample_index == 0:
                print(f"{x.shape}", end='')
            # A fresh detector per sample, parameterised by the nominal
            # sampling size (not by the shape of this particular sample).
            algo = initializer(sampling_size)
            algo.fit(x)
            # Store the fitted scores alongside the ground truth.
            eval_obj.add(ground_truth=y, prediction=algo.decision_scores_, description=sample_config)

        # Evaluate the requested metrics over everything added above
        # (print=False presumably suppresses per-evaluation output).
        eval_desc = eval_obj.evaluate(
            print=False,
            metrics=['roc_auc', 'adjusted_average_precision', 'precision_recall_auc'],
        )
        # Hand the resulting evaluation to the comparison object.
        co.add_evaluation(eval_desc)

In [None]:
# Print the benchmark results collected in `co` in an easily readable format,
# using print_results' default arguments.
co.print_results()

In [None]:
# NOTE(review): this rebinds `datasets`, the list the benchmark loop iterates
# over, to the return value of a private ComparisonObject method. Re-running
# the benchmark cell after this one would iterate over that value instead of
# the loaded dataset objects. Consider a separate name; confirm nothing else
# relies on the rebinding.
datasets = co._get_datasets()

In [None]:
# Print results in an easily readable format, this time including standard
# deviations.
# Passing None presumably clears any dataset selection so that all evaluated
# datasets are reported; confirm against ComparisonObject.choose_datasets.
co.choose_datasets(None)
co.print_results(include_stdevs=True)

In [None]:
# Print the results held by `co` in LaTeX format
# (note: print_latex also has the parameter include_stdevs).
co.print_latex()