# Run customized algorithms with ADBench
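- Here we provide an example of testing AD (anomaly detection) algorithms on several benchmark datasets; as configured below, 2 algorithms are run on 5 datasets, with DevNet available as an optional third algorithm. Any customized algorithm can be evaluated in ADBench.
- To reproduce the complete experimental results in ADBench, please run the code in the run.py file.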

In [1]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# import the necessary package
from data_generator import DataGenerator
from myutils import Utils

# Shared helper instances reused by the cells below.
# datagenerator: given a dataset name, its generator() call returns the data
# dict holding the X_train / y_train / X_test / y_test entries.
datagenerator = DataGenerator()
# utils: provides metric(), whose result carries the 'aucroc' and 'aucpr' scores.
utils = Utils()

- Algorithms: unsupervised IForest and fully-supervised CatB (semi-supervised DevNet is also supported, but it is commented out of `model_dict` in the cell below)
- 5 datasets: cardio, musk, optdigits, speech and vowels

In [4]:
from baseline.PyOD import PYOD
from baseline.DevNet.run import DevNet
from baseline.Supervised import supervised

# datasets to evaluate, given by dataset name
dataset_list = [
    '6_cardio',
    '25_musk',
    '26_optdigits',
    '37_speech',
    '41_vowels',
]

# model name -> model class (instantiated later, once per dataset).
# DevNet is imported above but left out of this run; add the entry
# 'DevNet': DevNet to evaluate it as well.
model_dict = {
    'IForest': PYOD,
    'CatB': supervised,
}

# result tables, initially empty: one row per dataset, one column per model.
# The generator builds two independent frames, one for each metric.
df_AUCROC, df_AUCPR = (
    pd.DataFrame(data=None, index=dataset_list, columns=model_dict.keys())
    for _ in range(2)
)

In [5]:
# seed for reproducible results
seed = 42

# Evaluate every model in model_dict on every dataset in dataset_list and
# store the AUC-ROC / AUC-PR of each (dataset, model) pair in the result tables.
#
# Arguments of datagenerator.generator() used below:
#   la: ratio of labeled anomalies, from 0.0 to 1.0
#   realistic_synthetic_mode: types of synthetic anomalies, can be local,
#       global, dependency or cluster
#   noise_type: inject data noises for testing model robustness, can be
#       duplicated_anomalies, irrelevant_features or label_contamination
for dataset in dataset_list:
    # import the dataset
    datagenerator.dataset = dataset  # specify the dataset name
    # only 10% labeled anomalies are available; no synthetic anomalies, no noise
    data = datagenerator.generator(la=0.1, realistic_synthetic_mode=None, noise_type=None)

    # model_cls is the model class; clf is the instance built from it, so the
    # loop variable is never rebound from a class to an object
    for name, model_cls in model_dict.items():
        # model initialization
        if name == 'DevNet':
            # DevNet uses early stopping to save the model parameters,
            # hence the extra save_suffix argument
            clf = model_cls(seed=seed, model_name=name, save_suffix='test')
        else:
            clf = model_cls(seed=seed, model_name=name)

        # training, for unsupervised models the y label will be discarded;
        # the return value of fit() is kept as the model used for scoring
        clf = clf.fit(X_train=data['X_train'], y_train=data['y_train'])

        # output predicted anomaly score on testing set
        score = clf.predict_score(data['X_test'])

        # evaluation
        result = utils.metric(y_true=data['y_test'], y_score=score)

        # save results
        df_AUCROC.loc[dataset, name] = result['aucroc']
        df_AUCPR.loc[dataset, name] = result['aucpr']

current noise type: None
{'Samples': 1831, 'Features': 21, 'Anomalies': 176, 'Anomalies Ratio(%)': 9.61}
best param: None
Learning rate set to 0.011451
0:	learn: 0.6660661	total: 144ms	remaining: 2m 23s
1:	learn: 0.6366402	total: 145ms	remaining: 1m 12s
2:	learn: 0.6119575	total: 147ms	remaining: 48.8s
3:	learn: 0.5889774	total: 148ms	remaining: 37s
4:	learn: 0.5662223	total: 150ms	remaining: 29.8s
5:	learn: 0.5448871	total: 152ms	remaining: 25.1s
6:	learn: 0.5248001	total: 153ms	remaining: 21.7s
7:	learn: 0.5033908	total: 155ms	remaining: 19.2s
8:	learn: 0.4837202	total: 156ms	remaining: 17.2s
9:	learn: 0.4663383	total: 158ms	remaining: 15.6s
10:	learn: 0.4496595	total: 159ms	remaining: 14.3s
11:	learn: 0.4299942	total: 161ms	remaining: 13.2s
12:	learn: 0.4130893	total: 162ms	remaining: 12.3s
13:	learn: 0.3979537	total: 164ms	remaining: 11.5s
14:	learn: 0.3835322	total: 165ms	remaining: 10.8s
15:	learn: 0.3691356	total: 167ms	remaining: 10.3s
16:	learn: 0.3561153	total: 168ms	remainin

In [6]:
df_AUCROC

Unnamed: 0,IForest,CatB
6_cardio,0.944193,0.9836
25_musk,1.0,1.0
26_optdigits,0.825365,0.996257
37_speech,0.484273,0.569955
41_vowels,0.780727,0.849131


In [7]:
df_AUCPR

Unnamed: 0,IForest,CatB
6_cardio,0.615718,0.908077
25_musk,1.0,1.0
26_optdigits,0.076759,0.889969
37_speech,0.016013,0.021988
41_vowels,0.389675,0.533553
