# Benchmark of various outlier detection model thresholders

### The models are evaluated by ROC, Precision @ n and execution time on 17 benchmark datasets. All datasets are split (60% for training and 40% for testing). 

The thresholders covered in this example include:

1. **IQR: Inter-Quartile Region** 
2. **MAD: Median Absolute Deviation**
3. **FWFM: Full Width at Full Minimum**
4. **YJ: Yeo-Johnson Transformation**
5. **KMEANS: Kmeans Clustering**
6. **ZSCORE: Z Score**
7. **AUCP: Area Under the Curve Percentage**
8. **QMCD: Quasi-Monte Carlo Discreperancy**
9. **FGD: Fixed Gradient Descent**
10. **DSN: Distance Shift from Normal**
11. **CLF: Trained Classifier**
12. **FILTER: Filtering Based**
13. **WIND: Topological Winding Number**
14. **EB: Elliptical Boundary**
15. **REGR: Regression Intercept**
16. **BOOT: Bootstrap Method**
17. **MCST: Monte Carlo Statistical Tests**
18. **Histogram Based Methods**
19. **Mollifier**
20. **Chauvenet's Criterion**
21. **Generalized Extreme Studentized Deviate**
22. **Modified Thompson Tau Test**
23. **Mean Shift Clustering**
24. **KARCH: Karcher Mean**
25. **OCSVM: One-Class SVM**

In [1]:
from __future__ import division
from __future__ import print_function

import os
import sys
from time import time

# temporary solution for relative imports in case pyod is not installed
# if pythresh is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

import numpy as np
from numpy import percentile
import matplotlib.pyplot as plt
import matplotlib.font_manager
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

from pyod.models.knn import KNN
from pyod.models.iforest import IForest

from pythresh.thresholds.iqr import IQR
from pythresh.thresholds.mad import MAD
from pythresh.thresholds.fwfm import FWFM
from pythresh.thresholds.yj import YJ
from pythresh.thresholds.kmeans import KMEANS
from pythresh.thresholds.zscore import ZSCORE
from pythresh.thresholds.aucp import AUCP
from pythresh.thresholds.qmcd import QMCD
from pythresh.thresholds.fgd import FGD
from pythresh.thresholds.dsn import DSN
from pythresh.thresholds.clf import CLF
from pythresh.thresholds.filter import FILTER
from pythresh.thresholds.wind import WIND
from pythresh.thresholds.eb import EB
from pythresh.thresholds.regr import REGR
from pythresh.thresholds.boot import BOOT
from pythresh.thresholds.mcst import MCST
from pythresh.thresholds.hist import HIST
from pythresh.thresholds.moll import MOLL
from pythresh.thresholds.chau import CHAU
from pythresh.thresholds.gesd import GESD
from pythresh.thresholds.mtt import MTT
from pythresh.thresholds.shift import SHIFT
from pythresh.thresholds.karch import KARCH
from pythresh.thresholds.ocsvm import OCSVM

from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

INFO: Using numpy backend


In [2]:
# Define data file and read X and y
mat_file_list = ['arrhythmia.mat',
                 'cardio.mat',
                 'glass.mat',
                 'ionosphere.mat',
                 'letter.mat',
                 'lympho.mat',
                 'mnist.mat',
                 'musk.mat',
                 'optdigits.mat',
                 'pendigits.mat',
                 'pima.mat',
                 'satellite.mat',
                 'satimage-2.mat',
                 'vertebral.mat',
                 'vowels.mat',
                 'wbc.mat']

# Define nine outlier detection tools to be compared
random_state = np.random.RandomState(42)

df_columns = ['Data', '#Samples', '# Dimensions', 'Outlier Perc', 'IQR', 'MAD', 'FWFM', 
              'YJ', 'KMEANS', 'ZSCORE', 'AUCP', 'QMCD', 'FGD', 'DSN', 'CLF', 'FILTER', 'WIND', 
              'EB', 'REGR', 'BOOT', 'MCST', 'HIST', 'MOLL', 'CHAU', 'GESD', 'MTT', 'SHIFT', 
              'KARCH', 'OCSVM']

roc_df = pd.DataFrame(columns=df_columns)
prn_df = pd.DataFrame(columns=df_columns)
time_df = pd.DataFrame(columns=df_columns)

clf = IForest()


for mat_file in mat_file_list:
    print("\n... Processing", mat_file, '...')
    mat = loadmat(os.path.join('data', mat_file))

    X = mat['X']
    y = mat['y'].ravel()
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                        random_state=random_state)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    thresholders = {
        'Inter-Quartile Region (IQR)':IQR(),
        'Median Absolute Deviation (MAD)':MAD(),
        'Full Width at Full Minimum (FWFM)':FWFM(),
        'Yeo-Johnson Transformation (YJ)': YJ(),
        'Kmeans Clustering (KMEANS)': KMEANS(),
        'Z Score (ZSCORE)': ZSCORE(),
        'AUC Percentage (AUCP)': AUCP(),
        'Quasi-Monte Carlo Discreperancy (QMCD)': QMCD(),
        'Fixed Gradient Descent (FGD)': FGD(),
        'Distance Shift from Normal (DSN)': DSN(),
        'Trained Classifier (CLF)': CLF(),
        'Filtering Based (FILTER)': FILTER(),
        'Topological Winding Number (WIND)': WIND(),
        'Elliptical Boundary (EB)': EB(),
        'Regression Intercept (REGR)': REGR(),
        'Bootstrap Method (BOOT)': BOOT(),
        'Monte Carlo Statistical Tests (MCST)': MCST(),
        'Histogram Based Methods (HIST)': HIST(),
        'Mollifier (MOLL)': MOLL(),
        "Chauvenet's Criterion (CHAU)": CHAU(),
        'Generalized Extreme Studentized Deviate (GESD)': GESD(),
        'Modified Thompson Tau Test (MTT)': MTT(),
        'Mean Shift Clustering (SHIFT)': SHIFT(),
        'Karcher Mean (KARCH)': KARCH(),
        'One-Class SVM (OCSVM)': OCSVM()
    }
    
    clf.fit(X_train_norm)
    scores = clf.decision_scores_
    
    for thres_name, thres in thresholders.items():
        t0 = time()
        pred = thres.eval(scores)
        contam = np.sum(pred)/len(pred)
        
        if contam<=0: contam=1e-3
        if contam>0.5: contam=0.5
        
        clf = IForest(contamination=contam)
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('{thres_name} ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(
        thres_name=thres_name, roc=roc, prn=prn, duration=duration))

        roc_list.append(roc)
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)

    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


... Processing arrhythmia.mat ...
Inter-Quartile Region (IQR) ROC:0.8333, precision @ rank n:0.5714, execution time: 0.7423s
Median Absolute Deviation (MAD) ROC:0.8469, precision @ rank n:0.5714, execution time: 0.7995s
Full Width at Full Minimum (FWFM) ROC:0.8487, precision @ rank n:0.5714, execution time: 0.7992s
Yeo-Johnson Transformation (YJ) ROC:0.8305, precision @ rank n:0.6071, execution time: 1.2084s
Kmeans Clustering (KMEANS) ROC:0.8329, precision @ rank n:0.5357, execution time: 0.7717s
Z Score (ZSCORE) ROC:0.8347, precision @ rank n:0.5357, execution time: 0.7189s
AUC Percentage (AUCP) ROC:0.8266, precision @ rank n:0.5714, execution time: 0.7535s
Quasi-Monte Carlo Discreperancy (QMCD) ROC:0.8303, precision @ rank n:0.5357, execution time: 0.7171s
Fixed Gradient Descent (FGD) ROC:0.8333, precision @ rank n:0.5357, execution time: 0.7301s
Distance Shift from Normal (DSN) ROC:0.8368, precision @ rank n:0.5357, execution time: 0.7661s
Trained Classifier (CLF) ROC:0.8487, preci

Bootstrap Method (BOOT) ROC:0.7934, precision @ rank n:0.5652, execution time: 0.6227s
Monte Carlo Statistical Tests (MCST) ROC:0.7822, precision @ rank n:0.5435, execution time: 0.5628s
Histogram Based Methods (HIST) ROC:0.7771, precision @ rank n:0.5, execution time: 0.6213s
Mollifier (MOLL) ROC:0.7741, precision @ rank n:0.5435, execution time: 0.5518s
Chauvenet's Criterion (CHAU) ROC:0.7801, precision @ rank n:0.5217, execution time: 0.5685s
Generalized Extreme Studentized Deviate (GESD) ROC:0.7867, precision @ rank n:0.5435, execution time: 0.6615s
Modified Thompson Tau Test (MTT) ROC:0.776, precision @ rank n:0.5652, execution time: 0.6089s
Mean Shift Clustering (SHIFT) ROC:0.7817, precision @ rank n:0.5435, execution time: 1.7159s
Karcher Mean (KARCH) ROC:0.7952, precision @ rank n:0.5217, execution time: 0.611s
One-Class SVM (OCSVM) ROC:0.7799, precision @ rank n:0.5435, execution time: 0.7839s

... Processing letter.mat ...
Inter-Quartile Region (IQR) ROC:0.6433, precision @ r

Z Score (ZSCORE) ROC:0.9999, precision @ rank n:0.9512, execution time: 1.7554s
AUC Percentage (AUCP) ROC:1.0, precision @ rank n:1.0, execution time: 2.0966s
Quasi-Monte Carlo Discreperancy (QMCD) ROC:1.0, precision @ rank n:1.0, execution time: 1.5495s
Fixed Gradient Descent (FGD) ROC:0.9996, precision @ rank n:0.9512, execution time: 1.7566s
Distance Shift from Normal (DSN) ROC:1.0, precision @ rank n:1.0, execution time: 2.0477s
Trained Classifier (CLF) ROC:0.9986, precision @ rank n:0.878, execution time: 1.5277s
Filtering Based (FILTER) ROC:1.0, precision @ rank n:1.0, execution time: 1.533s
Topological Winding Number (WIND) ROC:0.9999, precision @ rank n:0.95, execution time: 2.0397s
Elliptical Boundary (EB) ROC:1.0, precision @ rank n:1.0, execution time: 2.374s
Regression Intercept (REGR) ROC:0.9983, precision @ rank n:0.878, execution time: 1.9295s
Bootstrap Method (BOOT) ROC:1.0, precision @ rank n:0.9756, execution time: 2.0783s
Monte Carlo Statistical Tests (MCST) ROC:0.99

Generalized Extreme Studentized Deviate (GESD) ROC:0.6903, precision @ rank n:0.5752, execution time: 0.7132s
Modified Thompson Tau Test (MTT) ROC:0.7204, precision @ rank n:0.5841, execution time: 0.6265s
Mean Shift Clustering (SHIFT) ROC:0.7088, precision @ rank n:0.5752, execution time: 3.0263s
Karcher Mean (KARCH) ROC:0.6994, precision @ rank n:0.5664, execution time: 0.5759s
One-Class SVM (OCSVM) ROC:0.7177, precision @ rank n:0.5929, execution time: 0.7651s

... Processing satellite.mat ...
Inter-Quartile Region (IQR) ROC:0.6934, precision @ rank n:0.569, execution time: 1.3523s
Median Absolute Deviation (MAD) ROC:0.6832, precision @ rank n:0.5653, execution time: 1.3336s
Full Width at Full Minimum (FWFM) ROC:0.6887, precision @ rank n:0.5764, execution time: 2.6778s
Yeo-Johnson Transformation (YJ) ROC:0.7296, precision @ rank n:0.5887, execution time: 3.9074s
Kmeans Clustering (KMEANS) ROC:0.6671, precision @ rank n:0.5394, execution time: 1.4475s
Z Score (ZSCORE) ROC:0.7068, pr

Fixed Gradient Descent (FGD) ROC:0.795, precision @ rank n:0.3158, execution time: 0.7986s
Distance Shift from Normal (DSN) ROC:0.8146, precision @ rank n:0.2632, execution time: 0.7879s
Trained Classifier (CLF) ROC:0.8097, precision @ rank n:0.2632, execution time: 0.6653s
Filtering Based (FILTER) ROC:0.7777, precision @ rank n:0.2105, execution time: 0.6584s
Topological Winding Number (WIND) ROC:0.7937, precision @ rank n:0.2105, execution time: 0.8437s
Elliptical Boundary (EB) ROC:0.7755, precision @ rank n:0.1579, execution time: 1.2068s
Regression Intercept (REGR) ROC:0.7997, precision @ rank n:0.2105, execution time: 0.7954s
Bootstrap Method (BOOT) ROC:0.7507, precision @ rank n:0.2632, execution time: 0.9338s
Monte Carlo Statistical Tests (MCST) ROC:0.7936, precision @ rank n:0.3158, execution time: 0.7406s
Histogram Based Methods (HIST) ROC:0.7728, precision @ rank n:0.1053, execution time: 0.6384s
Mollifier (MOLL) ROC:0.851, precision @ rank n:0.2632, execution time: 0.6404s
C

In [5]:
print('Time complexity')
time_df

Time complexity


Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,IQR,MAD,FWFM,YJ,KMEANS,ZSCORE,...,BOOT,MCST,HIST,MOLL,CHAU,GESD,MTT,SHIFT,KARCH,OCSVM
0,arrhythmia,452,274,14.6018,0.7423,0.7995,0.7992,1.2084,0.7717,0.7189,...,0.8594,0.7348,0.7204,0.722,0.8172,0.8855,0.8399,2.0241,0.7909,0.9612
0,cardio,1831,21,9.6122,0.7418,0.7338,0.8712,1.3698,0.7303,0.7034,...,1.3263,0.8967,0.8392,0.7439,0.7245,1.102,0.7259,8.2771,0.823,1.2269
0,glass,214,9,4.2056,0.4978,0.506,0.5095,0.7058,0.5182,0.508,...,0.615,0.593,0.602,0.5732,0.5801,0.5694,0.5507,0.8727,0.5289,0.6803
0,ionosphere,351,33,35.8974,0.63,0.6036,0.6169,0.9517,0.5986,0.6212,...,0.6227,0.5628,0.6213,0.5518,0.5685,0.6615,0.6089,1.7159,0.611,0.7839
0,letter,1600,32,6.25,0.7629,0.8354,0.8888,1.7041,0.7958,0.7282,...,1.0427,0.88,0.7848,0.7383,0.721,1.0521,0.8611,14.5172,0.7885,1.1788
0,lympho,148,18,4.0541,0.6075,0.6194,0.5685,0.7749,0.5584,0.5559,...,0.5908,0.5045,0.4987,0.5004,0.4996,0.529,0.4997,0.9514,0.591,0.7357
0,mnist,7603,100,9.2069,2.4306,2.2502,4.4227,5.9778,2.3126,2.1126,...,6.3092,4.7234,2.1649,2.163,2.2833,3.718,2.1614,53.983,2.9344,5.1438
0,musk,3062,166,3.1679,1.5712,1.5367,1.8848,2.8874,1.5401,1.7554,...,2.0783,1.9158,1.6337,1.5592,1.5297,2.1823,1.5483,21.8249,1.9102,3.0653
0,optdigits,5216,64,2.8758,1.7285,1.6396,3.0692,3.7211,1.4964,1.4948,...,2.9014,2.5973,1.5044,1.5132,1.4874,2.4454,1.4873,43.4561,2.5662,4.1778
0,pendigits,6870,16,2.2707,1.3143,1.2807,2.9514,4.0631,1.175,1.2842,...,4.3678,3.7674,1.2489,1.2846,1.1911,2.5494,1.2656,45.3255,1.8557,3.6465


In [6]:
print('ROC Performance')
roc_df

ROC Performance


Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,IQR,MAD,FWFM,YJ,KMEANS,ZSCORE,...,BOOT,MCST,HIST,MOLL,CHAU,GESD,MTT,SHIFT,KARCH,OCSVM
0,arrhythmia,452,274,14.6018,0.8333,0.8469,0.8487,0.8305,0.8329,0.8347,...,0.8291,0.8578,0.8581,0.834,0.8163,0.8214,0.8392,0.828,0.8431,0.8256
0,cardio,1831,21,9.6122,0.9135,0.9222,0.901,0.9277,0.9277,0.9013,...,0.8991,0.9256,0.8954,0.9172,0.9122,0.932,0.9537,0.9486,0.919,0.9116
0,glass,214,9,4.2056,0.6321,0.6222,0.6519,0.637,0.6617,0.6667,...,0.6741,0.6519,0.6395,0.6494,0.6247,0.6519,0.684,0.6296,0.6765,0.6346
0,ionosphere,351,33,35.8974,0.7892,0.7908,0.7943,0.7888,0.7849,0.7847,...,0.7934,0.7822,0.7771,0.7741,0.7801,0.7867,0.776,0.7817,0.7952,0.7799
0,letter,1600,32,6.25,0.6433,0.5908,0.5793,0.5953,0.6159,0.6186,...,0.6666,0.6149,0.6032,0.6453,0.6151,0.6432,0.6105,0.5947,0.6336,0.6015
0,lympho,148,18,4.0541,0.9942,0.9942,1.0,1.0,1.0,0.9942,...,1.0,0.9942,0.9942,1.0,0.9942,1.0,1.0,0.9942,1.0,1.0
0,mnist,7603,100,9.2069,0.8175,0.8143,0.809,0.7749,0.7606,0.8058,...,0.8034,0.8279,0.8137,0.7704,0.7775,0.8034,0.8228,0.821,0.8043,0.7525
0,musk,3062,166,3.1679,1.0,0.9991,1.0,1.0,1.0,0.9999,...,1.0,0.9999,1.0,1.0,0.9976,0.9998,0.9998,0.9999,0.998,0.9999
0,optdigits,5216,64,2.8758,0.6899,0.7188,0.7498,0.7662,0.7131,0.6946,...,0.7185,0.7019,0.7078,0.735,0.7295,0.6886,0.6901,0.5631,0.6931,0.7257
0,pendigits,6870,16,2.2707,0.9375,0.9138,0.9397,0.9429,0.9273,0.9504,...,0.9358,0.9129,0.9372,0.9436,0.9479,0.9432,0.9444,0.9703,0.9337,0.938


In [7]:
print('Precision @ n Performance')
prn_df

Precision @ n Performance


Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,IQR,MAD,FWFM,YJ,KMEANS,ZSCORE,...,BOOT,MCST,HIST,MOLL,CHAU,GESD,MTT,SHIFT,KARCH,OCSVM
0,arrhythmia,452,274,14.6018,0.5714,0.5714,0.5714,0.6071,0.5357,0.5357,...,0.5357,0.5714,0.5,0.4643,0.5,0.5,0.5357,0.5714,0.5357,0.5
0,cardio,1831,21,9.6122,0.5143,0.5,0.4571,0.4429,0.5143,0.4571,...,0.4143,0.5,0.4571,0.4857,0.4714,0.5143,0.5857,0.5857,0.4571,0.5143
0,glass,214,9,4.2056,0.2,0.2,0.2,0.2,0.2,0.2,...,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
0,ionosphere,351,33,35.8974,0.5652,0.5652,0.5652,0.5435,0.5217,0.5652,...,0.5652,0.5435,0.5,0.5435,0.5217,0.5435,0.5652,0.5435,0.5217,0.5435
0,letter,1600,32,6.25,0.0976,0.0732,0.0976,0.0976,0.0976,0.0732,...,0.0488,0.0976,0.122,0.0732,0.0488,0.0976,0.0732,0.0488,0.0732,0.0732
0,lympho,148,18,4.0541,0.6667,0.6667,1.0,1.0,1.0,0.6667,...,1.0,0.6667,0.6667,1.0,0.6667,1.0,1.0,0.6667,1.0,1.0
0,mnist,7603,100,9.2069,0.3148,0.3296,0.3074,0.2444,0.237,0.2704,...,0.2741,0.337,0.2926,0.2333,0.2593,0.3074,0.337,0.2926,0.2778,0.2074
0,musk,3062,166,3.1679,1.0,0.9024,1.0,1.0,1.0,0.9512,...,0.9756,0.9756,1.0,1.0,0.8537,0.9756,0.9512,0.9512,0.9024,0.9756
0,optdigits,5216,64,2.8758,0.0154,0.0154,0.0308,0.0308,0.0615,0.0154,...,0.0308,0.0,0.0308,0.0308,0.0462,0.0308,0.0308,0.0,0.0308,0.0615
0,pendigits,6870,16,2.2707,0.2258,0.2097,0.3387,0.2258,0.2419,0.3226,...,0.3065,0.2419,0.2581,0.1774,0.2903,0.2581,0.2581,0.3548,0.2581,0.2419
