# Import Python Packages

In [1]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat

In [2]:
!pip install pyod



# Import Pyod Packages & the methods

In [3]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



# Import Metrics Packages

In [4]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

# Define data file and read X and y

In [5]:
mat_file_list = ['arrhythmia.mat',
                 'cardio.mat',
                 'glass.mat',
                 'ionosphere.mat',
                 'letter.mat',
                 'lympho.mat',
                 'mnist.mat',
                 'musk.mat',
                 'optdigits.mat',
                 'pendigits.mat',
                 'pima.mat',
                 'satellite.mat',
                 'satimage-2.mat',
                 'shuttle.mat',
                 'vertebral.mat',
                 'vowels.mat',
                 'wbc.mat']

In [6]:
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

# How to load Mat File 

In [7]:
from scipy.io import loadmat

In [8]:
data=loadmat('../data/cardio.mat')

In [9]:
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [10]:
len(data)

5

In [11]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [12]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

# Input(Independent) Feature Shape in Mat file format

In [13]:
type(data['X']),data['X'].shape

(numpy.ndarray, (1831, 21))

# Dependent/ Target /Output Feature shape

In [14]:
type(data['y']),data['y'].shape

(numpy.ndarray, (1831, 1))

In [15]:
df_columns = ['Data', '#Samples', '# Dimensions', 'Outlier Perc',
              'ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest', 'KNN', 'LOF', 'MCD',
              'OCSVM', 'PCA']

# ROC Performance evulotion table

In [16]:
roc_df = pd.DataFrame(columns=df_columns)


In [17]:
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# precision_n_scores - Performance evulotion table

In [18]:
prn_df = pd.DataFrame(columns=df_columns)
prn_df


Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# Time dataframe

In [19]:
time_df = pd.DataFrame(columns=df_columns)

In [20]:
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# Exploraing All Mat files

In [21]:
from time import time
random_state = np.random.RandomState(42)

for mat_file in mat_file_list:
    print("\n... Processing", mat_file, '...')
    mat = loadmat(os.path.join('../data', mat_file))

    X = mat['X']
    y = mat['y'].ravel()
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                        random_state=random_state)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(
        contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor': CBLOF(
            contamination=outliers_fraction, check_estimator=False,
            random_state=random_state),
        'Feature Bagging': FeatureBagging(contamination=outliers_fraction,
                                          random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)': HBOS(
            contamination=outliers_fraction),
        'Isolation Forest': IForest(contamination=outliers_fraction,
                                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(
            contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)': MCD(
            contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)': PCA(
            contamination=outliers_fraction, random_state=random_state),
    }

    for clf_name, clf in classifiers.items():
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(
            clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        roc_list.append(roc)
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)

    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 3.0879s




Cluster-based Local Outlier Factor ROC:0.7684, precision @ rank n:0.4643, execution time: 3.0459s
Feature Bagging ROC:0.7799, precision @ rank n:0.5, execution time: 1.3434s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 4.0855s
Isolation Forest ROC:0.8478, precision @ rank n:0.5357, execution time: 0.772s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.2334s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.1811s




Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 1.1639s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.0859s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.1371s

... Processing cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5763, precision @ rank n:0.1875, execution time: 0.669s
Cluster-based Local Outlier Factor ROC:0.8221, precision @ rank n:0.4844, execution time: 0.1948s




Feature Bagging ROC:0.4879, precision @ rank n:0.1406, execution time: 1.1015s
Histogram-base Outlier Detection (HBOS) ROC:0.8453, precision @ rank n:0.4688, execution time: 0.0113s
Isolation Forest ROC:0.9316, precision @ rank n:0.4531, execution time: 1.1807s
K Nearest Neighbors (KNN) ROC:0.6959, precision @ rank n:0.2812, execution time: 0.3503s
Local Outlier Factor (LOF) ROC:0.4715, precision @ rank n:0.125, execution time: 0.2292s




Minimum Covariance Determinant (MCD) ROC:0.8778, precision @ rank n:0.3906, execution time: 1.213s
One-class SVM (OCSVM) ROC:0.9507, precision @ rank n:0.5938, execution time: 0.1593s
Principal Component Analysis (PCA) ROC:0.9638, precision @ rank n:0.6875, execution time: 0.0138s

... Processing glass.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7104, precision @ rank n:0.25, execution time: 0.1317s




Cluster-based Local Outlier Factor ROC:0.8506, precision @ rank n:0.25, execution time: 0.1246s
Feature Bagging ROC:0.7043, precision @ rank n:0.25, execution time: 0.1048s
Histogram-base Outlier Detection (HBOS) ROC:0.6524, precision @ rank n:0.0, execution time: 0.0065s
Isolation Forest ROC:0.7195, precision @ rank n:0.25, execution time: 0.9889s
K Nearest Neighbors (KNN) ROC:0.7805, precision @ rank n:0.25, execution time: 0.0295s
Local Outlier Factor (LOF) ROC:0.7774, precision @ rank n:0.25, execution time: 0.0077s
Minimum Covariance Determinant (MCD) ROC:0.7165, precision @ rank n:0.0, execution time: 0.1132s
One-class SVM (OCSVM) ROC:0.6189, precision @ rank n:0.25, execution time: 0.0032s
Principal Component Analysis (PCA) ROC:0.622, precision @ rank n:0.25, execution time: 0.0074s

... Processing ionosphere.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9004, precision @ rank n:0.8214, execution time: 0.1871s
Cluster-based Local Outlier Factor ROC:0.8952, precision @ rank n



Feature Bagging ROC:0.8933, precision @ rank n:0.75, execution time: 0.1666s
Histogram-base Outlier Detection (HBOS) ROC:0.5195, precision @ rank n:0.3393, execution time: 0.0177s
Isolation Forest ROC:0.8294, precision @ rank n:0.6607, execution time: 0.9535s
K Nearest Neighbors (KNN) ROC:0.9134, precision @ rank n:0.8393, execution time: 0.064s
Local Outlier Factor (LOF) ROC:0.8989, precision @ rank n:0.75, execution time: 0.028s
Minimum Covariance Determinant (MCD) ROC:0.9399, precision @ rank n:0.8571, execution time: 0.1798s
One-class SVM (OCSVM) ROC:0.8372, precision @ rank n:0.7143, execution time: 0.0205s
Principal Component Analysis (PCA) ROC:0.7971, precision @ rank n:0.5893, execution time: 0.0215s

... Processing letter.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.8465, precision @ rank n:0.275, execution time: 1.0991s




Cluster-based Local Outlier Factor ROC:0.7423, precision @ rank n:0.175, execution time: 0.2836s
Feature Bagging ROC:0.866, precision @ rank n:0.4, execution time: 2.2322s
Histogram-base Outlier Detection (HBOS) ROC:0.5728, precision @ rank n:0.125, execution time: 0.0392s
Isolation Forest ROC:0.5836, precision @ rank n:0.05, execution time: 1.2008s
K Nearest Neighbors (KNN) ROC:0.845, precision @ rank n:0.3, execution time: 0.4037s
Local Outlier Factor (LOF) ROC:0.8409, precision @ rank n:0.325, execution time: 0.3005s
Minimum Covariance Determinant (MCD) ROC:0.7499, precision @ rank n:0.075, execution time: 2.3654s
One-class SVM (OCSVM) ROC:0.5744, precision @ rank n:0.1, execution time: 0.1754s
Principal Component Analysis (PCA) ROC:0.48, precision @ rank n:0.05, execution time: 0.016s

... Processing lympho.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9382, precision @ rank n:0.4, execution time: 0.0944s




Cluster-based Local Outlier Factor ROC:0.9709, precision @ rank n:0.6, execution time: 0.1348s
Feature Bagging ROC:0.9673, precision @ rank n:0.6, execution time: 0.1254s
Histogram-base Outlier Detection (HBOS) ROC:0.9964, precision @ rank n:0.8, execution time: 0.0093s
Isolation Forest ROC:0.9855, precision @ rank n:0.6, execution time: 0.8664s
K Nearest Neighbors (KNN) ROC:0.9636, precision @ rank n:0.6, execution time: 0.0098s
Local Outlier Factor (LOF) ROC:0.9636, precision @ rank n:0.6, execution time: 0.0081s
Minimum Covariance Determinant (MCD) ROC:0.9164, precision @ rank n:0.6, execution time: 0.1002s
One-class SVM (OCSVM) ROC:0.9636, precision @ rank n:0.6, execution time: 0.0051s
Principal Component Analysis (PCA) ROC:0.9818, precision @ rank n:0.8, execution time: 0.0058s

... Processing



 mnist.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7813, precision @ rank n:0.3562, execution time: 17.1418s




Cluster-based Local Outlier Factor ROC:0.8447, precision @ rank n:0.4007, execution time: 0.9961s
Feature Bagging ROC:0.7259, precision @ rank n:0.3664, execution time: 125.7463s
Histogram-base Outlier Detection (HBOS) ROC:0.5675, precision @ rank n:0.1199, execution time: 0.1771s
Isolation Forest ROC:0.7813, precision @ rank n:0.3116, execution time: 5.598s
K Nearest Neighbors (KNN) ROC:0.8409, precision @ rank n:0.4144, execution time: 16.6752s
Local Outlier Factor (LOF) ROC:0.7085, precision @ rank n:0.339, execution time: 15.8075s




Minimum Covariance Determinant (MCD) ROC:0.863, precision @ rank n:0.3973, execution time: 5.5237s
One-class SVM (OCSVM) ROC:0.8417, precision @ rank n:0.3801, execution time: 10.3364s
Principal Component Analysis (PCA) ROC:0.8396, precision @ rank n:0.3767, execution time: 0.285s

... Processing musk.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.0809, precision @ rank n:0.0333, execution time: 5.8764s




Cluster-based Local Outlier Factor ROC:1.0, precision @ rank n:1.0, execution time: 0.4994s
Feature Bagging ROC:0.5228, precision @ rank n:0.1667, execution time: 26.8459s
Histogram-base Outlier Detection (HBOS) ROC:0.9999, precision @ rank n:0.9667, execution time: 0.1298s
Isolation Forest ROC:0.9992, precision @ rank n:0.9, execution time: 3.1083s
K Nearest Neighbors (KNN) ROC:0.7348, precision @ rank n:0.2333, execution time: 4.1286s
Local Outlier Factor (LOF) ROC:0.5323, precision @ rank n:0.1333, execution time: 3.8464s
Minimum Covariance Determinant (MCD) ROC:1.0, precision @ rank n:0.9667, execution time: 19.219s
One-class SVM (OCSVM) ROC:1.0, precision @ rank n:1.0, execution time: 2.7235s
Principal Component Analysis (PCA) ROC:1.0, precision @ rank n:1.0, execution time: 0.3339s

... Processing optdigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.4428, precision @ rank n:0.0161, execution time: 7.2024s




Cluster-based Local Outlier Factor ROC:0.7852, precision @ rank n:0.0, execution time: 0.6747s
Feature Bagging ROC:0.4641, precision @ rank n:0.0484, execution time: 35.4109s
Histogram-base Outlier Detection (HBOS) ROC:0.8822, precision @ rank n:0.2581, execution time: 0.0815s
Isolation Forest ROC:0.5442, precision @ rank n:0.0161, execution time: 2.6143s
K Nearest Neighbors (KNN) ROC:0.3824, precision @ rank n:0.0, execution time: 4.7856s
Local Outlier Factor (LOF) ROC:0.4584, precision @ rank n:0.0484, execution time: 4.1773s




Minimum Covariance Determinant (MCD) ROC:0.3486, precision @ rank n:0.0, execution time: 2.2305s
One-class SVM (OCSVM) ROC:0.4972, precision @ rank n:0.0, execution time: 3.1378s
Principal Component Analysis (PCA) ROC:0.504, precision @ rank n:0.0, execution time: 0.1046s

... Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7008, precision @ rank n:0.0308, execution time: 4.3068s




Cluster-based Local Outlier Factor ROC:0.9609, precision @ rank n:0.3077, execution time: 0.5637s
Feature Bagging ROC:0.4687, precision @ rank n:0.0462, execution time: 12.1949s
Histogram-base Outlier Detection (HBOS) ROC:0.9294, precision @ rank n:0.2615, execution time: 0.0248s
Isolation Forest ROC:0.9482, precision @ rank n:0.2615, execution time: 1.9715s
K Nearest Neighbors (KNN) ROC:0.7602, precision @ rank n:0.0462, execution time: 1.9056s
Local Outlier Factor (LOF) ROC:0.481, precision @ rank n:0.0462, execution time: 1.6426s
Minimum Covariance Determinant (MCD) ROC:0.8271, precision @ rank n:0.0615, execution time: 4.3741s
One-class SVM (OCSVM) ROC:0.93, precision @ rank n:0.2923, execution time: 2.4307s
Principal Component Analysis (PCA) ROC:0.9332, precision @ rank n:0.3385, execution time: 0.0228s

... Processing pima.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.6757, precision @ rank n:0.5106, execution time: 0.4687s




Cluster-based Local Outlier Factor ROC:0.684, precision @ rank n:0.4681, execution time: 0.2166s
Feature Bagging ROC:0.6446, precision @ rank n:0.4468, execution time: 0.2837s
Histogram-base Outlier Detection (HBOS) ROC:0.7169, precision @ rank n:0.5213, execution time: 0.0101s
Isolation Forest ROC:0.6777, precision @ rank n:0.4787, execution time: 0.8876s
K Nearest Neighbors (KNN) ROC:0.7252, precision @ rank n:0.5106, execution time: 0.0767s
Local Outlier Factor (LOF) ROC:0.6604, precision @ rank n:0.4787, execution time: 0.0351s
Minimum Covariance Determinant (MCD) ROC:0.7047, precision @ rank n:0.4787, execution time: 0.1352s
One-class SVM (OCSVM) ROC:0.6423, precision @ rank n:0.4574, execution time: 0.0276s
Principal Component Analysis (PCA) ROC:0.6639, precision @ rank n:0.5, execution time: 0.002s

... Processing satellite.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5681, precision @ rank n:0.3918, execution time: 5.4974s




Cluster-based Local Outlier Factor ROC:0.7234, precision @ rank n:0.5574, execution time: 0.518s
Feature Bagging ROC:0.557, precision @ rank n:0.4051, execution time: 19.2065s
Histogram-base Outlier Detection (HBOS) ROC:0.7393, precision @ rank n:0.5466, execution time: 0.0491s
Isolation Forest ROC:0.7094, precision @ rank n:0.578, execution time: 2.0297s
K Nearest Neighbors (KNN) ROC:0.6781, precision @ rank n:0.4994, execution time: 2.6326s
Local Outlier Factor (LOF) ROC:0.5551, precision @ rank n:0.4051, execution time: 2.5914s
Minimum Covariance Determinant (MCD) ROC:0.792, precision @ rank n:0.6747, execution time: 4.4535s
One-class SVM (OCSVM) ROC:0.636, precision @ rank n:0.5224, execution time: 3.5661s
Principal Component Analysis (PCA) ROC:0.5783, precision @ rank n:0.4559, execution time: 0.0617s

... Processing satimage-2.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.86, precision @ rank n:0.2593, execution time: 5.2477s




Cluster-based Local Outlier Factor ROC:0.9987, precision @ rank n:0.8889, execution time: 0.4969s
Feature Bagging ROC:0.4971, precision @ rank n:0.0741, execution time: 17.1768s
Histogram-base Outlier Detection (HBOS) ROC:0.9837, precision @ rank n:0.5926, execution time: 0.0506s
Isolation Forest ROC:0.9973, precision @ rank n:0.8889, execution time: 1.8037s
K Nearest Neighbors (KNN) ROC:0.9505, precision @ rank n:0.3704, execution time: 2.2656s
Local Outlier Factor (LOF) ROC:0.5006, precision @ rank n:0.0741, execution time: 2.1174s
Minimum Covariance Determinant (MCD) ROC:0.9946, precision @ rank n:0.5185, execution time: 4.1394s
One-class SVM (OCSVM) ROC:0.9976, precision @ rank n:0.9259, execution time: 2.6469s
Principal Component Analysis (PCA) ROC:0.9841, precision @ rank n:0.8519, execution time: 0.0579s

... Processing shuttle.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.6186, precision @ rank n:0.1918, execution time: 42.7357s




Cluster-based Local Outlier Factor ROC:0.6286, precision @ rank n:0.2336, execution time: 1.4817s
Feature Bagging ROC:0.5211, precision @ rank n:0.111, execution time: 140.7285s
Histogram-base Outlier Detection (HBOS) ROC:0.9851, precision @ rank n:0.9857, execution time: 0.0397s
Isolation Forest ROC:0.9972, precision @ rank n:0.9337, execution time: 7.3397s
K Nearest Neighbors (KNN) ROC:0.645, precision @ rank n:0.2199, execution time: 25.8985s
Local Outlier Factor (LOF) ROC:0.5347, precision @ rank n:0.1406, execution time: 38.1631s








Minimum Covariance Determinant (MCD) ROC:0.9903, precision @ rank n:0.7534, execution time: 44.7114s
One-class SVM (OCSVM) ROC:0.9922, precision @ rank n:0.9553, execution time: 9236.2039s
Principal Component Analysis (PCA) ROC:0.9902, precision @ rank n:0.9503, execution time: 0.0904s

... Processing vertebral.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.2797, precision @ rank n:0.0, execution time: 0.1545s
Cluster-based Local Outlier Factor ROC:0.3908, precision @ rank n:0.0, execution time: 0.135s




Feature Bagging ROC:0.3027, precision @ rank n:0.0, execution time: 0.0815s
Histogram-base Outlier Detection (HBOS) ROC:0.2695, precision @ rank n:0.0, execution time: 0.0048s
Isolation Forest ROC:0.3576, precision @ rank n:0.0, execution time: 0.9592s
K Nearest Neighbors (KNN) ROC:0.318, precision @ rank n:0.0, execution time: 0.0274s
Local Outlier Factor (LOF) ROC:0.318, precision @ rank n:0.0, execution time: 0.0068s
Minimum Covariance Determinant (MCD) ROC:0.3308, precision @ rank n:0.0, execution time: 0.1372s
One-class SVM (OCSVM) ROC:0.4087, precision @ rank n:0.0, execution time: 0.0031s
Principal Component Analysis (PCA) ROC:0.3397, precision @ rank n:0.0, execution time: 0.004s

... Processing vowels.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9521, precision @ rank n:0.4706, execution time: 0.9724s
Cluster-based Local Outlier Factor ROC:0.9278, precision @ rank n:0.4118, execution time: 0.1878s




Feature Bagging ROC:0.9385, precision @ rank n:0.3529, execution time: 0.6633s
Histogram-base Outlier Detection (HBOS) ROC:0.6758, precision @ rank n:0.1765, execution time: 0.008s
Isolation Forest ROC:0.7469, precision @ rank n:0.1176, execution time: 0.808s
K Nearest Neighbors (KNN) ROC:0.9568, precision @ rank n:0.5294, execution time: 0.0976s
Local Outlier Factor (LOF) ROC:0.9345, precision @ rank n:0.4118, execution time: 0.0389s
Minimum Covariance Determinant (MCD) ROC:0.6779, precision @ rank n:0.0, execution time: 0.9834s
One-class SVM (OCSVM) ROC:0.7415, precision @ rank n:0.2941, execution time: 0.0513s
Principal Component Analysis (PCA) ROC:0.5787, precision @ rank n:0.1176, execution time: 0.0042s

... Processing wbc.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9232, precision @ rank n:0.3, execution time: 0.1506s
Cluster-based Local Outlier Factor ROC:0.9063, precision @ rank n:0.6, execution time: 0.1049s




Feature Bagging ROC:0.9415, precision @ rank n:0.5, execution time: 0.1222s
Histogram-base Outlier Detection (HBOS) ROC:0.9592, precision @ rank n:0.7, execution time: 0.0138s
Isolation Forest ROC:0.9451, precision @ rank n:0.5, execution time: 0.6482s
K Nearest Neighbors (KNN) ROC:0.9437, precision @ rank n:0.5, execution time: 0.0302s
Local Outlier Factor (LOF) ROC:0.9352, precision @ rank n:0.4, execution time: 0.0108s
Minimum Covariance Determinant (MCD) ROC:0.8986, precision @ rank n:0.4, execution time: 0.0841s
One-class SVM (OCSVM) ROC:0.9408, precision @ rank n:0.5, execution time: 0.0112s
Principal Component Analysis (PCA) ROC:0.9324, precision @ rank n:0.6, execution time: 0.01s


In [22]:
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8478,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5763,0.8221,0.4879,0.8453,0.9316,0.6959,0.4715,0.8778,0.9507,0.9638
0,glass,214,9,4.2056,0.7104,0.8506,0.7043,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,35.8974,0.9004,0.8952,0.8933,0.5195,0.8294,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,6.25,0.8465,0.7423,0.866,0.5728,0.5836,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,4.0541,0.9382,0.9709,0.9673,0.9964,0.9855,0.9636,0.9636,0.9164,0.9636,0.9818
0,mnist,7603,100,9.2069,0.7813,0.8447,0.7259,0.5675,0.7813,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,3.1679,0.0809,1.0,0.5228,0.9999,0.9992,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4428,0.7852,0.4641,0.8822,0.5442,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,2.2707,0.7008,0.9609,0.4687,0.9294,0.9482,0.7602,0.481,0.8271,0.93,0.9332


In [23]:
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5357,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1875,0.4844,0.1406,0.4688,0.4531,0.2812,0.125,0.3906,0.5938,0.6875
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8214,0.8036,0.75,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter,1600,32,6.25,0.275,0.175,0.4,0.125,0.05,0.3,0.325,0.075,0.1,0.05
0,lympho,148,18,4.0541,0.4,0.6,0.6,0.8,0.6,0.6,0.6,0.6,0.6,0.8
0,mnist,7603,100,9.2069,0.3562,0.4007,0.3664,0.1199,0.3116,0.4144,0.339,0.3973,0.3801,0.3767
0,musk,3062,166,3.1679,0.0333,1.0,0.1667,0.9667,0.9,0.2333,0.1333,0.9667,1.0,1.0
0,optdigits,5216,64,2.8758,0.0161,0.0,0.0484,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0308,0.3077,0.0462,0.2615,0.2615,0.0462,0.0462,0.0615,0.2923,0.3385


In [24]:
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,3.0879,3.0459,1.3434,4.0855,0.772,0.2334,0.1811,1.1639,0.0859,0.1371
0,cardio,1831,21,9.6122,0.669,0.1948,1.1015,0.0113,1.1807,0.3503,0.2292,1.213,0.1593,0.0138
0,glass,214,9,4.2056,0.1317,0.1246,0.1048,0.0065,0.9889,0.0295,0.0077,0.1132,0.0032,0.0074
0,ionosphere,351,33,35.8974,0.1871,0.1573,0.1666,0.0177,0.9535,0.064,0.028,0.1798,0.0205,0.0215
0,letter,1600,32,6.25,1.0991,0.2836,2.2322,0.0392,1.2008,0.4037,0.3005,2.3654,0.1754,0.016
0,lympho,148,18,4.0541,0.0944,0.1348,0.1254,0.0093,0.8664,0.0098,0.0081,0.1002,0.0051,0.0058
0,mnist,7603,100,9.2069,17.1418,0.9961,125.746,0.1771,5.598,16.6752,15.8075,5.5237,10.3364,0.285
0,musk,3062,166,3.1679,5.8764,0.4994,26.8459,0.1298,3.1083,4.1286,3.8464,19.219,2.7235,0.3339
0,optdigits,5216,64,2.8758,7.2024,0.6747,35.4109,0.0815,2.6143,4.7856,4.1773,2.2305,3.1378,0.1046
0,pendigits,6870,16,2.2707,4.3068,0.5637,12.1949,0.0248,1.9715,1.9056,1.6426,4.3741,2.4307,0.0228


In [25]:
# Define the number of inliers and outliers
n_samples = len(y)
clusters_separation = [0]

n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.zeros(n_samples, dtype=int)
ground_truth[-n_outliers:] = 1



In [26]:
# Show the statics of the data
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print('Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'.format(shape=ground_truth.shape))
print(ground_truth)

Number of inliers: 357
Number of outliers: 21
Ground truth shape is (378,). Outlier are 1 and inliers are 0.

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]


In [27]:
# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)

Model 1 Angle-based Outlier Detector (ABOD)
Model 2 Cluster-based Local Outlier Factor
Model 3 Feature Bagging
Model 4 Histogram-base Outlier Detection (HBOS)
Model 5 Isolation Forest
Model 6 K Nearest Neighbors (KNN)
Model 7 Local Outlier Factor (LOF)
Model 8 Minimum Covariance Determinant (MCD)
Model 9 One-class SVM (OCSVM)
Model 10 Principal Component Analysis (PCA)
