## Import Python Packages

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import loadmat
import warnings
warnings.filterwarnings('ignore')

## Import Pyod Packages and methods

In [3]:
pip install pyod

Collecting pyod
  Downloading pyod-0.8.9.tar.gz (104 kB)
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py): started
  Building wheel for pyod (setup.py): finished with status 'done'
  Created wheel for pyod: filename=pyod-0.8.9-py3-none-any.whl size=121416 sha256=70aecb02bbdc43c426c9471c73300daa19fba6a9f11ddf0e7e9c8425d8a07f00
  Stored in directory: c:\users\lr\appdata\local\pip\cache\wheels\8f\d9\6d\df101a4fa21ac257176d4c6ff4e24edd9c1fd992e53a0a7535
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-0.8.9
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install combo

Collecting combo
  Downloading combo-0.1.2.tar.gz (37 kB)

Building wheels for collected packages: combo
  Building wheel for combo (setup.py): started
  Building wheel for combo (setup.py): finished with status 'done'
  Created wheel for combo: filename=combo-0.1.2-py3-none-any.whl size=42032 sha256=ed6aedf3cd466466967baba1edf3bbb62e84b7a3cab45163fcc2c47f19947c07
  Stored in directory: c:\users\lr\appdata\local\pip\cache\wheels\40\d4\93\ea2224f96bd2868538da3b8231211c42632221600b080ddf16
Successfully built combo
Installing collected packages: combo
Successfully installed combo-0.1.2


In [7]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

## Import Metrics Packages

In [8]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

## Load Data

In [9]:
mat_file_list = ['arrhythmia.mat','cardio.mat',
                'glass.mat','ionosphere.mat','letter.mat',
                'lympho.mat','mnist.mat','musk.mat','optdigits.mat',
                'pendigits.mat','pima.mat','satellite.mat','satimage-2.mat',
                 'shuttle.mat','vertebral.mat','vowels.mat','wbc.mat']

In [10]:
len(mat_file_list)

17

In [11]:
df_columns = ['Data','#Sample','#Dimensions','Outlier Perc','PCA',
              'MCD','OCSVM','LOF','CBLOF','KNN','HBOS','ABOD','IFOREST',
              'FEATUREBAGGING']

In [13]:
data=loadmat("D:\Projects/cardio.mat")

In [14]:
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

## ROC Dataframe

In [15]:
roc_df = pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING


## Precison Dataframe

In [16]:
prn_df = pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING


## Execution Time Dataframe

In [17]:
time_df = pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING


## Exploring All Mat files

In [21]:
from time import time
# Creating random state
random_state = np.random.RandomState(42)

# Processing mat files one by one : 
for mat_file in mat_file_list:
    print("\n... Processing", mat_file, '...')
    mat = loadmat(os.path.join('D:\Projects\data', mat_file))

    X = mat['X']
    y = mat['y'].ravel()  #converts 2D to 1D
    
    # Counts the number of non-zero values in the array y and divide by length of y 
    outliers_fraction = np.count_nonzero(y) / len(y)
    
    # Calculating Outlier percentage
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # Construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state)
    
    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
                   
                   'Cluster-based Local Outlier Factor': CBLOF(contamination=outliers_fraction, check_estimator=False,
                                                               random_state=random_state),
                   
                   'Feature Bagging': FeatureBagging(contamination=outliers_fraction, random_state=random_state),
                   
                   'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                   
                   'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state),
                   
                   'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
                   
                   'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
                   
                   'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state),
                   
                   'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
                   
                   'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state),
   }
# Calculating Time taken for each algorithm
    for clf_name, clf in classifiers.items():
        # Initialize the start time 
        t0 = time() 
        
        # Fit( Train )the data
        clf.fit(X_train_norm) 
        
        # Predicting Value on Xtest
        test_scores = clf.decision_function(X_test_norm)  
        
        # Final Time
        t1 = time()   
        
        # Total time duration : t1 - t0
        duration = round(t1 - t0, ndigits=4) 
        
        # Append duration in time list
        time_list.append(duration)

        #Calculating roc and precision value of the algorithm
        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        # Print the roc , precision and executing time 
        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        # Append roc and precision value to their respective list
        roc_list.append(roc)
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)
    
    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 8.1318s
Cluster-based Local Outlier Factor ROC:0.7684, precision @ rank n:0.4643, execution time: 2.3369s
Feature Bagging ROC:0.7799, precision @ rank n:0.5, execution time: 0.6171s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 1.8343s
Isolation Forest ROC:0.8478, precision @ rank n:0.5357, execution time: 0.7047s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.1075s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.0697s
Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 1.9093s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.0469s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.1961s

... Processing cardio.mat ...
Angle-based Outlier Detect

One-class SVM (OCSVM) ROC:0.4972, precision @ rank n:0.0, execution time: 1.3177s
Principal Component Analysis (PCA) ROC:0.504, precision @ rank n:0.0, execution time: 0.0625s

... Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7008, precision @ rank n:0.0308, execution time: 2.1389s
Cluster-based Local Outlier Factor ROC:0.9609, precision @ rank n:0.3077, execution time: 0.3991s
Feature Bagging ROC:0.4687, precision @ rank n:0.0462, execution time: 4.8775s
Histogram-base Outlier Detection (HBOS) ROC:0.9294, precision @ rank n:0.2615, execution time: 0.0156s
Isolation Forest ROC:0.9482, precision @ rank n:0.2615, execution time: 0.7975s
K Nearest Neighbors (KNN) ROC:0.7602, precision @ rank n:0.0462, execution time: 0.6701s
Local Outlier Factor (LOF) ROC:0.481, precision @ rank n:0.0462, execution time: 0.5937s
Minimum Covariance Determinant (MCD) ROC:0.8271, precision @ rank n:0.0615, execution time: 2.602s
One-class SVM (OCSVM) ROC:0.93, precision @ rank n:0.2

In [22]:
roc_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8478,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5763,0.8221,0.4879,0.8453,0.9316,0.6959,0.4715,0.8778,0.9507,0.9638
0,glass,214,9,4.2056,0.7104,0.8506,0.7043,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,35.8974,0.9004,0.8952,0.8933,0.5195,0.8294,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,6.25,0.8465,0.7423,0.866,0.5728,0.5836,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,4.0541,0.9382,0.9709,0.9673,0.9964,0.9855,0.9636,0.9636,0.9164,0.9636,0.9818
0,mnist,7603,100,9.2069,0.7813,0.8447,0.7259,0.5675,0.7813,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,3.1679,0.0809,1.0,0.5228,0.9999,0.9992,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4428,0.7852,0.4641,0.8822,0.5442,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,2.2707,0.7008,0.9609,0.4687,0.9294,0.9482,0.7602,0.481,0.8271,0.93,0.9332


In [23]:
prn_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5357,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1875,0.4844,0.1406,0.4688,0.4531,0.2812,0.125,0.3906,0.5938,0.6875
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8214,0.8036,0.75,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter,1600,32,6.25,0.275,0.175,0.4,0.125,0.05,0.3,0.325,0.075,0.1,0.05
0,lympho,148,18,4.0541,0.4,0.6,0.6,0.8,0.6,0.6,0.6,0.6,0.6,0.8
0,mnist,7603,100,9.2069,0.3562,0.4007,0.3664,0.1199,0.3116,0.4144,0.339,0.3973,0.3801,0.3767
0,musk,3062,166,3.1679,0.0333,1.0,0.1667,0.9667,0.9,0.2333,0.1333,0.9667,1.0,1.0
0,optdigits,5216,64,2.8758,0.0161,0.0,0.0484,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0308,0.3077,0.0462,0.2615,0.2615,0.0462,0.0462,0.0615,0.2923,0.3385


In [24]:
time_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,8.1318,2.3369,0.6171,1.8343,0.7047,0.1075,0.0697,1.9093,0.0469,0.1961
0,cardio,1831,21,9.6122,0.6584,0.1945,0.9832,0.0,0.5205,0.1719,0.1024,0.779,0.0937,0.0781
0,glass,214,9,4.2056,0.0654,0.0554,0.0469,0.0,0.3427,0.0,0.0156,0.0937,0.0,0.0
0,ionosphere,351,33,35.8974,0.0937,0.0666,0.0781,0.0156,0.3715,0.0312,0.0,0.0937,0.0,0.0781
0,letter,1600,32,6.25,0.5944,0.1105,0.7403,0.0,0.462,0.1406,0.105,1.3815,0.0781,0.0
0,lympho,148,18,4.0541,0.0469,0.0469,0.0642,0.0,0.3567,0.0,0.0156,0.0312,0.0,0.0
0,mnist,7603,100,9.2069,7.933,0.8145,51.4316,0.0622,2.1934,6.9693,12.0022,9.2352,8.3195,0.2691
0,musk,3062,166,3.1679,3.8632,0.4467,16.3965,0.0637,1.6241,1.8346,1.5809,12.7202,1.0618,0.1422
0,optdigits,5216,64,2.8758,2.8162,0.3958,13.716,0.0469,1.0823,1.9291,1.5983,1.443,1.3177,0.0625
0,pendigits,6870,16,2.2707,2.1389,0.3991,4.8775,0.0156,0.7975,0.6701,0.5937,2.602,0.9739,0.0
