In [14]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat

## Import PyOD packages

In [15]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging


## Import Metrics Packages


In [16]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score
from time import time

## Define data file and read X and y

In [19]:
mat_file_list =['arrhythmia.mat',
'cardio.mat',
'glass.mat',
'ionosphere.mat',
'letter.mat',
'lympho.mat',
'mnist.mat',
'musk.mat',
'optdigits.mat',
'pendigits.mat',
'pima.mat',
'satellite.mat',
'satimage-2.mat',
'shuttle.mat',
'vertebral.mat',
'vowels.mat',
'wbc.mat']

random_state=np.random.RandomState(42)

print(mat_file_list[0])

arrhythmia.mat


In [9]:
df_columns=['Data','#Samples','#Dimensions','Outlier Perc','ABOD','CBLOF','FB','HBOS','Iforest','KNN','LOF','MCD','OCSVM','PCA']

# ROC Performance Evaluation Table 

In [10]:
roc_df=pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,Iforest,KNN,LOF,MCD,OCSVM,PCA


# Precision Performance Evaluation Table

In [7]:
prn_df=pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,Iforest,KNN,LOF,MCD,OCSVM,PCA


# Time Dataframe

In [11]:
time_df=pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,Iforest,KNN,LOF,MCD,OCSVM,PCA


# Exploring all mat files

In [22]:
for mat_file in mat_file_list:
    print("Procesing ......." , mat_file)
    mat=loadmat(os.path.join('C:/Users/Manzoo/Learning/Project Day 1/dataset/Anamoly_detec_data/',mat_file))
    
    X=mat['X']
    y=mat['y'].ravel()
    
    outliers_fraction=np.count_nonzero(y)/len(y)
    outliers_percentage=round(outliers_fraction*100,ndigits=4)
    
    roc_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    
    prn_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    
    time_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    
    
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=random_state)    
    X_train_norm,X_test_norm=standardizer(X_train,X_test)
    
    classifiers={'Angle-based Oulier Detector (ABOD)':ABOD(contamination=outliers_fraction),
                'Cluster based Oulier Detector Factor':CBLOF(contamination=outliers_fraction,check_estimator=False,random_state=random_state),
                'Feature Bagging':FeatureBagging(contamination=outliers_fraction,random_state=random_state),
                'Histogram-based Outlier Detection (HBOS)':HBOS(contamination=outliers_fraction),
                'Isolation Forest':IForest(contamination=outliers_fraction,random_state=random_state),
                'K Nearest Neighbor KNN':KNN(contamination=outliers_fraction),
                'Local Outlier Factor(LOF)':LOF(contamination=outliers_fraction),
                'Minimum Covariance Determinant (MCD)':MCD(contamination=outliers_fraction,random_state=random_state),
                'One-class SVM(OCSVM)':OCSVM(contamination=outliers_fraction),
                'Principal Component Analysis (PCA)':PCA(contamination=outliers_fraction,random_state=random_state)}
    
    for clf_name,clf in classifiers.items():
        t0=time()
        clf.fit(X_train_norm)
        test_scores=clf.decision_function(X_test_norm)
        t1=time()
        duration=round(t1-t0,ndigits=4)
        time_list.append(duration)
        
        roc=round(roc_auc_score(y_test,test_scores),ndigits=4)
        prn=round(precision_n_scores(y_test,test_scores),ndigits=4)
        
        print(clf_name,'ROC:',roc,'PRN:',prn,'DURATION:',duration)
        
        roc_list.append(roc)
        prn_list.append(prn)
        
    temp_df=pd.DataFrame(time_list).transpose()
    temp_df.columns=df_columns
    time_df=pd.concat([time_df,temp_df],axis=0)
        
        
    temp_df=pd.DataFrame(roc_list).transpose()
    temp_df.columns=df_columns
    roc_df=pd.concat([roc_df,temp_df],axis=0)
    
    temp_df=pd.DataFrame(prn_list).transpose()
    temp_df.columns=df_columns
    prn_df=pd.concat([prn_df,temp_df],axis=0)

Procesing ....... arrhythmia.mat
Angle-based Oulier Detector (ABOD) ROC: 0.7762 PRN: 0.4 DURATION: 0.22
Cluster based Oulier Detector Factor ROC: 0.7982 PRN: 0.48 DURATION: 0.183
Feature Bagging ROC: 0.7885 PRN: 0.44 DURATION: 0.67
Histogram-based Outlier Detection (HBOS) ROC: 0.8195 PRN: 0.56 DURATION: 0.1
Isolation Forest ROC: 0.8067 PRN: 0.52 DURATION: 0.51
K Nearest Neighbor KNN ROC: 0.789 PRN: 0.48 DURATION: 0.09
Local Outlier Factor(LOF) ROC: 0.7882 PRN: 0.44 DURATION: 0.07




Minimum Covariance Determinant (MCD) ROC: 0.7541 PRN: 0.44 DURATION: 1.101
One-class SVM(OCSVM) ROC: 0.7956 PRN: 0.48 DURATION: 0.04
Principal Component Analysis (PCA) ROC: 0.7992 PRN: 0.48 DURATION: 0.08
Procesing ....... cardio.mat
Angle-based Oulier Detector (ABOD) ROC: 0.5865 PRN: 0.2778 DURATION: 0.68
Cluster based Oulier Detector Factor ROC: 0.7302 PRN: 0.2917 DURATION: 0.22
Feature Bagging ROC: 0.5641 PRN: 0.1944 DURATION: 1.01
Histogram-based Outlier Detection (HBOS) ROC: 0.8514 PRN: 0.3889 DURATION: 0.01
Isolation Forest ROC: 0.9311 PRN: 0.5278 DURATION: 0.762
K Nearest Neighbor KNN ROC: 0.7177 PRN: 0.3056 DURATION: 0.23
Local Outlier Factor(LOF) ROC: 0.5577 PRN: 0.1944 DURATION: 0.12




Minimum Covariance Determinant (MCD) ROC: 0.8456 PRN: 0.4306 DURATION: 0.882
One-class SVM(OCSVM) ROC: 0.9304 PRN: 0.4861 DURATION: 0.1
Principal Component Analysis (PCA) ROC: 0.9474 PRN: 0.5833 DURATION: 0.11
Procesing ....... glass.mat
Angle-based Oulier Detector (ABOD) ROC: 0.6353 PRN: 0.0 DURATION: 0.08
Cluster based Oulier Detector Factor ROC: 0.8 PRN: 0.0 DURATION: 0.08
Feature Bagging ROC: 0.7882 PRN: 0.0 DURATION: 0.05
Histogram-based Outlier Detection (HBOS) ROC: 0.6588 PRN: 0.0 DURATION: 0.01
Isolation Forest ROC: 0.5765 PRN: 0.0 DURATION: 0.38
K Nearest Neighbor KNN ROC: 0.7294 PRN: 0.0 DURATION: 0.02
Local Outlier Factor(LOF) ROC: 0.8706 PRN: 0.0 DURATION: 0.01
Minimum Covariance Determinant (MCD) ROC: 0.5765 PRN: 0.0 DURATION: 0.07
One-class SVM(OCSVM) ROC: 0.1059 PRN: 0.0 DURATION: 0.0
Principal Component Analysis (PCA) ROC: 0.4235 PRN: 0.0 DURATION: 0.04
Procesing ....... ionosphere.mat
Angle-based Oulier Detector (ABOD) ROC: 0.9159 PRN: 0.8448 DURATION: 0.14
Cluster bas



Minimum Covariance Determinant (MCD) ROC: 0.8248 PRN: 0.1661 DURATION: 3.973
One-class SVM(OCSVM) ROC: 0.8379 PRN: 0.3616 DURATION: 4.89
Principal Component Analysis (PCA) ROC: 0.8389 PRN: 0.3542 DURATION: 0.17
Procesing ....... musk.mat
Angle-based Oulier Detector (ABOD) ROC: 0.1985 PRN: 0.1081 DURATION: 2.55
Cluster based Oulier Detector Factor ROC: 1.0 PRN: 1.0 DURATION: 0.47
Feature Bagging ROC: 0.6993 PRN: 0.2162 DURATION: 11.471
Histogram-based Outlier Detection (HBOS) ROC: 0.9999 PRN: 0.9459 DURATION: 0.08
Isolation Forest ROC: 0.9998 PRN: 0.9459 DURATION: 1.551
K Nearest Neighbor KNN ROC: 0.786 PRN: 0.4324 DURATION: 1.88
Local Outlier Factor(LOF) ROC: 0.6832 PRN: 0.4595 DURATION: 1.661
Minimum Covariance Determinant (MCD) ROC: 0.9996 PRN: 0.973 DURATION: 17.039
One-class SVM(OCSVM) ROC: 1.0 PRN: 1.0 DURATION: 1.11
Principal Component Analysis (PCA) ROC: 1.0 PRN: 1.0 DURATION: 0.16
Procesing ....... optdigits.mat
Angle-based Oulier Detector (ABOD) ROC: 0.4822 PRN: 0.0455 DURATIO



Minimum Covariance Determinant (MCD) ROC: 0.3567 PRN: 0.0 DURATION: 2.026
One-class SVM(OCSVM) ROC: 0.488 PRN: 0.0 DURATION: 1.711
Principal Component Analysis (PCA) ROC: 0.5065 PRN: 0.0 DURATION: 0.09
Procesing ....... pendigits.mat
Angle-based Oulier Detector (ABOD) ROC: 0.6835 PRN: 0.0476 DURATION: 2.78
Cluster based Oulier Detector Factor ROC: 0.9626 PRN: 0.3492 DURATION: 0.32
Feature Bagging ROC: 0.4099 PRN: 0.0635 DURATION: 4.921
Histogram-based Outlier Detection (HBOS) ROC: 0.9122 PRN: 0.2857 DURATION: 0.01
Isolation Forest ROC: 0.9636 PRN: 0.3968 DURATION: 0.92
K Nearest Neighbor KNN ROC: 0.7525 PRN: 0.127 DURATION: 0.8
Local Outlier Factor(LOF) ROC: 0.4081 PRN: 0.0635 DURATION: 0.772
Minimum Covariance Determinant (MCD) ROC: 0.8147 PRN: 0.0635 DURATION: 3.475
One-class SVM(OCSVM) ROC: 0.9271 PRN: 0.3016 DURATION: 1.39
Principal Component Analysis (PCA) ROC: 0.9306 PRN: 0.2857 DURATION: 0.01
Procesing ....... pima.mat
Angle-based Oulier Detector (ABOD) ROC: 0.6554 PRN: 0.4722 D





Minimum Covariance Determinant (MCD) ROC: 0.9894 PRN: 0.7309 DURATION: 13.485
One-class SVM(OCSVM) ROC: 0.9931 PRN: 0.9597 DURATION: 67.6101
Principal Component Analysis (PCA) ROC: 0.9914 PRN: 0.9568 DURATION: 0.04
Procesing ....... vertebral.mat
Angle-based Oulier Detector (ABOD) ROC: 0.5089 PRN: 0.0833 DURATION: 0.09
Cluster based Oulier Detector Factor ROC: 0.4583 PRN: 0.0833 DURATION: 0.07
Feature Bagging ROC: 0.371 PRN: 0.0 DURATION: 0.05
Histogram-based Outlier Detection (HBOS) ROC: 0.2996 PRN: 0.0 DURATION: 0.0
Isolation Forest ROC: 0.369 PRN: 0.0 DURATION: 0.36
K Nearest Neighbor KNN ROC: 0.4851 PRN: 0.0 DURATION: 0.02
Local Outlier Factor(LOF) ROC: 0.4415 PRN: 0.0833 DURATION: 0.0
Minimum Covariance Determinant (MCD) ROC: 0.4772 PRN: 0.0 DURATION: 0.06
One-class SVM(OCSVM) ROC: 0.4167 PRN: 0.0 DURATION: 0.0
Principal Component Analysis (PCA) ROC: 0.3601 PRN: 0.0 DURATION: 0.01
Procesing ....... vowels.mat
Angle-based Oulier Detector (ABOD) ROC: 0.946 PRN: 0.4545 DURATION: 0.47

In [23]:
roc_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,Iforest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.8
0,arrhythmia,452,274,14.6018,0.7762,0.7982,0.7885,0.8195,0.8067,0.789,0.7882,0.7541,0.7956,0.7992
0,cardio,1831,21,9.6122,0.5865,0.7302,0.5641,0.8514,0.9311,0.7177,0.5577,0.8456,0.9304,0.9474
0,glass,214,9,4.2056,0.6353,0.8,0.7882,0.6588,0.5765,0.7294,0.8706,0.5765,0.1059,0.4235
0,ionosphere,351,33,35.8974,0.9159,0.9298,0.926,0.5966,0.8274,0.9362,0.9292,0.9636,0.8344,0.7578
0,letter,1600,32,6.25,0.8741,0.7538,0.8784,0.5419,0.626,0.8714,0.872,0.8118,0.6082,0.5306
0,lympho,148,18,4.0541,0.9375,0.942,0.942,1.0,0.9777,0.9464,0.942,0.817,0.9509,0.9821
0,mnist,7603,100,9.2069,0.7869,0.8354,0.7367,0.5504,0.8047,0.846,0.7344,0.8248,0.8379,0.8389
0,musk,3062,166,3.1679,0.1985,1.0,0.6993,0.9999,0.9998,0.786,0.6832,0.9996,1.0,1.0
0,optdigits,5216,64,2.8758,0.4822,0.7533,0.4593,0.8795,0.7149,0.3582,0.4528,0.3567,0.488,0.5065


In [24]:
time_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,Iforest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,3.201,3.012,0.63,2.12,0.561,0.1,0.07,2.072,0.04,0.08
0,arrhythmia,452,274,14.6018,0.22,0.183,0.67,0.1,0.51,0.09,0.07,1.101,0.04,0.08
0,cardio,1831,21,9.6122,0.68,0.22,1.01,0.01,0.762,0.23,0.12,0.882,0.1,0.11
0,glass,214,9,4.2056,0.08,0.08,0.05,0.01,0.38,0.02,0.01,0.07,0.0,0.04
0,ionosphere,351,33,35.8974,0.14,0.06,0.08,0.01,0.44,0.02,0.01,0.09,0.01,0.06
0,letter,1600,32,6.25,0.61,0.25,0.822,0.02,0.53,0.17,0.13,1.72,0.09,0.01
0,lympho,148,18,4.0541,0.09,0.1,0.05,0.01,0.37,0.01,0.0,0.05,0.0,0.0
0,mnist,7603,100,9.2069,8.935,1.78,54.6741,0.07,2.402,7.746,6.726,3.973,4.89,0.17
0,musk,3062,166,3.1679,2.55,0.47,11.471,0.08,1.551,1.88,1.661,17.039,1.11,0.16
0,optdigits,5216,64,2.8758,3.261,0.67,13.131,0.04,1.16,1.9,2.051,2.026,1.711,0.09


In [25]:
prn_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,Iforest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,arrhythmia,452,274,14.6018,0.4,0.48,0.44,0.56,0.52,0.48,0.44,0.44,0.48,0.48
0,cardio,1831,21,9.6122,0.2778,0.2917,0.1944,0.3889,0.5278,0.3056,0.1944,0.4306,0.4861,0.5833
0,glass,214,9,4.2056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,ionosphere,351,33,35.8974,0.8448,0.7931,0.7931,0.4483,0.6724,0.8793,0.7931,0.8793,0.7414,0.5517
0,letter,1600,32,6.25,0.4444,0.2444,0.4444,0.0667,0.1111,0.3778,0.4222,0.2,0.1778,0.1111
0,lympho,148,18,4.0541,0.25,0.5,0.5,1.0,0.5,0.5,0.5,0.5,0.5,0.75
0,mnist,7603,100,9.2069,0.3653,0.3764,0.369,0.0996,0.2694,0.4059,0.3506,0.1661,0.3616,0.3542
0,musk,3062,166,3.1679,0.1081,1.0,0.2162,0.9459,0.9459,0.4324,0.4595,0.973,1.0,1.0
0,optdigits,5216,64,2.8758,0.0455,0.0,0.0152,0.2424,0.0152,0.0,0.0152,0.0,0.0,0.0
