# import python packages

In [1]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# import pyod packages and methods

In [2]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



 # import metrics packages

In [3]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

# Define data file and read X and y

In [4]:
mat_file_list=['arrhythmia.mat','cardio.mat','glass.mat','ionosphere.mat','letter.mat','mnist.mat','musk.mat','optdigits.mat','pendigits.mat','pima.mat','satellite.mat','satimage-2.mat','shuttle.mat','vertebral.mat','vowels.mat','wbc.mat']

In [5]:
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

# how to load mat file

In [7]:
data=loadmat("E:\LetsUpgrade AI-ML\PROJECT DAY-1\cardio.mat")

In [8]:
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [9]:
len(data)

5

In [10]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [11]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

# input feature shape

In [12]:
type(data['X']),data['X'].shape

(numpy.ndarray, (1831, 21))

# output feature shape

In [13]:
type(data['y']),data['y'].shape

(numpy.ndarray, (1831, 1))

In [14]:
df_columns =['Data','#Samples','# Dimensions','Outlier Perc','ABOD','CBLOF','FB','HBOS','IForest','KNN','LOF','MCD','OCSVM','PCA']

# ROC performance evolution table

In [15]:
roc_df=pd.DataFrame(columns=df_columns)

In [16]:
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# precision_n_scores -Performance evolution table

In [17]:
prn_df=pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# time dataframe

In [18]:
time_df=pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# Exploring all mat files

In [23]:
import time 
for mat_file in mat_file_list:
    print("\n... Processing",mat_file,'...')
    mat = loadmat(os.path.join('E:\LetsUpgrade AI-ML\PROJECT DAY-1',mat_file))
    
    X= mat['X']
    y= mat['y'].ravel()
    outliers_fraction = np.count_nonzero(y)/len(y)
    outliers_percentage=round(outliers_fraction*100,ndigits=4)
    
    #Construct containers for saving result
    roc_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    prn_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    time_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]

    #60% data for trainin g and 40% for testing
    random_state=np.random.RandomState(42)
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=random_state)
    
    #Standardizing data for processing
    X_train_norm,X_test_norm=standardizer(X_train,X_test)
    classifiers={'Angle-based Outlier Detector (ABOD)' : ABOD(contamination=outliers_fraction),
                 'Cluster-based Local Outlier Factor' : CBLOF(contamination=outliers_fraction,check_estimator=False,random_state=random_state),
                 'Feature Bagging' : FeatureBagging(contamination=outliers_fraction,random_state=random_state),
                 'Histogram-base Outlier Detection (HBOS)' : HBOS(contamination=outliers_fraction),
                 'Isolation Forest' : IForest(contamination=outliers_fraction, random_state=random_state),
                 'K Nearest Neignors (KNN)' : KNN(contamination=outliers_fraction),
                 'Local Outlier Factor (LOF)' : LOF(contamination=outliers_fraction),
                 'Minimum Covariance Determinat (MCD)' : MCD(contamination=outliers_fraction, random_state=random_state),
                 'One-class SVM (OCSVM)' : OCSVM(contamination=outliers_fraction),
                 'Principal Component Ananlysis (PCA)' : PCA(contamination=outliers_fraction, random_state=random_state),
                }
    for clf_name, clf in classifiers.items():
        t0=time.time() #time before training
        clf.fit(X_train_norm)   #for each algorithm, the model is trained
        test_scores=clf.decision_function(X_test_norm) #testing model score
        t1=time.time() #time for testing or time when testing completed
        duration=round(t1-t0,ndigits=4) #time taken to complete training
        time_list.append(duration)
        roc=round(roc_auc_score(y_test,test_scores),ndigits=4)
        prn=round(precision_n_scores(y_test,test_scores),ndigits=4)
        print('{clf_name} ROC:{roc},precision @ rank n:{prn},' 'execution time : {duration}s'.format(clf_name=clf_name,roc=roc,prn=prn,duration=duration))
        roc_list.append(roc)
        prn_list.append(prn)
        
    temp_df=pd.DataFrame(time_list).transpose()
    temp_df.columns=df_columns
    time_df=pd.concat([time_df,temp_df],axis=0)

    temp_df=pd.DataFrame(roc_list).transpose()
    temp_df.columns=df_columns
    roc_df=pd.concat([roc_df,temp_df],axis=0)

    temp_df=pd.DataFrame(prn_list).transpose()
    temp_df.columns=df_columns
    prn_df=pd.concat([prn_df,temp_df],axis=0)     


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687,precision @ rank n:0.3571,execution time : 0.2414s
Cluster-based Local Outlier Factor ROC:0.7684,precision @ rank n:0.4643,execution time : 3.5145s
Feature Bagging ROC:0.7799,precision @ rank n:0.5,execution time : 0.8497s
Histogram-base Outlier Detection (HBOS) ROC:0.8511,precision @ rank n:0.5714,execution time : 2.2073s
Isolation Forest ROC:0.8527,precision @ rank n:0.5714,execution time : 0.7104s
K Nearest Neignors (KNN) ROC:0.782,precision @ rank n:0.5,execution time : 0.1137s
Local Outlier Factor (LOF) ROC:0.7787,precision @ rank n:0.4643,execution time : 0.12s




Minimum Covariance Determinat (MCD) ROC:0.8228,precision @ rank n:0.4286,execution time : 1.5318s
One-class SVM (OCSVM) ROC:0.7986,precision @ rank n:0.5,execution time : 0.0927s
Principal Component Ananlysis (PCA) ROC:0.7997,precision @ rank n:0.5,execution time : 0.13s

... Processing cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5928,precision @ rank n:0.2838,execution time : 0.5765s
Cluster-based Local Outlier Factor ROC:0.8547,precision @ rank n:0.5541,execution time : 0.1958s
Feature Bagging ROC:0.6565,precision @ rank n:0.2297,execution time : 1.4384s
Histogram-base Outlier Detection (HBOS) ROC:0.87,precision @ rank n:0.5135,execution time : 0.01s
Isolation Forest ROC:0.9304,precision @ rank n:0.5405,execution time : 0.6562s
K Nearest Neignors (KNN) ROC:0.7642,precision @ rank n:0.4054,execution time : 0.3103s
Local Outlier Factor (LOF) ROC:0.6432,precision @ rank n:0.2162,execution time : 0.1794s




Minimum Covariance Determinat (MCD) ROC:0.811,precision @ rank n:0.4865,execution time : 0.9696s
One-class SVM (OCSVM) ROC:0.9462,precision @ rank n:0.527,execution time : 0.211s
Principal Component Ananlysis (PCA) ROC:0.9581,precision @ rank n:0.6216,execution time : 0.0315s

... Processing glass.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7805,precision @ rank n:0.25,execution time : 0.1162s
Cluster-based Local Outlier Factor ROC:0.9024,precision @ rank n:0.25,execution time : 0.1147s
Feature Bagging ROC:0.8293,precision @ rank n:0.25,execution time : 0.0948s
Histogram-base Outlier Detection (HBOS) ROC:0.7256,precision @ rank n:0.25,execution time : 0.005s
Isolation Forest ROC:0.7287,precision @ rank n:0.25,execution time : 0.5523s
K Nearest Neignors (KNN) ROC:0.8902,precision @ rank n:0.25,execution time : 0.022s
Local Outlier Factor (LOF) ROC:0.7287,precision @ rank n:0.25,execution time : 0.0129s
Minimum Covariance Determinat (MCD) ROC:0.7957,precision @ rank n:0.0,execution




... Processing ionosphere.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9247,precision @ rank n:0.8868,execution time : 0.2015s
Cluster-based Local Outlier Factor ROC:0.8999,precision @ rank n:0.7925,execution time : 0.1436s
Feature Bagging ROC:0.9286,precision @ rank n:0.7736,execution time : 0.1312s
Histogram-base Outlier Detection (HBOS) ROC:0.5154,precision @ rank n:0.3585,execution time : 0.018s
Isolation Forest ROC:0.8433,precision @ rank n:0.6604,execution time : 0.4182s
K Nearest Neignors (KNN) ROC:0.9226,precision @ rank n:0.8868,execution time : 0.0243s
Local Outlier Factor (LOF) ROC:0.9312,precision @ rank n:0.7736,execution time : 0.009s
Minimum Covariance Determinat (MCD) ROC:0.9657,precision @ rank n:0.8868,execution time : 0.0848s
One-class SVM (OCSVM) ROC:0.8799,precision @ rank n:0.7736,execution time : 0.007s
Principal Component Ananlysis (PCA) ROC:0.8068,precision @ rank n:0.6226,execution time : 0.006s

... Processing letter.mat ...
Angle-based Outlier Detector



Minimum Covariance Determinat (MCD) ROC:0.8553,precision @ rank n:0.2268,execution time : 4.1109s
One-class SVM (OCSVM) ROC:0.8622,precision @ rank n:0.4015,execution time : 6.9991s
Principal Component Ananlysis (PCA) ROC:0.8613,precision @ rank n:0.3903,execution time : 0.3185s

... Processing musk.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.1012,precision @ rank n:0.0294,execution time : 3.7814s
Cluster-based Local Outlier Factor ROC:1.0,precision @ rank n:1.0,execution time : 0.6233s
Feature Bagging ROC:0.5705,precision @ rank n:0.1471,execution time : 19.4184s
Histogram-base Outlier Detection (HBOS) ROC:1.0,precision @ rank n:1.0,execution time : 0.0848s
Isolation Forest ROC:1.0,precision @ rank n:0.9706,execution time : 1.8521s
K Nearest Neignors (KNN) ROC:0.7729,precision @ rank n:0.2059,execution time : 2.8354s
Local Outlier Factor (LOF) ROC:0.5567,precision @ rank n:0.1471,execution time : 2.4991s
Minimum Covariance Determinat (MCD) ROC:1.0,precision @ rank n:1.0,executio



Minimum Covariance Determinat (MCD) ROC:0.4226,precision @ rank n:0.0,execution time : 1.631s
One-class SVM (OCSVM) ROC:0.4817,precision @ rank n:0.0,execution time : 1.9398s
Principal Component Ananlysis (PCA) ROC:0.507,precision @ rank n:0.0,execution time : 0.0646s

... Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.677,precision @ rank n:0.0725,execution time : 2.1809s
Cluster-based Local Outlier Factor ROC:0.8488,precision @ rank n:0.2029,execution time : 0.3415s
Feature Bagging ROC:0.4674,precision @ rank n:0.0725,execution time : 4.4055s
Histogram-base Outlier Detection (HBOS) ROC:0.9261,precision @ rank n:0.2609,execution time : 0.0289s
Isolation Forest ROC:0.9555,precision @ rank n:0.3768,execution time : 1.0289s
K Nearest Neignors (KNN) ROC:0.7603,precision @ rank n:0.1594,execution time : 0.8644s
Local Outlier Factor (LOF) ROC:0.4578,precision @ rank n:0.0725,execution time : 0.7898s
Minimum Covariance Determinat (MCD) ROC:0.8387,precision @ rank n:0.



Minimum Covariance Determinat (MCD) ROC:0.9901,precision @ rank n:0.7332,execution time : 12.2186s
One-class SVM (OCSVM) ROC:0.9922,precision @ rank n:0.9578,execution time : 63.2183s
Principal Component Ananlysis (PCA) ROC:0.9904,precision @ rank n:0.9542,execution time : 0.0312s

... Processing vertebral.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.3968,precision @ rank n:0.0909,execution time : 0.0781s
Cluster-based Local Outlier Factor ROC:0.3882,precision @ rank n:0.0,execution time : 0.0698s
Feature Bagging ROC:0.4289,precision @ rank n:0.0909,execution time : 0.0469s
Histogram-base Outlier Detection (HBOS) ROC:0.3166,precision @ rank n:0.0,execution time : 0.0s
Isolation Forest ROC:0.3444,precision @ rank n:0.0,execution time : 0.3695s
K Nearest Neignors (KNN) ROC:0.384,precision @ rank n:0.0,execution time : 0.0156s
Local Outlier Factor (LOF) ROC:0.4342,precision @ rank n:0.0909,execution time : 0.0s
Minimum Covariance Determinat (MCD) ROC:0.4214,precision @ rank n:0.0,exe



Angle-based Outlier Detector (ABOD) ROC:0.9515,precision @ rank n:0.5,execution time : 0.3909s
Cluster-based Local Outlier Factor ROC:0.9419,precision @ rank n:0.2727,execution time : 0.0937s
Feature Bagging ROC:0.943,precision @ rank n:0.2273,execution time : 0.3636s
Histogram-base Outlier Detection (HBOS) ROC:0.6365,precision @ rank n:0.0,execution time : 0.006s
Isolation Forest ROC:0.7601,precision @ rank n:0.1364,execution time : 0.4748s
K Nearest Neignors (KNN) ROC:0.9763,precision @ rank n:0.4545,execution time : 0.0948s
Local Outlier Factor (LOF) ROC:0.9119,precision @ rank n:0.2727,execution time : 0.0439s
Minimum Covariance Determinat (MCD) ROC:0.6877,precision @ rank n:0.0455,execution time : 0.9298s
One-class SVM (OCSVM) ROC:0.7763,precision @ rank n:0.1364,execution time : 0.0781s
Principal Component Ananlysis (PCA) ROC:0.5675,precision @ rank n:0.0909,execution time : 0.0s

... Processing wbc.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9181,precision @ rank n:0.5455,

In [24]:
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5928,0.8547,0.6565,0.87,0.9304,0.7642,0.6432,0.811,0.9462,0.9581
0,glass,214,9,4.2056,0.7805,0.9024,0.8293,0.7256,0.7287,0.8902,0.7287,0.7957,0.753,0.747
0,ionosphere,351,33,35.8974,0.9247,0.8999,0.9286,0.5154,0.8433,0.9226,0.9312,0.9657,0.8799,0.8068
0,letter,1600,32,6.25,0.8949,0.7612,0.8663,0.6248,0.6403,0.8689,0.8781,0.8119,0.5827,0.5142
0,mnist,7603,100,9.2069,0.7682,0.8339,0.6936,0.5578,0.8045,0.8423,0.7012,0.8553,0.8622,0.8613
0,musk,3062,166,3.1679,0.1012,1.0,0.5705,1.0,1.0,0.7729,0.5567,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4601,0.7515,0.4197,0.8962,0.6531,0.3717,0.4191,0.4226,0.4817,0.507
0,pendigits,6870,16,2.2707,0.677,0.8488,0.4674,0.9261,0.9555,0.7603,0.4578,0.8387,0.946,0.9402
0,pima,768,8,34.8958,0.665,0.6724,0.6551,0.7029,0.6843,0.7123,0.6621,0.6942,0.6423,0.6444


In [25]:
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.2414,3.5145,0.8497,2.2073,0.7104,0.1137,0.12,1.5318,0.0927,0.13
0,cardio,1831,21,9.6122,0.5765,0.1958,1.4384,0.01,0.6562,0.3103,0.1794,0.9696,0.211,0.0315
0,glass,214,9,4.2056,0.1162,0.1147,0.0948,0.005,0.5523,0.022,0.0129,0.1155,0.007,0.004
0,ionosphere,351,33,35.8974,0.2015,0.1436,0.1312,0.018,0.4182,0.0243,0.009,0.0848,0.007,0.006
0,letter,1600,32,6.25,0.6061,0.1577,1.0508,0.0289,0.7801,0.3232,0.1558,2.1324,0.1835,0.017
0,mnist,7603,100,9.2069,14.1372,2.3444,69.9805,0.0625,2.874,10.0242,9.1175,4.1109,6.9991,0.3185
0,musk,3062,166,3.1679,3.7814,0.6233,19.4184,0.0848,1.8521,2.8354,2.4991,15.9414,1.941,0.1995
0,optdigits,5216,64,2.8758,3.6194,0.6365,17.8399,0.0469,1.422,2.5213,2.2622,1.631,1.9398,0.0646
0,pendigits,6870,16,2.2707,2.1809,0.3415,4.4055,0.0289,1.0289,0.8644,0.7898,2.3279,1.2462,0.014
0,pima,768,8,34.8958,0.2065,0.1107,0.1388,0.004,0.4568,0.0469,0.015,0.0758,0.015,0.002


In [26]:
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.2838,0.5541,0.2297,0.5135,0.5405,0.4054,0.2162,0.4865,0.527,0.6216
0,glass,214,9,4.2056,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8868,0.7925,0.7736,0.3585,0.6604,0.8868,0.7736,0.8868,0.7736,0.6226
0,letter,1600,32,6.25,0.3721,0.186,0.3721,0.093,0.093,0.3488,0.3721,0.186,0.1395,0.1163
0,mnist,7603,100,9.2069,0.3643,0.3941,0.3234,0.0967,0.2714,0.4349,0.3309,0.2268,0.4015,0.3903
0,musk,3062,166,3.1679,0.0294,1.0,0.1471,1.0,0.9706,0.2059,0.1471,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.0,0.0,0.0164,0.1803,0.0164,0.0,0.0164,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0725,0.2029,0.0725,0.2609,0.3768,0.1594,0.0725,0.1159,0.3333,0.3478
0,pima,768,8,34.8958,0.4314,0.4608,0.4608,0.5392,0.4608,0.5,0.4706,0.4706,0.4412,0.451
