# Anomaly detection - Project 1 

In [1]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat


In [2]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

In [3]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [4]:
#Define data file and read X and y
mat_file_list = ['arrhythmia.mat',
                 'cardio.mat',
                 'glass.mat',
                 'ionosphere.mat',
                 'letter.mat',
                 'lympho.mat',
                 'mnist.mat',
                 'musk.mat',
                 'optdigits.mat',
                 'pendigits.mat',
                 'pima.mat',
                 'satellite.mat',
                 'satimage-2.mat',
                 'shuttle.mat',
                 'vertebral.mat',
                 'vowels.mat',
                 'wbc.mat']

In [5]:
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

In [6]:
#loading MAt file
from scipy.io import loadmat

In [18]:
data=loadmat('C:/Users/lenovo/Desktop/AI ML/data/cardio.mat')

In [19]:
#data=loadmat('C:/Users/lenovo/Desktop/AI ML/data/arrhythmia.mat')

In [20]:

data


{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [22]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [23]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

In [24]:
#Input(Independent) Feature Shape in Mat file format
type(data['X']),data['X'].shape

(numpy.ndarray, (1831, 21))

In [25]:
#Dependent/ Target /Output Feature shape
type(data['y']),data['y'].shape


(numpy.ndarray, (1831, 1))

In [26]:
df_columns = ['Data', '#Samples', '# Dimensions', 'Outlier Perc',
              'ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest', 'KNN', 'LOF', 'MCD',
              'OCSVM', 'PCA']

In [27]:
#ROC Performance evulotion table
roc_df = pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


In [28]:
prn_df = pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


In [29]:
time_df = pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


In [30]:
from time import time
random_state = np.random.RandomState(42)

for mat_file in mat_file_list:
    print("\n... Processing", mat_file, '...')
    mat = loadmat(os.path.join('data', mat_file))

    X = mat['X']
    y = mat['y'].ravel()
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                        random_state=random_state)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(
        contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor': CBLOF(
            contamination=outliers_fraction, check_estimator=False,
            random_state=random_state),
        'Feature Bagging': FeatureBagging(contamination=outliers_fraction,
                                          random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)': HBOS(
            contamination=outliers_fraction),
        'Isolation Forest': IForest(contamination=outliers_fraction,
                                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(
            contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)': MCD(
            contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)': PCA(
            contamination=outliers_fraction, random_state=random_state),
    }

    for clf_name, clf in classifiers.items():
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(
            clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        roc_list.append(roc)
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)

    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 2.2437s
Cluster-based Local Outlier Factor ROC:0.7789, precision @ rank n:0.4643, execution time: 2.6091s
Feature Bagging ROC:0.7796, precision @ rank n:0.4643, execution time: 0.7695s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 2.8014s




Isolation Forest ROC:0.8637, precision @ rank n:0.6071, execution time: 0.6526s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.1239s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.1029s




Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 1.6084s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.057s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.0939s

... Processing cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5892, precision @ rank n:0.1918, execution time: 0.8755s
Cluster-based Local Outlier Factor ROC:0.8845, precision @ rank n:0.4932, execution time: 0.2498s
Feature Bagging ROC:0.6385, precision @ rank n:0.1781, execution time: 1.3212s
Histogram-base Outlier Detection (HBOS) ROC:0.8373, precision @ rank n:0.4521, execution time: 0.013s




Isolation Forest ROC:0.951, precision @ rank n:0.6027, execution time: 0.5926s
K Nearest Neighbors (KNN) ROC:0.734, precision @ rank n:0.3562, execution time: 0.2768s
Local Outlier Factor (LOF) ROC:0.588, precision @ rank n:0.1507, execution time: 0.1506s




Minimum Covariance Determinant (MCD) ROC:0.8524, precision @ rank n:0.411, execution time: 1.0144s
One-class SVM (OCSVM) ROC:0.9478, precision @ rank n:0.5342, execution time: 0.1319s
Principal Component Analysis (PCA) ROC:0.9616, precision @ rank n:0.6849, execution time: 0.01s

... Processing glass.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.6951, precision @ rank n:0.25, execution time: 0.0939s
Cluster-based Local Outlier Factor ROC:0.811, precision @ rank n:0.25, execution time: 0.066s
Feature Bagging ROC:0.7073, precision @ rank n:0.25, execution time: 0.055s
Histogram-base Outlier Detection (HBOS) ROC:0.7073, precision @ rank n:0.0, execution time: 0.005s




Isolation Forest ROC:0.7134, precision @ rank n:0.25, execution time: 0.3915s
K Nearest Neighbors (KNN) ROC:0.8384, precision @ rank n:0.25, execution time: 0.017s
Local Outlier Factor (LOF) ROC:0.7043, precision @ rank n:0.25, execution time: 0.005s
Minimum Covariance Determinant (MCD) ROC:0.8293, precision @ rank n:0.0, execution time: 0.0785s
One-class SVM (OCSVM) ROC:0.6585, precision @ rank n:0.25, execution time: 0.003s
Principal Component Analysis (PCA) ROC:0.686, precision @ rank n:0.25, execution time: 0.0036s

... Processing ionosphere.mat ...




Angle-based Outlier Detector (ABOD) ROC:0.9181, precision @ rank n:0.8431, execution time: 0.1549s
Cluster-based Local Outlier Factor ROC:0.9176, precision @ rank n:0.8039, execution time: 0.085s
Feature Bagging ROC:0.9303, precision @ rank n:0.8039, execution time: 0.1019s
Histogram-base Outlier Detection (HBOS) ROC:0.6052, precision @ rank n:0.3922, execution time: 0.019s




Isolation Forest ROC:0.8516, precision @ rank n:0.6078, execution time: 0.4267s
K Nearest Neighbors (KNN) ROC:0.932, precision @ rank n:0.8824, execution time: 0.032s
Local Outlier Factor (LOF) ROC:0.9227, precision @ rank n:0.7843, execution time: 0.01s
Minimum Covariance Determinant (MCD) ROC:0.9669, precision @ rank n:0.8627, execution time: 0.1199s
One-class SVM (OCSVM) ROC:0.8257, precision @ rank n:0.6863, execution time: 0.008s
Principal Component Analysis (PCA) ROC:0.7941, precision @ rank n:0.5686, execution time: 0.039s

... Processing letter.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.8783, precision @ rank n:0.4375, execution time: 0.7605s
Cluster-based Local Outlier Factor ROC:0.7783, precision @ rank n:0.1875, execution time: 0.2471s
Feature Bagging ROC:0.8947, precision @ rank n:0.4062, execution time: 1.1023s
Histogram-base Outlier Detection (HBOS) ROC:0.6063, precision @ rank n:0.0938, execution time: 0.02s




Isolation Forest ROC:0.6279, precision @ rank n:0.0625, execution time: 0.8075s
K Nearest Neighbors (KNN) ROC:0.8573, precision @ rank n:0.3125, execution time: 0.2428s
Local Outlier Factor (LOF) ROC:0.8765, precision @ rank n:0.3438, execution time: 0.1353s
Minimum Covariance Determinant (MCD) ROC:0.8061, precision @ rank n:0.1875, execution time: 2.5274s
One-class SVM (OCSVM) ROC:0.5927, precision @ rank n:0.125, execution time: 0.1729s
Principal Component Analysis (PCA) ROC:0.5216, precision @ rank n:0.125, execution time: 0.01s

... Processing lympho.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9831, precision @ rank n:0.0, execution time: 0.075s
Cluster-based Local Outlier Factor ROC:1.0, precision @ rank n:1.0, execution time: 0.066s
Feature Bagging ROC:1.0, precision @ rank n:1.0, execution time: 0.0919s
Histogram-base Outlier Detection (HBOS) ROC:1.0, precision @ rank n:1.0, execution time: 0.017s




Isolation Forest ROC:1.0, precision @ rank n:1.0, execution time: 0.5067s
K Nearest Neighbors (KNN) ROC:1.0, precision @ rank n:1.0, execution time: 0.013s
Local Outlier Factor (LOF) ROC:1.0, precision @ rank n:1.0, execution time: 0.004s
Minimum Covariance Determinant (MCD) ROC:1.0, precision @ rank n:1.0, execution time: 0.0939s
One-class SVM (OCSVM) ROC:1.0, precision @ rank n:1.0, execution time: 0.01s
Principal Component Analysis (PCA) ROC:1.0, precision @ rank n:1.0, execution time: 0.003s

... Processing mnist.mat ...




Angle-based Outlier Detector (ABOD) ROC:0.7628, precision @ rank n:0.3367, execution time: 13.4477s
Cluster-based Local Outlier Factor ROC:0.8389, precision @ rank n:0.3912, execution time: 1.8319s
Feature Bagging ROC:0.7157, precision @ rank n:0.3741, execution time: 75.4396s
Histogram-base Outlier Detection (HBOS) ROC:0.5766, precision @ rank n:0.1361, execution time: 0.0929s




Isolation Forest ROC:0.7915, precision @ rank n:0.2687, execution time: 3.0731s
K Nearest Neighbors (KNN) ROC:0.8498, precision @ rank n:0.432, execution time: 10.7664s


KeyboardInterrupt: 

In [31]:
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7789,0.7796,0.8511,0.8637,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5892,0.8845,0.6385,0.8373,0.951,0.734,0.588,0.8524,0.9478,0.9616
0,glass,214,9,4.2056,0.6951,0.811,0.7073,0.7073,0.7134,0.8384,0.7043,0.8293,0.6585,0.686
0,ionosphere,351,33,35.8974,0.9181,0.9176,0.9303,0.6052,0.8516,0.932,0.9227,0.9669,0.8257,0.7941
0,letter,1600,32,6.25,0.8783,0.7783,0.8947,0.6063,0.6279,0.8573,0.8765,0.8061,0.5927,0.5216
0,lympho,148,18,4.0541,0.9831,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [32]:
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.4643,0.5714,0.6071,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1918,0.4932,0.1781,0.4521,0.6027,0.3562,0.1507,0.411,0.5342,0.6849
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8431,0.8039,0.8039,0.3922,0.6078,0.8824,0.7843,0.8627,0.6863,0.5686
0,letter,1600,32,6.25,0.4375,0.1875,0.4062,0.0938,0.0625,0.3125,0.3438,0.1875,0.125,0.125
0,lympho,148,18,4.0541,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [33]:
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,2.2437,2.6091,0.7695,2.8014,0.6526,0.1239,0.1029,1.6084,0.057,0.0939
0,cardio,1831,21,9.6122,0.8755,0.2498,1.3212,0.013,0.5926,0.2768,0.1506,1.0144,0.1319,0.01
0,glass,214,9,4.2056,0.0939,0.066,0.055,0.005,0.3915,0.017,0.005,0.0785,0.003,0.0036
0,ionosphere,351,33,35.8974,0.1549,0.085,0.1019,0.019,0.4267,0.032,0.01,0.1199,0.008,0.039
0,letter,1600,32,6.25,0.7605,0.2471,1.1023,0.02,0.8075,0.2428,0.1353,2.5274,0.1729,0.01
0,lympho,148,18,4.0541,0.075,0.066,0.0919,0.017,0.5067,0.013,0.004,0.0939,0.01,0.003


In [34]:
# Define the number of inliers and outliers
n_samples = len(y)
clusters_separation = [0]

n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.zeros(n_samples, dtype=int)
ground_truth[-n_outliers:] = 1

In [35]:
# Show the statics of the data
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print('Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'.format(shape=ground_truth.shape))
print(ground_truth)

Number of inliers: 6903
Number of outliers: 700
Ground truth shape is (7603,). Outlier are 1 and inliers are 0.

[0 0 0 ... 1 1 1]


In [36]:
# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)

Model 1 Angle-based Outlier Detector (ABOD)
Model 2 Cluster-based Local Outlier Factor
Model 3 Feature Bagging
Model 4 Histogram-base Outlier Detection (HBOS)
Model 5 Isolation Forest
Model 6 K Nearest Neighbors (KNN)
Model 7 Local Outlier Factor (LOF)
Model 8 Minimum Covariance Determinant (MCD)
Model 9 One-class SVM (OCSVM)
Model 10 Principal Component Analysis (PCA)
