@author Nassir Mohammad

# Preliminaries

In [1]:
import os
import sys 
sys.path.append('../')
sys.path.append('../scripts')

import warnings
from perception_nassir import Perception

import dataframe_image as dfi

import numpy as np
import pandas as pd

from scripts.utilities import apply_classifiers
from scripts.utilities import get_file_data

from sklearn.preprocessing import StandardScaler
from rendering_functions import highlight_max, highlight_min

image_save_path = ''
image_save_switch = False

# Paper 1 datasets 

In [2]:
# make table for dataset, # dimensions, # samples, # percentage of anomalies

base_path = "../data/ODDS_multivariate/"
data_properties_df = None

# loop over datasets in directory
for file_name in os.listdir(base_path):

    dataset_name, X_original, y = get_file_data(base_path, file_name)

    if dataset_name is None:
        continue

    # write dataset summary to dataframe
    data_properties_temp = pd.DataFrame({
        'Name': [dataset_name],
        '\# examples': [X_original.shape[0]],
        '\# features': [X_original.shape[1]],
        # '# anomalies': [y.sum()],
        '\% anomalies': [float(np.round(y.sum()/X_original.shape[0]*100, 2))],
    })

    data_properties_df = pd.concat(
        [data_properties_df, data_properties_temp]).reset_index(drop=True)
            

In [3]:
img_title = "Dataset properties"
path_save = image_save_path + "dataset_properties.png"

# order the dataset rows by name
data_properties_df = data_properties_df.sort_values(by=['Name']).reset_index(drop=True)

data_properties_df_styled = data_properties_df.style.format({'\% anomalies': "{:.2f}"}).hide()

#data_properties_df_styled = data_properties_df.style.hide_index()

#dfi.export(data_properties_df,path_save)

data_properties_df_styled

Name,\# examples,\# features,\% anomalies
cardio,1831,21,9.61
credit-card,284807,29,0.17
http,567498,3,0.39
musk,3062,166,3.17
satimage-2,5803,36,1.22
shuttle,49097,9,7.15
smtp,95156,3,0.03
thyroid,3772,6,2.47
wbc,378,30,5.56


In [4]:
# file names
# ########################
# file_name = "wbc.mat" 
# file_name = "cardio.mat"
# file_name = "thyroid.mat"
# file_name = "musk.mat"
# file_name = "shuttle.mat"
# file_name = "satimage-2.mat"
# file_name = "http.matv7"
# file_name = "smtp.matv7"
# file_name = "credit-card.csv"

classifiers = [
    'HBOS',  # to be ignored, first run in loop slower
    'HBOS',
    'IForest',
    'KNN',
    'LOF',
    'MCD',
    'OCSVM',
    'Perception',
]

metrics_df = None

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # loop over datasets in directory
    for file_name in os.listdir(base_path):

        dataset_name, X_original, y = get_file_data(base_path, file_name)

        if dataset_name is None:
            continue

        # scaling (very important to get right)
        # scale to zero mean and unit standard deviation along each feature
        sc = StandardScaler(with_mean=False)
        sc.fit(X_original)
        X = sc.transform(X_original)

        # Apply each classifier to dataset
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            print('current file in progress ...: {}'.format(dataset_name))

            metrics_temp = apply_classifiers(classifiers, dataset_name,
                                             predict_data=X,
                                             predict_labels=y,
                                             train_data=X)

        metrics_df = pd.concat([metrics_df, metrics_temp])

    metrics_df.reset_index(drop=True)

current file in progress ...: credit-card
current classifier in progress: HBOS
total run time: 2.418627542210743
current classifier in progress: HBOS
total run time: 0.765158957336098
current classifier in progress: IForest
total run time: 3.294221875956282
current classifier in progress: KNN
current classifier in progress: LOF
current classifier in progress: MCD
total run time: 43.30430595809594
current classifier in progress: OCSVM
current classifier in progress: Perception
total run time: 0.29077124991454184
current file in progress ...: cardio
current classifier in progress: HBOS
total run time: 0.005209125112742186
current classifier in progress: HBOS
total run time: 0.004532125312834978
current classifier in progress: IForest
total run time: 0.08156112511642277
current classifier in progress: KNN
total run time: 0.2786257080733776
current classifier in progress: LOF
total run time: 0.036602583015337586
current classifier in progress: MCD
total run time: 0.4049022498074919
current

In [5]:
# create dataframe for precision
df_precision = metrics_df[['Dataset', 'Classifier', 'Precision']]
df_precision = pd.pivot_table(df_precision, values = 'Precision', index=['Classifier'], columns='Dataset').reset_index()
df_precision.columns.name = None

cols = [col for col in df_precision.columns]
formatdict = {}
for col in cols: formatdict[col] = "{:.3f}"
formatdict.pop('Classifier', None)

sub = df_precision.columns.values.tolist()
sub.remove('Classifier')
sub

df_precision = df_precision.style.hide().apply(highlight_max, subset=sub).format(formatdict)

# img_title = "Precision results"
# path_save = image_save_path + "dataset_precision.png"

# dfi.export(df_precision,path_save)

df_precision

Classifier,cardio,credit-card,http,musk,satimage-2,shuttle,smtp,thyroid,wbc
HBOS,0.443,0.015,0.051,0.316,0.114,0.689,0.003,0.202,0.421
IForest,0.443,0.015,0.039,0.316,0.12,0.704,0.002,0.233,0.395
KNN,0.35,,0.002,0.139,0.093,0.208,0.003,0.238,0.4
LOF,0.19,,0.001,0.137,0.04,0.115,0.003,0.06,0.4
MCD,0.478,0.015,0.039,0.316,0.122,0.689,0.002,0.235,0.368
OCSVM,0.503,,,0.316,0.12,0.697,0.002,0.214,0.395
Perception,0.591,0.031,0.149,0.89,0.333,0.907,0.006,0.247,0.483


In [6]:
# create dataframe for recall
df_recall = metrics_df[['Dataset', 'Classifier', 'Recall']]
df_recall = pd.pivot_table(df_recall, values = 'Recall', index=['Classifier'], columns='Dataset').reset_index()
df_recall.columns.name = None

cols = [col for col in df_recall.columns]
formatdict = {}
for col in cols: formatdict[col] = "{:.2f}"
formatdict.pop('Classifier', None)

sub = df_recall.columns.values.tolist()
sub.remove('Classifier')
sub

df_recall = df_recall.style.hide().apply(highlight_max, subset=sub).format(formatdict)

img_title = "Recall results"
path_save = image_save_path + "dataset_recall.png"

# dfi.export(df_recall,path_save)

df_recall

Classifier,cardio,credit-card,http,musk,satimage-2,shuttle,smtp,thyroid,wbc
HBOS,0.46,0.89,1.0,1.0,0.93,0.96,0.7,0.82,0.76
IForest,0.46,0.88,1.0,1.0,0.99,0.98,0.77,0.95,0.71
KNN,0.28,,0.04,0.27,0.66,0.22,0.73,0.87,0.67
LOF,0.16,,0.02,0.38,0.28,0.14,0.7,0.22,0.67
MCD,0.5,0.85,1.0,1.0,1.0,0.96,0.77,0.96,0.67
OCSVM,0.52,,,1.0,0.99,0.97,0.77,0.87,0.71
Perception,0.37,0.88,1.0,1.0,0.9,0.96,0.7,0.68,0.67


In [7]:
# create dataframe for F1-score
df_f1= metrics_df[['Dataset', 'Classifier', 'F1']]
df_f1 = pd.pivot_table(df_f1, values = 'F1', index=['Classifier'], columns='Dataset').reset_index()
df_f1.columns.name = None

cols = [col for col in df_f1.columns]
formatdict = {}
for col in cols: formatdict[col] = "{:.3f}"
formatdict.pop('Classifier', None)

sub = df_f1.columns.values.tolist()
sub.remove('Classifier')
sub

df_f1 = df_f1.style.hide().apply(highlight_max, subset=sub).format(formatdict)

img_title = "F1-score results"
path_save = image_save_path + "dataset_f1-score.png"

# dfi.export(df_f1,path_save)

df_f1

Classifier,cardio,credit-card,http,musk,satimage-2,shuttle,smtp,thyroid,wbc
HBOS,0.451,0.03,0.097,0.48,0.202,0.801,0.005,0.323,0.542
IForest,0.451,0.03,0.075,0.48,0.215,0.821,0.005,0.374,0.508
KNN,0.31,,0.004,0.183,0.163,0.214,0.005,0.374,0.5
LOF,0.176,,0.002,0.201,0.07,0.126,0.005,0.094,0.5
MCD,0.489,0.029,0.075,0.48,0.218,0.803,0.005,0.378,0.475
OCSVM,0.513,,,0.48,0.215,0.812,0.005,0.344,0.508
Perception,0.455,0.061,0.26,0.942,0.487,0.931,0.012,0.362,0.56


In [8]:
# create dataframe for Area under ROC curve
df1= metrics_df[['Dataset', 'Classifier', 'AUC']]
df1 = pd.pivot_table(df1, values = 'AUC', index=['Classifier'], columns='Dataset').reset_index()
df1.columns.name = None

cols = [col for col in df1.columns]
formatdict = {}
for col in cols: formatdict[col] = "{:.2f}"
formatdict.pop('Classifier', None)

sub = df1.columns.values.tolist()
sub.remove('Classifier')
sub

df1 = df1.style.hide().apply(highlight_max, subset=sub).format(formatdict)

img_title = "F1-score results"
path_save = image_save_path + "dataset_auc.png"

# dfi.export(df1,path_save)

df1

Classifier,cardio,credit-card,http,musk,satimage-2,shuttle,smtp,thyroid,wbc
HBOS,0.85,0.95,0.99,1.0,0.98,0.98,0.8,0.95,0.96
IForest,0.91,0.95,1.0,1.0,0.99,1.0,0.89,0.98,0.95
KNN,0.69,,0.25,0.62,0.93,0.63,0.91,0.96,0.95
LOF,0.55,,0.4,0.64,0.54,0.52,0.83,0.66,0.94
MCD,0.84,0.93,1.0,1.0,1.0,0.99,0.95,0.99,0.92
OCSVM,0.94,,,1.0,1.0,0.99,0.85,0.96,0.94
Perception,0.77,0.93,1.0,1.0,0.93,0.98,0.8,0.86,0.76


In [9]:
# create dataframe for total training and prediction time
df1= metrics_df[['Dataset', 'Classifier', 'Runtime']]
df1 = pd.pivot_table(df1, values = 'Runtime', index=['Classifier'], columns='Dataset').reset_index()
df1.columns.name = None

cols = [col for col in df1.columns]
formatdict = {}
for col in cols: formatdict[col] = "{:.4f}"
formatdict.pop('Classifier', None)

sub = df1.columns.values.tolist()
sub.remove('Classifier')
sub

df1 = df1.style.hide().apply(highlight_min, subset=sub).format(formatdict)

path_save = image_save_path + "dataset_total_time.png"

# dfi.export(df1,path_save)

df1

Classifier,cardio,credit-card,http,musk,satimage-2,shuttle,smtp,thyroid,wbc
HBOS,0.0045,0.7652,0.1465,0.0574,0.0196,0.0395,0.0264,0.0027,0.0027
IForest,0.0816,3.2942,7.0228,0.1147,0.1648,0.6489,1.2149,0.1263,0.0637
KNN,0.2786,,46.2999,2.1496,1.2763,8.646,6.1079,0.2995,0.034
LOF,0.0366,,18.6887,0.1028,0.2963,12.1906,1.6959,0.1569,0.0068
MCD,0.4049,43.3043,46.288,25.8672,12.8443,8.8137,9.5163,0.4417,0.2262
OCSVM,0.313,,,1.4869,3.9458,173.19,630.1486,0.9182,0.0204
Perception,0.002,0.2908,0.3521,0.0099,0.0064,0.0361,0.0671,0.0027,0.0007
