<a href="https://colab.research.google.com/github/J-DR1/MastersThesis/blob/main/4_Spambase_GridSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import Packages

---

In [None]:
#Basic Packages
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

#Pyod
try:
  import pyod
except:
  !pip install pyod
finally:
  import pyod
  from pyod.models.iforest import IForest
  from pyod.models.abod import ABOD
  from pyod.models.ocsvm import OCSVM
  from pyod.models.lof import LOF
  from pyod.models.cblof import CBLOF
  from pyod.models.knn import KNN
  from pyod.utils.data import evaluate_print

#Isotree
try:
  import isotree
except: 
  !pip install isotree
finally:
  from isotree import IsolationForest

#Machine Learning - Sci-kit Learn

##Pre-processing
from sklearn.preprocessing import RobustScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 

##Metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

##Hyper Parameter Tuning
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn.model_selection import GridSearchCV

#Warnings
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.simplefilter("ignore", UserWarning)

#Functions & Data Import

---

In [None]:
#Functions
#Train-test splits 
def split_x_y(dataframe):
  X= dataframe.drop('original.label', axis = 1)
  y= dataframe['original.label']
  return X, y

def get_contamination_percentages(label_series, normal_value):
    return len(label_series[label_series != normal_value])/len(label_series)


In [None]:
bm_control = pd.read_csv("spambase_benchmark_0902.csv")
bm_922 = pd.read_csv("spambase_benchmark_0922.csv")
bm_943 = pd.read_csv("spambase_benchmark_0943.csv")
bm_944 = pd.read_csv("spambase_benchmark_0944.csv")

bm_control = bm_control.drop(['point.id','origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)
bm_922 = bm_922.drop(['point.id','origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)
bm_944 = bm_944.drop(['point.id', 'origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)
bm_943 = bm_943.drop(['point.id', 'origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)

# Matthews correlation coefficient

# BM_922 : scattered

In [None]:
X, y = split_x_y(bm_control)

In [None]:
#Stratified- train-test split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, stratify=y) 

In [None]:
#Standardization
scaler = RobustScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

Percentage Training Set Contamination:  0.009994739610731194
Percentage Test Set Contamination:      0.011041009463722398


## GridSearch
-----

In [None]:
score_func = make_scorer(average_precision_score)

#Function for GridSearch
def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

## IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.05, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.507
Best Parameters: {'contamination': 0.05, 'max_samples': 2048, 'n_estimators': 150}
Test-set Score: 0.010


In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.05, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.084
Best Parameters: {'contamination': 0.01, 'max_samples': 1024, 'n_estimators': 50}
Test-set Score: 0.007


## LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'metric': 'cityblock', 'n_neighbors': 75}
Test-set Score: 0.017


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 125, 150],
                         'contamination': [0.01],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


Best Mean Cross-Validation Score: nan
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'metric': 'cityblock', 'n_neighbors': 75}
Test-set Score: 0.024


## CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[20, 30, 75],
                         'contamination': [0.01],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.550
Best Parameters: {'alpha': 0.8, 'beta': 15, 'contamination': 0.01, 'n_clusters': 30}
Test-set Score: 0.122


In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: nan
Best Parameters: {'alpha': 0.8, 'beta': 15, 'contamination': 0.01, 'n_clusters': 15}
Test-set Score: 0.022


## KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.550
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'method': 'largest', 'n_neighbors': 150}
Test-set Score: 0.122


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


Best Mean Cross-Validation Score: nan
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'method': 'largest', 'n_neighbors': 5}
Test-set Score: 0.022


## ABOD
---


In [None]:
#Oversampled
parameter_search_grid = {'contamination': [0.01],
                         'n_neighbors': [1, 5, 7, 12, 15]
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / 

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'contamination': 0.01, 'n_neighbors': 1}
Test-set Score: 0.012


In [None]:
#No Oversampled
parameter_search_grid = {'contamination': [0.01],
                         'n_neighbors': [1, 5, 7, 12, 15]} 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  recall = tps / tps[-1]
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  re

Best Mean Cross-Validation Score: nan
Best Parameters: {'contamination': 0.01, 'n_neighbors': 1}
Test-set Score: 0.012


  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


## OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.506
Best Parameters: {'gamma': 0.001, 'kernel': 'rbf'}
Test-set Score: 0.014


In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

# BM_944 : less clustered

In [None]:
X, y = split_x_y(bm_944)

In [None]:
#Stratified train-test split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, stratify=y) 

In [None]:
#standardization
scaler = RobustScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

Percentage Training Set Contamination:  0.5005307855626328
Percentage Test Set Contamination:      0.011041009463722398


## GridSearch

In [None]:
score_func = make_scorer(matthews_corrcoef)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

## IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
   'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {
   'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[20, 30, 75],
                         'contamination': [0.01],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[20, 30, 75],
                         'contamination': [0.01],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

# BM_943 : most clustered

In [None]:
X, y = split_x_y(bm_943)

In [None]:
#Stratified train-test split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, stratify=y) 

In [None]:
#Standardization
scaler = RobustScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

Percentage Training Set Contamination:  0.009994739610731194
Percentage Test Set Contamination:      0.011041009463722398


## GridSearch

In [None]:
score_func = make_scorer(matthews_corrcoef)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

## IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Best Mean Cross-Validation Score: 0.042
Best Parameters: {'contamination': 0.05, 'max_samples': 256, 'n_estimators': 150}
Test-set Score: 0.036


In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.163
Best Parameters: {'contamination': 0.01, 'max_samples': 256, 'n_estimators': 50}
Test-set Score: -0.010


## LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150, 500],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Best Mean Cross-Validation Score: -0.021
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'metric': 'euclidean', 'n_neighbors': 500}
Test-set Score: -0.011


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[20, 25, 30, 40, 50, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Best Mean Cross-Validation Score: -0.008
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'metric': 'euclidean', 'n_neighbors': 20}
Test-set Score: -0.008


## CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Best Mean Cross-Validation Score: -0.021
Best Parameters: {'alpha': 0.9, 'beta': 20, 'contamination': 0.01, 'n_clusters': 20}
Test-set Score: -0.010


In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.065
Best Parameters: {'alpha': 0.9, 'beta': 20, 'contamination': 0.1, 'n_clusters': 25}
Test-set Score: -0.032


## KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Best Mean Cross-Validation Score: -0.021
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'method': 'median', 'n_neighbors': 500}
Test-set Score: -0.010


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.061
Best Parameters: {'algorithm': 'auto', 'contamination': 0.07, 'method': 'mean', 'n_neighbors': 20}
Test-set Score: -0.027


## OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Best Mean Cross-Validation Score: -0.024
Best Parameters: {'contamination': 0.01, 'gamma': 0.0001, 'kernel': 'rbf'}
Test-set Score: -0.010


In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.065
Best Parameters: {'contamination': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
Test-set Score: -0.024


## ABOD
---

In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'contamination': [0.005, 0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, r

In [None]:
#not Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'contamination': [0.005, 0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

# Recall

## BM_902 : control group

In [None]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(bm_control, 1170, 760, 42)

In [None]:
train_X = pd.DataFrame(scaler.fit_transform(train_X),
columns=train_X.columns)

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

GridSearch

In [None]:
score_func = make_scorer(recall_score)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

### IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 125, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[20, 30, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination' : [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ABOD

___

In [None]:
#Oversampled
parameter_search_grid = {'contamination': [0.01],
                         'n_neighbors': [1, 5, 7, 12, 15]
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'contamination': [0.01],
                         'n_neighbors': [1, 5, 7, 12, 15]} 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## BM_922 : scattered

In [None]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(bm_922, 1170, 760, 42)

In [None]:
train_X = scaler.fit_transform(train_X.values.reshape(-1, 1))
train_y = scaler.fit_transform(train_y.values.reshape(-1, 1))

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

GridSearch

In [None]:
score_func = make_scorer(recall_score)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

### IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 125, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[20, 30, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination' : [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ABOD

___

In [None]:
#Oversampled
parameter_search_grid = {'contamination': [0.01],
                         'n_neighbors': [1, 5, 7, 12, 15]
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'contamination': [0.01],
                         'n_neighbors': [1, 5, 7, 12, 15]} 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## BM_944 : less clustered

In [None]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(bm_944, 1170, 760, 42)

In [None]:
train_X = scaler.fit_transform(train_X.values.reshape(-1, 1))
train_y = scaler.fit_transform(train_y.values.reshape(-1, 1))

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

GridSearch

In [None]:
score_func = make_scorer(recall_score)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

### IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
   'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {
   'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[20, 30, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[20, 30, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## BM_943 : most clustered

In [None]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(bm_943, 1170, 760, 42)

In [None]:
train_X = scaler.fit_transform(train_X.values.reshape(-1, 1))
train_y = scaler.fit_transform(train_y.values.reshape(-1, 1))

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

GridSearch

In [None]:
score_func = make_scorer(recall_score)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

### IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30, 50, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)