<a href="https://colab.research.google.com/github/J-DR1/MastersThesis/blob/main/4_Spambase_GridSearch_stratified.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Basic Packages
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

#Pyod
try:
  import pyod
except:
  !pip install pyod
finally:
  import pyod
  from pyod.models.iforest import IForest
  from pyod.models.abod import ABOD
  from pyod.models.ocsvm import OCSVM
  from pyod.models.lof import LOF
  from pyod.models.cblof import CBLOF
  from pyod.models.knn import KNN
  from pyod.utils.data import evaluate_print

#Isotree
try:
  import isotree
except: 
  !pip install isotree
finally:
  from isotree import IsolationForest

#Machine Learning - Sci-kit Learn

##Pre-processing
from sklearn.preprocessing import RobustScaler
from sklearn import preprocessing

##Metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

##Hyper Parameter Tuning
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn.model_selection import GridSearchCV

#Warnings
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.simplefilter("ignore", UserWarning)

Collecting pyod
[?25l  Downloading https://files.pythonhosted.org/packages/37/50/94ac3c301b06e291ce52938e4a037b147cf01b40ff458dea5441ac42addf/pyod-0.8.7.tar.gz (101kB)
[K     |███▎                            | 10kB 14.8MB/s eta 0:00:01[K     |██████▌                         | 20kB 11.6MB/s eta 0:00:01[K     |█████████▊                      | 30kB 8.2MB/s eta 0:00:01[K     |█████████████                   | 40kB 7.2MB/s eta 0:00:01[K     |████████████████▏               | 51kB 4.7MB/s eta 0:00:01[K     |███████████████████▍            | 61kB 4.4MB/s eta 0:00:01[K     |██████████████████████▊         | 71kB 5.0MB/s eta 0:00:01[K     |██████████████████████████      | 81kB 5.2MB/s eta 0:00:01[K     |█████████████████████████████▏  | 92kB 5.2MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 3.6MB/s 
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... [?25l[?25hdone
  Created wheel for pyod: filename=pyod-0.8.7-cp37-none



In [None]:
#Functions 

scaler = RobustScaler()
#Input full dataframe, train and test size in integers and optionally random_state (defaulted at 1)
#Output two data frames of predictiors train and test X and two series of response labels train and test
#Output order is trainX, testX, trainY, testY
def random_sample_from_kdd(full_dataframe, train_size, test_size, random_state = 1):
    train_set = full_dataframe.sample(random_state = random_state, n = train_size)
    train_X = train_set.drop('original.label', axis = 1) 
    train_y = train_set['original.label']
    
    test_set = full_dataframe.drop(train_set.index).sample(random_state = random_state, n = test_size)
    test_X = test_set.drop('original.label', axis = 1)
    test_y = test_set['original.label']
    return train_X, test_X, train_y, test_y

def get_contamination_percentages(label_series, normal_value):
    return len(label_series[label_series != normal_value])/len(label_series)


In [None]:
bm_control = pd.read_csv("spambase_benchmark_0902.csv")
bm_922 = pd.read_csv("spambase_benchmark_0922.csv")
bm_943 = pd.read_csv("spambase_benchmark_0943.csv")
bm_944 = pd.read_csv("spambase_benchmark_0944.csv")

bm_control = bm_control.drop(['point.id','origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)
bm_922 = bm_922.drop(['point.id','origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)
bm_944 = bm_944.drop(['point.id', 'origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)
bm_943 = bm_943.drop(['point.id', 'origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)

In [None]:
bm_922.shape

(2535, 58)

# Recall

## BM_902 : control group

In [None]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(bm_control, 1170, 760, 42)

In [None]:
train_X = pd.DataFrame(scaler.fit_transform(train_X),
columns=train_X.columns)

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

Percentage Training Set Contamination:  0.4993514915693904
Percentage Test Set Contamination:      0.006578947368421052


GridSearch

In [None]:
score_func = make_scorer(recall_score)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

### IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.189
Best Parameters: {'contamination': 0.1, 'max_samples': 256, 'n_estimators': 12}
Test-set Score: 0.200


In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.567
Best Parameters: {'contamination': 0.1, 'max_samples': 'auto', 'n_estimators': 100}
Test-set Score: 0.600


### LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.163
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'euclidean', 'n_neighbors': 150}
Test-set Score: 0.000


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 125, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.300
Best Parameters: {'algorithm': 'auto', 'contamination': 0.05, 'metric': 'cityblock', 'n_neighbors': 75}
Test-set Score: 0.200


### CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[20, 30, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.172
Best Parameters: {'alpha': 0.8, 'beta': 25, 'contamination': 0.1, 'n_clusters': 75}
Test-set Score: 0.000


In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.300
Best Parameters: {'alpha': 0.8, 'beta': 15, 'contamination': 0.1, 'n_clusters': 30}
Test-set Score: 0.000


### KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination' : [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.172
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'method': 'median', 'n_neighbors': 75}
Test-set Score: 0.000


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.300
Best Parameters: {'algorithm': 'auto', 'contamination': 0.05, 'method': 'largest', 'n_neighbors': 5}
Test-set Score: 0.000


ABOD

___

In [None]:
#Oversampled
parameter_search_grid = {'contamination': [0.01],
                         'n_neighbors': [1, 5, 7, 12, 15]
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / 

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'contamination': 0.01, 'n_neighbors': 1}
Test-set Score: 0.000


  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [None]:
#No Oversampled
parameter_search_grid = {'contamination': [0.01],
                         'n_neighbors': [1, 5, 7, 12, 15]} 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / 

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'contamination': 0.01, 'n_neighbors': 1}
Test-set Score: 0.000


  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


### OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.535
Best Parameters: {'gamma': 0.5, 'kernel': 'rbf'}
Test-set Score: 1.000


In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 1.000
Best Parameters: {'gamma': 0.3, 'kernel': 'rbf'}
Test-set Score: 1.000


## BM_922 : scattered

In [None]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(bm_922, 1170, 760, 42)

In [None]:
train_X = pd.DataFrame(scaler.fit_transform(train_X),
columns=train_X.columns)

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

ValueError: ignored

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

GridSearch

In [None]:
score_func = make_scorer(recall_score)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

### IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 125, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[20, 30, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination' : [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ABOD

___

In [None]:
#Oversampled
parameter_search_grid = {'contamination': [0.01],
                         'n_neighbors': [1, 5, 7, 12, 15]
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'contamination': [0.01],
                         'n_neighbors': [1, 5, 7, 12, 15]} 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## BM_944 : less clustered

In [None]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(bm_944, 1170, 760, 42)

In [None]:
train_X = pd.DataFrame(scaler.fit_transform(train_X),
columns=train_X.columns)

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

GridSearch

In [None]:
score_func = make_scorer(recall_score)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

## IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
   'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {
   'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[20, 30, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[20, 30, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

## BM_943 : most clustered

In [None]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(bm_943, 1170, 760, 42)

In [None]:
train_X = pd.DataFrame(scaler.fit_transform(train_X),
columns=train_X.columns)

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

Percentage Training Set Contamination:  0.5008643042350908
Percentage Test Set Contamination:      0.010526315789473684


GridSearch

In [None]:
score_func = make_scorer(recall_score)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

## IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.129
Best Parameters: {'contamination': 0.1, 'max_samples': 128, 'n_estimators': 50}
Test-set Score: 0.000


In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.467
Best Parameters: {'contamination': 0.1, 'max_samples': 128, 'n_estimators': 12}
Test-set Score: 0.000


## LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.041
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cosine', 'n_neighbors': 150}
Test-set Score: 0.625


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.100
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cityblock', 'n_neighbors': 150}
Test-set Score: 0.000


## CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30, 50, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.016
Best Parameters: {'alpha': 0.9, 'beta': 20, 'contamination': 0.1, 'n_clusters': 50}
Test-set Score: 0.000


In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.100
Best Parameters: {'alpha': 0.8, 'beta': 15, 'contamination': 0.1, 'n_clusters': 15}
Test-set Score: 0.000


## KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.020
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'method': 'mean', 'n_neighbors': 5}
Test-set Score: 0.000


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 75, 100, 150],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.100
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'method': 'largest', 'n_neighbors': 5}
Test-set Score: 0.000


## OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.279
Best Parameters: {'gamma': 0.5, 'kernel': 'rbf'}
Test-set Score: 1.000


In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.3, 0.5, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 1.000
Best Parameters: {'gamma': 0.5, 'kernel': 'rbf'}
Test-set Score: 1.000
