<a href="https://colab.research.google.com/github/J-DR1/MastersThesis/blob/main/Benchmark_GridSearch_spambase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
#Basic Packages
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

#Pyod
try:
  import pyod
except:
  !pip install pyod
finally:
  import pyod
  from pyod.models.iforest import IForest
  from pyod.models.abod import ABOD
  from pyod.models.ocsvm import OCSVM
  from pyod.models.lof import LOF
  from pyod.models.cblof import CBLOF
  from pyod.models.knn import KNN
  from pyod.utils.data import evaluate_print

#Isotree
try:
  import isotree
except: 
  !pip install isotree
finally:
  from isotree import IsolationForest

#Machine Learning - Sci-kit Learn

##Pre-processing
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

##Metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

##Hyper Parameter Tuning
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn.model_selection import GridSearchCV

#Warnings
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.simplefilter("ignore", UserWarning)

In [3]:
#Functions 

#Input full dataframe, train and test size in integers and optionally random_state (defaulted at 1)
#Output two data frames of predictiors train and test X and two series of response labels train and test
#Output order is trainX, testX, trainY, testY
def random_sample_from_kdd(full_dataframe, train_size, test_size, random_state = 1):
    train_set = full_dataframe.sample(random_state = random_state, n = train_size)
    train_X = train_set.drop('original.label', axis = 1) 
    train_y = train_set['original.label']
    
    test_set = full_dataframe.drop(train_set.index).sample(random_state = random_state, n = test_size)
    test_X = test_set.drop('original.label', axis = 1)
    test_y = test_set['original.label']
    return train_X, test_X, train_y, test_y

def get_contamination_percentages(label_series, normal_value):
    return len(label_series[label_series != normal_value])/len(label_series)


In [11]:
bm_922 = pd.read_csv("spambase_benchmark_0922.csv")
bm_943 = pd.read_csv("spambase_benchmark_0943.csv")
bm_944 = pd.read_csv("spambase_benchmark_0944.csv")

bm_922 = bm_922.drop(['point.id','origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)
bm_944 = bm_944.drop(['point.id', 'origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)
bm_943 = bm_943.drop(['point.id', 'origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)

In [13]:
bm_922.shape

(2535, 58)

#BM_922 : scattered

In [14]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(bm_922, 1170, 760, 42)

In [15]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [16]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

Percentage Training Set Contamination:  0.5
Percentage Test Set Contamination:      0.011842105263157895


GridSearch

In [34]:
score_func = make_scorer(average_precision_score)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

IForest

---



In [44]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [12, 25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'contamination': 0.01, 'max_samples': 'auto', 'n_estimators': 12}
Test-set Score: 0.012


In [43]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


Best Mean Cross-Validation Score: nan
Best Parameters: {'contamination': 0.01, 'max_samples': 'auto', 'n_estimators': 25}
Test-set Score: 0.151


LOF

---



In [18]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'metric': 'cityblock', 'n_neighbors': 75}
Test-set Score: 0.017


In [19]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 125, 150],
                         'contamination': [0.01],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


Best Mean Cross-Validation Score: nan
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'metric': 'cityblock', 'n_neighbors': 75}
Test-set Score: 0.024


CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'alpha': 0.8, 'beta': 15, 'contamination': 0.01, 'n_clusters': 15}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.009
Best Parameters: {'alpha': 0.8, 'beta': 15, 'contamination': 0.01, 'n_clusters': 15}
Test-set Score: 0.009


KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'method': 'largest', 'n_neighbors': 5}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.009
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'method': 'largest', 'n_neighbors': 5}
Test-set Score: 0.009


ABOD

___

In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'method': 'largest', 'n_neighbors': 5}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'gamma': 'auto', 'kernel': 'rbf'}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.011
Best Parameters: {'gamma': 'auto', 'kernel': 'rbf'}
Test-set Score: 0.009


# BM_944

In [None]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(bm_944, 1170, 760, 42)

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

Percentage Training Set Contamination:  0.4993526111350885
Percentage Test Set Contamination:      0.009210526315789473


GridSearch

In [None]:
score_func = make_scorer(average_precision_score, average= 'micro')

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'contamination': 0.01, 'max_samples': 128, 'n_estimators': 100}
Test-set Score: 0.022


In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.058
Best Parameters: {'contamination': 0.01, 'max_samples': 'auto', 'n_estimators': 50}
Test-set Score: 0.134


LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[20, 50, 100, 150],
                         'contamination': [0.01],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'metric': 'cityblock', 'n_neighbors': 20}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[20, 25, 30, 40],
                         'contamination': [0.01],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.074
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'metric': 'cosine', 'n_neighbors': 30}
Test-set Score: 0.009


CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'alpha': 0.8, 'beta': 15, 'contamination': 0.01, 'n_clusters': 15}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.009
Best Parameters: {'alpha': 0.8, 'beta': 15, 'contamination': 0.01, 'n_clusters': 15}
Test-set Score: 0.009


KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'method': 'largest', 'n_neighbors': 5}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.009
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'method': 'largest', 'n_neighbors': 5}
Test-set Score: 0.009


OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'gamma': 'auto', 'kernel': 'rbf'}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.011
Best Parameters: {'gamma': 'auto', 'kernel': 'rbf'}
Test-set Score: 0.009


# BM_943

In [None]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(bm_943, 1170, 760, 42)

In [None]:
#Oversampling with Adasyn
train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

Percentage Training Set Contamination:  0.4993526111350885
Percentage Test Set Contamination:      0.009210526315789473


GridSearch

In [None]:
score_func = make_scorer(average_precision_score, average= 'micro')

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'contamination': 0.01, 'max_samples': 128, 'n_estimators': 100}
Test-set Score: 0.022


In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.058
Best Parameters: {'contamination': 0.01, 'max_samples': 'auto', 'n_estimators': 50}
Test-set Score: 0.134


LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[20, 50, 100, 150],
                         'contamination': [0.01],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'metric': 'cityblock', 'n_neighbors': 20}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[20, 25, 30, 40],
                         'contamination': [0.01],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.074
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'metric': 'cosine', 'n_neighbors': 30}
Test-set Score: 0.009


CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'alpha': 0.8, 'beta': 15, 'contamination': 0.01, 'n_clusters': 15}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.009
Best Parameters: {'alpha': 0.8, 'beta': 15, 'contamination': 0.01, 'n_clusters': 15}
Test-set Score: 0.009


KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'method': 'largest', 'n_neighbors': 5}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.009
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'method': 'largest', 'n_neighbors': 5}
Test-set Score: 0.009


OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.500
Best Parameters: {'gamma': 'auto', 'kernel': 'rbf'}
Test-set Score: 0.009


In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.011
Best Parameters: {'gamma': 'auto', 'kernel': 'rbf'}
Test-set Score: 0.009
