<a href="https://colab.research.google.com/github/J-DR1/MastersThesis/blob/main/1_KDD_GridSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Basic Packages
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

#Pyod
try:
  import pyod
except:
  !pip install pyod
finally:
  import pyod
  from pyod.models.iforest import IForest
  from pyod.models.ocsvm import OCSVM
  from pyod.models.lof import LOF
  from pyod.models.cblof import CBLOF
  from pyod.models.knn import KNN
  from pyod.utils.data import evaluate_print

#Isotree
try:
  import isotree
except: 
  !pip install isotree
finally:
  from isotree import IsolationForest

#Machine Learning - Sci-kit Learn

##Pre-processing
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

##Metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

##Hyper Parameter Tuning
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn.model_selection import GridSearchCV

#Warnings
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.simplefilter("ignore", UserWarning)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def random_sample_from_kdd(full_dataframe, train_size, test_size, random_state = 1):
    train_set = full_dataframe.sample(random_state = random_state, n = train_size)
    train_X = train_set.drop('Attack Type', axis = 1) 
    train_y = train_set['Attack Type']
    test_set = full_dataframe.drop(train_set.index).sample(random_state = random_state, n = test_size)
    test_X = test_set.drop('Attack Type', axis = 1)
    test_y = test_set['Attack Type']
    return train_X, test_X, train_y, test_y

def get_contamination_percentages(label_series, normal_value):
    return len(label_series[label_series != normal_value])/len(label_series)


In [None]:
#Cleaned Dataset
kdd_http = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/kdd_http_cleaned.csv")
kdd_http.head(5) 

Unnamed: 0,duration,src_bytes,dst_bytes,hot,logged_in,num_compromised,root_shell,num_root,num_access_files,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Attack Type,x0_REJ,x0_RSTO,x0_RSTR,x0_S0,x0_S1,x0_S2,x0_S3,x0_SF
0,0,181,5450,0,1,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,239,486,0,1,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,235,1337,0,1,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,219,1337,0,1,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,217,2032,0,1,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
train_X, test_X, train_y, test_y = random_sample_from_kdd(kdd_http, 45000, 19000, 42)

In [None]:
#Oversampling with Adasyn
list_trainy = list(train_y)
for i in range(len(list_trainy)):
    if(list_trainy[i] == 'normal'):
        list_trainy[i] = 0
    else:
        list_trainy[i] = 1

train_X_resampled, train_y_resampled= ADASYN(ratio = 'minority').fit_sample(train_X, list_trainy)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y, 'normal'))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 'normal'))

Percentage Training Set Contamination:  0.038
Percentage Test Set Contamination:      0.036263157894736844


GridSearch

In [None]:
score_func = make_scorer(matthews_corrcoef)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y, normal_value, scaler =None, Adasyn = False):
  if (scaler is not None):
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.fit_transform(test_X)

  kfold = KFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  if (Adasyn):
    list_trainy = train_y
  else:
    list_trainy = list(train_y)
    for i in range(len(list_trainy)):
      if (list_trainy[i] == normal_value):
        list_trainy[i] = 0
      else:
        list_trainy[i] = 1

  list_testy = list(test_y)
  for i in range(len(list_testy)):
    if (list_testy[i] == normal_value):
      list_testy[i] = 0
    else:
      list_testy[i] = 1

  grid.fit(train_X, list_trainy)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, list_testy)))

IForest

---



In [None]:
#Standardized-Oversampled
parameter_search_grid = {
    'n_estimators': [100],
    'max_samples': [128, 256],
    'contamination': [0.01, 0.04, 0.1, 0.15, 0.2]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = None, Adasyn = True)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [None]:
#Standardized-No Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.037]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = StandardScaler(), Adasyn = False)

Best Mean Cross-Validation Score: 0.330
Best Parameters: {'contamination': 0.037, 'max_samples': 'auto', 'n_estimators': 50}
Test-set Score: 0.281


In [None]:
#No Standardizaed-Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.5]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = None, Adasyn = True)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Best Mean Cross-Validation Score: 0.232
Best Parameters: {'contamination': 0.037, 'max_samples': 256, 'n_estimators': 100}
Test-set Score: 0.115


In [None]:
#No Standardizaed- No Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': [ 128, 256],
    'contamination': [0.01, 0.04, 0.1, 0.15, 0.2]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = None, Adasyn = False)

Best Mean Cross-Validation Score: 0.474
Best Parameters: {'contamination': 0.15, 'max_samples': 128, 'n_estimators': 150}
Test-set Score: 0.476


LOF

---



In [None]:
#Standardized- Oversampled
parameter_search_grid = {'n_neighbors':[30, 100, 500, 700, 1000],
                         'contamination': [0.037],
                         'algorithm' : ['auto']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = StandardScaler(), Adasyn = True)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Best Mean Cross-Validation Score: 0.134
Best Parameters: {'algorithm': 'auto', 'contamination': 0.037, 'n_neighbors': 500}
Test-set Score: 0.042


In [None]:
#Standardized- No Oversampled
parameter_search_grid = {'n_neighbors':[30, 100, 500, 700, 1000],
                         'contamination': [0.037],
                         'algorithm' : ['auto']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = StandardScaler(), Adasyn = False)

Best Mean Cross-Validation Score: 0.479
Best Parameters: {'algorithm': 'auto', 'contamination': 0.037, 'n_neighbors': 500}
Test-set Score: 0.438


CBLOF

---




In [None]:
#Standardized- Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25],
                         'contamination': [0.037],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = StandardScaler(), Adasyn = True)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Best Mean Cross-Validation Score: 0.128
Best Parameters: {'alpha': 0.8, 'beta': 15, 'contamination': 0.037, 'n_clusters': 15}
Test-set Score: 0.141


In [None]:
#Standardized- No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25],
                         'contamination': [0.037],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = StandardScaler(), Adasyn = False)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.715
Best Parameters: {'alpha': 0.9, 'beta': 15, 'contamination': 0.037, 'n_clusters': 15}
Test-set Score: 0.858


KNN

---



In [None]:
#Standardized- Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.037],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = StandardScaler(), Adasyn = True)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Best Mean Cross-Validation Score: 0.128
Best Parameters: {'algorithm': 'auto', 'contamination': 0.037, 'method': 'median', 'n_neighbors': 500}
Test-set Score: 0.122


In [None]:
#Standardized- No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.037],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = StandardScaler(), Adasyn = False)

Best Mean Cross-Validation Score: 0.213
Best Parameters: {'algorithm': 'auto', 'contamination': 0.037, 'method': 'largest', 'n_neighbors': 500}
Test-set Score: 0.220


OCSVM

---



In [None]:
#Standardized- Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = StandardScaler(), Adasyn = True)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Best Mean Cross-Validation Score: 0.380
Best Parameters: {'gamma': 0.0001, 'kernel': 'rbf'}
Test-set Score: 0.508


In [None]:
#Standardized- No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y, 
                      normal_value = 'normal', scaler = StandardScaler(), Adasyn = False)

Best Mean Cross-Validation Score: 0.595
Best Parameters: {'gamma': 'scale', 'kernel': 'rbf'}
Test-set Score: 0.559
