<a href="https://colab.research.google.com/github/J-DR1/MastersThesis/blob/main/3_GridSearch_skin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Packages

---

In [None]:
#Basic Packages
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

#Pyod
try:
  import pyod
except:
  !pip install pyod
finally:
  import pyod
  from pyod.models.iforest import IForest
  from pyod.models.ocsvm import OCSVM
  from pyod.models.lof import LOF
  from pyod.models.cblof import CBLOF
  from pyod.models.knn import KNN
  from pyod.models.abod import ABOD
  from pyod.utils.data import evaluate_print

#Isotree
try:
  import isotree
except: 
  !pip install isotree
finally:
  from isotree import IsolationForest

#Machine Learning - Sci-kit Learn

##Pre-processing
from sklearn.preprocessing import RobustScaler
from sklearn import preprocessing

##Metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

##Hyper Parameter Tuning
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn.model_selection import GridSearchCV

#Warnings
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.simplefilter("ignore", UserWarning)

# Functions & Data Import

---

In [None]:
#Functions 

#Standardization
scaler = RobustScaler()

#Train-test splits
def random_sample(full_dataframe, train_size, test_size, random_state = 1):
    """Draw disjoint random train and test samples and separate the label column.

    Parameters
    ----------
    full_dataframe : pandas.DataFrame
        Data containing an 'original.label' column.
    train_size : int
        Number of rows sampled for the training set.
    test_size : int
        Number of rows sampled for the test set, drawn from the rows that
        were not selected for training.
    random_state : int, default 1
        Seed passed to both sampling calls.

    Returns
    -------
    tuple
        (train_X, test_X, train_y, test_y), where the X frames hold every
        column except 'original.label' and the y series hold that column.
    """
    label_column = 'original.label'

    def split_features_and_label(rows):
        # Features are everything but the label; the label is returned as a Series.
        return rows.drop(label_column, axis = 1), rows[label_column]

    train_rows = full_dataframe.sample(n = train_size, random_state = random_state)

    # Remove the training rows first so the two samples cannot overlap.
    remaining_rows = full_dataframe.drop(train_rows.index)
    test_rows = remaining_rows.sample(n = test_size, random_state = random_state)

    train_X, train_y = split_features_and_label(train_rows)
    test_X, test_y = split_features_and_label(test_rows)
    return train_X, test_X, train_y, test_y

def get_contamination_percentages(label_series, normal_value):
    """Return the fraction of labels that differ from `normal_value`.

    `label_series` must support boolean-mask indexing and `len()` (the
    notebook passes label columns); the result is a float between 0 and 1.
    An empty input raises ZeroDivisionError.
    """
    anomalous_labels = label_series[label_series != normal_value]
    anomalous_count = len(anomalous_labels)
    total_count = len(label_series)
    return anomalous_count / total_count


In [None]:
#skin_control = pd.read_csv("skin_benchmark_0903.csv")
#skin_924 = pd.read_csv("skin_benchmark_0924.csv")
#skin_941 = pd.read_csv("skin_benchmark_0941.csv")
skin_942 = pd.read_csv("skin_benchmark_0942.csv")

#skin_control = skin_control.drop(['point.id','origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)
#skin_924 = skin_924.drop(['point.id','origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)
#skin_941 = skin_941.drop(['point.id', 'origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)
skin_942 = skin_942.drop(['point.id', 'origin', 'motherset', 'ground.truth', 'diff.score'],axis=1)

In [None]:
#skin_control['original.label'] = skin_control['original.label'].apply(lambda z: 0 if(z == 2) else 1)
#skin_924['original.label'] = skin_924['original.label'].apply(lambda z: 0 if(z == 2) else 1)
#skin_941['original.label'] = skin_941['original.label'].apply(lambda z: 0 if(z == 2) else 1)
skin_942['original.label'] = skin_942['original.label'].apply(lambda z: 0 if(z == 2) else 1)

In [None]:
skin_942

Unnamed: 0,original.label,R,G,B
0,0,0.882403,0.792321,0.149153
1,1,-1.205761,-0.492271,0.590154
2,0,-1.559143,-1.009445,-0.801756
3,1,-1.237887,-0.609052,0.342091
4,0,1.187596,1.075932,0.535029
...,...,...,...,...
5995,0,0.288079,0.225098,1.251656
5996,0,0.737837,0.608808,-0.030004
5997,0,-1.944650,1.976815,1.816688
5998,0,0.496896,0.375246,-0.360755


# F1 score

## GridSearch

In [None]:
score_func = make_scorer(f1_score)

#GridSearch
def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  """Grid-search `model` over `parameter_dict` and print the results.

  Parameters
  ----------
  model : estimator
      Estimator handed to GridSearchCV.
  parameter_dict : dict
      Parameter grid (`param_grid`) to search.
  score : scorer
      Value passed to GridSearchCV as `scoring`.
  cross_fold : int
      Number of cross-validation folds.
  train_X, train_y :
      Data the grid search is fitted on.
  test_X, test_y :
      Held-out data scored with the refitted best estimator.

  Prints the best mean cross-validation score, the best parameters and the
  test-set score. Returns None.
  """
  # Stratified folds keep the class ratio in every fold. With plain unshuffled
  # KFold, rare anomalies (or an oversampled set that is not shuffled) can leave
  # folds with a single class, which makes F1/MCC degenerate.
  kfold = StratifiedKFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

#GridSearch for iso-tree
def hyperparameter_tuning_iso(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  """Grid-search an isotree model whose predictions are binarised at 0.50.

  Parameters
  ----------
  model : estimator
      Estimator handed to GridSearchCV (the notebook passes isotree's
      IsolationForest).
  parameter_dict : dict
      Parameter grid (`param_grid`) to search.
  score : scorer
      Scorer callable with the signature `score(estimator, X, y)`, e.g. the
      result of `make_scorer`.
  cross_fold : int
      Number of cross-validation folds.
  train_X, train_y :
      Data the grid search is fitted on.
  test_X, test_y :
      Held-out data scored with the refitted best estimator.

  Prints the best mean cross-validation score, the best parameters and the
  test-set score. Returns None.
  """

  class _BinarizedPredictions:
    """Proxy around a fitted estimator whose `predict` output is cut at 0.50."""

    def __init__(self, estimator):
      self._estimator = estimator

    def __getattr__(self, name):
      # Avoid infinite recursion if the proxy is inspected before `_estimator`
      # exists; every other attribute is answered by the wrapped estimator.
      if name == '_estimator':
        raise AttributeError(name)
      return getattr(self._estimator, name)

    def predict(self, X):
      # presumably the model returns continuous outlier scores - the original
      # code intended to map values above 0.50 to 1 and the rest to 0.
      raw_scores = np.asarray(self._estimator.predict(X))
      return (raw_scores > 0.50).astype(int)

  def binarized_score(estimator, X, y):
    # The previous version thresholded the return value of `grid.fit`, which is
    # the search object itself (not predictions) and raised a TypeError. The
    # threshold has to be applied where the scorer asks for predictions.
    return score(_BinarizedPredictions(estimator), X, y)

  # Stratified folds keep both classes present in every fold.
  kfold = StratifiedKFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = binarized_score)

  grid.fit(train_X, train_y)

  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

## BM_942 : most clustered

In [None]:
#Train-test Split
# Use the split helper defined in this notebook (`random_sample`); `random_sample_from_kdd` is not defined here.
train_X, test_X, train_y, test_y = random_sample(skin_942, 4200, 1800, 42)

In [None]:
#Standardization
train_X['B'] = scaler.fit_transform(train_X['B'].values.reshape(-1, 1))
train_X['G'] = scaler.fit_transform(train_X['G'].values.reshape(-1, 1))
train_X['R'] = scaler.fit_transform(train_X['R'].values.reshape(-1, 1))

In [None]:
test_X['B'] = scaler.transform(test_X['B'].values.reshape(-1, 1))
test_X['G'] = scaler.transform(test_X['G'].values.reshape(-1, 1))
test_X['R'] = scaler.transform(test_X['R'].values.reshape(-1, 1))

In [None]:
#Oversampling with Adasyn
# `sampling_strategy` / `fit_resample` replace the removed `ratio` / `fit_sample` API;
# the fixed seed makes the synthetic minority samples reproducible.
train_X_resampled, train_y_resampled = ADASYN(sampling_strategy = 'minority', random_state = 42).fit_resample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

### IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.010
Best Parameters: {'contamination': 0.1, 'max_samples': 1024, 'n_estimators': 150}
Test-set Score: 0.005


In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.081
Best Parameters: {'contamination': 0.1, 'max_samples': 2048, 'n_estimators': 150}
Test-set Score: 0.009


### SCiForest
---

In [None]:
#Oversampled
# Grid for the isotree model: number of columns combined per split (ndim),
# number of trees and the sub-sample size used per tree.
# NOTE(review): ndim goes up to 4 while only three feature columns (B, G, R)
# are scaled above - confirm the model accepts ndim larger than the feature count.
parameter_search_grid = {'ndim': [1, 2, 3, 4],
                         'ntrees': [10, 50, 75, 100],
                         'sample_size': [256]
                         } 

# Runs the isotree-specific grid search on the oversampled training set and
# scores the best model on the untouched test set.
hyperparameter_tuning_iso(model = IsolationForest(penalize_range=True, prob_pick_avg_gain=1,prob_pick_pooled_gain=0, ntry=10, missing_action = "fail"), 
                      parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

### LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.091
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cosine', 'n_neighbors': 75}
Test-set Score: 0.091


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[20, 25, 30, 40, 50, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.017
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cosine', 'n_neighbors': 30}
Test-set Score: 0.010


### CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         # alpha must lie strictly inside (0, 1); the former value 1 made
                         # every fit using it fail with a ValueError.
                         'alpha': [0.8, 0.9],
                         'beta': [15, 20, 25]
                         }

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.031
Best Parameters: {'alpha': 0.8, 'beta': 25, 'contamination': 0.1, 'n_clusters': 15}
Test-set Score: 0.000


In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         # alpha must lie strictly inside (0, 1); the former value 1 made
                         # every fit using it fail with a ValueError.
                         'alpha': [0.8, 0.9],
                         'beta': [15, 20, 25]
                         }

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.213
Best Parameters: {'alpha': 0.9, 'beta': 25, 'contamination': 0.05, 'n_clusters': 25}
Test-set Score: 0.056


### KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.053
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'method': 'mean', 'n_neighbors': 5}
Test-set Score: 0.030


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.168
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'method': 'mean', 'n_neighbors': 20}
Test-set Score: 0.087


### ABOD
---

In [None]:
#Oversampled
# This section evaluates ABOD, so the tuned model must be ABOD() rather than KNN().
# The KNN-only `method` values and the `algorithm` key are dropped, matching the
# ABOD grid used in the Matthews section of this notebook.
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
                         }

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#No Oversampled
# This section evaluates ABOD, so the tuned model must be ABOD() rather than KNN().
# The KNN-only `method` values and the `algorithm` key are dropped, matching the
# ABOD grid used in the Matthews section of this notebook.
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
                         }

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

### OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.03, 0.05, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.002
Best Parameters: {'gamma': 'scale', 'kernel': 'rbf'}
Test-set Score: 0.000


In [None]:
#No Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.03, 0.05, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.069
Best Parameters: {'gamma': 'scale', 'kernel': 'rbf'}
Test-set Score: 0.000


# Matthews_corrcoef

## GridSearch

In [None]:
score_func = make_scorer(matthews_corrcoef)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  """Grid-search `model` over `parameter_dict` and print the results.

  Parameters
  ----------
  model : estimator
      Estimator handed to GridSearchCV.
  parameter_dict : dict
      Parameter grid (`param_grid`) to search.
  score : scorer
      Value passed to GridSearchCV as `scoring`.
  cross_fold : int
      Number of cross-validation folds.
  train_X, train_y :
      Data the grid search is fitted on.
  test_X, test_y :
      Held-out data scored with the refitted best estimator.

  Prints the best mean cross-validation score, the best parameters and the
  test-set score. Returns None.
  """
  # Stratified folds keep the class ratio in every fold. With plain unshuffled
  # KFold, folds can end up with a single class, in which case the Matthews
  # correlation is undefined (division by zero inside the metric).
  kfold = StratifiedKFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

## BM_942 : most clustered

In [None]:
# Use the split helper defined in this notebook (`random_sample`); `random_sample_from_kdd` is not defined here.
train_X, test_X, train_y, test_y = random_sample(skin_942, 4200, 1800, 42)

In [None]:
train_X['B'] = scaler.fit_transform(train_X['B'].values.reshape(-1, 1))
train_X['G'] = scaler.fit_transform(train_X['G'].values.reshape(-1, 1))
train_X['R'] = scaler.fit_transform(train_X['R'].values.reshape(-1, 1))

In [None]:
test_X['B'] = scaler.transform(test_X['B'].values.reshape(-1, 1))
test_X['G'] = scaler.transform(test_X['G'].values.reshape(-1, 1))
test_X['R'] = scaler.transform(test_X['R'].values.reshape(-1, 1))

In [None]:
#Oversampling with Adasyn
# `sampling_strategy` / `fit_resample` replace the removed `ratio` / `fit_sample` API;
# the fixed seed makes the synthetic minority samples reproducible.
train_X_resampled, train_y_resampled = ADASYN(sampling_strategy = 'minority', random_state = 42).fit_resample(train_X, train_y)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

Percentage Training Set Contamination:  0.5001802234771117
Percentage Test Set Contamination:      0.011111111111111112


### IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Best Mean Cross-Validation Score: -0.022
Best Parameters: {'contamination': 0.01, 'max_samples': 1024, 'n_estimators': 100}
Test-set Score: -0.017


In [None]:
#No Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.151
Best Parameters: {'contamination': 0.1, 'max_samples': 2048, 'n_estimators': 50}
Test-set Score: 0.047


### LOF

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Best Mean Cross-Validation Score: 0.005
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cosine', 'n_neighbors': 100}
Test-set Score: 0.091


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[20, 25, 30, 40, 50, 75],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.001
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cosine', 'n_neighbors': 30}
Test-set Score: -0.018


### CBLOF

---




In [None]:
#Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         # alpha must lie strictly inside (0, 1); a value of 1 makes
                         # every fit using it fail with a ValueError.
                         'alpha': [0.8, 0.9],
                         'beta': [15, 20, 25]
                         }

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Best Mean Cross-Validation Score: -0.025
Best Parameters: {'alpha': 0.9, 'beta': 20, 'contamination': 0.01, 'n_clusters': 30}
Test-set Score: -0.012


In [None]:
#No Oversampled
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         # alpha must lie strictly inside (0, 1); the former value 1 made
                         # every fit using it fail with a ValueError.
                         'alpha': [0.8, 0.9],
                         'beta': [15, 20, 25]
                         }

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.288
Best Parameters: {'alpha': 0.9, 'beta': 25, 'contamination': 0.07, 'n_clusters': 30}
Test-set Score: 0.111


### KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * co

Best Mean Cross-Validation Score: -0.019
Best Parameters: {'algorithm': 'auto', 'contamination': 0.01, 'method': 'largest', 'n_neighbors': 20}
Test-set Score: -0.017


In [None]:
#No Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.284
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'method': 'mean', 'n_neighbors': 20}
Test-set Score: 0.180


### ABOD
---

In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'contamination': [0.005, 0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, r

In [None]:
#not Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'contamination': [0.005, 0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  **kwargs)
  arrmean, r

### OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.03, 0.05, 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Best Mean Cross-Validation Score: -0.086
Best Parameters: {'gamma': 0.001, 'kernel': 'rbf'}
Test-set Score: -0.053


In [None]:
# Original (not oversampled) training data: same OCSVM gamma search.
parameter_search_grid = {
    'gamma': ['auto', 'scale', 0.03, 0.05, 0.001, 0.0001],
    'kernel': ['rbf'],
}

hyperparameter_tuning(
    model=OCSVM(),
    parameter_dict=parameter_search_grid,
    score=score_func,
    cross_fold=5,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
)

Best Mean Cross-Validation Score: 0.100
Best Parameters: {'gamma': 'scale', 'kernel': 'rbf'}
Test-set Score: -0.031


# ROC

## GridSearch

In [None]:
from sklearn.model_selection import StratifiedKFold
def hyperparameter_tuning(model, parameter_dict, cross_fold, train_X, train_y, test_X, test_y, score='roc_auc'):
  """Grid-search ``model`` with stratified k-fold CV and print a short summary.

  Parameters
  ----------
  model : estimator handed to ``GridSearchCV``.
  parameter_dict : dict mapping parameter names to lists of candidate values.
  cross_fold : int, number of stratified folds.
  train_X, train_y : data the grid search is fitted on.
  test_X, test_y : held-out data scored with the refitted best estimator.
  score : scoring spec forwarded to ``GridSearchCV``; defaults to ``'roc_auc'``.
      The keyword is optional and last so existing calls without it behave
      exactly as before, while calls written in the style of the other
      sections (``score = ...``) no longer raise ``TypeError``.

  Prints the best mean CV score, the best parameters and the test-set score.
  Returns nothing.
  """
  kfold = StratifiedKFold(n_splits=cross_fold)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=kfold, scoring=score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  # grid.score() evaluates the refitted best estimator with the same scorer.
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))

## BM_942 : most clustered

In [None]:
# Draw the train/test split for skin_942 (helper is defined outside this section).
# NOTE(review): 4200 / 1800 / 42 are presumably train size, test size and random seed - confirm against the helper.
train_X, test_X, train_y, test_y = random_sample_from_kdd(skin_942, 4200, 1800, 42)

In [None]:
# Class balance of the held-out labels.
test_y.value_counts()

0    1780
1      20
Name: original.label, dtype: int64

In [None]:
train_X['B'] = scaler.fit_transform(train_X['B'].values.reshape(-1, 1))
train_X['G'] = scaler.fit_transform(train_X['G'].values.reshape(-1, 1))
train_X['R'] = scaler.fit_transform(train_X['R'].values.reshape(-1, 1))

In [None]:
test_X['B'] = scaler.transform(test_X['B'].values.reshape(-1, 1))
test_X['G'] = scaler.transform(test_X['G'].values.reshape(-1, 1))
test_X['R'] = scaler.transform(test_X['R'].values.reshape(-1, 1))

In [None]:
#Oversampling with Adasyn
# `sampling_strategy` / `fit_resample` are the current imbalanced-learn names for
# the old `ratio` / `fit_sample`, which newer releases no longer accept.
# ADASYN generates synthetic samples randomly, so the seed is fixed for repeatable runs.
train_X_resampled, train_y_resampled = ADASYN(sampling_strategy = 'minority', random_state = 42).fit_resample(train_X, train_y)

In [None]:
# Report the contamination of the (oversampled) training labels and the test labels.
for caption, label_values in (
    ("Percentage Training Set Contamination: ", train_y_resampled),
    ("Percentage Test Set Contamination: ", test_y),
):
    print(caption.ljust(39), get_contamination_percentages(label_values, 0))

Percentage Training Set Contamination:  0.5001802234771117
Percentage Test Set Contamination:      0.011111111111111112


In [None]:
# Re-display the test-set class balance; the oversampling above was applied to the training data only.
test_y.value_counts()

0    1780
1      20
Name: original.label, dtype: int64

In [None]:
# Class balance after oversampling. Wrapping in pd.Series keeps this working
# whether the resampler returned a pandas Series or a plain NumPy array.
pd.Series(train_y_resampled).value_counts()

### IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.1]
}

# Isolation forests are randomised; a fixed seed makes the search repeatable across runs.
hyperparameter_tuning(model = IForest(random_state = 42), parameter_dict = parameter_search_grid, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.360
Best Parameters: {'contamination': 0.03, 'max_samples': 2048, 'n_estimators': 50}
Test-set Score: 0.571


In [None]:
#Not oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.1]
}

# No `score` is passed: this section's helper scores with ROC AUC by default.
# Isolation forests are randomised; a fixed seed makes the search repeatable across runs.
hyperparameter_tuning(model = IForest(random_state = 42), parameter_dict = parameter_search_grid, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.049
Best Parameters: {'contamination': 0.1, 'max_samples': 2048, 'n_estimators': 50}
Test-set Score: 0.018


### LOF

---



In [None]:
#Oversampled
# 'cityblock'/'l1'/'manhattan' name one distance and 'euclidean'/'l2' another,
# so each distance is listed once (first alias kept) instead of being searched repeatedly.
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean']
                         }

# No `score` is passed: this section's helper scores with ROC AUC by default.
hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.488
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cosine', 'n_neighbors': 150}
Test-set Score: 0.025


In [None]:
#Not oversampled
# 'cityblock'/'l1'/'manhattan' name one distance and 'euclidean'/'l2' another,
# so each distance is listed once (first alias kept) instead of being searched repeatedly.
parameter_search_grid = {'n_neighbors':[20, 25, 30, 40, 50, 75],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean']
                         }

# No `score` is passed: this section's helper scores with ROC AUC by default.
hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.114
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cosine', 'n_neighbors': 30}
Test-set Score: 0.300


### CBLOF

---




In [None]:
#Oversampled
# alpha must lie strictly inside (0, 1): CBLOF rejects alpha = 1 with a ValueError,
# so that candidate only produced failed fits and is not searched.
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9],
                         'beta': [15, 20, 25]
                         }

# No `score` is passed: this section's helper scores with ROC AUC by default.
# random_state is fixed so repeated runs of the search give the same result.
hyperparameter_tuning(model = CBLOF(random_state = 42), parameter_dict = parameter_search_grid, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.400
Best Parameters: {'alpha': 0.8, 'beta': 20, 'contamination': 0.1, 'n_clusters': 20}
Test-set Score: 0.004


In [None]:
#Not oversampled
# alpha must lie strictly inside (0, 1): CBLOF rejects alpha = 1 with a ValueError,
# so that candidate only produced failed fits and is not searched.
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9],
                         'beta': [15, 20, 25]
                         }

# No `score` is passed: this section's helper scores with ROC AUC by default.
# random_state is fixed so repeated runs of the search give the same result.
hyperparameter_tuning(model = CBLOF(random_state = 42), parameter_dict = parameter_search_grid, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.971
Best Parameters: {'alpha': 0.9, 'beta': 25, 'contamination': 0.1, 'n_clusters': 25}
Test-set Score: 1.000


### KNN

---



In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         }

# No `score` is passed: this section's helper scores with ROC AUC by default.
hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#Not oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'algorithm': ['auto']
                         }

# No `score` is passed: this section's helper scores with ROC AUC by default.
hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.975
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'method': 'mean', 'n_neighbors': 20}
Test-set Score: 1.000


### OCSVM

---



In [None]:
#Oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.03, 0.05, 0.001, 0.0001],
                         'kernel': ['rbf']
                         }

# No `score` is passed: this section's helper scores with ROC AUC by default.
hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.200
Best Parameters: {'gamma': 'scale', 'kernel': 'rbf'}
Test-set Score: 0.010


In [None]:
#Not oversampled
parameter_search_grid = {'gamma': ['auto', 'scale', 0.03, 0.05, 0.001, 0.0001],
                         'kernel': ['rbf']
                         }

# No `score` is passed: this section's helper scores with ROC AUC by default.
hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.415
Best Parameters: {'gamma': 'scale', 'kernel': 'rbf'}
Test-set Score: 0.350


# Precision

## GridSearch

In [None]:
# Scorer handed to the grid search in this section: precision of the predicted labels.
score_func = make_scorer(precision_score)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  """Run a k-fold grid search for ``model`` and print a three-line summary.

  ``parameter_dict`` maps parameter names to candidate lists, ``score`` is the
  scorer given to ``GridSearchCV`` and ``cross_fold`` the number of folds.
  The search is fitted on ``train_X``/``train_y``; the refitted best estimator
  is then scored on ``test_X``/``test_y``. Nothing is returned.

  NOTE(review): the folds come from a plain, unshuffled ``KFold``, so they are
  not stratified by class - confirm this is intended for such rare positives.
  """
  search = GridSearchCV(
      model,
      param_grid=parameter_dict,
      cv=KFold(n_splits=cross_fold),
      scoring=score,
  )
  search.fit(train_X, train_y)

  print(f"Best Mean Cross-Validation Score: {search.best_score_:.3f}")
  print("Best Parameters:", search.best_params_)
  print(f"Test-set Score: {search.score(test_X, test_y):.3f}")

## BM_942 : most clustered

In [None]:
# Re-draw the train/test split for skin_942 so this section starts from unscaled data.
# NOTE(review): 4200 / 1800 / 42 are presumably train size, test size and random seed - confirm against the helper.
train_X, test_X, train_y, test_y = random_sample_from_kdd(skin_942, 4200, 1800, 42)

In [None]:
train_X['B'] = scaler.fit_transform(train_X['B'].values.reshape(-1, 1))
train_X['G'] = scaler.fit_transform(train_X['G'].values.reshape(-1, 1))
train_X['R'] = scaler.fit_transform(train_X['R'].values.reshape(-1, 1))

In [None]:
test_X['B'] = scaler.transform(test_X['B'].values.reshape(-1, 1))
test_X['G'] = scaler.transform(test_X['G'].values.reshape(-1, 1))
test_X['R'] = scaler.transform(test_X['R'].values.reshape(-1, 1))

In [None]:
#Oversampling with Adasyn
# `sampling_strategy` / `fit_resample` are the current imbalanced-learn names for
# the old `ratio` / `fit_sample`, which newer releases no longer accept.
# ADASYN generates synthetic samples randomly, so the seed is fixed for repeatable runs.
train_X_resampled, train_y_resampled = ADASYN(sampling_strategy = 'minority', random_state = 42).fit_resample(train_X, train_y)

In [None]:
# Report the contamination of the (oversampled) training labels and the test labels.
for caption, label_values in (
    ("Percentage Training Set Contamination: ", train_y_resampled),
    ("Percentage Test Set Contamination: ", test_y),
):
    print(caption.ljust(39), get_contamination_percentages(label_values, 0))

Percentage Training Set Contamination:  0.5001802234771117
Percentage Test Set Contamination:      0.011111111111111112


### IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.1]
}

# Isolation forests are randomised; a fixed seed makes the search repeatable across runs.
hyperparameter_tuning(model = IForest(random_state = 42), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.203
Best Parameters: {'contamination': 0.1, 'max_samples': 2048, 'n_estimators': 150}
Test-set Score: 0.010


In [None]:
#Not oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.1]
}

# Isolation forests are randomised; a fixed seed makes the search repeatable across runs.
hyperparameter_tuning(model = IForest(random_state = 42), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.049
Best Parameters: {'contamination': 0.1, 'max_samples': 2048, 'n_estimators': 50}
Test-set Score: 0.018


### LOF

---



In [None]:
#Oversampled
# 'cityblock'/'l1'/'manhattan' name one distance and 'euclidean'/'l2' another,
# so each distance is listed once (first alias kept) instead of being searched repeatedly.
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean']
                         }

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.488
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cosine', 'n_neighbors': 150}
Test-set Score: 0.025


In [None]:
#Not oversampled
# 'cityblock'/'l1'/'manhattan' name one distance and 'euclidean'/'l2' another,
# so each distance is listed once (first alias kept) instead of being searched repeatedly.
parameter_search_grid = {'n_neighbors':[20, 25, 30, 40, 50, 75],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean']
                         }

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.114
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cosine', 'n_neighbors': 30}
Test-set Score: 0.300


### CBLOF

---




In [None]:
#Oversampled
# alpha must lie strictly inside (0, 1): CBLOF rejects alpha = 1 with a ValueError,
# so that candidate only produced failed fits and is not searched.
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9],
                         'beta': [15, 20, 25]
                         }

# random_state is fixed so repeated runs of the search give the same result.
hyperparameter_tuning(model = CBLOF(random_state = 42), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.400
Best Parameters: {'alpha': 0.8, 'beta': 20, 'contamination': 0.1, 'n_clusters': 20}
Test-set Score: 0.004


In [None]:
#Not oversampled
# alpha must lie strictly inside (0, 1): CBLOF rejects alpha = 1 with a ValueError,
# so that candidate only produced failed fits and is not searched.
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9],
                         'beta': [15, 20, 25]
                         }

# random_state is fixed so repeated runs of the search give the same result.
hyperparameter_tuning(model = CBLOF(random_state = 42), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.971
Best Parameters: {'alpha': 0.9, 'beta': 25, 'contamination': 0.1, 'n_clusters': 25}
Test-set Score: 1.000


### KNN

---



In [None]:
# Oversampled training data: KNN search over neighbourhood size, distance
# aggregation method and contamination.
parameter_search_grid = {
    'n_neighbors': [5, 10, 20, 100],
    'method': ['largest', 'mean', 'median'],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
    'algorithm': ['auto'],
}

hyperparameter_tuning(
    model=KNN(),
    parameter_dict=parameter_search_grid,
    score=score_func,
    cross_fold=5,
    train_X=train_X_resampled,
    train_y=train_y_resampled,
    test_X=test_X,
    test_y=test_y,
)

In [None]:
# Original (not oversampled) training data: KNN search, with one extra,
# larger neighbourhood size (500).
parameter_search_grid = {
    'n_neighbors': [5, 10, 20, 100, 500],
    'method': ['largest', 'mean', 'median'],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
    'algorithm': ['auto'],
}

hyperparameter_tuning(
    model=KNN(),
    parameter_dict=parameter_search_grid,
    score=score_func,
    cross_fold=5,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
)

Best Mean Cross-Validation Score: 0.975
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'method': 'mean', 'n_neighbors': 20}
Test-set Score: 1.000


### OCSVM

---



In [None]:
# Oversampled training data: search the RBF gamma setting for OCSVM.
parameter_search_grid = {
    'gamma': ['auto', 'scale', 0.03, 0.05, 0.001, 0.0001],
    'kernel': ['rbf'],
}

hyperparameter_tuning(
    model=OCSVM(),
    parameter_dict=parameter_search_grid,
    score=score_func,
    cross_fold=5,
    train_X=train_X_resampled,
    train_y=train_y_resampled,
    test_X=test_X,
    test_y=test_y,
)

Best Mean Cross-Validation Score: 0.200
Best Parameters: {'gamma': 'scale', 'kernel': 'rbf'}
Test-set Score: 0.010


In [None]:
# Original (not oversampled) training data: same OCSVM gamma search.
parameter_search_grid = {
    'gamma': ['auto', 'scale', 0.03, 0.05, 0.001, 0.0001],
    'kernel': ['rbf'],
}

hyperparameter_tuning(
    model=OCSVM(),
    parameter_dict=parameter_search_grid,
    score=score_func,
    cross_fold=5,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
)

Best Mean Cross-Validation Score: 0.415
Best Parameters: {'gamma': 'scale', 'kernel': 'rbf'}
Test-set Score: 0.350


# Recall

## GridSearch

In [None]:
# Scorer handed to the grid search in this section: recall of the predicted labels.
score_func = make_scorer(recall_score)

def hyperparameter_tuning(model, parameter_dict, score, cross_fold, train_X, train_y, test_X, test_y):
  """Run a k-fold grid search for ``model`` and print a three-line summary.

  ``parameter_dict`` maps parameter names to candidate lists, ``score`` is the
  scorer given to ``GridSearchCV`` and ``cross_fold`` the number of folds.
  The search is fitted on ``train_X``/``train_y``; the refitted best estimator
  is then scored on ``test_X``/``test_y``. Nothing is returned.

  NOTE(review): the folds come from a plain, unshuffled ``KFold``, so they are
  not stratified by class - confirm this is intended for such rare positives.
  """
  search = GridSearchCV(
      model,
      param_grid=parameter_dict,
      cv=KFold(n_splits=cross_fold),
      scoring=score,
  )
  search.fit(train_X, train_y)

  print(f"Best Mean Cross-Validation Score: {search.best_score_:.3f}")
  print("Best Parameters:", search.best_params_)
  print(f"Test-set Score: {search.score(test_X, test_y):.3f}")

## BM_942 : most clustered

In [None]:
# Re-draw the train/test split for skin_942 so this section starts from unscaled data.
# NOTE(review): 4200 / 1800 / 42 are presumably train size, test size and random seed - confirm against the helper.
train_X, test_X, train_y, test_y = random_sample_from_kdd(skin_942, 4200, 1800, 42)

In [None]:
train_X['B'] = scaler.fit_transform(train_X['B'].values.reshape(-1, 1))
train_X['G'] = scaler.fit_transform(train_X['G'].values.reshape(-1, 1))
train_X['R'] = scaler.fit_transform(train_X['R'].values.reshape(-1, 1))

In [None]:
test_X['B'] = scaler.transform(test_X['B'].values.reshape(-1, 1))
test_X['G'] = scaler.transform(test_X['G'].values.reshape(-1, 1))
test_X['R'] = scaler.transform(test_X['R'].values.reshape(-1, 1))

In [None]:
#Oversampling with Adasyn
# `sampling_strategy` / `fit_resample` are the current imbalanced-learn names for
# the old `ratio` / `fit_sample`, which newer releases no longer accept.
# ADASYN generates synthetic samples randomly, so the seed is fixed for repeatable runs.
train_X_resampled, train_y_resampled = ADASYN(sampling_strategy = 'minority', random_state = 42).fit_resample(train_X, train_y)

In [None]:
# Report the contamination of the (oversampled) training labels and the test labels.
for caption, label_values in (
    ("Percentage Training Set Contamination: ", train_y_resampled),
    ("Percentage Test Set Contamination: ", test_y),
):
    print(caption.ljust(39), get_contamination_percentages(label_values, 0))

Percentage Training Set Contamination:  0.5001802234771117
Percentage Test Set Contamination:      0.011111111111111112


### IForest

---



In [None]:
#Oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.1]
}

# Isolation forests are randomised; a fixed seed makes the search repeatable across runs.
hyperparameter_tuning(model = IForest(random_state = 42), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.087
Best Parameters: {'contamination': 0.1, 'max_samples': 'auto', 'n_estimators': 100}
Test-set Score: 0.000


In [None]:
#Not oversampled
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256, 512, 1024, 2048],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.1]
}

# Isolation forests are randomised; a fixed seed makes the search repeatable across runs.
hyperparameter_tuning(model = IForest(random_state = 42), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.576
Best Parameters: {'contamination': 0.1, 'max_samples': 2048, 'n_estimators': 100}
Test-set Score: 0.500


### LOF

---



In [None]:
#Oversampled
# 'cityblock'/'l1'/'manhattan' name one distance and 'euclidean'/'l2' another,
# so each distance is listed once (first alias kept) instead of being searched repeatedly.
parameter_search_grid = {'n_neighbors':[75, 100, 150],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean']
                         }

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.131
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cityblock', 'n_neighbors': 150}
Test-set Score: 0.150


In [None]:
#Not oversampled
# 'cityblock'/'l1'/'manhattan' name one distance and 'euclidean'/'l2' another,
# so each distance is listed once (first alias kept) instead of being searched repeatedly.
parameter_search_grid = {'n_neighbors':[20, 25, 30, 40, 50, 75],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean']
                         }

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

Best Mean Cross-Validation Score: 0.114
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'metric': 'cosine', 'n_neighbors': 30}
Test-set Score: 0.300


### CBLOF

---




In [None]:
#Oversampled
# alpha must lie strictly inside (0, 1): CBLOF rejects alpha = 1 with a ValueError,
# so that candidate only produced failed fits and is not searched.
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9],
                         'beta': [15, 20, 25]
                         }

# random_state is fixed so repeated runs of the search give the same result.
hyperparameter_tuning(model = CBLOF(random_state = 42), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.045
Best Parameters: {'alpha': 0.9, 'beta': 20, 'contamination': 0.1, 'n_clusters': 30}
Test-set Score: 0.050


In [None]:
#Not oversampled
# alpha must lie strictly inside (0, 1): CBLOF rejects alpha = 1 with a ValueError,
# so that candidate only produced failed fits and is not searched.
parameter_search_grid = {'n_clusters':[15, 20, 25, 30],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
                         'alpha': [0.8, 0.9],
                         'beta': [15, 20, 25]
                         }

# random_state is fixed so repeated runs of the search give the same result.
hyperparameter_tuning(model = CBLOF(random_state = 42), parameter_dict = parameter_search_grid, score = score_func, cross_fold = 5,
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in the range of (0, 1).

ValueError: alpha is set to 1. Not in th

Best Mean Cross-Validation Score: 0.971
Best Parameters: {'alpha': 0.9, 'beta': 25, 'contamination': 0.1, 'n_clusters': 25}
Test-set Score: 1.000


### KNN

---



In [None]:
# Oversampled training data: KNN search over neighbourhood size, distance
# aggregation method and contamination.
parameter_search_grid = {
    'n_neighbors': [5, 10, 20, 100],
    'method': ['largest', 'mean', 'median'],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
    'algorithm': ['auto'],
}

hyperparameter_tuning(
    model=KNN(),
    parameter_dict=parameter_search_grid,
    score=score_func,
    cross_fold=5,
    train_X=train_X_resampled,
    train_y=train_y_resampled,
    test_X=test_X,
    test_y=test_y,
)

Best Mean Cross-Validation Score: 0.112
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'method': 'largest', 'n_neighbors': 5}
Test-set Score: 0.700


In [None]:
# Original (not oversampled) training data: KNN search, with one extra,
# larger neighbourhood size (500).
parameter_search_grid = {
    'n_neighbors': [5, 10, 20, 100, 500],
    'method': ['largest', 'mean', 'median'],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.1],
    'algorithm': ['auto'],
}

hyperparameter_tuning(
    model=KNN(),
    parameter_dict=parameter_search_grid,
    score=score_func,
    cross_fold=5,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
)

Best Mean Cross-Validation Score: 0.975
Best Parameters: {'algorithm': 'auto', 'contamination': 0.1, 'method': 'mean', 'n_neighbors': 20}
Test-set Score: 1.000


### OCSVM

---



In [None]:
# Oversampled training data: search the RBF gamma setting for OCSVM.
parameter_search_grid = {
    'gamma': ['auto', 'scale', 0.03, 0.05, 0.001, 0.0001],
    'kernel': ['rbf'],
}

hyperparameter_tuning(
    model=OCSVM(),
    parameter_dict=parameter_search_grid,
    score=score_func,
    cross_fold=5,
    train_X=train_X_resampled,
    train_y=train_y_resampled,
    test_X=test_X,
    test_y=test_y,
)

Best Mean Cross-Validation Score: 0.001
Best Parameters: {'gamma': 'scale', 'kernel': 'rbf'}
Test-set Score: 0.000


In [None]:
# Original (not oversampled) training data: same OCSVM gamma search.
parameter_search_grid = {
    'gamma': ['auto', 'scale', 0.03, 0.05, 0.001, 0.0001],
    'kernel': ['rbf'],
}

hyperparameter_tuning(
    model=OCSVM(),
    parameter_dict=parameter_search_grid,
    score=score_func,
    cross_fold=5,
    train_X=train_X,
    train_y=train_y,
    test_X=test_X,
    test_y=test_y,
)

Best Mean Cross-Validation Score: 0.415
Best Parameters: {'gamma': 'scale', 'kernel': 'rbf'}
Test-set Score: 0.350
