<a href="https://colab.research.google.com/github/J-DR1/MastersThesis/blob/main/2_GridSearch_Fraud_31_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages

In [None]:
#Basic Packages
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns

#Pyod
try:
  import pyod
  import isotree
except:
  !pip install pyod
  !pip install isotree
finally:
  import pyod
  import isotree
  from pyod.models.iforest import IForest
  from pyod.models.ocsvm import OCSVM
  from pyod.models.lof import LOF
  from pyod.models.cblof import CBLOF
  from pyod.models.knn import KNN
  from pyod.utils.data import evaluate_print
  from pyod.models.hbos import HBOS
  from pyod.models.abod import ABOD

##Metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

##Hyper Parameter Tuning
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline

#split data
from sklearn.model_selection import TimeSeriesSplit

from sklearn.preprocessing import RobustScaler

import sklearn.exceptions
from isotree import IsolationForest

#Warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

Collecting pyod
[?25l  Downloading https://files.pythonhosted.org/packages/37/50/94ac3c301b06e291ce52938e4a037b147cf01b40ff458dea5441ac42addf/pyod-0.8.7.tar.gz (101kB)
[K     |███▎                            | 10kB 15.6MB/s eta 0:00:01[K     |██████▌                         | 20kB 20.4MB/s eta 0:00:01[K     |█████████▊                      | 30kB 10.4MB/s eta 0:00:01[K     |█████████████                   | 40kB 9.1MB/s eta 0:00:01[K     |████████████████▏               | 51kB 7.2MB/s eta 0:00:01[K     |███████████████████▍            | 61kB 7.3MB/s eta 0:00:01[K     |██████████████████████▊         | 71kB 7.5MB/s eta 0:00:01[K     |██████████████████████████      | 81kB 7.5MB/s eta 0:00:01[K     |█████████████████████████████▏  | 92kB 7.5MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 4.9MB/s 
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... [?25l[?25hdone
  Created wheel for pyod: filename=pyod-0.8.7-cp37-non



In [None]:
#Dataset
fraud = pd.read_csv("creditcard.csv")
fraud.head(5) 

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
#Train test split
def time_split_train_test(full_dataframe,):
  train_set = full_dataframe.iloc[:200000]
  train_X = train_set.drop('Class', axis = 1)
  train_y = train_set['Class']
  test_set = full_dataframe.iloc[-70000:]
  test_X = test_set.drop('Class', axis = 1)
  test_y = test_set['Class']
  return train_X, test_X, train_y, test_y

In [None]:
train_X, test_X, train_y, test_y = time_split_train_test(fraud)

### Standardization

In [None]:
scaler = RobustScaler()

train_X['normAmount'] = scaler.fit_transform(train_X['Amount'].values.reshape(-1, 1))
train_X['normTime'] = scaler.fit_transform(train_X['Time'].values.reshape(-1, 1))

test_X['normAmount'] = scaler.transform(test_X['Amount'].values.reshape(-1, 1))
test_X['normTime'] = scaler.transform(test_X['Time'].values.reshape(-1, 1))

train_X_std = train_X.drop(['Amount'],axis=1)
train_X_std = train_X.drop(['Time'],axis=1)
test_X_std = test_X.drop(['Amount'], axis=1)
test_X_std = test_X.drop(['Time'], axis=1)

train_X = train_X.drop(['normAmount'],axis=1)
train_X = train_X.drop(['normTime'],axis=1)
test_X = test_X.drop(['normAmount'], axis=1)
test_X = test_X.drop(['normTime'], axis=1)


### Resampling

In [None]:
count_class_0 = 46500

In [None]:
train_X_us, train_y_us= NearMiss(sampling_strategy={0: count_class_0}).fit_resample(train_X, train_y)
train_X_std_us, train_y_std_us= NearMiss(sampling_strategy={0: count_class_0}).fit_resample(train_X_std, train_y)

In [None]:
count_class_1 = 3500

train_X, train_y= SMOTE(sampling_strategy={1: count_class_1}).fit_resample(train_X_us, train_y_us)
train_X_std, train_y_std= SMOTE(sampling_strategy={1: count_class_1}).fit_resample(train_X_std_us, train_y_std_us)

In [None]:
train_X_resampled, train_y_resampled= SMOTE(ratio = 'minority').fit_sample(train_X, train_y)
train_X_std_resampled, train_y_std_resampled= SMOTE(ratio = 'minority').fit_sample(train_X_std, train_y_std)

In [None]:
def get_contamination_percentages(label_series, normal_value):
    return len(label_series[label_series != normal_value])/len(label_series)

In [None]:
print("Percentage Training Set Contamination: ".ljust(39), get_contamination_percentages(train_y_resampled, 0))
print("Percentage Test Set Contamination: ".ljust(39), get_contamination_percentages(test_y, 0))

Percentage Training Set Contamination:  0.5
Percentage Test Set Contamination:      0.0013142857142857142


# Matthews Correlation Coefficient

#### GridSearch

In [None]:
score_func = make_scorer(matthews_corrcoef)

def hyperparameter_tuning2(model, parameter_dict, score, train_X, train_y, test_X, test_y):
  timeseries_split = TimeSeriesSplit(n_splits=5)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=timeseries_split, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))


#### RandomSearch

In [None]:
score_func = make_scorer(matthews_corrcoef)
def hyperparameter_tuning(model, parameter_dict, score, train_X, train_y, test_X, test_y):
  timeseries_split = TimeSeriesSplit(n_splits=5)
  RS = RandomizedSearchCV(model, param_distributions=parameter_dict, cv=timeseries_split, scoring = score)

  RS.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(RS.best_score_))
  print("Best Parameters:", RS.best_params_)
  print("Test-set Score: {:.3f}".format(RS.score(test_X, test_y)))

## Iforest

In [None]:
#oversampled
#standardized
parameter_search_grid = {
    'n_estimators': [50, 75, 100, 125, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.09]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X= train_X_std_resampled, train_y= train_y_std_resampled, test_X= test_X_std, test_y= test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning."

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'n_estimators': 125, 'max_samples': 'auto', 'contamination': 0.09}
Test-set Score: 0.001


In [None]:
#oversampled
#not standardized
parameter_search_grid = {
    'n_estimators': [50, 75, 100, 125, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.09]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X= train_X_resampled, train_y= train_y_resampled, test_X= test_X, test_y= test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning."

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'n_estimators': 50, 'max_samples': 128, 'contamination': 0.05}
Test-set Score: -0.002


In [None]:
#not standardized
#not oversampled
parameter_search_grid = {
    'n_estimators': [50, 75, 100, 125, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.09]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X= train_X, train_y= train_y, test_X= test_X, test_y= test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised l

Best Mean Cross-Validation Score: 0.170
Best Parameters: {'n_estimators': 75, 'max_samples': 128, 'contamination': 0.05}
Test-set Score: 0.161


In [None]:
#standardized
#not oversampled 
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.09]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y, test_X = test_X_std, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised l

Best Mean Cross-Validation Score: 0.173
Best Parameters: {'n_estimators': 50, 'max_samples': 'auto', 'contamination': 0.01}
Test-set Score: -0.000


## HBOS (not included in the experiments)

In [None]:
#standardized
#oversampled
parameter_search_grid = {'n_bins':[0, 20, 40, 60, 80, 100],
                         'alpha' :[0.1],
                         'tol' :[0.5], 
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         } 

hyperparameter_tuning(model = HBOS(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X, test_y = test_y)

  "y should not be presented in unsupervised learning.")
ValueError: `bins` must be positive, when an integer

  "y should not be presented in unsupervised learning.")
ValueError: `bins` must be positive, when an integer

  "y should not be presented in unsupervised learning.")
ValueError: `bins` must be positive, when an integer

  "y should not be presented in unsupervised learning.")
ValueError: `bins` must be positive, when an integer

  "y should not be presented in unsupervised learning.")
ValueError: `bins` must be positive, when an integer

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presen

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'tol': 0.5, 'n_bins': 20, 'contamination': 0.05, 'alpha': 0.1}
Test-set Score: 0.085


In [None]:
#not standardized
#oversampled
parameter_search_grid = {'n_bins':[0, 20, 40, 60, 80, 100],
                         'alpha' :[0.1],
                         'tol' :[0.5], 
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         } 

hyperparameter_tuning(model = HBOS(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  "y should not be presented in unsupervised learning.")
ValueError: `bins` must be positive, when an integer

  "y should not be presented in unsupervised learning.")
ValueError: `bins` must be positive, when an integer

  "y should not be presented in unsupervised learning.")
ValueError: `bins` must be positive, when an integer

  "y should not be presented in unsupervised learning.")
ValueError: `bins` must be positive, when an integer

  "y should not be presented in unsupervised learning.")
ValueError: `bins` must be positive, when an integer

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presen

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'tol': 0.5, 'n_bins': 80, 'contamination': 0.07, 'alpha': 0.1}
Test-set Score: -0.002


In [None]:
#not standardized
#not oversampled
parameter_search_grid = {'n_bins':[0, 20, 40, 60, 80, 100],
                         'alpha' :[0.1],
                         'tol' :[0.5], 
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         } 

hyperparameter_tuning(model = HBOS(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised l

Best Mean Cross-Validation Score: 0.162
Best Parameters: {'tol': 0.5, 'n_bins': 100, 'contamination': 0.05, 'alpha': 0.1}
Test-set Score: 0.146


In [None]:
#standardized
#not oversampled
parameter_search_grid = {'n_bins':[0, 20, 40, 60, 80, 100],
                         'alpha' :[0.1],
                         'tol' :[0.5], 
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         } 

hyperparameter_tuning(model = HBOS(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised l

Best Mean Cross-Validation Score: 0.171
Best Parameters: {'tol': 0.5, 'n_bins': 80, 'contamination': 0.01, 'alpha': 0.1}
Test-set Score: 0.013


## Local Outlier Factor

In [None]:
#standardized
#oversampled

parameter_search_grid = {'n_neighbors':[30, 100, 500, 700, 1000],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X_std, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning."

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'n_neighbors': 30, 'metric': 'cosine', 'contamination': 0.09, 'algorithm': 'auto'}
Test-set Score: 0.020


In [None]:
#not standardized
#oversampled
parameter_search_grid = {'n_neighbors':[30, 100, 500, 700, 1000],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning."

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'n_neighbors': 30, 'metric': 'cosine', 'contamination': 0.01, 'algorithm': 'auto'}
Test-set Score: 0.127


In [None]:
#not standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[30, 100, 500, 700, 1000],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised l

Best Mean Cross-Validation Score: 0.129
Best Parameters: {'n_neighbors': 700, 'metric': 'cosine', 'contamination': 0.05, 'algorithm': 'auto'}
Test-set Score: -0.002


In [None]:
#standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[30, 100, 500, 700, 1000],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X_std, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised l

Best Mean Cross-Validation Score: 0.176
Best Parameters: {'n_neighbors': 100, 'metric': 'cityblock', 'contamination': 0.01, 'algorithm': 'auto'}
Test-set Score: -0.020


## CBLOF

In [None]:
#oversampled
#standardized

parameter_search_grid = {'n_clusters':[15, 20, 25],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X_std, test_y = test_y)

  "y should not be presented in unsupervised learning.")
ValueError: alpha is set to 1. Not in the range of (0, 1).

  "y should not be presented in unsupervised learning.")
ValueError: alpha is set to 1. Not in the range of (0, 1).

  "y should not be presented in unsupervised learning.")
ValueError: alpha is set to 1. Not in the range of (0, 1).

  "y should not be presented in unsupervised learning.")
ValueError: alpha is set to 1. Not in the range of (0, 1).

  "y should not be presented in unsupervised learning.")
ValueError: alpha is set to 1. Not in the range of (0, 1).

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_y

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'n_clusters': 15, 'contamination': 0.07, 'beta': 15, 'alpha': 0.8}
Test-set Score: 0.009


In [None]:
#oversampled
#not standardized

parameter_search_grid = {'n_clusters':[15, 20, 25],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning."

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'n_clusters': 25, 'contamination': 0.01, 'beta': 20, 'alpha': 0.9}
Test-set Score: 0.000


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [None]:
#not oversampled 
#not standardized

parameter_search_grid = {'n_clusters':[15, 20, 25],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised l

Best Mean Cross-Validation Score: 0.058
Best Parameters: {'n_clusters': 25, 'contamination': 0.09, 'beta': 25, 'alpha': 0.8}
Test-set Score: 0.000


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [None]:
#standardized
#not oversampled

parameter_search_grid = {'n_clusters':[15, 20, 25],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X_std, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised learning.")
ValueError: alpha is set to 1. Not in the range of (0, 1).

  "y should not be presented in unsupervised learning.")
ValueError: alpha is set to 1. Not in the range of (0, 1).

  "y should not be presented in unsupervised learning.")
ValueError: alpha is set to 1. Not in the range of (0, 1).

  "y should not be presented in unsupervised learning.")
ValueError: alpha is set to 1. Not in the range of (0, 1).

  "y should not be presented in unsupervised learning."

Best Mean Cross-Validation Score: 0.144
Best Parameters: {'n_clusters': 20, 'contamination': 0.03, 'beta': 25, 'alpha': 0.9}
Test-set Score: 0.003


## KNN

In [None]:
#standardized
#oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X_std, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning."

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'n_neighbors': 100, 'method': 'median', 'contamination': 0.03, 'algorithm': 'auto'}
Test-set Score: 0.011


In [None]:
#not standardized
#oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning."

Best Mean Cross-Validation Score: 0.000
Best Parameters: {'n_neighbors': 5, 'method': 'mean', 'contamination': 0.03, 'algorithm': 'auto'}
Test-set Score: 0.000


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [None]:
#not standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised l

Best Mean Cross-Validation Score: 0.062
Best Parameters: {'n_neighbors': 500, 'method': 'median', 'contamination': 0.07, 'algorithm': 'auto'}
Test-set Score: 0.000


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [None]:
#standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X_std, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  "y should not be presented in unsupervised l

Best Mean Cross-Validation Score: 0.170
Best Parameters: {'n_neighbors': 100, 'method': 'largest', 'contamination': 0.01, 'algorithm': 'auto'}
Test-set Score: 0.002


## ABOD

In [None]:
#standardized
#oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 150, 200, 500],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X_std, test_y = test_y)

  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  "y should not be presented in unsupervised learning.")
  **kwargs)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 150, 200, 500],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         }  

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#not standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 150, 200, 500],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

In [None]:
#standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 150, 200, 500],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X_std, test_y = test_y)

## OCSVM

In [None]:
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

In [None]:
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X_std, test_y = test_y)

In [None]:
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X_std, test_y = test_y)

# F1 score

In [None]:
score_func = make_scorer(f1_score)

def hyperparameter_tuning2(model, parameter_dict, score, train_X, train_y, test_X, test_y):
  timeseries_split = TimeSeriesSplit(n_splits=5)
  grid = GridSearchCV(model, param_grid=parameter_dict, cv=timeseries_split, scoring = score)

  grid.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(grid.best_score_))
  print("Best Parameters:", grid.best_params_)
  print("Test-set Score: {:.3f}".format(grid.score(test_X, test_y)))


In [None]:
def hyperparameter_tuning(model, parameter_dict, score, train_X, train_y, test_X, test_y):
  timeseries_split = TimeSeriesSplit(n_splits=5)
  RS = RandomizedSearchCV(model, param_distributions=parameter_dict, cv=timeseries_split, scoring = score)

  RS.fit(train_X, train_y)
  print("Best Mean Cross-Validation Score: {:.3f}".format(RS.best_score_))
  print("Best Parameters:", RS.best_params_)
  print("Test-set Score: {:.3f}".format(RS.score(test_X, test_y)))

## Iforest

In [None]:
#oversampled
#standardized
parameter_search_grid = {
    'n_estimators': [50, 75, 100, 125, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.09]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X= train_X_std_resampled, train_y= train_y_std_resampled, test_X= test_X_std, test_y= test_y)

In [None]:
#oversampled
#not standardized
parameter_search_grid = {
    'n_estimators': [50, 75, 100, 125, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.09]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X= train_X_resampled, train_y= train_y_resampled, test_X= test_X, test_y= test_y)

In [None]:
#not standardized
#not oversampled
parameter_search_grid = {
    'n_estimators': [50, 75, 100, 125, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.09]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X= train_X, train_y= train_y, test_X= test_X, test_y= test_y)

In [None]:
#standardized
#not oversampled 
parameter_search_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': ['auto', 128, 256],
    'contamination': [0.01, 0.03, 0.05, 0.07, 0.09]
}

hyperparameter_tuning(model = IForest(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y, test_X = test_X_std, test_y = test_y)

## HBOS

In [None]:
#standardized
#oversampled
parameter_search_grid = {'n_bins':[0, 20, 40, 60, 80, 100],
                         'alpha' :[0.1],
                         'tol' :[0.5], 
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         } 

hyperparameter_tuning(model = HBOS(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X, test_y = test_y)

In [None]:
#not standardized
#oversampled
parameter_search_grid = {'n_bins':[0, 20, 40, 60, 80, 100],
                         'alpha' :[0.1],
                         'tol' :[0.5], 
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         } 

hyperparameter_tuning(model = HBOS(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#not standardized
#not oversampled
parameter_search_grid = {'n_bins':[0, 20, 40, 60, 80, 100],
                         'alpha' :[0.1],
                         'tol' :[0.5], 
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         } 

hyperparameter_tuning(model = HBOS(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

In [None]:
#standardized
#not oversampled
parameter_search_grid = {'n_bins':[0, 20, 40, 60, 80, 100],
                         'alpha' :[0.1],
                         'tol' :[0.5], 
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         } 

hyperparameter_tuning(model = HBOS(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X, test_y = test_y)

## Local Outlier Factor

In [None]:
#standardized
#oversampled

parameter_search_grid = {'n_neighbors':[30, 100, 500, 700, 1000],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X_std, test_y = test_y)

In [None]:
#not standardized
#oversampled
parameter_search_grid = {'n_neighbors':[30, 100, 500, 700, 1000],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#not standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[30, 100, 500, 700, 1000],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

In [None]:
#standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[30, 100, 500, 700, 1000],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm' : ['auto'],
                         'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
                         } 

hyperparameter_tuning(model = LOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X_std, test_y = test_y)

## CLOF

In [None]:
#oversampled
#standardized

parameter_search_grid = {'n_clusters':[15, 20, 25],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X_std, test_y = test_y)

In [None]:
#oversampled
#not standardized

parameter_search_grid = {'n_clusters':[15, 20, 25],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#not oversampled 
#not standardized

parameter_search_grid = {'n_clusters':[15, 20, 25],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

In [None]:
#standardized
#not oversampled

parameter_search_grid = {'n_clusters':[15, 20, 25],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'alpha': [0.8, 0.9, 1],
                         'beta': [15, 20, 25]
                         } 

hyperparameter_tuning(model = CBLOF(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X_std, test_y = test_y)

## KNN

In [None]:
#standardized
#oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X_std, test_y = test_y)

In [None]:
#not standardized
#oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#not standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

In [None]:
#standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 500],
                         'method': ['largest', 'mean', 'median'],
                         'contamination': [0.01, 0.03, 0.05, 0.07, 0.09],
                         'algorithm': ['auto']
                         } 

hyperparameter_tuning(model = KNN(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X_std, test_y = test_y)

## ABOD

In [None]:
#standardized
#oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 150, 200, 500],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X_std, test_y = test_y)

In [None]:
#Oversampled
parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 150, 200, 500],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         }  

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func , cross_fold = 5, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
#not standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 150, 200, 500],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

In [None]:
#standardized
#not oversampled

parameter_search_grid = {'n_neighbors':[5, 10, 20, 100, 150, 200, 500],
                         'contamination': [0.01, 0.03 ,0.05, 0.07, 0.1]                         
                         } 

hyperparameter_tuning(model = ABOD(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X_std, test_y = test_y)

## OCSVM

In [None]:
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X, train_y = train_y, test_X = test_X, test_y = test_y)

In [None]:
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_resampled, train_y = train_y_resampled, test_X = test_X, test_y = test_y)

In [None]:
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std_resampled, train_y = train_y_std_resampled, test_X = test_X_std, test_y = test_y)

In [None]:
parameter_search_grid = {'gamma': ['auto', 'scale', 0.001, 0.0001],
                         'kernel': ['rbf']
                         } 

hyperparameter_tuning(model = OCSVM(), parameter_dict = parameter_search_grid, score = score_func, 
                      train_X = train_X_std, train_y = train_y_std, test_X = test_X_std, test_y = test_y)