<a href="https://colab.research.google.com/github/MSaber9/2021Project3/blob/main/6_binary_classifier_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Library

In [None]:
# Mount Google Drive into the Colab VM; the project, data and models used below all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the project folder the working directory so the local `mlids` package and relative paths (e.g. 'models/...') resolve.
%cd /content/drive/My Drive/Colab Notebooks/Newi/2021Project3/

/content/drive/My Drive/Colab Notebooks/Newi/2021Project3


In [None]:
# Sanity check: list the project folder to confirm the working directory and the available notebooks/data folders.
!ls

01_data_cleanup.ipynb	04_dl_anomaly_detection.ipynb  Dataupdate  models5
02_data_analysis.ipynb	05_dl_classifier.ipynb	       mlids	   processed
03_ml_classifier.ipynb	Data			       models4	   tmp


In [None]:
pip install catboost



In [None]:
pip install hyperopt



In [None]:
pip install "hyperopt==0.2.3"



In [None]:
import gc
import os
import uuid

import joblib as jl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from catboost import CatBoostClassifier, Pool
from hyperopt import fmin, hp, tpe, atpe, Trials, STATUS_OK
from hyperopt.plotting import main_plot_history, main_plot_vars
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, classification_report
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import models


# Single seed reused throughout the notebook: it seeds TensorFlow and NumPy here and is
# passed to transform_data, RandomForestClassifier and CatBoostClassifier further down.
rand_state = 42
tf.random.set_seed(rand_state)
np.random.seed(rand_state)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Automatically reload edited project modules (e.g. `mlids`) so code changes are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Sanity check: list the local `mlids` package whose modules are imported in the next cells.
!ls '/content/drive/My Drive/Colab Notebooks/Newi/2021Project3/mlids'

conf.py      keras   model_selection.py  tf_utils.py  utils3.py
data	     libs    prediction.py	 transform    utils.py
__init__.py  models  __pycache__	 utils2.py    visualization.py


In [None]:
import mlids.keras.prediction as kp
from mlids.data.dataset import load_dataset
from mlids.data.metadata import FEATURES_NO_VARIANCE
from mlids.visualization import plot_hist, print_binary_performance, plot_pr_curve, plot_pr_curves, plot_pr_threshold_curves
from mlids.keras.metrics import AveragePrecisionScoreMetric
from mlids.model_selection import best_precision_for_target_recall
from mlids.prediction import predict_proba_positive, predict_decision_boundary
from mlids.tf_utils import enable_gpu_memory_growth


In [None]:
# Repeated directory listing (sanity check only; nothing below depends on its output).
!ls

01_data_cleanup.ipynb	04_dl_anomaly_detection.ipynb  Dataupdate  models5
02_data_analysis.ipynb	05_dl_classifier.ipynb	       mlids	   processed
03_ml_classifier.ipynb	Data			       models4	   tmp


In [None]:
from mlids.utils3 import transform_data, get_best_model_path, print_trial_results



In [None]:
# Folder that load_dataset reads from. NOTE(review): absolute Drive path — it must match the mount point and project folder used above.
dataset_base_path = '/content/drive/My Drive/Colab Notebooks/Newi/2021Project3/Dataupdate'

## 1. Data Loading & Preparation



In [None]:
# Load the dataset, dropping the zero-variance features plus 'timestamp', 'dst_port' and
# 'protocol'; the two initial-window-size columns are passed as `preserve_neg_value_cols`.
dataset = load_dataset(dataset_base_path,
                       omit_cols=FEATURES_NO_VARIANCE + ['timestamp', 'dst_port', 'protocol'],
                       preserve_neg_value_cols=['init_fwd_win_byts', 'init_bwd_win_byts'])

# Split into train/validation/test, impute with the median and standardize.
# NOTE(review): the printed class counts suggest `attack_samples=100000` upsamples rare
# attack classes in the training split to 100,000 samples each — confirm in mlids.utils3.
X_train, y_train, X_val, y_val, X_test, y_test, column_names = transform_data(dataset=dataset,
                                                                              imputer_strategy='median',
                                                                              scaler=StandardScaler,
                                                                              attack_samples=100000,
                                                                              random_state=rand_state)

# The raw frame is no longer needed: drop the reference and force a collection so the
# memory is released before the memory-hungry training cells run.
del dataset
gc.collect()

Samples:
Training: (12986354, 68)
Val:      (1623294, 68)
Test:     (1623295, 68)

Training labels:
Benign                      10787766
DDOS attack-HOIC              548809
DDoS attacks-LOIC-HTTP        460953
DoS attacks-Hulk              369530
Bot                           228953
FTP-BruteForce                154688
SSH-Bruteforce                150071
Infilteration                 129547
DoS attacks-SlowHTTPTest      111912
DoS attacks-GoldenEye          33206
DoS attacks-Slowloris           8792
DDOS attack-LOIC-UDP            1384
Brute Force -Web                 489
Brute Force -XSS                 184
SQL Injection                     70
Name: label, dtype: int64

Validation labels:
Benign                      1348471
DDOS attack-HOIC              68601
DDoS attacks-LOIC-HTTP        57619
DoS attacks-Hulk              46191
Bot                           28619
FTP-BruteForce                19336
SSH-Bruteforce                18759
Infilteration                 16193
DoS attacks



Samples:
Training: (13542229, 68)

Training labels:
Counter({0: 10787766, 8: 548809, 10: 460953, 5: 369530, 1: 228953, 11: 154688, 14: 150071, 12: 129547, 6: 111912, 2: 100000, 4: 100000, 7: 100000, 9: 100000, 13: 100000, 3: 100000})


## 2. Class Weight Calculation



In [None]:
# Collapse the multi-class training labels into a binary target: 0 = benign, 1 = any attack.
y_train_is_attack = (y_train != 0).astype('int')

# Weight of the attack class = (#benign samples) / (#attack samples).
minority_class_weight = int((y_train_is_attack == 0).sum()) / int((y_train_is_attack == 1).sum())

# CatBoost expects a list indexed by class id ...
class_weights_catboost = [1, minority_class_weight]

# ... while the dict form maps class id -> weight.
class_weights = dict(enumerate(class_weights_catboost))

## 3. Random Forest Model + Hyperparameter Search-Space

 

In [None]:
def train_random_forest(args):
    """Hyperopt objective: train one random forest and score it on the validation set.

    Parameters
    ----------
    args : dict
        One sample from the search space with the keys 'nr_estimators',
        'criterion', 'max_depth' (a dict holding 'depth', which may be None),
        'max_features', 'min_samples_split' and 'min_samples_leaf'.
        Quantized values arrive as floats and are cast to int here.

    Returns
    -------
    dict
        'loss' (negative average-precision score, since hyperopt minimizes),
        'status' (STATUS_OK) and 'model_path' (where the fitted model was saved).

    Notes
    -----
    Reads the notebook globals X_train, y_train_is_attack, X_val, y_val and
    rand_state, and writes the fitted model to disk with joblib.
    """
    print('\nRun')
    print('==========')
    print('Parameters:\n{}'.format(args))

    nr_estimators = int(args['nr_estimators'])
    criterion = args['criterion']
    # A falsy 'depth' (None) means "grow trees without a depth limit".
    max_depth = int(args['max_depth']['depth']) if args['max_depth']['depth'] else None
    max_features = args['max_features']
    min_samples_split = int(args['min_samples_split'])
    min_samples_leaf = int(args['min_samples_leaf'])

    # A fresh UUID per trial keeps every trained model on disk.
    model_path = 'models/rf_{}.joblib'.format(uuid.uuid4())
    # Create the output directory before training, so a missing folder cannot
    # make jl.dump fail after the (expensive) fit has already completed.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    clf = RandomForestClassifier(n_estimators=nr_estimators,
                                 criterion=criterion,
                                 max_depth=max_depth,
                                 max_features=max_features,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 class_weight='balanced',
                                 n_jobs=-1,
                                 verbose=1,
                                 random_state=rand_state)

    clf.fit(X_train, y_train_is_attack)

    # Score on the validation set using the predicted probability of the positive class.
    pred = predict_proba_positive(clf, X_val)
    pr_score = average_precision_score(y_val.label_is_attack, pred)

    jl.dump(clf, model_path)

    print('PR Score: {}'.format(pr_score))

    return {
        'loss': -pr_score,
        'status': STATUS_OK,
        'model_path': model_path
    }

In [None]:
# Collects the result dict of every evaluated configuration (loss, status, model_path).
trials_rf = Trials()

# Random forest search space. 'max_depth' is a nested choice: either unlimited depth
# (None) or a quantized depth between 10 and 100.
space = {
    'nr_estimators': hp.quniform('nr_estimators', 10, 100, 1),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    'max_depth': hp.choice('max_depth', [
        {
            'depth': None
        },
        {
            'depth': hp.quniform('nr_max_depth', 10, 100, 1),
        }
    ]),
    'max_features': hp.choice('max_features', ['sqrt', 'log2']),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1)
}

# Seed the TPE search explicitly: without `rstate`, fmin draws its own random state and
# the sequence of sampled configurations differs on every run.
# hyperopt 0.2.3 (the version pinned above) expects a np.random.RandomState here;
# newer hyperopt releases expect np.random.default_rng(rand_state) instead.
best_run_rf = fmin(fn=train_random_forest,
                   space=space,
                   algo=tpe.suggest,
                   max_evals=50,
                   trials=trials_rf,
                   rstate=np.random.RandomState(rand_state))


Run
Parameters:
{'criterion': 'entropy', 'max_depth': {'depth': None}, 'max_features': 'log2', 'min_samples_leaf': 2.0, 'min_samples_split': 7.0, 'nr_estimators': 88.0}
  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 40 concurrent workers.



### Hyperparameter Search Results for RF



In [None]:
# Loss per trial over the course of the random forest search.
plt.figure(figsize=(20, 5))
main_plot_history(trials_rf)

# Loss against each hyperparameter, laid out in 3 columns with the 10 best trials highlighted.
plt.figure(figsize=(20, 20))
main_plot_vars(trials_rf, columns=3, colorize_best=10, arrange_by_loss=False)

### Optimal Model Parameters for RF

Looking at the best performing model, we can observe the following parameter configuration:
* `80` tree estimators,
* the `entropy` information gain criterion,
* a maximum depth of `32` nodes per tree,
* `sqrt` as the selector for the maximum number of features considered when splitting a node,
* a minimum of `3` samples per leaf node and
* a minimum of `4` samples necessary to split a node.

In [None]:
# Summarize the random forest search (helper from mlids.utils3) using the trials and the best run returned by fmin.
print_trial_results(trials_rf, best_run_rf)

### Performance for RF

On the validation set, the best random forest model achieves a PR score of `0.9810`, with a precision of `0.967` and a recall of `0.955` with respect to the positive class.


In [None]:
# Reload the persisted random forest from the path returned by get_best_model_path(trials_rf)
# (presumably the 'model_path' of the lowest-loss trial — confirm in mlids.utils3).
# NOTE: joblib files are pickles; only load files produced by this project.
clf_rf = jl.load(get_best_model_path(trials_rf))

In [None]:
# Positive-class scores of the random forest on the validation set, plotted as a precision-recall curve.
pred_val_proba_rf = predict_proba_positive(clf_rf, X_val)
plot_pr_curve(y_val.label_is_attack.values, pred_val_proba_rf)

In [None]:
# Hard class predictions from clf_rf.predict on the validation set, summarized against the binary ground truth.
pred_val_rf = clf_rf.predict(X_val)
print_binary_performance(y_val, y_val.label_is_attack, pred_val_rf)

## 4. Gradient Boosted Tree Model + Hyperparameter Search-Space

In [None]:
# Wrap the training and validation data in CatBoost Pool objects once; every trial in
# train_gradient_boost reuses them.
train_pool = Pool(X_train, y_train_is_attack)
val_pool = Pool(X_val, y_val.label_is_attack)

# Divisor for the CatBoost logging interval: train_gradient_boost passes
# verbose = nr_iterations // NR_LOGS.
NR_LOGS = 10

In [None]:
def train_gradient_boost(args):
    """Hyperopt objective: train one CatBoost classifier and score it on the validation set.

    Parameters
    ----------
    args : dict
        One sample from the search space with the keys 'nr_iterations',
        'tree_depth', 'l2_reg', 'border_count' and 'random_strength'.
        Quantized values arrive as floats and are cast to int here.

    Returns
    -------
    dict
        'loss' (negative average-precision score, since hyperopt minimizes),
        'status' (STATUS_OK) and 'model_path' (where the fitted model was saved).

    Notes
    -----
    Reads the notebook globals train_pool, val_pool, y_val,
    class_weights_catboost, NR_LOGS and rand_state, trains with
    task_type='GPU' and writes the fitted model to disk.
    """
    print('\nRun')
    print('==========')
    print('Parameters:\n{}'.format(args))

    nr_iterations = int(args['nr_iterations'])
    tree_depth = int(args['tree_depth'])
    l2_reg = args['l2_reg']
    border_count = int(args['border_count'])
    random_strength = int(args['random_strength'])

    # A fresh UUID per trial keeps every trained model on disk.
    model_path = 'models/gb_{}.catboost'.format(uuid.uuid4())
    # Create the output directory before training, so a missing folder cannot
    # make save_model fail after the (expensive) fit has already completed.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    clf = CatBoostClassifier(loss_function='Logloss',
                             iterations=nr_iterations,
                             depth=tree_depth,
                             l2_leaf_reg=l2_reg,
                             border_count=border_count,
                             random_strength=random_strength,
                             task_type='GPU',
                             class_weights=class_weights_catboost,
                             # Log every (nr_iterations // NR_LOGS) iterations.
                             verbose=(nr_iterations // NR_LOGS),
                             random_seed=rand_state)

    clf.fit(train_pool, eval_set=val_pool)

    # Score on the validation set using the predicted probability of the positive class.
    pred = predict_proba_positive(clf, val_pool)
    pr_score = average_precision_score(y_val.label_is_attack, pred)

    clf.save_model(model_path)

    print('PR Score: {}'.format(pr_score))

    return {
        'loss': -pr_score,
        'status': STATUS_OK,
        'model_path': model_path
    }

In [None]:
# Collects the result dict of every evaluated configuration (loss, status, model_path).
trials_gb = Trials()

# Gradient boosting search space; quantized values are cast to int inside train_gradient_boost.
space = {
    'nr_iterations': hp.quniform('nr_iterations', 100, 2000, 100),
    'tree_depth': hp.quniform('tree_depth', 4, 10, 1),
    'l2_reg': hp.uniform('l2_reg', 1, 10),
    'border_count': hp.choice('border_count', [128, 254]),
    'random_strength': hp.quniform('random_strength', 0, 5, 1)
}

# Seed the TPE search explicitly: without `rstate`, fmin draws its own random state and
# the sequence of sampled configurations differs on every run.
# hyperopt 0.2.3 (the version pinned above) expects a np.random.RandomState here;
# newer hyperopt releases expect np.random.default_rng(rand_state) instead.
best_run_gb = fmin(fn=train_gradient_boost,
                   space=space,
                   algo=tpe.suggest,
                   max_evals=100,
                   trials=trials_gb,
                   rstate=np.random.RandomState(rand_state))

### Hyperparameter Search Results for GBT


In [None]:
# Loss per trial over the course of the gradient boosting search.
plt.figure(figsize=(20, 5))
main_plot_history(trials_gb)

# Loss against each hyperparameter, laid out in 3 columns with the 10 best trials highlighted.
plt.figure(figsize=(20, 20))
main_plot_vars(trials_gb, columns=3, colorize_best=10, arrange_by_loss=False)

### Optimal Model Parameters for GBT

The optimal parameter configuration obtained via the best performing model is as follows:
* usage of `1900` trees, 
* a maximum depth of `10` per tree,
* an L2 regularization coefficient of `4.8139`,
* a border count of `254` and
* a random strength parameter of `5`.

In [None]:
# Summarize the gradient boosting search (helper from mlids.utils3) using the trials and the best run returned by fmin.
print_trial_results(trials_gb, best_run_gb)

### Performance for GBT

On the validation set, the best gradient boosted tree model achieves a PR score of `0.9826`,
with a precision of `0.964` and a recall of `0.957` for the positive class.


In [None]:
# CatBoost models are restored by creating an empty classifier and loading the saved file into it.
# The path comes from get_best_model_path(trials_gb) (presumably the 'model_path' of the
# lowest-loss trial — confirm in mlids.utils3).
clf_gb = CatBoostClassifier()
clf_gb.load_model(get_best_model_path(trials_gb))

In [None]:
# Positive-class scores of the gradient boosted model on the validation pool, plotted as a precision-recall curve.
pred_val_proba_gb = predict_proba_positive(clf_gb, val_pool)
plot_pr_curve(y_val.label_is_attack.values, pred_val_proba_gb)

In [None]:
# Hard class predictions from clf_gb.predict on the validation set, summarized against the binary ground truth.
pred_val_gb = clf_gb.predict(X_val)
print_binary_performance(y_val, y_val.label_is_attack, pred_val_gb)

## 5. Deep Neural Network Model

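The deep neural network is not retrained here; the model trained in a previous [experiment](https://github.com/MSaber9/2021Project3/blob/main/05_dl_classifier.ipynb) is loaded from disk and evaluated on the same validation set.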

In [None]:
# Repeated directory listing (sanity check only; nothing below depends on its output).
!ls

01_data_cleanup.ipynb	04_dl_anomaly_detection.ipynb  Dataupdate  models5
02_data_analysis.ipynb	05_dl_classifier.ipynb	       mlids	   processed
03_ml_classifier.ipynb	Data			       models4	   tmp


In [None]:
# Sanity check: list the saved Keras models; 'opt_model.h5' is the file loaded in the next cell.
!ls '/content/drive/My Drive/Colab Notebooks/Newi/2021Project3/models4/'

denoising_autoencoder_model.h5	simple_autoencoder_model.h5
opt_model.h5			stacked_autoencoder_model.h5


In [None]:
# Load the previously trained Keras model from Drive.
# NOTE(review): absolute Drive path — it must match the mount point and project folder used above.
clf_dl = models.load_model('/content/drive/My Drive/Colab Notebooks/Newi/2021Project3/models4/opt_model.h5')

In [None]:
# Scores of the neural network on the validation set (via the mlids.keras.prediction helper), plotted as a precision-recall curve.
pred_val_proba_dl = kp.predict_proba(clf_dl, X_val)
plot_pr_curve(y_val.label_is_attack.values, pred_val_proba_dl)

In [None]:
# Class predictions of the neural network on the validation set (via the mlids.keras.prediction helper),
# summarized against the binary ground truth.
pred_val_dl = kp.predict(clf_dl, X_val)
print_binary_performance(y_val, y_val.label_is_attack, pred_val_dl)

## Model Comparison

In [None]:
# Overlay the validation precision-recall curves of all three models; the dict maps the
# display name of each model to its positive-class scores computed in the cells above.
plot_pr_curves(y_val.label_is_attack, 
               {
                   'Random Forest': pred_val_proba_rf,
                   'Gradient Boost': pred_val_proba_gb,
                   'Neural Network': pred_val_proba_dl
               }, 
               size=(20, 8))


|Model|PR Score|Precision Positive|Recall Positive|False-Positives|False-Negatives|
|---|---|---|---|---|---|
|Random Forest         |0.98102|**0.967**|0.955|**8820**|12322|
|Gradient Boosted Trees|**0.98266**|0.964|**0.957**|9784|**11748**|
|Deep Neural Network   |0.97816|**0.967**|0.954|8966|12629|

> We use the Gradient Boosted Tree model going forward, as it achieves the best overall performance.

## Precision / Recall Decision Boundary


In [None]:
# Plot precision and recall of the gradient boosted model against the decision threshold (validation set).
plot_pr_threshold_curves(y_val.label_is_attack, pred_val_proba_gb)

In [None]:
# Target recalls 0.96, 0.97, 0.98, 0.99 and 1.00.
# np.linspace with an explicit endpoint replaces np.arange(0.96, 1, 0.01): with a float
# step, arange only produced the final 1.0 through floating-point rounding, which is
# fragile. Rounding to 2 decimals keeps the printed targets clean.
recalls = np.round(np.linspace(0.96, 1.0, 5), 2)

pred_val_proba_gb = predict_proba_positive(clf_gb, X_val)

for recall in recalls:
    # Decision boundary returned by the project helper for this target recall.
    b = best_precision_for_target_recall(y_val.label_is_attack, pred_val_proba_gb, target_recall=recall)
    # NOTE: this overwrites the default-threshold `pred_val_gb` computed earlier.
    pred_val_gb = predict_decision_boundary(clf_gb, X_val, b)
    print('Target Recall of {}'.format(recall))
    print('=====================')
    print('Decision Boundary: {}\n'.format(b))
    print(classification_report(y_val.label_is_attack, pred_val_gb, digits=3))
    print('')



|Decision Boundary|Precision Positive|Recall Positive|F1 (macro)|
|---|---|---|---|
|0.5 (default)|0.964|0.957|0.976|
|0.3556       |0.952|0.960|0.974|
|0.0786       |0.774|0.970|0.914|
|0.0539       |0.543|0.980|0.803|
|0.0420       |0.380|0.990|0.675|
|0.0007       |0.201|1.000|0.327|


## Performance on Test Set

In [None]:
# Final evaluation of the selected gradient boosted model on the held-out test split.
# Predictions come from clf_gb.predict, i.e. none of the custom decision boundaries explored above is applied.
pred_test_gb = clf_gb.predict(X_test)
print_binary_performance(y_test, y_test.label_is_attack, pred_test_gb)