In [46]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 
from sklearn.ensemble import AdaBoostClassifier

In [12]:
train_dataset = pd.read_pickle('Datasets/train_dataset.pkl')
test_dataset = pd.read_pickle('Datasets/test_dataset.pkl')

In [25]:
remove_cols = ['Land',
 'Root_shell',
 'Su_attempted',
 'Is_hot_login',
 'Is_guest_login',
 'Flag_OTH',
 'Flag_RSTO',
 'Flag_RSTOS0',
 'Flag_S1',
 'Flag_S2',
 'Flag_S3',
 'Flag_SH',
 'Num_failed_logins_scaled',
 'Num_file_creations_scaled',
 'Num_access_files_scaled',
 'attack_type',
 'is_attack',
 'attack_category']

final_cols_no_pca = [col for col in train_dataset.columns if (col not in remove_cols) and ('PCA' not in col)]

final_cols_pca = ['Land', 'Logged_in', 'Root_shell', 'Su_attempted', 'Is_hot_login', 'Is_guest_login', 'Protocol_type_icmp',
       'Protocol_type_tcp', 'Protocol_type_udp', 'Flag_OTH', 'Flag_REJ',
       'Flag_RSTO', 'Flag_RSTOS0', 'Flag_RSTR', 'Flag_S0', 'Flag_S1',
       'Flag_S2', 'Flag_S3', 'Flag_SF', 'Flag_SH', 'Service_encoded'] + [('PCA' + str(i)) for i in range(1,14 + 1)]

only_pca = [('PCA' + str(i)) for i in range(1,14 + 1)]

In [36]:
X_train_no_pca = train_dataset[final_cols_no_pca]
y_train_no_pca_attack_cat = train_dataset['attack_category']

X_test_no_pca = test_dataset[final_cols_no_pca]
y_test_no_pca_attack_cat = test_dataset['attack_category']

In [15]:
X_train_pca = train_dataset[final_cols_pca]
y_train_pca_attack_cat = train_dataset['attack_category']

X_test_pca = test_dataset[final_cols_pca]
y_test_pca_attack_cat = test_dataset['attack_category']

In [29]:
X_train_only_pca = train_dataset[only_pca]
y_train_only_pca_attack_cat = train_dataset['attack_category']

X_test_only_pca = test_dataset[only_pca]
y_test_only_pca_attack_cat = test_dataset['attack_category']

In [24]:
X_train_pca

Unnamed: 0,Land,Logged_in,Root_shell,Su_attempted,Is_hot_login,Is_guest_login,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,Flag_OTH,...,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14
0,0,0,0,0,0,0,0.0,0.0,1.0,0.0,...,-3.474426,0.903497,-0.694587,0.172230,0.294604,0.150987,0.234315,-0.280996,-0.277381,-0.345195
1,0,0,0,0,0,0,0.0,1.0,0.0,0.0,...,0.267239,0.152820,0.044606,0.091991,-0.028531,-0.008606,0.013408,-0.070227,-0.019272,0.013048
2,0,1,0,0,0,0,0.0,1.0,0.0,0.0,...,0.935711,0.391520,-0.006278,0.234796,0.123361,0.141845,0.123492,-0.188316,0.021826,-0.916639
3,0,1,0,0,0,0,0.0,1.0,0.0,0.0,...,0.629514,0.325415,0.044665,0.266450,0.126206,-0.024708,0.152279,-0.276950,0.050839,0.332695
4,0,0,0,0,0,0,0.0,1.0,0.0,0.0,...,1.253406,-0.111212,0.035100,-0.160819,-0.337899,-0.085937,-0.275472,0.239464,0.066540,0.157528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25186,0,0,0,0,0,0,0.0,1.0,0.0,0.0,...,1.241572,0.092561,-0.035499,-0.147807,-0.345770,-0.083090,-0.266670,0.250470,0.079154,0.174025
25187,0,1,0,0,0,0,0.0,1.0,0.0,0.0,...,-1.111415,-0.746458,0.168355,-0.085463,-0.098003,0.090475,-0.120282,-0.030141,-0.356743,-1.728345
25188,0,0,0,0,0,0,0.0,1.0,0.0,0.0,...,1.242395,0.076610,-0.042149,-0.140812,-0.331942,-0.080067,-0.257339,0.240103,0.085038,0.166029
25189,0,0,0,0,0,0,0.0,1.0,0.0,0.0,...,0.208735,0.023409,0.117842,0.070309,-0.050357,-0.015558,-0.009259,-0.066782,-0.051849,0.004690


In [59]:
def evaluate(model, X_train, y_train, X_test, y_test):
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)
    
    report = classification_report(y_test, y_pred)
    print("Classification Report:\n", report)
    
    return -f1

AdaBoost 

In [48]:
# Without PCA 
adaboost_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
evaluate(adaboost_clf, X_train_no_pca, y_train_no_pca_attack_cat, X_test_no_pca, y_test_no_pca_attack_cat)

Accuracy: 49.93%
Precision: 54.60%
Recall: 49.93%
F1 Score: 46.49%
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.61      0.58      5890
           1       0.42      0.53      0.47      2629
           2       0.42      0.71      0.53      1097
           3       0.73      0.06      0.12      2199
           4       0.11      0.09      0.10        35

    accuracy                           0.50     11850
   macro avg       0.45      0.40      0.36     11850
weighted avg       0.55      0.50      0.46     11850



-0.46489738415833054

In [49]:
# PCA + Other variables 
adaboost_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
evaluate(adaboost_clf, X_train_pca, y_train_pca_attack_cat, X_test_pca, y_test_pca_attack_cat)

Accuracy: 38.73%
Precision: 38.86%
Recall: 38.73%
F1 Score: 35.49%
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.44      0.47      5890
           1       0.35      0.42      0.38      2629
           2       0.25      0.83      0.39      1097
           3       0.19      0.00      0.00      2199
           4       0.00      0.00      0.00        35

    accuracy                           0.39     11850
   macro avg       0.26      0.34      0.25     11850
weighted avg       0.39      0.39      0.35     11850



-0.3549159403952657

In [50]:
# Only PCA 
adaboost_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
evaluate(adaboost_clf, X_train_only_pca, y_train_only_pca_attack_cat, X_test_only_pca, y_test_only_pca_attack_cat)

Accuracy: 50.07%
Precision: 56.54%
Recall: 50.07%
F1 Score: 47.07%
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.58      0.55      5890
           1       0.48      0.54      0.51      2629
           2       0.40      0.79      0.54      1097
           3       0.85      0.10      0.18      2199
           4       0.19      0.09      0.12        35

    accuracy                           0.50     11850
   macro avg       0.49      0.42      0.38     11850
weighted avg       0.57      0.50      0.47     11850



-0.4707079303484721

In [18]:
from hyperopt import hp 
from hyperopt import fmin, tpe, Trials 

def best_params(objective_func, search_space):
    trials = Trials() 

    best = fmin(fn=objective_func,
                space=search_space,
                algo=tpe.suggest,
                max_evals=50,
                trials=trials,
                rstate=np.random.default_rng(seed=30))

    print('Beat Parameters:', best)
    return best 

In [61]:
# optimize f1_score
adaboost_search_space = {
    'algorithm': hp.choice('algorithm', ['SAMME', 'SAMME.R']),
    'learning_rate': hp.uniform('learning_rate', 0.1, 1.0),
    'n_estimators': hp.quniform('n_estimators', 50, 500, 50)
}

def objective_func(search_space):
    adaboost_clf = AdaBoostClassifier(algorithm=search_space['algorithm'],
                            learning_rate=search_space['learning_rate'],
                            n_estimators=int(search_space['n_estimators']),
                            random_state=0)
    
    return evaluate(adaboost_clf, X_train_pca, y_train_pca_attack_cat, X_test_pca, y_test_pca_attack_cat)

adaboost_best = best_params(objective_func, adaboost_search_space)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

Accuracy:                                             
0.5517299578059072                                    
Precision:                                            
0.637974607600486                                     
Recall:                                               
0.5517299578059072                                    
F1 Score:                                             
0.49655604097289124                                   
Classification Report:                                

              precision    recall  f1-score   support 

           0       0.55      0.68      0.61      5890
           1       0.64      0.72      0.68      2629
           2       0.40      0.60      0.48      1097
           3       1.00      0.00      0.00      2199
           4       0.00      0.00      0.00        35

    accuracy                           0.55     11850
   macro avg       0.52      0.40      0.35     11850
weighted avg       0.64      0.55      0.50     11850

Accuracy:     

In [55]:
adaboost_best = {'algorithm': 0, 'learning_rate': 0.6760232404850841, 'n_estimators': 150.0}
adaboost_clf = AdaBoostClassifier(algorithm='SAMME',
                            learning_rate=adaboost_best['learning_rate'],
                            n_estimators=int(adaboost_best['n_estimators']),
                            random_state=0)
evaluate(adaboost_clf,  X_train_no_pca, y_train_no_pca_attack_cat, X_test_no_pca, y_test_no_pca_attack_cat)

Accuracy: 68.10%
Precision: 62.28%
Recall: 68.10%
F1 Score: 61.24%
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.80      0.72      5890
           1       0.82      0.86      0.84      2629
           2       0.57      1.00      0.73      1097
           3       0.33      0.00      0.00      2199
           4       0.00      0.00      0.00        35

    accuracy                           0.68     11850
   macro avg       0.48      0.53      0.46     11850
weighted avg       0.62      0.68      0.61     11850



-0.6124088948399126