In [22]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 
from sklearn.ensemble import AdaBoostClassifier

In [23]:
data = pd.read_csv('Datasets/preprocessed_data.csv')
data.head()

Unnamed: 0,Duration,Src_bytes,Dst_bytes,Land,Wrong_fragment,Urgent,Hot,Num_failed_logins,Logged_in,Num_compromised,...,Service_telnet,Service_tim_i,Service_time,Service_urh_i,Service_urp_i,Service_uucp,Service_uucp_path,Service_vmnet,Service_whois,attack_category
0,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [25]:
X = data.drop(columns=['attack_category'])
y = data['attack_category']

In [44]:
def evaluate(model, X, y):
    skf = StratifiedKFold(n_splits=5)

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        model.fit(X_train, y_train)

        pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, pred)
        precision = precision_score(y_test, pred, average='macro')
        recall = recall_score(y_test, pred, average='macro') 
        f1 = f1_score(y_test, pred, average='macro') 

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    print('{0} Accuracy: {1: .5f} Precision: {2: .5f} Recall: {3: .5f} F1_Score: {4: .5f}'.format(
        model.__class__.__name__, 
        np.mean(accuracy_scores),
        np.mean(precision_scores),
        np.mean(recall_scores),
        np.mean(f1_scores)))
    
    return -np.mean(f1_scores)

In [45]:
adaboost_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
evaluate(adaboost_clf, X, y)

AdaBoostClassifier Accuracy:  0.74055 Precision:  0.66046 Recall:  0.60075 F1_Score:  0.58471


-0.5847127009488127

In [28]:
from hyperopt import hp 
from hyperopt import fmin, tpe, Trials 

def best_params(objective_func, search_space):
    trials = Trials() 

    best = fmin(fn=objective_func,
                space=search_space,
                algo=tpe.suggest,
                max_evals=50,
                trials=trials,
                rstate=np.random.default_rng(seed=30))

    print('Beat Parameters:', best)
    return best 

In [36]:
# optimize f1_score
adaboost_search_space = {
    'algorithm': hp.choice('algorithm', ['SAMME', 'SAMME.R']),
    'learning_rate': hp.uniform('learning_rate', 0.1, 1.0),
    'n_estimators': hp.quniform('n_estimators', 50, 500, 50)
}

def objective_func(search_space):
    adaboost_clf = AdaBoostClassifier(algorithm=search_space['algorithm'],
                            learning_rate=search_space['learning_rate'],
                            n_estimators=int(search_space['n_estimators']),
                            random_state=0)
    
    return evaluate(adaboost_clf, X, y)

adaboost_best = best_params(objective_func, adaboost_search_space)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

AdaBoostClassifier Accuracy:  0.96201 Precision:  0.64615 Recall:  0.56607 F1_Score:  0.57582
AdaBoostClassifier Accuracy:  0.64702 Precision:  0.61042 Recall:  0.61609 F1_Score:  0.54019
AdaBoostClassifier Accuracy:  0.97793 Precision:  0.81330 Recall:  0.73036 F1_Score:  0.76039
AdaBoostClassifier Accuracy:  0.81708 Precision:  0.68819 Recall:  0.73039 F1_Score:  0.67091
AdaBoostClassifier Accuracy:  0.97555 Precision:  0.81523 Recall:  0.73519 F1_Score:  0.76310
AdaBoostClassifier Accuracy:  0.97614 Precision:  0.81095 Recall:  0.71658 F1_Score:  0.74985
AdaBoostClassifier Accuracy:  0.54901 Precision:  0.38862 Recall:  0.47025 F1_Score:  0.38874
AdaBoostClassifier Accuracy:  0.85912 Precision:  0.81331 Recall:  0.73817 F1_Score:  0.75500
AdaBoostClassifier Accuracy:  0.56087 Precision:  0.49122 Recall:  0.53352 F1_Score:  0.46078
AdaBoostClassifier Accuracy:  0.97709 Precision:  0.86965 Recall:  0.76882 F1_Score:  0.80356
AdaBoostClassifier Accuracy:  0.95562 Precision:  0.83431 Re

In [46]:
# {'algorithm': 0, 'learning_rate': 0.5997669570977865, 'n_estimators': 300.0}
adaboost_clf = AdaBoostClassifier(algorithm='SAMME',
                            learning_rate=adaboost_best['learning_rate'],
                            n_estimators=int(adaboost_best['n_estimators']),
                            random_state=0)
evaluate(adaboost_clf, X, y)

AdaBoostClassifier Accuracy:  0.96527 Precision:  0.84660 Recall:  0.79599 F1_Score:  0.81523


-0.8152279287250644

In [41]:
pca_data = pd.read_csv('Datasets/PCA_train_data.csv')
pca_data.head()

Unnamed: 0,Land,Logged_in,Root_shell,Su_attempted,Is_hot_login,Is_guest_login,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,Flag_OTH,...,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14,attack_category
0,0,0,0,0,0,0,0.0,0.0,1.0,0.0,...,0.903497,-0.694587,0.17223,0.294604,0.150987,0.234315,-0.280996,-0.277381,-0.345195,0
1,0,0,0,0,0,0,0.0,1.0,0.0,0.0,...,0.15282,0.044606,0.091991,-0.028531,-0.008606,0.013408,-0.070227,-0.019272,0.013048,1
2,0,1,0,0,0,0,0.0,1.0,0.0,0.0,...,0.39152,-0.006278,0.234796,0.123361,0.141845,0.123492,-0.188316,0.021826,-0.916639,0
3,0,1,0,0,0,0,0.0,1.0,0.0,0.0,...,0.325415,0.044665,0.26645,0.126206,-0.024708,0.152279,-0.27695,0.050839,0.332695,0
4,0,0,0,0,0,0,0.0,1.0,0.0,0.0,...,-0.111212,0.0351,-0.160819,-0.337899,-0.085937,-0.275472,0.239464,0.06654,0.157528,1


In [42]:
pca_X = data.drop(columns=['attack_category'])
pca_y = data['attack_category']

In [47]:
adaboost_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
evaluate(adaboost_clf, pca_X, pca_y)

AdaBoostClassifier Accuracy:  0.74055 Precision:  0.66046 Recall:  0.60075 F1_Score:  0.58471


-0.5847127009488127

In [48]:
# optimize f1_score
adaboost_search_space = {
    'algorithm': hp.choice('algorithm', ['SAMME', 'SAMME.R']),
    'learning_rate': hp.uniform('learning_rate', 0.1, 1.0),
    'n_estimators': hp.quniform('n_estimators', 50, 500, 50)
}

def objective_func(search_space):
    adaboost_clf = AdaBoostClassifier(algorithm=search_space['algorithm'],
                            learning_rate=search_space['learning_rate'],
                            n_estimators=int(search_space['n_estimators']),
                            random_state=0)
    
    return evaluate(adaboost_clf, pca_X, pca_y)

adaboost_best = best_params(objective_func, adaboost_search_space)

AdaBoostClassifier Accuracy:  0.96201 Precision:  0.64615 Recall:  0.56607 F1_Score:  0.57582
AdaBoostClassifier Accuracy:  0.64702 Precision:  0.61042 Recall:  0.61609 F1_Score:  0.54019
AdaBoostClassifier Accuracy:  0.97793 Precision:  0.81330 Recall:  0.73036 F1_Score:  0.76039
AdaBoostClassifier Accuracy:  0.81708 Precision:  0.68819 Recall:  0.73039 F1_Score:  0.67091
AdaBoostClassifier Accuracy:  0.97555 Precision:  0.81523 Recall:  0.73519 F1_Score:  0.76310
AdaBoostClassifier Accuracy:  0.97614 Precision:  0.81095 Recall:  0.71658 F1_Score:  0.74985
AdaBoostClassifier Accuracy:  0.54901 Precision:  0.38862 Recall:  0.47025 F1_Score:  0.38874
AdaBoostClassifier Accuracy:  0.85912 Precision:  0.81331 Recall:  0.73817 F1_Score:  0.75500
AdaBoostClassifier Accuracy:  0.56087 Precision:  0.49122 Recall:  0.53352 F1_Score:  0.46078
AdaBoostClassifier Accuracy:  0.97709 Precision:  0.86965 Recall:  0.76882 F1_Score:  0.80356
AdaBoostClassifier Accuracy:  0.95562 Precision:  0.83431 Re

In [49]:
# {'algorithm': 0, 'learning_rate': 0.5997669570977865, 'n_estimators': 300.0}
adaboost_clf = AdaBoostClassifier(algorithm='SAMME',
                            learning_rate=adaboost_best['learning_rate'],
                            n_estimators=int(adaboost_best['n_estimators']),
                            random_state=0)
evaluate(adaboost_clf, pca_X, pca_y)

AdaBoostClassifier Accuracy:  0.96527 Precision:  0.84660 Recall:  0.79599 F1_Score:  0.81523


-0.8152279287250644