In [11]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

<h3>Prepare Dataset</h3>

In [12]:
# Read the train/test CSVs used in the "Original Dataset" section below.
# Paths are relative, so they resolve against the kernel's working directory.
train_dataset, test_dataset = map(
    pd.read_csv,
    ('../Datasets/train_dataset.csv', '../Datasets/test_dataset.csv'),
)

In [13]:
# Features: every column except 'is_attack', 'attack_category' and 'attack'.
# (Presumably all three are label encodings of the same record - confirm
# against the dataset description.)
X_train, X_test = (
    frame.drop(columns=['is_attack' , 'attack_category' , 'attack'])
    for frame in (train_dataset, test_dataset)
)

# Target: the 'attack_category' column.
y_train, y_test = train_dataset['attack_category'], test_dataset['attack_category']

In [2]:
# Read the train/test CSVs used in the "PCA Dataset" section below.
# Paths are relative, so they resolve against the kernel's working directory.
train_pca_dataset, test_pca_dataset = map(
    pd.read_csv,
    ('../Datasets/PCA_train_data.csv', '../Datasets/PCA_test_data.csv'),
)

In [3]:
# Features: every column except 'is_attack', 'attack_category' and 'attack'.
X_pca_train, X_pca_test = (
    frame.drop(columns=['is_attack' , 'attack_category' , 'attack'])
    for frame in (train_pca_dataset, test_pca_dataset)
)

# Target: the 'attack_category' column.
y_pca_train, y_pca_test = (
    train_pca_dataset['attack_category'],
    test_pca_dataset['attack_category'],
)

In [20]:
# Read the train/test CSVs used in the "Tree Dataset" section below.
# Paths are relative, so they resolve against the kernel's working directory.
train_tree_dataset, test_tree_dataset = map(
    pd.read_csv,
    ('../Datasets/Tree_train_data.csv', '../Datasets/Tree_test_data.csv'),
)

In [21]:
# Features: every column except 'is_attack', 'attack_category' and 'attack'.
X_tree_train, X_tree_test = (
    frame.drop(columns=['is_attack' , 'attack_category' , 'attack'])
    for frame in (train_tree_dataset, test_tree_dataset)
)

# Target: the 'attack_category' column.
y_tree_train, y_tree_test = (
    train_tree_dataset['attack_category'],
    test_tree_dataset['attack_category'],
)

In [14]:
def train_model(model, X, y, n_splits=5, verbose=False):
    """Cross-validate ``model`` and return the negated mean weighted F1 score.

    The data is split with ``StratifiedKFold``. On every fold the model is
    fitted on the training part and scored on the held-out part. The mean F1
    is negated so the result can be used directly as a loss to minimise.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        The same object is re-fitted in place on every fold.
    X : feature table supporting positional ``.iloc`` indexing
    y : target labels supporting positional ``.iloc`` indexing
    n_splits : int, default 5
        Number of cross-validation folds.
    verbose : bool, default False
        When True, print the mean accuracy / precision / recall / F1 across
        folds on one line.

    Returns
    -------
    float
        ``-mean(weighted F1 over folds)``.
    """
    skf = StratifiedKFold(n_splits=n_splits)

    fold_scores = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

    for fit_index, val_index in skf.split(X, y):
        # Fold-local names, deliberately distinct from the notebook-level
        # X_train / X_test so the held-out test split is never confused with
        # a validation fold.
        X_fit, y_fit = X.iloc[fit_index], y.iloc[fit_index]
        X_val, y_val = X.iloc[val_index], y.iloc[val_index]

        model.fit(X_fit, y_fit)
        pred = model.predict(X_val)

        fold_scores['accuracy'].append(accuracy_score(y_val, pred))
        fold_scores['precision'].append(precision_score(y_val, pred, average='weighted'))
        fold_scores['recall'].append(recall_score(y_val, pred, average='weighted'))
        fold_scores['f1'].append(f1_score(y_val, pred, average='weighted'))

    mean_scores = {name: np.mean(values) for name, values in fold_scores.items()}

    if verbose:
        print('{0} Accuracy: {1: .5f} Precision: {2: .5f} Recall: {3: .5f} F1_Score: {4: .5f}'.format(
            model.__class__.__name__,
            mean_scores['accuracy'],
            mean_scores['precision'],
            mean_scores['recall'],
            mean_scores['f1']))

    return -mean_scores['f1']

In [15]:
def evaluate(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and report metrics on the test split.

    Prints the model's class name, accuracy, and weighted precision / recall /
    F1, followed by a separator line.

    Returns
    -------
    float
        The negated weighted F1 score on the test split.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # All metrics are computed before anything is printed.
    weighted = {'average': 'weighted'}
    f1 = f1_score(y_test, predictions, **weighted)
    report_lines = [
        ("Model: ", model.__class__.__name__),
        ("Accuracy: ", accuracy_score(y_test, predictions)),
        ("Precision: ", precision_score(y_test, predictions, **weighted)),
        ("Recall: ", recall_score(y_test, predictions, **weighted)),
        ("F1 Score: ", f1),
    ]

    for label, value in report_lines:
        print(label, value)
    print("####################################")

    return -f1

In [6]:
from hyperopt import hp 
from hyperopt import fmin, tpe, Trials 

def best_params(objective_func, search_space, max_evals=20, seed=30):
    """Minimise ``objective_func`` over ``search_space`` with hyperopt's TPE.

    Parameters
    ----------
    objective_func : callable
        Receives one sampled point of ``search_space`` and returns the loss
        to minimise.
    search_space : hyperopt space definition (e.g. a dict of ``hp.*`` nodes)
    max_evals : int, default 20
        Number of trials to run.
    seed : int, default 30
        Seed for the NumPy generator handed to ``fmin``.

    Returns
    -------
    dict
        The best point exactly as returned by ``fmin``. As the recorded runs
        in this notebook show, an ``hp.choice`` entry is reported as the
        *index* of the selected option (e.g. ``'algorithm': 0``) and
        ``hp.quniform`` values come back as floats (e.g. ``200.0``), so
        callers must translate them before building a model.
    """
    trials = Trials()  # per-trial history for fmin; not returned to the caller

    best = fmin(fn=objective_func,
                space=search_space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials,
                rstate=np.random.default_rng(seed=seed))

    print('Best Parameters:', best)
    return best

<h3>Original Dataset</h3>

In [17]:
# Hyperopt search space for AdaBoost:
#   algorithm     - one of the two listed options
#   learning_rate - uniform over [0.1, 1.0]
#   n_estimators  - multiples of 50 between 50 and 300 (sampled as a float)
adaboost_search_space = dict(
    algorithm=hp.choice('algorithm', ['SAMME', 'SAMME.R']),
    learning_rate=hp.uniform('learning_rate', 0.1, 1.0),
    n_estimators=hp.quniform('n_estimators', 50, 300, 50),
)


def objective_func(search_space):
    """Loss for one hyperopt trial on the original training data.

    ``search_space`` here is the hyperparameter dict sampled for this trial,
    not the space definition. The value returned is whatever ``train_model``
    returns for an AdaBoost classifier built from those hyperparameters.
    """
    hyperparams = {
        'algorithm': search_space['algorithm'],
        'learning_rate': search_space['learning_rate'],
        # The sampled value is a float, so cast before passing it on.
        'n_estimators': int(search_space['n_estimators']),
        'random_state': 0,
    }
    return train_model(AdaBoostClassifier(**hyperparams), X_train, y_train)


adaboost_best = best_params(objective_func, adaboost_search_space)

100%|██████████| 20/20 [22:11<00:00, 66.56s/trial, best loss: -0.9513468978164757] 
Beat Parameters: {'algorithm': 0, 'learning_rate': 0.8166140904941112, 'n_estimators': 200.0}


In [18]:
# Best point reported by the hyperopt search on the original dataset, pinned
# here so this cell can run without repeating the search. The 'algorithm'
# value is an index into the hp.choice option list, not the option itself.
adaboost_best = {'algorithm': 0, 'learning_rate': 0.8166140904941112, 'n_estimators': 200.0}

# Must list the options in the same order as hp.choice('algorithm', ...) in
# the search space, so the index above resolves to the algorithm that was
# actually selected instead of a separately hard-coded name.
algorithm_options = ['SAMME', 'SAMME.R']

adaboost_clf = AdaBoostClassifier(algorithm=algorithm_options[adaboost_best['algorithm']],
                            learning_rate=adaboost_best['learning_rate'],
                            n_estimators=int(adaboost_best['n_estimators']),
                            random_state=0)
evaluate(adaboost_clf,  X_train, y_train, X_test, y_test)
    

Model:  AdaBoostClassifier
Accuracy:  0.7675212916962385
Precision:  0.7822387092378328
Recall:  0.7675212916962385
F1 Score:  0.7189770856784043
####################################


-0.7189770856784043

<h3>PCA Dataset</h3>

In [8]:
# Hyperopt search space for AdaBoost:
#   algorithm     - one of the two listed options
#   learning_rate - uniform over [0.1, 1.0]
#   n_estimators  - multiples of 50 between 50 and 300 (sampled as a float)
adaboost_search_space = dict(
    algorithm=hp.choice('algorithm', ['SAMME', 'SAMME.R']),
    learning_rate=hp.uniform('learning_rate', 0.1, 1.0),
    n_estimators=hp.quniform('n_estimators', 50, 300, 50),
)


def objective_func(search_space):
    """Loss for one hyperopt trial on the PCA training data.

    ``search_space`` here is the hyperparameter dict sampled for this trial,
    not the space definition. The value returned is whatever ``train_model``
    returns for an AdaBoost classifier built from those hyperparameters.
    """
    hyperparams = {
        'algorithm': search_space['algorithm'],
        'learning_rate': search_space['learning_rate'],
        # The sampled value is a float, so cast before passing it on.
        'n_estimators': int(search_space['n_estimators']),
        'random_state': 0,
    }
    return train_model(AdaBoostClassifier(**hyperparams), X_pca_train, y_pca_train)


adaboost_best = best_params(objective_func, adaboost_search_space)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 20/20 [33:51<00:00, 101.59s/trial, best loss: -0.9220142395103217]
Beat Parameters: {'algorithm': 0, 'learning_rate': 0.27407851601952515, 'n_estimators': 200.0}


In [10]:
# Best point reported by the hyperopt search on the PCA dataset, pinned here
# so this cell can run without repeating the search. The 'algorithm' value is
# an index into the hp.choice option list, not the option itself.
adaboost_best = {'algorithm': 0, 'learning_rate': 0.27407851601952515, 'n_estimators': 200.0}

# Must list the options in the same order as hp.choice('algorithm', ...) in
# the search space, so the index above resolves to the algorithm that was
# actually selected instead of a separately hard-coded name.
algorithm_options = ['SAMME', 'SAMME.R']

adaboost_clf = AdaBoostClassifier(algorithm=algorithm_options[adaboost_best['algorithm']],
                            learning_rate=adaboost_best['learning_rate'],
                            n_estimators=int(adaboost_best['n_estimators']),
                            random_state=0)
evaluate(adaboost_clf,  X_pca_train, y_pca_train, X_pca_test, y_pca_test)

Model:  AdaBoostClassifier
Accuracy:  0.7192601135557133
Precision:  0.7525886095343133
Recall:  0.7192601135557133
F1 Score:  0.6703417892228876
####################################


-0.6703417892228876

<h3>Tree Dataset</h3>

In [22]:
# Hyperopt search space for AdaBoost:
#   algorithm     - one of the two listed options
#   learning_rate - uniform over [0.1, 1.0]
#   n_estimators  - multiples of 50 between 50 and 300 (sampled as a float)
adaboost_search_space = dict(
    algorithm=hp.choice('algorithm', ['SAMME', 'SAMME.R']),
    learning_rate=hp.uniform('learning_rate', 0.1, 1.0),
    n_estimators=hp.quniform('n_estimators', 50, 300, 50),
)


def objective_func(search_space):
    """Loss for one hyperopt trial on the Tree training data.

    ``search_space`` here is the hyperparameter dict sampled for this trial,
    not the space definition. The value returned is whatever ``train_model``
    returns for an AdaBoost classifier built from those hyperparameters.
    """
    hyperparams = {
        'algorithm': search_space['algorithm'],
        'learning_rate': search_space['learning_rate'],
        # The sampled value is a float, so cast before passing it on.
        'n_estimators': int(search_space['n_estimators']),
        'random_state': 0,
    }
    return train_model(AdaBoostClassifier(**hyperparams), X_tree_train, y_tree_train)


adaboost_best = best_params(objective_func, adaboost_search_space)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 20/20 [16:52<00:00, 50.64s/trial, best loss: -0.9513468978164757]
Beat Parameters: {'algorithm': 0, 'learning_rate': 0.8166140904941112, 'n_estimators': 200.0}


In [23]:
# Best point reported by the hyperopt search on the Tree dataset, pinned here
# so this cell can run without repeating the search. The 'algorithm' value is
# an index into the hp.choice option list, not the option itself.
# NOTE(review): these values and the resulting metrics are identical to the
# "Original Dataset" section - confirm the Tree CSVs really differ.
adaboost_best = {'algorithm': 0, 'learning_rate': 0.8166140904941112, 'n_estimators': 200.0}

# Must list the options in the same order as hp.choice('algorithm', ...) in
# the search space, so the index above resolves to the algorithm that was
# actually selected instead of a separately hard-coded name.
algorithm_options = ['SAMME', 'SAMME.R']

adaboost_clf = AdaBoostClassifier(algorithm=algorithm_options[adaboost_best['algorithm']],
                            learning_rate=adaboost_best['learning_rate'],
                            n_estimators=int(adaboost_best['n_estimators']),
                            random_state=0)
evaluate(adaboost_clf,  X_tree_train, y_tree_train, X_tree_test, y_tree_test)

Model:  AdaBoostClassifier
Accuracy:  0.7675212916962385
Precision:  0.7822387092378328
Recall:  0.7675212916962385
F1 Score:  0.7189770856784043
####################################


-0.7189770856784043