In [34]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier

<h3>Prepare Datasets</h3>

In [2]:
# Read the train and test splits of the base dataset from the local Datasets/ folder.
train_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/train_dataset.csv', 'Datasets/test_dataset.csv')
)

In [4]:
# Read the train and test splits of the "PCA" variant of the dataset
# (presumably PCA-transformed features, judging by the file names - confirm upstream).
train_pca_dataset, test_pca_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/PCA_train_data.csv', 'Datasets/PCA_test_data.csv')
)

In [6]:
# Read the train and test splits of the "Tree" variant of the dataset
# (presumably tree-based feature selection, judging by the file names - confirm upstream).
train_tree_dataset, test_tree_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/Tree_train_data.csv', 'Datasets/Tree_test_data.csv')
)

In [15]:
# Features: every column except the three label columns.
X_train = train_dataset.drop(columns=['is_attack', 'attack_category', 'attack'])
X_test = test_dataset.drop(columns=['is_attack', 'attack_category', 'attack'])

# Target: the multi-class `attack_category` column.
y_train = train_dataset.loc[:, 'attack_category']
y_test = test_dataset.loc[:, 'attack_category']

In [18]:
# Features of the PCA variant: every column except the three label columns.
X_pca_train = train_pca_dataset.drop(columns=['is_attack', 'attack_category', 'attack'])
X_pca_test = test_pca_dataset.drop(columns=['is_attack', 'attack_category', 'attack'])

# Target: the multi-class `attack_category` column.
y_pca_train = train_pca_dataset.loc[:, 'attack_category']
y_pca_test = test_pca_dataset.loc[:, 'attack_category']

In [41]:
# Features of the Tree variant: every column except the three label columns.
X_tree_train = train_tree_dataset.drop(columns=['is_attack', 'attack_category', 'attack'])
X_tree_test = test_tree_dataset.drop(columns=['is_attack', 'attack_category', 'attack'])

# Target: the multi-class `attack_category` column.
y_tree_train = train_tree_dataset.loc[:, 'attack_category']
y_tree_test = test_tree_dataset.loc[:, 'attack_category']

<h3>General Functions Needed For Modelling</h3>

In [56]:
def train_model(model, X, y, n_splits=5, verbose=False):
    """Cross-validate `model` and return the negated mean weighted F1 score.

    The score is negated so the function can be used directly as a loss to
    minimise (it is called from the hyperopt objective in this notebook).

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refitted in place on every fold; after the call it holds the fit
        from the last fold.
    X : pandas.DataFrame
        Feature matrix (indexed positionally with ``.iloc``).
    y : pandas.Series
        Class labels aligned with ``X``; also used to stratify the folds.
    n_splits : int, default 5
        Number of stratified folds.
    verbose : bool, default False
        When True, additionally compute accuracy / precision / recall per fold
        and print the fold-averaged metrics once at the end.

    Returns
    -------
    float
        ``-mean(weighted F1 over folds)``.
    """
    skf = StratifiedKFold(n_splits=n_splits)

    fold_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

    # Fold-local names deliberately differ from the notebook-level
    # X_train / X_test variables to avoid confusion with the real hold-out set.
    for fit_index, holdout_index in skf.split(X, y):
        X_fit, y_fit = X.iloc[fit_index], y.iloc[fit_index]
        X_holdout, y_holdout = X.iloc[holdout_index], y.iloc[holdout_index]

        model.fit(X_fit, y_fit)
        pred = model.predict(X_holdout)

        fold_metrics['f1'].append(f1_score(y_holdout, pred, average='weighted'))

        # Only the F1 score feeds the return value; the remaining metrics are
        # needed solely for the optional summary line.
        if verbose:
            fold_metrics['accuracy'].append(accuracy_score(y_holdout, pred))
            fold_metrics['precision'].append(precision_score(y_holdout, pred, average='weighted'))
            fold_metrics['recall'].append(recall_score(y_holdout, pred, average='weighted'))

    if verbose:
        print('{0} Accuracy: {1: .5f} Precision: {2: .5f} Recall: {3: .5f} F1_Score: {4: .5f}'.format(
            model.__class__.__name__,
            np.mean(fold_metrics['accuracy']),
            np.mean(fold_metrics['precision']),
            np.mean(fold_metrics['recall']),
            np.mean(fold_metrics['f1'])))

    return -np.mean(fold_metrics['f1'])

In [32]:
def evaluate(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training split and score it on the test split.

    Prints the model's class name followed by accuracy and weighted
    precision / recall / F1 on the test split, then a separator line.

    Returns the weighted F1 score with its sign flipped (``-f1``); callers in
    this notebook multiply the result by -1 to recover the actual score.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # Compute every metric before printing anything, so a failing metric
    # does not leave a half-printed report.
    weighted_f1 = f1_score(y_test, predicted, average='weighted')
    report_rows = (
        ("Model: ", model.__class__.__name__),
        ("Accuracy: ", accuracy_score(y_test, predicted)),
        ("Precision: ", precision_score(y_test, predicted, average='weighted')),
        ("Recall: ", recall_score(y_test, predicted, average='weighted')),
        ("F1 Score: ", weighted_f1),
    )

    for label, value in report_rows:
        print(label, value)
    print("####################################")

    return -weighted_f1

<h3>Original Dataset Modelling</h3> 

In [26]:
# Shared seed, passed as `random_state` to every baseline classifier in the
# modelling cells so their results are repeatable across runs.
rs_value = 42

In [51]:
# Baseline classifiers with default hyperparameters, all seeded with `rs_value`.
clf_models = [
    LogisticRegression(random_state=rs_value),
    RandomForestClassifier(random_state=rs_value),
    xgb.XGBClassifier(random_state=rs_value),
    LGBMClassifier(random_state=rs_value),
    AdaBoostClassifier(random_state=rs_value),
]
lr_model, rf_model, xgb_clf, lgbm_clf, adaboost_clf = clf_models

# `evaluate` returns the negated F1, so flip the sign back to get real scores
# (one per model, in `clf_models` order) on the original dataset.
scores = -np.array([evaluate(clf, X_train, y_train, X_test, y_test) for clf in clf_models])

Model:  LogisticRegression
Accuracy:  0.7348296664300923
Precision:  0.7546736251553713
Recall:  0.7348296664300923
F1 Score:  0.6884020700966463
####################################
Model:  RandomForestClassifier
Accuracy:  0.7615773598296665
Precision:  0.8231482128601714
Recall:  0.7615773598296665
F1 Score:  0.7156440736259226
####################################
Model:  XGBClassifier
Accuracy:  0.7703601845280341
Precision:  0.8247819252525248
Recall:  0.7703601845280341
F1 Score:  0.7298555723568333
####################################
Model:  LGBMClassifier
Accuracy:  0.7254701916252662
Precision:  0.7389869315791423
Recall:  0.7254701916252662
F1 Score:  0.6929380191506069
####################################
Model:  AdaBoostClassifier
Accuracy:  0.6239354151880766
Precision:  0.5862870113326686
Recall:  0.6239354151880766
F1 Score:  0.5660824307458675
####################################


In [52]:
# Display labels for the classifiers; the order must match `clf_models`,
# because `scores` holds one F1 value per model in that order.
models = ['Logistic', 'RandomForest', 'XGB', 'LGBM', 'AdaBoost']
models_df = pd.Series(scores, index=models, name='scores').sort_values(ascending=True)

# Bar chart of test-set weighted F1, worst to best.
fig = px.bar(x=models_df.index, y=models_df.values,
             text=np.round(models_df.values, 5),
             color=models_df.values,
             color_continuous_scale='Teal')

# Axis titles use the nested `title.text` / `title.font` form: the older
# `titlefont` shorthand is deprecated and rejected by recent Plotly releases.
fig.update_layout(
    title_text='Model Evaluations',
    xaxis=dict(
        title=dict(text='Model', font=dict(size=16))
    ),
    yaxis=dict(
        title=dict(text='F1 Score', font=dict(size=16))
    ),
    template='plotly_dark'
)
fig.show()

<h3>PCA Dataset Modelling</h3> 

In [38]:
# Fresh, unfitted baseline classifiers (default hyperparameters, seeded with
# `rs_value`) for the PCA variant of the dataset.
clf_models = [
    LogisticRegression(random_state=rs_value),
    RandomForestClassifier(random_state=rs_value),
    xgb.XGBClassifier(random_state=rs_value),
    LGBMClassifier(random_state=rs_value),
    AdaBoostClassifier(random_state=rs_value),
]
lr_model, rf_model, xgb_clf, lgbm_clf, adaboost_clf = clf_models

# `evaluate` returns the negated F1, so flip the sign back to get real scores
# (one per model, in `clf_models` order).
pca_scores = -np.array([
    evaluate(clf, X_pca_train, y_pca_train, X_pca_test, y_pca_test)
    for clf in clf_models
])

Model:  LogisticRegression
Accuracy:  0.7463183108587651
Precision:  0.7953537254873454
Recall:  0.7463183108587651
F1 Score:  0.7032609304063231
####################################
Model:  RandomForestClassifier
Accuracy:  0.7637065294535131
Precision:  0.8209155354466824
Recall:  0.7637065294535131
F1 Score:  0.7275564586946798
####################################
Model:  XGBClassifier
Accuracy:  0.7624201561391057
Precision:  0.813984288759765
Recall:  0.7624201561391057
F1 Score:  0.7315784848035174
####################################
Model:  LGBMClassifier
Accuracy:  0.757097232079489
Precision:  0.7872121071196098
Recall:  0.757097232079489
F1 Score:  0.7248944847027852
####################################
Model:  AdaBoostClassifier
Accuracy:  0.6931777856635912
Precision:  0.7232606546898176
Recall:  0.6931777856635912
F1 Score:  0.6587872902324983
####################################


In [48]:
# Display labels for the classifiers; the order must match `clf_models`,
# because `pca_scores` holds one F1 value per model in that order.
models = ['Logistic', 'RandomForest', 'XGB', 'LGBM', 'AdaBoost']
pca_models_df = pd.Series(pca_scores, index=models, name='scores').sort_values(ascending=True)

# Bar chart of test-set weighted F1 on the PCA dataset, worst to best.
fig = px.bar(x=pca_models_df.index, y=pca_models_df.values,
             text=np.round(pca_models_df.values, 5),
             color=pca_models_df.values,
             color_continuous_scale='Teal')

# Axis titles use the nested `title.text` / `title.font` form: the older
# `titlefont` shorthand is deprecated and rejected by recent Plotly releases.
fig.update_layout(
    title_text='PCA Dataset Model Evaluations',
    xaxis=dict(
        title=dict(text='Model', font=dict(size=16))
    ),
    yaxis=dict(
        title=dict(text='F1 Score', font=dict(size=16))
    ),
    template='plotly_dark'
)
fig.show()

<h3>Tree Dataset Modelling</h3> 

In [42]:
# Fresh, unfitted baseline classifiers (default hyperparameters, seeded with
# `rs_value`) for the Tree variant of the dataset.
clf_models = [
    LogisticRegression(random_state=rs_value),
    RandomForestClassifier(random_state=rs_value),
    xgb.XGBClassifier(random_state=rs_value),
    LGBMClassifier(random_state=rs_value),
    AdaBoostClassifier(random_state=rs_value),
]
lr_model, rf_model, xgb_clf, lgbm_clf, adaboost_clf = clf_models

# `evaluate` returns the negated F1, so flip the sign back to get real scores
# (one per model, in `clf_models` order).
tree_scores = -np.array([
    evaluate(clf, X_tree_train, y_tree_train, X_tree_test, y_tree_test)
    for clf in clf_models
])

Model:  LogisticRegression
Accuracy:  0.7338981547196594
Precision:  0.7522934051925361
Recall:  0.7338981547196594
F1 Score:  0.6876300839120686
####################################
Model:  RandomForestClassifier
Accuracy:  0.7609119943222143
Precision:  0.8203167692705314
Recall:  0.7609119943222143
F1 Score:  0.7148706258898813
####################################
Model:  XGBClassifier
Accuracy:  0.7635290986515259
Precision:  0.8215443556657224
Recall:  0.7635290986515259
F1 Score:  0.7232799697682103
####################################
Model:  LGBMClassifier
Accuracy:  0.7190383250532293
Precision:  0.7491825007233943
Recall:  0.7190383250532293
F1 Score:  0.6764317031443482
####################################
Model:  AdaBoostClassifier
Accuracy:  0.6239354151880766
Precision:  0.5862870113326686
Recall:  0.6239354151880766
F1 Score:  0.5660824307458675
####################################


In [47]:
# Display labels for the classifiers; the order must match `clf_models`,
# because `tree_scores` holds one F1 value per model in that order.
models = ['Logistic', 'RandomForest', 'XGB', 'LGBM', 'AdaBoost']
tree_models_df = pd.Series(tree_scores, index=models, name='scores').sort_values(ascending=True)

# Bar chart of test-set weighted F1 on the Tree dataset, worst to best.
fig = px.bar(x=tree_models_df.index, y=tree_models_df.values,
             text=np.round(tree_models_df.values, 5),
             color=tree_models_df.values,
             color_continuous_scale='Teal')

# Axis titles use the nested `title.text` / `title.font` form: the older
# `titlefont` shorthand is deprecated and rejected by recent Plotly releases.
fig.update_layout(
    title_text='Tree Dataset Model Evaluations',
    xaxis=dict(
        title=dict(text='Model', font=dict(size=16))
    ),
    yaxis=dict(
        title=dict(text='F1 Score', font=dict(size=16))
    ),
    template='plotly_dark'
)
fig.show()

<h3>Hyperparameter Optimization</h3>

In [50]:
from hyperopt import hp 
from hyperopt import fmin, tpe, Trials 

def best_params(objective_func, search_space, max_evals=50, seed=30):
    """Run a TPE hyperparameter search with hyperopt and return the best trial.

    Parameters
    ----------
    objective_func : callable
        Receives one sampled point of `search_space` and returns the loss to
        minimise.
    search_space : dict
        hyperopt search space (built from ``hp.*`` expressions).
    max_evals : int, default 50
        Number of trials to evaluate.
    seed : int, default 30
        Seed for the NumPy generator handed to hyperopt, so searches are
        repeatable.

    Returns
    -------
    dict
        The best assignment exactly as returned by ``fmin``. Note that for
        ``hp.choice`` dimensions this is the *index* of the selected option,
        not the option itself (the AdaBoost search in this notebook reports
        ``'algorithm': 0``).
    """
    trials = Trials()

    best = fmin(fn=objective_func,
                space=search_space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials,
                rstate=np.random.default_rng(seed=seed))

    print('Best Parameters:', best)
    return best

In [57]:
# Tune AdaBoost for cross-validated weighted F1 on the PCA training data.
adaboost_search_space = dict(
    algorithm=hp.choice('algorithm', ['SAMME', 'SAMME.R']),
    learning_rate=hp.uniform('learning_rate', 0.1, 1.0),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
)


def objective_func(search_space):
    """Loss for one hyperopt trial: negated mean CV F1 of an AdaBoost candidate.

    `search_space` is a single sampled point (a dict with the keys of
    `adaboost_search_space`), not the space definition itself.
    """
    candidate = AdaBoostClassifier(
        algorithm=search_space['algorithm'],
        learning_rate=search_space['learning_rate'],
        # hp.quniform yields floats, while the estimator needs an integer count.
        n_estimators=int(search_space['n_estimators']),
        random_state=0,
    )
    return train_model(candidate, X_pca_train, y_pca_train)


adaboost_best = best_params(objective_func, adaboost_search_space)

100%|██████████| 50/50 [5:14:37<00:00, 377.56s/trial, best loss: -0.9375391397742133]    
Beat Parameters: {'algorithm': 0, 'learning_rate': 0.3782708765028089, 'n_estimators': 100.0}


In [58]:
# Best trial found by the hyperopt search, pinned here so this cell can run
# without repeating the multi-hour search. hyperopt reports `hp.choice`
# parameters as an index into the option list, hence 'algorithm': 0.
adaboost_best = {'algorithm': 0, 'learning_rate': 0.3782708765028089, 'n_estimators': 100.0}

# Decode the algorithm from the stored index instead of hard-coding it, so the
# classifier cannot drift from `adaboost_best`. The list must mirror the
# options given to `hp.choice('algorithm', ...)` in the search space.
adaboost_algorithms = ['SAMME', 'SAMME.R']

adaboost_clf = AdaBoostClassifier(algorithm=adaboost_algorithms[adaboost_best['algorithm']],
                            learning_rate=adaboost_best['learning_rate'],
                            n_estimators=int(adaboost_best['n_estimators']),
                            random_state=0)

# Score the tuned model on the PCA hold-out split. The value displayed by this
# cell is the *negated* weighted F1, since that is what `evaluate` returns.
evaluate(adaboost_clf,  X_pca_train, y_pca_train, X_pca_test, y_pca_test)

Model:  AdaBoostClassifier
Accuracy:  0.7212562100780695
Precision:  0.7389133780547105
Recall:  0.7212562100780695
F1 Score:  0.6717987099058366
####################################


-0.6717987099058366