In [295]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.over_sampling import SMOTE


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from hyperopt import hp 
from hyperopt import fmin, tpe, Trials 

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [296]:
# Read the three train/test CSV pairs used below. Going by the file names,
# these are the full feature set, a PCA variant and a "Tree" variant.
train_dataset, test_dataset = (
    pd.read_csv('Datasets/train_dataset.csv'),
    pd.read_csv('Datasets/test_dataset.csv'),
)

PCA_train_dataset, PCA_test_dataset = (
    pd.read_csv('Datasets/PCA_train_data.csv'),
    pd.read_csv('Datasets/PCA_test_data.csv'),
)

Tree_train_dataset, Tree_test_dataset = (
    pd.read_csv('Datasets/Tree_train_data.csv'),
    pd.read_csv('Datasets/Tree_test_data.csv'),
)

In [297]:
def _split_features_target(frame):
    """Split one dataset frame into (features, target).

    The three label columns ('is_attack', 'attack_category', 'attack') are
    removed from the features; the target is the 'attack_category' column.
    """
    features = frame.drop(columns=['is_attack', 'attack_category', 'attack'])
    target = frame['attack_category']
    return features, target


# Same split for every dataset variant, train and test alike.
X_train, y_train = _split_features_target(train_dataset)
X_test, y_test = _split_features_target(test_dataset)

X_pca_train, y_pca_train = _split_features_target(PCA_train_dataset)
X_pca_test, y_pca_test = _split_features_target(PCA_test_dataset)

X_tree_train, y_tree_train = _split_features_target(Tree_train_dataset)
X_tree_test, y_tree_test = _split_features_target(Tree_test_dataset)

In [298]:
def train_model(model, X, y):
    """Cross-validate `model` and return its negated mean weighted F1 score.

    The data is split with a 5-fold ``StratifiedKFold``. On each fold the model
    is re-fitted on the training part and scored on the held-out part with
    ``f1_score(..., average='weighted')``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X : feature table; must support positional row selection via ``.iloc``.
    y : target labels aligned with ``X``; must support ``.iloc``.

    Returns
    -------
    float
        ``-mean(F1 over folds)``. The sign is flipped so the value can be used
        directly as a loss to minimise (e.g. by hyperopt's ``fmin``).
    """
    skf = StratifiedKFold(n_splits=5)

    # Only F1 contributes to the returned loss, so it is the only metric
    # computed. Fold-local names deliberately avoid X_train / y_train /
    # X_test / y_test, which are notebook-level globals.
    fold_f1_scores = []
    for fit_index, val_index in skf.split(X, y):
        model.fit(X.iloc[fit_index], y.iloc[fit_index])
        fold_pred = model.predict(X.iloc[val_index])
        fold_f1_scores.append(
            f1_score(y.iloc[val_index], fold_pred, average='weighted')
        )

    return -np.mean(fold_f1_scores)

In [299]:
def evaluate(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training data, report test metrics, return test F1.

    Accuracy plus weighted precision, recall and F1 are computed on the
    predictions for ``X_test`` and printed together with the model's class
    name. Only the weighted F1 score is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Label text doubles as the printed prefix for each metric.
    report = {
        "Accuracy: ": accuracy_score(y_test, predictions),
        "Precision: ": precision_score(y_test, predictions, average='weighted'),
        "Recall: ": recall_score(y_test, predictions, average='weighted'),
        "F1 Score: ": f1_score(y_test, predictions, average='weighted'),
    }

    print("Model: ", model.__class__.__name__)
    for label, value in report.items():
        print(label, value)
    print("####################################")

    return report["F1 Score: "]

In [300]:
def best_params(objective_func, search_space, max_evals=20, seed=30):
    """Minimise `objective_func` over `search_space` with hyperopt's TPE.

    Parameters
    ----------
    objective_func : callable
        Receives one sampled point of `search_space` and returns the loss to
        minimise.
    search_space : dict
        Hyperopt search space (``hp.*`` expressions keyed by parameter name).
    max_evals : int, default 20
        Number of objective evaluations to run.
    seed : int, default 30
        Seed for the NumPy generator handed to ``fmin`` as ``rstate``.

    Returns
    -------
    dict
        The raw result of ``fmin``. Values sampled with ``hp.quniform`` come
        back as floats, and ``hp.choice`` parameters come back as the *index*
        of the chosen option (e.g. ``'solver': 2``), not the option itself, so
        callers must cast or decode them before building an estimator.
    """
    trials = Trials()

    best = fmin(fn=objective_func,
                space=search_space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials,
                rstate=np.random.default_rng(seed=seed))

    print('Best Parameters:', best)
    return best

# Hyperparameter Tuning for XGBoost, Logistic Regression and LightGBM

## Original Dataset - SMOTE

In [301]:
# Class distribution of the original training labels.
print("Before SMOTE:", Counter(y_train))

# Oversample only classes 3 and 4, up to the target counts given here.
sampling_strategy = {3: 10000, 4: 5000}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)

# NOTE(review): X_train_resampled / y_train_resampled are not referenced by any
# later cell visible in this notebook; the tuning cells below use X_train and
# y_train. Confirm whether that is intentional.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("After SMOTE:", Counter(y_train_resampled))

Before SMOTE: Counter({0: 67343, 1: 45927, 2: 11656, 3: 995, 4: 52})
After SMOTE: Counter({0: 67343, 1: 45927, 2: 11656, 3: 10000, 4: 5000})


### XGBoost - Original Dataset

In [302]:
# XGBoost search on the original dataset; the objective returns the negated
# cross-validated F1 from train_model, so minimising it maximises F1.
xgboost_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 10, 1),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)


def xgboost_objective_func(search_space):
    """Hyperopt objective: score one XGBoost configuration on X_train/y_train."""
    # hp.quniform yields floats, so the count-like parameters are cast to int.
    whole = {key: int(search_space[key])
             for key in ('n_estimators', 'max_depth', 'min_child_weight')}
    fractional = {key: search_space[key]
                  for key in ('learning_rate', 'subsample', 'colsample_bytree')}

    candidate = xgb.XGBClassifier(random_state=42, **fractional, **whole)
    return train_model(candidate, X_train, y_train)


xgboost_best = best_params(xgboost_objective_func, xgboost_search_space)

100%|██████████| 20/20 [2:08:16<00:00, 384.84s/trial, best loss: -0.9991431830611784]   
Best Parameters: {'colsample_bytree': 0.6947001151470121, 'learning_rate': 0.1903979884533492, 'max_depth': 4.0, 'min_child_weight': 3.0, 'n_estimators': 300.0, 'subsample': 0.8968355124649539}


In [303]:
# Show the XGBoost parameters returned by the search on the original dataset.
xgboost_best

{'colsample_bytree': 0.6947001151470121,
 'learning_rate': 0.1903979884533492,
 'max_depth': 4.0,
 'min_child_weight': 3.0,
 'n_estimators': 300.0,
 'subsample': 0.8968355124649539}

In [304]:
# Rebuild XGBoost from the tuned values. Count-like parameters come back from
# the search as floats (e.g. 300.0) and are cast to int here.
xgboost_clf = xgb.XGBClassifier(
    random_state=42,
    **{key: xgboost_best[key]
       for key in ('learning_rate', 'subsample', 'colsample_bytree')},
    **{key: int(xgboost_best[key])
       for key in ('n_estimators', 'max_depth', 'min_child_weight')},
)

# Fit on the full training set and report metrics on the held-out test set.
evaluate(xgboost_clf, X_train, y_train, X_test, y_test)


Model:  XGBClassifier
Accuracy:  0.771735273243435
Precision:  0.8267240141848498
Recall:  0.771735273243435
F1 Score:  0.7307784698534782
####################################


0.7307784698534782

### Logistic Regression - Original Dataset

In [305]:
# Logistic-regression search on the original dataset.
logreg_search_space = dict(
    C=hp.loguniform('C', -5, 2),
    solver=hp.choice('solver', ['liblinear', 'lbfgs', 'saga']),
    max_iter=hp.quniform('max_iter', 50, 500, 50),
)


def logreg_objective_func(search_space):
    """Hyperopt objective: score one LogisticRegression setup on X_train/y_train."""
    # max_iter is sampled with hp.quniform and therefore arrives as a float.
    iteration_cap = int(search_space['max_iter'])
    candidate = LogisticRegression(
        random_state=42,
        max_iter=iteration_cap,
        solver=search_space['solver'],
        C=search_space['C'],
    )
    return train_model(candidate, X_train, y_train)


logreg_best = best_params(logreg_objective_func, logreg_search_space)


100%|██████████| 20/20 [06:47<00:00, 20.38s/trial, best loss: -0.9603507555240267]
Best Parameters: {'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}


In [306]:
# Show the LogisticRegression parameters returned by the search on the original
# dataset. 'solver' is reported as an index into the hp.choice option list.
logreg_best

{'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}

In [307]:
# The search reports 'solver' as an index into the hp.choice option list, so it
# is decoded here instead of hard-coding the name. This list must stay in the
# same order as the options given to hp.choice in logreg_search_space.
logreg_solver_options = ['liblinear', 'lbfgs', 'saga']

logreg_clf = LogisticRegression(
    C=logreg_best['C'],
    solver=logreg_solver_options[int(logreg_best['solver'])],
    # max_iter was sampled with hp.quniform and is returned as a float.
    max_iter=int(logreg_best['max_iter']),
    random_state=42
)

evaluate(logreg_clf, X_train, y_train, X_test, y_test)

Model:  LogisticRegression
Accuracy:  0.7347409510290986
Precision:  0.7451520047618645
Recall:  0.7347409510290986
F1 Score:  0.6880444014186261
####################################


0.6880444014186261

### LightGBM - Original Dataset

In [308]:
# LightGBM search on the original dataset.
# NOTE(review): LightGBM presumably ignores `subsample` while `subsample_freq`
# is left at its default of 0, which would make that dimension inert. Confirm
# against the LightGBM parameter docs before relying on the tuned value.
lgb_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)


def lgb_objective_func(search_space):
    """Hyperopt objective: score one LightGBM configuration on X_train/y_train."""
    # hp.quniform yields floats, so the count-like parameters are cast to int.
    whole = {key: int(search_space[key])
             for key in ('n_estimators', 'max_depth', 'num_leaves')}
    fractional = {key: search_space[key]
                  for key in ('learning_rate', 'subsample', 'colsample_bytree')}

    candidate = lgb.LGBMClassifier(random_state=42, **fractional, **whole)
    return train_model(candidate, X_train, y_train)


lgb_best = best_params(lgb_objective_func, lgb_search_space)

100%|██████████| 20/20 [04:54<00:00, 14.75s/trial, best loss: -0.9987556299061202]
Best Parameters: {'colsample_bytree': 0.7421428951838873, 'learning_rate': 0.030571346892657877, 'max_depth': 7.0, 'n_estimators': 200.0, 'num_leaves': 30.0, 'subsample': 0.7006367676212871}


In [309]:
# Show the LightGBM parameters returned by the search on the original dataset.
lgb_best

{'colsample_bytree': 0.7421428951838873,
 'learning_rate': 0.030571346892657877,
 'max_depth': 7.0,
 'n_estimators': 200.0,
 'num_leaves': 30.0,
 'subsample': 0.7006367676212871}

In [310]:
# Rebuild LightGBM from the tuned values. Count-like parameters come back from
# the search as floats (e.g. 200.0) and are cast to int here.
lgb_clf = lgb.LGBMClassifier(
    random_state=42,
    **{key: lgb_best[key]
       for key in ('learning_rate', 'subsample', 'colsample_bytree')},
    **{key: int(lgb_best[key])
       for key in ('n_estimators', 'max_depth', 'num_leaves')},
)

# Fit on the full training set and report metrics on the held-out test set.
evaluate(lgb_clf, X_train, y_train, X_test, y_test)


Model:  LGBMClassifier
Accuracy:  0.75
Precision:  0.8186114231426869
Recall:  0.75
F1 Score:  0.7095278534996156
####################################


0.7095278534996156

## PCA Dataset

### XGBoost - PCA Dataset

In [311]:
# XGBoost search on the PCA dataset; the objective returns the negated
# cross-validated F1 from train_model, so minimising it maximises F1.
xgboost_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 10, 1),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)


def xgboost_objective_func(search_space):
    """Hyperopt objective: score one XGBoost configuration on the PCA training data."""
    # hp.quniform yields floats, so the count-like parameters are cast to int.
    whole = {key: int(search_space[key])
             for key in ('n_estimators', 'max_depth', 'min_child_weight')}
    fractional = {key: search_space[key]
                  for key in ('learning_rate', 'subsample', 'colsample_bytree')}

    candidate = xgb.XGBClassifier(random_state=42, **fractional, **whole)
    return train_model(candidate, X_pca_train, y_pca_train)


xgboost_pca_best = best_params(xgboost_objective_func, xgboost_search_space)

100%|██████████| 20/20 [4:13:33<00:00, 760.66s/trial, best loss: -0.9951946942223394]   
Best Parameters: {'colsample_bytree': 0.7943552681682089, 'learning_rate': 0.296360168110823, 'max_depth': 6.0, 'min_child_weight': 2.0, 'n_estimators': 450.0, 'subsample': 0.7184859038205977}


In [312]:
# Show the XGBoost parameters returned by the search on the PCA dataset.
xgboost_pca_best

{'colsample_bytree': 0.7943552681682089,
 'learning_rate': 0.296360168110823,
 'max_depth': 6.0,
 'min_child_weight': 2.0,
 'n_estimators': 450.0,
 'subsample': 0.7184859038205977}

In [313]:
# Rebuild XGBoost from the values tuned on the PCA data. Count-like parameters
# come back from the search as floats (e.g. 450.0) and are cast to int here.
xgboost_clf = xgb.XGBClassifier(
    random_state=42,
    **{key: xgboost_pca_best[key]
       for key in ('learning_rate', 'subsample', 'colsample_bytree')},
    **{key: int(xgboost_pca_best[key])
       for key in ('n_estimators', 'max_depth', 'min_child_weight')},
)

# Fit on the PCA training set and report metrics on the PCA test set.
evaluate(xgboost_clf, X_pca_train, y_pca_train, X_pca_test, y_pca_test)


Model:  XGBClassifier
Accuracy:  0.7622427253371186
Precision:  0.8160068744163382
Recall:  0.7622427253371186
F1 Score:  0.734746336798068
####################################


0.734746336798068

### Logistic Regression - PCA Dataset

In [314]:
# Logistic-regression search on the PCA dataset.
logreg_search_space = dict(
    C=hp.loguniform('C', -5, 2),
    solver=hp.choice('solver', ['liblinear', 'lbfgs', 'saga']),
    max_iter=hp.quniform('max_iter', 50, 500, 50),
)


def logreg_objective_func(search_space):
    """Hyperopt objective: score one LogisticRegression setup on the PCA training data."""
    # max_iter is sampled with hp.quniform and therefore arrives as a float.
    iteration_cap = int(search_space['max_iter'])
    candidate = LogisticRegression(
        random_state=42,
        max_iter=iteration_cap,
        solver=search_space['solver'],
        C=search_space['C'],
    )
    return train_model(candidate, X_pca_train, y_pca_train)


logreg_pca_best = best_params(logreg_objective_func, logreg_search_space)



100%|██████████| 20/20 [04:49<00:00, 14.49s/trial, best loss: -0.9431113401879945]
Best Parameters: {'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}


In [315]:
# NOTE(review): this cell is a bare dict literal equal to the "Best Parameters"
# printed by the search above; it looks like pasted output rather than code.
# Confirm and consider deleting the cell.
{'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}

{'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}

In [316]:
# Show the LogisticRegression parameters returned by the search on the PCA
# dataset. 'solver' is reported as an index into the hp.choice option list.
logreg_pca_best

{'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}

In [317]:
# The search reports 'solver' as an index into the hp.choice option list, so it
# is decoded here instead of hard-coding the name. This list must stay in the
# same order as the options given to hp.choice in logreg_search_space.
logreg_solver_options = ['liblinear', 'lbfgs', 'saga']

logreg_clf = LogisticRegression(
    C=logreg_pca_best['C'],
    solver=logreg_solver_options[int(logreg_pca_best['solver'])],
    # max_iter was sampled with hp.quniform and is returned as a float.
    max_iter=int(logreg_pca_best['max_iter']),
    random_state=42
)

evaluate(logreg_clf, X_pca_train, y_pca_train, X_pca_test, y_pca_test)

Model:  LogisticRegression
Accuracy:  0.7457860184528035
Precision:  0.7958965417573522
Recall:  0.7457860184528035
F1 Score:  0.7028068065123463
####################################


0.7028068065123463

### LightGBM - PCA Dataset

In [318]:
# LightGBM search on the PCA dataset.
# NOTE(review): LightGBM presumably ignores `subsample` while `subsample_freq`
# is left at its default of 0, which would make that dimension inert. Confirm
# against the LightGBM parameter docs before relying on the tuned value.
lgb_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)


def lgb_objective_func(search_space):
    """Hyperopt objective: score one LightGBM configuration on the PCA training data."""
    # hp.quniform yields floats, so the count-like parameters are cast to int.
    whole = {key: int(search_space[key])
             for key in ('n_estimators', 'max_depth', 'num_leaves')}
    fractional = {key: search_space[key]
                  for key in ('learning_rate', 'subsample', 'colsample_bytree')}

    candidate = lgb.LGBMClassifier(random_state=42, **fractional, **whole)
    return train_model(candidate, X_pca_train, y_pca_train)


lgb_pca_best = best_params(lgb_objective_func, lgb_search_space)

100%|██████████| 20/20 [05:45<00:00, 17.25s/trial, best loss: -0.9938732916966033]
Best Parameters: {'colsample_bytree': 0.7971403330338507, 'learning_rate': 0.031232104259997354, 'max_depth': 8.0, 'n_estimators': 200.0, 'num_leaves': 40.0, 'subsample': 0.8614591695193656}


In [319]:
# Show the LightGBM parameters returned by the search on the PCA dataset.
lgb_pca_best

{'colsample_bytree': 0.7971403330338507,
 'learning_rate': 0.031232104259997354,
 'max_depth': 8.0,
 'n_estimators': 200.0,
 'num_leaves': 40.0,
 'subsample': 0.8614591695193656}

In [320]:
# Rebuild LightGBM from the values tuned on the PCA data. Count-like parameters
# come back from the search as floats (e.g. 200.0) and are cast to int here.
lgb_clf = lgb.LGBMClassifier(
    random_state=42,
    **{key: lgb_pca_best[key]
       for key in ('learning_rate', 'subsample', 'colsample_bytree')},
    **{key: int(lgb_pca_best[key])
       for key in ('n_estimators', 'max_depth', 'num_leaves')},
)

# Fit on the PCA training set and report metrics on the PCA test set.
evaluate(lgb_clf, X_pca_train, y_pca_train, X_pca_test, y_pca_test)


Model:  LGBMClassifier
Accuracy:  0.7660131298793471
Precision:  0.8138079501332512
Recall:  0.7660131298793471
F1 Score:  0.736568341369667
####################################


0.736568341369667

## Tree Dataset

### XGBoost - Tree Dataset

In [321]:
# XGBoost search on the Tree dataset; the objective returns the negated
# cross-validated F1 from train_model, so minimising it maximises F1.
xgboost_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 10, 1),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)


def xgboost_objective_func(search_space):
    """Hyperopt objective: score one XGBoost configuration on the Tree training data."""
    # hp.quniform yields floats, so the count-like parameters are cast to int.
    whole = {key: int(search_space[key])
             for key in ('n_estimators', 'max_depth', 'min_child_weight')}
    fractional = {key: search_space[key]
                  for key in ('learning_rate', 'subsample', 'colsample_bytree')}

    candidate = xgb.XGBClassifier(random_state=42, **fractional, **whole)
    return train_model(candidate, X_tree_train, y_tree_train)


xgboost_tree_best = best_params(xgboost_objective_func, xgboost_search_space)

100%|██████████| 20/20 [1:24:12<00:00, 252.64s/trial, best loss: -0.9991657848464588]
Best Parameters: {'colsample_bytree': 0.6947001151470121, 'learning_rate': 0.1903979884533492, 'max_depth': 4.0, 'min_child_weight': 3.0, 'n_estimators': 300.0, 'subsample': 0.8968355124649539}


In [322]:
# Rebuild XGBoost from the values tuned on the Tree data. Count-like parameters
# come back from the search as floats (e.g. 300.0) and are cast to int here.
xgboost_clf = xgb.XGBClassifier(
    random_state=42,
    **{key: xgboost_tree_best[key]
       for key in ('learning_rate', 'subsample', 'colsample_bytree')},
    **{key: int(xgboost_tree_best[key])
       for key in ('n_estimators', 'max_depth', 'min_child_weight')},
)

# Fit on the Tree training set and report metrics on the Tree test set.
evaluate(xgboost_clf, X_tree_train, y_tree_train, X_tree_test, y_tree_test)


Model:  XGBClassifier
Accuracy:  0.7754613200851668
Precision:  0.8284324772446929
Recall:  0.7754613200851668
F1 Score:  0.7343409168130508
####################################


0.7343409168130508

### Logistic Regression - Tree Dataset

In [323]:
# Logistic-regression search on the Tree dataset.
logreg_search_space = dict(
    C=hp.loguniform('C', -5, 2),
    solver=hp.choice('solver', ['liblinear', 'lbfgs', 'saga']),
    max_iter=hp.quniform('max_iter', 50, 500, 50),
)


def logreg_objective_func(search_space):
    """Hyperopt objective: score one LogisticRegression setup on the Tree training data."""
    # max_iter is sampled with hp.quniform and therefore arrives as a float.
    iteration_cap = int(search_space['max_iter'])
    candidate = LogisticRegression(
        random_state=42,
        max_iter=iteration_cap,
        solver=search_space['solver'],
        C=search_space['C'],
    )
    return train_model(candidate, X_tree_train, y_tree_train)


logreg_tree_best = best_params(logreg_objective_func, logreg_search_space)



100%|██████████| 20/20 [05:29<00:00, 16.50s/trial, best loss: -0.9603079146464533]
Best Parameters: {'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}


In [324]:
# The search reports 'solver' as an index into the hp.choice option list, so it
# is decoded here instead of hard-coding the name. This list must stay in the
# same order as the options given to hp.choice in logreg_search_space.
logreg_solver_options = ['liblinear', 'lbfgs', 'saga']

logreg_clf = LogisticRegression(
    C=logreg_tree_best['C'],
    solver=logreg_solver_options[int(logreg_tree_best['solver'])],
    # max_iter was sampled with hp.quniform and is returned as a float.
    max_iter=int(logreg_tree_best['max_iter']),
    random_state=42
)

evaluate(logreg_clf, X_tree_train, y_tree_train, X_tree_test, y_tree_test)

Model:  LogisticRegression
Accuracy:  0.7344748048261178
Precision:  0.7449249065708662
Recall:  0.7344748048261178
F1 Score:  0.6878405507132683
####################################


0.6878405507132683

### LightGBM - Tree Dataset

In [325]:
# LightGBM search on the Tree dataset.
# NOTE(review): LightGBM presumably ignores `subsample` while `subsample_freq`
# is left at its default of 0, which would make that dimension inert. Confirm
# against the LightGBM parameter docs before relying on the tuned value.
lgb_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)


def lgb_objective_func(search_space):
    """Hyperopt objective: score one LightGBM configuration on the Tree training data."""
    # hp.quniform yields floats, so the count-like parameters are cast to int.
    whole = {key: int(search_space[key])
             for key in ('n_estimators', 'max_depth', 'num_leaves')}
    fractional = {key: search_space[key]
                  for key in ('learning_rate', 'subsample', 'colsample_bytree')}

    candidate = lgb.LGBMClassifier(random_state=42, **fractional, **whole)
    return train_model(candidate, X_tree_train, y_tree_train)


lgb_tree_best = best_params(lgb_objective_func, lgb_search_space)

100%|██████████| 20/20 [04:51<00:00, 14.59s/trial, best loss: -0.9988211819717405]
Best Parameters: {'colsample_bytree': 0.7421428951838873, 'learning_rate': 0.030571346892657877, 'max_depth': 7.0, 'n_estimators': 200.0, 'num_leaves': 30.0, 'subsample': 0.7006367676212871}


In [326]:
# Rebuild LightGBM from the values tuned on the Tree data. Count-like parameters
# come back from the search as floats (e.g. 200.0) and are cast to int here.
lgb_clf = lgb.LGBMClassifier(
    random_state=42,
    **{key: lgb_tree_best[key]
       for key in ('learning_rate', 'subsample', 'colsample_bytree')},
    **{key: int(lgb_tree_best[key])
       for key in ('n_estimators', 'max_depth', 'num_leaves')},
)

# Fit on the Tree training set and report metrics on the Tree test set.
evaluate(lgb_clf, X_tree_train, y_tree_train, X_tree_test, y_tree_test)


Model:  LGBMClassifier
Accuracy:  0.7493789921930447
Precision:  0.8152826143202561
Recall:  0.7493789921930447
F1 Score:  0.708959288221762
####################################


0.708959288221762