In [281]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.over_sampling import SMOTE


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from hyperopt import hp 
from hyperopt import fmin, tpe, Trials 

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [260]:
# Load the three train/test pairs used below: the original features, the PCA
# variant, and the tree-selected variant.
train_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/train_dataset.csv', 'Datasets/test_dataset.csv')
)

PCA_train_dataset, PCA_test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/PCA_train_data.csv', 'Datasets/PCA_test_data.csv')
)

Tree_train_dataset, Tree_test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('Datasets/Tree_train_data.csv', 'Datasets/Tree_test_data.csv')
)

In [261]:
# Columns that carry label information; all three are removed from the feature
# matrices so no target-derived column is used as a predictor.
label_columns = ['is_attack', 'attack_category', 'attack']
# The multi-class label that every model in this notebook predicts.
target_column = 'attack_category'

# Original feature set
X_train = train_dataset.drop(columns=label_columns)
y_train = train_dataset[target_column]

X_test = test_dataset.drop(columns=label_columns)
y_test = test_dataset[target_column]

# PCA feature set
X_pca_train = PCA_train_dataset.drop(columns=label_columns)
y_pca_train = PCA_train_dataset[target_column]

X_pca_test = PCA_test_dataset.drop(columns=label_columns)
y_pca_test = PCA_test_dataset[target_column]

# Tree-selected feature set
X_tree_train = Tree_train_dataset.drop(columns=label_columns)
y_tree_train = Tree_train_dataset[target_column]

X_tree_test = Tree_test_dataset.drop(columns=label_columns)
y_tree_test = Tree_test_dataset[target_column]

In [262]:
def train_model(model, X, y, n_splits=5, shuffle=False, random_state=None):
    """Cross-validate ``model`` and return the negated mean weighted F1 score.

    The model is re-fitted on the training part of every stratified fold and
    scored on the held-out part. The mean score is negated so the result can be
    handed directly to a minimiser (the notebook passes it to hyperopt's
    ``fmin``, which minimises its objective).

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so after the call it holds the fit from the last fold.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Class labels aligned with ``X``.
    n_splits : int, default 5
        Number of stratified folds.
    shuffle : bool, default False
        Whether to shuffle samples within each class before splitting. The
        default keeps the original, unshuffled fold assignment.
    random_state : int or None, default None
        Seed for the shuffle; only forwarded when ``shuffle`` is True, because
        StratifiedKFold rejects a seed when shuffling is disabled.

    Returns
    -------
    float
        ``-mean(weighted F1 over folds)``; lower is better.
    """
    skf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    # Only F1 feeds the returned objective, so it is the only metric computed.
    f1_scores = []

    for train_index, test_index in skf.split(X, y):
        X_fold_train, y_fold_train = X.iloc[train_index], y.iloc[train_index]
        X_fold_test, y_fold_test = X.iloc[test_index], y.iloc[test_index]

        model.fit(X_fold_train, y_fold_train)

        pred = model.predict(X_fold_test)
        f1_scores.append(f1_score(y_fold_test, pred, average='weighted'))

    return -np.mean(f1_scores)

In [263]:
def evaluate(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and report hold-out metrics.

    Prints the model's class name followed by accuracy and the weighted
    precision, recall and F1 computed on ``(X_test, y_test)``.

    Returns
    -------
    float
        The weighted F1 score on the test data.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    weighted = {'average': 'weighted'}
    # (label, value) pairs in the order they are printed.
    metrics = (
        ("Accuracy: ", accuracy_score(y_test, predictions)),
        ("Precision: ", precision_score(y_test, predictions, **weighted)),
        ("Recall: ", recall_score(y_test, predictions, **weighted)),
        ("F1 Score: ", f1_score(y_test, predictions, **weighted)),
    )

    print("Model: ", model.__class__.__name__)
    for label, value in metrics:
        print(label, value)
    print("####################################")

    # The last entry is the weighted F1 score.
    return metrics[-1][1]

In [264]:
def best_params(objective_func, search_space, max_evals=20, seed=30):
    """Run a TPE hyperparameter search and return hyperopt's best point.

    Parameters
    ----------
    objective_func : callable
        Receives one sampled point from ``search_space`` and returns the loss
        to minimise.
    search_space : dict
        hyperopt search space (``hp.*`` expressions keyed by parameter name).
    max_evals : int, default 20
        Number of points to evaluate.
    seed : int, default 30
        Seed for the NumPy generator handed to ``fmin`` so runs are repeatable.

    Returns
    -------
    dict
        The raw result of ``fmin``. Parameters declared with ``hp.choice`` are
        reported as the index of the chosen option, not the option itself
        (the recorded runs show ``'solver': 2``), and ``hp.quniform``
        parameters come back as floats, so callers must decode or cast them.
    """
    trials = Trials()

    best = fmin(fn=objective_func,
                space=search_space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials,
                rstate=np.random.default_rng(seed=seed))

    print('Best Parameters:', best)
    return best

# Hyperparameter Tuning for XGB, LOGREG, LGBM

## Original Dataset - SMOTE

In [285]:
# Oversample only the two rarest classes (labels 3 and 4) up to fixed target
# counts; all other classes keep their original size.
sampling_strategy = {3: 10000, 4: 5000}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)

print("Before SMOTE:", Counter(y_train))

# Generate the synthetic minority samples from the original training split.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("After SMOTE:", Counter(y_train_resampled))

Before SMOTE: Counter({0: 67343, 1: 45927, 2: 11656, 3: 995, 4: 52})
After SMOTE: Counter({0: 67343, 1: 45927, 2: 11656, 3: 10000, 4: 5000})


### XGBoost - Original Dataset with SMOTE

In [292]:
# XGBoost search on the SMOTE-resampled data; the objective is the negated
# cross-validated weighted F1 returned by train_model.
xgboost_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 10, 1),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)


def xgboost_objective_func(search_space):
    """Score one sampled XGBoost configuration on the resampled training data."""
    # quniform samples arrive as floats, so the count-like settings are cast to int.
    integer_params = ('n_estimators', 'max_depth', 'min_child_weight')
    model_kwargs = {
        key: int(value) if key in integer_params else value
        for key, value in search_space.items()
    }
    candidate = xgb.XGBClassifier(random_state=42, **model_kwargs)

    return train_model(candidate, X_train_resampled, y_train_resampled)


xgboost_smote_best = best_params(xgboost_objective_func, xgboost_search_space)

100%|██████████| 20/20 [1:24:19<00:00, 252.98s/trial, best loss: -0.9992065653101951]
Best Parameters: {'colsample_bytree': 0.8371007539065887, 'learning_rate': 0.1473837710525076, 'max_depth': 5.0, 'min_child_weight': 3.0, 'n_estimators': 400.0, 'subsample': 0.9500425300788807}


In [293]:
# Show the best XGBoost hyperparameters found by the search on the SMOTE-resampled data.
xgboost_smote_best

{'colsample_bytree': 0.8371007539065887,
 'learning_rate': 0.1473837710525076,
 'max_depth': 5.0,
 'min_child_weight': 3.0,
 'n_estimators': 400.0,
 'subsample': 0.9500425300788807}

In [294]:
# Rebuild XGBoost from the tuned values; the search returns floats, so the
# count-like settings are cast back to int before being passed on.
xgboost_clf = xgb.XGBClassifier(
    random_state=42,
    **{
        name: int(value) if name in ('n_estimators', 'max_depth', 'min_child_weight') else value
        for name, value in xgboost_smote_best.items()
    },
)

# Fit on the resampled training data and score on the untouched test split.
evaluate(xgboost_clf, X_train_resampled, y_train_resampled, X_test, y_test)


Model:  XGBClassifier
Accuracy:  0.7837562100780695
Precision:  0.8348013303379662
Recall:  0.7837562100780695
F1 Score:  0.7522161328476877
####################################


0.7522161328476877

### Logistic Regression - Original Dataset with SMOTE

In [286]:
# Logistic-regression search on the SMOTE-resampled data.
logreg_search_space = dict(
    C=hp.loguniform('C', -5, 2),
    solver=hp.choice('solver', ['liblinear', 'lbfgs', 'saga']),
    max_iter=hp.quniform('max_iter', 50, 500, 50),
)


def logreg_objective_func(search_space):
    """Score one sampled logistic-regression configuration on the resampled data."""
    settings = dict(search_space)
    # max_iter is sampled with quniform and therefore arrives as a float.
    settings['max_iter'] = int(settings['max_iter'])
    candidate = LogisticRegression(random_state=42, **settings)

    return train_model(candidate, X_train_resampled, y_train_resampled)


logreg_smote_best = best_params(logreg_objective_func, logreg_search_space)


100%|██████████| 20/20 [05:56<00:00, 17.83s/trial, best loss: -0.9444115330081966]
Best Parameters: {'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}


In [287]:
# Show the best logistic-regression hyperparameters (SMOTE run); 'solver' is an index into the hp.choice options.
logreg_smote_best

{'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}

In [288]:
# The search reports the hp.choice parameter as an index into its option list,
# so the solver name is looked up instead of being typed in by hand. This list
# must stay in the same order as the 'solver' options in logreg_search_space.
solver_options = ['liblinear', 'lbfgs', 'saga']

logreg_clf = LogisticRegression(
    C=logreg_smote_best['C'],
    solver=solver_options[int(logreg_smote_best['solver'])],
    # max_iter is returned as a float by the search.
    max_iter=int(logreg_smote_best['max_iter']),
    random_state=42
)

# Fit on the resampled training data and score on the untouched test split.
evaluate(logreg_clf, X_train_resampled, y_train_resampled, X_test, y_test)

Model:  LogisticRegression
Accuracy:  0.7460521646557843
Precision:  0.7931649549038085
Recall:  0.7460521646557843
F1 Score:  0.7158311682656731
####################################


0.7158311682656731

### LightGBM - Original Dataset with SMOTE

In [289]:
# LightGBM search on the SMOTE-resampled data.
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# the bagging frequency (subsample_freq) is non-zero; it is not set here, so
# confirm that tuning 'subsample' actually changes the model.
lgb_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)


def lgb_objective_func(search_space):
    """Score one sampled LightGBM configuration on the resampled training data."""
    # quniform samples arrive as floats, so the count-like settings are cast to int.
    integer_params = ('n_estimators', 'max_depth', 'num_leaves')
    model_kwargs = {
        key: int(value) if key in integer_params else value
        for key, value in search_space.items()
    }
    candidate = lgb.LGBMClassifier(random_state=42, **model_kwargs)

    return train_model(candidate, X_train_resampled, y_train_resampled)


lgb_smote_best = best_params(lgb_objective_func, lgb_search_space)

100%|██████████| 20/20 [10:10<00:00, 30.51s/trial, best loss: -0.9993495805343382]
Best Parameters: {'colsample_bytree': 0.9804937070050257, 'learning_rate': 0.1750046026665177, 'max_depth': 6.0, 'n_estimators': 200.0, 'num_leaves': 20.0, 'subsample': 0.8896424410710098}


In [290]:
# Show the best LightGBM hyperparameters found by the search on the SMOTE-resampled data.
lgb_smote_best

{'colsample_bytree': 0.9804937070050257,
 'learning_rate': 0.1750046026665177,
 'max_depth': 6.0,
 'n_estimators': 200.0,
 'num_leaves': 20.0,
 'subsample': 0.8896424410710098}

In [291]:
# Rebuild LightGBM from the tuned values; the search returns floats, so the
# count-like settings are cast back to int before being passed on.
lgb_clf = lgb.LGBMClassifier(
    random_state=42,
    **{
        name: int(value) if name in ('n_estimators', 'max_depth', 'num_leaves') else value
        for name, value in lgb_smote_best.items()
    },
)

# Fit on the resampled training data and score on the untouched test split.
evaluate(lgb_clf, X_train_resampled, y_train_resampled, X_test, y_test)


Model:  LGBMClassifier
Accuracy:  0.7802519517388219
Precision:  0.8304835338889357
Recall:  0.7802519517388219
F1 Score:  0.7486411557201836
####################################


0.7486411557201836

## PCA Dataset

### XGBoost - PCA Dataset

In [265]:
# XGBoost search on the PCA features; the objective is the negated
# cross-validated weighted F1 returned by train_model.
# NOTE(review): this cell rebinds xgboost_search_space and
# xgboost_objective_func, which the SMOTE section above also defines.
xgboost_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 10, 1),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)


def xgboost_objective_func(search_space):
    """Score one sampled XGBoost configuration on the PCA training data."""
    sampled = dict(search_space)
    # quniform samples arrive as floats, so the count-like settings are cast to int.
    for count_param in ('n_estimators', 'max_depth', 'min_child_weight'):
        sampled[count_param] = int(sampled[count_param])
    candidate = xgb.XGBClassifier(random_state=0, **sampled)

    return train_model(candidate, X_pca_train, y_pca_train)


xgboost_best = best_params(xgboost_objective_func, xgboost_search_space)

100%|██████████| 20/20 [1:31:53<00:00, 275.67s/trial, best loss: -0.9950311762259508]
Best Parameters: {'colsample_bytree': 0.7916541987972994, 'learning_rate': 0.2368123608945097, 'max_depth': 7.0, 'min_child_weight': 4.0, 'n_estimators': 400.0, 'subsample': 0.8609679373696384}


In [266]:
# Show the best XGBoost hyperparameters found by the search on the PCA data.
xgboost_best

{'colsample_bytree': 0.7916541987972994,
 'learning_rate': 0.2368123608945097,
 'max_depth': 7.0,
 'min_child_weight': 4.0,
 'n_estimators': 400.0,
 'subsample': 0.8609679373696384}

In [269]:
# NOTE(review): rs_value is not referenced by any other cell shown in this
# section (the PCA models are built with random_state=0) — confirm whether it
# is still needed.
rs_value = 42

In [283]:
# Rebuild XGBoost from the values tuned on the PCA data; the search returns
# floats, so the count-like settings are cast back to int.
xgboost_clf = xgb.XGBClassifier(
    random_state=0,
    **{
        name: int(value) if name in ('n_estimators', 'max_depth', 'min_child_weight') else value
        for name, value in xgboost_best.items()
    },
)

# Fit on the PCA training data and score on the PCA test split.
evaluate(xgboost_clf, X_pca_train, y_pca_train, X_pca_test, y_pca_test)


Model:  XGBClassifier
Accuracy:  0.7635734563520227
Precision:  0.8166541466503601
Recall:  0.7635734563520227
F1 Score:  0.7361704185811515
####################################


0.7361704185811515

In [284]:
# X_pca_resampled / y_pca_resampled are not created by any earlier cell shown
# in this notebook, so they are built here to keep the cell runnable after a
# kernel restart.
# NOTE(review): the oversampling targets and seed mirror the SMOTE settings
# used for the original dataset ({3: 10000, 4: 5000}, random_state=42) —
# confirm these are the settings that produced the recorded output, and that
# the PCA training labels use the same class encoding.
pca_smote = SMOTE(random_state=42, sampling_strategy={3: 10000, 4: 5000})
X_pca_resampled, y_pca_resampled = pca_smote.fit_resample(X_pca_train, y_pca_train)

# Refit the tuned PCA XGBoost model on the oversampled data and score it on the PCA test split.
evaluate(xgboost_clf, X_pca_resampled, y_pca_resampled, X_pca_test, y_pca_test)

Model:  XGBClassifier
Accuracy:  0.7710255500354861
Precision:  0.8219488537733871
Recall:  0.7710255500354861
F1 Score:  0.7454700187953394
####################################


0.7454700187953394

### Logistic Regression - PCA Dataset

In [270]:
# Logistic-regression search on the PCA features.
# NOTE(review): this cell rebinds logreg_search_space and
# logreg_objective_func, which the SMOTE section above also defines.
logreg_search_space = dict(
    C=hp.loguniform('C', -5, 2),
    solver=hp.choice('solver', ['liblinear', 'lbfgs', 'saga']),
    max_iter=hp.quniform('max_iter', 50, 500, 50),
)


def logreg_objective_func(search_space):
    """Score one sampled logistic-regression configuration on the PCA data."""
    # max_iter is sampled with quniform and therefore arrives as a float.
    iteration_cap = int(search_space['max_iter'])
    candidate = LogisticRegression(
        random_state=0,
        max_iter=iteration_cap,
        solver=search_space['solver'],
        C=search_space['C'],
    )

    return train_model(candidate, X_pca_train, y_pca_train)


logreg_best = best_params(logreg_objective_func, logreg_search_space)



100%|██████████| 20/20 [03:58<00:00, 11.91s/trial, best loss: -0.9431113401879945]
Best Parameters: {'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}


In [None]:
# NOTE(review): unexecuted cell containing only a dict literal equal to the
# printed tuning result; it assigns nothing and looks like a leftover paste —
# candidate for removal.
{'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}

In [273]:
# Show the best logistic-regression hyperparameters (PCA run); 'solver' is an index into the hp.choice options.
logreg_best

{'C': 0.7041774471868815, 'max_iter': 450.0, 'solver': 2}

In [275]:
# The search reports the hp.choice parameter as an index into its option list,
# so the solver name is looked up instead of being typed in by hand. This list
# must stay in the same order as the 'solver' options in logreg_search_space.
solver_options = ['liblinear', 'lbfgs', 'saga']

logreg_clf = LogisticRegression(
    C=logreg_best['C'],
    solver=solver_options[int(logreg_best['solver'])],
    # max_iter is returned as a float by the search.
    max_iter=int(logreg_best['max_iter']),
    random_state=0
)

# Fit on the PCA training data and score on the PCA test split.
evaluate(logreg_clf, X_pca_train, y_pca_train, X_pca_test, y_pca_test)

Model:  LogisticRegression
Accuracy:  0.7457860184528035
Precision:  0.7958965417573522
Recall:  0.7457860184528035
F1 Score:  0.7028068065123463
####################################


0.7028068065123463

### LightGBM - PCA Dataset

In [277]:
# LightGBM search on the PCA features.
# NOTE(review): this cell rebinds lgb_search_space and lgb_objective_func,
# which the SMOTE section above also defines. LightGBM also documents that row
# subsampling only takes effect when the bagging frequency (subsample_freq) is
# non-zero; it is not set here, so confirm that tuning 'subsample' matters.
lgb_search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    n_estimators=hp.quniform('n_estimators', 50, 500, 50),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)


def lgb_objective_func(search_space):
    """Score one sampled LightGBM configuration on the PCA training data."""
    sampled = dict(search_space)
    # quniform samples arrive as floats, so the count-like settings are cast to int.
    for count_param in ('n_estimators', 'max_depth', 'num_leaves'):
        sampled[count_param] = int(sampled[count_param])
    candidate = lgb.LGBMClassifier(random_state=0, **sampled)

    return train_model(candidate, X_pca_train, y_pca_train)


lgb_best = best_params(lgb_objective_func, lgb_search_space)

100%|██████████| 20/20 [08:06<00:00, 24.33s/trial, best loss: -0.9940773858182965]
Best Parameters: {'colsample_bytree': 0.7971403330338507, 'learning_rate': 0.031232104259997354, 'max_depth': 8.0, 'n_estimators': 200.0, 'num_leaves': 40.0, 'subsample': 0.8614591695193656}


In [278]:
# Show the best LightGBM hyperparameters found by the search on the PCA data.
lgb_best

{'colsample_bytree': 0.7971403330338507,
 'learning_rate': 0.031232104259997354,
 'max_depth': 8.0,
 'n_estimators': 200.0,
 'num_leaves': 40.0,
 'subsample': 0.8614591695193656}

In [279]:
# Rebuild LightGBM from the values tuned on the PCA data; the search returns
# floats, so the count-like settings are cast back to int.
lgb_clf = lgb.LGBMClassifier(
    random_state=0,
    **{
        name: int(value) if name in ('n_estimators', 'max_depth', 'num_leaves') else value
        for name, value in lgb_best.items()
    },
)

# Fit on the PCA training data and score on the PCA test split.
evaluate(lgb_clf, X_pca_train, y_pca_train, X_pca_test, y_pca_test)


Model:  LGBMClassifier
Accuracy:  0.769916607523066
Precision:  0.8176418461597722
Recall:  0.769916607523066
F1 Score:  0.7403061205224614
####################################


0.7403061205224614