In [3]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 
from sklearn.linear_model import LogisticRegression

In [4]:
train_dataset = pd.read_csv('../Datasets/train_dataset.csv')
test_dataset = pd.read_csv('../Datasets/test_dataset.csv')

In [5]:
X_train = train_dataset.drop(['is_attack' , 'attack_category' , 'attack'], axis=1)
y_train = train_dataset['attack_category']

X_test = test_dataset.drop(['is_attack' , 'attack_category' , 'attack'], axis=1)
y_test = test_dataset['attack_category']

In [2]:
train_pca_dataset = pd.read_csv('../Datasets/PCA_train_data.csv')
test_pca_dataset = pd.read_csv('../Datasets/PCA_test_data.csv')

In [3]:
X_pca_train = train_pca_dataset.drop(['is_attack' , 'attack_category' , 'attack'], axis=1)
y_pca_train = train_pca_dataset['attack_category']

X_pca_test = test_pca_dataset.drop(['is_attack' , 'attack_category' , 'attack'], axis=1)
y_pca_test = test_pca_dataset['attack_category']

In [6]:
train_tree_dataset = pd.read_csv('../Datasets/Tree_train_data.csv')
test_tree_dataset = pd.read_csv('../Datasets/Tree_test_data.csv')

In [7]:
X_tree_train = train_tree_dataset.drop(['is_attack' , 'attack_category' , 'attack'], axis=1)
y_tree_train = train_tree_dataset['attack_category']

X_tree_test = test_tree_dataset.drop(['is_attack' , 'attack_category' , 'attack'], axis=1)
y_tree_test = test_tree_dataset['attack_category']

In [9]:
def train_model(model, X, y):
    skf = StratifiedKFold(n_splits=5)

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        model.fit(X_train, y_train)

        pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, pred)
        precision = precision_score(y_test, pred, average='weighted')
        recall = recall_score(y_test, pred, average='weighted') 
        f1 = f1_score(y_test, pred, average='weighted') 

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    print('{0} Accuracy: {1: .5f} Precision: {2: .5f} Recall: {3: .5f} F1_Score: {4: .5f}'.format(
        model.__class__.__name__, 
        np.mean(accuracy_scores),
        np.mean(precision_scores),
        np.mean(recall_scores),
        np.mean(f1_scores)))
    
    return -np.mean(f1_scores)

In [10]:
def evaluate(model, X_train, y_train, X_test, y_test):
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print("Model: ", model.__class__.__name__)
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)
    print("####################################")
    
    # report = classification_report(y_test, y_pred)
    # print("Classification Report:\n", report)
    
    return -f1

In [6]:
from hyperopt import hp 
from hyperopt import fmin, tpe, Trials 

def best_params(objective_func, search_space):
    trials = Trials() 

    best = fmin(fn=objective_func,
                space=search_space,
                algo=tpe.suggest,
                max_evals=20,
                trials=trials,
                rstate=np.random.default_rng(seed=30))

    print('Beat Parameters:', best)
    return best 

<h3>Original Dataset</h3>

In [18]:
lr_search_space = {
    'C': hp.uniform('C', 0.1, 10.0),  # Regularization parameter
    'solver': hp.choice('solver', ['liblinear', 'saga']),  # Solver options
    'penalty': hp.choice('penalty', ['l1', 'l2']),  # Penalty type
    'max_iter': hp.choice('max_iter', range(100, 500)),  # Maximum number of iterations
}

def objective_func(search_space):
    lr_model = LogisticRegression(
                            C=search_space['C'],
                            solver=search_space['solver'],
                            penalty=search_space['penalty'],
                            max_iter=int(search_space['max_iter']),
                            random_state=42)
    
    return train_model(lr_model, X_train, y_train)
lr_best = best_params(objective_func, lr_search_space)

LogisticRegression Accuracy:  0.96661 Precision:  0.96659 Recall:  0.96661 F1_Score:  0.96653
LogisticRegression Accuracy:  0.96352 Precision:  0.96349 Recall:  0.96352 F1_Score:  0.96342
LogisticRegression Accuracy:  0.96977 Precision:  0.96966 Recall:  0.96977 F1_Score:  0.96966
LogisticRegression Accuracy:  0.97107 Precision:  0.97091 Recall:  0.97107 F1_Score:  0.97093
LogisticRegression Accuracy:  0.97107 Precision:  0.97093 Recall:  0.97107 F1_Score:  0.97094
LogisticRegression Accuracy:  0.96804 Precision:  0.96798 Recall:  0.96804 F1_Score:  0.96795
LogisticRegression Accuracy:  0.96490 Precision:  0.96490 Recall:  0.96490 F1_Score:  0.96481
LogisticRegression Accuracy:  0.96402 Precision:  0.96401 Recall:  0.96402 F1_Score:  0.96393
LogisticRegression Accuracy:  0.96493 Precision:  0.96493 Recall:  0.96493 F1_Score:  0.96484
LogisticRegression Accuracy:  0.97065 Precision:  0.97052 Recall:  0.97065 F1_Score:  0.97053
LogisticRegression Accuracy:  0.96295 Precision:  0.96290 Re

In [16]:
# original dataset
lr_best={'C': 6.675403357239448, 'max_iter': 355, 'penalty': 0, 'solver': 1}
lr_model = LogisticRegression(C=lr_best['C'],
                              solver='saga',
                              max_iter=lr_best['max_iter'],
                              penalty='l1')
evaluate(lr_model,  X_train, y_train, X_test, y_test)

Model:  LogisticRegression
Accuracy:  0.7414833215046132
Precision:  0.758691354255746
Recall:  0.7414833215046132
F1 Score:  0.6951494660538822
####################################


-0.6951494660538822

<h3>PCA Dataset</h3>

In [7]:
lr_search_space = {
    'C': hp.uniform('C', 0.1, 10.0),  # Regularization parameter
    'solver': hp.choice('solver', ['liblinear', 'saga']),  # Solver options
    'penalty': hp.choice('penalty', ['l1', 'l2']),  # Penalty type
    'max_iter': hp.choice('max_iter', range(100, 500)),  # Maximum number of iterations
}

def objective_func(search_space):
    lr_model = LogisticRegression(
                            C=search_space['C'],
                            solver=search_space['solver'],
                            penalty=search_space['penalty'],
                            max_iter=int(search_space['max_iter']),
                            random_state=42)
    
    return train_model(lr_model, X_pca_train, y_pca_train)
lr_best = best_params(objective_func, lr_search_space)

LogisticRegression Accuracy:  0.94269 Precision:  0.93911 Recall:  0.94269 F1_Score:  0.93981
LogisticRegression Accuracy:  0.94453 Precision:  0.94427 Recall:  0.94453 F1_Score:  0.94417
LogisticRegression Accuracy:  0.94270 Precision:  0.93919 Recall:  0.94270 F1_Score:  0.93985
LogisticRegression Accuracy:  0.94257 Precision:  0.93741 Recall:  0.94257 F1_Score:  0.93934
LogisticRegression Accuracy:  0.94273 Precision:  0.93925 Recall:  0.94273 F1_Score:  0.93989
LogisticRegression Accuracy:  0.94256 Precision:  0.93819 Recall:  0.94256 F1_Score:  0.93949
LogisticRegression Accuracy:  0.94477 Precision:  0.94465 Recall:  0.94477 F1_Score:  0.94454
LogisticRegression Accuracy:  0.94455 Precision:  0.94431 Recall:  0.94455 F1_Score:  0.94420
LogisticRegression Accuracy:  0.94478 Precision:  0.94467 Recall:  0.94478 F1_Score:  0.94455
LogisticRegression Accuracy:  0.94272 Precision:  0.93920 Recall:  0.94272 F1_Score:  0.93986
LogisticRegression Accuracy:  0.94452 Precision:  0.94427 Re

In [10]:
# PCA best params 
lr_best={'C': 6.289918870722408, 'max_iter': 282, 'penalty': 0, 'solver': 1}
lr_model = LogisticRegression(C=lr_best['C'],
                              solver='saga',
                              max_iter=lr_best['max_iter'],
                              penalty='l1')
evaluate(lr_model,  X_pca_train, y_pca_train, X_pca_test, y_pca_test)

Model:  LogisticRegression
Accuracy:  0.7469393186657204
Precision:  0.7929616104380759
Recall:  0.7469393186657204
F1 Score:  0.7050647383631301
####################################


-0.7050647383631301

<h3>Tree Dataset</h3>

In [23]:
lr_search_space = {
    'C': hp.uniform('C', 0.1, 10.0),  # Regularization parameter
    'solver': hp.choice('solver', ['liblinear', 'saga']),  # Solver options
    'penalty': hp.choice('penalty', ['l1', 'l2']),  # Penalty type
    'max_iter': hp.choice('max_iter', range(100, 500)),  # Maximum number of iterations
}

def objective_func(search_space):
    lr_model = LogisticRegression(
                            C=search_space['C'],
                            solver=search_space['solver'],
                            penalty=search_space['penalty'],
                            max_iter=int(search_space['max_iter']),
                            random_state=42)
    
    return train_model(lr_model, X_tree_train, y_tree_train)
lr_best = best_params(objective_func, lr_search_space)

LogisticRegression Accuracy:  0.96516 Precision:  0.96415 Recall:  0.96516 F1_Score:  0.96441
LogisticRegression Accuracy:  0.96328 Precision:  0.96328 Recall:  0.96328 F1_Score:  0.96319
LogisticRegression Accuracy:  0.96597 Precision:  0.96499 Recall:  0.96597 F1_Score:  0.96526
LogisticRegression Accuracy:  0.95663 Precision:  0.95478 Recall:  0.95663 F1_Score:  0.95512
LogisticRegression Accuracy:  0.96618 Precision:  0.96521 Recall:  0.96618 F1_Score:  0.96548
LogisticRegression Accuracy:  0.95786 Precision:  0.95654 Recall:  0.95786 F1_Score:  0.95673
LogisticRegression Accuracy:  0.96784 Precision:  0.96778 Recall:  0.96784 F1_Score:  0.96775
LogisticRegression Accuracy:  0.96348 Precision:  0.96348 Recall:  0.96348 F1_Score:  0.96340
LogisticRegression Accuracy:  0.97133 Precision:  0.97115 Recall:  0.97133 F1_Score:  0.97119
LogisticRegression Accuracy:  0.96602 Precision:  0.96504 Recall:  0.96602 F1_Score:  0.96531
LogisticRegression Accuracy:  0.96789 Precision:  0.96779 Re

In [11]:
# original dataset
lr_best={'C': 6.675403357239448, 'max_iter': 355, 'penalty': 0, 'solver': 1}
lr_model = LogisticRegression(C=lr_best['C'],
                              solver='saga',
                              max_iter=lr_best['max_iter'],
                              penalty='l1')
evaluate(lr_model,  X_tree_train, y_tree_train, X_tree_test, y_tree_test)

Model:  LogisticRegression
Accuracy:  0.7402413058907026
Precision:  0.7693423061320217
Recall:  0.7402413058907026
F1 Score:  0.693998052756342
####################################


-0.693998052756342