# Imports and model functions

In [124]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

from scipy.stats import randint, uniform
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, make_scorer

import time
import psutil
import threading
from memory_profiler import memory_usage

In [125]:
def apply_lgbm(X_train, y_train, best_params=None, n_jobs=-1, cv=5):
    """Fit a LightGBM classifier while sampling CPU usage, then cross-validate it.

    Args:
        X_train: Numeric feature matrix (anything ``np.isnan`` accepts).
        y_train: Training labels.
        best_params: Optional dict of ``LGBMClassifier`` keyword arguments.
            It is copied; the caller's dict is never modified.
        n_jobs: Parallelism passed to both the classifier and ``cross_val_score``.
        cv: Cross-validation strategy passed to ``cross_val_score``.

    Returns:
        ``(cv_scores, measurement, model)`` on success: the weighted-F1 score of
        each fold, a dict with training time and CPU statistics, and the
        classifier fitted on all of ``X_train``.
        ``(None, None, None)`` if validation, model construction, fitting or
        scoring raises; the error message is printed.
    """
    measurement = {}
    try:
        # Copy so that clamping num_leaves below never leaks into the caller's
        # dict (or into a shared default argument of a wrapper function).
        params = dict(best_params or {})

        # Ensure no NaN/Inf values
        if np.any(np.isnan(X_train)) or np.any(np.isinf(X_train)):
            raise ValueError("Input contains NaN or Inf values")

        # Cap num_leaves at the number of leaves a tree of max_depth can hold.
        # max_depth <= 0 (or missing) means "no depth limit" in LightGBM, so
        # there is nothing to cap; 2**-1 would otherwise give num_leaves=0.5.
        max_depth = params.get('max_depth', -1)
        if max_depth is not None and max_depth > 0:
            max_leaves = 2 ** max_depth
            if params.get('num_leaves', 31) > max_leaves:
                params['num_leaves'] = max_leaves

        lgbm_model = LGBMClassifier(**params, verbose=-1, n_jobs=n_jobs)

    except Exception as e:
        print(f"LightGBM training failed: {str(e)}")
        return None, None, None

    cpu_usage = []
    stop_flag = threading.Event()

    def monitor_cpu():
        # Each call blocks for 0.1 s, so this samples roughly ten times a second.
        while not stop_flag.is_set():
            cpu_usage.append(psutil.cpu_percent(interval=0.1))

    try:
        # daemon=True so a stuck sampler can never keep the process alive.
        cpu_thread = threading.Thread(target=monitor_cpu, daemon=True)
        cpu_thread.start()
        start_time = time.time()
        try:
            lgbm_model.fit(X_train, y_train)
            training_time = time.time() - start_time
        finally:
            # Always stop the sampler, even when fit() raises; otherwise the
            # thread would keep polling for the rest of the kernel's life.
            stop_flag.set()
            cpu_thread.join()

        # Record metrics
        measurement['Training Time (s)'] = training_time
        measurement['Peak CPU (%)'] = max(cpu_usage) if cpu_usage else 0
        measurement['Avg CPU (%)'] = np.mean(cpu_usage) if cpu_usage else 0

        # Cross-validation (refits clones of the model on each fold)
        f1_scorer = make_scorer(f1_score, average='weighted')
        cv_scores = cross_val_score(
            lgbm_model, X_train, y_train, cv=cv, scoring=f1_scorer, n_jobs=n_jobs
        )
        # cross_val_score reports a fold whose fit or scoring failed as NaN
        # instead of raising; surface that instead of passing it on silently.
        if np.isnan(cv_scores).any():
            print("Warning: at least one cross-validation fold failed; CV scores contain NaN")

        return cv_scores, measurement, lgbm_model

    except Exception as e:
        print(f"LightGBM training failed: {str(e)}")
        return None, None, None

In [126]:
def eval_dataset_w_LGBM(X_train, X_test, y_train, y_test,
                       params_lgbm={'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': -1}):
    """Train LightGBM with fixed parameters and print hold-out metrics.

    Args:
        X_train, y_train: Training features and labels, passed to ``apply_lgbm``.
        X_test, y_test: Hold-out features and labels used for the report.
        params_lgbm: ``LGBMClassifier`` keyword arguments. The dict is copied
            before use, so neither the caller's dict nor the shared default
            can be altered by the training helper.

    Returns:
        None. Prints prediction time, cross-validated F1, test accuracy, the
        classification report and the resource measurements, or a failure
        message if training did not succeed.
    """
    cv_scores_lgbm, measurement_lgbm, lgbm_model = apply_lgbm(
        X_train, y_train, best_params=dict(params_lgbm)
    )

    if lgbm_model is None:
        print("⛔ Failed to train LightGBM model")
        return

    # This times inference on the test set, not training.
    start_time = time.time()
    y_pred_lgbm = lgbm_model.predict(X_test)
    predict_time = time.time() - start_time
    print("Predict Time (s) - ", predict_time)

    print(f'CV F1: {np.mean(cv_scores_lgbm):.4f} ± {np.std(cv_scores_lgbm):.4f}')
    print(f'Test Accuracy: {accuracy_score(y_test, y_pred_lgbm):.4f}')
    print(classification_report(y_test, y_pred_lgbm, digits=4))
    print("Resource Usage:", measurement_lgbm)

In [127]:
import optuna
from functools import partial

def show_results_LGBM(X_train, X_test, y_train, y_test, n_trials=100, random_state=42):
    """Tune LightGBM with Optuna, refit with the best parameters and report metrics.

    Args:
        X_train, y_train: Data used for tuning (cross-validated weighted F1
            from ``apply_lgbm``) and for the final fit.
        X_test, y_test: Hold-out data used only for the final report.
        n_trials: Number of Optuna trials.
        random_state: Seed for the TPE sampler so a re-run explores the same
            parameter sequence. Pass ``None`` for an unseeded search.

    Returns:
        ``(model, best_params)`` on success. ``(None, None)`` if no trial
        completed or the final fit failed, so callers that unpack two values
        do not crash on the failure path.

    Raises:
        Whatever the metric/report functions raise during evaluation; the
        error is printed and re-raised.
    """
    # Computed once: the label set does not change between trials.
    is_multiclass = len(np.unique(y_train)) > 2
    fixed_params = {
        # LightGBM only applies `subsample` when bagging is enabled through a
        # non-zero subsample_freq; without it the tuned value has no effect.
        'subsample_freq': 1,
        'objective': 'multiclass' if is_multiclass else 'binary',
        'metric': 'multi_logloss' if is_multiclass else 'binary_logloss',
    }

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'num_leaves': trial.suggest_int('num_leaves', 10, 100),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
            **fixed_params,
        }

        try:
            cv_scores, _, _ = apply_lgbm(X_train, y_train, best_params=params)
        except Exception as e:
            print(f"Trial failed with error: {e}")
            return float('nan')
        # A NaN objective value makes Optuna record the trial as failed.
        if cv_scores is None:
            return float('nan')
        return float(np.mean(cv_scores))

    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study.optimize(objective, n_trials=n_trials)

    try:
        # Keep subsample_freq with the tuned values so the final model (and
        # anyone reusing the returned dict) trains the way the trials did.
        best_params = {**study.best_params, 'subsample_freq': fixed_params['subsample_freq']}
    except ValueError:
        # Raised by Optuna when no trial finished successfully.
        print("LightGBM tuning failed: no trial completed successfully")
        return None, None

    cv_scores_lgbm, measurement_lgbm, lgbm_model = apply_lgbm(X_train, y_train, best_params=best_params)

    if cv_scores_lgbm is None:
        print("LightGBM training failed")
        return None, None

    y_pred_lgbm = lgbm_model.predict(X_test)

    try:
        f1 = f1_score(y_test, y_pred_lgbm, average='weighted')
        accuracy = accuracy_score(y_test, y_pred_lgbm)

        print("\nLightGBM Evaluation Results:")
        print("-" * 50)
        print(f'CV F1: {np.mean(cv_scores_lgbm):.4f} ± {np.std(cv_scores_lgbm):.4f}')
        print(f'Test F1: {f1:.4f}')
        print(f'Test Accuracy: {accuracy:.4f}')
        print("\nResource Usage:", measurement_lgbm)
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred_lgbm))

    except Exception as e:
        print(f"Error in LightGBM evaluation: {str(e)}")
        raise

    return lgbm_model, best_params

# Data preparation for model training (CICIDS2018)

In [128]:
# Reading data
# Forward slash instead of "..\c": "\c" is an invalid escape sequence (Python
# warns about it) and a backslash path only resolves on Windows.
df = pd.read_csv("../cicids2018_training.csv")

  df = pd.read_csv("..\cicids2018_training.csv")


In [129]:
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [130]:
# Separate features from the 'Attack Type' target, then hold out 30% of the
# rows for testing, stratified on the target so both splits keep its class mix.
X = df.drop(columns='Attack Type')
y = df['Attack Type']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [131]:
# Learn the min-max range from the training split only, then apply that same
# scaling to both splits.
MMS = MinMaxScaler().fit(X_train)
X_train_MMS_scaled = MMS.transform(X_train)
X_test_MMS_scaled = MMS.transform(X_test)

In [132]:
# List the distinct class labels present in y_train.
y_train.unique()

array(['Normal Traffic', 'Brute Force', 'DoS', 'Bot', 'DDoS',
       'Infilteration'], dtype=object)

In [134]:
# Undersample 'Normal Traffic' to 1,000,000 rows; the other classes are not
# listed in the strategy and keep their row counts.
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 1000000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

In [135]:
# Oversample each attack class with SMOTE up to the listed row count, starting
# from the undersampled training set.
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bot': 150000,
        'Brute Force': 100000,
        'Infilteration': 110000,
        'DDoS': 780000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

# Align class labels with the CICIDS2017 naming

In [92]:
# List the distinct class labels in the resampled training target.
y_train_resampled_scaled_MMS_SMOTE.unique()

array(['Bot', 'Brute Force', 'DDoS', 'DoS', 'Infilteration',
       'Normal Traffic'], dtype=object)

In [93]:
# Function to combine classes
def combine_classes(y, class_mapping):
    """Rename or merge the class labels of ``y`` according to ``class_mapping``.

    Args:
        y: pandas Series of class labels.
        class_mapping: dict mapping each original label to its new label.

    Returns:
        A new Series with the same index as ``y`` and the mapped labels.
        Labels that already equal one of the mapping's target names are kept
        unchanged, so applying the function twice gives the same result as
        applying it once. Missing values stay missing.

    Raises:
        ValueError: if ``y`` contains a label that is neither a key nor a
            target of ``class_mapping``. A plain ``Series.map`` would turn
            such labels into NaN without any warning.
    """
    # Targets map to themselves; explicit entries win when a name is both a
    # source and a target.
    identity = {target: target for target in class_mapping.values()}
    lookup = {**identity, **class_mapping}

    combined = y.map(lookup)
    unknown = y[combined.isna() & y.notna()].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Labels missing from class_mapping: {sorted(str(label) for label in unknown)}"
        )
    return combined
# Define the mapping
group_mapping_2018 = {
    'Normal Traffic': 'Normal Traffic',
    'DoS': 'DoS',
    'DDoS': 'DDoS',
    'Brute Force': 'Brute Force',
    'Bot': 'Bots',
    'Infilteration': 'Other'}

In [94]:
# List the distinct labels of the 'Attack Type' column in the loaded frame.
df["Attack Type"].unique()

array(['Normal Traffic', 'Bot', 'DoS', 'Brute Force', 'DDoS',
       'Infilteration'], dtype=object)

In [95]:
# Relabel every target series (raw splits, undersampled, SMOTE-resampled) with
# the same mapping so they all share one label vocabulary.
(
    y_train,
    y_test,
    y_train_scaled_rus_MMS,
    y_train_resampled_scaled_MMS_SMOTE,
) = (
    combine_classes(labels, group_mapping_2018)
    for labels in (
        y_train,
        y_test,
        y_train_scaled_rus_MMS,
        y_train_resampled_scaled_MMS_SMOTE,
    )
)

In [None]:
# Run the 60-trial hyperparameter search on the resampled training data and
# report the tuned model on the scaled test split.
lbgm_model, best_params = show_results_LGBM(
    X_train=X_train_resampled_scaled_MMS_SMOTE,
    X_test=X_test_MMS_scaled,
    y_train=y_train_resampled_scaled_MMS_SMOTE,
    y_test=y_test,
    n_trials=60,
)

In [96]:
# Train and evaluate once with a fixed, previously chosen parameter set.
eval_dataset_w_LGBM(
    X_train=X_train_resampled_scaled_MMS_SMOTE,
    X_test=X_test_MMS_scaled,
    y_train=y_train_resampled_scaled_MMS_SMOTE,
    y_test=y_test,
    params_lgbm={
        'n_estimators': 228,
        'learning_rate': 0.07241523535942174,
        'max_depth': 14,
        'num_leaves': 79,
        'subsample': 0.5650088660864082,
        'colsample_bytree': 0.8850730957587873,
        'reg_alpha': 0.31650105405212536,
        'reg_lambda': 3.8724602641849213,
    },
)



Predict Time (s) -  22.58268141746521
CV F1: nan ± nan
Test Accuracy: 0.9753
                precision    recall  f1-score   support

          Bots     0.9911    0.9975    0.9943     43193
   Brute Force     0.9970    0.9954    0.9962     28463
          DDoS     0.9291    0.9923    0.9596    232641
           DoS     0.9877    0.9995    0.9936     58890
Normal Traffic     0.9907    0.9810    0.9858   2590259
         Other     0.2364    0.3000    0.2644     32259

      accuracy                         0.9753   2985705
     macro avg     0.8553    0.8776    0.8657   2985705
  weighted avg     0.9778    0.9753    0.9764   2985705

Resource Usage: {'Training Time (s)': 50.53769135475159, 'Peak CPU (%)': 100.0, 'Avg CPU (%)': np.float64(99.42433537832311)}


# Binary classification: cross-dataset validation (train on CICIDS2018, test on CICIDS2017)

In [194]:
# Reading data
# Forward slash instead of "..\c": "\c" is an invalid escape sequence (Python
# warns about it) and a backslash path only resolves on Windows.
df = pd.read_csv("../cicids2018_training.csv")

X_train = df.drop('Attack Type', axis=1)
y_train = df['Attack Type']

# Drop the name of the full frame; X_train / y_train are what the section uses.
del df

  df = pd.read_csv("..\cicids2018_training.csv")


In [195]:
# Reading data
# Forward slash instead of "..\c": "\c" is an invalid escape sequence (Python
# warns about it) and a backslash path only resolves on Windows.
df = pd.read_csv("../cicids2017_test_of_2018.csv")

X_test = df.drop('Attack Type', axis=1)
y_test = df['Attack Type']

# Drop the name of the full frame; X_test / y_test are what the section uses.
del df

  df = pd.read_csv("..\cicids2017_test_of_2018.csv")


In [196]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

In [197]:
# Learn the min-max range from the training set only, then apply that same
# scaling to the training set and to the separately loaded test set.
MMS = MinMaxScaler().fit(X_train)
X_train_MMS_scaled = MMS.transform(X_train)
X_test_MMS_scaled = MMS.transform(X_test)

In [198]:
# Undersample 'Normal Traffic' to 1,000,000 rows; the other classes are not
# listed in the strategy and keep their row counts.
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 1000000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

In [199]:
# Oversample each attack class with SMOTE up to the listed row count, starting
# from the undersampled training set.
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bot': 150000,
        'Brute Force': 100000,
        'Infilteration': 110000,
        'DDoS': 780000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

In [200]:
# Binary target for the 2017-style labels: normal traffic is 'BENIGN', every
# attack family collapses into 'Attack'.
group_mapping_2017 = {
    'Normal Traffic': 'BENIGN',
    **dict.fromkeys(('DoS', 'DDoS', 'Brute Force', 'Bots', 'Other'), 'Attack'),
}

In [201]:
# Binary target for the 2018-style labels: normal traffic is 'BENIGN', every
# attack family collapses into 'Attack'.
group_mapping_2018 = {
    'Normal Traffic': 'BENIGN',
    **dict.fromkeys(('DoS', 'DDoS', 'Brute Force', 'Bot', 'Infilteration'), 'Attack'),
}

In [202]:
# Show how many rows each class has in the resampled training target.
y_train_resampled_scaled_MMS_SMOTE.value_counts()

Attack Type
Normal Traffic    1000000
DDoS               780000
DoS                200000
Bot                150000
Infilteration      110000
Brute Force        100000
Name: count, dtype: int64

In [203]:
# Show how many rows each class has in the test target.
y_test.value_counts()

Attack Type
Normal Traffic    2095057
DoS                193745
DDoS               128014
Other               92837
Brute Force          9150
Bots                 1948
Name: count, dtype: int64

In [204]:
# Collapse the test labels to BENIGN/Attack. Series.map turns any label that is
# not a key of the mapping into NaN, which is what happens when this cell is
# run a second time, so check first and fail loudly instead.
assert y_test.isin(list(group_mapping_2017)).all(), \
    "y_test contains labels that are not keys of group_mapping_2017 (was this cell already run?)"
y_test = y_test.map(group_mapping_2017)

In [205]:
# Collapse the training labels to BENIGN/Attack. Series.map turns any label
# that is not a key of the mapping into NaN, which is what happens when this
# cell is run a second time, so check first and fail loudly instead.
assert y_train_resampled_scaled_MMS_SMOTE.isin(list(group_mapping_2018)).all(), \
    "training labels are not all keys of group_mapping_2018 (was this cell already run?)"
y_train_resampled_scaled_MMS_SMOTE = y_train_resampled_scaled_MMS_SMOTE.map(group_mapping_2018)

In [206]:
# Show the per-class row counts of the training target after the binary mapping.
y_train_resampled_scaled_MMS_SMOTE.value_counts()

Attack Type
Attack    1340000
BENIGN    1000000
Name: count, dtype: int64

In [208]:
# List the distinct labels left in the training target.
y_train_resampled_scaled_MMS_SMOTE.unique()

array(['Attack', 'BENIGN'], dtype=object)

In [207]:
# Show the per-class row counts of the test target after the binary mapping.
y_test.value_counts()

Attack Type
BENIGN    2095057
Attack     425694
Name: count, dtype: int64

In [None]:
# Run the 60-trial hyperparameter search on the resampled training data and
# report the tuned model on the scaled test set.
lbgm_model, best_params = show_results_LGBM(
    X_train=X_train_resampled_scaled_MMS_SMOTE,
    X_test=X_test_MMS_scaled,
    y_train=y_train_resampled_scaled_MMS_SMOTE,
    y_test=y_test,
    n_trials=60,
)

In [209]:
# Train and evaluate once with a fixed, previously chosen parameter set.
eval_dataset_w_LGBM(
    X_train=X_train_resampled_scaled_MMS_SMOTE,
    X_test=X_test_MMS_scaled,
    y_train=y_train_resampled_scaled_MMS_SMOTE,
    y_test=y_test,
    params_lgbm={
        'n_estimators': 285,
        'learning_rate': 0.15274828247019778,
        'max_depth': 5,
        'num_leaves': 44,
        'subsample': 0.6210331060028171,
        'colsample_bytree': 0.9909317475119969,
        'reg_alpha': 0.4761330684134722,
        'reg_lambda': 1.8813380938652553,
    },
)



Predict Time (s) -  2.3496997356414795
CV F1: nan ± nan
Test Accuracy: 0.6524
              precision    recall  f1-score   support

      Attack     0.2236    0.4283    0.2938    425694
      BENIGN     0.8573    0.6979    0.7694   2095057

    accuracy                         0.6524   2520751
   macro avg     0.5405    0.5631    0.5316   2520751
weighted avg     0.7503    0.6524    0.6891   2520751

Resource Usage: {'Training Time (s)': 8.727250099182129, 'Peak CPU (%)': 100.0, 'Avg CPU (%)': np.float64(96.44810126582279)}


# Binary classification: hold-out validation on a single dataset (CICIDS2018)

In [210]:
# Reading data
# Forward slash instead of "..\c": "\c" is an invalid escape sequence (Python
# warns about it) and a backslash path only resolves on Windows.
df = pd.read_csv("../cicids2018_training.csv")

# Preparing training and test splits
X = df.drop('Attack Type', axis=1)
y = df['Attack Type']

# 70/30 split, stratified on the target so both splits keep its class mix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

  df = pd.read_csv("..\cicids2018_training.csv")


In [211]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

In [212]:
# Learn the min-max range from the training split only, then apply that same
# scaling to both splits.
MMS = MinMaxScaler().fit(X_train)
X_train_MMS_scaled = MMS.transform(X_train)
X_test_MMS_scaled = MMS.transform(X_test)

In [213]:
# Undersample 'Normal Traffic' to 1,000,000 rows; the other classes are not
# listed in the strategy and keep their row counts.
X_train_resampled_rus, y_train_resampled_rus = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 1000000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

In [214]:
# Oversample each attack class with SMOTE up to the listed row count, starting
# from the undersampled training set.
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bot': 150000,
        'Brute Force': 100000,
        'Infilteration': 110000,
        'DDoS': 780000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_resampled_rus, y_train_resampled_rus)

In [215]:
# Binary target for the 2018-style labels: normal traffic is 'BENIGN', every
# attack family collapses into 'Attack'.
group_mapping_2018 = {
    'Normal Traffic': 'BENIGN',
    **dict.fromkeys(('DoS', 'DDoS', 'Brute Force', 'Bot', 'Infilteration'), 'Attack'),
}

In [216]:
# Collapse train and test labels to BENIGN/Attack. Series.map turns any label
# that is not a key of the mapping into NaN, which is what happens when this
# cell is run a second time, so check both series first and fail loudly.
assert y_train_resampled_scaled_MMS_SMOTE.isin(list(group_mapping_2018)).all(), \
    "training labels are not all keys of group_mapping_2018 (was this cell already run?)"
assert y_test.isin(list(group_mapping_2018)).all(), \
    "y_test contains labels that are not keys of group_mapping_2018 (was this cell already run?)"
y_train_resampled_scaled_MMS_SMOTE = y_train_resampled_scaled_MMS_SMOTE.map(group_mapping_2018)
y_test = y_test.map(group_mapping_2018)

In [217]:
# List the distinct labels left in the training target.
y_train_resampled_scaled_MMS_SMOTE.unique()

array(['Attack', 'BENIGN'], dtype=object)

In [218]:
# List the distinct labels left in the test target.
y_test.unique()

array(['BENIGN', 'Attack'], dtype=object)

In [None]:
# Run the 60-trial hyperparameter search on the resampled training data and
# report the tuned model on the scaled test split.
lbgm_model, best_params = show_results_LGBM(
    X_train=X_train_resampled_scaled_MMS_SMOTE,
    X_test=X_test_MMS_scaled,
    y_train=y_train_resampled_scaled_MMS_SMOTE,
    y_test=y_test,
    n_trials=60,
)

In [219]:
# Train and evaluate once with a fixed, previously chosen parameter set.
eval_dataset_w_LGBM(
    X_train=X_train_resampled_scaled_MMS_SMOTE,
    X_test=X_test_MMS_scaled,
    y_train=y_train_resampled_scaled_MMS_SMOTE,
    y_test=y_test,
    params_lgbm={
        'n_estimators': 285,
        'learning_rate': 0.15274828247019778,
        'max_depth': 5,
        'num_leaves': 44,
        'subsample': 0.6210331060028171,
        'colsample_bytree': 0.9909317475119969,
        'reg_alpha': 0.4761330684134722,
        'reg_lambda': 1.8813380938652553,
    },
)



Predict Time (s) -  2.8006386756896973
CV F1: nan ± nan
Test Accuracy: 0.9748
              precision    recall  f1-score   support

      Attack     0.8795    0.9382    0.9079    395446
      BENIGN     0.9905    0.9804    0.9854   2590259

    accuracy                         0.9748   2985705
   macro avg     0.9350    0.9593    0.9467   2985705
weighted avg     0.9758    0.9748    0.9751   2985705

Resource Usage: {'Training Time (s)': 9.987603187561035, 'Peak CPU (%)': 100.0, 'Avg CPU (%)': np.float64(96.61397849462367)}


# Multiclass classification: cross-dataset validation (train on CICIDS2018, test on CICIDS2017)

In [221]:
# Reading data
# Forward slash instead of "..\c": "\c" is an invalid escape sequence (Python
# warns about it) and a backslash path only resolves on Windows.
df = pd.read_csv("../cicids2018_training.csv")

X_train = df.drop('Attack Type', axis=1)
y_train = df['Attack Type']

# Drop the name of the full frame; X_train / y_train are what the section uses.
del df

  df = pd.read_csv("..\cicids2018_training.csv")


In [222]:
# Reading data
# Forward slash instead of "..\c": "\c" is an invalid escape sequence (Python
# warns about it) and a backslash path only resolves on Windows.
df = pd.read_csv("../cicids2017_test_of_2018.csv")

X_test = df.drop('Attack Type', axis=1)
y_test = df['Attack Type']

# Drop the name of the full frame; X_test / y_test are what the section uses.
del df

  df = pd.read_csv("..\cicids2017_test_of_2018.csv")


In [223]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

In [224]:
# Learn the min-max range from the training set only, then apply that same
# scaling to the training set and to the separately loaded test set.
MMS = MinMaxScaler().fit(X_train)
X_train_MMS_scaled = MMS.transform(X_train)
X_test_MMS_scaled = MMS.transform(X_test)

In [225]:
# Undersample 'Normal Traffic' to 1,000,000 rows; the other classes are not
# listed in the strategy and keep their row counts.
X_train_resampled_rus, y_train_resampled_rus = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 1000000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

In [226]:
# Oversample each attack class with SMOTE up to the listed row count.
# The input is the undersampled pair produced by the previous cell of this
# section; the original line read X_train_scaled_rus_MMS / y_train_scaled_rus_MMS,
# which are variables of an earlier section and are undefined (or stale) here.
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bot': 150000,
        'Brute Force': 100000,
        'Infilteration': 110000,
        'DDoS': 780000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_resampled_rus, y_train_resampled_rus)

In [227]:
# Rename the 2018 labels that are spelled differently in the 2017 test set
# ('Bot' -> 'Bots', 'Infilteration' -> 'Other'); the rest map to themselves.
group_mapping_2018 = {
    **{label: label for label in ('Normal Traffic', 'DoS', 'DDoS', 'Brute Force')},
    'Bot': 'Bots',
    'Infilteration': 'Other',
}

In [228]:
# Rename the training labels to the 2017 vocabulary. Series.map turns any label
# that is not a key of the mapping into NaN ('Bots' and 'Other' are not keys,
# so a second run of this cell would erase them); check first and fail loudly.
assert y_train_resampled_scaled_MMS_SMOTE.isin(list(group_mapping_2018)).all(), \
    "training labels are not all keys of group_mapping_2018 (was this cell already run?)"
y_train_resampled_scaled_MMS_SMOTE = y_train_resampled_scaled_MMS_SMOTE.map(group_mapping_2018)

In [229]:
# List the distinct labels of the training target after renaming.
y_train_resampled_scaled_MMS_SMOTE.unique()

array(['Bots', 'Brute Force', 'DDoS', 'DoS', 'Other', 'Normal Traffic'],
      dtype=object)

In [230]:
# List the distinct labels of the test target, for comparison with the training labels.
y_test.unique()

array(['Normal Traffic', 'DDoS', 'Other', 'Bots', 'Brute Force', 'DoS'],
      dtype=object)

In [None]:
# Run the 60-trial hyperparameter search on the resampled training data and
# report the tuned model on the scaled test set.
lbgm_model, best_params = show_results_LGBM(
    X_train=X_train_resampled_scaled_MMS_SMOTE,
    X_test=X_test_MMS_scaled,
    y_train=y_train_resampled_scaled_MMS_SMOTE,
    y_test=y_test,
    n_trials=60,
)

In [231]:
# Train and evaluate once with a fixed, previously chosen parameter set.
eval_dataset_w_LGBM(
    X_train=X_train_resampled_scaled_MMS_SMOTE,
    X_test=X_test_MMS_scaled,
    y_train=y_train_resampled_scaled_MMS_SMOTE,
    y_test=y_test,
    params_lgbm={
        'n_estimators': 475,
        'learning_rate': 0.058716115604362774,
        'max_depth': 12,
        'num_leaves': 11,
        'subsample': 0.5057428968514667,
        'colsample_bytree': 0.8101229173073474,
        'reg_alpha': 0.006541575882940611,
        'reg_lambda': 0.02825941211887404,
    },
)



Predict Time (s) -  23.394148349761963
CV F1: nan ± nan
Test Accuracy: 0.7929
                precision    recall  f1-score   support

          Bots     0.0000    0.0000    0.0000      1948
   Brute Force     0.0000    0.0000    0.0000      9150
          DDoS     0.0000    0.0000    0.0000    128014
           DoS     0.2763    0.1270    0.1740    193745
Normal Traffic     0.8808    0.9011    0.8909   2095057
         Other     0.3123    0.9275    0.4672     92837

      accuracy                         0.7929   2520751
     macro avg     0.2449    0.3259    0.2553   2520751
  weighted avg     0.7648    0.7929    0.7710   2520751

Resource Usage: {'Training Time (s)': 71.41535210609436, 'Peak CPU (%)': 100.0, 'Avg CPU (%)': np.float64(99.51814223512336)}
