# Imports and model functions

In [48]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
from joblib import parallel_backend

from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, make_scorer

import time
import psutil
import threading
from memory_profiler import memory_usage

import traceback
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import optuna

In [49]:
def apply_adaboost(X_train, y_train, best_params=None, random_state=42, n_jobs=15, cv=5):
    """Fit an AdaBoost classifier on decision trees and cross-validate it.

    While the model is being fitted, a background thread samples
    ``psutil.cpu_percent`` so that training time and CPU load can be reported.

    Parameters
    ----------
    X_train, y_train :
        Training features and labels, passed unchanged to ``fit`` and
        ``cross_val_score``.
    best_params : dict, optional
        Hyperparameters. Keys starting with ``'base_'`` are forwarded (with the
        prefix stripped) to ``DecisionTreeClassifier``; all other keys are
        forwarded to ``AdaBoostClassifier``. ``None`` means library defaults.
    random_state : int
        Seed used for both the tree and the boosting model.
    n_jobs : int
        Worker count for the joblib ``loky`` backend and for ``cross_val_score``.
    cv : int
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    tuple
        ``(cv_scores, measurements, fitted_model)``; ``measurements`` holds the
        keys ``'Training Time (s)'``, ``'Peak CPU Usage (%)'`` and
        ``'Average CPU Usage (%)'``. If fitting or cross-validation raises an
        ``Exception``, the traceback is printed and ``(None, None, None)`` is
        returned.
    """
    import inspect  # local import: only needed to pick the estimator keyword

    measurement_ada = {}
    best_params = best_params or {}

    # Split the flat parameter dict; only the leading 'base_' marker is removed
    # so a key that happens to contain 'base_' elsewhere is left intact.
    prefix = 'base_'
    tree_params = {k[len(prefix):]: v for k, v in best_params.items()
                   if k.startswith(prefix)}
    ada_params = {k: v for k, v in best_params.items()
                  if not k.startswith(prefix)}

    # DecisionTreeClassifier does not accept n_jobs, so it is not passed.
    base_estimator = DecisionTreeClassifier(random_state=random_state, **tree_params)

    # scikit-learn renamed `base_estimator` to `estimator` (1.2) and later
    # removed the old name (1.4); use whichever the installed version accepts.
    init_params = inspect.signature(AdaBoostClassifier.__init__).parameters
    estimator_kw = 'estimator' if 'estimator' in init_params else 'base_estimator'

    # AdaBoostClassifier does not accept n_jobs either.
    ada_model = AdaBoostClassifier(random_state=random_state,
                                   **{estimator_kw: base_estimator},
                                   **ada_params)

    with parallel_backend('loky', n_jobs=n_jobs):
        try:
            cpu_usage = []
            stop_flag = threading.Event()

            def monitor_cpu():
                # Each call blocks for ~0.1 s, so this is also the polling period.
                while not stop_flag.is_set():
                    cpu_usage.append(psutil.cpu_percent(interval=0.1))

            cpu_thread = threading.Thread(target=monitor_cpu, daemon=True)
            cpu_thread.start()

            start_time = time.perf_counter()
            try:
                ada_model.fit(X_train, y_train)
                training_time = time.perf_counter() - start_time
            finally:
                # Always stop the sampler, even when fit() fails or the cell is
                # interrupted; otherwise the thread would keep polling forever.
                stop_flag.set()
                cpu_thread.join()

            measurement_ada['Training Time (s)'] = training_time
            # A very short fit may finish before the first sample is recorded.
            measurement_ada['Peak CPU Usage (%)'] = max(cpu_usage, default=0)
            measurement_ada['Average CPU Usage (%)'] = sum(cpu_usage) / len(cpu_usage) if cpu_usage else 0

            f1_scorer = make_scorer(f1_score, average='weighted')

            # Parallel processing is only available for cross-validation
            cv_scores_ada = cross_val_score(ada_model, X_train, y_train,
                                            cv=cv,
                                            n_jobs=n_jobs,
                                            scoring=f1_scorer,
                                            verbose=1)

            return cv_scores_ada, measurement_ada, ada_model

        except Exception as e:
            print("⛔ Full error traceback:")
            traceback.print_exc()
            print(f"Error during AdaBoost training: {e}")
            return None, None, None

In [50]:
def eval_dataset_w_ada(X_train, X_test, y_train, y_test, params_ada=None):
    """Train AdaBoost with fixed hyperparameters and print test-set metrics.

    Parameters
    ----------
    X_train, y_train :
        Training data forwarded to ``apply_adaboost``.
    X_test, y_test :
        Held-out data used for prediction timing, accuracy and the
        classification report.
    params_ada : dict, optional
        Hyperparameters in the ``apply_adaboost`` format (``'base_'``-prefixed
        keys go to the tree). When omitted, 50 estimators, learning rate 1.0,
        tree depth 3 and ``min_samples_split`` 2 are used.

    Returns
    -------
    None
        Results are printed. If training fails, a message is printed and the
        function returns early.
    """
    # Build the default inside the function so no dict is shared between calls.
    if params_ada is None:
        params_ada = {
            'n_estimators': 50,
            'learning_rate': 1.0,
            'base_max_depth': 3,
            'base_min_samples_split': 2
        }

    cv_scores_ada, measurement_ada, ada_model = apply_adaboost(X_train, y_train, best_params=params_ada)

    # apply_adaboost signals failure with (None, None, None).
    if ada_model is None:
        print("Model training failed")
        return

    start_time = time.time()
    y_pred_ada = ada_model.predict(X_test)
    predict_time = time.time() - start_time
    print("Predict Time (s) - ", predict_time)

    cv_scores_mean_ada = np.mean(cv_scores_ada)
    print(f'Cross validation average score: {cv_scores_mean_ada:.4f} +/- standard deviation: {np.std(cv_scores_ada):.4f}')

    accuracy_ada = accuracy_score(y_test, y_pred_ada)
    print(f'Accuracy on the test set: {accuracy_ada:.4f}')

    print("Resource measurements:", measurement_ada)
    print(classification_report(y_test, y_pred_ada, digits=4))

In [55]:
from functools import partial
def show_results_ada(X_train, X_test, y_train, y_test, n_trials=100):
    """Tune AdaBoost with Optuna, refit with the best parameters and report metrics.

    Parameters
    ----------
    X_train, y_train :
        Training data used for every trial and for the final refit.
    X_test, y_test :
        Held-out data used for the final evaluation only.
    n_trials : int
        Number of Optuna trials.

    Returns
    -------
    tuple
        ``(fitted_model, best_params)``. If the final refit fails,
        ``(None, best_params)`` is returned so that callers unpacking two
        values keep working.

    Raises
    ------
    Exception
        Any error raised while computing the test-set metrics is re-raised
        after diagnostic dtype information has been printed.
    """
    def objective(trial, X_train, y_train, cv=5):
        # Search space; 'base_'-prefixed keys are routed to the decision tree.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
            'base_max_depth': trial.suggest_int('base_max_depth', 1, 50),
            'base_min_samples_split': trial.suggest_int('base_min_samples_split', 2, 20),
            'base_min_samples_leaf': trial.suggest_int('base_min_samples_leaf', 1, 10),
        }

        cv_scores, _, _ = apply_adaboost(X_train, y_train, best_params=params, cv=cv)
        if cv_scores is None:
            # A failed trial gets the worst possible score instead of aborting the study.
            return 0
        return np.mean(cv_scores)

    study = optuna.create_study(direction='maximize')
    # NOTE(review): 15 concurrent trials each request 15 workers inside
    # apply_adaboost; this may oversubscribe the CPU — confirm this is intended.
    with parallel_backend('loky', n_jobs=15):
        study.optimize(lambda trial: objective(trial, X_train, y_train),
                       n_trials=n_trials,
                       n_jobs=15)

    best_params = study.best_params

    cv_scores_ada, measurement_ada, ada_model = apply_adaboost(X_train, y_train, best_params=best_params)

    if cv_scores_ada is None:
        print("Model training failed")
        return None, best_params

    y_pred_ada = ada_model.predict(X_test)

    # Work on plain arrays so np.unique and .dtype behave the same whether the
    # labels arrive as a pandas Series, a list or an ndarray.
    y_test_array = np.asarray(y_test)
    y_pred_array = np.asarray(y_pred_ada)

    print("\nUnique values in test set:", np.unique(y_test_array))
    print("Unique values in predictions:", np.unique(y_pred_array))

    cv_scores_mean_ada = np.mean(cv_scores_ada)

    try:
        f1 = f1_score(y_test_array, y_pred_array, average='weighted')
        accuracy = accuracy_score(y_test_array, y_pred_array)

        print("\nModel Evaluation Results:")
        print("-" * 50)
        print(f'Cross validation average score (F1): {cv_scores_mean_ada:.4f} +/- standard deviation: {np.std(cv_scores_ada):.4f}')
        print(f'F1 Score on test set: {f1:.4f}')
        print(f'Accuracy on test set: {accuracy:.4f}')
        print("\nResource Usage:")
        print("-" * 50)
        print("Resource measurements:", measurement_ada)
        print("\nDetailed Classification Report:")
        print("-" * 50)
        print(classification_report(y_test_array, y_pred_array))

    except Exception as e:
        print(f"Error during metric calculation: {str(e)}")
        print("Types in test set:", y_test_array.dtype)
        print("Types in predictions:", y_pred_array.dtype)
        raise

    return ada_model, best_params

# Data preparation for model training (CICIDS2017)

In [6]:
# Read the prepared CICIDS2017 feature table.
# A raw string keeps the Windows-style backslashes literal; in a normal string
# "\.", "\d" and "\c" are invalid escape sequences that newer Python versions warn about.
df = pd.read_csv(r"..\..\data prep\cicids2017_prep\cicids2017_42feat_97percent.csv")

In [7]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

In [8]:
# Separate the label column from the features, then make a stratified 70/30 split
y = df['Attack Type']
X = df.drop(columns='Attack Type')

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [9]:
# Learn the min-max scaling from the training features only, then apply it to both splits
MMS = MinMaxScaler().fit(X_train)
X_train_MMS_scaled = MMS.transform(X_train)
X_test_MMS_scaled = MMS.transform(X_test)

In [10]:
# Randomly undersample 'Normal Traffic' down to 500,000 training rows
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    random_state=42,
    sampling_strategy={'Normal Traffic': 500000},
).fit_resample(X_train_MMS_scaled, y_train)

In [11]:
# Oversample the attack classes with SMOTE to the per-class target counts below
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    random_state=42,
    sampling_strategy={
        'Bots': 7500,
        'Web Attacks': 7500,
        'Brute Force': 7000,
        'Port Scanning': 70000,
        'DDoS': 90000,
        'DoS': 200000,
    },
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

# Merge attack classes into a common label set

In [12]:
# Function to combine classes
def combine_classes(y, class_mapping):
    """Relabel a pandas Series of class names according to ``class_mapping``.

    Parameters
    ----------
    y : pandas.Series
        Class labels.
    class_mapping : dict
        Maps an original label to its replacement label.

    Returns
    -------
    pandas.Series
        A new Series with mapped labels. Labels that are not keys of
        ``class_mapping`` are kept unchanged instead of becoming NaN, so
        applying the function to already-mapped labels (for example when a
        notebook cell is re-run) is harmless.
    """
    return y.map(lambda label: class_mapping.get(label, label))
# Label mapping: 'Web Attacks' and 'Port Scanning' are merged into 'Other';
# every remaining class keeps its own name.
class_mapping = {
    label: label
    for label in ('Normal Traffic', 'Bots', 'Brute Force', 'DDoS', 'DoS')
}
class_mapping.update({
    'Web Attacks': 'Other',
    'Port Scanning': 'Other',
})

In [13]:
# Distinct labels present in the raw data, in order of first appearance
pd.unique(df["Attack Type"])

array(['Normal Traffic', 'DDoS', 'Port Scanning', 'Bots', 'Web Attacks',
       'Brute Force', 'DoS'], dtype=object)

In [14]:
# Relabel every target vector (original splits and both resampled training sets)
# with the merged class names.
(
    y_train,
    y_test,
    y_train_scaled_rus_MMS,
    y_train_resampled_scaled_MMS_SMOTE,
) = (
    combine_classes(labels, class_mapping)
    for labels in (
        y_train,
        y_test,
        y_train_scaled_rus_MMS,
        y_train_resampled_scaled_MMS_SMOTE,
    )
)

In [15]:
# Sanity check: the test labels should now contain the merged class names
pd.unique(y_test)

array(['Normal Traffic', 'DoS', 'DDoS', 'Bots', 'Other', 'Brute Force'],
      dtype=object)

# Hyperparameter search on the MinMax-scaled, SMOTE-resampled training set

In [56]:
# Run the Optuna search (30 trials) on the resampled training set and
# evaluate the refitted model on the scaled test set.
rf_model, best_params = show_results_ada(
    X_train=X_train_resampled_scaled_MMS_SMOTE,
    X_test=X_test_MMS_scaled,
    y_train=y_train_resampled_scaled_MMS_SMOTE,
    y_test=y_test,
    n_trials=30,
)

[I 2025-05-05 22:39:14,621] A new study created in memory with name: no-name-c990a967-7e3a-4869-accf-7c4abe1be617
[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done   5 out of   5 | elapsed: 25.9min finished
[I 2025-05-05 23:34:34,153] Trial 11 finished with value: 0.8684359691143608 and parameters: {'n_estimators': 110, 'learning_rate': 0.01358617872079761, 'base_max_depth': 3, 'base_min_samples_split': 19, 'base_min_samples_leaf': 5}. Best is trial 11 with value: 0.8684359691143608.
[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent wor

KeyboardInterrupt: 

In [58]:
# Train and evaluate once with a fixed hyperparameter set
eval_dataset_w_ada(
    X_train=X_train_resampled_scaled_MMS_SMOTE,
    X_test=X_test_MMS_scaled,
    y_train=y_train_resampled_scaled_MMS_SMOTE,
    y_test=y_test,
    params_ada={
        'n_estimators': 185,
        'learning_rate': 0.23168053893179077,
        'base_max_depth': 15,
        'base_min_samples_split': 2,
        'base_min_samples_leaf': 1,
    },
)

[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done   5 out of   5 | elapsed: 76.0min finished


Predict Time (s) -  35.96842908859253
Cross validation average score: 0.9920 +/- standard deviation: 0.0136
Accuracy on the test set: 0.9989
Resource measurements: {'Training Time (s)': 3853.644735097885, 'Peak CPU Usage (%)': 86.5, 'Average CPU Usage (%)': 11.685697995120721}
                precision    recall  f1-score   support

          Bots     0.7245    0.9007    0.8031       584
   Brute Force     0.9996    0.9989    0.9993      2745
          DDoS     0.9998    0.9998    0.9998     38404
           DoS     0.9979    0.9991    0.9985     58124
Normal Traffic     0.9997    0.9990    0.9994    628518
         Other     0.9893    0.9968    0.9931     27851

      accuracy                         0.9989    756226
     macro avg     0.9518    0.9824    0.9655    756226
  weighted avg     0.9990    0.9989    0.9989    756226

