# Imports and model functions

In [19]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import randint, uniform
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, make_scorer

import time
import psutil
import threading
from memory_profiler import memory_usage

In [2]:
def apply_knn(X_train, y_train, best_params=None, n_jobs=-1, cv=5):
    """Fit a KNN classifier while sampling CPU usage, then cross-validate it.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; passed to both ``fit`` and
        ``cross_val_score``.
    best_params : dict, optional
        Keyword arguments for ``KNeighborsClassifier``. ``None`` or an empty
        dict means the estimator's defaults.
    n_jobs : int, default -1
        Parallelism of ``cross_val_score`` only. To parallelise the estimator
        itself, put ``'n_jobs'`` into ``best_params``.
    cv : int, default 5
        Forwarded as the ``cv`` argument of ``cross_val_score``.

    Returns
    -------
    tuple
        ``(cv_scores, measurement, knn_model)`` where ``cv_scores`` holds the
        weighted-F1 score of each fold, ``measurement`` is a dict with the keys
        ``'Training Time (s)'``, ``'Peak CPU (%)'`` and ``'Avg CPU (%)'``, and
        ``knn_model`` is the estimator fitted on the full ``X_train``.
        ``(None, None, None)`` if fitting or cross-validation raised; the
        error message is printed instead of propagated.

    Notes
    -----
    Timing and CPU sampling cover only the ``fit`` call, not the
    cross-validation that follows.
    """
    measurement = {}
    best_params = best_params or {}

    knn_model = KNeighborsClassifier(**best_params)
    cpu_usage = []
    stop_flag = threading.Event()

    def monitor_cpu():
        # Each psutil call blocks for 0.1 s, so this yields ~10 samples per second.
        while not stop_flag.is_set():
            cpu_usage.append(psutil.cpu_percent(interval=0.1))

    try:
        # Daemon thread: a stuck sampler must never keep the kernel alive.
        cpu_thread = threading.Thread(target=monitor_cpu, daemon=True)
        cpu_thread.start()
        start_time = time.time()

        try:
            knn_model.fit(X_train, y_train)
            training_time = time.time() - start_time
        finally:
            # Always stop the sampler, even when fit() raises; otherwise the
            # thread would keep polling for the rest of the session.
            stop_flag.set()
            cpu_thread.join()

        # Record metrics
        measurement['Training Time (s)'] = training_time
        measurement['Peak CPU (%)'] = max(cpu_usage) if cpu_usage else 0
        measurement['Avg CPU (%)'] = np.mean(cpu_usage) if cpu_usage else 0

        # Cross-validation (weighted F1 so class imbalance is reflected in the score)
        f1_scorer = make_scorer(f1_score, average='weighted')
        cv_scores = cross_val_score(
            knn_model, X_train, y_train, cv=cv, scoring=f1_scorer, n_jobs=n_jobs
        )

        return cv_scores, measurement, knn_model

    except Exception as e:
        # Best-effort contract: callers test the returned values for None.
        print(f"KNN training failed: {str(e)}")
        return None, None, None

In [3]:
def eval_dataset_w_KNN(X_train, X_test, y_train, y_test, 
                      params_knn={'n_neighbors': 5, 'weights': 'uniform', 'n_jobs': -1}):
    """Train KNN via ``apply_knn`` and print its CV and test-set metrics.

    ``params_knn`` is forwarded to ``apply_knn`` as ``best_params``. The
    function prints the mean and standard deviation of the CV F1 scores, the
    test accuracy, a 4-digit classification report and the resource
    measurements. It returns ``None`` in every case; when training fails only
    a failure message is printed.
    """
    fold_scores, resources, fitted = apply_knn(X_train, y_train, best_params=params_knn)

    # apply_knn signals failure by returning None for the model.
    if fitted is None:
        print("⛔ Failed to train KNN model")
        return

    predictions = fitted.predict(X_test)

    cv_mean = np.mean(fold_scores)
    cv_spread = np.std(fold_scores)
    print(f'CV F1: {cv_mean:.4f} ± {cv_spread:.4f}')

    test_accuracy = accuracy_score(y_test, predictions)
    print(f'Test Accuracy: {test_accuracy:.4f}')

    report = classification_report(y_test, predictions, digits=4)
    print(report)

    print("Resource Usage:", resources)

In [41]:
import optuna
from functools import partial

def show_results_KNN(X_train, X_test, y_train, y_test, n_trials=100):
    """Tune KNN hyperparameters with Optuna, refit the best setting and report it.

    Parameters
    ----------
    X_train, y_train
        Training data used for every trial (through ``apply_knn``) and for the
        final refit.
    X_test, y_test
        Hold-out data used only for the final report.
    n_trials : int, default 100
        Number of Optuna trials.

    Returns
    -------
    tuple
        ``(knn_model, best_params)``. If the final refit fails, a message is
        printed and ``(None, best_params)`` is returned, so callers that
        unpack two values keep working.

    Raises
    ------
    Exception
        Any error raised while computing or printing the test metrics is
        printed and then re-raised.
    """
    def objective(trial, X_train, y_train, cv=5):
        params = {
            'n_neighbors': trial.suggest_int('n_neighbors', 5, 50),
            'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
            'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
            'leaf_size': trial.suggest_int('leaf_size', 10, 100),
            'p': trial.suggest_int('p', 1, 2)  # 1=manhattan, 2=euclidean
        }

        cv_scores, _, _ = apply_knn(X_train, y_train, best_params=params, cv=cv)
        # A failed training scores 0 so the (maximising) study steers away from it.
        # NOTE(review): a NaN fold score makes the mean NaN; Optuna then logs the
        # trial as failed and continues - confirm that is the intended handling.
        return np.mean(cv_scores) if cv_scores is not None else 0

    study = optuna.create_study(direction='maximize')
    study.optimize(partial(objective, X_train=X_train, y_train=y_train), n_trials=n_trials)
    best_params = study.best_params

    # Refit with the winning parameters to obtain the model and its measurements.
    cv_scores_knn, measurement_knn, knn_model = apply_knn(X_train, y_train, best_params=best_params)

    if cv_scores_knn is None:
        print("KNN training failed")
        # Keep the 2-tuple shape: a bare `return` would break `model, params = ...`.
        return None, best_params

    y_pred_knn = knn_model.predict(X_test)
    y_test_array = np.array(y_test)
    y_pred_array = np.array(y_pred_knn)

    # Quick check that the label sets of truth and predictions line up.
    print("\nUnique values in test set:", np.unique(y_test_array))
    print("Unique values in predictions:", np.unique(y_pred_array))

    try:
        f1 = f1_score(y_test_array, y_pred_array, average='weighted')
        accuracy = accuracy_score(y_test_array, y_pred_array)

        print("\nKNN Evaluation Results:")
        print("-" * 50)
        print(f'CV F1: {np.mean(cv_scores_knn):.4f} ± {np.std(cv_scores_knn):.4f}')
        print(f'Test F1: {f1:.4f}')
        print(f'Test Accuracy: {accuracy:.4f}')
        print("\nResource Usage:", measurement_knn)
        print("\nClassification Report:")
        print(classification_report(y_test_array, y_pred_array))

    except Exception as e:
        print(f"Error in KNN evaluation: {str(e)}")
        raise

    return knn_model, best_params

# Data preparation for model training (CICIDS2017)

In [7]:
# Reading data
# Forward slashes avoid the invalid escape sequences (`\.`, `\d`, `\c`) of the
# backslash form and resolve on Windows as well as on POSIX systems.
df = pd.read_csv("../../data prep/cicids2017_prep/cicids2017_42feat_97percent.csv")

In [8]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

In [9]:
# Separate the label column from the features, then hold out 30% of the rows
# as a test set, stratified on the label.
y = df['Attack Type']
X = df.drop(columns='Attack Type')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [10]:
# Min-max scaling: learn the feature ranges from the training split only,
# then apply the same transform to both splits.
MMS = MinMaxScaler()
MMS.fit(X_train)
X_train_MMS_scaled = MMS.transform(X_train)
X_test_MMS_scaled = MMS.transform(X_test)

In [11]:
# Randomly undersample 'Normal Traffic' to 500000 rows; the other classes are
# not listed in the sampling strategy.
normal_traffic_undersampler = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 500000},
    random_state=42,
)
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = normal_traffic_undersampler.fit_resample(
    X_train_MMS_scaled, y_train
)

In [12]:
# SMOTE-oversample the attack classes of the undersampled training set to the
# per-class sample counts below.
smote_targets = {
    'Bots': 7500,
    'Web Attacks': 7500,
    'Brute Force': 7000,
    'Port Scanning': 70000,
    'DDoS': 90000,
    'DoS': 200000,
}
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy=smote_targets,
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

# Sync classes

In [13]:
# Function to combine classes
def combine_classes(y, class_mapping):
    """Relabel a Series of class labels according to ``class_mapping``.

    Parameters
    ----------
    y : pandas.Series
        Class labels.
    class_mapping : dict
        Maps an original label to the label that should replace it.

    Returns
    -------
    pandas.Series
        Relabelled copy of ``y``. Labels that are not keys of
        ``class_mapping`` are kept unchanged instead of becoming NaN, so
        applying the function to already-combined labels (e.g. when a cell is
        re-run) is harmless.
    """
    mapped = y.map(class_mapping)
    # Series.map leaves NaN wherever the label is missing from the dict;
    # fall back to the original label in those positions.
    return mapped.where(mapped.notna(), y)
# Define the mapping
class_mapping = {
    'Web Attacks': 'Other',
    'Port Scanning': 'Other',
    'Normal Traffic': 'Normal Traffic',
    'Bots': 'Bots',
    'Brute Force': 'Brute Force',
    'DDoS': 'DDoS',
    'DoS': 'DoS'
}

In [14]:
# Distinct labels present in the raw data; each one should have an entry in class_mapping.
df["Attack Type"].unique()

array(['Normal Traffic', 'DDoS', 'Port Scanning', 'Bots', 'Web Attacks',
       'Brute Force', 'DoS'], dtype=object)

In [15]:
# Relabel every target vector (plain, undersampled and SMOTE-resampled train,
# plus test) with the same mapping so they share one label set.
(
    y_train,
    y_test,
    y_train_scaled_rus_MMS,
    y_train_resampled_scaled_MMS_SMOTE,
) = (
    combine_classes(labels, class_mapping)
    for labels in (
        y_train,
        y_test,
        y_train_scaled_rus_MMS,
        y_train_resampled_scaled_MMS_SMOTE,
    )
)

In [40]:
# Class distribution of the resampled training labels after merging classes.
smote_label_counts = pd.Series(y_train_resampled_scaled_MMS_SMOTE).value_counts()
print(smote_label_counts)


Attack Type
Normal Traffic    500000
DoS               200000
DDoS               90000
Other              77500
Bots                7500
Brute Force         7000
Name: count, dtype: int64


# Search for the best KNN parameters (MinMax scaling + SMOTE)

In [42]:
# Hyperparameter search on the MinMax-scaled, SMOTE-resampled training set.
# Note: despite its name, `rf_model` receives the fitted KNN model.
rf_model, best_params = show_results_KNN(
    X_train=X_train_resampled_scaled_MMS_SMOTE,
    X_test=X_test_MMS_scaled,
    y_train=y_train_resampled_scaled_MMS_SMOTE,
    y_test=y_test,
    n_trials=30,
)

[I 2025-05-02 13:51:27,376] A new study created in memory with name: no-name-c6dbb544-05bc-4579-82bc-88f4d3e5f063
[I 2025-05-02 14:01:00,662] Trial 0 finished with value: 0.979368545446403 and parameters: {'n_neighbors': 36, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 69, 'p': 2}. Best is trial 0 with value: 0.979368545446403.
[W 2025-05-02 14:01:03,201] Trial 1 failed with parameters: {'n_neighbors': 17, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 56, 'p': 1} because of the following error: The value nan is not acceptable.
[W 2025-05-02 14:01:03,202] Trial 1 failed with value nan.
[I 2025-05-02 14:07:49,814] Trial 2 finished with value: 0.9793594523308121 and parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 63, 'p': 1}. Best is trial 0 with value: 0.979368545446403.
[I 2025-05-02 15:20:47,248] Trial 3 finished with value: 0.979237389450117 and parameters: {'n_neighbors': 40, 'weights': 'distance', 'algorithm': 'ball_tr


Unique values in test set: ['Bots' 'Brute Force' 'DDoS' 'DoS' 'Normal Traffic' 'Other']
Unique values in predictions: ['Bots' 'Brute Force' 'DDoS' 'DoS' 'Normal Traffic' 'Other']

KNN Evaluation Results:
--------------------------------------------------
CV F1: 0.9821 ± 0.0184
Test F1: 0.9909
Test Accuracy: 0.9905

Resource Usage: {'Training Time (s)': 0.5338592529296875, 'Peak CPU (%)': 23.1, 'Avg CPU (%)': 16.966666666666665}

Classification Report:
                precision    recall  f1-score   support

          Bots       0.45      0.91      0.60       584
   Brute Force       0.98      1.00      0.99      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      1.00      1.00     58124
Normal Traffic       1.00      0.99      0.99    628518
         Other       0.84      0.98      0.90     27851

      accuracy                           0.99    756226
     macro avg       0.88      0.98      0.91    756226
  weighted avg       0.99      0.99  

In [44]:
# Evaluate one fixed KNN configuration on the same data.
# NOTE(review): presumably the best parameters found by the search above - confirm.
knn_eval_params = {
    'n_neighbors': 6,
    'weights': 'distance',
    'algorithm': 'brute',
    'leaf_size': 11,
    'p': 1,
}
eval_dataset_w_KNN(
    X_train_resampled_scaled_MMS_SMOTE,
    X_test_MMS_scaled,
    y_train_resampled_scaled_MMS_SMOTE,
    y_test,
    params_knn=knn_eval_params,
)

CV F1: 0.9821 ± 0.0184
Test Accuracy: 0.9905
                precision    recall  f1-score   support

          Bots     0.4499    0.9144    0.6030       584
   Brute Force     0.9842    0.9956    0.9899      2745
          DDoS     0.9977    0.9987    0.9982     38404
           DoS     0.9923    0.9984    0.9953     58124
Normal Traffic     0.9988    0.9899    0.9943    628518
         Other     0.8394    0.9780    0.9034     27851

      accuracy                         0.9905    756226
     macro avg     0.8770    0.9792    0.9140    756226
  weighted avg     0.9919    0.9905    0.9909    756226

Resource Usage: {'Training Time (s)': 0.5352437496185303, 'Peak CPU (%)': 19.6, 'Avg CPU (%)': 17.0}
