# Imports and benchmark functions

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier

import time
import psutil
import threading
from memory_profiler import memory_usage

In [2]:
def apply_rf(X_train, y_train, best_params=None, random_state=42, n_jobs=-1, cv=5): 
    measurement_rf = {}
        
    # Default to empty dictionary if best_params is not provided
    best_params = best_params or {}

    rf_model = RandomForestClassifier(**best_params, random_state=random_state, n_jobs=n_jobs, verbose=1)
    
    # Function to monitor CPU usage during training
    cpu_usage = []
    stop_flag = threading.Event()

    def monitor_cpu():
        while not stop_flag.is_set():
            cpu_usage.append(psutil.cpu_percent(interval=0.1))

    # Function to train the model
    def train_model():
        rf_model.fit(X_train, y_train)

    try:
        # Start CPU monitoring in a separate thread
        cpu_thread = threading.Thread(target=monitor_cpu)
        cpu_thread.start()

        # Measure memory usage and training time
        start_time = time.time()
        train_model()
        training_time = time.time() - start_time

        # Stop CPU monitoring
        stop_flag.set()
        cpu_thread.join()

        # Add measurements
        measurement_rf['Training Time (s)'] = training_time
        measurement_rf['Peak CPU Usage (%)'] = max(cpu_usage)
        measurement_rf['Average CPU Usage (%)'] = sum(cpu_usage) / len(cpu_usage) if cpu_usage else 0

        # Perform cross-validation
        cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=cv, n_jobs=n_jobs)

        return cv_scores_rf, measurement_rf, rf_model

    except Exception as e:
        import traceback
        print("⛔ Full error traceback:")
        traceback.print_exc()  # Print detailed error traceback
        print(f"Error during Random Forest training: {e}")
        return None, None, None

In [3]:
def eval_dataset_w_RF(X_train, X_test, y_train, y_test):
    params_rf = {'n_estimators': 150, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 8}

    # Fitting the model
    cv_scores_rf, measurement_rf, rf_model = apply_rf(X_train, y_train, best_params=params_rf)

    # Making predictions
    y_pred_rf = rf_model.predict(X_test)
    
    # Evaluating the model performance on the cross validation set vs accuracy on the test set
    cv_scores_mean_rf = np.mean(cv_scores_rf)
    print(f'Cross validation average score: {cv_scores_mean_rf:.4f} +/- standard deviation: {np.std(cv_scores_rf):.4f}')

    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    print(f'Accuracy on the test set: {accuracy_rf:.4f}')
    
    # Checking computational cost
    print("Resource measurements:", measurement_rf)
    print(classification_report(y_test, y_pred_rf))

In [4]:
def eval_dataset_w_KNN(X_train, X_test, y_train, y_test):
    params_knn = {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 100, 'p': 1}
    
    # Fitting the model
    cv_scores_knn, measurement_knn, knn_model = apply_knn(X_train, y_train, best_params=params_knn)
    
    # Making predictions
    y_pred_knn = knn_model.predict(X_test)
    
    # Evaluating the model performance on the cross validation set vs accuracy on the test set
    cv_scores_mean_knn = np.mean(cv_scores_knn)
    print(f'Cross validation average score: {cv_scores_mean_knn:.4f} +/- standard deviation: {np.std(cv_scores_knn):.4f}')
    
    accuracy_knn = accuracy_score(y_test, y_pred_knn)
    print(f'Accuracy on the test set: {accuracy_knn:.4f}')
    
    # Checking computational cost
    print("Resource measurements:", measurement_knn)
    print(classification_report(y_test, y_pred_knn))

In [5]:
def apply_knn(X_train, y_train, best_params=None, random_state=42, n_jobs=-1, cv=5):
    measurement_knn = {}
    
    # Default to empty dictionary if best_params is not provided
    best_params = best_params or {}
    
    knn_model = KNeighborsClassifier(**best_params, n_jobs=n_jobs)
    
    # Function to monitor CPU usage during training
    cpu_usage = []
    stop_flag = threading.Event()
    
    def monitor_cpu():
        while not stop_flag.is_set():
            cpu_usage.append(psutil.cpu_percent(interval=0.1))
    
    # Function to train the model
    def train_model():
        knn_model.fit(X_train, y_train)
    
    try:
        # Start CPU monitoring in a separate thread
        cpu_thread = threading.Thread(target=monitor_cpu)
        cpu_thread.start()
        
        # Measure memory usage and training time
        start_time = time.time()
        train_model()
        training_time = time.time() - start_time
        
        # Stop CPU monitoring
        stop_flag.set()
        cpu_thread.join()
        
        # Add measurements
        measurement_knn['Training Time (s)'] = training_time
        measurement_knn['Peak CPU Usage (%)'] = max(cpu_usage)
        measurement_knn['Average CPU Usage (%)'] = sum(cpu_usage) / len(cpu_usage) if cpu_usage else 0
        
        # Perform cross-validation
        cv_scores_knn = cross_val_score(knn_model, X_train, y_train, cv=cv, n_jobs=n_jobs)
        
        return cv_scores_knn, measurement_knn, knn_model
        
    except Exception as e:
        import traceback
        print("⛔ Full error traceback:")
        traceback.print_exc()  # Print detailed error traceback
        print(f"Error during KNN training: {e}")
        return None, None, None

# Data balancing


In [6]:
# Reading data
df = pd.read_csv("..\..\data prep\cicids2018_prep\cicids2018_final.csv")

  df = pd.read_csv("..\..\data prep\cicids2018_prep\cicids2018_final.csv")


In [7]:
df['Attack Type'].unique()

array(['Normal Traffic', 'Bot', 'DoS', 'Brute Force', 'DDoS',
       'Infilteration'], dtype=object)

In [8]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

In [9]:
# Preparing training and test splits
X = df.drop('Attack Type', axis=1)
y = df['Attack Type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [10]:
# Initialize scaling algos
RS = RobustScaler()
X_train_RS_scaled = RS.fit_transform(X_train)
X_test_RS_scaled = RS.transform(X_test)

SS = StandardScaler()
X_train_SS_scaled = SS.fit_transform(X_train)
X_test_SS_scaled = SS.transform(X_test)

MMS = MinMaxScaler()
X_train_MMS_scaled = MMS.fit_transform(X_train)
X_test_MMS_scaled = MMS.transform(X_test)

In [11]:
print(df['Attack Type'].value_counts())

Attack Type
Normal Traffic    8634196
DDoS               775470
DoS                196299
Bot                143977
Infilteration      107531
Brute Force         94876
Name: count, dtype: int64


## Evals

In [15]:
eval_dataset_w_RF(X_train, X_test, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.1min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.9s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.6s finished


Cross validation average score: 0.9817 +/- standard deviation: 0.0011
Accuracy on the test set: 0.9816
Resource measurements: {'Training Time (s)': 314.10686445236206, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.32835633626098}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

           Bot       1.00      0.99      1.00     43193
   Brute Force       1.00      0.99      1.00     28463
          DDoS       0.99      0.92      0.95    232641
           DoS       0.99      0.98      0.98     58890
 Infilteration       0.00      0.00      0.00     32259
Normal Traffic       0.98      1.00      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.83      0.81      0.82   2985705
  weighted avg       0.97      0.98      0.98   2985705



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
eval_dataset_w_RF(X_train_RS_scaled, X_test_RS_scaled, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.1min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.1s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.3s finished


Cross validation average score: 0.9818 +/- standard deviation: 0.0011
Accuracy on the test set: 0.9817
Resource measurements: {'Training Time (s)': 313.87343525886536, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.1209476309227}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

           Bot       1.00      0.99      0.99     43193
   Brute Force       1.00      0.99      1.00     28463
          DDoS       0.99      0.92      0.95    232641
           DoS       0.99      0.99      0.99     58890
 Infilteration       0.00      0.00      0.00     32259
Normal Traffic       0.98      1.00      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.83      0.81      0.82   2985705
  weighted avg       0.97      0.98      0.98   2985705



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
eval_dataset_w_RF(X_train_SS_scaled, X_test_SS_scaled, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.0min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.1s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.5s finished


Cross validation average score: 0.9814 +/- standard deviation: 0.0007
Accuracy on the test set: 0.9833
Resource measurements: {'Training Time (s)': 309.41271471977234, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.32163995067818}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

           Bot       1.00      0.99      0.99     43193
   Brute Force       1.00      0.99      1.00     28463
          DDoS       0.98      0.96      0.97    232641
           DoS       0.99      0.98      0.98     58890
 Infilteration       0.00      0.00      0.00     32259
Normal Traffic       0.98      1.00      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.83      0.82      0.82   2985705
  weighted avg       0.97      0.98      0.98   2985705



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
eval_dataset_w_RF(X_train_MMS_scaled, X_test_MMS_scaled, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   59.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  4.9min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.1s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.5s finished


Cross validation average score: 0.9820 +/- standard deviation: 0.0014
Accuracy on the test set: 0.9801
Resource measurements: {'Training Time (s)': 303.8914921283722, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.27672151127362}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

           Bot       1.00      0.99      1.00     43193
   Brute Force       1.00      0.99      1.00     28463
          DDoS       0.99      0.90      0.94    232641
           DoS       0.99      0.98      0.98     58890
 Infilteration       0.00      0.00      0.00     32259
Normal Traffic       0.98      1.00      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.83      0.81      0.82   2985705
  weighted avg       0.97      0.98      0.97   2985705



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
eval_dataset_w_KNN(X_train, X_test, y_train, y_test)

Cross validation average score: 0.9802 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9801
Resource measurements: {'Training Time (s)': 98.43241429328918, 'Peak CPU Usage (%)': 17.6, 'Average CPU Usage (%)': 10.329166666666667}
                precision    recall  f1-score   support

           Bot       0.99      1.00      0.99     43193
   Brute Force       0.99      0.99      0.99     28463
          DDoS       0.95      0.97      0.96    232641
           DoS       0.95      0.95      0.95     58890
 Infilteration       0.23      0.06      0.09     32259
Normal Traffic       0.99      0.99      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.85      0.83      0.83   2985705
  weighted avg       0.97      0.98      0.98   2985705



In [20]:
eval_dataset_w_KNN(X_train_RS_scaled, X_test_RS_scaled, y_train, y_test)

Cross validation average score: 0.9775 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9775
Resource measurements: {'Training Time (s)': 87.88576531410217, 'Peak CPU Usage (%)': 12.4, 'Average CPU Usage (%)': 8.4}
                precision    recall  f1-score   support

           Bot       0.99      1.00      0.99     43193
   Brute Force       0.99      0.99      0.99     28463
          DDoS       0.95      0.94      0.95    232641
           DoS       0.95      0.96      0.96     58890
 Infilteration       0.22      0.06      0.10     32259
Normal Traffic       0.98      0.99      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.85      0.82      0.83   2985705
  weighted avg       0.97      0.98      0.97   2985705



In [21]:
eval_dataset_w_KNN(X_train_SS_scaled, X_test_SS_scaled, y_train, y_test)

Cross validation average score: 0.9827 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9825
Resource measurements: {'Training Time (s)': 85.26975274085999, 'Peak CPU Usage (%)': 11.2, 'Average CPU Usage (%)': 7.663636363636363}
                precision    recall  f1-score   support

           Bot       0.99      1.00      0.99     43193
   Brute Force       0.99      0.99      0.99     28463
          DDoS       0.97      0.98      0.97    232641
           DoS       0.99      1.00      0.99     58890
 Infilteration       0.22      0.06      0.10     32259
Normal Traffic       0.99      0.99      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.86      0.84      0.84   2985705
  weighted avg       0.98      0.98      0.98   2985705



In [22]:
eval_dataset_w_KNN(X_train_MMS_scaled, X_test_MMS_scaled, y_train, y_test)

Cross validation average score: 0.9827 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9825
Resource measurements: {'Training Time (s)': 105.5821647644043, 'Peak CPU Usage (%)': 25.5, 'Average CPU Usage (%)': 20.779999999999998}
                precision    recall  f1-score   support

           Bot       0.99      1.00      0.99     43193
   Brute Force       0.99      0.99      0.99     28463
          DDoS       0.97      0.98      0.97    232641
           DoS       0.99      1.00      0.99     58890
 Infilteration       0.21      0.06      0.09     32259
Normal Traffic       0.99      0.99      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.86      0.84      0.84   2985705
  weighted avg       0.98      0.98      0.98   2985705



## Under Sampling

In [23]:
from imblearn.under_sampling import RandomUnderSampler, NearMiss

In [24]:
# Initializing the undersampling for the clean df
X_train_resampled_rus, y_train_resampled_rus = RandomUnderSampler(sampling_strategy={'Normal Traffic': 1000000}, random_state=42).fit_resample(X_train, y_train)

# Initializing the undersampling for the scaled df
X_train_scaled_rus_RS, y_train_scaled_rus_RS = RandomUnderSampler(sampling_strategy={'Normal Traffic': 1000000}, random_state=42).fit_resample(X_train_RS_scaled, y_train)

X_train_scaled_rus_SS, y_train_scaled_rus_SS = RandomUnderSampler(sampling_strategy={'Normal Traffic': 1000000}, random_state=42).fit_resample(X_train_SS_scaled, y_train)

X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(sampling_strategy={'Normal Traffic': 1000000}, random_state=42).fit_resample(X_train_MMS_scaled, y_train)

In [25]:
# Initializing the undersampling for the clean df
X_train_resampled_NM, y_train_resampled_NM = NearMiss(sampling_strategy={'Normal Traffic': 1000000}, version=3).fit_resample(X_train, y_train)

# Initializing the undersampling for the scaled df
X_train_scaled_NM_RS, y_train_scaled_NM_RS = NearMiss(sampling_strategy={'Normal Traffic': 1000000}, version=3).fit_resample(X_train_RS_scaled, y_train)

X_train_scaled_NM_SS, y_train_scaled_NM_SS = NearMiss(sampling_strategy={'Normal Traffic': 1000000}, version=3).fit_resample(X_train_SS_scaled, y_train)

X_train_scaled_NM_MMS, y_train_scaled_NM_MMS = NearMiss(sampling_strategy={'Normal Traffic': 1000000}, version=3).fit_resample(X_train_MMS_scaled, y_train)


[WinError 2] Не удается найти указанный файл
  File "c:\ML\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\ogoreltsev.pav\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ogoreltsev.pav\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\ogoreltsev.pav\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


## Evals RF

In [26]:
eval_dataset_w_RF(X_train_resampled_rus, X_test, y_train_resampled_rus, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.1min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.1s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.6s finished


Cross validation average score: 0.9547 +/- standard deviation: 0.0002
Accuracy on the test set: 0.9804
Resource measurements: {'Training Time (s)': 65.95187258720398, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.3546875}
                precision    recall  f1-score   support

           Bot       1.00      0.99      1.00     43193
   Brute Force       1.00      0.99      1.00     28463
          DDoS       0.91      0.99      0.95    232641
           DoS       0.99      1.00      0.99     58890
 Infilteration       0.50      0.05      0.09     32259
Normal Traffic       0.99      0.99      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.90      0.84      0.83   2985705
  weighted avg       0.98      0.98      0.98   2985705



In [27]:
eval_dataset_w_RF(X_train_scaled_rus_RS, X_test_RS_scaled, y_train_scaled_rus_RS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.2min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.0s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.3s finished


Cross validation average score: 0.9547 +/- standard deviation: 0.0002
Accuracy on the test set: 0.9804
Resource measurements: {'Training Time (s)': 72.00814771652222, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.45769230769231}
                precision    recall  f1-score   support

           Bot       1.00      0.99      1.00     43193
   Brute Force       1.00      0.99      1.00     28463
          DDoS       0.91      0.99      0.95    232641
           DoS       0.99      1.00      0.99     58890
 Infilteration       0.50      0.05      0.09     32259
Normal Traffic       0.99      0.99      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.90      0.84      0.83   2985705
  weighted avg       0.98      0.98      0.98   2985705



In [28]:
eval_dataset_w_RF(X_train_scaled_rus_SS, X_test_SS_scaled, y_train_scaled_rus_SS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.2min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.0s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.3s finished


Cross validation average score: 0.9547 +/- standard deviation: 0.0002
Accuracy on the test set: 0.9804
Resource measurements: {'Training Time (s)': 71.04313039779663, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.22219387755102}
                precision    recall  f1-score   support

           Bot       1.00      0.99      1.00     43193
   Brute Force       1.00      0.99      1.00     28463
          DDoS       0.91      0.99      0.95    232641
           DoS       0.99      1.00      0.99     58890
 Infilteration       0.50      0.05      0.08     32259
Normal Traffic       0.99      0.99      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.90      0.84      0.83   2985705
  weighted avg       0.98      0.98      0.98   2985705



In [29]:
eval_dataset_w_RF(X_train_scaled_rus_MMS, X_test_MMS_scaled, y_train_scaled_rus_MMS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.1min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.0s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.4s finished


Cross validation average score: 0.9546 +/- standard deviation: 0.0002
Accuracy on the test set: 0.9803
Resource measurements: {'Training Time (s)': 68.84704279899597, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.17851458885943}
                precision    recall  f1-score   support

           Bot       1.00      0.99      1.00     43193
   Brute Force       1.00      0.99      1.00     28463
          DDoS       0.91      0.99      0.95    232641
           DoS       0.99      1.00      0.99     58890
 Infilteration       0.49      0.05      0.08     32259
Normal Traffic       0.99      0.99      0.99   2590259

      accuracy                           0.98   2985705
     macro avg       0.90      0.84      0.83   2985705
  weighted avg       0.98      0.98      0.98   2985705



In [30]:
eval_dataset_w_RF(X_train_resampled_NM, X_test, y_train_resampled_NM, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   24.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.0s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.1s finished


Cross validation average score: 0.9954 +/- standard deviation: 0.0004
Accuracy on the test set: 0.1380
Resource measurements: {'Training Time (s)': 24.88511633872986, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.80059523809524}
                precision    recall  f1-score   support

           Bot       0.99      0.99      0.99     43193
   Brute Force       1.00      0.99      1.00     28463
          DDoS       0.62      1.00      0.77    232641
           DoS       0.31      1.00      0.47     58890
 Infilteration       0.01      0.97      0.03     32259
Normal Traffic       1.00      0.01      0.01   2590259

      accuracy                           0.14   2985705
     macro avg       0.65      0.83      0.54   2985705
  weighted avg       0.94      0.14      0.11   2985705



In [31]:
eval_dataset_w_RF(X_train_scaled_NM_RS, X_test_RS_scaled, y_train_scaled_NM_RS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   29.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.9s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.1s finished


Cross validation average score: 0.9956 +/- standard deviation: 0.0004
Accuracy on the test set: 0.1409
Resource measurements: {'Training Time (s)': 29.871477127075195, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.66511627906976}
                precision    recall  f1-score   support

           Bot       0.99      0.99      0.99     43193
   Brute Force       1.00      0.99      1.00     28463
          DDoS       0.62      1.00      0.77    232641
           DoS       0.30      1.00      0.47     58890
 Infilteration       0.01      0.97      0.03     32259
Normal Traffic       1.00      0.01      0.02   2590259

      accuracy                           0.14   2985705
     macro avg       0.65      0.83      0.55   2985705
  weighted avg       0.94      0.14      0.11   2985705



In [32]:
eval_dataset_w_RF(X_train_scaled_NM_SS, X_test_SS_scaled, y_train_scaled_NM_SS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   27.7s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.2s finished


Cross validation average score: 0.9959 +/- standard deviation: 0.0002
Accuracy on the test set: 0.1405
Resource measurements: {'Training Time (s)': 28.585793256759644, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.95856353591161}
                precision    recall  f1-score   support

           Bot       0.99      0.99      0.99     43193
   Brute Force       1.00      0.99      0.99     28463
          DDoS       0.62      1.00      0.77    232641
           DoS       0.30      1.00      0.47     58890
 Infilteration       0.01      0.98      0.03     32259
Normal Traffic       1.00      0.01      0.02   2590259

      accuracy                           0.14   2985705
     macro avg       0.65      0.83      0.54   2985705
  weighted avg       0.94      0.14      0.11   2985705



In [33]:
eval_dataset_w_RF(X_train_scaled_NM_MMS, X_test_MMS_scaled, y_train_scaled_NM_MMS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   26.9s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    1.0s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    6.2s finished


Cross validation average score: 0.9961 +/- standard deviation: 0.0001
Accuracy on the test set: 0.1323
Resource measurements: {'Training Time (s)': 27.803845643997192, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.97213114754099}
                precision    recall  f1-score   support

           Bot       0.99      0.99      0.99     43193
   Brute Force       0.95      1.00      0.97     28463
          DDoS       0.62      1.00      0.77    232641
           DoS       0.30      1.00      0.47     58890
 Infilteration       0.01      0.97      0.03     32259
Normal Traffic       1.00      0.00      0.00   2590259

      accuracy                           0.13   2985705
     macro avg       0.65      0.83      0.54   2985705
  weighted avg       0.94      0.13      0.09   2985705



## Evals KNN

In [34]:
eval_dataset_w_KNN(X_train_resampled_rus, X_test, y_train_resampled_rus, y_test)

Cross validation average score: 0.9456 +/- standard deviation: 0.0005
Accuracy on the test set: 0.9605
Resource measurements: {'Training Time (s)': 21.452256202697754, 'Peak CPU Usage (%)': 23.4, 'Average CPU Usage (%)': 15.549999999999999}
                precision    recall  f1-score   support

           Bot       0.98      1.00      0.99     43193
   Brute Force       0.97      0.99      0.98     28463
          DDoS       0.91      0.99      0.95    232641
           DoS       0.91      0.96      0.93     58890
 Infilteration       0.11      0.25      0.15     32259
Normal Traffic       0.99      0.97      0.98   2590259

      accuracy                           0.96   2985705
     macro avg       0.81      0.86      0.83   2985705
  weighted avg       0.97      0.96      0.97   2985705



In [35]:
eval_dataset_w_KNN(X_train_scaled_rus_RS, X_test_RS_scaled, y_train_scaled_rus_RS, y_test)

Cross validation average score: 0.9379 +/- standard deviation: 0.0004
Accuracy on the test set: 0.9500
Resource measurements: {'Training Time (s)': 20.657833099365234, 'Peak CPU Usage (%)': 22.1, 'Average CPU Usage (%)': 18.650000000000002}
                precision    recall  f1-score   support

           Bot       0.98      1.00      0.99     43193
   Brute Force       0.98      1.00      0.99     28463
          DDoS       0.84      0.97      0.90    232641
           DoS       0.88      0.97      0.92     58890
 Infilteration       0.12      0.30      0.18     32259
Normal Traffic       0.99      0.95      0.97   2590259

      accuracy                           0.95   2985705
     macro avg       0.80      0.86      0.82   2985705
  weighted avg       0.97      0.95      0.96   2985705



In [36]:
eval_dataset_w_KNN(X_train_scaled_rus_SS, X_test_SS_scaled, y_train_scaled_rus_SS, y_test)

Cross validation average score: 0.9544 +/- standard deviation: 0.0004
Accuracy on the test set: 0.9645
Resource measurements: {'Training Time (s)': 19.428059816360474, 'Peak CPU Usage (%)': 16.0, 'Average CPU Usage (%)': 15.200000000000001}
                precision    recall  f1-score   support

           Bot       0.99      1.00      0.99     43193
   Brute Force       0.98      1.00      0.99     28463
          DDoS       0.94      0.99      0.97    232641
           DoS       0.99      1.00      0.99     58890
 Infilteration       0.13      0.31      0.18     32259
Normal Traffic       0.99      0.97      0.98   2590259

      accuracy                           0.96   2985705
     macro avg       0.84      0.88      0.85   2985705
  weighted avg       0.98      0.96      0.97   2985705



In [37]:
eval_dataset_w_KNN(X_train_scaled_rus_MMS, X_test_MMS_scaled, y_train_scaled_rus_MMS, y_test)

Cross validation average score: 0.9544 +/- standard deviation: 0.0003
Accuracy on the test set: 0.9644
Resource measurements: {'Training Time (s)': 19.193310737609863, 'Peak CPU Usage (%)': 19.1, 'Average CPU Usage (%)': 12.180000000000001}
                precision    recall  f1-score   support

           Bot       0.99      1.00      0.99     43193
   Brute Force       0.98      1.00      0.99     28463
          DDoS       0.94      0.99      0.97    232641
           DoS       0.99      1.00      0.99     58890
 Infilteration       0.13      0.31      0.18     32259
Normal Traffic       0.99      0.97      0.98   2590259

      accuracy                           0.96   2985705
     macro avg       0.84      0.88      0.85   2985705
  weighted avg       0.98      0.96      0.97   2985705



In [38]:
eval_dataset_w_KNN(X_train_resampled_NM, X_test, y_train_resampled_NM, y_test)

Cross validation average score: 0.9849 +/- standard deviation: 0.0003
Accuracy on the test set: 0.1419
Resource measurements: {'Training Time (s)': 9.510598421096802, 'Peak CPU Usage (%)': 28.5, 'Average CPU Usage (%)': 23.475}
                precision    recall  f1-score   support

           Bot       0.86      1.00      0.92     43193
   Brute Force       0.90      0.99      0.95     28463
          DDoS       0.50      0.99      0.66    232641
           DoS       0.27      0.97      0.43     58890
 Infilteration       0.01      0.97      0.03     32259
Normal Traffic       1.00      0.01      0.03   2590259

      accuracy                           0.14   2985705
     macro avg       0.59      0.82      0.50   2985705
  weighted avg       0.93      0.14      0.10   2985705



In [39]:
eval_dataset_w_KNN(X_train_scaled_NM_RS, X_test_RS_scaled, y_train_scaled_NM_RS, y_test)

Cross validation average score: 0.9832 +/- standard deviation: 0.0001
Accuracy on the test set: 0.1406
Resource measurements: {'Training Time (s)': 8.061488151550293, 'Peak CPU Usage (%)': 22.0, 'Average CPU Usage (%)': 17.975}
                precision    recall  f1-score   support

           Bot       0.90      1.00      0.95     43193
   Brute Force       0.97      0.99      0.98     28463
          DDoS       0.43      0.99      0.60    232641
           DoS       0.27      0.98      0.43     58890
 Infilteration       0.01      0.93      0.03     32259
Normal Traffic       1.00      0.01      0.02   2590259

      accuracy                           0.14   2985705
     macro avg       0.60      0.82      0.50   2985705
  weighted avg       0.93      0.14      0.10   2985705



In [40]:
eval_dataset_w_KNN(X_train_scaled_NM_SS, X_test_SS_scaled, y_train_scaled_NM_SS, y_test)

Cross validation average score: 0.9955 +/- standard deviation: 0.0001
Accuracy on the test set: 0.1419
Resource measurements: {'Training Time (s)': 7.946810007095337, 'Peak CPU Usage (%)': 16.0, 'Average CPU Usage (%)': 15.075}
                precision    recall  f1-score   support

           Bot       0.94      1.00      0.97     43193
   Brute Force       0.98      0.99      0.98     28463
          DDoS       0.66      1.00      0.79    232641
           DoS       0.31      1.00      0.47     58890
 Infilteration       0.01      0.98      0.03     32259
Normal Traffic       1.00      0.01      0.02   2590259

      accuracy                           0.14   2985705
     macro avg       0.65      0.83      0.54   2985705
  weighted avg       0.94      0.14      0.11   2985705



In [41]:
eval_dataset_w_KNN(X_train_scaled_NM_MMS, X_test_MMS_scaled, y_train_scaled_NM_MMS, y_test)

Cross validation average score: 0.9959 +/- standard deviation: 0.0001
Accuracy on the test set: 0.1413
Resource measurements: {'Training Time (s)': 7.335773229598999, 'Peak CPU Usage (%)': 7.4, 'Average CPU Usage (%)': 5.325}
                precision    recall  f1-score   support

           Bot       0.66      1.00      0.80     43193
   Brute Force       0.93      1.00      0.96     28463
          DDoS       0.66      1.00      0.79    232641
           DoS       0.30      1.00      0.46     58890
 Infilteration       0.01      0.98      0.03     32259
Normal Traffic       1.00      0.01      0.02   2590259

      accuracy                           0.14   2985705
     macro avg       0.59      0.83      0.51   2985705
  weighted avg       0.94      0.14      0.11   2985705



## Over Sampling

In [42]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE

In [None]:
X_train_resampled_ADASYN, y_train_resampled_ADASYN = ADASYN(sampling_strategy='auto', random_state=42, n_neighbors=5).fit_resample(X_train_resampled_rus, y_train_resampled_rus)

X_train_resampled_scaled_RS_ADASYN, y_train_resampled_scaled_RS_ADASYN = ADASYN(sampling_strategy='auto', random_state=42, n_neighbors=5).fit_resample(X_train_scaled_rus_RS, y_train_scaled_rus_RS)

X_train_resampled_scaled_MMS_ADASYN, y_train_resampled_scaled_MMS_ADASYN = ADASYN(sampling_strategy='auto', random_state=42, n_neighbors=5).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)


X_train_resampled_SMOTE, y_train_resampled_SMOTE = SMOTE(sampling_strategy={'Bot': 150000, 'Brute Force': 100000, 'Infilteration': 110000, 'DDoS':780000, 'DoS': 200000}, random_state=42).fit_resample(X_train_resampled_rus, y_train_resampled_rus)

X_train_resampled_scaled_RS_SMOTE, y_train_resampled_scaled_RS_SMOTE = SMOTE(sampling_strategy={'Bot': 150000, 'Brute Force': 100000, 'Infilteration': 110000, 'DDoS':780000, 'DoS': 200000}, random_state=42).fit_resample(X_train_scaled_rus_RS, y_train_scaled_rus_RS)

X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(sampling_strategy={'Bot': 150000, 'Brute Force': 100000, 'Infilteration': 110000, 'DDoS':780000, 'DoS': 200000}, random_state=42).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)


X_train_resampled_BSMOTE, y_train_resampled_BSMOTE = BorderlineSMOTE(sampling_strategy={'Bot': 150000, 'Brute Force': 100000, 'Infilteration': 110000, 'DDoS': 780000, 'DoS': 200000}, random_state=42, k_neighbors=5, m_neighbors=10, kind='borderline-1').fit_resample(X_train_resampled_rus, y_train_resampled_rus)

X_train_resampled_scaled_RS_BSMOTE, y_train_resampled_scaled_RS_BSMOTE = BorderlineSMOTE(sampling_strategy={'Bot': 150000, 'Brute Force': 100000, 'Infilteration': 110000, 'DDoS': 780000, 'DoS': 200000}, random_state=42, k_neighbors=5, m_neighbors=10, kind='borderline-1').fit_resample(X_train_scaled_rus_RS, y_train_scaled_rus_RS)

X_train_resampled_scaled_MMS_BSMOTE, y_train_resampled_scaled_MMS_BSMOTE = BorderlineSMOTE(sampling_strategy={'Bot': 150000, 'Brute Force': 100000, 'Infilteration': 110000, 'DDoS': 780000, 'DoS': 200000}, random_state=42, k_neighbors=5, m_neighbors=10, kind='borderline-1').fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

ValueError: The {'Bots'} target class is/are not present in the data.

In [None]:
y_train_resampled_rus.value_counts()

In [None]:
y_train_resampled_ADASYN.value_counts()

In [None]:
y_train_resampled_SMOTE.value_counts()

In [None]:
y_train_resampled_scaled.value_counts()

## Eval RF

In [None]:
eval_dataset_w_RF(X_train_resampled_ADASYN, X_test, y_train_resampled_ADASYN, y_test)

In [None]:
eval_dataset_w_RF(X_train_resampled_scaled_RS_ADASYN, X_test_RS_scaled, y_train_resampled_scaled_RS_ADASYN, y_test)

In [None]:
eval_dataset_w_RF(X_train_resampled_scaled_MMS_ADASYN, X_test_MMS_scaled, y_train_resampled_scaled_MMS_ADASYN, y_test)

In [None]:
eval_dataset_w_RF(X_train_resampled_SMOTE, X_test, y_train_resampled_SMOTE, y_test)

In [None]:
eval_dataset_w_RF(X_train_resampled_scaled_RS_SMOTE, X_test_RS_scaled, y_train_resampled_scaled_RS_SMOTE, y_test)

In [None]:
eval_dataset_w_RF(X_train_resampled_scaled_MMS_SMOTE, X_test_MMS_scaled, y_train_resampled_scaled_MMS_SMOTE, y_test)

In [None]:
eval_dataset_w_RF(X_train_resampled_BSMOTE, X_test, y_train_resampled_BSMOTE, y_test)

In [None]:
eval_dataset_w_RF(X_train_resampled_scaled_RS_BSMOTE, X_test_RS_scaled, y_train_resampled_scaled_RS_BSMOTE, y_test)

In [None]:
eval_dataset_w_RF(X_train_resampled_scaled_MMS_BSMOTE, X_test_MMS_scaled, y_train_resampled_scaled_MMS_BSMOTE, y_test)

## Eval KNN

In [None]:
eval_dataset_w_KNN(X_train_resampled_ADASYN, X_test, y_train_resampled_ADASYN, y_test)

In [None]:
eval_dataset_w_KNN(X_train_resampled_scaled_RS_ADASYN, X_test_RS_scaled, y_train_resampled_scaled_RS_ADASYN, y_test)

In [None]:
eval_dataset_w_KNN(X_train_resampled_scaled_MMS_ADASYN, X_test_MMS_scaled, y_train_resampled_scaled_MMS_ADASYN, y_test)

In [None]:
eval_dataset_w_KNN(X_train_resampled_SMOTE, X_test, y_train_resampled_SMOTE, y_test)

In [None]:
eval_dataset_w_KNN(X_train_resampled_scaled_RS_SMOTE, X_test_RS_scaled, y_train_resampled_scaled_RS_SMOTE, y_test)

In [None]:
eval_dataset_w_KNN(X_train_resampled_scaled_MMS_SMOTE, X_test_MMS_scaled, y_train_resampled_scaled_MMS_SMOTE, y_test)

In [None]:
eval_dataset_w_KNN(X_train_resampled_BSMOTE, X_test, y_train_resampled_BSMOTE, y_test)

In [None]:
eval_dataset_w_KNN(X_train_resampled_scaled_RS_BSMOTE, X_test_RS_scaled, y_train_resampled_scaled_RS_BSMOTE, y_test)

In [None]:
eval_dataset_w_KNN(X_train_resampled_scaled_MMS_BSMOTE, X_test_MMS_scaled, y_train_resampled_scaled_MMS_BSMOTE, y_test)