# Imports and benchmark functions

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier

import time
import psutil
import threading
from memory_profiler import memory_usage

In [62]:
def apply_rf(X_train, y_train, best_params=None, random_state=42, n_jobs=-1, cv=5):
    """Fit a Random Forest, record its training cost, then cross-validate it.

    While ``fit`` runs, a background thread repeatedly samples
    ``psutil.cpu_percent(interval=0.1)``. Only the ``fit`` call is timed and
    monitored; the subsequent cross-validation is not included in the
    measurements. Memory usage is not measured.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed unchanged to
        ``RandomForestClassifier.fit`` and ``cross_val_score``.
    best_params : dict, optional
        Extra keyword arguments for ``RandomForestClassifier``. ``None`` or an
        empty dict means "use the estimator defaults".
    random_state : int
        Seed forwarded to the classifier.
    n_jobs : int
        Parallelism forwarded to both the classifier and ``cross_val_score``.
    cv : int
        ``cv`` argument forwarded to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(cv_scores, measurements, fitted_model)`` where ``measurements`` has
        the keys ``'Training Time (s)'``, ``'Peak CPU Usage (%)'`` and
        ``'Average CPU Usage (%)'``. If training or cross-validation raises,
        the traceback is printed and ``(None, None, None)`` is returned.
    """
    measurement_rf = {}

    # Default to empty dictionary if best_params is not provided
    best_params = best_params or {}

    rf_model = RandomForestClassifier(**best_params, random_state=random_state, n_jobs=n_jobs, verbose=1)

    # CPU samples collected by the monitor thread while the model trains
    cpu_usage = []
    stop_flag = threading.Event()

    def monitor_cpu():
        while not stop_flag.is_set():
            cpu_usage.append(psutil.cpu_percent(interval=0.1))

    try:
        # Start CPU monitoring in a separate thread
        cpu_thread = threading.Thread(target=monitor_cpu)
        cpu_thread.start()

        try:
            # Measure training time with a monotonic clock
            start_time = time.perf_counter()
            rf_model.fit(X_train, y_train)
            training_time = time.perf_counter() - start_time
        finally:
            # Always stop the monitor, even when fit() fails; otherwise the
            # sampling loop would keep running in the background forever.
            stop_flag.set()
            cpu_thread.join()

        # Add measurements (both CPU figures fall back to 0 when fit() finished
        # before the monitor could take a single sample)
        measurement_rf['Training Time (s)'] = training_time
        measurement_rf['Peak CPU Usage (%)'] = max(cpu_usage, default=0)
        measurement_rf['Average CPU Usage (%)'] = sum(cpu_usage) / len(cpu_usage) if cpu_usage else 0

        # Perform cross-validation
        cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=cv, n_jobs=n_jobs)

        return cv_scores_rf, measurement_rf, rf_model

    except Exception as e:
        import traceback
        print("⛔ Full error traceback:")
        traceback.print_exc()  # Print detailed error traceback
        print(f"Error during Random Forest training: {e}")
        return None, None, None

In [88]:
def eval_dataset_w_RF(X_train, X_test, y_train, y_test, params_rf=None):
    """Train a Random Forest via ``apply_rf`` and print its evaluation.

    Prints the mean and standard deviation of the cross-validation scores,
    the accuracy on ``(X_test, y_test)``, the resource measurements returned
    by ``apply_rf`` and a per-class classification report. Returns ``None``.

    Parameters
    ----------
    X_train, y_train
        Training split handed to ``apply_rf``.
    X_test, y_test
        Hold-out split used for the accuracy and the classification report.
    params_rf : dict, optional
        Hyper-parameters forwarded to ``apply_rf`` as ``best_params``.
        Defaults to the fixed configuration used throughout this notebook.

    Raises
    ------
    RuntimeError
        If ``apply_rf`` reports a failure by returning no model.
    """
    if params_rf is None:
        params_rf = {'n_estimators': 150, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10}

    # Fitting the model
    cv_scores_rf, measurement_rf, rf_model = apply_rf(X_train, y_train, best_params=params_rf)

    # apply_rf swallows exceptions and returns (None, None, None); fail with a
    # clear message instead of an AttributeError on None.predict below.
    if rf_model is None:
        raise RuntimeError("Random Forest training failed; see the traceback printed by apply_rf.")

    # Making predictions
    y_pred_rf = rf_model.predict(X_test)

    # Evaluating the model performance on the cross validation set vs accuracy on the test set
    cv_scores_mean_rf = np.mean(cv_scores_rf)
    print(f'Cross validation average score: {cv_scores_mean_rf:.4f} +/- standard deviation: {np.std(cv_scores_rf):.4f}')

    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    print(f'Accuracy on the test set: {accuracy_rf:.4f}')

    # Checking computational cost
    print("Resource measurements:", measurement_rf)
    print(classification_report(y_test, y_pred_rf))

In [98]:
def eval_dataset_w_KNN(X_train, X_test, y_train, y_test, params_knn=None):
    """Train a k-nearest-neighbours classifier via ``apply_knn`` and print its evaluation.

    Prints the mean and standard deviation of the cross-validation scores,
    the accuracy on ``(X_test, y_test)``, the resource measurements returned
    by ``apply_knn`` and a per-class classification report. Returns ``None``.

    Parameters
    ----------
    X_train, y_train
        Training split handed to ``apply_knn``.
    X_test, y_test
        Hold-out split used for the accuracy and the classification report.
    params_knn : dict, optional
        Hyper-parameters forwarded to ``apply_knn`` as ``best_params``.
        Defaults to the fixed configuration used throughout this notebook.

    Raises
    ------
    RuntimeError
        If ``apply_knn`` reports a failure by returning no model.
    """
    if params_knn is None:
        params_knn = {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 100, 'p': 1}

    # Fitting the model
    cv_scores_knn, measurement_knn, knn_model = apply_knn(X_train, y_train, best_params=params_knn)

    # apply_knn swallows exceptions and returns (None, None, None); fail with a
    # clear message instead of an AttributeError on None.predict below.
    if knn_model is None:
        raise RuntimeError("KNN training failed; see the traceback printed by apply_knn.")

    # Making predictions
    y_pred_knn = knn_model.predict(X_test)

    # Evaluating the model performance on the cross validation set vs accuracy on the test set
    cv_scores_mean_knn = np.mean(cv_scores_knn)
    print(f'Cross validation average score: {cv_scores_mean_knn:.4f} +/- standard deviation: {np.std(cv_scores_knn):.4f}')

    accuracy_knn = accuracy_score(y_test, y_pred_knn)
    print(f'Accuracy on the test set: {accuracy_knn:.4f}')

    # Checking computational cost
    print("Resource measurements:", measurement_knn)
    print(classification_report(y_test, y_pred_knn))

In [5]:
def apply_knn(X_train, y_train, best_params=None, random_state=42, n_jobs=-1, cv=5):
    """Fit a k-nearest-neighbours classifier, record its training cost, then cross-validate it.

    While ``fit`` runs, a background thread repeatedly samples
    ``psutil.cpu_percent(interval=0.1)``. Only the ``fit`` call is timed and
    monitored; the subsequent cross-validation is not included in the
    measurements. Memory usage is not measured.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed unchanged to
        ``KNeighborsClassifier.fit`` and ``cross_val_score``.
    best_params : dict, optional
        Extra keyword arguments for ``KNeighborsClassifier``. ``None`` or an
        empty dict means "use the estimator defaults".
    random_state : int
        Not used by this function; kept so the signature mirrors ``apply_rf``.
    n_jobs : int
        Parallelism forwarded to both the classifier and ``cross_val_score``.
    cv : int
        ``cv`` argument forwarded to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(cv_scores, measurements, fitted_model)`` where ``measurements`` has
        the keys ``'Training Time (s)'``, ``'Peak CPU Usage (%)'`` and
        ``'Average CPU Usage (%)'``. If training or cross-validation raises,
        the traceback is printed and ``(None, None, None)`` is returned.
    """
    measurement_knn = {}

    # Default to empty dictionary if best_params is not provided
    best_params = best_params or {}

    knn_model = KNeighborsClassifier(**best_params, n_jobs=n_jobs)

    # CPU samples collected by the monitor thread while the model trains
    cpu_usage = []
    stop_flag = threading.Event()

    def monitor_cpu():
        while not stop_flag.is_set():
            cpu_usage.append(psutil.cpu_percent(interval=0.1))

    try:
        # Start CPU monitoring in a separate thread
        cpu_thread = threading.Thread(target=monitor_cpu)
        cpu_thread.start()

        try:
            # Measure training time with a monotonic clock
            start_time = time.perf_counter()
            knn_model.fit(X_train, y_train)
            training_time = time.perf_counter() - start_time
        finally:
            # Always stop the monitor, even when fit() fails; otherwise the
            # sampling loop would keep running in the background forever.
            stop_flag.set()
            cpu_thread.join()

        # Add measurements (both CPU figures fall back to 0 when fit() finished
        # before the monitor could take a single sample)
        measurement_knn['Training Time (s)'] = training_time
        measurement_knn['Peak CPU Usage (%)'] = max(cpu_usage, default=0)
        measurement_knn['Average CPU Usage (%)'] = sum(cpu_usage) / len(cpu_usage) if cpu_usage else 0

        # Perform cross-validation
        cv_scores_knn = cross_val_score(knn_model, X_train, y_train, cv=cv, n_jobs=n_jobs)

        return cv_scores_knn, measurement_knn, knn_model

    except Exception as e:
        import traceback
        print("⛔ Full error traceback:")
        traceback.print_exc()  # Print detailed error traceback
        print(f"Error during KNN training: {e}")
        return None, None, None

# Data balancing


In [89]:
# Reading data
# Forward slashes resolve on Windows and POSIX alike; backslashes inside a
# non-raw string literal would be parsed as (invalid) escape sequences.
df = pd.read_csv("../../data prep/cicids2017_prep/cicids2017_FINAL_no_class_sync.csv")

In [7]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

In [91]:
# Separate the label column from the features, then hold out 30 % of the rows
# as a test set. Stratifying on the label keeps the class proportions equal in
# both splits.
y = df['Attack Type']
X = df.drop(columns='Attack Type')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [104]:
# Build three scaled variants of the data. Every scaler is fitted on the
# training split only and then applied to both splits, so no test-set
# statistics enter the fitted scaler.
RS = RobustScaler().fit(X_train)
X_train_RS_scaled, X_test_RS_scaled = RS.transform(X_train), RS.transform(X_test)

SS = StandardScaler().fit(X_train)
X_train_SS_scaled, X_test_SS_scaled = SS.transform(X_train), SS.transform(X_test)

MMS = MinMaxScaler().fit(X_train)
X_train_MMS_scaled, X_test_MMS_scaled = MMS.transform(X_train), MMS.transform(X_test)

In [93]:
# Class distribution of the full dataset (before the train/test split).
print(df['Attack Type'].value_counts())

Attack Type
Normal Traffic    2095057
DoS                193745
DDoS               128014
Port Scanning       90694
Brute Force          9150
Web Attacks          2143
Bots                 1948
Name: count, dtype: int64


## Baseline evaluations (no resampling)

In [94]:
# Random Forest baseline: unscaled features, no resampling.
eval_dataset_w_RF(X_train, X_test, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.0min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    2.0s finished


Cross validation average score: 0.9955 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9956
Resource measurements: {'Training Time (s)': 120.61213088035583, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 95.65379213483142}
                precision    recall  f1-score   support

          Bots       1.00      0.01      0.02       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       1.00      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.04      0.07       643

      accuracy                           1.00    756226
     macro avg       1.00      0.71      0.72    756226
  weighted avg       1.00      1.00      0.99    756226



In [95]:
# Random Forest on RobustScaler-scaled features, no resampling.
eval_dataset_w_RF(X_train_RS_scaled, X_test_RS_scaled, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.7min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.9s finished


Cross validation average score: 0.9955 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9956
Resource measurements: {'Training Time (s)': 102.1338119506836, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.26848874598076}
                precision    recall  f1-score   support

          Bots       1.00      0.01      0.03       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       1.00      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.01      0.01       643

      accuracy                           1.00    756226
     macro avg       1.00      0.71      0.71    756226
  weighted avg       1.00      1.00      0.99    756226



In [96]:
# Random Forest on StandardScaler-scaled features, no resampling.
eval_dataset_w_RF(X_train_SS_scaled, X_test_SS_scaled, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.5min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.8s finished


Cross validation average score: 0.9955 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9956
Resource measurements: {'Training Time (s)': 90.93157124519348, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.52833607907743}
                precision    recall  f1-score   support

          Bots       1.00      0.01      0.03       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       1.00      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.04      0.08       643

      accuracy                           1.00    756226
     macro avg       1.00      0.71      0.72    756226
  weighted avg       1.00      1.00      0.99    756226



In [97]:
# Random Forest on MinMaxScaler-scaled features, no resampling.
eval_dataset_w_RF(X_train_MMS_scaled, X_test_MMS_scaled, y_train, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.2min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.8s finished


Cross validation average score: 0.9955 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9955
Resource measurements: {'Training Time (s)': 71.45124316215515, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.99862475442038}
                precision    recall  f1-score   support

          Bots       1.00      0.01      0.03       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       1.00      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.01      0.01       643

      accuracy                           1.00    756226
     macro avg       1.00      0.71      0.71    756226
  weighted avg       1.00      1.00      0.99    756226



In [99]:
# KNN baseline: unscaled features, no resampling.
eval_dataset_w_KNN(X_train, X_test, y_train, y_test)

Cross validation average score: 0.9935 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9939
Resource measurements: {'Training Time (s)': 15.792449235916138, 'Peak CPU Usage (%)': 18.6, 'Average CPU Usage (%)': 14.314285714285715}
                precision    recall  f1-score   support

          Bots       0.75      0.47      0.58       584
   Brute Force       0.99      0.96      0.97      2745
          DDoS       0.97      0.98      0.98     38404
           DoS       0.98      0.98      0.98     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.98      1.00      0.99     27208
   Web Attacks       0.97      0.93      0.95       643

      accuracy                           0.99    756226
     macro avg       0.95      0.90      0.92    756226
  weighted avg       0.99      0.99      0.99    756226



In [148]:
# KNN on RobustScaler-scaled features, no resampling.
eval_dataset_w_KNN(X_train_RS_scaled, X_test_RS_scaled, y_train, y_test)

Cross validation average score: 0.9867 +/- standard deviation: 0.0003
Accuracy on the test set: 0.9875
Resource measurements: {'Training Time (s)': 14.503502607345581, 'Peak CPU Usage (%)': 20.8, 'Average CPU Usage (%)': 8.866666666666667}
                precision    recall  f1-score   support

          Bots       0.67      0.49      0.56       584
   Brute Force       1.00      0.97      0.98      2745
          DDoS       0.92      0.93      0.93     38404
           DoS       0.97      0.97      0.97     58124
Normal Traffic       0.99      0.99      0.99    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.97      0.93      0.95       643

      accuracy                           0.99    756226
     macro avg       0.93      0.90      0.91    756226
  weighted avg       0.99      0.99      0.99    756226



In [149]:
# KNN on StandardScaler-scaled features, no resampling.
eval_dataset_w_KNN(X_train_SS_scaled, X_test_SS_scaled, y_train, y_test)

Cross validation average score: 0.9973 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9974
Resource measurements: {'Training Time (s)': 14.706319808959961, 'Peak CPU Usage (%)': 31.2, 'Average CPU Usage (%)': 12.0125}
                precision    recall  f1-score   support

          Bots       0.68      0.50      0.58       584
   Brute Force       1.00      0.97      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.99      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.98      0.93      0.95       643

      accuracy                           1.00    756226
     macro avg       0.95      0.91      0.93    756226
  weighted avg       1.00      1.00      1.00    756226



In [150]:
# KNN on MinMaxScaler-scaled features, no resampling.
eval_dataset_w_KNN(X_train_MMS_scaled, X_test_MMS_scaled, y_train, y_test)

Cross validation average score: 0.9971 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9973
Resource measurements: {'Training Time (s)': 14.027924537658691, 'Peak CPU Usage (%)': 12.1, 'Average CPU Usage (%)': 7.833333333333333}
                precision    recall  f1-score   support

          Bots       0.68      0.50      0.58       584
   Brute Force       1.00      0.97      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.99      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.98      0.93      0.95       643

      accuracy                           1.00    756226
     macro avg       0.95      0.91      0.93    756226
  weighted avg       1.00      1.00      1.00    756226



## Undersampling the majority class

In [101]:
from imblearn.under_sampling import RandomUnderSampler, NearMiss

In [105]:
# Initializing the undersampling for the clean df
X_train_resampled_rus, y_train_resampled_rus = RandomUnderSampler(sampling_strategy={'Normal Traffic': 500000}, random_state=42).fit_resample(X_train, y_train)

# Initializing the undersampling for the scaled df
X_train_scaled_rus_RS, y_train_scaled_rus_RS = RandomUnderSampler(sampling_strategy={'Normal Traffic': 500000}, random_state=42).fit_resample(X_train_RS_scaled, y_train)

X_train_scaled_rus_SS, y_train_scaled_rus_SS = RandomUnderSampler(sampling_strategy={'Normal Traffic': 500000}, random_state=42).fit_resample(X_train_SS_scaled, y_train)

X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(sampling_strategy={'Normal Traffic': 500000}, random_state=42).fit_resample(X_train_MMS_scaled, y_train)

In [103]:
# Initializing the undersampling for the clean df
X_train_resampled_NM, y_train_resampled_NM = NearMiss(sampling_strategy={'Normal Traffic': 500000}, version=3).fit_resample(X_train, y_train)

# Initializing the undersampling for the scaled df
X_train_scaled_NM_RS, y_train_scaled_NM_RS = NearMiss(sampling_strategy={'Normal Traffic': 500000}, version=3).fit_resample(X_train_RS_scaled, y_train)

X_train_scaled_NM_SS, y_train_scaled_NM_SS = NearMiss(sampling_strategy={'Normal Traffic': 500000}, version=3).fit_resample(X_train_SS_scaled, y_train)

X_train_scaled_NM_MMS, y_train_scaled_NM_MMS = NearMiss(sampling_strategy={'Normal Traffic': 500000}, version=3).fit_resample(X_train_MMS_scaled, y_train)




## Random Forest evaluations on undersampled training data

In [106]:
# Random Forest trained on the randomly undersampled, unscaled training split; tested on the original test split.
eval_dataset_w_RF(X_train_resampled_rus, X_test, y_train_resampled_rus, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   42.5s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.9s finished


Cross validation average score: 0.9926 +/- standard deviation: 0.0002
Accuracy on the test set: 0.9962
Resource measurements: {'Training Time (s)': 43.40252733230591, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 96.64455128205131}
                precision    recall  f1-score   support

          Bots       1.00      0.30      0.47       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.05      0.09       643

      accuracy                           1.00    756226
     macro avg       1.00      0.76      0.79    756226
  weighted avg       1.00      1.00      1.00    756226



In [107]:
# Random Forest: random undersampling + RobustScaler features.
eval_dataset_w_RF(X_train_scaled_rus_RS, X_test_RS_scaled, y_train_scaled_rus_RS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   37.9s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.8s finished


Cross validation average score: 0.9928 +/- standard deviation: 0.0001
Accuracy on the test set: 0.9962
Resource measurements: {'Training Time (s)': 38.812490940093994, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 95.1874149659864}
                precision    recall  f1-score   support

          Bots       1.00      0.30      0.47       584
   Brute Force       1.00      0.97      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.05      0.09       643

      accuracy                           1.00    756226
     macro avg       1.00      0.76      0.79    756226
  weighted avg       1.00      1.00      1.00    756226



In [108]:
# Random Forest: random undersampling + StandardScaler features.
eval_dataset_w_RF(X_train_scaled_rus_SS, X_test_SS_scaled, y_train_scaled_rus_SS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   34.5s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    2.0s finished


Cross validation average score: 0.9926 +/- standard deviation: 0.0002
Accuracy on the test set: 0.9962
Resource measurements: {'Training Time (s)': 35.354289531707764, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.52527075812272}
                precision    recall  f1-score   support

          Bots       1.00      0.30      0.47       584
   Brute Force       1.00      0.97      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.05      0.09       643

      accuracy                           1.00    756226
     macro avg       1.00      0.76      0.79    756226
  weighted avg       1.00      1.00      1.00    756226



In [109]:
# Random Forest: random undersampling + MinMaxScaler features.
eval_dataset_w_RF(X_train_scaled_rus_MMS, X_test_MMS_scaled, y_train_scaled_rus_MMS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   29.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.9s finished


Cross validation average score: 0.9928 +/- standard deviation: 0.0003
Accuracy on the test set: 0.9962
Resource measurements: {'Training Time (s)': 30.5056893825531, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.88257261410787}
                precision    recall  f1-score   support

          Bots       1.00      0.30      0.47       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       1.00      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.05      0.09       643

      accuracy                           1.00    756226
     macro avg       1.00      0.76      0.79    756226
  weighted avg       1.00      1.00      1.00    756226



In [110]:
# Random Forest trained on the NearMiss-undersampled, unscaled training split; tested on the original test split.
eval_dataset_w_RF(X_train_resampled_NM, X_test, y_train_resampled_NM, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   11.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.9s finished


Cross validation average score: 0.9957 +/- standard deviation: 0.0008
Accuracy on the test set: 0.4689
Resource measurements: {'Training Time (s)': 11.453615427017212, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.33267326732674}
                precision    recall  f1-score   support

          Bots       0.04      0.66      0.07       584
   Brute Force       0.02      1.00      0.03      2745
          DDoS       0.54      1.00      0.70     38404
           DoS       0.27      1.00      0.43     58124
Normal Traffic       1.00      0.36      0.53    628518
 Port Scanning       0.61      1.00      0.76     27208
   Web Attacks       0.02      0.98      0.04       643

      accuracy                           0.47    756226
     macro avg       0.36      0.86      0.37    756226
  weighted avg       0.90      0.47      0.54    756226



In [111]:
# Random Forest: NearMiss undersampling + RobustScaler features.
eval_dataset_w_RF(X_train_scaled_NM_RS, X_test_RS_scaled, y_train_scaled_NM_RS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   10.3s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.8s finished


Cross validation average score: 0.9963 +/- standard deviation: 0.0008
Accuracy on the test set: 0.5171
Resource measurements: {'Training Time (s)': 10.683176279067993, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.25000000000001}
                precision    recall  f1-score   support

          Bots       0.04      0.68      0.08       584
   Brute Force       0.02      1.00      0.04      2745
          DDoS       0.40      1.00      0.58     38404
           DoS       0.29      1.00      0.45     58124
Normal Traffic       1.00      0.42      0.59    628518
 Port Scanning       0.68      1.00      0.81     27208
   Web Attacks       0.02      0.98      0.05       643

      accuracy                           0.52    756226
     macro avg       0.35      0.87      0.37    756226
  weighted avg       0.90      0.52      0.58    756226



In [112]:
# Random Forest: NearMiss undersampling + StandardScaler features.
eval_dataset_w_RF(X_train_scaled_NM_SS, X_test_SS_scaled, y_train_scaled_NM_SS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    9.3s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.8s finished


Cross validation average score: 0.9960 +/- standard deviation: 0.0004
Accuracy on the test set: 0.2662
Resource measurements: {'Training Time (s)': 9.652126789093018, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.93928571428572}
                precision    recall  f1-score   support

          Bots       0.03      0.69      0.06       584
   Brute Force       0.02      1.00      0.03      2745
          DDoS       0.23      1.00      0.37     38404
           DoS       0.24      1.00      0.39     58124
Normal Traffic       1.00      0.12      0.21    628518
 Port Scanning       0.37      1.00      0.54     27208
   Web Attacks       0.02      0.98      0.05       643

      accuracy                           0.27    756226
     macro avg       0.27      0.83      0.24    756226
  weighted avg       0.87      0.27      0.24    756226



In [113]:
# Random Forest: NearMiss undersampling + MinMaxScaler features.
eval_dataset_w_RF(X_train_scaled_NM_MMS, X_test_MMS_scaled, y_train_scaled_NM_MMS, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    7.5s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.8s finished


Cross validation average score: 0.9958 +/- standard deviation: 0.0004
Accuracy on the test set: 0.2600
Resource measurements: {'Training Time (s)': 7.835391521453857, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.86619718309862}
                precision    recall  f1-score   support

          Bots       0.03      0.71      0.06       584
   Brute Force       0.01      1.00      0.02      2745
          DDoS       0.71      1.00      0.83     38404
           DoS       0.24      1.00      0.38     58124
Normal Traffic       1.00      0.11      0.20    628518
 Port Scanning       0.22      1.00      0.37     27208
   Web Attacks       0.02      0.98      0.05       643

      accuracy                           0.26    756226
     macro avg       0.32      0.83      0.27    756226
  weighted avg       0.89      0.26      0.25    756226



## KNN evaluations on undersampled training data

In [151]:
# KNN trained on the randomly undersampled, unscaled training split; tested on the original test split.
eval_dataset_w_KNN(X_train_resampled_rus, X_test, y_train_resampled_rus, y_test)



Cross validation average score: 0.9823 +/- standard deviation: 0.0020
Accuracy on the test set: 0.9834
Resource measurements: {'Training Time (s)': 9.873514413833618, 'Peak CPU Usage (%)': 11.4, 'Average CPU Usage (%)': 9.200000000000001}
                precision    recall  f1-score   support

          Bots       0.16      0.75      0.26       584
   Brute Force       0.98      0.96      0.97      2745
          DDoS       0.97      0.98      0.98     38404
           DoS       0.96      0.98      0.97     58124
Normal Traffic       0.99      0.99      0.99    628518
 Port Scanning       0.99      0.81      0.89     27208
   Web Attacks       0.96      0.93      0.95       643

      accuracy                           0.98    756226
     macro avg       0.86      0.92      0.86    756226
  weighted avg       0.99      0.98      0.98    756226



In [152]:
# KNN: random undersampling + RobustScaler features.
eval_dataset_w_KNN(X_train_scaled_rus_RS, X_test_RS_scaled, y_train_scaled_rus_RS, y_test)

Cross validation average score: 0.9646 +/- standard deviation: 0.0007
Accuracy on the test set: 0.9700
Resource measurements: {'Training Time (s)': 9.611931800842285, 'Peak CPU Usage (%)': 9.9, 'Average CPU Usage (%)': 8.0}
                precision    recall  f1-score   support

          Bots       0.16      0.77      0.27       584
   Brute Force       1.00      0.97      0.98      2745
          DDoS       0.89      0.94      0.91     38404
           DoS       0.95      0.97      0.96     58124
Normal Traffic       0.98      0.99      0.98    628518
 Port Scanning       1.00      0.66      0.80     27208
   Web Attacks       0.96      0.93      0.95       643

      accuracy                           0.97    756226
     macro avg       0.85      0.89      0.84    756226
  weighted avg       0.97      0.97      0.97    756226



In [153]:
# KNN: random undersampling + StandardScaler features.
eval_dataset_w_KNN(X_train_scaled_rus_SS, X_test_SS_scaled, y_train_scaled_rus_SS, y_test)

Cross validation average score: 0.9803 +/- standard deviation: 0.0019
Accuracy on the test set: 0.9859
Resource measurements: {'Training Time (s)': 5.313117742538452, 'Peak CPU Usage (%)': 15.2, 'Average CPU Usage (%)': 9.6}
                precision    recall  f1-score   support

          Bots       0.17      0.84      0.28       584
   Brute Force       0.99      0.97      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.97      0.99      0.98     58124
Normal Traffic       0.99      0.99      0.99    628518
 Port Scanning       0.99      0.81      0.89     27208
   Web Attacks       0.93      0.94      0.93       643

      accuracy                           0.99    756226
     macro avg       0.86      0.93      0.86    756226
  weighted avg       0.99      0.99      0.99    756226



In [154]:
# KNN: random undersampling + MinMaxScaler features.
eval_dataset_w_KNN(X_train_scaled_rus_MMS, X_test_MMS_scaled, y_train_scaled_rus_MMS, y_test)



Cross validation average score: 0.9771 +/- standard deviation: 0.0011
Accuracy on the test set: 0.9784
Resource measurements: {'Training Time (s)': 9.802721500396729, 'Peak CPU Usage (%)': 14.2, 'Average CPU Usage (%)': 12.0}
                precision    recall  f1-score   support

          Bots       0.16      0.78      0.27       584
   Brute Force       0.99      0.97      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.93      0.99      0.96     58124
Normal Traffic       0.98      0.99      0.99    628518
 Port Scanning       1.00      0.67      0.80     27208
   Web Attacks       0.97      0.93      0.95       643

      accuracy                           0.98    756226
     macro avg       0.86      0.90      0.85    756226
  weighted avg       0.98      0.98      0.98    756226



In [155]:
# KNN on the NM-resampled, unscaled training set, scored on the unscaled test set.
# NOTE(review): `NM` presumably stands for NearMiss — confirm where these variables are built.
# NOTE(review): the recorded run shows CV ~0.99 but test accuracy ~0.24; the CV score
# appears to be measured on the resampled training data only, so it should not be
# read as an estimate of performance on the original class distribution.
eval_dataset_w_KNN(X_train_resampled_NM, X_test, y_train_resampled_NM, y_test)

Cross validation average score: 0.9907 +/- standard deviation: 0.0004
Accuracy on the test set: 0.2360
Resource measurements: {'Training Time (s)': 1.4848995208740234, 'Peak CPU Usage (%)': 9.9, 'Average CPU Usage (%)': 8.25}
                precision    recall  f1-score   support

          Bots       0.02      0.69      0.04       584
   Brute Force       0.03      0.99      0.06      2745
          DDoS       0.19      0.99      0.32     38404
           DoS       0.20      1.00      0.34     58124
Normal Traffic       1.00      0.08      0.15    628518
 Port Scanning       0.32      1.00      0.49     27208
   Web Attacks       0.02      0.94      0.04       643

      accuracy                           0.24    756226
     macro avg       0.26      0.81      0.21    756226
  weighted avg       0.87      0.24      0.19    756226



In [156]:
# KNN on the NM-resampled, RS-scaled training set, scored on the RS-scaled test set.
# NOTE(review): recorded run — CV ~0.98 vs. test accuracy ~0.21; the gap suggests the
# resampled training distribution differs strongly from the test distribution.
eval_dataset_w_KNN(X_train_scaled_NM_RS, X_test_RS_scaled, y_train_scaled_NM_RS, y_test)

Cross validation average score: 0.9840 +/- standard deviation: 0.0004
Accuracy on the test set: 0.2071
Resource measurements: {'Training Time (s)': 1.4383196830749512, 'Peak CPU Usage (%)': 13.3, 'Average CPU Usage (%)': 10.225}
                precision    recall  f1-score   support

          Bots       0.03      0.76      0.05       584
   Brute Force       0.02      0.99      0.03      2745
          DDoS       0.24      0.99      0.39     38404
           DoS       0.19      0.99      0.32     58124
Normal Traffic       1.00      0.05      0.09    628518
 Port Scanning       0.40      1.00      0.57     27208
   Web Attacks       0.04      0.96      0.07       643

      accuracy                           0.21    756226
     macro avg       0.27      0.82      0.22    756226
  weighted avg       0.87      0.21      0.14    756226



In [157]:
# KNN on the NM-resampled, SS-scaled training set, scored on the SS-scaled test set.
# NOTE(review): recorded run — CV ~1.00 vs. test accuracy ~0.26 ('Normal Traffic' recall 0.11).
eval_dataset_w_KNN(X_train_scaled_NM_SS, X_test_SS_scaled, y_train_scaled_NM_SS, y_test)

Cross validation average score: 0.9955 +/- standard deviation: 0.0004
Accuracy on the test set: 0.2573
Resource measurements: {'Training Time (s)': 1.4132719039916992, 'Peak CPU Usage (%)': 11.5, 'Average CPU Usage (%)': 7.475}
                precision    recall  f1-score   support

          Bots       0.02      0.78      0.05       584
   Brute Force       0.04      0.99      0.08      2745
          DDoS       0.54      1.00      0.70     38404
           DoS       0.13      1.00      0.22     58124
Normal Traffic       1.00      0.11      0.19    628518
 Port Scanning       0.54      1.00      0.70     27208
   Web Attacks       0.03      0.96      0.05       643

      accuracy                           0.26    756226
     macro avg       0.33      0.83      0.29    756226
  weighted avg       0.89      0.26      0.24    756226



In [158]:
# KNN on the NM-resampled, MMS-scaled training set, scored on the MMS-scaled test set.
# NOTE(review): recorded run — CV ~1.00 vs. test accuracy ~0.25 ('Normal Traffic' recall 0.10).
eval_dataset_w_KNN(X_train_scaled_NM_MMS, X_test_MMS_scaled, y_train_scaled_NM_MMS, y_test)

Cross validation average score: 0.9952 +/- standard deviation: 0.0003
Accuracy on the test set: 0.2504
Resource measurements: {'Training Time (s)': 1.390134572982788, 'Peak CPU Usage (%)': 12.1, 'Average CPU Usage (%)': 8.725}
                precision    recall  f1-score   support

          Bots       0.02      0.78      0.05       584
   Brute Force       0.04      0.99      0.08      2745
          DDoS       0.51      1.00      0.67     38404
           DoS       0.13      1.00      0.23     58124
Normal Traffic       1.00      0.10      0.18    628518
 Port Scanning       0.53      1.00      0.70     27208
   Web Attacks       0.02      0.97      0.04       643

      accuracy                           0.25    756226
     macro avg       0.32      0.83      0.28    756226
  weighted avg       0.88      0.25      0.23    756226



## Oversampling (ADASYN, SMOTE, Borderline-SMOTE)

In [114]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE

In [115]:
# Oversample the rus-resampled training sets with three strategies (ADASYN, SMOTE,
# Borderline-SMOTE), each applied to the unscaled, RS-scaled and MMS-scaled variants.
# Only the training data is resampled; the test sets are never passed to a sampler here.
#
# NOTE(review): the resampled sets are later handed to the eval_* helpers, which appear to
# cross-validate on the data they are given. Synthetic points generated before the CV split
# can end up in a different fold than their source points, so those CV scores are likely
# optimistic — confirm, and consider resampling inside an imblearn Pipeline instead.

# Desired number of samples per class after SMOTE / Borderline-SMOTE (class label -> count).
# 'Normal Traffic' is intentionally absent, so it is left at its current size.
OVERSAMPLING_TARGET_COUNTS = {
    'Bots': 2000,
    'Web Attacks': 2000,
    'Brute Force': 7000,
    'Port Scanning': 70000,
    'DDoS': 90000,
    'DoS': 200000,
}


def _new_adasyn():
    """Return a fresh ADASYN sampler; 'auto' resamples every class except the majority."""
    return ADASYN(sampling_strategy='auto', random_state=42, n_neighbors=5)


def _new_smote():
    """Return a fresh SMOTE sampler using the shared per-class target counts."""
    # Pass a copy so no sampler instance can share/mutate the module-level dict.
    return SMOTE(sampling_strategy=dict(OVERSAMPLING_TARGET_COUNTS), random_state=42)


def _new_borderline_smote():
    """Return a fresh Borderline-SMOTE ('borderline-1') sampler using the shared target counts."""
    return BorderlineSMOTE(
        sampling_strategy=dict(OVERSAMPLING_TARGET_COUNTS),
        random_state=42,
        k_neighbors=5,
        m_neighbors=10,
        kind='borderline-1',
    )


# --- ADASYN ---
X_train_resampled_ADASYN, y_train_resampled_ADASYN = _new_adasyn().fit_resample(X_train_resampled_rus, y_train_resampled_rus)

X_train_resampled_scaled_RS_ADASYN, y_train_resampled_scaled_RS_ADASYN = _new_adasyn().fit_resample(X_train_scaled_rus_RS, y_train_scaled_rus_RS)

X_train_resampled_scaled_MMS_ADASYN, y_train_resampled_scaled_MMS_ADASYN = _new_adasyn().fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)


# --- SMOTE ---
X_train_resampled_SMOTE, y_train_resampled_SMOTE = _new_smote().fit_resample(X_train_resampled_rus, y_train_resampled_rus)

X_train_resampled_scaled_RS_SMOTE, y_train_resampled_scaled_RS_SMOTE = _new_smote().fit_resample(X_train_scaled_rus_RS, y_train_scaled_rus_RS)

X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = _new_smote().fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)


# --- Borderline-SMOTE ---
X_train_resampled_BSMOTE, y_train_resampled_BSMOTE = _new_borderline_smote().fit_resample(X_train_resampled_rus, y_train_resampled_rus)

X_train_resampled_scaled_RS_BSMOTE, y_train_resampled_scaled_RS_BSMOTE = _new_borderline_smote().fit_resample(X_train_scaled_rus_RS, y_train_scaled_rus_RS)

X_train_resampled_scaled_MMS_BSMOTE, y_train_resampled_scaled_MMS_BSMOTE = _new_borderline_smote().fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

In [116]:
# Class distribution of the rus-resampled labels, i.e. the input to the oversamplers above.
y_train_resampled_rus.value_counts()

Attack Type
Normal Traffic    500000
DoS               135621
DDoS               89610
Port Scanning      63486
Brute Force         6405
Web Attacks         1500
Bots                1364
Name: count, dtype: int64

In [117]:
# Class distribution after ADASYN with sampling_strategy='auto'. In the recorded run
# every class ends up close to (not exactly at) the 500000 'Normal Traffic' rows.
y_train_resampled_ADASYN.value_counts()

Attack Type
Port Scanning     505920
DDoS              500622
Bots              500076
Brute Force       500064
Web Attacks       500050
Normal Traffic    500000
DoS               499430
Name: count, dtype: int64

In [118]:
# Class distribution after SMOTE. The attack classes should match the explicit per-class
# targets passed as sampling_strategy, while 'Normal Traffic' stays at its undersampled size.
y_train_resampled_SMOTE.value_counts()

Attack Type
Normal Traffic    500000
DoS               200000
DDoS               90000
Port Scanning      70000
Brute Force         7000
Bots                2000
Web Attacks         2000
Name: count, dtype: int64

## Evaluate Random Forest on the oversampled datasets

In [119]:
# Random Forest on the ADASYN-oversampled, unscaled training set, scored on the unscaled test set.
# NOTE(review): recorded run — minority recall is ~0.99 but 'Bots' / 'Web Attacks' precision is 0.04,
# i.e. the fully balanced training set makes the model over-predict the rare classes.
eval_dataset_w_RF(X_train_resampled_ADASYN, X_test, y_train_resampled_ADASYN, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.6min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.9s finished


Cross validation average score: 0.9718 +/- standard deviation: 0.0069
Accuracy on the test set: 0.9384
Resource measurements: {'Training Time (s)': 157.79689478874207, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.9514841351075}
                precision    recall  f1-score   support

          Bots       0.04      0.99      0.07       584
   Brute Force       0.41      1.00      0.58      2745
          DDoS       0.97      1.00      0.98     38404
           DoS       0.87      1.00      0.93     58124
Normal Traffic       1.00      0.93      0.96    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.04      0.99      0.07       643

      accuracy                           0.94    756226
     macro avg       0.61      0.99      0.65    756226
  weighted avg       0.98      0.94      0.96    756226



In [120]:
# Random Forest on the ADASYN-oversampled, RS-scaled training set, scored on the RS-scaled test set.
eval_dataset_w_RF(X_train_resampled_scaled_RS_ADASYN, X_test_RS_scaled, y_train_resampled_scaled_RS_ADASYN, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.2min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    2.0s finished


Cross validation average score: 0.9594 +/- standard deviation: 0.0226
Accuracy on the test set: 0.9383
Resource measurements: {'Training Time (s)': 197.55077195167542, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.10075885328835}
                precision    recall  f1-score   support

          Bots       0.04      0.99      0.07       584
   Brute Force       0.39      1.00      0.56      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.89      1.00      0.94     58124
Normal Traffic       1.00      0.93      0.96    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.03      0.99      0.06       643

      accuracy                           0.94    756226
     macro avg       0.62      0.99      0.66    756226
  weighted avg       0.99      0.94      0.96    756226



In [121]:
# Random Forest on the ADASYN-oversampled, MMS-scaled training set, scored on the MMS-scaled test set.
eval_dataset_w_RF(X_train_resampled_scaled_MMS_ADASYN, X_test_MMS_scaled, y_train_resampled_scaled_MMS_ADASYN, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.7min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.8s finished


Cross validation average score: 0.9520 +/- standard deviation: 0.0082
Accuracy on the test set: 0.8903
Resource measurements: {'Training Time (s)': 108.67772316932678, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.91941489361697}
                precision    recall  f1-score   support

          Bots       0.03      0.99      0.06       584
   Brute Force       0.19      1.00      0.32      2745
          DDoS       0.81      1.00      0.90     38404
           DoS       0.76      1.00      0.86     58124
Normal Traffic       1.00      0.87      0.93    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.02      0.99      0.05       643

      accuracy                           0.89    756226
     macro avg       0.54      0.98      0.59    756226
  weighted avg       0.97      0.89      0.92    756226



In [122]:
# Random Forest on the SMOTE-oversampled, unscaled training set, scored on the unscaled test set.
# NOTE(review): recorded run — overall accuracy ~0.996, yet 'Web Attacks' recall is only 0.05;
# accuracy alone hides the failure on that class.
eval_dataset_w_RF(X_train_resampled_SMOTE, X_test, y_train_resampled_SMOTE, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   43.4s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.6s finished


Cross validation average score: 0.9916 +/- standard deviation: 0.0005
Accuracy on the test set: 0.9961
Resource measurements: {'Training Time (s)': 44.4897301197052, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.2110769230769}
                precision    recall  f1-score   support

          Bots       0.58      0.57      0.57       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.05      0.09       643

      accuracy                           1.00    756226
     macro avg       0.94      0.79      0.80    756226
  weighted avg       1.00      1.00      1.00    756226



In [123]:
# Random Forest on the SMOTE-oversampled, RS-scaled training set, scored on the RS-scaled test set.
eval_dataset_w_RF(X_train_resampled_scaled_RS_SMOTE, X_test_RS_scaled, y_train_resampled_scaled_RS_SMOTE, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   41.4s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.6s finished


Cross validation average score: 0.9916 +/- standard deviation: 0.0004
Accuracy on the test set: 0.9961
Resource measurements: {'Training Time (s)': 42.391907691955566, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.31536050156738}
                precision    recall  f1-score   support

          Bots       0.58      0.57      0.57       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.05      0.09       643

      accuracy                           1.00    756226
     macro avg       0.94      0.79      0.80    756226
  weighted avg       1.00      1.00      1.00    756226



In [124]:
# Random Forest on the SMOTE-oversampled, MMS-scaled training set, scored on the MMS-scaled test set.
# NOTE: X_train_resampled_scaled_MMS_SMOTE is reassigned again in the "Tweaking OverSampling"
# cells further down, so this result depends on which definition was executed last.
eval_dataset_w_RF(X_train_resampled_scaled_MMS_SMOTE, X_test_MMS_scaled, y_train_resampled_scaled_MMS_SMOTE, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   30.2s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.7s finished


Cross validation average score: 0.9918 +/- standard deviation: 0.0004
Accuracy on the test set: 0.9962
Resource measurements: {'Training Time (s)': 31.16592001914978, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.7003937007874}
                precision    recall  f1-score   support

          Bots       0.59      0.57      0.58       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.05      0.09       643

      accuracy                           1.00    756226
     macro avg       0.94      0.79      0.80    756226
  weighted avg       1.00      1.00      1.00    756226



In [125]:
# Random Forest on the Borderline-SMOTE-oversampled, unscaled training set, scored on the unscaled test set.
eval_dataset_w_RF(X_train_resampled_BSMOTE, X_test, y_train_resampled_BSMOTE, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   41.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.7s finished


Cross validation average score: 0.9855 +/- standard deviation: 0.0032
Accuracy on the test set: 0.9860
Resource measurements: {'Training Time (s)': 42.01073408126831, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.52073170731704}
                precision    recall  f1-score   support

          Bots       0.58      0.56      0.57       584
   Brute Force       1.00      0.97      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.87      1.00      0.93     58124
Normal Traffic       1.00      0.98      0.99    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.05      0.09       643

      accuracy                           0.99    756226
     macro avg       0.92      0.79      0.79    756226
  weighted avg       0.99      0.99      0.99    756226



In [126]:
# Random Forest on the Borderline-SMOTE-oversampled, RS-scaled training set, scored on the RS-scaled test set.
eval_dataset_w_RF(X_train_resampled_scaled_RS_BSMOTE, X_test_RS_scaled, y_train_resampled_scaled_RS_BSMOTE, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   43.5s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.7s finished


Cross validation average score: 0.9815 +/- standard deviation: 0.0131
Accuracy on the test set: 0.9883
Resource measurements: {'Training Time (s)': 44.4300742149353, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 93.80628742514965}
                precision    recall  f1-score   support

          Bots       0.58      0.57      0.57       584
   Brute Force       1.00      0.97      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.89      1.00      0.94     58124
Normal Traffic       1.00      0.99      0.99    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.05      0.09       643

      accuracy                           0.99    756226
     macro avg       0.92      0.79      0.80    756226
  weighted avg       0.99      0.99      0.99    756226



In [127]:
# Random Forest on the Borderline-SMOTE-oversampled, MMS-scaled training set, scored on the MMS-scaled test set.
eval_dataset_w_RF(X_train_resampled_scaled_MMS_BSMOTE, X_test_MMS_scaled, y_train_resampled_scaled_MMS_BSMOTE, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   29.2s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.8s finished


Cross validation average score: 0.9830 +/- standard deviation: 0.0039
Accuracy on the test set: 0.9841
Resource measurements: {'Training Time (s)': 30.112889289855957, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.2765182186235}
                precision    recall  f1-score   support

          Bots       0.58      0.56      0.57       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.85      1.00      0.92     58124
Normal Traffic       1.00      0.98      0.99    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.05      0.09       643

      accuracy                           0.98    756226
     macro avg       0.92      0.79      0.79    756226
  weighted avg       0.99      0.98      0.98    756226



## Evaluate KNN on the oversampled datasets

In [159]:
# KNN on the ADASYN-oversampled, unscaled training set, scored on the unscaled test set.
# NOTE(review): recorded run — CV (~0.87 +/- 0.07) is lower and noisier than test accuracy (~0.97);
# the two numbers are measured on very different class distributions and are not directly comparable.
eval_dataset_w_KNN(X_train_resampled_ADASYN, X_test, y_train_resampled_ADASYN, y_test)

Cross validation average score: 0.8677 +/- standard deviation: 0.0680
Accuracy on the test set: 0.9696
Resource measurements: {'Training Time (s)': 22.81295943260193, 'Peak CPU Usage (%)': 14.3, 'Average CPU Usage (%)': 10.383333333333335}
                precision    recall  f1-score   support

          Bots       0.15      0.92      0.26       584
   Brute Force       0.72      0.98      0.83      2745
          DDoS       0.90      0.99      0.95     38404
           DoS       0.87      0.99      0.93     58124
Normal Traffic       0.99      0.97      0.98    628518
 Port Scanning       0.97      0.86      0.91     27208
   Web Attacks       0.26      0.91      0.41       643

      accuracy                           0.97    756226
     macro avg       0.70      0.95      0.75    756226
  weighted avg       0.98      0.97      0.97    756226



In [160]:
# KNN on the ADASYN-oversampled, RS-scaled training set, scored on the RS-scaled test set.
eval_dataset_w_KNN(X_train_resampled_scaled_RS_ADASYN, X_test_RS_scaled, y_train_resampled_scaled_RS_ADASYN, y_test)

Cross validation average score: 0.8890 +/- standard deviation: 0.0450
Accuracy on the test set: 0.9451
Resource measurements: {'Training Time (s)': 23.712327003479004, 'Peak CPU Usage (%)': 27.5, 'Average CPU Usage (%)': 13.233333333333334}
                precision    recall  f1-score   support

          Bots       0.15      0.92      0.25       584
   Brute Force       0.79      0.99      0.88      2745
          DDoS       0.68      0.99      0.81     38404
           DoS       0.80      0.99      0.89     58124
Normal Traffic       0.99      0.94      0.97    628518
 Port Scanning       0.98      0.84      0.91     27208
   Web Attacks       0.51      0.94      0.66       643

      accuracy                           0.95    756226
     macro avg       0.70      0.95      0.77    756226
  weighted avg       0.96      0.95      0.95    756226



In [161]:
# KNN on the ADASYN-oversampled, MMS-scaled training set, scored on the MMS-scaled test set.
eval_dataset_w_KNN(X_train_resampled_scaled_MMS_ADASYN, X_test_MMS_scaled, y_train_resampled_scaled_MMS_ADASYN, y_test)

Cross validation average score: 0.8329 +/- standard deviation: 0.0471
Accuracy on the test set: 0.9756
Resource measurements: {'Training Time (s)': 21.23766040802002, 'Peak CPU Usage (%)': 18.4, 'Average CPU Usage (%)': 10.728571428571428}
                precision    recall  f1-score   support

          Bots       0.17      0.92      0.29       584
   Brute Force       0.78      0.99      0.87      2745
          DDoS       0.96      1.00      0.98     38404
           DoS       0.90      0.99      0.95     58124
Normal Traffic       0.99      0.98      0.99    628518
 Port Scanning       0.99      0.77      0.87     27208
   Web Attacks       0.51      0.86      0.64       643

      accuracy                           0.98    756226
     macro avg       0.76      0.93      0.80    756226
  weighted avg       0.98      0.98      0.98    756226



In [162]:
# KNN on the SMOTE-oversampled, unscaled training set, scored on the unscaled test set.
eval_dataset_w_KNN(X_train_resampled_SMOTE, X_test, y_train_resampled_SMOTE, y_test)

Cross validation average score: 0.9825 +/- standard deviation: 0.0013
Accuracy on the test set: 0.9846
Resource measurements: {'Training Time (s)': 10.40953540802002, 'Peak CPU Usage (%)': 10.4, 'Average CPU Usage (%)': 8.3}
                precision    recall  f1-score   support

          Bots       0.16      0.80      0.27       584
   Brute Force       0.98      0.96      0.97      2745
          DDoS       0.97      0.98      0.98     38404
           DoS       0.96      0.99      0.97     58124
Normal Traffic       0.99      0.99      0.99    628518
 Port Scanning       0.99      0.86      0.92     27208
   Web Attacks       0.94      0.93      0.94       643

      accuracy                           0.98    756226
     macro avg       0.86      0.93      0.86    756226
  weighted avg       0.99      0.98      0.99    756226



In [163]:
# KNN on the SMOTE-oversampled, RS-scaled training set, scored on the RS-scaled test set.
eval_dataset_w_KNN(X_train_resampled_scaled_RS_SMOTE, X_test_RS_scaled, y_train_resampled_scaled_RS_SMOTE, y_test)

Cross validation average score: 0.9644 +/- standard deviation: 0.0006
Accuracy on the test set: 0.9678
Resource measurements: {'Training Time (s)': 10.052657842636108, 'Peak CPU Usage (%)': 10.4, 'Average CPU Usage (%)': 8.775}
                precision    recall  f1-score   support

          Bots       0.17      0.85      0.28       584
   Brute Force       0.99      0.97      0.98      2745
          DDoS       0.89      0.94      0.91     38404
           DoS       0.92      0.97      0.95     58124
Normal Traffic       0.98      0.98      0.98    628518
 Port Scanning       1.00      0.66      0.80     27208
   Web Attacks       0.89      0.95      0.92       643

      accuracy                           0.97    756226
     macro avg       0.83      0.90      0.83    756226
  weighted avg       0.97      0.97      0.97    756226



In [164]:
# KNN on the SMOTE-oversampled, MMS-scaled training set, scored on the MMS-scaled test set.
# NOTE: X_train_resampled_scaled_MMS_SMOTE is reassigned in the "Tweaking OverSampling" cells,
# so this result depends on which definition was executed last.
eval_dataset_w_KNN(X_train_resampled_scaled_MMS_SMOTE, X_test_MMS_scaled, y_train_resampled_scaled_MMS_SMOTE, y_test)

Cross validation average score: 0.9762 +/- standard deviation: 0.0007
Accuracy on the test set: 0.9778
Resource measurements: {'Training Time (s)': 10.865217447280884, 'Peak CPU Usage (%)': 23.8, 'Average CPU Usage (%)': 17.975}
                precision    recall  f1-score   support

          Bots       0.17      0.83      0.28       584
   Brute Force       0.99      0.97      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.93      0.99      0.96     58124
Normal Traffic       0.98      0.99      0.99    628518
 Port Scanning       1.00      0.66      0.80     27208
   Web Attacks       0.92      0.96      0.94       643

      accuracy                           0.98    756226
     macro avg       0.86      0.91      0.85    756226
  weighted avg       0.98      0.98      0.98    756226



In [165]:
# KNN on the Borderline-SMOTE-oversampled, unscaled training set, scored on the unscaled test set.
eval_dataset_w_KNN(X_train_resampled_BSMOTE, X_test, y_train_resampled_BSMOTE, y_test)

Cross validation average score: 0.9775 +/- standard deviation: 0.0070
Accuracy on the test set: 0.9794
Resource measurements: {'Training Time (s)': 5.936140537261963, 'Peak CPU Usage (%)': 9.7, 'Average CPU Usage (%)': 8.26}
                precision    recall  f1-score   support

          Bots       0.16      0.83      0.27       584
   Brute Force       0.94      0.97      0.95      2745
          DDoS       0.95      0.99      0.97     38404
           DoS       0.90      0.99      0.94     58124
Normal Traffic       0.99      0.98      0.99    628518
 Port Scanning       0.98      0.89      0.93     27208
   Web Attacks       0.88      0.93      0.90       643

      accuracy                           0.98    756226
     macro avg       0.83      0.94      0.85    756226
  weighted avg       0.98      0.98      0.98    756226



In [166]:
# KNN on the Borderline-SMOTE-oversampled, RS-scaled training set, scored on the RS-scaled test set.
eval_dataset_w_KNN(X_train_resampled_scaled_RS_BSMOTE, X_test_RS_scaled, y_train_resampled_scaled_RS_BSMOTE, y_test)

Cross validation average score: 0.9556 +/- standard deviation: 0.0104
Accuracy on the test set: 0.9675
Resource measurements: {'Training Time (s)': 10.129005670547485, 'Peak CPU Usage (%)': 12.0, 'Average CPU Usage (%)': 8.55}
                precision    recall  f1-score   support

          Bots       0.17      0.88      0.29       584
   Brute Force       0.98      0.97      0.98      2745
          DDoS       0.90      0.94      0.92     38404
           DoS       0.91      0.98      0.94     58124
Normal Traffic       0.98      0.98      0.98    628518
 Port Scanning       1.00      0.66      0.79     27208
   Web Attacks       0.88      0.95      0.91       643

      accuracy                           0.97    756226
     macro avg       0.83      0.91      0.83    756226
  weighted avg       0.97      0.97      0.97    756226



In [167]:
# KNN on the Borderline-SMOTE-oversampled, MMS-scaled training set, scored on the MMS-scaled test set.
eval_dataset_w_KNN(X_train_resampled_scaled_MMS_BSMOTE, X_test_MMS_scaled, y_train_resampled_scaled_MMS_BSMOTE, y_test)

Cross validation average score: 0.9711 +/- standard deviation: 0.0100
Accuracy on the test set: 0.9793
Resource measurements: {'Training Time (s)': 9.841461896896362, 'Peak CPU Usage (%)': 10.8, 'Average CPU Usage (%)': 8.075000000000001}
                precision    recall  f1-score   support

          Bots       0.17      0.85      0.28       584
   Brute Force       0.98      0.97      0.98      2745
          DDoS       0.99      1.00      1.00     38404
           DoS       0.92      0.99      0.96     58124
Normal Traffic       0.99      0.99      0.99    628518
 Port Scanning       0.99      0.73      0.84     27208
   Web Attacks       0.88      0.96      0.92       643

      accuracy                           0.98    756226
     macro avg       0.85      0.93      0.85    756226
  weighted avg       0.98      0.98      0.98    756226



## Tweaking oversampling: raising the SMOTE targets for 'Bots' and 'Web Attacks'

In [168]:
# Experiment 1: raise the SMOTE targets to 10000 'Bots' and 20000 'Web Attacks'.
# NOTE: this cell reassigns X_train_scaled_rus_MMS / X_train_resampled_scaled_MMS_SMOTE,
# which are also defined earlier in the notebook; re-run the earlier cells to get those back.

# Step 1 — rebuild the undersampled MMS-scaled training set ('Normal Traffic' capped at 500000 rows).
_rus_sampler = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 500000},
    random_state=42,
)
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = _rus_sampler.fit_resample(X_train_MMS_scaled, y_train)

# Step 2 — oversample the attack classes up to the per-class counts below.
_smote_targets = {
    'Bots': 10000,
    'Web Attacks': 20000,
    'Brute Force': 7000,
    'Port Scanning': 70000,
    'DDoS': 90000,
    'DoS': 200000,
}
_smote_sampler = SMOTE(sampling_strategy=_smote_targets, random_state=42)
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = _smote_sampler.fit_resample(
    X_train_scaled_rus_MMS,
    y_train_scaled_rus_MMS,
)

In [169]:
# Random Forest on the MMS-scaled SMOTE set built in the cell directly above
# ('Bots' -> 10000, 'Web Attacks' -> 20000 in the recorded run).
# NOTE: the variable name is shared by several cells; the result reflects whichever ran last.
eval_dataset_w_RF(X_train_resampled_scaled_MMS_SMOTE, X_test_MMS_scaled, y_train_resampled_scaled_MMS_SMOTE, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   34.2s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.9s finished


Cross validation average score: 0.9874 +/- standard deviation: 0.0004
Accuracy on the test set: 0.9938
Resource measurements: {'Training Time (s)': 35.2208354473114, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 95.013503649635}
                precision    recall  f1-score   support

          Bots       0.59      0.61      0.60       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      0.99      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.20      0.93      0.33       643

      accuracy                           0.99    756226
     macro avg       0.82      0.93      0.84    756226
  weighted avg       1.00      0.99      0.99    756226



In [180]:
# Experiment 2: raise the SMOTE targets to 20000 'Bots' and 30000 'Web Attacks'.
# NOTE: this cell reassigns X_train_scaled_rus_MMS / X_train_resampled_scaled_MMS_SMOTE,
# which are also defined earlier in the notebook; re-run the earlier cells to get those back.

# Step 1 — rebuild the undersampled MMS-scaled training set ('Normal Traffic' capped at 500000 rows).
_rus_sampler = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 500000},
    random_state=42,
)
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = _rus_sampler.fit_resample(X_train_MMS_scaled, y_train)

# Step 2 — oversample the attack classes up to the per-class counts below.
_smote_targets = {
    'Bots': 20000,
    'Web Attacks': 30000,
    'Brute Force': 7000,
    'Port Scanning': 70000,
    'DDoS': 90000,
    'DoS': 200000,
}
_smote_sampler = SMOTE(sampling_strategy=_smote_targets, random_state=42)
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = _smote_sampler.fit_resample(
    X_train_scaled_rus_MMS,
    y_train_scaled_rus_MMS,
)

In [181]:
# Random Forest on the MMS-scaled SMOTE set built in the cell directly above
# ('Bots' -> 20000, 'Web Attacks' -> 30000 in the recorded run).
# NOTE: the variable name is shared by several cells; the result reflects whichever ran last.
eval_dataset_w_RF(X_train_resampled_scaled_MMS_SMOTE, X_test_MMS_scaled, y_train_resampled_scaled_MMS_SMOTE, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   33.8s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.8s finished


Cross validation average score: 0.9828 +/- standard deviation: 0.0003
Accuracy on the test set: 0.9950
Resource measurements: {'Training Time (s)': 34.81387424468994, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 95.24400000000003}
                precision    recall  f1-score   support

          Bots       0.59      0.62      0.60       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.29      0.93      0.45       643

      accuracy                           1.00    756226
     macro avg       0.84      0.93      0.86    756226
  weighted avg       1.00      1.00      1.00    756226



In [172]:
# Step 1: randomly undersample 'Normal Traffic' down to 500000 rows.
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 500000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

# Step 2: SMOTE-oversample the remaining classes to the per-class targets below
# (this run: Bots -> 50000, Web Attacks -> 50000).
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bots': 50000,
        'Web Attacks': 50000,
        'Brute Force': 7000,
        'Port Scanning': 70000,
        'DDoS': 90000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

In [173]:
# Run the random-forest evaluation helper on the resampled training data
# and the test split.
eval_dataset_w_RF(
    X_train_resampled_scaled_MMS_SMOTE,
    X_test_MMS_scaled,
    y_train_resampled_scaled_MMS_SMOTE,
    y_test,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   36.5s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.9s finished


Cross validation average score: 0.9835 +/- standard deviation: 0.0009
Accuracy on the test set: 0.9874
Resource measurements: {'Training Time (s)': 37.5619113445282, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.69192982456141}
                precision    recall  f1-score   support

          Bots       0.08      0.96      0.15       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      0.99      0.99    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.29      0.94      0.45       643

      accuracy                           0.99    756226
     macro avg       0.77      0.98      0.79    756226
  weighted avg       1.00      0.99      0.99    756226



In [174]:
# Step 1: randomly undersample 'Normal Traffic' down to 500000 rows.
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 500000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

# Step 2: SMOTE-oversample the remaining classes to the per-class targets below
# (this run: Bots -> 100000, Web Attacks -> 100000).
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bots': 100000,
        'Web Attacks': 100000,
        'Brute Force': 7000,
        'Port Scanning': 70000,
        'DDoS': 90000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

In [175]:
# Run the random-forest evaluation helper on the resampled training data
# and the test split.
eval_dataset_w_RF(
    X_train_resampled_scaled_MMS_SMOTE,
    X_test_MMS_scaled,
    y_train_resampled_scaled_MMS_SMOTE,
    y_test,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   38.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.8s finished


Cross validation average score: 0.9806 +/- standard deviation: 0.0006
Accuracy on the test set: 0.9832
Resource measurements: {'Training Time (s)': 39.14310598373413, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 95.0746794871795}
                precision    recall  f1-score   support

          Bots       0.06      0.98      0.11       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      0.98      0.99    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.30      0.94      0.45       643

      accuracy                           0.98    756226
     macro avg       0.76      0.98      0.79    756226
  weighted avg       1.00      0.98      0.99    756226



In [176]:
# Step 1: randomly undersample 'Normal Traffic' down to 500000 rows.
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 500000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

# Step 2: SMOTE-oversample the remaining classes to the per-class targets below
# (this run: Bots -> 3500, Web Attacks -> 5500).
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bots': 3500,
        'Web Attacks': 5500,
        'Brute Force': 7000,
        'Port Scanning': 70000,
        'DDoS': 90000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

In [177]:
# Run the random-forest evaluation helper on the resampled training data
# and the test split.
eval_dataset_w_RF(
    X_train_resampled_scaled_MMS_SMOTE,
    X_test_MMS_scaled,
    y_train_resampled_scaled_MMS_SMOTE,
    y_test,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   32.3s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    2.0s finished


Cross validation average score: 0.9913 +/- standard deviation: 0.0003
Accuracy on the test set: 0.9950
Resource measurements: {'Training Time (s)': 33.269890785217285, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.49584905660373}
                precision    recall  f1-score   support

          Bots       0.59      0.60      0.60       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.29      0.89      0.44       643

      accuracy                           1.00    756226
     macro avg       0.84      0.92      0.86    756226
  weighted avg       1.00      1.00      1.00    756226



In [178]:
# Step 1: randomly undersample 'Normal Traffic' down to 500000 rows.
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 500000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

# Step 2: SMOTE-oversample the remaining classes to the per-class targets below
# (this run: Bots -> 7500, Web Attacks -> 7500).
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bots': 7500,
        'Web Attacks': 7500,
        'Brute Force': 7000,
        'Port Scanning': 70000,
        'DDoS': 90000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

In [179]:
# Run the random-forest evaluation helper on the resampled training data
# and the test split.
eval_dataset_w_RF(
    X_train_resampled_scaled_MMS_SMOTE,
    X_test_MMS_scaled,
    y_train_resampled_scaled_MMS_SMOTE,
    y_test,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   32.4s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.7s finished


Cross validation average score: 0.9892 +/- standard deviation: 0.0004
Accuracy on the test set: 0.9950
Resource measurements: {'Training Time (s)': 33.352999687194824, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.17537878787877}
                precision    recall  f1-score   support

          Bots       0.59      0.61      0.60       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.29      0.89      0.44       643

      accuracy                           1.00    756226
     macro avg       0.84      0.92      0.86    756226
  weighted avg       1.00      1.00      1.00    756226



In [None]:
# NOTE(review): this cell carries no execution count in the saved notebook, so the
# evaluation output that follows may not correspond to these exact targets --
# TODO re-run this cell and the evaluation after it to confirm.

# Step 1: randomly undersample 'Normal Traffic' down to 500000 rows.
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 500000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

# Step 2: SMOTE-oversample the remaining classes to the per-class targets below
# (this run: Bots -> 5500, Web Attacks -> 10000).
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bots': 5500,
        'Web Attacks': 10000,
        'Brute Force': 7000,
        'Port Scanning': 70000,
        'DDoS': 90000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

In [133]:
# Run the random-forest evaluation helper on the resampled training data
# and the test split.
eval_dataset_w_RF(
    X_train_resampled_scaled_MMS_SMOTE,
    X_test_MMS_scaled,
    y_train_resampled_scaled_MMS_SMOTE,
    y_test,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   30.8s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.7s finished


Cross validation average score: 0.9893 +/- standard deviation: 0.0005
Accuracy on the test set: 0.9936
Resource measurements: {'Training Time (s)': 31.692704677581787, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.33346007604565}
                precision    recall  f1-score   support

          Bots       0.59      0.61      0.60       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      0.99      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.19      0.89      0.31       643

      accuracy                           0.99    756226
     macro avg       0.82      0.92      0.84    756226
  weighted avg       1.00      0.99      0.99    756226



## Tweaking UnderSampling

In [182]:
# Step 1: randomly undersample 'Normal Traffic' down to 1000000 rows
# (the undersampling target is the value being varied here).
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 1000000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

# Step 2: SMOTE-oversample the remaining classes to the per-class targets below.
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bots': 3500,
        'Web Attacks': 5500,
        'Brute Force': 7000,
        'Port Scanning': 70000,
        'DDoS': 90000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

In [183]:
# Run the random-forest evaluation helper on the resampled training data
# and the test split.
eval_dataset_w_RF(
    X_train_resampled_scaled_MMS_SMOTE,
    X_test_MMS_scaled,
    y_train_resampled_scaled_MMS_SMOTE,
    y_test,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   52.9s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.8s finished


Cross validation average score: 0.9908 +/- standard deviation: 0.0002
Accuracy on the test set: 0.9960
Resource measurements: {'Training Time (s)': 54.30982804298401, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.81592039801}
                precision    recall  f1-score   support

          Bots       0.59      0.58      0.59       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       1.00      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.05      0.09       643

      accuracy                           1.00    756226
     macro avg       0.94      0.79      0.80    756226
  weighted avg       1.00      1.00      1.00    756226



In [184]:
# Step 1: randomly undersample 'Normal Traffic' down to 750000 rows
# (the undersampling target is the value being varied here).
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 750000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

# Step 2: SMOTE-oversample the remaining classes to the per-class targets below.
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bots': 3500,
        'Web Attacks': 5500,
        'Brute Force': 7000,
        'Port Scanning': 70000,
        'DDoS': 90000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

In [185]:
# Run the random-forest evaluation helper on the resampled training data
# and the test split.
eval_dataset_w_RF(
    X_train_resampled_scaled_MMS_SMOTE,
    X_test_MMS_scaled,
    y_train_resampled_scaled_MMS_SMOTE,
    y_test,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   41.8s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.7s finished


Cross validation average score: 0.9896 +/- standard deviation: 0.0002
Accuracy on the test set: 0.9961
Resource measurements: {'Training Time (s)': 43.0068244934082, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.33333333333331}
                precision    recall  f1-score   support

          Bots       0.59      0.60      0.60       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       1.00      0.06      0.12       643

      accuracy                           1.00    756226
     macro avg       0.94      0.80      0.81    756226
  weighted avg       1.00      1.00      1.00    756226



In [190]:
# Step 1: randomly undersample 'Normal Traffic' down to 500000 rows
# (the undersampling target is the value being varied here).
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 500000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

# Step 2: SMOTE-oversample the remaining classes to the per-class targets below.
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bots': 3500,
        'Web Attacks': 5500,
        'Brute Force': 7000,
        'Port Scanning': 70000,
        'DDoS': 90000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

In [189]:
# Run the random-forest evaluation helper on the resampled training data
# and the test split.
eval_dataset_w_RF(
    X_train_resampled_scaled_MMS_SMOTE,
    X_test_MMS_scaled,
    y_train_resampled_scaled_MMS_SMOTE,
    y_test,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   35.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.3s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    2.1s finished


Cross validation average score: 0.9913 +/- standard deviation: 0.0003
Accuracy on the test set: 0.9950
Resource measurements: {'Training Time (s)': 35.967469215393066, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 95.68293650793649}
                precision    recall  f1-score   support

          Bots       0.59      0.60      0.60       584
   Brute Force       1.00      0.96      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.98      0.99     58124
Normal Traffic       1.00      1.00      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.29      0.89      0.44       643

      accuracy                           1.00    756226
     macro avg       0.84      0.92      0.86    756226
  weighted avg       1.00      1.00      1.00    756226



In [191]:
# Run the KNN evaluation helper on the same resampled training data and test
# split, for comparison with the random-forest run.
eval_dataset_w_KNN(
    X_train_resampled_scaled_MMS_SMOTE,
    X_test_MMS_scaled,
    y_train_resampled_scaled_MMS_SMOTE,
    y_test,
)

Cross validation average score: 0.9766 +/- standard deviation: 0.0014
Accuracy on the test set: 0.9811
Resource measurements: {'Training Time (s)': 6.3763511180877686, 'Peak CPU Usage (%)': 33.7, 'Average CPU Usage (%)': 24.925}
                precision    recall  f1-score   support

          Bots       0.17      0.88      0.29       584
   Brute Force       0.98      0.97      0.98      2745
          DDoS       0.99      1.00      1.00     38404
           DoS       0.93      0.99      0.96     58124
Normal Traffic       0.99      0.99      0.99    628518
 Port Scanning       0.99      0.77      0.87     27208
   Web Attacks       0.86      0.95      0.91       643

      accuracy                           0.98    756226
     macro avg       0.85      0.94      0.85    756226
  weighted avg       0.98      0.98      0.98    756226



In [186]:
# Step 1: randomly undersample 'Normal Traffic' down to 250000 rows
# (the undersampling target is the value being varied here).
X_train_scaled_rus_MMS, y_train_scaled_rus_MMS = RandomUnderSampler(
    sampling_strategy={'Normal Traffic': 250000},
    random_state=42,
).fit_resample(X_train_MMS_scaled, y_train)

# Step 2: SMOTE-oversample the remaining classes to the per-class targets below.
X_train_resampled_scaled_MMS_SMOTE, y_train_resampled_scaled_MMS_SMOTE = SMOTE(
    sampling_strategy={
        'Bots': 3500,
        'Web Attacks': 5500,
        'Brute Force': 7000,
        'Port Scanning': 70000,
        'DDoS': 90000,
        'DoS': 200000,
    },
    random_state=42,
).fit_resample(X_train_scaled_rus_MMS, y_train_scaled_rus_MMS)

In [187]:
# Run the random-forest evaluation helper on the resampled training data
# and the test split.
eval_dataset_w_RF(
    X_train_resampled_scaled_MMS_SMOTE,
    X_test_MMS_scaled,
    y_train_resampled_scaled_MMS_SMOTE,
    y_test,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   21.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 150 out of 150 | elapsed:    1.7s finished


Cross validation average score: 0.9902 +/- standard deviation: 0.0004
Accuracy on the test set: 0.9936
Resource measurements: {'Training Time (s)': 21.85346555709839, 'Peak CPU Usage (%)': 100.0, 'Average CPU Usage (%)': 94.83072625698323}
                precision    recall  f1-score   support

          Bots       0.58      0.60      0.59       584
   Brute Force       1.00      0.97      0.98      2745
          DDoS       1.00      1.00      1.00     38404
           DoS       0.99      0.99      0.99     58124
Normal Traffic       1.00      0.99      1.00    628518
 Port Scanning       0.99      1.00      0.99     27208
   Web Attacks       0.19      0.89      0.32       643

      accuracy                           0.99    756226
     macro avg       0.82      0.92      0.84    756226
  weighted avg       1.00      0.99      0.99    756226

