# Setup
## Parameters

In [1]:
label_classes = ['33+1', '8+1', '1+1']  # Classes: '33+1', '8+1', '1+1'

sampling_methods = ['None', 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', 'ClusterCentroids']   # Samplers: 'None', 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', 'ClusterCentroids'

evaluator_types = ['KNearestNeighbor']   # Evaluators: 'XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost', 'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor'

# Random Seeds
train_test_seed = 42
evaluator_seed = 42

# Import/Export Directories
metrics_directory = './metrics'
resampled_dataset_directory = './resampled_datasets'
evaluator_directory = './trained_evaluators'

# Notebook parameter validation

sampler_categories = ['None', 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', 'ClusterCentroids']
label_categories = ['33+1', '8+1', '1+1']
evaluator_categories = ['XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost', 'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor']

for label_class in label_classes:
    if label_class not in label_categories:
        assert False, f'{label_class} is an invalid class structure.'

for sampler in sampling_methods:
    if sampler not in sampler_categories:
        assert False, f'{sampler} is an invalid sampling method.'
    
for evaluator in evaluator_types:
    if evaluator not in evaluator_categories:
        assert False, f'{evaluator} is an invalid evaluator.'

## Common Packages

In [2]:
import os
import pandas as pd
import time
from IPython.display import display
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Importing the Dataset

In [3]:
all_filepaths = [filename for filename in os.listdir(resampled_dataset_directory) if filename.endswith('.parquet')]
print(f'CSVs in "{resampled_dataset_directory}": {len(all_filepaths)}\n')

datasets = {}

for sampler in sampling_methods:
    datasets[sampler] = {}
    for label_class in label_classes:
        print(f'Loading ./resampled_datasets/{sampler}_{label_class}_resampled_dataset.parquet')
        datasets[sampler][label_class] = pd.read_parquet(
            path=f'./resampled_datasets/{sampler}_{label_class}_resampled_dataset.parquet'
        )
        print(f'Dataset Shape: {datasets[sampler][label_class].shape}')
        display(datasets[sampler][label_class])

CSVs in "./resampled_datasets": 15

Loading ./resampled_datasets/None_33+1_resampled_dataset.parquet
Dataset Shape: (1389408, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.00,17.00,64.00,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,21
1,76.135781,428611.80,8.20,151.90,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.00,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.00,244.60,1
2,4.549627,108.00,6.00,64.00,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,12
3,0.000000,54.00,6.00,64.00,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,10
4,0.000000,0.00,1.00,64.00,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233582,0.000000,54.00,6.00,64.00,5.966641,5.966641,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.303339e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,13
233583,8.547875,20177.97,15.59,68.64,7031.899016,7031.899016,0.0,0.0,0.0,0.0,...,68.440643,82.09,8.334834e+07,9.5,12.897158,96.738630,1.597739e+04,0.94,141.55,9
233584,0.000000,54.00,6.00,64.00,4.410794,4.410794,0.0,1.0,0.0,1.0,...,0.000000,54.00,8.334818e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,9
233585,0.000000,0.00,1.00,64.00,22.793830,22.793830,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.314974e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,6


Loading ./resampled_datasets/None_8+1_resampled_dataset.parquet
Dataset Shape: (1389408, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.00,17.00,64.00,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,3
1,76.135781,428611.80,8.20,151.90,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.00,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.00,244.60,0
2,4.549627,108.00,6.00,64.00,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,2
3,0.000000,54.00,6.00,64.00,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,2
4,0.000000,0.00,1.00,64.00,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233582,0.000000,54.00,6.00,64.00,5.966641,5.966641,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.303339e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,2
233583,8.547875,20177.97,15.59,68.64,7031.899016,7031.899016,0.0,0.0,0.0,0.0,...,68.440643,82.09,8.334834e+07,9.5,12.897158,96.738630,1.597739e+04,0.94,141.55,2
233584,0.000000,54.00,6.00,64.00,4.410794,4.410794,0.0,1.0,0.0,1.0,...,0.000000,54.00,8.334818e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,2
233585,0.000000,0.00,1.00,64.00,22.793830,22.793830,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.314974e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,2


Loading ./resampled_datasets/None_1+1_resampled_dataset.parquet
Dataset Shape: (1389408, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.00,17.00,64.00,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,0
1,76.135781,428611.80,8.20,151.90,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.00,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.00,244.60,1
2,4.549627,108.00,6.00,64.00,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,0
3,0.000000,54.00,6.00,64.00,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,0
4,0.000000,0.00,1.00,64.00,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233582,0.000000,54.00,6.00,64.00,5.966641,5.966641,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.303339e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,0
233583,8.547875,20177.97,15.59,68.64,7031.899016,7031.899016,0.0,0.0,0.0,0.0,...,68.440643,82.09,8.334834e+07,9.5,12.897158,96.738630,1.597739e+04,0.94,141.55,0
233584,0.000000,54.00,6.00,64.00,4.410794,4.410794,0.0,1.0,0.0,1.0,...,0.000000,54.00,8.334818e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,0
233585,0.000000,0.00,1.00,64.00,22.793830,22.793830,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.314974e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,0


Loading ./resampled_datasets/RandomOverSampler_33+1_resampled_dataset.parquet
Dataset Shape: (7299188, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,21
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.0,244.60,1
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,12
3,0.000000,54.0,6.0,64.0,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,10
4,0.000000,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7299183,84.306294,8574.2,8.2,95.4,0.674955,0.674955,0.0,0.0,0.0,0.0,...,55.099916,130.4,8.730838e-02,5.5,17.749092,77.923048,3.446832e+03,0.9,38.50,33
7299184,35.046658,4131.6,10.0,95.3,1.018656,1.018656,0.0,0.0,0.0,0.0,...,59.975085,131.5,4.443579e-02,5.5,15.058692,84.817579,4.510625e+03,0.8,38.50,33
7299185,1300.182142,14839.5,7.1,130.0,1.225049,1.225049,0.0,0.0,0.0,0.0,...,46.416692,107.9,5.435891e-02,5.5,15.207270,65.643116,2.438648e+03,0.9,38.50,33
7299186,2.039416,151725.0,6.0,112.0,59.575648,59.575648,0.0,0.0,0.0,0.0,...,0.000000,1514.0,1.398087e-04,5.5,55.027266,0.000000,0.000000e+00,0.0,38.50,33


Loading ./resampled_datasets/RandomOverSampler_8+1_resampled_dataset.parquet
Dataset Shape: (8092224, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,3
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.0,244.60,0
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,2
3,0.000000,54.0,6.0,64.0,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,2
4,0.000000,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8092219,148.301989,9636.2,6.5,94.4,36.515951,36.515951,0.0,0.0,0.0,0.0,...,87.847986,136.5,1.141648e-02,5.5,15.133559,124.235814,1.380786e+04,0.9,38.50,7
8092220,48.978031,16974.5,6.0,104.1,39.539877,39.539877,0.0,0.0,0.0,0.0,...,696.852957,1080.3,5.858994e-03,5.5,30.261153,985.498902,1.033186e+06,0.9,38.50,7
8092221,106.930907,3287.4,11.5,86.7,77.855874,77.855874,0.0,0.0,0.0,0.0,...,39.320718,142.0,3.758659e-02,5.5,12.519245,55.607893,5.342274e+03,0.7,38.50,7
8092222,83.611826,15602.3,12.6,90.8,23.397903,23.397903,0.0,0.0,0.0,0.0,...,23.423697,85.5,1.341100e-02,5.5,11.559150,33.126109,8.093751e+02,0.7,38.50,7


Loading ./resampled_datasets/RandomOverSampler_1+1_resampled_dataset.parquet
Dataset Shape: (2713172, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,0
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.0,244.60,1
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,0
3,0.000000,54.0,6.0,64.0,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,0
4,0.000000,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713167,2.101208,528401.6,6.0,114.4,203.928830,203.928830,0.0,0.0,0.0,0.0,...,1971.614649,1803.6,8.219004e-04,5.5,65.253330,2788.284176,4.472826e+06,0.9,38.50,1
2713168,1.792944,329152.0,6.0,64.0,157.001031,157.001031,0.0,0.0,0.0,0.0,...,0.000000,1514.0,2.541804e-03,5.5,55.027266,0.000000,0.000000e+00,0.0,38.50,1
2713169,0.019382,158.4,12.1,76.2,11.138463,11.138463,0.0,0.0,0.0,0.0,...,76.039440,174.2,6.775784e-03,5.5,16.912792,107.536007,6.735486e+03,0.9,38.50,1
2713170,1.747535,2066445.4,6.0,87.6,770.763713,770.763713,0.0,0.0,0.0,0.0,...,1699.198468,1741.2,1.665202e+08,13.5,48.335946,2403.541519,2.889974e+06,1.0,244.60,1


Loading ./resampled_datasets/RandomUnderSampler_33+1_resampled_dataset.parquet
Dataset Shape: (1088, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
137607,1042.361327,25539.6,13.2,91.3,9.251794,9.251794,0.0,0.0,0.0,0.0,...,98.906652,161.7,1.676299e+08,13.5,18.444901,140.273788,1.043426e+04,1.0,244.6,0
247290,569.088140,8966.1,11.4,137.9,12.415494,12.415494,0.0,0.0,0.0,0.0,...,66.128475,139.8,1.676298e+08,13.5,15.062563,93.611788,4.396525e+03,1.0,244.6,0
358710,872.579318,10412.1,11.0,95.4,10.074352,10.074352,0.0,0.0,0.0,0.0,...,79.346773,196.6,1.676298e+08,13.5,16.175083,112.542588,6.590925e+03,1.0,244.6,0
380486,1617.801931,36366.0,10.9,169.9,0.147236,0.147236,0.0,0.0,0.0,0.0,...,1340.405867,154.3,1.676298e+08,13.5,32.274008,1899.548307,1.817328e+06,1.0,244.6,0
158793,0.052914,1609.5,6.0,117.0,75.729887,75.729887,0.0,0.0,0.0,0.0,...,455.708031,539.7,5.530000e-03,5.5,28.124546,644.468477,2.962976e+05,0.9,38.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97365,174.818179,19506.5,7.1,93.3,1.179003,1.179003,0.0,0.0,0.0,0.0,...,116.474911,170.0,1.676390e+08,13.5,18.240984,164.836651,1.359149e+04,1.0,244.6,33
91914,174.133769,17573.8,4.8,56.7,0.929341,0.929341,0.0,0.0,0.0,0.0,...,84.808606,95.8,1.676390e+08,13.5,15.293290,120.107793,7.235836e+03,1.0,244.6,33
219431,92.403640,5757.5,7.6,76.3,0.475867,0.475867,0.0,0.0,0.0,0.0,...,1349.642825,105.0,1.676390e+08,13.5,31.153231,1912.702207,1.842516e+06,1.0,244.6,33
118952,288.044928,8894.7,8.2,94.9,10.217197,10.217197,0.0,0.0,0.0,0.0,...,74.019008,135.0,1.676390e+08,13.5,14.588315,104.861988,5.682494e+03,1.0,244.6,33


Loading ./resampled_datasets/RandomUnderSampler_8+1_resampled_dataset.parquet
Dataset Shape: (3224, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
121041,12.380986,36685.7,6.0,155.1,29.421952,29.421952,0.0,0.0,0.0,0.0,...,503.283262,222.1,1.665190e+08,13.5,30.217510,712.728522,2.542565e+05,1.0,244.6,0
163567,83.608971,422328.0,9.4,116.1,23.917201,23.917201,0.0,0.0,0.0,0.0,...,40.729218,105.0,7.765198e-03,5.5,14.076475,57.599812,2.123052e+03,0.8,38.5,0
219685,30.757302,2371118.0,6.0,147.5,69.292917,69.292917,0.0,0.0,0.0,0.0,...,566.252115,784.0,3.314495e-03,5.5,32.266548,800.801420,4.009849e+05,0.8,38.5,0
343024,10.047043,246705.5,10.4,62.6,81.515117,81.515117,0.0,0.0,0.0,0.0,...,1143.080394,845.7,1.665209e+08,13.5,37.275200,1619.731002,1.315828e+06,1.0,244.6,0
28304,36.551427,906781.4,8.2,144.8,26.714701,26.714701,0.0,0.0,0.0,0.0,...,241.587886,884.8,1.665191e+08,13.5,17.160037,278.945403,9.007023e+04,1.0,244.6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78832,222.743738,5351.2,9.9,105.4,0.174401,0.174401,0.0,0.0,0.0,0.0,...,30.623911,109.3,9.750199e-03,5.5,13.014610,43.308750,1.316071e+03,0.9,38.5,7
259315,0.208264,73803.0,17.0,60.0,298.725247,298.725247,0.0,0.0,0.0,0.0,...,0.120000,1291.6,3.201008e-04,5.5,50.832273,0.169706,1.440000e-01,0.1,38.5,7
232354,1.431495,1953154.0,6.0,64.0,1232.626642,1232.626642,0.0,0.0,0.0,0.0,...,930.698343,1629.2,1.668459e+08,13.5,41.118187,1318.804147,8.707082e+05,1.0,244.6,7
402914,60.898023,652.3,9.3,60.0,68.652651,68.652651,0.0,0.0,0.0,0.0,...,77.476212,105.2,1.074259e-02,5.5,13.941468,109.567909,1.001257e+04,0.8,38.5,7


Loading ./resampled_datasets/RandomUnderSampler_1+1_resampled_dataset.parquet
Dataset Shape: (65644, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
328970,0.000822,10.25,46.59,65.73,18.056287,18.056287,0.0,0.0,0.0,0.0,...,14.259080,572.88,8.364655e+07,9.5,33.881879,20.204136,1.873641e+03,0.11,141.55,0
137464,0.000000,0.00,1.00,64.00,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.315019e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,0
109608,0.000000,54.00,6.00,64.00,41.365985,41.365985,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.303770e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,0
81765,0.000000,0.00,1.00,64.00,0.266647,0.266647,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.348320e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,0
221254,0.051820,13929.00,17.00,64.00,5410.204932,5410.204932,0.0,0.0,0.0,0.0,...,0.000000,50.00,8.310747e+07,9.5,10.000000,0.000000,0.000000e+00,0.00,141.55,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233110,22.200058,34473.50,13.70,98.80,46.190235,46.190235,0.0,0.0,0.0,0.0,...,163.796590,150.90,3.051305e-03,5.5,18.550145,231.643360,3.503273e+04,0.90,38.50,1
233116,38.119787,32584.90,12.60,82.70,7.859036,7.859036,0.0,0.0,0.0,0.0,...,48.033592,113.30,1.665200e+08,13.5,12.564913,68.144351,2.568164e+03,1.00,244.60,1
233386,34.191413,67355.40,6.00,166.10,26.202975,26.202975,0.0,0.0,0.0,0.0,...,357.930387,670.00,1.665256e+08,13.5,19.239078,508.534770,2.359663e+05,1.00,244.60,1
233491,26.651705,27319.90,6.00,113.20,49.856850,49.856850,0.0,0.0,0.0,0.0,...,523.426226,365.00,6.194711e-03,5.5,38.191663,740.236468,3.431003e+05,0.90,38.50,1


Loading ./resampled_datasets/SMOTE_33+1_resampled_dataset.parquet
Dataset Shape: (2547620, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.000000,162.000000,17.00,64.000000,17.544943,17.544943,0.0,0.0,0.0,0.0,...,0.000000,162.000000,8.300786e+07,9.5,18.000000,0.000000,0.000000,0.000000,141.55,21
1,0.000000,54.000000,6.00,64.000000,6.221817,6.221817,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334813e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,9
2,0.000000,54.000000,6.00,64.000000,8.199278,8.199278,0.0,0.0,0.0,0.0,...,0.000000,54.000000,8.295125e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,20
3,0.000000,54.000000,6.00,64.000000,19.043810,19.043810,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334509e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,9
4,0.000000,2.140000,1.15,65.270000,12.654857,12.654857,0.0,0.0,0.0,0.0,...,7.113096,43.900000,8.312817e+07,9.5,9.410572,10.072289,347.220876,0.150000,141.55,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2547615,144.381557,4154.671882,7.10,172.475841,81.562134,81.562134,0.0,0.0,0.0,0.0,...,238.492874,496.078220,1.177138e-02,5.5,20.114113,337.279857,122742.988115,0.810914,38.50,33
2547616,121.376917,10722.088051,7.10,85.046520,6.103580,6.103580,0.0,0.0,0.0,0.0,...,74.160889,179.612820,1.676392e+08,13.5,17.366595,105.085507,5682.486711,1.000000,244.60,33
2547617,452.847589,16162.306374,7.10,72.967922,0.506335,0.506335,0.0,0.0,0.0,0.0,...,29.806732,87.357461,3.383168e-02,5.5,13.805576,42.153085,1126.375933,0.900000,38.50,33
2547618,468.029552,5097.176609,7.10,96.647996,10.342081,10.342081,0.0,0.0,0.0,0.0,...,238.675340,162.358399,1.676390e+08,13.5,21.099657,337.960206,67574.513301,1.000000,244.60,33


Loading ./resampled_datasets/SMOTE_8+1_resampled_dataset.parquet
Dataset Shape: (2822584, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.000000,162.000000,17.00,64.000000,17.544943,17.544943,0.0,0.0,0.0,0.0,...,0.000000,162.000000,8.300786e+07,9.5,18.000000,0.000000,0.000000,0.000000,141.55,3
1,0.000000,54.000000,6.00,64.000000,6.221817,6.221817,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334813e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,2
2,0.000000,54.000000,6.00,64.000000,8.199278,8.199278,0.0,0.0,0.0,0.0,...,0.000000,54.000000,8.295125e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,3
3,0.000000,54.000000,6.00,64.000000,19.043810,19.043810,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334509e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,2
4,0.000000,2.140000,1.15,65.270000,12.654857,12.654857,0.0,0.0,0.0,0.0,...,7.113096,43.900000,8.312817e+07,9.5,9.410572,10.072289,347.220876,0.150000,141.55,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2822579,210.817806,9709.228908,7.10,90.642940,10.750663,10.750663,0.0,0.0,0.0,0.0,...,109.222702,93.759338,1.664818e+08,13.5,15.920673,154.787232,12017.931683,1.000000,244.60,7
2822580,472.668640,1903.378342,11.50,97.094786,3.643343,3.643343,0.0,0.0,0.0,0.0,...,34.450865,92.168373,2.211139e-02,5.5,14.593947,48.720880,1487.290895,0.826775,38.50,7
2822581,191.787015,7664.390328,8.90,128.785764,3.401687,3.401687,0.0,0.0,0.0,0.0,...,46.707797,106.635597,1.676390e+08,13.5,13.769177,66.067947,2219.660760,1.000000,244.60,7
2822582,251.312256,4961.100786,10.40,139.602696,4.774916,4.774916,0.0,0.0,0.0,0.0,...,68.041279,156.917794,2.086204e-02,5.5,17.183800,96.224900,5311.614688,0.900000,38.50,7


Loading ./resampled_datasets/SMOTE_1+1_resampled_dataset.parquet
Dataset Shape: (946876, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.000000,1.620000e+02,17.00,64.000000,17.544943,17.544943,0.0,0.0,0.0,0.0,...,0.000000,162.000000,8.300786e+07,9.5,18.000000,0.000000,0.000000,0.000000,141.55,0
1,0.000000,5.400000e+01,6.00,64.000000,6.221817,6.221817,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334813e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,0
2,0.000000,5.400000e+01,6.00,64.000000,8.199278,8.199278,0.0,0.0,0.0,0.0,...,0.000000,54.000000,8.295125e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,0
3,0.000000,5.400000e+01,6.00,64.000000,19.043810,19.043810,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334509e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,0
4,0.000000,2.140000e+00,1.15,65.270000,12.654857,12.654857,0.0,0.0,0.0,0.0,...,7.113096,43.900000,8.312817e+07,9.5,9.410572,10.072289,347.220876,0.150000,141.55,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
946871,51.531976,1.915145e+05,8.20,96.064442,43.179130,43.179130,0.0,0.0,0.0,0.0,...,259.750409,96.102253,1.665221e+08,13.5,17.225191,367.663400,72090.662250,1.000000,244.60,1
946872,68.196588,2.532512e+05,6.00,116.869038,1294.224027,1294.224027,0.0,0.0,0.0,0.0,...,943.202151,409.013001,1.665196e+08,13.5,40.139441,1335.930129,893831.982553,1.000000,244.60,1
946873,24.532109,1.612908e+06,6.00,233.533313,61.402976,61.402976,0.0,0.0,0.0,0.0,...,3.408458,77.361526,1.173764e-02,5.5,11.583017,4.820287,151.335533,0.076767,38.50,1
946874,42.055308,1.182109e+04,9.90,125.683530,28.084851,28.084851,0.0,0.0,0.0,0.0,...,168.619284,406.333699,1.665177e+08,13.5,17.350575,163.704508,13413.802681,1.000000,244.60,1


Loading ./resampled_datasets/ClusterCentroids_33+1_resampled_dataset.parquet
Dataset Shape: (1088, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,112.820531,2831.487500,11.118750,96.306250,23.680621,23.680621,0.0,0.0,0.0,0.0,...,45.542654,134.212500,2.385680e-02,5.5,15.369907,64.407038,3.311644e+03,0.856250,38.5,0
1,423.524072,7328.771429,8.357143,130.657143,76.308018,76.308018,0.0,0.0,0.0,0.0,...,119.951239,141.385714,1.676298e+08,13.5,16.618607,169.937605,1.560634e+04,1.000000,244.6,0
2,373.984638,83291.400000,8.200000,65.300000,65.517287,65.517287,0.0,0.0,0.0,0.0,...,3544.017932,3263.400000,4.300907e-03,5.5,100.956805,5011.998224,1.404716e+07,0.900000,38.5,0
3,4.749069,119489.200000,6.000000,64.000000,92.875204,92.875204,0.0,0.0,0.0,0.0,...,2087.442576,2350.800000,1.676297e+08,13.5,54.080281,2958.907948,4.402649e+06,1.000000,244.6,0
4,1.108356,95326.200000,6.000000,70.000000,62.912492,62.912492,0.0,0.0,0.0,0.0,...,2645.413247,351.600000,1.676298e+08,13.5,50.244107,3748.404394,7.065778e+06,1.000000,244.6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1083,0.471143,81146.200000,6.000000,226.000000,175.097529,175.097529,0.0,0.0,0.0,0.0,...,489.434923,1351.200000,1.676391e+08,13.5,51.043853,681.978236,2.339504e+05,1.000000,244.6,33
1084,0.964338,77485.000000,6.000000,52.800000,74.126744,74.126744,0.0,0.0,0.0,0.0,...,839.028208,637.200000,1.676390e+08,13.5,51.809400,1185.684037,7.063765e+05,1.000000,244.6,33
1085,1130.109987,69645.260000,7.540000,88.020000,17.557112,17.557112,0.0,0.0,0.0,0.0,...,72.770909,154.240000,1.676391e+08,13.5,15.379787,102.916058,6.505454e+03,0.960000,244.6,33
1086,835.037838,65703.500000,8.200000,123.300000,12.169314,12.169314,0.0,0.0,0.0,0.0,...,375.776075,118.700000,1.676391e+08,13.5,23.465460,532.431031,1.425352e+05,1.000000,244.6,33


Loading ./resampled_datasets/ClusterCentroids_8+1_resampled_dataset.parquet
Dataset Shape: (3224, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,43.815188,2.718021e+06,6.547619,114.886905,142.163258,142.163258,0.0,0.0,0.0,1.355253e-20,...,658.614308,984.957143,1.665217e+08,13.5,38.301460,927.845058,4.631108e+05,0.997619,244.6,0
1,29.787825,8.510525e+05,7.081366,155.334161,573.360184,573.360184,0.0,0.0,0.0,-2.032879e-20,...,64.438030,274.290683,7.380798e-03,5.5,18.080104,91.129136,2.067982e+04,0.559938,38.5,0
2,22.785047,6.673545e+05,7.028378,113.526351,103.422655,103.422655,0.0,0.0,0.0,-6.776264e-21,...,481.688400,707.765541,1.665218e+08,13.5,29.922482,679.533278,2.757185e+05,0.972973,244.6,0
3,29.623289,5.007870e+05,6.958333,112.175000,159.085461,159.085461,0.0,0.0,0.0,0.000000e+00,...,2978.115790,1196.250000,1.665221e+08,13.5,60.897104,4218.946112,9.092597e+06,1.000000,244.6,0
4,8.284695,2.468985e+05,8.200000,202.400000,30.080409,30.080409,0.0,0.0,0.0,0.000000e+00,...,4155.947420,1312.600000,3.216478e-02,5.5,66.276102,5877.397206,1.974094e+07,0.900000,38.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3219,674.457357,7.849950e+03,11.150000,72.200000,11.176992,11.176992,0.0,0.0,0.0,0.000000e+00,...,111.798171,218.000000,1.818210e-02,5.5,21.944259,158.106489,1.401305e+04,0.900000,38.5,7
3220,23.529895,1.009150e+03,6.800000,80.500000,120.401771,120.401771,0.0,0.0,0.0,0.000000e+00,...,61.602241,111.150000,1.676298e+08,13.5,15.036101,87.214399,3.806562e+03,1.000000,244.6,7
3221,872.579318,1.041210e+04,11.000000,95.400000,10.074352,10.074352,0.0,0.0,0.0,0.000000e+00,...,79.346773,196.600000,1.676298e+08,13.5,16.175083,112.542588,6.590925e+03,1.000000,244.6,7
3222,40.305533,9.581750e+02,9.725000,105.200000,37.461488,37.461488,0.0,0.0,0.0,0.000000e+00,...,55.779626,127.825000,1.295140e-02,5.5,15.342622,78.884304,3.951636e+03,0.800000,38.5,7


Loading ./resampled_datasets/ClusterCentroids_1+1_resampled_dataset.parquet
Dataset Shape: (65644, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,9.248562e-02,29377.990170,16.940114,64.375852,8513.369067,8513.369067,-4.235165e-22,-1.387779e-17,5.551115e-17,-4.163336e-17,...,0.301614,50.203295,8.310671e+07,9.500000,10.012404,0.426270,8.672647e+00,0.028068,141.550000,0
1,1.183356e+01,973420.400000,17.000000,38.000000,81.536986,81.536986,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,255.675388,648.500000,7.186234e-04,5.500000,22.304492,361.579601,1.321211e+05,0.500000,38.500000,0
2,6.453561e+00,877661.200000,17.000000,57.700000,167.154275,167.154275,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,17.249820,1140.600000,1.668613e+08,13.500000,49.072889,5.634970,1.599328e+01,1.000000,244.600000,0
3,6.198883e-08,565.580000,6.770000,65.680000,19358.326154,19358.326154,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,488.856927,698.350000,1.000034e+08,9.500000,36.869890,691.428106,2.721642e+05,0.890000,141.550000,0
4,1.068042e-01,116406.603774,16.094340,64.000000,728.055153,728.055153,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,540.073609,900.358491,6.291813e+07,8.886792,43.460264,763.905060,3.093446e+05,0.943396,122.283019,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65639,2.220006e+01,34473.500000,13.700000,98.800000,46.190235,46.190235,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,163.796590,150.900000,3.051305e-03,5.500000,18.550145,231.643360,3.503273e+04,0.900000,38.500000,1
65640,3.811979e+01,32584.900000,12.600000,82.700000,7.859036,7.859036,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,48.033592,113.300000,1.665200e+08,13.500000,12.564913,68.144351,2.568164e+03,1.000000,244.600000,1
65641,3.419141e+01,67355.400000,6.000000,166.100000,26.202975,26.202975,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,357.930387,670.000000,1.665256e+08,13.500000,19.239078,508.534770,2.359663e+05,1.000000,244.600000,1
65642,2.665170e+01,27319.900000,6.000000,113.200000,49.856850,49.856850,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,523.426226,365.000000,6.194711e-03,5.500000,38.191663,740.236468,3.431003e+05,0.900000,38.500000,1


# Preprocessing
## Scaling Numerical Features

In [4]:
num_cols = [
    'flow_duration', 'Header_Length',  'Duration', 'Rate', 'Srate', 'ack_count', 'syn_count', 'fin_count',
    'urg_count', 'rst_count', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight'
]

scalers = {}
print('Scaled Datasets:')
for sampler in sampling_methods:
    scalers[sampler] = StandardScaler()

    for label_class in label_classes:
        datasets[sampler][label_class][num_cols] = scalers[sampler].fit_transform(
            datasets[sampler][label_class][num_cols]
        )
    
        print(f'Sampler: {sampler} / Label Class: {label_class}')
        display(datasets[sampler][label_classes[0]].head(3))

Scaled Datasets:
Sampler: None / Label Class: 33+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.003675,-0.166147,17.0,-0.167976,-0.090509,-0.090509,0.0,0.0,0.0,0.0,...,-0.208794,-0.343574,-0.011406,0.000835,-0.459184,-0.208627,-0.095484,-0.414344,0.000754,21
1,0.28153,0.760145,8.2,6.105171,-0.090092,-0.090092,0.0,0.0,0.0,0.0,...,10.488495,2.198032,4.882094,4.877978,3.28135,10.509058,9.13432,3.870082,4.88556,1
2,-0.003979,-0.166095,6.0,-0.167976,-0.090505,-0.090505,0.0,0.0,1.0,0.0,...,-0.208794,-0.29382,0.009369,0.000835,-0.317097,-0.208627,-0.095484,-0.414344,0.000754,12


Sampler: None / Label Class: 8+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.003675,-0.166147,17.0,-0.167976,-0.090509,-0.090509,0.0,0.0,0.0,0.0,...,-0.208794,-0.343574,-0.011406,0.000835,-0.459184,-0.208627,-0.095484,-0.414344,0.000754,21
1,0.28153,0.760145,8.2,6.105171,-0.090092,-0.090092,0.0,0.0,0.0,0.0,...,10.488495,2.198032,4.882094,4.877978,3.28135,10.509058,9.13432,3.870082,4.88556,1
2,-0.003979,-0.166095,6.0,-0.167976,-0.090505,-0.090505,0.0,0.0,1.0,0.0,...,-0.208794,-0.29382,0.009369,0.000835,-0.317097,-0.208627,-0.095484,-0.414344,0.000754,12


Sampler: None / Label Class: 1+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.003675,-0.166147,17.0,-0.167976,-0.090509,-0.090509,0.0,0.0,0.0,0.0,...,-0.208794,-0.343574,-0.011406,0.000835,-0.459184,-0.208627,-0.095484,-0.414344,0.000754,21
1,0.28153,0.760145,8.2,6.105171,-0.090092,-0.090092,0.0,0.0,0.0,0.0,...,10.488495,2.198032,4.882094,4.877978,3.28135,10.509058,9.13432,3.870082,4.88556,1
2,-0.003979,-0.166095,6.0,-0.167976,-0.090505,-0.090505,0.0,0.0,1.0,0.0,...,-0.208794,-0.29382,0.009369,0.000835,-0.317097,-0.208627,-0.095484,-0.414344,0.000754,12


Sampler: RandomOverSampler / Label Class: 33+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.115057,-0.280154,17.0,-0.4881,-0.077827,-0.077827,0.0,0.0,0.0,0.0,...,-0.584925,-0.671628,-0.009609,-0.002617,-0.900248,-0.584342,-0.244405,-1.259627,-0.002638,21
1,-0.041793,0.272014,8.2,2.249626,-0.077221,-0.077221,0.0,0.0,0.0,0.0,...,4.821669,0.737551,1.548932,1.554704,1.440983,4.831889,4.379384,1.02988,1.5551,1
2,-0.115135,-0.280123,6.0,-0.4881,-0.077822,-0.077822,0.0,0.0,1.0,0.0,...,-0.584925,-0.644042,-0.002992,-0.002617,-0.811314,-0.584342,-0.244405,-1.259627,-0.002638,12


Sampler: RandomOverSampler / Label Class: 8+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.115057,-0.280154,17.0,-0.4881,-0.077827,-0.077827,0.0,0.0,0.0,0.0,...,-0.584925,-0.671628,-0.009609,-0.002617,-0.900248,-0.584342,-0.244405,-1.259627,-0.002638,21
1,-0.041793,0.272014,8.2,2.249626,-0.077221,-0.077221,0.0,0.0,0.0,0.0,...,4.821669,0.737551,1.548932,1.554704,1.440983,4.831889,4.379384,1.02988,1.5551,1
2,-0.115135,-0.280123,6.0,-0.4881,-0.077822,-0.077822,0.0,0.0,1.0,0.0,...,-0.584925,-0.644042,-0.002992,-0.002617,-0.811314,-0.584342,-0.244405,-1.259627,-0.002638,12


Sampler: RandomOverSampler / Label Class: 1+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.115057,-0.280154,17.0,-0.4881,-0.077827,-0.077827,0.0,0.0,0.0,0.0,...,-0.584925,-0.671628,-0.009609,-0.002617,-0.900248,-0.584342,-0.244405,-1.259627,-0.002638,21
1,-0.041793,0.272014,8.2,2.249626,-0.077221,-0.077221,0.0,0.0,0.0,0.0,...,4.821669,0.737551,1.548932,1.554704,1.440983,4.831889,4.379384,1.02988,1.5551,1
2,-0.115135,-0.280123,6.0,-0.4881,-0.077822,-0.077822,0.0,0.0,1.0,0.0,...,-0.584925,-0.644042,-0.002992,-0.002617,-0.811314,-0.584342,-0.244405,-1.259627,-0.002638,12


Sampler: RandomUnderSampler / Label Class: 33+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
137607,3.222377,-0.267777,13.2,0.36347,-0.117866,-0.117866,0.0,0.0,0.0,0.0,...,-0.274441,-0.413,1.545597,1.524737,-0.247211,-0.273294,-0.204346,1.018886,1.528815,0
247290,1.622551,-0.288803,11.4,1.803861,-0.11772,-0.11772,0.0,0.0,0.0,0.0,...,-0.358384,-0.461181,1.545596,1.524737,-0.484394,-0.357717,-0.210258,1.018886,1.528815,0
358710,2.648455,-0.286968,11.0,0.4902,-0.117828,-0.117828,0.0,0.0,0.0,0.0,...,-0.324533,-0.33622,1.545596,1.524737,-0.40638,-0.323467,-0.208109,1.018886,1.528815,0


Sampler: RandomUnderSampler / Label Class: 8+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
137607,3.222377,-0.267777,13.2,0.36347,-0.117866,-0.117866,0.0,0.0,0.0,0.0,...,-0.274441,-0.413,1.545597,1.524737,-0.247211,-0.273294,-0.204346,1.018886,1.528815,0
247290,1.622551,-0.288803,11.4,1.803861,-0.11772,-0.11772,0.0,0.0,0.0,0.0,...,-0.358384,-0.461181,1.545596,1.524737,-0.484394,-0.357717,-0.210258,1.018886,1.528815,0
358710,2.648455,-0.286968,11.0,0.4902,-0.117828,-0.117828,0.0,0.0,0.0,0.0,...,-0.324533,-0.33622,1.545596,1.524737,-0.40638,-0.323467,-0.208109,1.018886,1.528815,0


Sampler: RandomUnderSampler / Label Class: 1+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
137607,3.222377,-0.267777,13.2,0.36347,-0.117866,-0.117866,0.0,0.0,0.0,0.0,...,-0.274441,-0.413,1.545597,1.524737,-0.247211,-0.273294,-0.204346,1.018886,1.528815,0
247290,1.622551,-0.288803,11.4,1.803861,-0.11772,-0.11772,0.0,0.0,0.0,0.0,...,-0.358384,-0.461181,1.545596,1.524737,-0.484394,-0.357717,-0.210258,1.018886,1.528815,0
358710,2.648455,-0.286968,11.0,0.4902,-0.117828,-0.117828,0.0,0.0,0.0,0.0,...,-0.324533,-0.33622,1.545596,1.524737,-0.40638,-0.323467,-0.208109,1.018886,1.528815,0


Sampler: SMOTE / Label Class: 33+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.175623,-0.278562,17.0,-0.557617,-0.073585,-0.073585,0.0,0.0,0.0,0.0,...,-0.604151,-0.416428,-0.038296,-0.031136,-0.265569,-0.603796,-0.267222,-1.278147,-0.03119,21
1,-0.175623,-0.278707,6.0,-0.557617,-0.073753,-0.073753,0.0,1.0,0.0,1.0,...,-0.604151,-0.677988,-0.031859,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,9
2,-0.175623,-0.278707,6.0,-0.557617,-0.073724,-0.073724,0.0,0.0,0.0,0.0,...,-0.604151,-0.677988,-0.039367,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,20


Sampler: SMOTE / Label Class: 8+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.175623,-0.278562,17.0,-0.557617,-0.073585,-0.073585,0.0,0.0,0.0,0.0,...,-0.604151,-0.416428,-0.038296,-0.031136,-0.265569,-0.603796,-0.267222,-1.278147,-0.03119,21
1,-0.175623,-0.278707,6.0,-0.557617,-0.073753,-0.073753,0.0,1.0,0.0,1.0,...,-0.604151,-0.677988,-0.031859,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,9
2,-0.175623,-0.278707,6.0,-0.557617,-0.073724,-0.073724,0.0,0.0,0.0,0.0,...,-0.604151,-0.677988,-0.039367,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,20


Sampler: SMOTE / Label Class: 1+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.175623,-0.278562,17.0,-0.557617,-0.073585,-0.073585,0.0,0.0,0.0,0.0,...,-0.604151,-0.416428,-0.038296,-0.031136,-0.265569,-0.603796,-0.267222,-1.278147,-0.03119,21
1,-0.175623,-0.278707,6.0,-0.557617,-0.073753,-0.073753,0.0,1.0,0.0,1.0,...,-0.604151,-0.677988,-0.031859,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,9
2,-0.175623,-0.278707,6.0,-0.557617,-0.073724,-0.073724,0.0,0.0,0.0,0.0,...,-0.604151,-0.677988,-0.039367,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,20


Sampler: ClusterCentroids / Label Class: 33+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.158755,-0.460666,11.11875,0.380183,-0.231754,-0.231754,0.0,0.0,0.0,0.0,...,-0.588789,-0.797205,-1.461175,-1.502993,-0.863778,-0.588342,-0.231288,0.637615,-1.503092,0
1,1.524459,-0.457775,8.357143,1.501086,-0.231678,-0.231678,0.0,0.0,0.0,0.0,...,-0.490582,-0.785343,1.553021,1.532797,-0.790863,-0.48991,-0.228472,0.995545,1.540535,0
2,1.306708,-0.408941,8.2,-0.631581,-0.231694,-0.231694,0.0,0.0,0.0,0.0,...,4.028637,4.377559,-1.461175,-1.502993,4.133899,4.026441,2.985967,0.74655,-1.503092,0


Sampler: ClusterCentroids / Label Class: 8+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.158755,-0.460666,11.11875,0.380183,-0.231754,-0.231754,0.0,0.0,0.0,0.0,...,-0.588789,-0.797205,-1.461175,-1.502993,-0.863778,-0.588342,-0.231288,0.637615,-1.503092,0
1,1.524459,-0.457775,8.357143,1.501086,-0.231678,-0.231678,0.0,0.0,0.0,0.0,...,-0.490582,-0.785343,1.553021,1.532797,-0.790863,-0.48991,-0.228472,0.995545,1.540535,0
2,1.306708,-0.408941,8.2,-0.631581,-0.231694,-0.231694,0.0,0.0,0.0,0.0,...,4.028637,4.377559,-1.461175,-1.502993,4.133899,4.026441,2.985967,0.74655,-1.503092,0


Sampler: ClusterCentroids / Label Class: 1+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.158755,-0.460666,11.11875,0.380183,-0.231754,-0.231754,0.0,0.0,0.0,0.0,...,-0.588789,-0.797205,-1.461175,-1.502993,-0.863778,-0.588342,-0.231288,0.637615,-1.503092,0
1,1.524459,-0.457775,8.357143,1.501086,-0.231678,-0.231678,0.0,0.0,0.0,0.0,...,-0.490582,-0.785343,1.553021,1.532797,-0.790863,-0.48991,-0.228472,0.995545,1.540535,0
2,1.306708,-0.408941,8.2,-0.631581,-0.231694,-0.231694,0.0,0.0,0.0,0.0,...,4.028637,4.377559,-1.461175,-1.502993,4.133899,4.026441,2.985967,0.74655,-1.503092,0


## X, y Train/Test Splitting

In [5]:
# Not storing test datasets in memory, will save as parquet and later load for testing.
X_train = {}
y_train = {}

print('Train/Test Shapes:')

for sampler in sampling_methods:
    X_train[sampler] = {}
    y_train[sampler] = {}
    
    for label_class in label_classes:
        X = datasets[sampler][label_class].drop('label', axis=1)
        y = datasets[sampler][label_class]['label']
        
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
            X, y, test_size=0.2, random_state=train_test_seed
        )
        
        print(f'\nSampler: {sampler} / Label Class: {label_class}')
        print(f'X_train: {X_train_temp.shape}, y_train: {y_train_temp.shape}, X_test: {X_test_temp.shape}, y_test: {y_test_temp.shape}')
        
        X_train[sampler][label_class] = X_train_temp
        y_train[sampler][label_class] = y_train_temp
        
        # Save test dataset
        pd.concat([X_test_temp, y_test_temp], axis=1).to_parquet(
            path=f'./test_datasets/{sampler}_{label_class}_test_dataset.parquet'
        )

# Whole datasets no longer used. Free memory.
del datasets

Train/Test Shapes:

Sampler: None / Label Class: 33+1
X_train: (1111526, 46), y_train: (1111526,), X_test: (277882, 46), y_test: (277882,)

Sampler: None / Label Class: 8+1
X_train: (1111526, 46), y_train: (1111526,), X_test: (277882, 46), y_test: (277882,)

Sampler: None / Label Class: 1+1
X_train: (1111526, 46), y_train: (1111526,), X_test: (277882, 46), y_test: (277882,)

Sampler: RandomOverSampler / Label Class: 33+1
X_train: (5839350, 46), y_train: (5839350,), X_test: (1459838, 46), y_test: (1459838,)

Sampler: RandomOverSampler / Label Class: 8+1
X_train: (6473779, 46), y_train: (6473779,), X_test: (1618445, 46), y_test: (1618445,)

Sampler: RandomOverSampler / Label Class: 1+1
X_train: (2170537, 46), y_train: (2170537,), X_test: (542635, 46), y_test: (542635,)

Sampler: RandomUnderSampler / Label Class: 33+1
X_train: (870, 46), y_train: (870,), X_test: (218, 46), y_test: (218,)

Sampler: RandomUnderSampler / Label Class: 8+1
X_train: (2579, 46), y_train: (2579,), X_test: (645, 4

# Evaluators

In [6]:
# Get all evaluators
evaluators = []
for evaluator_type in evaluator_types:
    for sampler in sampling_methods:
        for label_class in label_classes:
            evaluators.append((evaluator_type, sampler, label_class))

# Get list of pretrained evaluators
pretrained_evaluators = [filename for filename in os.listdir(evaluator_directory) if filename.endswith('.joblib')]

for i, evaluator in enumerate(pretrained_evaluators):
    pretrained_evaluators[i] = tuple(evaluator[:-7].split('_'))

# Get list of untrained evaluators
untrained_evaluators = [evaluator for evaluator in evaluators if evaluator not in pretrained_evaluators]

print(f'All evaluators: {len(evaluators)}')
print(f'Pretrained evaluators: {len(pretrained_evaluators)}')
print(f'Untrained evaluators: {len(untrained_evaluators)}')

All evaluators: 15
Pretrained evaluators: 105
Untrained evaluators: 0


## Training

In [7]:
# Only train evaluators not found in ./trained_evaluators
for i, (evaluator_type, sampler, label_class) in enumerate(untrained_evaluators):
    print(f'{i+1}/{len(untrained_evaluators)} : {evaluator_type}, {sampler}, {label_class}')
    match evaluator_type:
        case 'XGBoost':
            from xgboost import XGBClassifier
            evaluator_model = XGBClassifier()
            
        case 'LogisticRegression':
            from sklearn.linear_model import LogisticRegression
            evaluator_model = LogisticRegression(random_state=evaluator_seed, n_jobs=-1)
            
        case 'Perceptron':
            from sklearn.linear_model import Perceptron
            evaluator_model = Perceptron(random_state=evaluator_seed, n_jobs=-1)
            
        case 'AdaBoost':
            from sklearn.ensemble import AdaBoostClassifier
            evaluator_model = AdaBoostClassifier(random_state=evaluator_seed, algorithm='SAMME')
            
        case 'RandomForest':
            from sklearn.ensemble import RandomForestClassifier
            evaluator_model = RandomForestClassifier(random_state=evaluator_seed, n_jobs=-1)
            
        case 'DeepNeuralNetwork':
            from sklearn.neural_network import MLPClassifier
            evaluator_model = MLPClassifier(random_state=evaluator_seed)
            
        case 'KNearestNeighbor':
            from sklearn.neighbors import KNeighborsClassifier
            evaluator_model = KNeighborsClassifier(n_jobs=-1)
            
        case _:
            print(f'Invalid evaluator model: {evaluator_type}')
    
    
    if evaluator_type == 'XGBoost' and label_class == '1+1':
        evaluator_model = XGBClassifier(objective='binary:logistic')
            
    print(f'{time.time()} : Training {evaluator_type} on {sampler} balanced data with {label_class} label classes')
    evaluator_model.fit(X_train[sampler][label_class], y_train[sampler][label_class])
    print(f'{time.time()} : Training completed.\n')
    
    # Saving evaluator to file
    dump(evaluator_model, filename=f'./trained_evaluators/{evaluator_type}_{sampler}_{label_class}.joblib')

In [8]:
# Free training memory
try:
    del evaluator_model
except NameError:
    print('No evaluators to delete')
del X_train
del y_train

No evaluators to delete


## Testing

In [None]:
# Load previous metrics dataframe
try: 
    df_metrics = pd.read_json(path_or_buf=metrics_directory+'/sampling_metrics.json', orient='index')
    
except FileNotFoundError:
    df_metrics = pd.DataFrame(columns=['Sampler', 'Label Classes', 'Evaluator', 'Test Duration', 'Accuracy', 'Precision', 'Recall', 'F1'])

# Run prediction
for (evaluator_type, sampler, label_class) in evaluators:
    # Load evaluator
    try:
        evaluator = load(filename=f'./trained_evaluators/{evaluator_type}_{sampler}_{label_class}.joblib')
    
    except FileNotFoundError:
        print(f'EVALUATOR NOT FOUND: ./trained_evaluators/{evaluator_type}_{sampler}_{label_class}.joblib\n')
    
    else:
        # Load testing dataset
        df_test = pd.read_parquet(path=f'./test_datasets/{sampler}_{label_class}_test_dataset.parquet')
        
        X_test = df_test.drop('label', axis=1)
        y_test = df_test['label']
            
        print(f'{time.time()} : Predicting {evaluator_type} on {sampler} sampled data with {label_class} classes')
        
        start_time = time.time()
        y_pred = evaluator.predict(X_test)
        test_duration = time.time() - start_time
        
        # Gather metrics  
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0.0)
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        
        evaluator_metrics = [sampler, label_class, evaluator_type, test_duration, accuracy, precision, recall, f1]
                
        # Add metrics to dataframe and display
        update_row = df_metrics.loc[(df_metrics['Sampler'] == sampler) &
                                    (df_metrics['Label Classes'] == label_class) &
                                    (df_metrics['Evaluator'] == evaluator_type)]
        
        if update_row.empty:    
            # No previous record
            df_metrics.loc[len(df_metrics.index)] = evaluator_metrics
            
            print(f'{evaluator_type} / {sampler} / {label_class} Metrics')
            display(df_metrics.loc[len(df_metrics.index)-1])
        
        else:   
            # Previous record exists
            update_row = evaluator_metrics
        
            print(f'{evaluator_type} / {sampler} / {label_class} Metrics')
            display(df_metrics.loc[(df_metrics['Sampler'] == sampler) &
                                    (df_metrics['Label Classes'] == label_class) &
                                    (df_metrics['Evaluator'] == evaluator_type)])

1715522431.3831902 : Predicting KNearestNeighbor on 33+1 classes
New Record
KNearestNeighbor / None / 33+1 Metrics


Sampler                      None
Label Classes                33+1
Evaluator        KNearestNeighbor
Test Duration          178.839612
Accuracy                 0.946546
Precision                0.656483
Recall                   0.623892
F1                       0.631163
Name: 90, dtype: object

1715522610.8434405 : Predicting KNearestNeighbor on 8+1 classes
New Record
KNearestNeighbor / None / 8+1 Metrics


Sampler                      None
Label Classes                 8+1
Evaluator        KNearestNeighbor
Test Duration          177.430614
Accuracy                 0.954509
Precision                0.694458
Recall                   0.622671
F1                       0.638203
Name: 91, dtype: object

1715522788.7708628 : Predicting KNearestNeighbor on 1+1 classes
New Record
KNearestNeighbor / None / 1+1 Metrics


Sampler                      None
Label Classes                 1+1
Evaluator        KNearestNeighbor
Test Duration          177.506453
Accuracy                 0.991896
Precision                0.897603
Recall                   0.940536
F1                       0.917904
Name: 92, dtype: object

1715522967.8579066 : Predicting KNearestNeighbor on 33+1 classes


## Save Metrics to File

In [None]:
df_metrics['Sampler'] = pd.Categorical(df_metrics['Sampler'], categories=sampler_categories)
df_metrics['Label Classes'] = pd.Categorical(df_metrics['Label Classes'], categories=label_categories)
df_metrics['Evaluator'] = pd.Categorical(df_metrics['Evaluator'], categories=evaluator_categories)

df_metrics.sort_values(['Sampler', 'Label Classes', 'Evaluator'], inplace=True)

df_metrics.to_json(path_or_buf=metrics_directory+'/sampling_metrics.json', orient='index')

display(df_metrics)