# Setup
## Parameters

In [1]:
label_classes = ['33+1', '8+1', '1+1']  # Classes: '33+1', '8+1', '1+1'

sampling_methods = ['RandomOverSampler', 'SMOTE']   # Samplers: 'None', 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', 'ClusterCentroids'

evaluator_types = ['KNearestNeighbor']   # Evaluators: 'XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost', 'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor'

# Random Seeds
train_test_seed = 42
evaluator_seed = 42

# Import/Export Directories
metrics_directory = './metrics'
resampled_dataset_directory = './resampled_datasets'
evaluator_directory = './trained_evaluators'

# Notebook parameter validation

sampler_categories = ['None', 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', 'ClusterCentroids']
label_categories = ['33+1', '8+1', '1+1']
evaluator_categories = ['XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost', 'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor']

for label_class in label_classes:
    if label_class not in label_categories:
        assert False, f'{label_class} is an invalid class structure.'

for sampler in sampling_methods:
    if sampler not in sampler_categories:
        assert False, f'{sampler} is an invalid sampling method.'
    
for evaluator in evaluator_types:
    if evaluator not in evaluator_categories:
        assert False, f'{evaluator} is an invalid evaluator.'

## Common Packages

In [2]:
import os
import pandas as pd
import time
import psutil
import GPUtil
from datetime import datetime 
from IPython.display import display
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Importing the Dataset

In [3]:
all_filepaths = [filename for filename in os.listdir(resampled_dataset_directory) if filename.endswith('.parquet')]
print(f'CSVs in "{resampled_dataset_directory}": {len(all_filepaths)}\n')

datasets = {}

for sampler in sampling_methods:
    datasets[sampler] = {}
    for label_class in label_classes:
        print(f'Loading ./resampled_datasets/{sampler}_{label_class}_resampled_dataset.parquet')
        datasets[sampler][label_class] = pd.read_parquet(
            path=f'./resampled_datasets/{sampler}_{label_class}_resampled_dataset.parquet'
        )
        print(f'Dataset Shape: {datasets[sampler][label_class].shape}')
        display(datasets[sampler][label_class])

CSVs in "./resampled_datasets": 15

Loading ./resampled_datasets/RandomOverSampler_33+1_resampled_dataset.parquet
Dataset Shape: (7299188, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,21
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.0,244.60,1
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,12
3,0.000000,54.0,6.0,64.0,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,10
4,0.000000,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7299183,84.306294,8574.2,8.2,95.4,0.674955,0.674955,0.0,0.0,0.0,0.0,...,55.099916,130.4,8.730838e-02,5.5,17.749092,77.923048,3.446832e+03,0.9,38.50,33
7299184,35.046658,4131.6,10.0,95.3,1.018656,1.018656,0.0,0.0,0.0,0.0,...,59.975085,131.5,4.443579e-02,5.5,15.058692,84.817579,4.510625e+03,0.8,38.50,33
7299185,1300.182142,14839.5,7.1,130.0,1.225049,1.225049,0.0,0.0,0.0,0.0,...,46.416692,107.9,5.435891e-02,5.5,15.207270,65.643116,2.438648e+03,0.9,38.50,33
7299186,2.039416,151725.0,6.0,112.0,59.575648,59.575648,0.0,0.0,0.0,0.0,...,0.000000,1514.0,1.398087e-04,5.5,55.027266,0.000000,0.000000e+00,0.0,38.50,33


Loading ./resampled_datasets/RandomOverSampler_8+1_resampled_dataset.parquet
Dataset Shape: (8092224, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,3
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.0,244.60,0
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,2
3,0.000000,54.0,6.0,64.0,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,2
4,0.000000,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8092219,148.301989,9636.2,6.5,94.4,36.515951,36.515951,0.0,0.0,0.0,0.0,...,87.847986,136.5,1.141648e-02,5.5,15.133559,124.235814,1.380786e+04,0.9,38.50,7
8092220,48.978031,16974.5,6.0,104.1,39.539877,39.539877,0.0,0.0,0.0,0.0,...,696.852957,1080.3,5.858994e-03,5.5,30.261153,985.498902,1.033186e+06,0.9,38.50,7
8092221,106.930907,3287.4,11.5,86.7,77.855874,77.855874,0.0,0.0,0.0,0.0,...,39.320718,142.0,3.758659e-02,5.5,12.519245,55.607893,5.342274e+03,0.7,38.50,7
8092222,83.611826,15602.3,12.6,90.8,23.397903,23.397903,0.0,0.0,0.0,0.0,...,23.423697,85.5,1.341100e-02,5.5,11.559150,33.126109,8.093751e+02,0.7,38.50,7


Loading ./resampled_datasets/RandomOverSampler_1+1_resampled_dataset.parquet
Dataset Shape: (2713172, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,0
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.0,244.60,1
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,0
3,0.000000,54.0,6.0,64.0,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,0
4,0.000000,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713167,2.101208,528401.6,6.0,114.4,203.928830,203.928830,0.0,0.0,0.0,0.0,...,1971.614649,1803.6,8.219004e-04,5.5,65.253330,2788.284176,4.472826e+06,0.9,38.50,1
2713168,1.792944,329152.0,6.0,64.0,157.001031,157.001031,0.0,0.0,0.0,0.0,...,0.000000,1514.0,2.541804e-03,5.5,55.027266,0.000000,0.000000e+00,0.0,38.50,1
2713169,0.019382,158.4,12.1,76.2,11.138463,11.138463,0.0,0.0,0.0,0.0,...,76.039440,174.2,6.775784e-03,5.5,16.912792,107.536007,6.735486e+03,0.9,38.50,1
2713170,1.747535,2066445.4,6.0,87.6,770.763713,770.763713,0.0,0.0,0.0,0.0,...,1699.198468,1741.2,1.665202e+08,13.5,48.335946,2403.541519,2.889974e+06,1.0,244.60,1


Loading ./resampled_datasets/SMOTE_33+1_resampled_dataset.parquet
Dataset Shape: (2547620, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.000000,162.000000,17.00,64.000000,17.544943,17.544943,0.0,0.0,0.0,0.0,...,0.000000,162.000000,8.300786e+07,9.5,18.000000,0.000000,0.000000,0.000000,141.55,21
1,0.000000,54.000000,6.00,64.000000,6.221817,6.221817,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334813e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,9
2,0.000000,54.000000,6.00,64.000000,8.199278,8.199278,0.0,0.0,0.0,0.0,...,0.000000,54.000000,8.295125e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,20
3,0.000000,54.000000,6.00,64.000000,19.043810,19.043810,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334509e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,9
4,0.000000,2.140000,1.15,65.270000,12.654857,12.654857,0.0,0.0,0.0,0.0,...,7.113096,43.900000,8.312817e+07,9.5,9.410572,10.072289,347.220876,0.150000,141.55,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2547615,144.381557,4154.671882,7.10,172.475841,81.562134,81.562134,0.0,0.0,0.0,0.0,...,238.492874,496.078220,1.177138e-02,5.5,20.114113,337.279857,122742.988115,0.810914,38.50,33
2547616,121.376917,10722.088051,7.10,85.046520,6.103580,6.103580,0.0,0.0,0.0,0.0,...,74.160889,179.612820,1.676392e+08,13.5,17.366595,105.085507,5682.486711,1.000000,244.60,33
2547617,452.847589,16162.306374,7.10,72.967922,0.506335,0.506335,0.0,0.0,0.0,0.0,...,29.806732,87.357461,3.383168e-02,5.5,13.805576,42.153085,1126.375933,0.900000,38.50,33
2547618,468.029552,5097.176609,7.10,96.647996,10.342081,10.342081,0.0,0.0,0.0,0.0,...,238.675340,162.358399,1.676390e+08,13.5,21.099657,337.960206,67574.513301,1.000000,244.60,33


Loading ./resampled_datasets/SMOTE_8+1_resampled_dataset.parquet
Dataset Shape: (2822584, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.000000,162.000000,17.00,64.000000,17.544943,17.544943,0.0,0.0,0.0,0.0,...,0.000000,162.000000,8.300786e+07,9.5,18.000000,0.000000,0.000000,0.000000,141.55,3
1,0.000000,54.000000,6.00,64.000000,6.221817,6.221817,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334813e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,2
2,0.000000,54.000000,6.00,64.000000,8.199278,8.199278,0.0,0.0,0.0,0.0,...,0.000000,54.000000,8.295125e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,3
3,0.000000,54.000000,6.00,64.000000,19.043810,19.043810,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334509e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,2
4,0.000000,2.140000,1.15,65.270000,12.654857,12.654857,0.0,0.0,0.0,0.0,...,7.113096,43.900000,8.312817e+07,9.5,9.410572,10.072289,347.220876,0.150000,141.55,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2822579,210.817806,9709.228908,7.10,90.642940,10.750663,10.750663,0.0,0.0,0.0,0.0,...,109.222702,93.759338,1.664818e+08,13.5,15.920673,154.787232,12017.931683,1.000000,244.60,7
2822580,472.668640,1903.378342,11.50,97.094786,3.643343,3.643343,0.0,0.0,0.0,0.0,...,34.450865,92.168373,2.211139e-02,5.5,14.593947,48.720880,1487.290895,0.826775,38.50,7
2822581,191.787015,7664.390328,8.90,128.785764,3.401687,3.401687,0.0,0.0,0.0,0.0,...,46.707797,106.635597,1.676390e+08,13.5,13.769177,66.067947,2219.660760,1.000000,244.60,7
2822582,251.312256,4961.100786,10.40,139.602696,4.774916,4.774916,0.0,0.0,0.0,0.0,...,68.041279,156.917794,2.086204e-02,5.5,17.183800,96.224900,5311.614688,0.900000,38.50,7


Loading ./resampled_datasets/SMOTE_1+1_resampled_dataset.parquet
Dataset Shape: (946876, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.000000,1.620000e+02,17.00,64.000000,17.544943,17.544943,0.0,0.0,0.0,0.0,...,0.000000,162.000000,8.300786e+07,9.5,18.000000,0.000000,0.000000,0.000000,141.55,0
1,0.000000,5.400000e+01,6.00,64.000000,6.221817,6.221817,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334813e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,0
2,0.000000,5.400000e+01,6.00,64.000000,8.199278,8.199278,0.0,0.0,0.0,0.0,...,0.000000,54.000000,8.295125e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,0
3,0.000000,5.400000e+01,6.00,64.000000,19.043810,19.043810,0.0,1.0,0.0,1.0,...,0.000000,54.000000,8.334509e+07,9.5,10.392305,0.000000,0.000000,0.000000,141.55,0
4,0.000000,2.140000e+00,1.15,65.270000,12.654857,12.654857,0.0,0.0,0.0,0.0,...,7.113096,43.900000,8.312817e+07,9.5,9.410572,10.072289,347.220876,0.150000,141.55,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
946871,51.531976,1.915145e+05,8.20,96.064442,43.179130,43.179130,0.0,0.0,0.0,0.0,...,259.750409,96.102253,1.665221e+08,13.5,17.225191,367.663400,72090.662250,1.000000,244.60,1
946872,68.196588,2.532512e+05,6.00,116.869038,1294.224027,1294.224027,0.0,0.0,0.0,0.0,...,943.202151,409.013001,1.665196e+08,13.5,40.139441,1335.930129,893831.982553,1.000000,244.60,1
946873,24.532109,1.612908e+06,6.00,233.533313,61.402976,61.402976,0.0,0.0,0.0,0.0,...,3.408458,77.361526,1.173764e-02,5.5,11.583017,4.820287,151.335533,0.076767,38.50,1
946874,42.055308,1.182109e+04,9.90,125.683530,28.084851,28.084851,0.0,0.0,0.0,0.0,...,168.619284,406.333699,1.665177e+08,13.5,17.350575,163.704508,13413.802681,1.000000,244.60,1


# Preprocessing
## Scaling Numerical Features

In [4]:
num_cols = [
    'flow_duration', 'Header_Length',  'Duration', 'Rate', 'Srate', 'ack_count', 'syn_count', 'fin_count',
    'urg_count', 'rst_count', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight'
]

scalers = {}
print('Scaled Datasets:')
for sampler in sampling_methods:
    scalers[sampler] = StandardScaler()

    for label_class in label_classes:
        datasets[sampler][label_class][num_cols] = scalers[sampler].fit_transform(
            datasets[sampler][label_class][num_cols]
        )
    
        print(f'Sampler: {sampler} / Label Class: {label_class}')
        display(datasets[sampler][label_classes[0]].head(3))

Scaled Datasets:
Sampler: RandomOverSampler / Label Class: 33+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.115057,-0.280154,17.0,-0.4881,-0.077827,-0.077827,0.0,0.0,0.0,0.0,...,-0.584925,-0.671628,-0.009609,-0.002617,-0.900248,-0.584342,-0.244405,-1.259627,-0.002638,21
1,-0.041793,0.272014,8.2,2.249626,-0.077221,-0.077221,0.0,0.0,0.0,0.0,...,4.821669,0.737551,1.548932,1.554704,1.440983,4.831889,4.379384,1.02988,1.5551,1
2,-0.115135,-0.280123,6.0,-0.4881,-0.077822,-0.077822,0.0,0.0,1.0,0.0,...,-0.584925,-0.644042,-0.002992,-0.002617,-0.811314,-0.584342,-0.244405,-1.259627,-0.002638,12


Sampler: RandomOverSampler / Label Class: 8+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.115057,-0.280154,17.0,-0.4881,-0.077827,-0.077827,0.0,0.0,0.0,0.0,...,-0.584925,-0.671628,-0.009609,-0.002617,-0.900248,-0.584342,-0.244405,-1.259627,-0.002638,21
1,-0.041793,0.272014,8.2,2.249626,-0.077221,-0.077221,0.0,0.0,0.0,0.0,...,4.821669,0.737551,1.548932,1.554704,1.440983,4.831889,4.379384,1.02988,1.5551,1
2,-0.115135,-0.280123,6.0,-0.4881,-0.077822,-0.077822,0.0,0.0,1.0,0.0,...,-0.584925,-0.644042,-0.002992,-0.002617,-0.811314,-0.584342,-0.244405,-1.259627,-0.002638,12


Sampler: RandomOverSampler / Label Class: 1+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.115057,-0.280154,17.0,-0.4881,-0.077827,-0.077827,0.0,0.0,0.0,0.0,...,-0.584925,-0.671628,-0.009609,-0.002617,-0.900248,-0.584342,-0.244405,-1.259627,-0.002638,21
1,-0.041793,0.272014,8.2,2.249626,-0.077221,-0.077221,0.0,0.0,0.0,0.0,...,4.821669,0.737551,1.548932,1.554704,1.440983,4.831889,4.379384,1.02988,1.5551,1
2,-0.115135,-0.280123,6.0,-0.4881,-0.077822,-0.077822,0.0,0.0,1.0,0.0,...,-0.584925,-0.644042,-0.002992,-0.002617,-0.811314,-0.584342,-0.244405,-1.259627,-0.002638,12


Sampler: SMOTE / Label Class: 33+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.175623,-0.278562,17.0,-0.557617,-0.073585,-0.073585,0.0,0.0,0.0,0.0,...,-0.604151,-0.416428,-0.038296,-0.031136,-0.265569,-0.603796,-0.267222,-1.278147,-0.03119,21
1,-0.175623,-0.278707,6.0,-0.557617,-0.073753,-0.073753,0.0,1.0,0.0,1.0,...,-0.604151,-0.677988,-0.031859,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,9
2,-0.175623,-0.278707,6.0,-0.557617,-0.073724,-0.073724,0.0,0.0,0.0,0.0,...,-0.604151,-0.677988,-0.039367,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,20


Sampler: SMOTE / Label Class: 8+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.175623,-0.278562,17.0,-0.557617,-0.073585,-0.073585,0.0,0.0,0.0,0.0,...,-0.604151,-0.416428,-0.038296,-0.031136,-0.265569,-0.603796,-0.267222,-1.278147,-0.03119,21
1,-0.175623,-0.278707,6.0,-0.557617,-0.073753,-0.073753,0.0,1.0,0.0,1.0,...,-0.604151,-0.677988,-0.031859,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,9
2,-0.175623,-0.278707,6.0,-0.557617,-0.073724,-0.073724,0.0,0.0,0.0,0.0,...,-0.604151,-0.677988,-0.039367,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,20


Sampler: SMOTE / Label Class: 1+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.175623,-0.278562,17.0,-0.557617,-0.073585,-0.073585,0.0,0.0,0.0,0.0,...,-0.604151,-0.416428,-0.038296,-0.031136,-0.265569,-0.603796,-0.267222,-1.278147,-0.03119,21
1,-0.175623,-0.278707,6.0,-0.557617,-0.073753,-0.073753,0.0,1.0,0.0,1.0,...,-0.604151,-0.677988,-0.031859,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,9
2,-0.175623,-0.278707,6.0,-0.557617,-0.073724,-0.073724,0.0,0.0,0.0,0.0,...,-0.604151,-0.677988,-0.039367,-0.031136,-0.83631,-0.603796,-0.267222,-1.278147,-0.03119,20


## X, y Train/Test Splitting

In [5]:
# Not storing test datasets in memory, will save as parquet and later load for testing.
X_train = {}
y_train = {}

print('Train/Test Shapes:')

for sampler in sampling_methods:
    X_train[sampler] = {}
    y_train[sampler] = {}
    
    for label_class in label_classes:
        X = datasets[sampler][label_class].drop('label', axis=1)
        y = datasets[sampler][label_class]['label']
        
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
            X, y, test_size=0.2, random_state=train_test_seed
        )
        
        print(f'\nSampler: {sampler} / Label Class: {label_class}')
        print(f'X_train: {X_train_temp.shape}, y_train: {y_train_temp.shape}, X_test: {X_test_temp.shape}, y_test: {y_test_temp.shape}')
        
        X_train[sampler][label_class] = X_train_temp
        y_train[sampler][label_class] = y_train_temp
        
        # Save test dataset
        pd.concat([X_test_temp, y_test_temp], axis=1).to_parquet(
            path=f'./test_datasets/{sampler}_{label_class}_test_dataset.parquet'
        )

# Whole datasets no longer used. Free memory.
del datasets

Train/Test Shapes:

Sampler: RandomOverSampler / Label Class: 33+1
X_train: (5839350, 46), y_train: (5839350,), X_test: (1459838, 46), y_test: (1459838,)

Sampler: RandomOverSampler / Label Class: 8+1
X_train: (6473779, 46), y_train: (6473779,), X_test: (1618445, 46), y_test: (1618445,)

Sampler: RandomOverSampler / Label Class: 1+1
X_train: (2170537, 46), y_train: (2170537,), X_test: (542635, 46), y_test: (542635,)

Sampler: SMOTE / Label Class: 33+1
X_train: (2038096, 46), y_train: (2038096,), X_test: (509524, 46), y_test: (509524,)

Sampler: SMOTE / Label Class: 8+1
X_train: (2258067, 46), y_train: (2258067,), X_test: (564517, 46), y_test: (564517,)

Sampler: SMOTE / Label Class: 1+1
X_train: (757500, 46), y_train: (757500,), X_test: (189376, 46), y_test: (189376,)


# Evaluators

In [6]:
# Get all evaluators
evaluators = []
for evaluator_type in evaluator_types:
    for sampler in sampling_methods:
        for label_class in label_classes:
            evaluators.append((evaluator_type, sampler, label_class))

# Get list of pretrained evaluators
pretrained_evaluators = [filename for filename in os.listdir(evaluator_directory) if filename.endswith('.joblib')]

for i, evaluator in enumerate(pretrained_evaluators):
    pretrained_evaluators[i] = tuple(evaluator[:-7].split('_'))

# Get list of untrained evaluators
untrained_evaluators = [evaluator for evaluator in evaluators if evaluator not in pretrained_evaluators]

print(f'All evaluators: {len(evaluators)}')
print(f'Pretrained evaluators: {len(pretrained_evaluators)}')
print(f'Untrained evaluators: {len(untrained_evaluators)}')

All evaluators: 6
Pretrained evaluators: 105
Untrained evaluators: 0


## Training

In [7]:
# Only train evaluators not found in ./trained_evaluators
for i, (evaluator_type, sampler, label_class) in enumerate(untrained_evaluators):
    print(f'{i+1}/{len(untrained_evaluators)} : {evaluator_type}, {sampler}, {label_class}')
    match evaluator_type:
        case 'XGBoost':
            from xgboost import XGBClassifier
            evaluator_model = XGBClassifier()
            
        case 'LogisticRegression':
            from sklearn.linear_model import LogisticRegression
            evaluator_model = LogisticRegression(random_state=evaluator_seed, n_jobs=-1)
            
        case 'Perceptron':
            from sklearn.linear_model import Perceptron
            evaluator_model = Perceptron(random_state=evaluator_seed, n_jobs=-1)
            
        case 'AdaBoost':
            from sklearn.ensemble import AdaBoostClassifier
            evaluator_model = AdaBoostClassifier(random_state=evaluator_seed, algorithm='SAMME')
            
        case 'RandomForest':
            from sklearn.ensemble import RandomForestClassifier
            evaluator_model = RandomForestClassifier(random_state=evaluator_seed, n_jobs=-1)
            
        case 'DeepNeuralNetwork':
            from sklearn.neural_network import MLPClassifier
            evaluator_model = MLPClassifier(random_state=evaluator_seed)
            
        case 'KNearestNeighbor':
            from sklearn.neighbors import KNeighborsClassifier
            evaluator_model = KNeighborsClassifier(n_jobs=-1)
            
        case _:
            print(f'Invalid evaluator model: {evaluator_type}')
    
    
    if evaluator_type == 'XGBoost' and label_class == '1+1':
        evaluator_model = XGBClassifier(objective='binary:logistic')
            
    print(f'{time.time()} : Training {evaluator_type} on {sampler} balanced data with {label_class} label classes')
    evaluator_model.fit(X_train[sampler][label_class], y_train[sampler][label_class])
    print(f'{time.time()} : Training completed.\n')
    
    # Saving evaluator to file
    dump(evaluator_model, filename=f'./trained_evaluators/{evaluator_type}_{sampler}_{label_class}.joblib')

In [8]:
# Free training memory
try:
    del evaluator_model
except NameError:   # no evaluators were trained
    pass

del X_train
del y_train

No evaluators to delete


## Testing

In [9]:
# Load previous metrics dataframe
try: 
    df_metrics = pd.read_json(path_or_buf=metrics_directory+'/sampling_evaluator_metrics.json', orient='index')
    
except FileNotFoundError:
    df_metrics = pd.DataFrame(columns=['Sampler', 'Label Classes', 'Evaluator', 'Test Duration', 'Accuracy', 'Precision', 'Recall', 'F1'])

df_metrics

Unnamed: 0,Sampler,Label Classes,Evaluator,Test Duration,Accuracy,Precision,Recall,F1
0,,33+1,XGBoost,2.075624,0.992896,0.769986,0.719065,0.728827
15,,33+1,LogisticRegression,0.106245,0.792020,0.479057,0.430702,0.420893
30,,33+1,Perceptron,0.082568,0.728493,0.503647,0.472640,0.421391
45,,33+1,AdaBoost,4.977411,0.687285,0.252317,0.290804,0.246249
60,,33+1,RandomForest,2.130428,0.993432,0.800865,0.724395,0.736328
...,...,...,...,...,...,...,...,...
44,ClusterCentroids,1+1,Perceptron,0.002000,0.848351,0.868787,0.847768,0.846065
59,ClusterCentroids,1+1,AdaBoost,0.084290,0.939142,0.939702,0.939058,0.939113
74,ClusterCentroids,1+1,RandomForest,0.038548,0.976388,0.976816,0.976319,0.976380
89,ClusterCentroids,1+1,DeepNeuralNetwork,0.006001,0.942951,0.945422,0.942770,0.942854


In [10]:
# Run prediction
for (evaluator_type, sampler, label_class) in evaluators:
    # Load evaluator
    try:
        evaluator = load(filename=f'./trained_evaluators/{evaluator_type}_{sampler}_{label_class}.joblib')
    
    except FileNotFoundError:
        print(f'EVALUATOR NOT FOUND: ./trained_evaluators/{evaluator_type}_{sampler}_{label_class}.joblib\n')
    
    else:
        # Load testing dataset
        df_test = pd.read_parquet(path=f'./test_datasets/{sampler}_{label_class}_test_dataset.parquet')
        
        X_test = df_test.drop('label', axis=1)
        y_test = df_test['label']
            
        print(f'{datetime.now()} : Predicting {evaluator_type} on {sampler} sampled data with {label_class} classes')
        
        start_time = time.time()
        y_pred = evaluator.predict(X_test)
        test_duration = time.time() - start_time
        
        # Gather metrics  
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0.0)
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        
        evaluator_metrics = [sampler, label_class, evaluator_type, test_duration, accuracy, precision, recall, f1]
                
        # Add metrics to dataframe and display
        update_row = df_metrics.loc[(df_metrics['Sampler'] == sampler) &
                                    (df_metrics['Label Classes'] == label_class) &
                                    (df_metrics['Evaluator'] == evaluator_type)]
        
        if update_row.empty:    
            # No previous record
            df_metrics.loc[len(df_metrics.index)] = evaluator_metrics
            
            print(f'{evaluator_type} / {sampler} / {label_class} Metrics')
            display(df_metrics.loc[len(df_metrics.index)-1])
        
        else:   
            # Previous record exists
            update_row = evaluator_metrics
        
            print(f'{evaluator_type} / {sampler} / {label_class} Metrics')
            display(df_metrics.loc[(df_metrics['Sampler'] == sampler) &
                                    (df_metrics['Label Classes'] == label_class) &
                                    (df_metrics['Evaluator'] == evaluator_type)])

2024-05-12 13:21:08.255349 : Predicting KNearestNeighbor on RandomOverSampler sampled data with 33+1 classes
KNearestNeighbor / RandomOverSampler / 33+1 Metrics


Sampler          RandomOverSampler
Label Classes                 33+1
Evaluator         KNearestNeighbor
Test Duration          4813.146803
Accuracy                  0.987982
Precision                 0.987976
Recall                    0.987968
F1                        0.987961
Name: 99, dtype: object

2024-05-12 14:41:24.905998 : Predicting KNearestNeighbor on RandomOverSampler sampled data with 8+1 classes
KNearestNeighbor / RandomOverSampler / 8+1 Metrics


Sampler          RandomOverSampler
Label Classes                  8+1
Evaluator         KNearestNeighbor
Test Duration          5856.801126
Accuracy                  0.990905
Precision                 0.991076
Recall                    0.990917
F1                        0.990903
Name: 100, dtype: object

2024-05-12 16:19:03.971855 : Predicting KNearestNeighbor on RandomOverSampler sampled data with 1+1 classes
KNearestNeighbor / RandomOverSampler / 1+1 Metrics


Sampler          RandomOverSampler
Label Classes                  1+1
Evaluator         KNearestNeighbor
Test Duration           701.277243
Accuracy                  0.995524
Precision                 0.995565
Recall                    0.995522
F1                        0.995524
Name: 101, dtype: object

2024-05-12 16:30:46.239902 : Predicting KNearestNeighbor on SMOTE sampled data with 33+1 classes
KNearestNeighbor / SMOTE / 33+1 Metrics


Sampler                     SMOTE
Label Classes                33+1
Evaluator        KNearestNeighbor
Test Duration          591.434569
Accuracy                 0.956402
Precision                0.956299
Recall                   0.956461
F1                       0.956165
Name: 102, dtype: object

2024-05-12 16:40:38.874173 : Predicting KNearestNeighbor on SMOTE sampled data with 8+1 classes
KNearestNeighbor / SMOTE / 8+1 Metrics


Sampler                     SMOTE
Label Classes                 8+1
Evaluator        KNearestNeighbor
Test Duration          731.782705
Accuracy                 0.967888
Precision                0.967854
Recall                   0.967923
F1                       0.967819
Name: 103, dtype: object

2024-05-12 16:52:51.417101 : Predicting KNearestNeighbor on SMOTE sampled data with 1+1 classes
KNearestNeighbor / SMOTE / 1+1 Metrics


Sampler                     SMOTE
Label Classes                 1+1
Evaluator        KNearestNeighbor
Test Duration           86.109575
Accuracy                 0.994841
Precision                0.994891
Recall                   0.994838
F1                       0.994841
Name: 104, dtype: object

## Save Metrics to File

In [12]:
df_metrics['Sampler'] = pd.Categorical(df_metrics['Sampler'], categories=sampler_categories)
df_metrics['Label Classes'] = pd.Categorical(df_metrics['Label Classes'], categories=label_categories)
df_metrics['Evaluator'] = pd.Categorical(df_metrics['Evaluator'], categories=evaluator_categories)

df_metrics.sort_values(['Sampler', 'Label Classes', 'Evaluator'], inplace=True)

df_metrics.to_json(path_or_buf=metrics_directory+'/sampling_evaluator_metrics.json', orient='index')

df_metrics

Unnamed: 0,Sampler,Label Classes,Evaluator,Test Duration,Accuracy,Precision,Recall,F1
0,,33+1,XGBoost,2.075624,0.992896,0.769986,0.719065,0.728827
15,,33+1,LogisticRegression,0.106245,0.792020,0.479057,0.430702,0.420893
30,,33+1,Perceptron,0.082568,0.728493,0.503647,0.472640,0.421391
45,,33+1,AdaBoost,4.977411,0.687285,0.252317,0.290804,0.246249
60,,33+1,RandomForest,2.130428,0.993432,0.800865,0.724395,0.736328
...,...,...,...,...,...,...,...,...
44,ClusterCentroids,1+1,Perceptron,0.002000,0.848351,0.868787,0.847768,0.846065
59,ClusterCentroids,1+1,AdaBoost,0.084290,0.939142,0.939702,0.939058,0.939113
74,ClusterCentroids,1+1,RandomForest,0.038548,0.976388,0.976816,0.976319,0.976380
89,ClusterCentroids,1+1,DeepNeuralNetwork,0.006001,0.942951,0.945422,0.942770,0.942854
