# Setup
## Parameters

In [1]:
label_classes = ['33+1', '8+1', '1+1']  # Classes: '33+1', '8+1', '1+1'

sampling_methods = ['None', 'RandomOverSampler']   # Samplers: None, 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', 'ClusterCentroids'

evaluator_types = ['Perceptron', 'AdaBoost']   # Evaluators: 'XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost', 'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor'

# Random Seeds
train_test_seed = 42
evaluator_seed = 42

# Notebook parameter validation
for label_class in label_classes:
    if label_class not in ['33+1', '8+1', '1+1']:
        assert False, f'{label_class} is an invalid class structure.'

for sampler in sampling_methods:
    if sampler not in ['None', 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', 'ClusterCentroids']:
        assert False, f'{sampler} is an invalid sampling method.'
    
for evaluator in evaluator_types:
    if evaluator not in ['XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost', 
                         'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor']:
        assert False, f'{evaluator} is an invalid evaluator.'

## Common Packages

In [2]:
import os
import pandas as pd
from IPython.display import display
from datetime import datetime
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Importing the Dataset

In [3]:
DATASET_DIRECTORY = './resampled_datasets'

all_filepaths = [filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.parquet')]
print(f'CSVs in "{DATASET_DIRECTORY}": {len(all_filepaths)}\n')

datasets = {}

for sampler in sampling_methods:
    datasets[sampler] = {}
    for label_class in label_classes:
        print(f'Loading ./resampled_datasets/{sampler}_{label_class}_resampled_dataset.parquet')
        datasets[sampler][label_class] = pd.read_parquet(
            path=f'./resampled_datasets/{sampler}_{label_class}_resampled_dataset.parquet'
        )
        [print(f'Dataset Shape: {datasets[sampler][label_class].shape}')]
        display(datasets[sampler][label_class])

CSVs in "./resampled_datasets": 6

Loading ./resampled_datasets/None_33+1_resampled_dataset.parquet
Dataset Shape: (1389408, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.00,17.00,64.00,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,21
1,76.135781,428611.80,8.20,151.90,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.00,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.00,244.60,1
2,4.549627,108.00,6.00,64.00,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,12
3,0.000000,54.00,6.00,64.00,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,10
4,0.000000,0.00,1.00,64.00,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233582,0.000000,54.00,6.00,64.00,5.966641,5.966641,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.303339e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,13
233583,8.547875,20177.97,15.59,68.64,7031.899016,7031.899016,0.0,0.0,0.0,0.0,...,68.440643,82.09,8.334834e+07,9.5,12.897158,96.738630,1.597739e+04,0.94,141.55,9
233584,0.000000,54.00,6.00,64.00,4.410794,4.410794,0.0,1.0,0.0,1.0,...,0.000000,54.00,8.334818e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,9
233585,0.000000,0.00,1.00,64.00,22.793830,22.793830,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.314974e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,6


Loading ./resampled_datasets/None_8+1_resampled_dataset.parquet
Dataset Shape: (1389408, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.00,17.00,64.00,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,3
1,76.135781,428611.80,8.20,151.90,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.00,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.00,244.60,0
2,4.549627,108.00,6.00,64.00,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,2
3,0.000000,54.00,6.00,64.00,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,2
4,0.000000,0.00,1.00,64.00,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233582,0.000000,54.00,6.00,64.00,5.966641,5.966641,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.303339e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,2
233583,8.547875,20177.97,15.59,68.64,7031.899016,7031.899016,0.0,0.0,0.0,0.0,...,68.440643,82.09,8.334834e+07,9.5,12.897158,96.738630,1.597739e+04,0.94,141.55,2
233584,0.000000,54.00,6.00,64.00,4.410794,4.410794,0.0,1.0,0.0,1.0,...,0.000000,54.00,8.334818e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,2
233585,0.000000,0.00,1.00,64.00,22.793830,22.793830,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.314974e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,2


Loading ./resampled_datasets/None_1+1_resampled_dataset.parquet
Dataset Shape: (1389408, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.00,17.00,64.00,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,0
1,76.135781,428611.80,8.20,151.90,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.00,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.00,244.60,1
2,4.549627,108.00,6.00,64.00,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,0
3,0.000000,54.00,6.00,64.00,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,0
4,0.000000,0.00,1.00,64.00,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233582,0.000000,54.00,6.00,64.00,5.966641,5.966641,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.303339e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,0
233583,8.547875,20177.97,15.59,68.64,7031.899016,7031.899016,0.0,0.0,0.0,0.0,...,68.440643,82.09,8.334834e+07,9.5,12.897158,96.738630,1.597739e+04,0.94,141.55,0
233584,0.000000,54.00,6.00,64.00,4.410794,4.410794,0.0,1.0,0.0,1.0,...,0.000000,54.00,8.334818e+07,9.5,10.392305,0.000000,0.000000e+00,0.00,141.55,0
233585,0.000000,0.00,1.00,64.00,22.793830,22.793830,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.314974e+07,9.5,9.165151,0.000000,0.000000e+00,0.00,141.55,0


Loading ./resampled_datasets/RandomOverSampler_33+1_resampled_dataset.parquet
Dataset Shape: (7299188, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,21
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.0,244.60,1
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,12
3,0.000000,54.0,6.0,64.0,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,10
4,0.000000,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7299183,84.306294,8574.2,8.2,95.4,0.674955,0.674955,0.0,0.0,0.0,0.0,...,55.099916,130.4,8.730838e-02,5.5,17.749092,77.923048,3.446832e+03,0.9,38.50,33
7299184,35.046658,4131.6,10.0,95.3,1.018656,1.018656,0.0,0.0,0.0,0.0,...,59.975085,131.5,4.443579e-02,5.5,15.058692,84.817579,4.510625e+03,0.8,38.50,33
7299185,1300.182142,14839.5,7.1,130.0,1.225049,1.225049,0.0,0.0,0.0,0.0,...,46.416692,107.9,5.435891e-02,5.5,15.207270,65.643116,2.438648e+03,0.9,38.50,33
7299186,2.039416,151725.0,6.0,112.0,59.575648,59.575648,0.0,0.0,0.0,0.0,...,0.000000,1514.0,1.398087e-04,5.5,55.027266,0.000000,0.000000e+00,0.0,38.50,33


Loading ./resampled_datasets/RandomOverSampler_8+1_resampled_dataset.parquet
Dataset Shape: (8092224, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,3
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.0,244.60,0
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,2
3,0.000000,54.0,6.0,64.0,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,2
4,0.000000,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8092219,148.301989,9636.2,6.5,94.4,36.515951,36.515951,0.0,0.0,0.0,0.0,...,87.847986,136.5,1.141648e-02,5.5,15.133559,124.235814,1.380786e+04,0.9,38.50,7
8092220,48.978031,16974.5,6.0,104.1,39.539877,39.539877,0.0,0.0,0.0,0.0,...,696.852957,1080.3,5.858994e-03,5.5,30.261153,985.498902,1.033186e+06,0.9,38.50,7
8092221,106.930907,3287.4,11.5,86.7,77.855874,77.855874,0.0,0.0,0.0,0.0,...,39.320718,142.0,3.758659e-02,5.5,12.519245,55.607893,5.342274e+03,0.7,38.50,7
8092222,83.611826,15602.3,12.6,90.8,23.397903,23.397903,0.0,0.0,0.0,0.0,...,23.423697,85.5,1.341100e-02,5.5,11.559150,33.126109,8.093751e+02,0.7,38.50,7


Loading ./resampled_datasets/RandomOverSampler_1+1_resampled_dataset.parquet
Dataset Shape: (2713172, 47)


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.300688e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,0
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,1.665202e+08,13.5,41.470705,2419.498399,2.944407e+06,1.0,244.60,1
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.336142e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,0
3,0.000000,54.0,6.0,64.0,0.000000,0.000000,0.0,0.0,1.0,0.0,...,0.000000,54.0,8.308992e+07,9.5,10.392305,0.000000,0.000000e+00,0.0,141.55,0
4,0.000000,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.000000,42.0,8.313213e+07,9.5,9.165151,0.000000,0.000000e+00,0.0,141.55,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713167,2.101208,528401.6,6.0,114.4,203.928830,203.928830,0.0,0.0,0.0,0.0,...,1971.614649,1803.6,8.219004e-04,5.5,65.253330,2788.284176,4.472826e+06,0.9,38.50,1
2713168,1.792944,329152.0,6.0,64.0,157.001031,157.001031,0.0,0.0,0.0,0.0,...,0.000000,1514.0,2.541804e-03,5.5,55.027266,0.000000,0.000000e+00,0.0,38.50,1
2713169,0.019382,158.4,12.1,76.2,11.138463,11.138463,0.0,0.0,0.0,0.0,...,76.039440,174.2,6.775784e-03,5.5,16.912792,107.536007,6.735486e+03,0.9,38.50,1
2713170,1.747535,2066445.4,6.0,87.6,770.763713,770.763713,0.0,0.0,0.0,0.0,...,1699.198468,1741.2,1.665202e+08,13.5,48.335946,2403.541519,2.889974e+06,1.0,244.60,1


# Preprocessing
## Scaling Numerical Features

In [4]:
num_cols = [
    'flow_duration', 'Header_Length',  'Duration', 'Rate', 'Srate', 'ack_count', 'syn_count', 'fin_count',
    'urg_count', 'rst_count', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight'
]

scalers = {}
print('Scaled Datasets:')
for sampler in sampling_methods:
    scalers[sampler] = StandardScaler()

    for label_class in label_classes:
        datasets[sampler][label_class][num_cols] = scalers[sampler].fit_transform(
            datasets[sampler][label_class][num_cols]
        )
    
        print(f'Sampler: {sampler} / Label Class: {label_class}')
        display(datasets[sampler][label_classes[0]].head(3))

Scaled Datasets:
Sampler: None / Label Class: 33+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.003675,-0.166147,17.0,-0.167976,-0.090509,-0.090509,0.0,0.0,0.0,0.0,...,-0.208794,-0.343574,-0.011406,0.000835,-0.459184,-0.208627,-0.095484,-0.414344,0.000754,21
1,0.28153,0.760145,8.2,6.105171,-0.090092,-0.090092,0.0,0.0,0.0,0.0,...,10.488495,2.198032,4.882094,4.877978,3.28135,10.509058,9.13432,3.870082,4.88556,1
2,-0.003979,-0.166095,6.0,-0.167976,-0.090505,-0.090505,0.0,0.0,1.0,0.0,...,-0.208794,-0.29382,0.009369,0.000835,-0.317097,-0.208627,-0.095484,-0.414344,0.000754,12


Sampler: None / Label Class: 8+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.003675,-0.166147,17.0,-0.167976,-0.090509,-0.090509,0.0,0.0,0.0,0.0,...,-0.208794,-0.343574,-0.011406,0.000835,-0.459184,-0.208627,-0.095484,-0.414344,0.000754,21
1,0.28153,0.760145,8.2,6.105171,-0.090092,-0.090092,0.0,0.0,0.0,0.0,...,10.488495,2.198032,4.882094,4.877978,3.28135,10.509058,9.13432,3.870082,4.88556,1
2,-0.003979,-0.166095,6.0,-0.167976,-0.090505,-0.090505,0.0,0.0,1.0,0.0,...,-0.208794,-0.29382,0.009369,0.000835,-0.317097,-0.208627,-0.095484,-0.414344,0.000754,12


Sampler: None / Label Class: 1+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.003675,-0.166147,17.0,-0.167976,-0.090509,-0.090509,0.0,0.0,0.0,0.0,...,-0.208794,-0.343574,-0.011406,0.000835,-0.459184,-0.208627,-0.095484,-0.414344,0.000754,21
1,0.28153,0.760145,8.2,6.105171,-0.090092,-0.090092,0.0,0.0,0.0,0.0,...,10.488495,2.198032,4.882094,4.877978,3.28135,10.509058,9.13432,3.870082,4.88556,1
2,-0.003979,-0.166095,6.0,-0.167976,-0.090505,-0.090505,0.0,0.0,1.0,0.0,...,-0.208794,-0.29382,0.009369,0.000835,-0.317097,-0.208627,-0.095484,-0.414344,0.000754,12


Sampler: RandomOverSampler / Label Class: 33+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.115057,-0.280154,17.0,-0.4881,-0.077827,-0.077827,0.0,0.0,0.0,0.0,...,-0.584925,-0.671628,-0.009609,-0.002617,-0.900248,-0.584342,-0.244405,-1.259627,-0.002638,21
1,-0.041793,0.272014,8.2,2.249626,-0.077221,-0.077221,0.0,0.0,0.0,0.0,...,4.821669,0.737551,1.548932,1.554704,1.440983,4.831889,4.379384,1.02988,1.5551,1
2,-0.115135,-0.280123,6.0,-0.4881,-0.077822,-0.077822,0.0,0.0,1.0,0.0,...,-0.584925,-0.644042,-0.002992,-0.002617,-0.811314,-0.584342,-0.244405,-1.259627,-0.002638,12


Sampler: RandomOverSampler / Label Class: 8+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.115057,-0.280154,17.0,-0.4881,-0.077827,-0.077827,0.0,0.0,0.0,0.0,...,-0.584925,-0.671628,-0.009609,-0.002617,-0.900248,-0.584342,-0.244405,-1.259627,-0.002638,21
1,-0.041793,0.272014,8.2,2.249626,-0.077221,-0.077221,0.0,0.0,0.0,0.0,...,4.821669,0.737551,1.548932,1.554704,1.440983,4.831889,4.379384,1.02988,1.5551,1
2,-0.115135,-0.280123,6.0,-0.4881,-0.077822,-0.077822,0.0,0.0,1.0,0.0,...,-0.584925,-0.644042,-0.002992,-0.002617,-0.811314,-0.584342,-0.244405,-1.259627,-0.002638,12


Sampler: RandomOverSampler / Label Class: 1+1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.115057,-0.280154,17.0,-0.4881,-0.077827,-0.077827,0.0,0.0,0.0,0.0,...,-0.584925,-0.671628,-0.009609,-0.002617,-0.900248,-0.584342,-0.244405,-1.259627,-0.002638,21
1,-0.041793,0.272014,8.2,2.249626,-0.077221,-0.077221,0.0,0.0,0.0,0.0,...,4.821669,0.737551,1.548932,1.554704,1.440983,4.831889,4.379384,1.02988,1.5551,1
2,-0.115135,-0.280123,6.0,-0.4881,-0.077822,-0.077822,0.0,0.0,1.0,0.0,...,-0.584925,-0.644042,-0.002992,-0.002617,-0.811314,-0.584342,-0.244405,-1.259627,-0.002638,12


## X, y Train/Test Splitting

In [5]:
X_train = {}
X_test = {}
y_train = {}
y_test = {}

print('Train/Test Shapes:')

for sampler in sampling_methods:
    X_train[sampler] = {}
    X_test[sampler] = {}
    y_train[sampler] = {}
    y_test[sampler] = {}
    
    for label_class in label_classes:
        X = datasets[sampler][label_class].drop('label', axis=1)
        y = datasets[sampler][label_class]['label']
        
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
            X, y, test_size=0.2, random_state=train_test_seed
        )
        
        X_train[sampler][label_class] = X_train_temp
        X_test[sampler][label_class] = X_test_temp
        y_train[sampler][label_class] = y_train_temp
        y_test[sampler][label_class] = y_test_temp
        print(f'\nSampler: {sampler} / Label Class: {label_class}')
        print(f'X_train: {X_train[sampler][label_class].shape}, y_train: {y_train[sampler][label_class].shape}, X_test: {X_test[sampler][label_class].shape}, y_test: {y_test[sampler][label_class].shape}')

Train/Test Shapes:

Sampler: None / Label Class: 33+1
X_train: (1111526, 46), y_train: (1111526,), X_test: (277882, 46), y_test: (277882,)

Sampler: None / Label Class: 8+1
X_train: (1111526, 46), y_train: (1111526,), X_test: (277882, 46), y_test: (277882,)

Sampler: None / Label Class: 1+1
X_train: (1111526, 46), y_train: (1111526,), X_test: (277882, 46), y_test: (277882,)

Sampler: RandomOverSampler / Label Class: 33+1
X_train: (5839350, 46), y_train: (5839350,), X_test: (1459838, 46), y_test: (1459838,)

Sampler: RandomOverSampler / Label Class: 8+1
X_train: (6473779, 46), y_train: (6473779,), X_test: (1618445, 46), y_test: (1618445,)

Sampler: RandomOverSampler / Label Class: 1+1
X_train: (2170537, 46), y_train: (2170537,), X_test: (542635, 46), y_test: (542635,)


# Evaluators

In [6]:
# Get all evaluators
evaluators = []
for evaluator_type in evaluator_types:
    for sampler in sampling_methods:
        for label_class in label_classes:
            evaluators.append((evaluator_type, sampler, label_class))

# Get list of pretrained evaluators
evaluator_directory = './trained_evaluators'
pretrained_evaluators = [filename for filename in os.listdir(evaluator_directory) if filename.endswith('.joblib')]

for i, evaluator in enumerate(pretrained_evaluators):
    pretrained_evaluators[i] = tuple(evaluator[:-7].split('_'))

# Get list of untrained evaluators
untrained_evaluators = [evaluator for evaluator in evaluators if evaluator not in pretrained_evaluators]

print(f'All evaluators: {evaluators}')
print(f'Pretrained evaluators: {pretrained_evaluators}')
print(f'Untrained evaluators: {untrained_evaluators}')

All evaluators: [('Perceptron', 'None', '33+1'), ('Perceptron', 'None', '8+1'), ('Perceptron', 'None', '1+1'), ('Perceptron', 'RandomOverSampler', '33+1'), ('Perceptron', 'RandomOverSampler', '8+1'), ('Perceptron', 'RandomOverSampler', '1+1'), ('AdaBoost', 'None', '33+1'), ('AdaBoost', 'None', '8+1'), ('AdaBoost', 'None', '1+1'), ('AdaBoost', 'RandomOverSampler', '33+1'), ('AdaBoost', 'RandomOverSampler', '8+1'), ('AdaBoost', 'RandomOverSampler', '1+1')]
Pretrained evaluators: [('AdaBoost', 'None', '1+1'), ('AdaBoost', 'None', '33+1'), ('AdaBoost', 'None', '8+1'), ('Perceptron', 'None', '1+1'), ('Perceptron', 'None', '33+1'), ('Perceptron', 'None', '8+1'), ('Perceptron', 'RandomOverSampler', '1+1'), ('Perceptron', 'RandomOverSampler', '33+1'), ('Perceptron', 'RandomOverSampler', '8+1')]
Untrained evaluators: [('AdaBoost', 'RandomOverSampler', '33+1'), ('AdaBoost', 'RandomOverSampler', '8+1'), ('AdaBoost', 'RandomOverSampler', '1+1')]


## Training

In [None]:
for (evaluator_type, sampler, label_class) in untrained_evaluators:
    print(f'{evaluator_type}, {sampler}, {label_class}')
    match evaluator_type:
        case 'XGBoost':
            from xgboost import XGBClassifier
            evaluator = XGBClassifier()
            
        case 'LogisticRegression':
            from sklearn.linear_model import LogisticRegression
            evaluator = LogisticRegression(random_state=evaluator_seed, n_jobs=-1)
            
        case 'Perceptron':
            from sklearn.linear_model import Perceptron
            evaluator = Perceptron(random_state=evaluator_seed, n_jobs=-1)
            
        case 'AdaBoost':
            from sklearn.ensemble import AdaBoostClassifier
            evaluator = AdaBoostClassifier(random_state=evaluator_seed, algorithm='SAMME')
            
        case 'RandomForest':
            from sklearn.ensemble import RandomForestClassifier
            evaluator = RandomForestClassifier(random_state=evaluator_seed, n_jobs=-1)
            
        case 'DeepNeuralNetwork':
            from sklearn.neural_network import MLPClassifier
            evaluator = MLPClassifier(random_state=evaluator_seed)
            
        case 'KNearestNeighbor':
            from sklearn.neighbors import KNeighborsClassifier
            evaluator = KNeighborsClassifier(n_jobs=-1)
            
        case _:
            print(f'Invalid evaluator model: {evaluator_type}')
    
    
    if evaluator_type == 'XGBoost' and label_class == '1+1':
        evaluator = XGBClassifier(objective='binary:logistic')
            
    print(f'{datetime.now()} : Training {evaluator_type} on {sampler} balanced data with {label_class} label classes')
    evaluator.fit(X_train[sampler][label_class], y_train[sampler][label_class])
    print(f'{datetime.now()} : Training completed.\n')
    
    # Saving
    dump(evaluator, filename=f'./trained_evaluators/{evaluator_type}_{sampler}_{label_class}.joblib')

## Testing

In [9]:
for (evaluator_type, sampler, label_class) in evaluators:
    try:
        evaluator = load(filename=f'./trained_evaluators/{evaluator_type}_{sampler}_{label_class}.joblib')
    
    except FileNotFoundError:
        print(f'EVALUATOR NOT FOUND: ./trained_evaluators/{evaluator_type}_{sampler}_{label_class}.joblib\n')
    
    else:
        print(f'{datetime.now()} : Predicting {evaluator_type} on {label_class} classes')
        y_pred = evaluator.predict(X_test[sampler][label_class])
        
        print(f'{evaluator_type} / {sampler} / {label_class} Metrics')
        print(f'   Accuracy: {accuracy_score(y_test[sampler][label_class], y_pred)}')
        print(f'   Precision: {precision_score(y_test[sampler][label_class], y_pred, average='weighted', zero_division=0.0)}')
        print(f'   Recall: {recall_score(y_test[sampler][label_class], y_pred, average='weighted')}')
        print(f'   F1: {f1_score(y_test[sampler][label_class], y_pred, average='weighted')}\n')

Loading ./trained_evaluators/Perceptron_None_33+1.joblib
2024-05-10 18:14:20.062408 : Predicting Perceptron on 33+1 classes
Perceptron / None / 33+1 Metrics
   Accuracy: 0.7284926695503847
   Precision: 0.7860619609939594
   Recall: 0.7284926695503847
   F1: 0.6907823296857236

Loading ./trained_evaluators/Perceptron_None_8+1.joblib
2024-05-10 18:14:20.453671 : Predicting Perceptron on 8+1 classes
Perceptron / None / 8+1 Metrics
   Accuracy: 0.8264479167416385
   Precision: 0.8304906957355364
   Recall: 0.8264479167416385
   F1: 0.7745500006175217

Loading ./trained_evaluators/Perceptron_None_1+1.joblib
2024-05-10 18:14:20.693373 : Predicting Perceptron on 1+1 classes
Perceptron / None / 1+1 Metrics
   Accuracy: 0.9829459986613023
   Precision: 0.9809483952529805
   Recall: 0.9829459986613023
   F1: 0.9795725094753132

Loading ./trained_evaluators/Perceptron_RandomOverSampler_33+1.joblib
2024-05-10 18:14:20.926287 : Predicting Perceptron on 33+1 classes
Perceptron / RandomOverSampler /