# Setting Parameters

In [1]:
# Class structures to build and evaluate. Valid values: '33+1', '8+1', '1+1'
apply_classes = ['33+1', '8+1', '1+1']

# Samplers: None, 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', ['Clustering', 'SMOTE']
apply_sampling = 'RandomOverSampler'    # Select ONE from above

# Evaluators: 'XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost', 'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor'
apply_evaluators = ['Perceptron', 'XGBoost', 'KNearestNeighbor', 'RandomForest']


# Single source of truth for the accepted values of each parameter.
VALID_CLASSES = ('33+1', '8+1', '1+1')
VALID_SAMPLING = (None, 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', ['Clustering', 'SMOTE'])
VALID_EVALUATORS = ('XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost',
                    'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor')


def validate_notebook_parameters(classes, sampling, evaluators):
    """Check the notebook parameters against the accepted values.

    Args:
        classes: list of class-structure names to build.
        sampling: a sampler name, a list of sampler names, or None.
        evaluators: list of evaluator names to train.

    Raises:
        ValueError: if `classes` is empty or any value is not an accepted one.
            An explicit exception is used instead of `assert` so the check
            still runs when Python is started with -O.
    """
    # Later cells index apply_classes[0], so at least one structure is required.
    if not classes:
        raise ValueError('apply_classes must contain at least one class structure.')

    for _class in classes:
        if _class not in VALID_CLASSES:
            raise ValueError(f'{_class} is an invalid class structure.')

    if sampling not in VALID_SAMPLING:
        raise ValueError(f'{sampling} is an invalid sampling method.')

    for evaluator in evaluators:
        if evaluator not in VALID_EVALUATORS:
            raise ValueError(f'{evaluator} is an invalid evaluator.')


# Notebook parameter validation
validate_notebook_parameters(apply_classes, apply_sampling, apply_evaluators)

# Dataset Handling
## Common Imports

In [2]:
import os
import pandas as pd
import random
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from ydata_profiling import ProfileReport
from tqdm import tqdm

## Loading the Dataset

In [3]:
DATASET_DIRECTORY = '../../dataset/'  # If your dataset is within your python project directory, change this to the relative path to your dataset
SAMPLE_SEED = 1  # seed for the file subsampling below, so every run picks the same CSV files

# Sort right away: os.listdir() returns entries in arbitrary order, which would
# otherwise make the seeded sample below depend on the file system.
csv_filepaths = sorted(filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv'))

print(csv_filepaths)

# Fail early with a clear message instead of a KeyError on 'label' further down.
if not csv_filepaths:
    raise FileNotFoundError(f'No CSV files found in {DATASET_DIRECTORY}')

# If there are more than X CSV files, randomly select X files from the list
sample_size = 5

if len(csv_filepaths) > sample_size:
    csv_filepaths = random.Random(SAMPLE_SEED).sample(csv_filepaths, sample_size)
    print(csv_filepaths)

csv_filepaths.sort()

# list of csv files used
data_sets = csv_filepaths

# Read every file first and concatenate once; concatenating inside the loop
# re-copies all previously loaded rows on each iteration.
frames = []
for position, data_set in enumerate(data_sets, start=1):
    print(f"data set {position} out of {len(data_sets)}: {data_set} \n")
    data_path = os.path.join(DATASET_DIRECTORY, data_set)
    frames.append(pd.read_csv(data_path))

# Each file keeps its own row index, so index labels repeat across files.
full_data = pd.concat(frames)

# prints an instance of each class
print("Before encoding:")
label_names = full_data['label'].unique()
for label in label_names:
    print(f"First instance of {label}:")
    print(full_data[full_data['label'] == label].iloc[0])

# Shuffle data
full_data = shuffle(full_data, random_state=1)

# Check that the data is loaded properly
print("Real data:")
print(full_data[:2])
print(full_data.shape)

# Number of distinct values in the 'label' column of `full_data`
unique_labels = full_data['label'].nunique()

# Print the number of unique labels
print(f"There are {unique_labels} unique labels in the dataset.")

class_counts = full_data['label'].value_counts()
print(class_counts)

# Summary statistics of the numeric columns
full_data.describe()

['part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00014-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00015-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00016-363d1ba3-8ab5-4f96-bc25-4d5

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
count,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0,...,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0,1200712.0
mean,5.583666,76481.54,9.069515,66.36748,9120.348,9120.348,3.243104e-06,0.08668523,0.2078875,0.09058958,...,124.9067,33.30498,124.9908,83140370.0,9.496511,13.13134,47.06479,30597.83,0.09656078,141.4608
std,256.1583,458931.1,8.964696,14.10731,100918.7,100918.7,0.001390587,0.2813734,0.405796,0.2870247,...,241.7839,159.8227,242.5392,17039590.0,0.8185547,8.640293,226.0328,322426.6,0.2330115,21.05626
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0
25%,0.0,54.0,6.0,64.0,2.08588,2.08588,0.0,0.0,0.0,0.0,...,50.0,0.0,50.0,83071570.0,9.5,10.0,0.0,0.0,0.0,141.55
50%,0.0,54.0,6.0,64.0,15.69589,15.69589,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,83124530.0,9.5,10.3923,0.0,0.0,0.0,141.55
75%,0.1057466,266.9325,14.12,64.0,116.1455,116.1455,0.0,0.0,0.0,0.0,...,54.05234,0.3724683,54.06,83344000.0,9.5,10.39675,0.5059213,1.365208,0.08,141.55
max,68358.93,9797334.0,47.0,255.0,8388608.0,8388608.0,1.270713,1.0,1.0,1.0,...,8114.78,8440.653,6726.8,167639400.0,15.0,124.2106,11936.89,81509960.0,1.0,244.6


# Preprocessing
## Duplicating data for classes

In [4]:
all_data = {}

for _class in apply_classes:
    all_data[_class] = full_data.copy()
    
    match _class:            
        case '8+1':
            label_categories = {
                'Backdoor_Malware': 'Web',
                'BenignTraffic': 'Benign',
                'BrowserHijacking': 'Web',
                'CommandInjection': 'DDoS',
                'DDoS-ACK_Fragmentation': 'DDoS',
                'DDoS-HTTP_Flood': 'DDoS',
                'DDoS-ICMP_Flood': 'DDoS',
                'DDoS-ICMP_Fragmentation': 'DDoS',
                'DDoS-PSHACK_Flood': 'DDoS',
                'DDoS-RSTFINFlood': 'DDoS',
                'DDoS-SYN_Flood': 'DDoS',
                'DDoS-SlowLoris': 'DDoS',
                'DDoS-SynonymousIP_Flood': 'DDoS',
                'DDoS-TCP_Flood': 'DDoS',
                'DDoS-UDP_Flood': 'DDoS',
                'DDoS-UDP_Fragmentation': 'DDoS',
                'DNS_Spoofing': 'Spoofing',
                'DictionaryBruteForce': 'BruteForce',
                'DoS-HTTP_Flood': 'DoS',
                'DoS-SYN_Flood': 'DoS',
                'DoS-TCP_Flood': 'DoS',
                'DoS-UDP_Flood': 'DoS',
                'MITM-ArpSpoofing': 'Spoofing',
                'Mirai-greeth_flood': 'Mirai',
                'Mirai-greip_flood': 'Mirai',
                'Mirai-udpplain': 'Mirai',
                'Recon-HostDiscovery': 'Recon',
                'Recon-OSScan': 'Recon',
                'Recon-PingSweep': 'Recon',
                'Recon-PortScan': 'Recon',
                'SqlInjection': 'Web',
                'Uploading_Attack': 'Web',
                'VulnerabilityScan': 'Recon',
                'XSS': 'Web'
            }
            all_data['8+1']['label'] = all_data['8+1']['label'].map(label_categories)
            
        case '1+1':
            all_data['1+1'].loc[all_data['1+1']['label'] != 'BenignTraffic', 'label'] = 'Attack'
            all_data['1+1'].loc[all_data['1+1']['label'] == 'BenignTraffic', 'label'] = 'Benign'

all_data[apply_classes[0]].head()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
214640,0.030366,2.22,46.59,64.0,2.964837,2.964837,0.0,0.0,0.0,0.0,...,8.571046,586.82,83693800.0,9.5,34.345174,12.160495,1059.069333,0.07,141.55,Mirai-greeth_flood
70255,0.0,54.0,6.0,64.0,10.733352,10.733352,0.0,0.0,0.0,0.0,...,0.0,54.0,82951230.0,9.5,10.392305,0.0,0.0,0.0,141.55,DoS-TCP_Flood
138876,0.0,0.0,1.0,64.0,4.603006,4.603006,0.0,0.0,0.0,0.0,...,0.0,42.0,83124830.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS-ICMP_Flood
223653,0.020286,84.17,6.11,64.0,3.751915,3.751915,0.0,0.0,0.0,0.0,...,3.450844,56.47,83331830.0,9.5,10.470297,4.898925,200.336121,0.06,141.55,DDoS-PSHACK_Flood
28578,0.077025,34204.5,16.84,65.91,8881.473845,8881.473845,0.0,0.0,0.0,0.0,...,0.043589,50.2,83011750.0,9.5,10.0,0.0,0.0,0.0,141.55,DoS-UDP_Flood


In [5]:
# Preview the second configured class structure; displays nothing (instead of
# raising IndexError) when fewer than two structures are selected.
all_data[apply_classes[1]].head() if len(apply_classes) > 1 else None

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
214640,0.030366,2.22,46.59,64.0,2.964837,2.964837,0.0,0.0,0.0,0.0,...,8.571046,586.82,83693800.0,9.5,34.345174,12.160495,1059.069333,0.07,141.55,Mirai
70255,0.0,54.0,6.0,64.0,10.733352,10.733352,0.0,0.0,0.0,0.0,...,0.0,54.0,82951230.0,9.5,10.392305,0.0,0.0,0.0,141.55,DoS
138876,0.0,0.0,1.0,64.0,4.603006,4.603006,0.0,0.0,0.0,0.0,...,0.0,42.0,83124830.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS
223653,0.020286,84.17,6.11,64.0,3.751915,3.751915,0.0,0.0,0.0,0.0,...,3.450844,56.47,83331830.0,9.5,10.470297,4.898925,200.336121,0.06,141.55,DDoS
28578,0.077025,34204.5,16.84,65.91,8881.473845,8881.473845,0.0,0.0,0.0,0.0,...,0.043589,50.2,83011750.0,9.5,10.0,0.0,0.0,0.0,141.55,DoS


In [6]:
# Preview the third configured class structure; displays nothing (instead of
# raising IndexError) when fewer than three structures are selected.
all_data[apply_classes[2]].head() if len(apply_classes) > 2 else None

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
214640,0.030366,2.22,46.59,64.0,2.964837,2.964837,0.0,0.0,0.0,0.0,...,8.571046,586.82,83693800.0,9.5,34.345174,12.160495,1059.069333,0.07,141.55,Attack
70255,0.0,54.0,6.0,64.0,10.733352,10.733352,0.0,0.0,0.0,0.0,...,0.0,54.0,82951230.0,9.5,10.392305,0.0,0.0,0.0,141.55,Attack
138876,0.0,0.0,1.0,64.0,4.603006,4.603006,0.0,0.0,0.0,0.0,...,0.0,42.0,83124830.0,9.5,9.165151,0.0,0.0,0.0,141.55,Attack
223653,0.020286,84.17,6.11,64.0,3.751915,3.751915,0.0,0.0,0.0,0.0,...,3.450844,56.47,83331830.0,9.5,10.470297,4.898925,200.336121,0.06,141.55,Attack
28578,0.077025,34204.5,16.84,65.91,8881.473845,8881.473845,0.0,0.0,0.0,0.0,...,0.043589,50.2,83011750.0,9.5,10.0,0.0,0.0,0.0,141.55,Attack



## Encoding Labels

In [7]:
# Fit one LabelEncoder per configured class structure and replace the string
# labels with integer codes.
label_encoders = {}
for _class in apply_classes:
    label_encoders[_class] = LabelEncoder()
    all_data[_class]['label'] = label_encoders[_class].fit_transform(all_data[_class]['label'])

# Aliases under the original per-structure names; None when that structure is
# not selected (previously the name was simply undefined in that case).
full_label_encoder = label_encoders.get('33+1')
class_label_encoder = label_encoders.get('8+1')
binary_label_encoder = label_encoders.get('1+1')

# The preview below always uses the encoder that belongs to the data it prints,
# i.e. the first configured class structure.
preview_class = apply_classes[0]
preview_encoder = label_encoders[preview_class]
preview_data = all_data[preview_class]

# Store label mappings (code -> original label)
label_mapping = dict(enumerate(preview_encoder.classes_))
print("Label mappings:", label_mapping)

# Numeric code of every class (original label -> code); a fitted LabelEncoder
# assigns codes by position in `classes_`.
class_codes = {label: code for code, label in label_mapping.items()}

# Print specific instances after label encoding
print("After encoding:")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(preview_data[preview_data['label'] == code].iloc[0])

preview_data.head()

Label mappings: {0: 'Backdoor_Malware', 1: 'BenignTraffic', 2: 'BrowserHijacking', 3: 'CommandInjection', 4: 'DDoS-ACK_Fragmentation', 5: 'DDoS-HTTP_Flood', 6: 'DDoS-ICMP_Flood', 7: 'DDoS-ICMP_Fragmentation', 8: 'DDoS-PSHACK_Flood', 9: 'DDoS-RSTFINFlood', 10: 'DDoS-SYN_Flood', 11: 'DDoS-SlowLoris', 12: 'DDoS-SynonymousIP_Flood', 13: 'DDoS-TCP_Flood', 14: 'DDoS-UDP_Flood', 15: 'DDoS-UDP_Fragmentation', 16: 'DNS_Spoofing', 17: 'DictionaryBruteForce', 18: 'DoS-HTTP_Flood', 19: 'DoS-SYN_Flood', 20: 'DoS-TCP_Flood', 21: 'DoS-UDP_Flood', 22: 'MITM-ArpSpoofing', 23: 'Mirai-greeth_flood', 24: 'Mirai-greip_flood', 25: 'Mirai-udpplain', 26: 'Recon-HostDiscovery', 27: 'Recon-OSScan', 28: 'Recon-PingSweep', 29: 'Recon-PortScan', 30: 'SqlInjection', 31: 'Uploading_Attack', 32: 'VulnerabilityScan', 33: 'XSS'}
After encoding:
First instance of Backdoor_Malware (code 0):
flow_duration         57.674651
Header_Length       4090.900000
Protocol Type          6.600000
Duration             127.200000
Rate

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
214640,0.030366,2.22,46.59,64.0,2.964837,2.964837,0.0,0.0,0.0,0.0,...,8.571046,586.82,83693800.0,9.5,34.345174,12.160495,1059.069333,0.07,141.55,23
70255,0.0,54.0,6.0,64.0,10.733352,10.733352,0.0,0.0,0.0,0.0,...,0.0,54.0,82951230.0,9.5,10.392305,0.0,0.0,0.0,141.55,20
138876,0.0,0.0,1.0,64.0,4.603006,4.603006,0.0,0.0,0.0,0.0,...,0.0,42.0,83124830.0,9.5,9.165151,0.0,0.0,0.0,141.55,6
223653,0.020286,84.17,6.11,64.0,3.751915,3.751915,0.0,0.0,0.0,0.0,...,3.450844,56.47,83331830.0,9.5,10.470297,4.898925,200.336121,0.06,141.55,8
28578,0.077025,34204.5,16.84,65.91,8881.473845,8881.473845,0.0,0.0,0.0,0.0,...,0.043589,50.2,83011750.0,9.5,10.0,0.0,0.0,0.0,141.55,21


## X, y Split

In [8]:
# Features (every column except 'label') and target ('label'), keyed by class structure.
X = {_class: all_data[_class].drop(columns='label') for _class in apply_classes}
y = {_class: all_data[_class]['label'] for _class in apply_classes}

# Report the shapes for the first configured class structure.
first_class = apply_classes[0]
print(f'X: {X[first_class].shape}, y: {y[first_class].shape}')

X: (1200712, 46), y: (1200712,)


# Sampling

In [9]:
if apply_sampling is not None:
    
    undersampler = None
    oversampler = None
    
    for sampler in apply_sampling:
        match apply_sampling:
            case 'RandomOverSampler':
                from imblearn.over_sampling import RandomOverSampler
                oversampler = RandomOverSampler(random_state=42)
            case 'RandomUnderSampler':
                from imblearn.under_sampling import RandomUnderSampler
                undersampler = RandomUnderSampler(random_state=42)
            case 'SMOTENC':
                from imblearn.over_sampling import SMOTENC
                cat_cols = [
                    'Protocol Type', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
                    'psh_flag_number', 'ack_flag_number', 'ece_flag_number',
                    'cwr_flag_number', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
                    'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP',
                    'ICMP', 'IPv', 'LLC'
                ]
                oversampler = SMOTENC(categorical_features=cat_cols, random_state=42)
            case 'Clustering':
                from imblearn.under_sampling import ClusterCentroids
                undersampler = ClusterCentroids(random_state=42)
    
    for _class in apply_classes:
        if undersampler is not None:
            X[_class], y[_class] = undersampler.fit_resample(X[_class], y[_class])  
        if oversampler is not None:
            X[_class], y[_class] = oversampler.fit_resample(X[_class], y[_class])
    
    print(f'X: {X[apply_classes[0]].shape}, y: {y[apply_classes[0]].shape}')
else:
    print('No sampling selected.')

X: (6299180, 46), y: (6299180,)


In [12]:
# Join the resampled features and labels back into one frame per class structure
all_data_resampled = {
    _class: pd.concat([X[_class], y[_class]], axis=1)
    for _class in apply_classes
}

print("Resampled Data (UNSCALED):")
resampled_preview = all_data_resampled[apply_classes[0]]
for label, code in class_codes.items():
    # Show the first row carrying this class code
    print(f"First instance of {label} (code {code}):")
    rows_for_code = resampled_preview[resampled_preview['label'] == code]
    print(rows_for_code.iloc[0])

Resampled Data (SCALED):
First instance of Backdoor_Malware (code 0):
flow_duration         57.674651
Header_Length       4090.900000
Protocol Type          6.600000
Duration             127.200000
Rate                  51.419116
Srate                 51.419116
Drate                  0.000000
fin_flag_number        0.000000
syn_flag_number        0.000000
rst_flag_number        0.000000
psh_flag_number        0.000000
ack_flag_number        1.000000
ece_flag_number        0.000000
cwr_flag_number        0.000000
ack_count              0.400000
syn_count              0.700000
fin_count              0.000000
urg_count             11.700000
rst_count             21.000000
HTTP                   1.000000
HTTPS                  0.000000
DNS                    0.000000
Telnet                 0.000000
SMTP                   0.000000
SSH                    0.000000
IRC                    0.000000
TCP                    1.000000
UDP                    0.000000
DHCP                   0.000000
AR

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
count,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,...,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0
mean,123.8829,217724.2,10.35427,79.9247,5669.31,5669.31,1.404651e-06,0.02975117,0.1124045,0.04817198,...,183.2557,340.2319,82461840.0,9.456774,21.65222,258.9545,153256.7,0.5462021,140.435,16.5
std,1089.516,781346.4,9.810282,32.63418,70067.18,70067.18,0.0009927696,0.1699001,0.3158635,0.2141295,...,312.8897,446.4293,53571140.0,2.568699,13.89509,442.6446,581692.5,0.4362739,66.14306,9.810709
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0,0.0
25%,0.002667084,58.0,6.0,64.0,3.013854,3.013854,0.0,0.0,0.0,0.0,...,0.0,54.08,82972990.0,9.5,10.3979,0.0,0.0,0.0,141.55,8.0
50%,1.403468,2921.3,6.17,64.0,20.86955,20.86955,0.0,0.0,0.0,0.0,...,41.48171,114.8,83253470.0,9.5,15.17666,58.65689,2782.999,0.8,141.55,16.5
75%,30.82904,35514.2,10.9,84.5,91.41608,91.41608,0.0,0.0,0.0,0.0,...,236.8902,556.64,83681070.0,9.5,33.28663,335.2353,99285.77,0.95,141.55,25.0
max,68358.93,9797334.0,47.0,255.0,8388608.0,8388608.0,1.270713,1.0,1.0,1.0,...,8440.653,6726.8,167639400.0,15.0,124.2106,11936.89,81509960.0,1.0,244.6,33.0


## Real vs Resampled Dataset Analysis

In [13]:
# Summary statistics of the resampled data for the first configured class structure
first_resampled_frame = all_data_resampled[apply_classes[0]]
first_resampled_frame.describe()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
count,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,...,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0,6299180.0
mean,123.8829,217724.2,10.35427,79.9247,5669.31,5669.31,1.404651e-06,0.02975117,0.1124045,0.04817198,...,183.2557,340.2319,82461840.0,9.456774,21.65222,258.9545,153256.7,0.5462021,140.435,16.5
std,1089.516,781346.4,9.810282,32.63418,70067.18,70067.18,0.0009927696,0.1699001,0.3158635,0.2141295,...,312.8897,446.4293,53571140.0,2.568699,13.89509,442.6446,581692.5,0.4362739,66.14306,9.810709
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0,0.0
25%,0.002667084,58.0,6.0,64.0,3.013854,3.013854,0.0,0.0,0.0,0.0,...,0.0,54.08,82972990.0,9.5,10.3979,0.0,0.0,0.0,141.55,8.0
50%,1.403468,2921.3,6.17,64.0,20.86955,20.86955,0.0,0.0,0.0,0.0,...,41.48171,114.8,83253470.0,9.5,15.17666,58.65689,2782.999,0.8,141.55,16.5
75%,30.82904,35514.2,10.9,84.5,91.41608,91.41608,0.0,0.0,0.0,0.0,...,236.8902,556.64,83681070.0,9.5,33.28663,335.2353,99285.77,0.95,141.55,25.0
max,68358.93,9797334.0,47.0,255.0,8388608.0,8388608.0,1.270713,1.0,1.0,1.0,...,8440.653,6726.8,167639400.0,15.0,124.2106,11936.89,81509960.0,1.0,244.6,33.0


In [14]:
# Make sure the output directory exists before any report is written, so a
# fresh checkout does not fail on the first export.
REPORT_DIRECTORY = './profile_reports'
os.makedirs(REPORT_DIRECTORY, exist_ok=True)

# One comparison report (original vs. resampled data) per class structure.
for _class in apply_classes:
    original_report = ProfileReport(all_data[_class], title=f'{_class} Original Data', minimal=True)
    resampled_report = ProfileReport(all_data_resampled[_class], title=f'{_class} Resampled Data', minimal=True)
    comparison_report = original_report.compare(resampled_report)
    comparison_report.to_file(f'{REPORT_DIRECTORY}/{apply_sampling}_{_class}_resampling_report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Evaluator Model

## Imports

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Preprocessing
### Scaling Numerical Features

In [16]:
# Columns that are standardised; every other column is left untouched.
num_cols = [
    'flow_duration', 'Header_Length',  'Duration', 'Rate', 'Srate', 'ack_count', 'syn_count', 'fin_count',
    'urg_count', 'rst_count', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight'
]

# NOTE(review): the scalers are fitted on the complete resampled data, before the
# train/test split in the "Splitting" section, so test rows influence the
# scaling statistics - consider fitting on the training portion only.

# Keep one fitted scaler per class structure; a single shared instance would be
# refitted on every iteration and only retain the last structure's statistics.
scalers = {}
for _class in apply_classes:
    missing_cols = [col for col in num_cols if col not in all_data_resampled[_class].columns]
    if missing_cols:
        raise KeyError(f'Columns missing from the {_class} data: {missing_cols}')

    scalers[_class] = StandardScaler()
    all_data_resampled[_class][num_cols] = scalers[_class].fit_transform(all_data_resampled[_class][num_cols])

# Original name, bound to the scaler fitted last (as it was before).
scaler = scalers[apply_classes[-1]]

### Splitting

In [30]:
# Train/test partitions, keyed by class structure.
X_train = {}
X_test = {}
y_train = {}
y_test = {}

# NOTE(review): resampling runs before this split, so rows duplicated by an
# oversampler can presumably end up in both the training and the test
# partition - confirm whether the reported metrics should be read with that in mind.

for _class in apply_classes:
    # Use dedicated local names: rebinding `X` / `y` here would replace the
    # per-class dictionaries built earlier and break re-running those cells.
    features = all_data_resampled[_class].drop('label', axis=1)
    target = all_data_resampled[_class]['label']

    X_train[_class], X_test[_class], y_train[_class], y_test[_class] = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

print(f'X_train: {X_train[apply_classes[0]].shape}, y_train: {y_train[apply_classes[0]].shape}, X_test: {X_test[apply_classes[0]].shape}, y_test: {y_test[apply_classes[0]].shape}')

X_train: (5039344, 46), y_train: (5039344,), X_test: (1259836, 46), y_test: (1259836,)


## Training

In [None]:
for evaluator_type in apply_evaluators:
    match evaluator_type:
        case 'XGBoost':
            from xgboost import XGBClassifier
            evaluator = XGBClassifier()
        case 'LogisticRegression':
            from sklearn.linear_model import LogisticRegression
            evaluator = LogisticRegression(random_state=42, n_jobs=-1)
        case 'Perceptron':
            from sklearn.linear_model import Perceptron
            evaluator = Perceptron(random_state=42, n_jobs=-1)
        case 'AdaBoost':
            from sklearn.ensemble import AdaBoostClassifier
            evaluator = AdaBoostClassifier(random_state=42, algorithm='SAMME')
        case 'RandomForest':
            from sklearn.ensemble import RandomForestClassifier
            evaluator = RandomForestClassifier(random_state=42, n_jobs=-1)
        case 'DeepNeuralNetwork':
            from sklearn.neural_network import MLPClassifier
            evaluator = MLPClassifier(random_state=42)
        case 'KNearestNeighbor':
            from sklearn.neighbors import KNeighborsClassifier
            evaluator = KNeighborsClassifier(n_jobs=-1)
        case _:
            print(f'Invalid evaluator model: {evaluator_type}')
    
    
    
    for _class in apply_classes:
        print(f'{datetime.now()} : Training {evaluator_type} on {_class} classes')
        evaluator.fit(X_train[_class], y_train[_class])
    
        print(f'{datetime.now()} : Predicting {evaluator_type} on {_class} classes')
        y_pred = evaluator.predict(X_test[_class])
    
        print(f'{evaluator_type} {_class} Metrics')
        print(f'   Accuracy: {accuracy_score(y_test[_class], y_pred)}')
        print(f'   Precision: {precision_score(y_test[_class], y_pred, average='weighted', zero_division=0.0)}')
        print(f'   Recall: {recall_score(y_test[_class], y_pred, average='weighted')}')
        print(f'   F1: {f1_score(y_test[_class], y_pred, average='weighted')}')
        print()

2024-05-09 11:04:22.336080 : Training Perceptron on 33+1 classes
2024-05-09 11:06:57.512421 : Predicting Perceptron on 33+1 classes
Perceptron 33+1 Metrics
   Accuracy: 0.4360908880203455
   Precision: 0.5661272082039608
   Recall: 0.4360908880203455
   F1: 0.3907401297570423

2024-05-09 11:06:59.457092 : Training Perceptron on 8+1 classes
2024-05-09 11:07:53.872813 : Predicting Perceptron on 8+1 classes
Perceptron 8+1 Metrics
   Accuracy: 0.5840778053693482
   Precision: 0.6168154798262434
   Recall: 0.5840778053693482
   F1: 0.570434646547599

2024-05-09 11:07:55.545708 : Training Perceptron on 1+1 classes
2024-05-09 11:07:59.221922 : Predicting Perceptron on 1+1 classes
Perceptron 1+1 Metrics
   Accuracy: 0.9725634201682106
   Precision: 0.9727818072792449
   Recall: 0.9725634201682106
   F1: 0.9725599456715085

2024-05-09 11:07:59.832133 : Training XGBoost on 33+1 classes


## Model Analysis

In [None]:
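# Disabled, needs re-implementation: `y_test` is now a dict keyed by class structure and
# `y_pred` only holds the predictions of the last model trained, so the single
# confusion matrix below no longer matches the data structures used above.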
# 
# from sklearn.metrics import confusion_matrix
# 
# cm = pd.DataFrame(confusion_matrix(y_test, y_pred), columns = full_label_encoder.classes_)
# cm.insert(0, column='Actual', value=full_label_encoder.classes_)
# cm