# Setting Parameters

In [1]:
# Single source of truth for the options each notebook parameter accepts.
VALID_CLASSES = ('33+1', '8+1', '1+1')
VALID_SAMPLING = (None, 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', ['Clustering', 'SMOTE'])
VALID_EVALUATORS = ('XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost',
                    'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor')

# Classes: '33+1', '8+1', '1+1'
apply_classes = ['33+1', '8+1', '1+1']

# Samplers: None, 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', ['Clustering', 'SMOTE']
# Note: SMOTE is O(n^2) to O(n^3). Using a sample size of 1 CSV file takes ~10 min per class to oversample on an OC'd 9700k.
#       Limit the sample size to 1 CSV file unless you want to measure completion time on a geologic timescale.
apply_sampling = ['Clustering', 'SMOTE']    # Select ONE from above

# Evaluators: 'XGBoost', 'LogisticRegression', 'Perceptron', 'AdaBoost', 'RandomForest', 'DeepNeuralNetwork', 'KNearestNeighbor'
apply_evaluators = ['Perceptron', 'AdaBoost']


def validate_parameters(classes, sampling, evaluators):
    """Check the notebook parameters against the supported options.

    Args:
        classes: list of label groupings; each must be one of VALID_CLASSES.
        sampling: a single entry of VALID_SAMPLING (None, a sampler name, or
            the ['Clustering', 'SMOTE'] pair).
        evaluators: list of evaluator names; each must be in VALID_EVALUATORS.

    Raises:
        ValueError: if any parameter holds an unsupported value, or if no
            label grouping is selected. A real exception is used instead of
            `assert` so the check still runs under `python -O`.
    """
    if not classes:
        raise ValueError('apply_classes must contain at least one class structure.')

    for _class in classes:
        if _class not in VALID_CLASSES:
            raise ValueError(f'{_class} is an invalid class structure.')

    if sampling not in VALID_SAMPLING:
        raise ValueError(f'{sampling} is an invalid sampling method.')

    for evaluator_name in evaluators:
        if evaluator_name not in VALID_EVALUATORS:
            raise ValueError(f'{evaluator_name} is an invalid evaluator.')


# Notebook parameter validation
validate_parameters(apply_classes, apply_sampling, apply_evaluators)

# Dataset Handling
## Common Imports

In [2]:
import os
import pandas as pd
import random
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from ydata_profiling import ProfileReport
from tqdm import tqdm

## Loading the Dataset

In [3]:
DATASET_DIRECTORY = '../dataset/'  # If your dataset is within your python project directory, change this to the relative path to your dataset

# Sort the listing so the seeded sampling below does not depend on the
# order in which the operating system happens to return directory entries.
csv_filepaths = sorted(filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv'))

print(csv_filepaths)

# If there are more than X CSV files, randomly select X files from the list
sample_size = 1

if len(csv_filepaths) > sample_size:
    # A dedicated seeded generator keeps the file selection reproducible
    # (matching the random_state=42 used elsewhere) without touching the
    # global `random` state.
    csv_filepaths = random.Random(42).sample(csv_filepaths, sample_size)
    print(csv_filepaths)

csv_filepaths.sort()

# list of csv files used
data_sets = csv_filepaths

if not data_sets:
    raise FileNotFoundError(f'No CSV files found in {DATASET_DIRECTORY}')

# Read every file first and concatenate once; concatenating inside the loop
# re-copies the accumulated frame on each iteration.
frames = []
for position, data_set in enumerate(data_sets, start=1):
    print(f"data set {position} out of {len(data_sets)}: {data_set} \n")
    data_path = os.path.join(DATASET_DIRECTORY, data_set)
    frames.append(pd.read_csv(data_path))

# ignore_index gives every row a unique index label even when several files are loaded.
full_data = pd.concat(frames, ignore_index=True)

# prints an instance of each class
print("Before encoding:")
label_values = full_data['label'].unique()
for label in label_values:
    print(f"First instance of {label}:")
    print(full_data[full_data['label'] == label].iloc[0])

# Shuffle data
full_data = shuffle(full_data, random_state=42)

# prove if the data is loaded properly
print("Real data:")
print(full_data[:2])
print(full_data.shape)

# Number of distinct values in the 'label' column
unique_labels = full_data['label'].nunique()

# Print the number of unique labels
print(f"There are {unique_labels} unique labels in the dataset.")

class_counts = full_data['label'].value_counts()
print(class_counts)

# Summary statistics of the numeric columns
full_data.describe()

['part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00014-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00015-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00016-363d1ba3-8ab5-4f96-bc25-4d5

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
count,445425.0,445425.0,445425.0,445425.0,445425.0,445425.0,445425.0,445425.0,445425.0,445425.0,...,445425.0,445425.0,445425.0,445425.0,445425.0,445425.0,445425.0,445425.0,445425.0,445425.0
mean,5.612908,77506.56,9.095821,66.357249,8772.66,8772.66,6e-06,0.086814,0.20678,0.090792,...,124.46458,32.963125,124.434945,83194570.0,9.498975,13.115676,46.583525,30296.55,0.095964,141.525915
std,260.000899,465701.3,8.96729,14.006584,96411.35,96411.35,0.003123,0.281562,0.404997,0.287313,...,240.847022,159.228725,240.957556,17045770.0,0.819398,8.617158,225.211178,298232.4,0.231838,21.070995
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0
25%,0.0,54.0,6.0,64.0,2.099072,2.099072,0.0,0.0,0.0,0.0,...,50.0,0.0,50.0,83071570.0,9.5,10.0,0.0,0.0,0.0,141.55
50%,0.0,54.0,6.0,64.0,15.74101,15.74101,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,83124530.0,9.5,10.392305,0.0,0.0,0.0,141.55
75%,0.104157,310.01,14.8,64.0,118.4368,118.4368,0.0,0.0,0.0,0.0,...,54.050113,0.374406,54.06,83344000.0,9.5,10.396725,0.505921,1.363472,0.08,141.55
max,49718.694009,9815047.0,47.0,255.0,6291456.0,6291456.0,2.07295,1.0,1.0,1.0,...,6860.337778,5200.00805,5929.2,167639400.0,13.5,113.119698,7353.921909,30538430.0,1.0,244.6


# Preprocessing
## Duplicating data for classes

In [4]:
all_data = {}

for _class in apply_classes:
    all_data[_class] = full_data.copy()
    
    match _class:            
        case '8+1':
            label_categories = {
                'Backdoor_Malware': 'Web',
                'BenignTraffic': 'Benign',
                'BrowserHijacking': 'Web',
                'CommandInjection': 'DDoS',
                'DDoS-ACK_Fragmentation': 'DDoS',
                'DDoS-HTTP_Flood': 'DDoS',
                'DDoS-ICMP_Flood': 'DDoS',
                'DDoS-ICMP_Fragmentation': 'DDoS',
                'DDoS-PSHACK_Flood': 'DDoS',
                'DDoS-RSTFINFlood': 'DDoS',
                'DDoS-SYN_Flood': 'DDoS',
                'DDoS-SlowLoris': 'DDoS',
                'DDoS-SynonymousIP_Flood': 'DDoS',
                'DDoS-TCP_Flood': 'DDoS',
                'DDoS-UDP_Flood': 'DDoS',
                'DDoS-UDP_Fragmentation': 'DDoS',
                'DNS_Spoofing': 'Spoofing',
                'DictionaryBruteForce': 'BruteForce',
                'DoS-HTTP_Flood': 'DoS',
                'DoS-SYN_Flood': 'DoS',
                'DoS-TCP_Flood': 'DoS',
                'DoS-UDP_Flood': 'DoS',
                'MITM-ArpSpoofing': 'Spoofing',
                'Mirai-greeth_flood': 'Mirai',
                'Mirai-greip_flood': 'Mirai',
                'Mirai-udpplain': 'Mirai',
                'Recon-HostDiscovery': 'Recon',
                'Recon-OSScan': 'Recon',
                'Recon-PingSweep': 'Recon',
                'Recon-PortScan': 'Recon',
                'SqlInjection': 'Web',
                'Uploading_Attack': 'Web',
                'VulnerabilityScan': 'Recon',
                'XSS': 'Web'
            }
            all_data['8+1']['label'] = all_data['8+1']['label'].map(label_categories)
            
        case '1+1':
            all_data['1+1'].loc[all_data['1+1']['label'] != 'BenignTraffic', 'label'] = 'Attack'
            all_data['1+1'].loc[all_data['1+1']['label'] == 'BenignTraffic', 'label'] = 'Benign'

all_data[apply_classes[0]].head()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
267764,0.472307,76.14,6.0,64.0,1.531264,1.531264,0.0,0.0,1.0,0.0,...,0.0,54.0,83365310.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-SynonymousIP_Flood
192444,0.0,53.46,5.94,63.36,88.841668,88.841668,0.0,1.0,0.0,1.0,...,0.099279,54.06,83344170.0,9.5,10.394727,0.140855,0.142091,0.07,141.55,DDoS-RSTFINFlood
203607,5.089336,108.0,6.0,64.0,0.392979,0.392979,0.0,0.0,1.0,0.0,...,0.0,54.0,82985260.0,9.5,10.392305,0.0,0.0,0.0,141.55,DoS-SYN_Flood
189478,0.158949,14675.0,17.0,64.0,5578.350028,5578.350028,0.0,0.0,0.0,0.0,...,0.0,50.0,83102340.0,9.5,10.0,0.0,0.0,0.0,141.55,DDoS-UDP_Flood
444698,0.0,0.0,1.0,64.0,17.530025,17.530025,0.0,0.0,0.0,0.0,...,0.0,42.0,83127790.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS-ICMP_Flood


In [5]:
# Preview the second selected label grouping; shows nothing when fewer than two are configured.
all_data[apply_classes[1]].head() if len(apply_classes) > 1 else None

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
267764,0.472307,76.14,6.0,64.0,1.531264,1.531264,0.0,0.0,1.0,0.0,...,0.0,54.0,83365310.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS
192444,0.0,53.46,5.94,63.36,88.841668,88.841668,0.0,1.0,0.0,1.0,...,0.099279,54.06,83344170.0,9.5,10.394727,0.140855,0.142091,0.07,141.55,DDoS
203607,5.089336,108.0,6.0,64.0,0.392979,0.392979,0.0,0.0,1.0,0.0,...,0.0,54.0,82985260.0,9.5,10.392305,0.0,0.0,0.0,141.55,DoS
189478,0.158949,14675.0,17.0,64.0,5578.350028,5578.350028,0.0,0.0,0.0,0.0,...,0.0,50.0,83102340.0,9.5,10.0,0.0,0.0,0.0,141.55,DDoS
444698,0.0,0.0,1.0,64.0,17.530025,17.530025,0.0,0.0,0.0,0.0,...,0.0,42.0,83127790.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS


In [6]:
# Preview the third selected label grouping; shows nothing when fewer than three are configured.
all_data[apply_classes[2]].head() if len(apply_classes) > 2 else None

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
267764,0.472307,76.14,6.0,64.0,1.531264,1.531264,0.0,0.0,1.0,0.0,...,0.0,54.0,83365310.0,9.5,10.392305,0.0,0.0,0.0,141.55,Attack
192444,0.0,53.46,5.94,63.36,88.841668,88.841668,0.0,1.0,0.0,1.0,...,0.099279,54.06,83344170.0,9.5,10.394727,0.140855,0.142091,0.07,141.55,Attack
203607,5.089336,108.0,6.0,64.0,0.392979,0.392979,0.0,0.0,1.0,0.0,...,0.0,54.0,82985260.0,9.5,10.392305,0.0,0.0,0.0,141.55,Attack
189478,0.158949,14675.0,17.0,64.0,5578.350028,5578.350028,0.0,0.0,0.0,0.0,...,0.0,50.0,83102340.0,9.5,10.0,0.0,0.0,0.0,141.55,Attack
444698,0.0,0.0,1.0,64.0,17.530025,17.530025,0.0,0.0,0.0,0.0,...,0.0,42.0,83127790.0,9.5,9.165151,0.0,0.0,0.0,141.55,Attack



## Encoding Labels

In [7]:
# One fitted LabelEncoder per selected label grouping, keyed by grouping name.
label_encoders = {}
for _class in apply_classes:
    label_encoders[_class] = LabelEncoder()
    all_data[_class]['label'] = label_encoders[_class].fit_transform(all_data[_class]['label'])

# Per-grouping names kept for code that refers to them directly;
# a name is None when its grouping was not selected in apply_classes.
full_label_encoder = label_encoders.get('33+1')
class_label_encoder = label_encoders.get('8+1')
binary_label_encoder = label_encoders.get('1+1')

# The preview below (and class_codes) always describes the FIRST selected
# grouping, so the codes are taken from that grouping's own encoder.
preview_class = apply_classes[0]
preview_encoder = label_encoders[preview_class]

# Store label mappings (code -> label); a fitted encoder's codes are the positions in classes_
label_mapping = dict(enumerate(preview_encoder.classes_))
print("Label mappings:", label_mapping)

# Retrieve the numeric codes for classes (label -> code)
class_codes = {label: code for code, label in label_mapping.items()}

# Print specific instances after label encoding
print("After encoding:")
preview_data = all_data[preview_class]
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(preview_data[preview_data['label'] == code].iloc[0])

preview_data.head()

Label mappings: {0: 'Backdoor_Malware', 1: 'BenignTraffic', 2: 'BrowserHijacking', 3: 'CommandInjection', 4: 'DDoS-ACK_Fragmentation', 5: 'DDoS-HTTP_Flood', 6: 'DDoS-ICMP_Flood', 7: 'DDoS-ICMP_Fragmentation', 8: 'DDoS-PSHACK_Flood', 9: 'DDoS-RSTFINFlood', 10: 'DDoS-SYN_Flood', 11: 'DDoS-SlowLoris', 12: 'DDoS-SynonymousIP_Flood', 13: 'DDoS-TCP_Flood', 14: 'DDoS-UDP_Flood', 15: 'DDoS-UDP_Fragmentation', 16: 'DNS_Spoofing', 17: 'DictionaryBruteForce', 18: 'DoS-HTTP_Flood', 19: 'DoS-SYN_Flood', 20: 'DoS-TCP_Flood', 21: 'DoS-UDP_Flood', 22: 'MITM-ArpSpoofing', 23: 'Mirai-greeth_flood', 24: 'Mirai-greip_flood', 25: 'Mirai-udpplain', 26: 'Recon-HostDiscovery', 27: 'Recon-OSScan', 28: 'Recon-PingSweep', 29: 'Recon-PortScan', 30: 'SqlInjection', 31: 'Uploading_Attack', 32: 'VulnerabilityScan', 33: 'XSS'}
After encoding:
First instance of Backdoor_Malware (code 0):
flow_duration         0.532177
Header_Length      3595.300000
Protocol Type         6.000000
Duration             97.700000
Rate    

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
267764,0.472307,76.14,6.0,64.0,1.531264,1.531264,0.0,0.0,1.0,0.0,...,0.0,54.0,83365310.0,9.5,10.392305,0.0,0.0,0.0,141.55,12
192444,0.0,53.46,5.94,63.36,88.841668,88.841668,0.0,1.0,0.0,1.0,...,0.099279,54.06,83344170.0,9.5,10.394727,0.140855,0.142091,0.07,141.55,9
203607,5.089336,108.0,6.0,64.0,0.392979,0.392979,0.0,0.0,1.0,0.0,...,0.0,54.0,82985260.0,9.5,10.392305,0.0,0.0,0.0,141.55,19
189478,0.158949,14675.0,17.0,64.0,5578.350028,5578.350028,0.0,0.0,0.0,0.0,...,0.0,50.0,83102340.0,9.5,10.0,0.0,0.0,0.0,141.55,14
444698,0.0,0.0,1.0,64.0,17.530025,17.530025,0.0,0.0,0.0,0.0,...,0.0,42.0,83127790.0,9.5,9.165151,0.0,0.0,0.0,141.55,6


## X, y Splitting

In [8]:
# Per label grouping: X holds every column except 'label', y holds the 'label' column.
X = {_class: all_data[_class].drop(columns='label') for _class in apply_classes}
y = {_class: all_data[_class]['label'] for _class in apply_classes}

# Report the shapes for the first selected grouping as a sanity check.
first_class = apply_classes[0]
print(f'X: {X[first_class].shape}, y: {y[first_class].shape}')

X: (445425, 46), y: (445425,)


# Sampling

In [9]:
if apply_sampling is not None:
    
    undersampler = None
    oversampler = None
    
    if type(apply_sampling) is not list:
        match apply_sampling:
            case 'RandomOverSampler':
                from imblearn.over_sampling import RandomOverSampler
                oversampler = RandomOverSampler(random_state=42)
                
            case 'RandomUnderSampler':
                from imblearn.under_sampling import RandomUnderSampler
                undersampler = RandomUnderSampler(random_state=42)
                
            case 'SMOTE':
                from imblearn.over_sampling import SMOTENC
                cat_cols = [
                    'Protocol Type', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
                    'psh_flag_number', 'ack_flag_number', 'ece_flag_number',
                    'cwr_flag_number', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
                    'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP',
                    'ICMP', 'IPv', 'LLC'
                ]
                oversampler = SMOTENC(categorical_features=cat_cols, random_state=42, n_jobs=-1)
            
    else:   # apply_sampling is a list containing an undersampler and an oversampler
        for sampler in apply_sampling:
            match sampler:
                case 'SMOTE':
                    from imblearn.over_sampling import SMOTENC
                    cat_cols = [
                        'Protocol Type', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
                        'psh_flag_number', 'ack_flag_number', 'ece_flag_number',
                        'cwr_flag_number', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
                        'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP',
                        'ICMP', 'IPv', 'LLC'
                    ]
                    oversampler = SMOTENC(categorical_features=cat_cols, random_state=42, n_jobs=-1)
                    
                case 'Clustering':
                    from imblearn.under_sampling import ClusterCentroids
                    undersampler = ClusterCentroids(random_state=42)
    
    for _class in apply_classes:
        if undersampler is not None:
            print(f'{datetime.now()}: Applying undersampling to {_class} label grouping...')
            X[_class], y[_class] = undersampler.fit_resample(X[_class], y[_class])
            
        if oversampler is not None:
            print(f'{datetime.now()}: Applying oversampling to {_class} label grouping...') # Useful for tracking SMOTE oversampling completion times.
            X[_class], y[_class] = oversampler.fit_resample(X[_class], y[_class])
    
    print(f'X: {X[apply_classes[0]].shape}, y: {y[apply_classes[0]].shape}')
else:
    print('No sampling selected.')

2024-05-09 18:51:06.722449: Applying undersampling to 33+1 label grouping...




2024-05-09 18:51:08.233609: Applying oversampling to 33+1 label grouping...
2024-05-09 18:51:08.312115: Applying undersampling to 8+1 label grouping...




2024-05-09 18:51:13.449779: Applying oversampling to 8+1 label grouping...
2024-05-09 18:51:13.584290: Applying undersampling to 1+1 label grouping...
2024-05-09 18:59:37.639170: Applying oversampling to 1+1 label grouping...




X: (510, 46), y: (510,)


In [10]:
# Stitch each grouping's resampled features and labels back into one frame.
all_data_resampled = {
    _class: pd.concat([X[_class], y[_class]], axis=1)
    for _class in apply_classes
}

print("Resampled Data (UNSCALED):")
resampled_preview = all_data_resampled[apply_classes[0]]
for label, code in class_codes.items():
    # Show the first row carrying this class code in the first selected grouping.
    print(f"First instance of {label} (code {code}):")
    rows_for_code = resampled_preview[resampled_preview['label'] == code]
    print(rows_for_code.iloc[0])

Resampled Data (UNSCALED):
First instance of Backdoor_Malware (code 0):
flow_duration      3.974183e+02
Header_Length      6.399988e+03
Protocol Type      1.010000e+01
Duration           1.072375e+02
Rate               2.542702e+01
Srate              2.542702e+01
Drate              0.000000e+00
fin_flag_number    0.000000e+00
syn_flag_number    0.000000e+00
rst_flag_number    0.000000e+00
psh_flag_number   -1.387779e-17
ack_flag_number    5.000000e-01
ece_flag_number    0.000000e+00
cwr_flag_number    0.000000e+00
ack_count          8.750000e-02
syn_count          5.375000e-01
fin_count          3.750000e-02
urg_count          1.843750e+01
rst_count          3.690000e+01
HTTP               0.000000e+00
HTTPS              3.750000e-01
DNS                0.000000e+00
Telnet             0.000000e+00
SMTP               0.000000e+00
SSH                0.000000e+00
IRC                0.000000e+00
TCP                6.250000e-01
UDP                2.500000e-01
DHCP               0.000000e+00


## Real vs Resampled Dataset Analysis

In [11]:
# Summary statistics of the resampled frame for the first selected label grouping.
all_data_resampled[apply_classes[0]].describe()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
count,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,...,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0
mean,65.60543,628552.8,9.743165,84.44695,143670.7,143670.7,1.521917e-07,0.02535025,0.08669093,0.03257091,...,429.8321,588.106993,81279280.0,9.446321,29.168725,607.7932,683485.0,0.5749131,140.133986,16.5
std,181.9577,1432935.0,9.255085,31.634993,610937.9,610937.9,3.436973e-06,0.1545748,0.2718636,0.1642388,...,603.1723,573.581628,56647910.0,2.704695,16.516407,853.7792,2009730.0,0.4131479,68.955838,9.820341
min,-4.440892e-16,0.0,0.995556,25.6,-9.094947e-13,-9.094947e-13,-1.355253e-20,-1.084202e-18,-8.326673e-17,-1.457168e-16,...,-2.220446e-16,42.0,2.264977e-06,3.166667,9.165151,-3.552714e-15,-1.136868e-13,-1.387779e-17,22.055556,0.0
25%,0.07160757,139.9981,6.0,64.0,22.70499,22.70499,0.0,0.0,0.0,0.0,...,2.775174,77.7,50424650.0,8.789277,12.396269,3.86741,469.1409,0.06435673,117.43634,8.0
50%,3.97356,20834.2,6.421944,66.742111,148.1169,148.1169,0.0,0.0,0.0,0.0,...,247.8504,531.105,83033920.0,9.5,30.471228,350.5134,132139.5,0.8,141.55,16.5
75%,34.59219,428809.0,9.113333,100.025,1446.547,1446.547,0.0,0.0,0.0,0.0,...,589.1735,906.299524,99863510.0,9.5,41.310296,828.545,474111.8,0.9391892,141.55,25.0
max,1475.088,8289201.0,47.0,247.0,6249515.0,6249515.0,7.761777e-05,1.0,1.0,1.0,...,4405.425,3830.8,167639400.0,13.5,84.018889,6230.212,22762200.0,1.0,244.6,33.0


### Generate Reports

In [12]:
# Make sure the output directory exists before any report is written,
# so the cell also works on a fresh checkout.
os.makedirs('./profile_reports', exist_ok=True)

# For every label grouping, profile the original and the resampled frame and
# write a side-by-side comparison report as HTML.
for _class in apply_classes:
    original_report = ProfileReport(all_data[_class], title=f'{_class} Original Data', minimal=True)
    resampled_report = ProfileReport(all_data_resampled[_class], title=f'{_class} Resampled Data', minimal=True)
    comparison_report = original_report.compare(resampled_report)
    comparison_report.to_file(f'./profile_reports/{apply_sampling}_{_class}_resampling_report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Evaluator Model

## Imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Preprocessing
### Scaling Numerical Features

In [None]:
# Continuous columns that get standardised.
num_cols = [
    'flow_duration', 'Header_Length',  'Duration', 'Rate', 'Srate',
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
    'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight'
]

# NOTE(review): the scaler is fit on each full resampled frame, i.e. before the
# train/test split that follows - confirm this is intended for the evaluation.
scaler = StandardScaler()
for _class in apply_classes:
    resampled_frame = all_data_resampled[_class]
    # Refit per grouping and overwrite the numeric columns in place.
    resampled_frame[num_cols] = scaler.fit_transform(resampled_frame[num_cols])

### X, y Train/Test Splitting

In [None]:
# Train/test partitions, keyed by label grouping.
X_train = {}
X_test = {}
y_train = {}
y_test = {}

for _class in apply_classes:
    # Loop-local names: rebinding the notebook-level `X` / `y` dicts here would
    # break the earlier sampling and recombination cells if they are re-run.
    features = all_data_resampled[_class].drop('label', axis=1)
    labels = all_data_resampled[_class]['label']

    # 80/20 split with a fixed seed so the partitions are reproducible.
    X_train[_class], X_test[_class], y_train[_class], y_test[_class] = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

print(f'X_train: {X_train[apply_classes[0]].shape}, y_train: {y_train[apply_classes[0]].shape}, X_test: {X_test[apply_classes[0]].shape}, y_test: {y_test[apply_classes[0]].shape}')

## Training

In [None]:
for evaluator_type in apply_evaluators:
    match evaluator_type:
        case 'XGBoost':
            from xgboost import XGBClassifier
            evaluator = XGBClassifier()
        case 'LogisticRegression':
            from sklearn.linear_model import LogisticRegression
            evaluator = LogisticRegression(random_state=42, n_jobs=-1)
        case 'Perceptron':
            from sklearn.linear_model import Perceptron
            evaluator = Perceptron(random_state=42, n_jobs=-1)
        case 'AdaBoost':
            from sklearn.ensemble import AdaBoostClassifier
            evaluator = AdaBoostClassifier(random_state=42, algorithm='SAMME')
        case 'RandomForest':
            from sklearn.ensemble import RandomForestClassifier
            evaluator = RandomForestClassifier(random_state=42, n_jobs=-1)
        case 'DeepNeuralNetwork':
            from sklearn.neural_network import MLPClassifier
            evaluator = MLPClassifier(random_state=42)
        case 'KNearestNeighbor':
            from sklearn.neighbors import KNeighborsClassifier
            evaluator = KNeighborsClassifier(n_jobs=-1)
        case _:
            print(f'Invalid evaluator model: {evaluator_type}')
    
    
    
    for _class in apply_classes:
        # XGBoost for binary classification must be a binary objective
        if evaluator_type == 'XGBoost' and _class == '1+1':
            evaluator = XGBClassifier(objective='binary:logistic')
            
        print(f'{datetime.now()} : Training {evaluator_type} on {apply_sampling} balanced data with {_class} label classes')
        evaluator.fit(X_train[_class], y_train[_class])
    
        print(f'{datetime.now()} : Predicting {evaluator_type} on {_class} classes')
        y_pred = evaluator.predict(X_test[_class])
    
        print(f'{evaluator_type} {_class} Metrics')
        print(f'   Accuracy: {accuracy_score(y_test[_class], y_pred)}')
        print(f'   Precision: {precision_score(y_test[_class], y_pred, average='weighted', zero_division=0.0)}')
        print(f'   Recall: {recall_score(y_test[_class], y_pred, average='weighted')}')
        print(f'   F1: {f1_score(y_test[_class], y_pred, average='weighted')}')
        print()

## Model Analysis

In [None]:
# Disabled, needs multi-label-group and multi-estimator re-implementation
# 
# from sklearn.metrics import confusion_matrix
# 
# cm = pd.DataFrame(confusion_matrix(y_test, y_pred), columns = full_label_encoder.classes_)
# cm.insert(0, column='Actual', value=full_label_encoder.classes_)
# cm