# Setup
## Parameters

In [2]:
label_classes = ['33+1', '7+1', '1+1']  # Classes: '33+1', '7+1', '1+1'

sampling_method = 'ClusterCentroids'   # Samplers: 'None', 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', 'ClusterCentroids'
# Note: SMOTE is O(n^2) to O(n^3). Using a sample size of 1 CSV file takes ~10 min per class to oversample on a OC'd 9700k.
#       Limit the sample size to 1 CSV file unless you want to measure completion time on a geologic timescale.

csv_sample_size = 5

# Random Seeds:
file_seed = 42
sampler_seed = 42

# Import/Export Directories
dataset_directory = '../dataset/'
metrics_directory = './metrics'

In [3]:
# Notebook parameter validation
sampler_categories = ['None', 'RandomOverSampler', 'RandomUnderSampler', 'SMOTE', 'ClusterCentroids']
label_categories = ['33+1', '7+1', '1+1']

for _class in label_classes:
    if _class not in label_categories:
        assert False, f'{_class} is an invalid class structure.'

if sampling_method not in sampler_categories:
    assert False, f'{sampling_method} is an invalid sampling method.'

# Common Packages

In [4]:
import os
import pandas as pd
import pickle
import random
import time
import subprocess
from IPython.display import display
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from ydata_profiling import ProfileReport
from tqdm import tqdm

# Dataset Handling
## Loading

In [5]:
 # If your dataset is within your python project directory, change this to the relative path to your dataset
all_csv_filepaths = [filename for filename in os.listdir(dataset_directory) if filename.endswith('.csv')]

print(f'CSVs in {dataset_directory}: {len(all_csv_filepaths)}')

# If there are more than X CSV files, randomly select X files from the list

random.seed(file_seed)
if len(all_csv_filepaths) > csv_sample_size:
    csv_filepaths = random.sample(all_csv_filepaths, csv_sample_size)
csv_filepaths.sort()

full_data = pd.DataFrame()
for i, file in enumerate(csv_filepaths):
    print(f'{file} : File {i+1} out of {len(csv_filepaths)}')
    data_path = os.path.join(dataset_directory, file)
    df = pd.read_csv(data_path)
    full_data = pd.concat([full_data, df], axis=0)

print(f'\nDataset Shape: {full_data.shape}')

# Print the number of unique labels
print(f"There are {full_data['label'].nunique()} unique labels in the dataset.")

# prints an instance of each class
print('\nBefore encoding:')

display(full_data.drop_duplicates(subset=['label'], inplace=False).sort_values('label'))
display(full_data.describe())

CSVs in ../dataset/: 169
part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv : File 1 out of 5
part-00028-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv : File 2 out of 5
part-00062-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv : File 3 out of 5
part-00070-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv : File 4 out of 5
part-00163-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv : File 5 out of 5

Dataset Shape: (1389408, 47)
There are 34 unique labels in the dataset.

Before encoding:


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
589,182.737697,851457.0,6.0,247.0,4.232843,4.232843,0.0,0.0,0.0,0.0,...,0.0,1494.0,0.0001765966,5.5,54.662601,0.0,0.0,0.0,38.5,Backdoor_Malware
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,166520200.0,13.5,41.470705,2419.498399,2944407.0,1.0,244.6,BenignTraffic
3165,71.564239,13996.3,6.4,117.5,1.453585,1.453585,0.0,0.0,0.0,0.0,...,177.937593,106.8,166845800.0,13.5,17.065125,252.1258,31932.41,1.0,244.6,BrowserHijacking
20000,1.96379,2639521.0,6.0,57.0,1054.355838,1054.355838,0.0,0.0,0.0,0.0,...,325.869877,790.0,167638900.0,13.5,51.918921,462.351581,184001.9,0.6,244.6,CommandInjection
8,0.0,741.86,5.88,62.72,9.228077,9.228077,0.0,0.0,0.0,0.0,...,542.128415,926.32,83336120.0,9.5,42.640569,766.695299,311354.9,0.95,141.55,DDoS-ACK_Fragmentation
2953,9.410321,4246.65,6.11,67.59,2.815156,2.815156,0.0,0.0,0.0,0.0,...,334.607028,399.0,83158170.0,9.5,22.701259,471.751159,236887.6,0.87,141.55,DDoS-HTTP_Flood
4,0.0,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.0,42.0,83132130.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS-ICMP_Flood
96,0.024429,4.44,1.09,62.92,2.547955,2.547955,0.0,0.0,0.0,0.0,...,548.427959,906.18,83283950.0,9.5,43.11089,775.681256,316840.4,0.95,141.55,DDoS-ICMP_Fragmentation
25,5.44112,108.0,6.0,64.0,0.367572,0.367572,0.0,0.0,0.0,0.0,...,0.0,54.0,83314580.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-PSHACK_Flood
5,0.0,54.0,6.0,64.0,1.249915,1.249915,0.0,1.0,0.0,1.0,...,0.0,54.0,83348320.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-RSTFINFlood


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
count,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0,...,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0,1389408.0
mean,5.547284,76948.21,9.071959,66.3537,9100.031,9100.031,2.972157e-06,0.08666857,0.2068197,0.09048098,...,124.8526,33.32764,124.8652,83201530.0,9.499315,13.13095,47.09721,30460.5,0.0967093,141.5341
std,250.7318,462626.9,8.960943,14.01211,100538.5,100538.5,0.001116505,0.2813489,0.4050252,0.2868697,...,240.853,159.62,241.1862,17066180.0,0.8201526,8.636618,225.7483,319010.9,0.2334036,21.09603
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0
25%,0.0,54.0,6.0,64.0,2.098604,2.098604,0.0,0.0,0.0,0.0,...,50.0,0.0,50.0,83071570.0,9.5,10.0,0.0,0.0,0.0,141.55
50%,0.0,54.0,6.0,64.0,15.76732,15.76732,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,83124530.0,9.5,10.3923,0.0,0.0,0.0,141.55
75%,0.1048333,297.7725,14.44,64.0,117.8035,117.8035,0.0,0.0,0.0,0.0,...,54.05258,0.3779892,54.06,83343990.0,9.5,10.39675,0.5059213,1.387757,0.08,141.55
max,61533.51,9814697.0,47.0,255.0,8388608.0,8388608.0,0.8910143,1.0,1.0,1.0,...,8015.998,8643.503,6871.6,167639400.0,15.0,119.8546,12223.76,85610220.0,1.0,244.6


In [6]:
print(full_data['label'].value_counts())

label
DDoS-ICMP_Flood            214682
DDoS-UDP_Flood             161098
DDoS-TCP_Flood             133745
DDoS-PSHACK_Flood          121873
DDoS-SYN_Flood             120616
DDoS-RSTFINFlood           120522
DDoS-SynonymousIP_Flood    106717
DoS-UDP_Flood               98594
DoS-TCP_Flood               79332
DoS-SYN_Flood               60222
BenignTraffic               32822
Mirai-greeth_flood          29785
Mirai-udpplain              26642
Mirai-greip_flood           22351
DDoS-ICMP_Fragmentation     13536
MITM-ArpSpoofing             9178
DDoS-ACK_Fragmentation       8532
DDoS-UDP_Fragmentation       8490
DNS_Spoofing                 5423
Recon-HostDiscovery          3941
Recon-OSScan                 2918
Recon-PortScan               2422
DoS-HTTP_Flood               2134
VulnerabilityScan            1081
DDoS-HTTP_Flood               859
DDoS-SlowLoris                703
DictionaryBruteForce          403
BrowserHijacking              177
CommandInjection              155
SqlInjec

# Preprocessing
## Duplicating the dataset for multiple labeling schemes

In [7]:
all_data = {}

for _class in label_classes:
    all_data[_class] = full_data.copy()
    
    match _class:            
        case '7+1':
            label_categories = {
                'Backdoor_Malware': 'Web',
                'BenignTraffic': 'Benign',
                'BrowserHijacking': 'Web',
                'CommandInjection': 'DDoS',
                'DDoS-ACK_Fragmentation': 'DDoS',
                'DDoS-HTTP_Flood': 'DDoS',
                'DDoS-ICMP_Flood': 'DDoS',
                'DDoS-ICMP_Fragmentation': 'DDoS',
                'DDoS-PSHACK_Flood': 'DDoS',
                'DDoS-RSTFINFlood': 'DDoS',
                'DDoS-SYN_Flood': 'DDoS',
                'DDoS-SlowLoris': 'DDoS',
                'DDoS-SynonymousIP_Flood': 'DDoS',
                'DDoS-TCP_Flood': 'DDoS',
                'DDoS-UDP_Flood': 'DDoS',
                'DDoS-UDP_Fragmentation': 'DDoS',
                'DNS_Spoofing': 'Spoofing',
                'DictionaryBruteForce': 'BruteForce',
                'DoS-HTTP_Flood': 'DoS',
                'DoS-SYN_Flood': 'DoS',
                'DoS-TCP_Flood': 'DoS',
                'DoS-UDP_Flood': 'DoS',
                'MITM-ArpSpoofing': 'Spoofing',
                'Mirai-greeth_flood': 'Mirai',
                'Mirai-greip_flood': 'Mirai',
                'Mirai-udpplain': 'Mirai',
                'Recon-HostDiscovery': 'Recon',
                'Recon-OSScan': 'Recon',
                'Recon-PingSweep': 'Recon',
                'Recon-PortScan': 'Recon',
                'SqlInjection': 'Web',
                'Uploading_Attack': 'Web',
                'VulnerabilityScan': 'Recon',
                'XSS': 'Web'
            }
            all_data['7+1']['label'] = all_data['7+1']['label'].map(label_categories)
            
        case '1+1':
            all_data['1+1'].loc[all_data['1+1']['label'] != 'BenignTraffic', 'label'] = 'Attack'
            all_data['1+1'].loc[all_data['1+1']['label'] == 'BenignTraffic', 'label'] = 'Benign'

for _class in label_classes:
    print(f'{_class} Labels')
    display(all_data[_class].head(5))

33+1 Labels


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.0,42.0,83006880.0,9.5,9.165151,0.0,0.0,0.0,141.55,DoS-UDP_Flood
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,166520200.0,13.5,41.470705,2419.498399,2944407.0,1.0,244.6,BenignTraffic
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.0,54.0,83361420.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-SynonymousIP_Flood
3,0.0,54.0,6.0,64.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,54.0,83089920.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-SYN_Flood
4,0.0,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.0,42.0,83132130.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS-ICMP_Flood


7+1 Labels


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.0,42.0,83006880.0,9.5,9.165151,0.0,0.0,0.0,141.55,DoS
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,166520200.0,13.5,41.470705,2419.498399,2944407.0,1.0,244.6,Benign
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.0,54.0,83361420.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS
3,0.0,54.0,6.0,64.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,54.0,83089920.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS
4,0.0,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.0,42.0,83132130.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS


1+1 Labels


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.0,42.0,83006880.0,9.5,9.165151,0.0,0.0,0.0,141.55,Attack
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,166520200.0,13.5,41.470705,2419.498399,2944407.0,1.0,244.6,Benign
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.0,54.0,83361420.0,9.5,10.392305,0.0,0.0,0.0,141.55,Attack
3,0.0,54.0,6.0,64.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,54.0,83089920.0,9.5,10.392305,0.0,0.0,0.0,141.55,Attack
4,0.0,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.0,42.0,83132130.0,9.5,9.165151,0.0,0.0,0.0,141.55,Attack



## Encoding Labels

In [14]:
for _class in label_classes:    
    match _class:
        case '33+1':
            full_label_encoder = LabelEncoder()
            all_data['33+1']['label'] = full_label_encoder.fit_transform(all_data['33+1']['label'])
            label_mapping = {index: label for index, label in enumerate(full_label_encoder.classes_)}
            
            with open("./label_encoders/full_label_encoder.pkl", "wb") as encoder_file:
                pickle.dump(full_label_encoder, encoder_file)
                
            print(f'{_class } Label mappings: {label_mapping}')
            
        case '7+1':
            group_label_encoder = LabelEncoder()
            all_data['7+1']['label'] = group_label_encoder.fit_transform(all_data['7+1']['label'])
            label_mapping = {index: label for index, label in enumerate(group_label_encoder.classes_)}
            
            with open("./label_encoders/group_label_encoder.pkl", "wb") as encoder_file:
                 pickle.dump(group_label_encoder, encoder_file)
                
            print(f'{_class } Label mappings: {label_mapping}')
            
        case '1+1':
            binary_label_encoder = LabelEncoder()
            all_data['1+1']['label'] = binary_label_encoder.fit_transform(all_data['1+1']['label'])
            label_mapping = {index: label for index, label in enumerate(binary_label_encoder.classes_)}
            
            with open("./label_encoders/binary_label_encoder.pkl", "wb") as encoder_file:
                pickle.dump(binary_label_encoder, encoder_file)
                
            print(f'{_class } Label mappings: {label_mapping}')

# Print specific instances after label encoding
print("\nAfter encoding:")
for _class in label_classes:
    print(f'{_class} Labels')
    display(all_data[_class].drop_duplicates(subset=['label']).sort_values('label'))

33+1 Label mappings: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33}
7+1 Label mappings: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7}
1+1 Label mappings: {0: 0, 1: 1}

After encoding:
33+1 Labels


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
589,182.737697,851457.0,6.0,247.0,4.232843,4.232843,0.0,0.0,0.0,0.0,...,0.0,1494.0,0.0001765966,5.5,54.662601,0.0,0.0,0.0,38.5,0
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,166520200.0,13.5,41.470705,2419.498399,2944407.0,1.0,244.6,1
3165,71.564239,13996.3,6.4,117.5,1.453585,1.453585,0.0,0.0,0.0,0.0,...,177.937593,106.8,166845800.0,13.5,17.065125,252.1258,31932.41,1.0,244.6,2
20000,1.96379,2639521.0,6.0,57.0,1054.355838,1054.355838,0.0,0.0,0.0,0.0,...,325.869877,790.0,167638900.0,13.5,51.918921,462.351581,184001.9,0.6,244.6,3
8,0.0,741.86,5.88,62.72,9.228077,9.228077,0.0,0.0,0.0,0.0,...,542.128415,926.32,83336120.0,9.5,42.640569,766.695299,311354.9,0.95,141.55,4
2953,9.410321,4246.65,6.11,67.59,2.815156,2.815156,0.0,0.0,0.0,0.0,...,334.607028,399.0,83158170.0,9.5,22.701259,471.751159,236887.6,0.87,141.55,5
4,0.0,0.0,1.0,64.0,3.178249,3.178249,0.0,0.0,0.0,0.0,...,0.0,42.0,83132130.0,9.5,9.165151,0.0,0.0,0.0,141.55,6
96,0.024429,4.44,1.09,62.92,2.547955,2.547955,0.0,0.0,0.0,0.0,...,548.427959,906.18,83283950.0,9.5,43.11089,775.681256,316840.4,0.95,141.55,7
25,5.44112,108.0,6.0,64.0,0.367572,0.367572,0.0,0.0,0.0,0.0,...,0.0,54.0,83314580.0,9.5,10.392305,0.0,0.0,0.0,141.55,8
5,0.0,54.0,6.0,64.0,1.249915,1.249915,0.0,1.0,0.0,1.0,...,0.0,54.0,83348320.0,9.5,10.392305,0.0,0.0,0.0,141.55,9


7+1 Labels


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,166520200.0,13.5,41.470705,2419.498399,2944407.0,1.0,244.6,0
14947,75.354027,736347.2,8.2,195.8,65.143734,65.143734,0.0,0.0,0.0,0.0,...,79.124445,230.4,0.002075195,5.5,13.805843,111.898864,20738.0,0.7,38.5,1
2,4.549627,108.0,6.0,64.0,0.762226,0.762226,0.0,0.0,1.0,0.0,...,0.0,54.0,83361420.0,9.5,10.392305,0.0,0.0,0.0,141.55,2
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.0,42.0,83006880.0,9.5,9.165151,0.0,0.0,0.0,141.55,3
9,0.14915,141558.08,17.0,64.0,15341.193417,15341.193417,0.0,0.0,0.0,0.0,...,0.0,554.0,83763030.0,9.5,33.286634,0.0,0.0,0.0,141.55,4
71,0.041108,9598.2,6.0,191.8,2642.452426,2642.452426,0.0,0.0,0.0,0.0,...,897.825981,922.8,166846600.0,13.5,28.004693,1272.775542,830444.4,1.0,244.6,5
34,195.816558,32701.8,9.3,150.2,3.486841,3.486841,0.0,0.0,0.0,0.0,...,51.032829,143.4,0.01942711,5.5,15.982143,72.171319,2921.517,0.9,38.5,6
249,21.235179,1268.6,10.5,75.4,15.794498,15.794498,0.0,0.0,0.0,0.0,...,120.346076,196.1,0.0154846,5.5,20.439088,170.195053,16122.26,0.9,38.5,7


1+1 Labels


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.625763,84.0,17.0,64.0,0.432361,0.432361,0.0,0.0,0.0,0.0,...,0.0,42.0,83006880.0,9.5,9.165151,0.0,0.0,0.0,141.55,0
1,76.135781,428611.8,8.2,151.9,42.352135,42.352135,0.0,0.0,0.0,0.0,...,1707.500922,655.0,166520200.0,13.5,41.470705,2419.498399,2944407.0,1.0,244.6,1


## X, y Splitting

In [36]:
X = {}
y = {}

for _class in label_classes:
    X[_class] = all_data[_class].drop('label', axis=1)
    y[_class] = all_data[_class]['label']

print(f'X: {X[label_classes[0]].shape}, y: {y[label_classes[0]].shape}')

X: (1389408, 46), y: (1389408,)


# Sampling

In [37]:
# Load previous metrics dataframe
try: 
    df_metrics = pd.read_json(path_or_buf=metrics_directory+'/sampler_metrics.json', orient='index')
    
except FileNotFoundError:
    df_metrics = pd.DataFrame(columns=['Sampler', 'Label Classes', 'Test Duration'])

df_metrics

Unnamed: 0,Sampler,Label Classes,Test Duration
5,RandomOverSampler,1+1,1.424515
3,RandomOverSampler,33+1,4.009733
12,RandomOverSampler,7+1,13.519539
4,RandomOverSampler,8+1,3.737586
2,RandomUnderSampler,1+1,0.289977
0,RandomUnderSampler,33+1,0.389955
1,RandomUnderSampler,8+1,0.313602
6,SMOTE,1+1,352.289062
8,SMOTE,33+1,2038.355977
7,SMOTE,8+1,2128.925626


In [38]:
if sampling_method != 'None':
    
    match sampling_method:
        case 'RandomOverSampler':
            from imblearn.over_sampling import RandomOverSampler
            sampler = RandomOverSampler(random_state=sampler_seed)
            
        case 'RandomUnderSampler':
            from imblearn.under_sampling import RandomUnderSampler
            sampler = RandomUnderSampler(random_state=sampler_seed)
            
        case 'SMOTE':
            from imblearn.over_sampling import SMOTENC
            cat_cols = [
                'Protocol Type', 'Drate', 'fin_flag_number', 'syn_flag_number', 
                'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ece_flag_number',
                'cwr_flag_number', 'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 
                'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC'
            ]
            sampler = SMOTENC(categorical_features=cat_cols, random_state=sampler_seed, n_jobs=-1)
            
        case 'ClusterCentroids':
            from imblearn.under_sampling import ClusterCentroids
            sampler = ClusterCentroids(random_state=sampler_seed)
            # There will be a warning for a memory leak on Windows with KML. The leak is apparently very small and 
            # limiting to 1 thread to avoid it will reduce the performance significantly. It is best to ignore. 
    
    
    
    for _class in label_classes:
        print(f'{datetime.now()}: Applying {sampling_method} to {_class} label dataset...')
        
        # start the hardware logging
        proc = subprocess.Popen(['python', '../synthetic_generation_GANs/GAN_analysis/hardwareAnalyzer.py'])
        
        # Train sampler and sample data
        start_time = time.time()
        X[_class], y[_class] = sampler.fit_resample(X[_class], y[_class])
        sampler_duration = time.time() - start_time
        
        # Ensure we kill the subprocess when done
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
        
        print(f'{datetime.now()}: {sampling_method} on {_class} label dataset completed.')
        print(f'{_class} : X: {X[_class].shape}, y: {y[_class].shape}')
        
        sampler_metrics = [sampling_method, _class, sampler_duration]
        
        # Add sampling duration to dataframe and display
        update_row = df_metrics.loc[(df_metrics['Sampler'] == sampling_method) &
                                    (df_metrics['Label Classes'] == _class)]
        
        if update_row.empty:    
            # No previous record
            df_metrics.loc[len(df_metrics.index)] = sampler_metrics
            
            print(f'{sampling_method} / {_class} Metrics')
            display(df_metrics.loc[len(df_metrics.index)-1])
        
        else:   
            # Previous record exists
            update_row = sampler_metrics
        
            print(f'{sampling_method} / {_class} Metrics')
            display(df_metrics.loc[(df_metrics['Sampler'] == sampler) &
                                    (df_metrics['Label Classes'] == _class)])
else:
    print('No sampling selected.')

2024-05-17 18:49:43.081045: Applying ClusterCentroids to 33+1 label dataset...




2024-05-17 18:49:49.816746: ClusterCentroids on 33+1 label dataset completed.
33+1 : X: (1088, 46), y: (1088,)
ClusterCentroids / 33+1 Metrics




Unnamed: 0,Sampler,Label Classes,Test Duration


2024-05-17 18:49:49.819829: Applying ClusterCentroids to 8+1 label dataset...
2024-05-17 18:50:40.475814: ClusterCentroids on 8+1 label dataset completed.
8+1 : X: (3224, 46), y: (3224,)
ClusterCentroids / 8+1 Metrics




Unnamed: 0,Sampler,Label Classes,Test Duration


2024-05-17 18:50:40.479882: Applying ClusterCentroids to 1+1 label dataset...


KeyboardInterrupt: 

## Save Sampler Metrics to File

In [None]:
df_metrics['Sampler'] = pd.Categorical(df_metrics['Sampler'], categories=sampler_categories)

df_metrics.sort_values(['Sampler', 'Label Classes'], inplace=True)

df_metrics.to_json(path_or_buf=metrics_directory+'/sampler_metrics.json', orient='index')

display(df_metrics)

In [None]:
# Recombine the resampled features and labels back
all_data_resampled = {}
for _class in label_classes:
    all_data_resampled[_class] = pd.concat([X[_class], y[_class]], axis=1)

print("Resampled Data (UNSCALED):")
for _class in label_classes:
    print(f'{_class} Labels')
    display(all_data[_class].drop_duplicates(subset=['label']).sort_values('label'))

### Update class label metrics file

In [None]:
# If there's no sampled_dataset_metrics.json, make a new one and store the unsampled dataset metrics
try: 
    df_label_counts = pd.read_json(path_or_buf=metrics_directory+'/sampling_label_counts.json', orient='index')
    
except FileNotFoundError:
    # schema:   Sampler | Label Classes | 0 | 1 | 2 | 3 | ... | 31 | 32 | 33
    df_label_counts = pd.DataFrame(columns=['Sampler', 'Label Classes'] + [str(i) for i in range(34)])

# Update dataframe
for label_class in label_classes:
    
    row_index = df_label_counts.index[(df_label_counts['Sampler'] == sampling_method) & (df_label_counts['Label Classes'] == label_class)]
    row_index = row_index.tolist()
    
    value_counts = all_data_resampled[label_class]['label'].value_counts()
    value_counts.sort_index(inplace=True)
    
    match len(row_index):
        case 0: # No previous record
            row_index = len(df_label_counts.index)
            df_label_counts.loc[row_index, 'Sampler'] = sampling_method
            df_label_counts.loc[row_index, 'Label Classes'] = label_class
                        
            for i in range(len(value_counts)):
                df_label_counts.loc[row_index, str(i)] = value_counts[i]
                
        case 1: # Update previous record
            for i in range(len(value_counts)):
                df_label_counts.loc[row_index, str(i)] = value_counts[i]
                
        case _:
            assert False, f'ERROR: {sampling_method} / {label_class} is duplicated. This should NOT happen.'
            
# Update file
df_label_counts.to_json(path_or_buf=metrics_directory+'/sampling_label_counts.json', orient='index')

display(df_label_counts)

## Exporting Resampled Dataset to File

In [14]:
for _class in label_classes:
    all_data_resampled[_class].to_parquet(
        path=f'./resampled_datasets/{sampling_method}_{_class}_resampled_dataset.parquet'
    )

## Real vs Resampled Dataset Analysis

In [15]:
all_data_resampled[label_classes[0]].describe()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
count,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0,...,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0,7299188.0
mean,116.928,217506.3,10.39298,79.6714,5387.146,5387.146,1.157435e-06,0.02981633,0.1119738,0.04812111,...,184.7299,334.1616,83521750.0,9.506723,21.58725,261.0329,155635.7,0.550174,141.7245,16.5
std,976.0585,776082.0,9.810671,32.10693,69214.25,69214.25,0.0006981647,0.1700803,0.3153343,0.2140221,...,315.8183,435.0052,53584310.0,2.568512,13.79854,446.7126,636795.4,0.4367754,66.15359,9.810709
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0,0.0
25%,0.002863438,58.0,6.0,64.0,3.037998,3.037998,0.0,0.0,0.0,0.0,...,0.0,54.08,82977520.0,9.5,10.39813,0.0,0.0,0.0,141.55,8.0
50%,1.316201,3000.7,6.16,64.0,20.14971,20.14971,0.0,0.0,0.0,0.0,...,40.8337,114.285,83254240.0,9.5,15.07503,57.50767,2721.608,0.81,141.55,16.5
75%,29.39338,34375.0,11.1,84.96,95.15429,95.15429,0.0,0.0,0.0,0.0,...,252.4264,554.0,83693840.0,9.5,33.27663,356.6228,113805.4,0.95,141.55,25.0
max,61533.51,9814697.0,47.0,255.0,8388608.0,8388608.0,0.8910143,1.0,1.0,1.0,...,8643.503,6871.6,167639400.0,15.0,119.8546,12223.76,85610220.0,1.0,244.6,33.0


### Generate Reports

In [None]:
for _class in label_classes:
    original_report = ProfileReport(all_data[_class], title=f'{_class} Original Data', minimal=True)
    resampled_report = ProfileReport(all_data_resampled[_class], title=f'{_class} Resampled Data', minimal=True)
    comparison_report = original_report.compare(resampled_report)
    comparison_report.to_file(f'./profile_reports/{sampling_method}_{_class}_resampling_report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]