# Imports

In [1]:
from imblearn.under_sampling import CondensedNearestNeighbour
import numpy as np
import os
import pandas as pd
import random
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle
from tqdm import tqdm

# Loading

In [2]:
DATASET_DIRECTORY = '../../dataset/'  # If your dataset is within your python project directory, change this to the relative path to your dataset

# Collect the CSV file names found directly inside DATASET_DIRECTORY.
# Sorted up front so the seeded sample below does not depend on the arbitrary
# order in which os.listdir returns entries.
csv_filepaths = sorted(
    filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv')
)

# Fail early with a clear message instead of a KeyError on 'label' further down.
if not csv_filepaths:
    raise FileNotFoundError(f"No CSV files found in {DATASET_DIRECTORY!r}")

print(csv_filepaths)

# If there are more than X CSV files, randomly select X files from the list
sample_size = 5

if len(csv_filepaths) > sample_size:
    # A dedicated, seeded generator keeps the selection identical across
    # "Restart Kernel -> Run All" without touching the global `random` state.
    csv_filepaths = random.Random(1).sample(csv_filepaths, sample_size)
    print(csv_filepaths)

csv_filepaths.sort()

# list of csv files used
data_sets = csv_filepaths

# Column groups used by the (currently commented-out) scaling cells.
# Names are spelled exactly as they appear in the CSV headers.
num_cols = [
    'flow_duration', 'Header_Length', 'Duration', 'Rate', 'Srate', 'ack_count', 'syn_count',
    'fin_count', 'urg_count', 'rst_count', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight'
]
cat_cols = [
    'Protocol Type', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
    'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'HTTP', 'HTTPS', 
    'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC'
]

# Read every selected file, then concatenate once. Concatenating inside the
# loop would re-copy the accumulated frame on each iteration.
frames = []
for position, data_set in enumerate(data_sets, start=1):
    print(f"data set {position} out of {len(data_sets)}: {data_set} \n")
    data_path = os.path.join(DATASET_DIRECTORY, data_set)
    df = pd.read_csv(data_path)
    frames.append(df)
# Row indices are kept as read from each file (no ignore_index), so index
# values repeat across files.
full_data = pd.concat(frames)

# prints an instance of each class
print("Before encoding:")
for label in full_data['label'].unique():
    print(f"First instance of {label}:")
    print(full_data[full_data['label'] == label].iloc[0])

# Shuffle data
full_data = shuffle(full_data, random_state=1)

# prove if the data is loaded properly
print("Real data:")
print(full_data[:2])
print(full_data.shape)

# Number of distinct values in the 'label' column of `full_data`
unique_labels = full_data['label'].nunique()

# Print the number of unique labels
print(f"There are {unique_labels} unique labels in the dataset.")

# Rows per class, most frequent first
class_counts = full_data['label'].value_counts()
print(class_counts)

# Summary statistics of the numeric columns
full_data.describe()

['part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00014-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00015-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00016-363d1ba3-8ab5-4f96-bc25-4d5

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
count,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0,...,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0,1385251.0
mean,6.086423,77272.62,9.064225,66.35857,9136.173,9136.173,6.516857e-06,0.0862739,0.2070881,0.09023924,...,124.9866,33.42434,125.0793,83189330.0,9.498798,13.13377,47.23296,30824.99,0.09656077,141.5206
std,335.5535,464539.8,8.948236,14.06328,100287.2,100287.2,0.004418687,0.2807682,0.4052194,0.2865243,...,241.6227,160.5632,242.6297,17100920.0,0.8217052,8.646518,227.0902,327391.6,0.2332904,21.13469
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0
25%,0.0,54.0,6.0,64.0,2.100169,2.100169,0.0,0.0,0.0,0.0,...,50.0,0.0,50.0,83071560.0,9.5,10.0,0.0,0.0,0.0,141.55
50%,0.0,54.0,6.0,64.0,15.79307,15.79307,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,83124520.0,9.5,10.3923,0.0,0.0,0.0,141.55
75%,0.1050667,285.99,14.33,64.0,117.1712,117.1712,0.0,0.0,0.0,0.0,...,54.05258,0.3719096,54.06,83343900.0,9.5,10.39675,0.5059213,1.344216,0.08,141.55
max,218113.4,9795649.0,47.0,255.0,8388608.0,8388608.0,5.022185,1.0,1.0,1.0,...,8201.775,8643.503,8754.0,167639400.0,15.0,128.0783,12223.76,85610220.0,1.0,244.6


# Preprocessing
## Encoding Labels

In [3]:
# Replace the string class names in 'label' with integer codes.
# This overwrites the column in `full_data`, so the cell is meant to run once
# per freshly loaded frame.
label_encoder = LabelEncoder()
full_data['label'] = label_encoder.fit_transform(full_data['label'])

# code -> class name, in the order the encoder stores its classes
label_mapping = dict(enumerate(label_encoder.classes_))
print("Label mappings:", label_mapping)

# class name -> code, obtained by encoding all class names in a single call
class_codes = dict(
    zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
)

# Show the first row of every class now that labels are numeric
print("After encoding:")
for label in class_codes:
    code = class_codes[label]
    print(f"First instance of {label} (code {code}):")
    rows_of_class = full_data.loc[full_data['label'] == code]
    print(rows_of_class.iloc[0])

Label mappings: {0: 'Backdoor_Malware', 1: 'BenignTraffic', 2: 'BrowserHijacking', 3: 'CommandInjection', 4: 'DDoS-ACK_Fragmentation', 5: 'DDoS-HTTP_Flood', 6: 'DDoS-ICMP_Flood', 7: 'DDoS-ICMP_Fragmentation', 8: 'DDoS-PSHACK_Flood', 9: 'DDoS-RSTFINFlood', 10: 'DDoS-SYN_Flood', 11: 'DDoS-SlowLoris', 12: 'DDoS-SynonymousIP_Flood', 13: 'DDoS-TCP_Flood', 14: 'DDoS-UDP_Flood', 15: 'DDoS-UDP_Fragmentation', 16: 'DNS_Spoofing', 17: 'DictionaryBruteForce', 18: 'DoS-HTTP_Flood', 19: 'DoS-SYN_Flood', 20: 'DoS-TCP_Flood', 21: 'DoS-UDP_Flood', 22: 'MITM-ArpSpoofing', 23: 'Mirai-greeth_flood', 24: 'Mirai-greip_flood', 25: 'Mirai-udpplain', 26: 'Recon-HostDiscovery', 27: 'Recon-OSScan', 28: 'Recon-PingSweep', 29: 'Recon-PortScan', 30: 'SqlInjection', 31: 'Uploading_Attack', 32: 'VulnerabilityScan', 33: 'XSS'}
After encoding:
First instance of Backdoor_Malware (code 0):
flow_duration      1.972717e+02
Header_Length      1.170530e+04
Protocol Type      6.000000e+00
Duration           1.047000e+02
Rate

Save dataset for later comparison

In [4]:
# Independent copy of the label-encoded frame, taken before resampling, so the
# two can be compared later.
original_data = full_data.copy(deep=True)
original_data.head(n=5)

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
164260,0.147075,21224.0,17.0,64.0,4376.083823,4376.083823,0.0,0.0,0.0,0.0,...,0.0,50.0,83098760.0,9.5,10.0,0.0,0.0,0.0,141.55,14
159703,0.009695,69.94,6.22,63.62,133.539945,133.539945,0.0,0.0,1.0,0.0,...,2.328323,54.92,83093310.0,9.5,10.483598,3.298447,26.885065,0.22,141.55,10
235836,0.220598,15592.63,1.45,64.0,59.932602,59.932602,0.0,0.0,0.0,0.0,...,145.840843,188.96,83132490.0,9.5,12.762688,206.388075,83210.101065,0.28,141.55,6
56911,0.0,54.0,6.0,64.0,1.533302,1.533302,0.0,0.0,0.0,0.0,...,0.0,54.0,83072380.0,9.5,10.392305,0.0,0.0,0.0,141.55,13
141604,0.0,54.0,6.0,64.0,65.774432,65.774432,0.0,0.0,1.0,0.0,...,0.0,54.0,83362480.0,9.5,10.392305,0.0,0.0,0.0,141.55,12


## Scaling Numeric Features

In [5]:
# NOTE(review): this whole cell is commented out, so no scaling is applied and
# `full_data` keeps its original units for the rest of the notebook.
# NOTE(review): if this is re-enabled, `scaler.fit(...)` inside the loop looks
# like it refits from scratch on every file, leaving only the last file's
# statistics; `partial_fit` may be what was intended — confirm before enabling.
# # feature scaling
# scaler = StandardScaler()
# 
# for data_set in tqdm(data_sets):
#     scaler.fit(pd.read_csv(DATASET_DIRECTORY + data_set)[num_cols])
#     
# # Scale the features in the dataframe
# full_data[num_cols] = scaler.transform(full_data[num_cols])
# 
# # Display the first few entries to verify the changes
# print(full_data)
# 
# # prep the data to be inputted into model
# data = full_data

## Preprocessed DataFrame

In [6]:
# Preview the first five rows of the preprocessed frame
full_data.head(n=5)

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
164260,0.147075,21224.0,17.0,64.0,4376.083823,4376.083823,0.0,0.0,0.0,0.0,...,0.0,50.0,83098760.0,9.5,10.0,0.0,0.0,0.0,141.55,14
159703,0.009695,69.94,6.22,63.62,133.539945,133.539945,0.0,0.0,1.0,0.0,...,2.328323,54.92,83093310.0,9.5,10.483598,3.298447,26.885065,0.22,141.55,10
235836,0.220598,15592.63,1.45,64.0,59.932602,59.932602,0.0,0.0,0.0,0.0,...,145.840843,188.96,83132490.0,9.5,12.762688,206.388075,83210.101065,0.28,141.55,6
56911,0.0,54.0,6.0,64.0,1.533302,1.533302,0.0,0.0,0.0,0.0,...,0.0,54.0,83072380.0,9.5,10.392305,0.0,0.0,0.0,141.55,13
141604,0.0,54.0,6.0,64.0,65.774432,65.774432,0.0,0.0,1.0,0.0,...,0.0,54.0,83362480.0,9.5,10.392305,0.0,0.0,0.0,141.55,12


# Sampling (Condensed Nearest Neighbor Under-Sampling)

In [7]:
# Size of the rarest class. Informational only: the sampler below is built
# without arguments derived from it.
min_class_size = full_data['label'].value_counts().min()

# Apply Condensed Nearest Neighbour under-sampling.
# random_state fixes the sampler's random choices so the resampled set is the
# same on every run (1 matches the seed used for the shuffle when loading).
undersampler = CondensedNearestNeighbour(random_state=1)

# Features are every column except the encoded class label
X = full_data.drop('label', axis=1)
y = full_data['label']
X_resampled, y_res = undersampler.fit_resample(X, y)

# Combine the resampled features and labels back into a single DataFrame
full_data_resampled = pd.DataFrame(X_resampled, columns=X.columns)
full_data_resampled['label'] = y_res

# NOTE(review): the scaling cell is currently commented out, so the data shown
# under this "(SCALED)" banner is in original units unless that cell is re-enabled.
print("Resampled Data (SCALED):")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(full_data_resampled[full_data_resampled['label'] == code].iloc[0])

full_data_resampled.head()

Resampled Data (SCALED):
First instance of Backdoor_Malware (code 0):
flow_duration      9.758581e-01
Header_Length      6.660940e+04
Protocol Type      6.000000e+00
Duration           5.040000e+01
Rate               6.401756e+01
Srate              6.401756e+01
Drate              0.000000e+00
fin_flag_number    0.000000e+00
syn_flag_number    0.000000e+00
rst_flag_number    0.000000e+00
psh_flag_number    0.000000e+00
ack_flag_number    1.000000e+00
ece_flag_number    0.000000e+00
cwr_flag_number    0.000000e+00
ack_count          0.000000e+00
syn_count          2.000000e+00
fin_count          0.000000e+00
urg_count          8.000000e+00
rst_count          6.150000e+01
HTTP               0.000000e+00
HTTPS              1.000000e+00
DNS                0.000000e+00
Telnet             0.000000e+00
SMTP               0.000000e+00
SSH                0.000000e+00
IRC                0.000000e+00
TCP                1.000000e+00
UDP                0.000000e+00
DHCP               0.000000e+00
AR

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.975858,66609.4,6.0,50.4,64.017555,64.017555,0.0,0.0,0.0,0.0,...,639.672476,351.6,167629700.0,13.5,30.419257,901.016534,406687.527512,1.0,244.6,0
1,197.271666,11705.3,6.0,104.7,75.845424,75.845424,0.0,0.0,0.0,0.0,...,588.22887,147.4,167629900.0,13.5,30.581277,833.158486,348274.713591,1.0,244.6,0
2,1760.627373,15276.2,12.6,74.6,0.062983,0.062983,0.0,0.0,0.0,0.0,...,121.749366,185.9,0.01903262,5.5,20.647813,172.179605,16481.338452,0.9,38.5,0
3,55.706056,6109.2,14.3,108.7,394.899773,394.899773,0.0,0.0,0.0,0.0,...,62.412485,171.4,167629700.0,13.5,15.384394,87.961674,3873.426256,1.0,244.6,0
4,269.47221,13603.0,7.1,82.7,164.231782,164.231782,0.0,0.0,0.0,0.0,...,73.95056,104.3,0.004173017,5.5,15.885492,104.581885,6233.014506,0.9,38.5,0


# Post-Processing
## Inverse-Scaling

In [8]:
# NOTE(review): this cell is commented out. It uses `scaler`, which is only
# created in the (also commented-out) scaling cell, so the two must be enabled
# or disabled together.
# full_data_resampled[num_cols] = scaler.inverse_transform(full_data_resampled[num_cols], copy=None)
# 
# print("Resampled Data (UNSCALED):")
# for label, code in class_codes.items():
#     # Print the first instance of each class
#     print(f"First instance of {label} (code {code}):")
#     print(full_data_resampled[full_data_resampled['label'] == code].iloc[0])
# 
# print("Number of negative values in DataFrame: ")
# print(sum(n < 0 for n in full_data_resampled.values.flatten()))
# 
# full_data_resampled.head()

## Resampled Data Analysis

In [9]:
# Summary statistics of the under-sampled frame (quartiles spelled out explicitly)
full_data_resampled.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
count,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,...,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0,1069.0
mean,309.898999,84936.76,8.809531,96.98473,2672.39406,2672.39406,0.0,0.003742,0.035547,0.018709,...,203.03488,333.785704,20819770.0,6.480823,21.014848,287.069814,209298.4,0.787617,64.407203,18.334892
std,2426.498554,486942.2,4.682025,40.08694,26039.362786,26039.362786,0.0,0.061084,0.185245,0.135559,...,356.521939,516.096902,54070310.0,2.657555,13.97093,504.467301,728989.5,0.267202,67.039701,11.195994
min,0.0,0.0,0.0,0.0,0.033641,0.033641,0.0,0.0,0.0,0.0,...,0.0,42.0,2.980232e-07,1.0,9.165151,0.0,0.0,0.0,1.0,0.0
25%,1.893714,2485.2,6.0,64.0,5.562707,5.562707,0.0,0.0,0.0,0.0,...,22.545978,85.9,0.003767991,5.5,12.820782,31.932203,808.7095,0.8,38.5,3.0
50%,24.183873,7539.5,7.6,87.1,22.955122,22.955122,0.0,0.0,0.0,0.0,...,56.611538,121.5,0.0135844,5.5,15.414805,79.975591,3847.493,0.9,38.5,22.0
75%,161.00155,22232.5,10.4,118.2,76.476637,76.476637,0.0,0.0,0.0,0.0,...,149.163106,223.9,0.03138831,5.5,21.017304,211.261154,34305.46,0.9,38.5,27.0
max,51182.926771,9233031.0,47.0,247.8,524910.331235,524910.331235,0.0,1.0,1.0,1.0,...,3206.440385,3704.0,167639400.0,13.5,87.177081,4543.962586,12328700.0,1.0,244.6,33.0


In [10]:
from ydata_profiling import ProfileReport

# Destination of the HTML comparison report
report_path = './profile_reports/cnn_original_vs_resampled.html'

# Create the output folder before the expensive profiling starts, so the export
# at the end cannot fail on a missing directory (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# Profile the data before and after under-sampling; minimal=True is passed to
# both reports as in the original configuration.
original_report = ProfileReport(original_data, title='Original Data', minimal=True)
resampled_report = ProfileReport(full_data_resampled, title='Resampled Data', minimal=True)

# Side-by-side comparison of the two profiles, written as a single HTML file
comparison_report = original_report.compare(resampled_report)
comparison_report.to_file(report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]