# Imports

In [1]:
import os
import random

import numpy as np
import pandas as pd
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle
from tqdm import tqdm
from ydata_profiling import ProfileReport

# Loading

In [2]:
DATASET_DIRECTORY = '../../dataset/'  # If your dataset is within your python project directory, change this to the relative path to your dataset
RANDOM_SEED = 1  # shared seed so both the file selection and the row shuffle are reproducible

# Sort the listing first: os.listdir() returns entries in arbitrary order, which would
# otherwise make the seeded selection below depend on the filesystem.
csv_filepaths = sorted(
    filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv')
)
if not csv_filepaths:
    # Fail here with a clear message instead of a KeyError on 'label' further down.
    raise FileNotFoundError(f"No CSV files found in {DATASET_DIRECTORY!r}")

print(csv_filepaths)

# If there are more than X CSV files, randomly select X files from the list
sample_size = 1

if len(csv_filepaths) > sample_size:
    # A dedicated Random instance keeps the selection reproducible without
    # touching the global random state.
    csv_filepaths = random.Random(RANDOM_SEED).sample(csv_filepaths, sample_size)
    print(csv_filepaths)

csv_filepaths.sort()

# list of csv files used
data_sets = csv_filepaths

# Column groups (names exactly as they appear in the CSV headers).
num_cols = [
    'flow_duration', 'Header_Length', 'Duration', 'Rate', 'Srate', 'ack_count', 'syn_count',
    'fin_count', 'urg_count', 'rst_count', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight'
]
cat_cols = [
    'Protocol Type', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
    'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'HTTP', 'HTTPS', 
    'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC'
]

# Read every selected file, then concatenate once: calling pd.concat inside the loop
# re-copies all previously loaded rows on each iteration.
frames = []
for position, data_set in enumerate(data_sets, start=1):
    print(f"data set {position} out of {len(data_sets)}: {data_set} \n")
    data_path = os.path.join(DATASET_DIRECTORY, data_set)
    df = pd.read_csv(data_path)
    frames.append(df)

# ignore_index avoids repeated row labels when more than one file is loaded.
full_data = pd.concat(frames, ignore_index=True)

# prints an instance of each class
print("Before encoding:")
for label in full_data['label'].unique():
    print(f"First instance of {label}:")
    print(full_data[full_data['label'] == label].iloc[0])

# Shuffle data
full_data = shuffle(full_data, random_state=RANDOM_SEED)

# prove if the data is loaded properly
print("Real data:")
print(full_data[:2])
print(full_data.shape)

# Count the distinct class names in the 'label' column of `full_data`
unique_labels = full_data['label'].nunique()

# Print the number of unique labels
print(f"There are {unique_labels} unique labels in the dataset.")

class_counts = full_data['label'].value_counts()
print(class_counts)

# Summary statistics of the numeric columns
full_data.describe()

['part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00014-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00015-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00016-363d1ba3-8ab5-4f96-bc25-4d5

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
count,439594.0,439594.0,439594.0,439594.0,439594.0,439594.0,439594.0,439594.0,439594.0,439594.0,...,439594.0,439594.0,439594.0,439594.0,439594.0,439594.0,439594.0,439594.0,439594.0,439594.0
mean,4.934348,77265.05,9.064897,66.342481,9204.108,9204.108,1.2e-05,0.086543,0.208117,0.090631,...,124.275709,33.299073,124.240745,83142440.0,9.496672,13.108761,47.063373,30975.61,0.095869,141.465351
std,191.776457,465015.5,8.94788,13.985398,101024.1,101024.1,0.007467,0.281165,0.405961,0.287084,...,239.406789,161.058529,239.770791,17014220.0,0.817578,8.603101,227.801588,324465.4,0.23212,21.026719
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0
25%,0.0,54.0,6.0,64.0,2.092206,2.092206,0.0,0.0,0.0,0.0,...,50.0,0.0,50.0,83071560.0,9.5,10.0,0.0,0.0,0.0,141.55
50%,0.0,54.0,6.0,64.0,15.81505,15.81505,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,83124520.0,9.5,10.392305,0.0,0.0,0.0,141.55
75%,0.105906,273.0875,14.3,64.0,116.6419,116.6419,0.0,0.0,0.0,0.0,...,54.046126,0.368667,54.06,83343910.0,9.5,10.39632,0.505921,1.328944,0.08,141.55
max,47596.388779,9815966.0,47.0,255.0,7340032.0,7340032.0,4.948488,1.0,1.0,1.0,...,7355.187698,6125.574152,6292.4,167639400.0,15.0,118.17373,8662.870043,46912120.0,1.0,244.6


# Preprocessing
## Encoding Labels

In [3]:
# Swap the string class names in 'label' for integer codes.
label_encoder = LabelEncoder()
full_data['label'] = label_encoder.fit_transform(full_data['label'])

# code -> class name, in the encoder's own class order
label_mapping = dict(enumerate(label_encoder.classes_))
print("Label mappings:", label_mapping)

# class name -> code, obtained with a single transform over all known classes
class_codes = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Show one encoded example row per class
print("After encoding:")
for class_name, class_code in class_codes.items():
    print(f"First instance of {class_name} (code {class_code}):")
    rows_of_class = full_data.loc[full_data['label'] == class_code]
    print(rows_of_class.iloc[0])

Label mappings: {0: 'Backdoor_Malware', 1: 'BenignTraffic', 2: 'BrowserHijacking', 3: 'CommandInjection', 4: 'DDoS-ACK_Fragmentation', 5: 'DDoS-HTTP_Flood', 6: 'DDoS-ICMP_Flood', 7: 'DDoS-ICMP_Fragmentation', 8: 'DDoS-PSHACK_Flood', 9: 'DDoS-RSTFINFlood', 10: 'DDoS-SYN_Flood', 11: 'DDoS-SlowLoris', 12: 'DDoS-SynonymousIP_Flood', 13: 'DDoS-TCP_Flood', 14: 'DDoS-UDP_Flood', 15: 'DDoS-UDP_Fragmentation', 16: 'DNS_Spoofing', 17: 'DictionaryBruteForce', 18: 'DoS-HTTP_Flood', 19: 'DoS-SYN_Flood', 20: 'DoS-TCP_Flood', 21: 'DoS-UDP_Flood', 22: 'MITM-ArpSpoofing', 23: 'Mirai-greeth_flood', 24: 'Mirai-greip_flood', 25: 'Mirai-udpplain', 26: 'Recon-HostDiscovery', 27: 'Recon-OSScan', 28: 'Recon-PingSweep', 29: 'Recon-PortScan', 30: 'SqlInjection', 31: 'Uploading_Attack', 32: 'VulnerabilityScan', 33: 'XSS'}
After encoding:
First instance of Backdoor_Malware (code 0):
flow_duration      1.946327e+02
Header_Length      1.565400e+04
Protocol Type      8.200000e+00
Duration           1.169000e+02
Rate

Save dataset for later comparison

In [4]:
# Keep an independent copy of the label-encoded frame so that later changes to
# `full_data` cannot affect the snapshot.
original_data = full_data.copy(deep=True)
original_data.head(n=5)

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
218446,0.0,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,54.0,83314730.0,9.5,10.392305,0.0,0.0,0.0,141.55,8
357726,0.103091,39775.0,17.0,64.0,7714.379049,7714.379049,0.0,0.0,0.0,0.0,...,0.0,50.0,83016670.0,9.5,10.0,0.0,0.0,0.0,141.55,21
71397,4.575015,108.0,6.0,64.0,0.437157,0.437157,0.0,0.0,1.0,0.0,...,0.0,54.0,82972700.0,9.5,10.392305,0.0,0.0,0.0,141.55,19
350296,4.767506,77569.5,10.9,51.9,69926.712725,69926.712725,0.0,0.0,0.0,0.0,...,156.917956,339.2,0.0155334,5.5,23.098786,221.915501,55941.663086,0.7,38.5,22
379863,0.0,54.0,6.0,64.0,1.588546,1.588546,0.0,0.0,0.0,0.0,...,0.0,54.0,83332150.0,9.5,10.392305,0.0,0.0,0.0,141.55,8


## Scaling Numeric Features

In [5]:
# NOTE: this scaling step is disabled (everything below is commented out), so no
# `scaler` object exists and `full_data` keeps its raw feature values.
# NOTE(review): StandardScaler.fit() discards any earlier fit, so the loop below would
# presumably leave the scaler fitted on the last CSV only; partial_fit() may be what was
# intended -- confirm before re-enabling.
# # feature scaling
# scaler = StandardScaler()
# 
# for data_set in tqdm(data_sets):
#     scaler.fit(pd.read_csv(DATASET_DIRECTORY + data_set)[num_cols])
#     
# # Scale the features in the dataframe
# full_data[num_cols] = scaler.transform(full_data[num_cols])
# 
# # Display the first few entries to verify the changes
# print(full_data)
# 
# # prep the data to be inputted into model
# data = full_data

## Preprocessed DataFrame

In [6]:
# Preview the first five rows of the preprocessed frame.
full_data.head(n=5)

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
218446,0.0,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,54.0,83314730.0,9.5,10.392305,0.0,0.0,0.0,141.55,8
357726,0.103091,39775.0,17.0,64.0,7714.379049,7714.379049,0.0,0.0,0.0,0.0,...,0.0,50.0,83016670.0,9.5,10.0,0.0,0.0,0.0,141.55,21
71397,4.575015,108.0,6.0,64.0,0.437157,0.437157,0.0,0.0,1.0,0.0,...,0.0,54.0,82972700.0,9.5,10.392305,0.0,0.0,0.0,141.55,19
350296,4.767506,77569.5,10.9,51.9,69926.712725,69926.712725,0.0,0.0,0.0,0.0,...,156.917956,339.2,0.0155334,5.5,23.098786,221.915501,55941.663086,0.7,38.5,22
379863,0.0,54.0,6.0,64.0,1.588546,1.588546,0.0,0.0,0.0,0.0,...,0.0,54.0,83332150.0,9.5,10.392305,0.0,0.0,0.0,141.55,8


# Sampling (Tomek Links Under-Sampling)

In [7]:
# Size of the rarest class before resampling (informational; not used further in this cell).
min_class_size = full_data['label'].value_counts().min()

# Apply Tomek Links under-sampling: rows are only removed, none are duplicated or synthesized.
# NOTE(review): Tomek links are found through nearest-neighbour distances, so with the
# scaling cell disabled the large-magnitude features presumably dominate -- confirm this is intended.
undersampler = TomekLinks()

X = full_data.drop('label', axis=1)
y = full_data['label']
X_resampled, y_res = undersampler.fit_resample(X, y)

# Combine the resampled features and labels back into a single DataFrame
full_data_resampled = pd.DataFrame(X_resampled, columns=X.columns)
full_data_resampled['label'] = y_res

# NOTE(review): the "(SCALED)" tag is only accurate while the scaling cell is enabled.
print("Resampled Data (SCALED):")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    class_rows = full_data_resampled[full_data_resampled['label'] == code]
    if class_rows.empty:
        # A class could in principle lose all of its rows to under-sampling; report that
        # instead of letting .iloc[0] raise IndexError.
        print("(no samples left after resampling)")
        continue
    print(class_rows.iloc[0])

full_data_resampled.head()

Resampled Data (SCALED):
First instance of Backdoor_Malware (code 0):
flow_duration      1.946327e+02
Header_Length      1.565400e+04
Protocol Type      8.200000e+00
Duration           1.169000e+02
Rate               6.297856e-01
Srate              6.297856e-01
Drate              0.000000e+00
fin_flag_number    0.000000e+00
syn_flag_number    0.000000e+00
rst_flag_number    0.000000e+00
psh_flag_number    1.000000e+00
ack_flag_number    1.000000e+00
ece_flag_number    0.000000e+00
cwr_flag_number    0.000000e+00
ack_count          0.000000e+00
syn_count          0.000000e+00
fin_count          0.000000e+00
urg_count          6.700000e+01
rst_count          1.206000e+02
HTTP               0.000000e+00
HTTPS              0.000000e+00
DNS                0.000000e+00
Telnet             0.000000e+00
SMTP               0.000000e+00
SSH                0.000000e+00
IRC                0.000000e+00
TCP                1.000000e+00
UDP                0.000000e+00
DHCP               0.000000e+00
AR

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.0,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,54.0,83314730.0,9.5,10.392305,0.0,0.0,0.0,141.55,8
1,0.103091,39775.0,17.0,64.0,7714.379049,7714.379049,0.0,0.0,0.0,0.0,...,0.0,50.0,83016670.0,9.5,10.0,0.0,0.0,0.0,141.55,21
2,4.575015,108.0,6.0,64.0,0.437157,0.437157,0.0,0.0,1.0,0.0,...,0.0,54.0,82972700.0,9.5,10.392305,0.0,0.0,0.0,141.55,19
3,4.767506,77569.5,10.9,51.9,69926.712725,69926.712725,0.0,0.0,0.0,0.0,...,156.917956,339.2,0.0155334,5.5,23.098786,221.915501,55941.663086,0.7,38.5,22
4,0.0,54.0,6.0,64.0,1.588546,1.588546,0.0,0.0,0.0,0.0,...,0.0,54.0,83332150.0,9.5,10.392305,0.0,0.0,0.0,141.55,8


# Post-Processing
## Inverse-Scaling

In [8]:
# NOTE: disabled together with the scaling cell -- `scaler` is only defined there, so this
# cell must stay commented out unless the scaling cell is re-enabled as well.
# full_data_resampled[num_cols] = scaler.inverse_transform(full_data_resampled[num_cols], copy=None)
# 
# print("Resampled Data (UNSCALED):")
# for label, code in class_codes.items():
#     # Print the first instance of each class
#     print(f"First instance of {label} (code {code}):")
#     print(full_data_resampled[full_data_resampled['label'] == code].iloc[0])
# 
# print("Number of negative values in DataFrame: ")
# print(sum(n < 0 for n in full_data_resampled.values.flatten()))
# 
# full_data_resampled.head()

## Resampled Data Analysis

In [9]:
# Summary statistics of the resampled frame, including the integer-encoded 'label' column.
resampled_summary = full_data_resampled.describe()
resampled_summary

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
count,435936.0,435936.0,435936.0,435936.0,435936.0,435936.0,435936.0,435936.0,435936.0,435936.0,...,435936.0,435936.0,435936.0,435936.0,435936.0,435936.0,435936.0,435936.0,435936.0,435936.0
mean,4.362445,70790.32,9.070816,66.064185,9135.501,9135.501,1.3e-05,0.087203,0.209636,0.091252,...,30.531798,120.629873,83623830.0,9.519754,12.98808,43.149552,27621.47,0.090489,142.059683,12.538685
std,190.468631,439567.4,8.975896,13.057217,100509.7,100509.7,0.007498,0.282133,0.407049,0.287967,...,153.120993,229.913704,15541660.0,0.746475,8.382838,216.56976,309747.8,0.223821,19.201071,5.791523
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0,0.0
25%,0.0,54.0,6.0,64.0,2.068859,2.068859,0.0,0.0,0.0,0.0,...,0.0,50.0,83072060.0,9.5,10.0,0.0,0.0,0.0,141.55,8.0
50%,0.0,54.0,6.0,64.0,15.54783,15.54783,0.0,0.0,0.0,0.0,...,0.0,54.0,83124630.0,9.5,10.392305,0.0,0.0,0.0,141.55,12.0
75%,0.099278,195.445,14.91,64.0,116.3953,116.3953,0.0,0.0,0.0,0.0,...,0.335305,54.06,83344010.0,9.5,10.39517,0.463495,1.063923,0.07,141.55,15.0
max,47596.388779,9815966.0,47.0,255.0,7340032.0,7340032.0,4.948488,1.0,1.0,1.0,...,6125.574152,6292.4,167639400.0,15.0,118.17373,8662.870043,46912120.0,1.0,244.6,33.0


In [10]:
# Profile the data before and after Tomek Links under-sampling and export a single
# side-by-side comparison report. `ProfileReport` is imported in the imports cell.
REPORT_PATH = './profile_reports/tomek_original_vs_resampled.html'

# Create the output folder up front so the export cannot fail on a missing directory.
os.makedirs(os.path.dirname(REPORT_PATH), exist_ok=True)

original_report = ProfileReport(original_data, title='Original Data', minimal=True)
resampled_report = ProfileReport(full_data_resampled, title='Resampled Data', minimal=True)
comparison_report = original_report.compare(resampled_report)
comparison_report.to_file(REPORT_PATH)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]