# Imports

In [1]:
from imblearn.under_sampling import RandomUnderSampler  
import numpy as np
import os
import pandas as pd
import random
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle
from tqdm import tqdm

# Loading

In [2]:
DATASET_DIRECTORY = '../../dataset/'  # If your dataset is within your python project directory, change this to the relative path to your dataset

# Seed for choosing which CSV files are loaded, so a fresh "Restart & Run All"
# always works on the same subset of the dataset.
FILE_SAMPLE_SEED = 1

# os.listdir returns entries in arbitrary order, so sort before sampling:
# a seeded sample is only reproducible when drawn from a stable ordering.
csv_filepaths = sorted(
    filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv')
)

print(csv_filepaths)

# If there are more than X CSV files, randomly select X files from the list
sample_size = 5

if len(csv_filepaths) > sample_size:
    # A dedicated Random instance keeps the selection independent of whatever
    # else has consumed the global `random` state in this kernel.
    csv_filepaths = random.Random(FILE_SAMPLE_SEED).sample(csv_filepaths, sample_size)
    print(csv_filepaths)

csv_filepaths.sort()

# list of csv files used
data_sets = csv_filepaths

# Feature columns treated as continuous (order matters: it is reused when
# selecting/transforming these columns as a group).
num_cols = [
    # flow-level measurements
    'flow_duration', 'Header_Length', 'Duration', 'Rate', 'Srate',
    # per-flag packet counters
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    # packet-size statistics
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size',
    # remaining aggregate statistics
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
]

# Feature columns treated as categorical / indicator-like.
# NOTE(review): the describe() output of the loaded data shows non-integer
# values for 'Protocol Type' and 'Drate' — confirm they belong in this list.
cat_cols = [
    'Protocol Type', 'Drate',
    # TCP flag indicators
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number',
    'ack_flag_number', 'ece_flag_number', 'cwr_flag_number',
    # protocol indicators
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC',
    'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
]

# Read every selected CSV and stack the rows into a single DataFrame.
# Fail early with a clear message instead of a later KeyError on 'label'.
if not data_sets:
    raise FileNotFoundError(f"No CSV files found in {DATASET_DIRECTORY!r}")

loaded_frames = []
for position, data_set in enumerate(data_sets, start=1):
    print(f"data set {position} out of {len(data_sets)}: {data_set} \n")
    data_path = os.path.join(DATASET_DIRECTORY, data_set)
    df = pd.read_csv(data_path)
    loaded_frames.append(df)

# Concatenate once: growing the frame inside the loop re-copies all previously
# loaded rows on every iteration. Each file keeps its own row index, so index
# values repeat across files in the combined frame.
full_data = pd.concat(loaded_frames)

# Show one example row per class while the labels are still raw strings
print("Before encoding:")
for label in full_data['label'].unique():
    print(f"First instance of {label}:")
    is_current_class = full_data['label'] == label
    print(full_data[is_current_class].iloc[0])

# Randomise row order (fixed seed so the order is reproducible)
full_data = shuffle(full_data, random_state=1)

# Quick check that the data loaded properly: first two rows and overall shape
print("Real data:")
print(full_data.head(2))
print(full_data.shape)

# Number of distinct classes in the 'label' column of `full_data`
label_column = full_data['label']
unique_labels = label_column.nunique()
print(f"There are {unique_labels} unique labels in the dataset.")

# Rows per class, most frequent first
class_counts = label_column.value_counts()
print(class_counts)

# Summary statistics of the numeric columns
full_data.describe()

['part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00014-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00015-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00016-363d1ba3-8ab5-4f96-bc25-4d5

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
count,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0,...,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0,1197844.0
mean,5.721863,76788.52,9.072623,66.35984,9136.002,9136.002,3.87203e-06,0.08672498,0.2071004,0.0906295,...,125.0259,33.32414,124.931,83162080.0,9.497472,13.13407,47.09461,30353.5,0.0962343,141.4859
std,277.9237,461140.9,8.955192,14.07544,99583.83,99583.83,0.001418527,0.2814317,0.4052283,0.2870816,...,242.0582,159.148,241.7161,17001670.0,0.8169249,8.65027,225.0865,306044.1,0.2326707,21.01139
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0
25%,0.0,54.0,6.0,64.0,2.090505,2.090505,0.0,0.0,0.0,0.0,...,50.0,0.0,50.0,83071570.0,9.5,10.0,0.0,0.0,0.0,141.55
50%,0.0,54.0,6.0,64.0,15.77737,15.77737,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,83124530.0,9.5,10.3923,0.0,0.0,0.0,141.55
75%,0.1048883,264.1925,14.59,64.0,117.8863,117.8863,0.0,0.0,0.0,0.0,...,54.04656,0.3707605,54.06,83344000.0,9.5,10.39663,0.5059213,1.328944,0.08,141.55
max,64045.49,9834752.0,47.0,255.0,8388608.0,8388608.0,1.276993,1.0,1.0,1.0,...,9231.15,8290.916,9622.8,167639400.0,15.0,133.5917,11725.13,77137500.0,1.0,244.6


# Preprocessing
## Encoding Labels

In [3]:
# Replace the string class names in 'label' with integer codes.
# NOTE: this overwrites full_data['label'] in place, so running the cell a
# second time would fit the encoder on the already-encoded integers.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(full_data['label'])
full_data['label'] = encoded_labels

# code -> class name (a class's code is its position in `classes_`)
label_mapping = dict(enumerate(label_encoder.classes_))
print("Label mappings:", label_mapping)

# class name -> code, obtained by encoding all known classes in a single call
known_classes = label_encoder.classes_
class_codes = dict(zip(known_classes, label_encoder.transform(known_classes)))

# Show one encoded example row per class
print("After encoding:")
for label, code in class_codes.items():
    print(f"First instance of {label} (code {code}):")
    rows_of_class = full_data[full_data['label'] == code]
    print(rows_of_class.iloc[0])

Label mappings: {0: 'Backdoor_Malware', 1: 'BenignTraffic', 2: 'BrowserHijacking', 3: 'CommandInjection', 4: 'DDoS-ACK_Fragmentation', 5: 'DDoS-HTTP_Flood', 6: 'DDoS-ICMP_Flood', 7: 'DDoS-ICMP_Fragmentation', 8: 'DDoS-PSHACK_Flood', 9: 'DDoS-RSTFINFlood', 10: 'DDoS-SYN_Flood', 11: 'DDoS-SlowLoris', 12: 'DDoS-SynonymousIP_Flood', 13: 'DDoS-TCP_Flood', 14: 'DDoS-UDP_Flood', 15: 'DDoS-UDP_Fragmentation', 16: 'DNS_Spoofing', 17: 'DictionaryBruteForce', 18: 'DoS-HTTP_Flood', 19: 'DoS-SYN_Flood', 20: 'DoS-TCP_Flood', 21: 'DoS-UDP_Flood', 22: 'MITM-ArpSpoofing', 23: 'Mirai-greeth_flood', 24: 'Mirai-greip_flood', 25: 'Mirai-udpplain', 26: 'Recon-HostDiscovery', 27: 'Recon-OSScan', 28: 'Recon-PingSweep', 29: 'Recon-PortScan', 30: 'SqlInjection', 31: 'Uploading_Attack', 32: 'VulnerabilityScan', 33: 'XSS'}
After encoding:
First instance of Backdoor_Malware (code 0):
flow_duration       1123.672136
Header_Length      23496.800000
Protocol Type          8.100000
Duration             118.700000
Rate

Save dataset for later comparison

In [4]:
# Snapshot of the label-encoded frame, taken before resampling, so the
# resampled data can be compared against it later on.
original_data = full_data.copy()
# Preview the snapshot (rich display of the first rows).
original_data.head()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
142496,0.0,54.0,6.0,64.0,1.779477,1.779477,0.0,0.0,0.0,0.0,...,0.0,54.0,83072250.0,9.5,10.392305,0.0,0.0,0.0,141.55,13
3301,0.0,54.0,6.0,64.0,22.1141,22.1141,0.0,0.0,0.0,0.0,...,0.0,54.0,83072360.0,9.5,10.392305,0.0,0.0,0.0,141.55,13
195253,0.0,54.0,6.0,64.0,106.722577,106.722577,0.0,1.0,0.0,1.0,...,0.0,54.0,83344940.0,9.5,10.392305,0.0,0.0,0.0,141.55,9
63448,0.0,54.0,6.0,64.0,0.313506,0.313506,0.0,0.0,0.0,0.0,...,0.0,54.0,83331760.0,9.5,10.392305,0.0,0.0,0.0,141.55,8
25614,0.0,0.0,0.99,63.36,0.399377,0.399377,0.0,0.0,0.0,0.0,...,1.072258,42.18,83128740.0,9.5,9.232941,1.517764,6.492759,0.19,141.55,6


## Scaling Numeric Features

In [5]:
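# # feature scaling
# # NOTE(review): this cell is disabled. If it is re-enabled, `scaler.fit` inside the loop refits from scratch on every file, so only the last file's statistics are kept; `scaler.partial_fit` accumulates them across files.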
# scaler = StandardScaler()
# 
# for data_set in tqdm(data_sets):
#     scaler.fit(pd.read_csv(DATASET_DIRECTORY + data_set)[num_cols])
#     
# # Scale the features in the dataframe
# full_data[num_cols] = scaler.transform(full_data[num_cols])
# 
# # Display the first few entries to verify the changes
# print(full_data)
# 
# # prep the data to be inputted into model
# data = full_data

## Preprocessed DataFrame

In [6]:
# Preview the frame that is passed on to the sampler. The scaling cell above is
# commented out, so the feature values shown here are unscaled.
full_data.head()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
142496,0.0,54.0,6.0,64.0,1.779477,1.779477,0.0,0.0,0.0,0.0,...,0.0,54.0,83072250.0,9.5,10.392305,0.0,0.0,0.0,141.55,13
3301,0.0,54.0,6.0,64.0,22.1141,22.1141,0.0,0.0,0.0,0.0,...,0.0,54.0,83072360.0,9.5,10.392305,0.0,0.0,0.0,141.55,13
195253,0.0,54.0,6.0,64.0,106.722577,106.722577,0.0,1.0,0.0,1.0,...,0.0,54.0,83344940.0,9.5,10.392305,0.0,0.0,0.0,141.55,9
63448,0.0,54.0,6.0,64.0,0.313506,0.313506,0.0,0.0,0.0,0.0,...,0.0,54.0,83331760.0,9.5,10.392305,0.0,0.0,0.0,141.55,8
25614,0.0,0.0,0.99,63.36,0.399377,0.399377,0.0,0.0,0.0,0.0,...,1.072258,42.18,83128740.0,9.5,9.232941,1.517764,6.492759,0.19,141.55,6


# Sampling (Naive-Random Under-Sampling)

In [7]:
# Row count of the rarest class: the target size for every class after
# under-sampling with the sampler's default strategy.
min_class_size = full_data['label'].value_counts().min()

# Apply random under-sampling (rows are dropped, none are generated)
rus = RandomUnderSampler(random_state=42)

X = full_data.drop('label', axis=1)
y = full_data['label']
X_resampled, y_res = rus.fit_resample(X, y)

# Combine the resampled features and labels back into a single DataFrame.
# The labels are attached by position rather than by index: the row index
# repeats across the concatenated source files, so index-based alignment is
# not a safe way to pair features with labels.
full_data_resampled = pd.DataFrame(X_resampled, columns=X.columns)
full_data_resampled['label'] = np.asarray(y_res)

# Sanity check: every class must have been cut down to the minority size.
resampled_counts = full_data_resampled['label'].value_counts()
assert (resampled_counts == min_class_size).all(), (
    f"Expected {min_class_size} rows per class after under-sampling, got:\n{resampled_counts}"
)

# NOTE(review): the "(SCALED)" tag below is only accurate when the scaling cell
# is enabled; it is currently commented out, so these values are unscaled.
print("Resampled Data (SCALED):")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(full_data_resampled[full_data_resampled['label'] == code].iloc[0])

full_data_resampled.head()

Resampled Data (SCALED):
First instance of Backdoor_Malware (code 0):
flow_duration      4.442195e+01
Header_Length      1.802100e+03
Protocol Type      1.480000e+01
Duration           1.341000e+02
Rate               6.139804e+02
Srate              6.139804e+02
Drate              0.000000e+00
fin_flag_number    0.000000e+00
syn_flag_number    0.000000e+00
rst_flag_number    0.000000e+00
psh_flag_number    0.000000e+00
ack_flag_number    0.000000e+00
ece_flag_number    0.000000e+00
cwr_flag_number    0.000000e+00
ack_count          0.000000e+00
syn_count          5.000000e-01
fin_count          0.000000e+00
urg_count          0.000000e+00
rst_count          0.000000e+00
HTTP               0.000000e+00
HTTPS              0.000000e+00
DNS                0.000000e+00
Telnet             0.000000e+00
SMTP               0.000000e+00
SSH                0.000000e+00
IRC                0.000000e+00
TCP                0.000000e+00
UDP                1.000000e+00
DHCP               0.000000e+00
AR

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
256326,44.421948,1802.1,14.8,134.1,613.98035,613.98035,0.0,0.0,0.0,0.0,...,74.835189,142.1,167629700.0,13.5,17.673911,105.781708,5603.91708,1.0,244.6,0
22573,908.342341,61587.8,10.4,80.2,2.504011,2.504011,0.0,0.0,0.0,0.0,...,110.508291,196.0,0.03883259,5.5,20.589052,156.282324,13605.732303,0.9,38.5,0
18612,365.257377,5729.2,5.4,117.2,57.623553,57.623553,0.0,0.0,0.0,0.0,...,187.618436,96.7,167629900.0,13.5,18.357933,265.780237,35535.742009,1.0,244.6,0
237667,1123.672136,23496.8,8.1,118.7,0.152705,0.152705,0.0,0.0,0.0,0.0,...,36.441684,116.5,0.0939594,5.5,15.734196,51.536323,1817.781351,0.8,38.5,0
86674,1410.724987,4372.8,6.0,99.4,0.040755,0.040755,0.0,0.0,0.0,0.0,...,38.442496,75.2,167629800.0,13.5,13.203679,54.400222,1486.87442,1.0,244.6,0


# Post-Processing
## Inverse-Scaling

In [8]:
# full_data_resampled[num_cols] = scaler.inverse_transform(full_data_resampled[num_cols], copy=None)
# 
# print("Resampled Data (UNSCALED):")
# for label, code in class_codes.items():
#     # Print the first instance of each class
#     print(f"First instance of {label} (code {code}):")
#     print(full_data_resampled[full_data_resampled['label'] == code].iloc[0])
# 
# print("Number of negative values in DataFrame: ")
# print(sum(n < 0 for n in full_data_resampled.values.flatten()))
# 
# full_data_resampled.head()

## Resampled Data Analysis

In [9]:
# Summary statistics of the under-sampled frame; compare with the describe()
# output of the full data produced in the loading cell.
full_data_resampled.describe()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,...,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,180.372752,204311.5,10.271808,80.438377,5127.82,5127.82,0.0,0.029412,0.124183,0.043573,...,202.371275,357.570523,82792010.0,9.465142,21.946247,285.874021,202166.0,0.53963,140.651961,16.5
std,1801.574022,699676.4,9.88559,33.567051,74002.27,74002.27,0.0,0.16905,0.32997,0.204254,...,353.260669,483.186466,53564130.0,2.567918,14.414224,499.444535,810956.2,0.440271,66.15599,9.816056
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,42.0,1.28746e-06,5.5,9.165151,0.0,0.0,0.0,38.5,0.0
25%,0.002805,58.0,6.0,64.0,2.519658,2.519658,0.0,0.0,0.0,0.0,...,0.0,54.12,82977290.0,9.5,10.394944,0.0,0.0,0.0,141.55,8.0
50%,1.205756,3212.22,6.11,64.0,21.07903,21.07903,0.0,0.0,0.0,0.0,...,42.08859,114.805,83254100.0,9.5,14.876225,59.549623,2773.682,0.8,141.55,16.5
75%,28.13917,45228.45,10.4,86.65,91.39486,91.39486,0.0,0.0,0.0,0.0,...,303.858042,572.82,83681260.0,9.5,33.286634,422.243269,142646.9,0.95,141.55,25.0
max,47687.063836,9543481.0,47.0,247.0,2097152.0,2097152.0,0.0,1.0,1.0,1.0,...,2881.513731,4491.3,167639400.0,13.5,99.659223,4075.075799,15073070.0,1.0,244.6,33.0


In [10]:
from ydata_profiling import ProfileReport

# Destination of the HTML comparison report.
REPORT_PATH = './profile_reports/random_original_vs_resampled.html'

# Make sure the output folder exists *before* the expensive profiling runs, so
# a missing directory cannot make the final export fail after all the work.
os.makedirs(os.path.dirname(REPORT_PATH), exist_ok=True)

# Profile the data before and after under-sampling (minimal mode) and write a
# side-by-side comparison of the two reports.
original_report = ProfileReport(original_data, title='Original Data', minimal=True)
resampled_report = ProfileReport(full_data_resampled, title='Resampled Data', minimal=True)
comparison_report = original_report.compare(resampled_report)
comparison_report.to_file(REPORT_PATH)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]