## Imports

In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE  # Import SMOTE
import random
from tqdm import tqdm
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle

# Loading

In [2]:
DATASET_DIRECTORY = '../../dataset/'  # If your dataset is within your python project directory, change this to the relative path to your dataset
csv_filepaths = [filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv')]

print(csv_filepaths)

# If there are more than X CSV files, randomly select X files from the list
sample_size = 1

if len(csv_filepaths) > sample_size:
    csv_filepaths = random.sample(csv_filepaths, sample_size)
    print(csv_filepaths)

csv_filepaths.sort()

# list of csv files used
data_sets = csv_filepaths

num_cols = [
    'flow_duration', 'Header_Length', 'Duration', 'Rate', 'Srate', 'ack_count', 'syn_count',
    'fin_count', 'urg_count', 'rst_count', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight'
]
cat_cols = [
    'Protocol Type', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
    'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'HTTP', 'HTTPS', 
    'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC'
]

full_data = pd.DataFrame()
for data_set in data_sets:
    print(f"data set {data_set} out of {len(data_sets)} \n")
    data_path = os.path.join(DATASET_DIRECTORY, data_set)
    df = pd.read_csv(data_path)
    full_data = pd.concat([full_data, df])

# prints an instance of each class
print("Before encoding:")
unique_labels = full_data['label'].unique()
for label in unique_labels:
    print(f"First instance of {label}:")
    print(full_data[full_data['label'] == label].iloc[0])

# Shuffle data
full_data = shuffle(full_data, random_state=1)

# prove if the data is loaded properly
print("Real data:")
print(full_data[:2])
print(full_data.shape)

# Assuming 'label' is the column name for the labels in the DataFrame `synth_data`
unique_labels = full_data['label'].nunique()

# Print the number of unique labels
print(f"There are {unique_labels} unique labels in the dataset.")

class_counts = full_data['label'].value_counts()
print(class_counts)

# Display the first few entries to verify the changes
print(full_data)

['part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv']
['part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv']
data set part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv out of 1 

Before encoding:
First instance of DDoS-SYN_Flood:
flow_duration                  0.0
Header_Length                53.46
Protocol Type                 5.94
Duration                     63.36
Rate                        1.1458
Srate                       1.1458
Drate                          0.0
fin_flag_number                0.0
syn_flag_number                1.0
rst_flag_number                0.0
psh_flag_number                0.0
ack_flag_number                0.0
ece_flag_number                0.0
cwr_flag_number                0.0
ack_count           

# Preprocessing
## Encoding Labels

In [3]:
label_encoder = LabelEncoder()
full_data['label'] = label_encoder.fit_transform(full_data['label'])

# Store label mappings
label_mapping = {index: label for index, label in enumerate(label_encoder.classes_)}
print("Label mappings:", label_mapping)

# Retrieve the numeric codes for classes
class_codes = {label: label_encoder.transform([label])[0] for label in label_encoder.classes_}

# Print specific instances after label encoding
print("After encoding:")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(full_data[full_data['label'] == code].iloc[0])


Label mappings: {0: 'Backdoor_Malware', 1: 'BenignTraffic', 2: 'BrowserHijacking', 3: 'CommandInjection', 4: 'DDoS-ACK_Fragmentation', 5: 'DDoS-HTTP_Flood', 6: 'DDoS-ICMP_Flood', 7: 'DDoS-ICMP_Fragmentation', 8: 'DDoS-PSHACK_Flood', 9: 'DDoS-RSTFINFlood', 10: 'DDoS-SYN_Flood', 11: 'DDoS-SlowLoris', 12: 'DDoS-SynonymousIP_Flood', 13: 'DDoS-TCP_Flood', 14: 'DDoS-UDP_Flood', 15: 'DDoS-UDP_Fragmentation', 16: 'DNS_Spoofing', 17: 'DictionaryBruteForce', 18: 'DoS-HTTP_Flood', 19: 'DoS-SYN_Flood', 20: 'DoS-TCP_Flood', 21: 'DoS-UDP_Flood', 22: 'MITM-ArpSpoofing', 23: 'Mirai-greeth_flood', 24: 'Mirai-greip_flood', 25: 'Mirai-udpplain', 26: 'Recon-HostDiscovery', 27: 'Recon-OSScan', 28: 'Recon-PingSweep', 29: 'Recon-PortScan', 30: 'SqlInjection', 31: 'Uploading_Attack', 32: 'VulnerabilityScan', 33: 'XSS'}
After encoding:
First instance of Backdoor_Malware (code 0):
flow_duration      9.265858e-01
Header_Length      9.335040e+04
Protocol Type      5.500000e+00
Duration           5.120000e+01
Rate

## Scaling Numeric Features

In [4]:
# feature scaling
scaler = StandardScaler()

for data_set in tqdm(data_sets):
    scaler.fit(pd.read_csv(DATASET_DIRECTORY + data_set)[num_cols])
    
# Scale the features in the dataframe
full_data[num_cols] = scaler.transform(full_data[num_cols])

# Display the first few entries to verify the changes
print(full_data)

# prep the data to be inputted into model
data = full_data

100%|██████████| 1/1 [00:00<00:00,  1.18it/s]

        flow_duration  Header_Length  Protocol Type  Duration      Rate  \
33308       -0.006349      -0.168361           6.00 -0.167903 -0.092665   
187239      -0.021994      -0.144313          17.00 -0.167903  0.100247   
11907       -0.022256      -0.168477           6.00 -0.167903 -0.092329   
63608       -0.022256      -0.168477           6.00 -0.167903 -0.092667   
107281       0.047345      -0.168167           6.00 -0.044655 -0.092669   
...               ...            ...            ...       ...       ...   
109259      -0.022169      -0.151982          17.00 -0.167903  0.249658   
50057       -0.022256      -0.168477           6.00 -0.167903 -0.092655   
5192        -0.022256      -0.168477           6.00 -0.167903 -0.092281   
208780      -0.022252      -0.168475           6.11 -0.182069 -0.092288   
128037      -0.021646      -0.125428          17.00 -0.167903 -0.033609   

           Srate  Drate  fin_flag_number  syn_flag_number  rst_flag_number  \
33308  -0.092665    0




## Preprocessed DataFrame

In [5]:
full_data.head()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
33308,-0.006349,-0.168361,6.0,-0.167903,-0.092665,-0.092665,0.0,0.0,1.0,0.0,...,-0.206322,-0.296223,0.010889,0.00294,-0.320215,-0.206144,-0.090697,-0.415473,0.002821,12
187239,-0.021994,-0.144313,17.0,-0.167903,0.100247,0.100247,0.0,0.0,0.0,0.0,...,-0.206322,-0.312759,-0.003031,0.00294,-0.365427,-0.206144,-0.090697,-0.415473,0.002821,14
11907,-0.022256,-0.168477,6.0,-0.167903,-0.092329,-0.092329,0.0,0.0,0.0,0.0,...,-0.206322,-0.296223,-0.006077,0.00294,-0.320215,-0.206144,-0.090697,-0.415473,0.002821,13
63608,-0.022256,-0.168477,6.0,-0.167903,-0.092667,-0.092667,0.0,0.0,0.0,0.0,...,-0.206322,-0.296223,-0.006271,0.00294,-0.320215,-0.206144,-0.090697,-0.415473,0.002821,13
107281,0.047345,-0.168167,6.0,-0.044655,-0.092669,-0.092669,0.0,0.0,0.0,0.0,...,-0.189912,-0.292626,-0.013613,0.00294,-0.311952,-0.189708,-0.090524,0.098145,0.002821,20


# Sampling (SMOTE Over-Sampling)

In [6]:
min_class_size = full_data['label'].value_counts().min()

# Apply random over-sampling
smote_os = SMOTE(k_neighbors=4, random_state=42)

X = full_data.drop('label', axis=1)
y = full_data['label']
X_resampled, y_res = smote_os.fit_resample(X, y)

# Combine the resampled features and labels back into a single DataFrame
full_data_resampled = pd.DataFrame(X_resampled, columns=X.columns)
full_data_resampled['label'] = y_res

print("Resampled Data (SCALED):")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(full_data_resampled[full_data_resampled['label'] == code].iloc[0])

full_data_resampled.head()

Resampled Data (SCALED):
First instance of Backdoor_Malware (code 0):
flow_duration     -0.018820
Header_Length      0.032414
Protocol Type      5.500000
Duration          -1.074550
Rate              -0.091767
Srate             -0.091767
Drate              0.000000
fin_flag_number    0.000000
syn_flag_number    0.000000
rst_flag_number    0.000000
psh_flag_number    0.000000
ack_flag_number    1.000000
ece_flag_number    0.000000
cwr_flag_number    0.000000
ack_count         -0.316729
syn_count          2.229601
fin_count         -0.290548
urg_count          0.013052
rst_count          0.118630
HTTP               0.000000
HTTPS              1.000000
DNS                0.000000
Telnet             0.000000
SMTP               0.000000
SSH                0.000000
IRC                0.000000
TCP                1.000000
UDP                0.000000
DHCP               0.000000
ARP                0.000000
ICMP               0.000000
IPv                1.000000
LLC                1.000000
Tot su

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.006349,-0.168361,6.0,-0.167903,-0.092665,-0.092665,0.0,0.0,1.0,0.0,...,-0.206322,-0.296223,0.010889,0.00294,-0.320215,-0.206144,-0.090697,-0.415473,0.002821,12
1,-0.021994,-0.144313,17.0,-0.167903,0.100247,0.100247,0.0,0.0,0.0,0.0,...,-0.206322,-0.312759,-0.003031,0.00294,-0.365427,-0.206144,-0.090697,-0.415473,0.002821,14
2,-0.022256,-0.168477,6.0,-0.167903,-0.092329,-0.092329,0.0,0.0,0.0,0.0,...,-0.206322,-0.296223,-0.006077,0.00294,-0.320215,-0.206144,-0.090697,-0.415473,0.002821,13
3,-0.022256,-0.168477,6.0,-0.167903,-0.092667,-0.092667,0.0,0.0,0.0,0.0,...,-0.206322,-0.296223,-0.006271,0.00294,-0.320215,-0.206144,-0.090697,-0.415473,0.002821,13
4,0.047345,-0.168167,6.0,-0.044655,-0.092669,-0.092669,0.0,0.0,0.0,0.0,...,-0.189912,-0.292626,-0.013613,0.00294,-0.311952,-0.189708,-0.090524,0.098145,0.002821,20


# Post-Processing
## Inverse-Scaling

In [7]:
full_data_resampled[num_cols] = scaler.inverse_transform(full_data_resampled[num_cols], copy=None)

print("Resampled Data (UNSCALED):")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(full_data_resampled[full_data_resampled['label'] == code].iloc[0])

print("Number of negative values in DataFrame: ")
print(sum(n < 0 for n in full_data_resampled.values.flatten()))

full_data_resampled.head()

Resampled Data (UNSCALED):
First instance of Backdoor_Malware (code 0):
flow_duration      9.265858e-01
Header_Length      9.335040e+04
Protocol Type      5.500000e+00
Duration           5.120000e+01
Rate               8.575327e+01
Srate              8.575327e+01
Drate              0.000000e+00
fin_flag_number    0.000000e+00
syn_flag_number    0.000000e+00
rst_flag_number    0.000000e+00
psh_flag_number    0.000000e+00
ack_flag_number    1.000000e+00
ece_flag_number    0.000000e+00
cwr_flag_number    0.000000e+00
ack_count          0.000000e+00
syn_count          1.800000e+00
fin_count          0.000000e+00
urg_count          7.200000e+00
rst_count          7.830000e+01
HTTP               0.000000e+00
HTTPS              1.000000e+00
DNS                0.000000e+00
Telnet             0.000000e+00
SMTP               0.000000e+00
SSH                0.000000e+00
IRC                0.000000e+00
TCP                1.000000e+00
UDP                0.000000e+00
DHCP               0.000000e+00


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,4.288836,108.0,6.0,64.0,0.532663,0.532663,0.0,0.0,1.0,0.0,...,0.0,54.0,83361260.0,9.5,10.392305,0.0,0.0,0.0,141.55,12
1,0.070704,11276.0,17.0,64.0,18298.327225,18298.327225,0.0,0.0,0.0,0.0,...,0.0,50.0,83123570.0,9.5,10.0,0.0,0.0,0.0,141.55,14
2,0.0,54.0,6.0,64.0,32.403336,32.403336,0.0,0.0,0.0,0.0,...,0.0,54.0,83071570.0,9.5,10.392305,0.0,0.0,0.0,141.55,13
3,0.0,54.0,6.0,64.0,0.399501,0.399501,0.0,0.0,0.0,0.0,...,0.0,54.0,83068270.0,9.5,10.392305,0.0,0.0,0.0,141.55,13
4,18.764876,198.0,6.0,65.74,0.146889,0.146889,0.0,0.0,0.0,0.0,...,2.696344,54.87,82942900.0,9.5,10.464001,3.819791,61.574028,0.12,141.55,20


## Synthetic Data Analysis

In [8]:
full_data_resampled.describe()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
count,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0,...,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0,1139986.0
mean,104.1765,216971.5,10.43356,78.47432,4892.831,4892.831,1.775804e-07,0.02964556,0.1102261,0.0490272,...,175.0225,326.6268,87944450.0,9.717141,21.16006,247.4353,140402.8,0.5587166,147.1535,16.5
std,730.7171,788902.4,9.771454,28.00244,62816.19,62816.19,4.906756e-05,0.1693637,0.3115322,0.2145552,...,289.6947,405.8108,53165900.0,2.548059,13.06832,409.834,512561.5,0.4374696,65.62268,9.810713
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0,0.0
25%,0.006666375,58.86,6.0,64.0,4.789924,4.789924,0.0,0.0,0.0,0.0,...,0.0,54.12,82999730.0,9.5,10.39943,0.0,0.0,0.0,141.55,8.0
50%,2.451058,4719.151,6.358265,64.64206,24.21524,24.21524,0.0,0.0,0.0,0.0,...,48.206,122.3556,83330870.0,9.5,15.39078,68.08737,3681.009,0.8399309,141.55,16.5
75%,48.45046,38412.59,11.27592,85.77678,106.3859,106.3859,0.0,0.0,0.0,0.0,...,227.0818,554.0,83769320.0,9.5,33.07644,320.9719,99329.96,0.95,141.55,25.0
max,65735.3,9815555.0,47.0,255.0,6291456.0,6291456.0,0.02202362,1.0,1.0,1.0,...,6225.799,5091.6,167639400.0,13.5,112.022,8804.609,44329820.0,1.0,244.6,33.0


In [None]:
from ydata_profiling import ProfileReport

original_report = ProfileReport(full_data, title='Original Data')
original_report.to_file("original_report.html")

resampled_report = ProfileReport(full_data_resampled, title='Resampled Data')
resampled_report.to_file("resampled_report.html")

comparison_report = original_report.compare(resampled_report)
comparison_report.to_file('original_vs_resampled.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]