## Imports

In [1]:
from imblearn.over_sampling import SMOTENC  # SMOTE for Nominal and Continuous
import numpy as np
import os
import pandas as pd
import random
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle
from tqdm import tqdm

# Loading

In [2]:
# Directory that holds the dataset CSV files. If your dataset is within your python
# project directory, change this to the relative path to your dataset.
DATASET_DIRECTORY = '../../dataset/'

# Maximum number of CSV files to load; a longer listing is randomly sub-sampled.
sample_size = 1
# Seed for the file sub-sampling, so a fresh kernel selects the same files again.
FILE_SAMPLE_SEED = 1

# os.listdir() returns entries in arbitrary order. Sort before sampling: a seeded
# sample is only reproducible when it is drawn from a deterministically ordered list.
csv_filepaths = sorted(
    filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv')
)
if not csv_filepaths:
    # Fail here with a clear message instead of later with a KeyError on 'label'.
    raise FileNotFoundError(f"No .csv files found in {DATASET_DIRECTORY!r}")

print(csv_filepaths)

# If there are more than `sample_size` CSV files, randomly select `sample_size` of them.
# A dedicated Random instance is used so the global `random` state is left untouched.
if len(csv_filepaths) > sample_size:
    csv_filepaths = random.Random(FILE_SAMPLE_SEED).sample(csv_filepaths, sample_size)
    print(csv_filepaths)

# sample() returns its picks in selection order; put them back in name order.
csv_filepaths.sort()

# list of csv files used
data_sets = csv_filepaths

# Columns treated as continuous: these are the ones passed through StandardScaler
# (and inverse-scaled again after resampling).
# 'Magnitue' is spelled exactly as the column appears in the dataset header.
num_cols = [
    'flow_duration', 'Header_Length', 'Duration', 'Rate', 'Srate',
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size',
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
]

# Columns handed to SMOTENC as `categorical_features`; they are never scaled.
# NOTE(review): the describe() output shows non-integer values for 'Protocol Type'
# and 'Drate' — confirm that treating them as categorical is intended.
cat_cols = [
    'Protocol Type', 'Drate',
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number',
    'ack_flag_number', 'ece_flag_number', 'cwr_flag_number',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC',
    'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
]

# Read every selected CSV into its own frame and concatenate once at the end.
# Growing `full_data` with pd.concat inside the loop re-copies all previously loaded
# rows on every iteration and starts from an empty frame.
frames = []
for position, data_set in enumerate(data_sets, start=1):
    # Report the running position; the file name is shown separately.
    print(f"data set {position} out of {len(data_sets)}: {data_set}\n")
    data_path = os.path.join(DATASET_DIRECTORY, data_set)
    df = pd.read_csv(data_path)
    frames.append(df)

if not frames:
    raise ValueError("No data sets were selected; nothing to load into full_data")

# ignore_index=True gives every row a unique label. Each CSV is read with its own
# 0-based index, so plain concatenation would repeat labels across files.
full_data = pd.concat(frames, ignore_index=True)

# Show one example row per class while 'label' still holds the raw class names.
print("Before encoding:")
for label in full_data['label'].unique():
    print(f"First instance of {label}:")
    rows_of_class = full_data.loc[full_data['label'].eq(label)]
    print(rows_of_class.iloc[0])

# Shuffle data (fixed random_state, so the same input gives the same permutation)
full_data = shuffle(full_data, random_state=1)

# Sanity check that the data loaded properly: first two rows and the overall shape
print("Real data:")
print(full_data.iloc[:2])
print(full_data.shape)

# Number of distinct values in the 'label' column of `full_data`
unique_labels = full_data['label'].nunique()
print(f"There are {unique_labels} unique labels in the dataset.")

# Number of rows per class
class_counts = full_data['label'].value_counts()
print(class_counts)

# Summary statistics of the numeric columns (rendered as the cell output)
full_data.describe()

['part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00014-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00015-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00016-363d1ba3-8ab5-4f96-bc25-4d5

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
count,235026.0,235026.0,235026.0,235026.0,235026.0,235026.0,235026.0,235026.0,235026.0,235026.0,...,235026.0,235026.0,235026.0,235026.0,235026.0,235026.0,235026.0,235026.0,235026.0,235026.0
mean,4.770044,75896.45,9.04422,66.298206,9048.387,9048.387,1.1e-05,0.086556,0.2077,0.090454,...,124.06725,33.470811,124.053255,83209520.0,9.499624,13.097474,47.307138,31793.82,0.095433,141.539681
std,199.761689,457559.9,8.915657,13.861096,97411.53,97411.53,0.004266,0.281184,0.405662,0.286831,...,239.762521,163.081267,240.719829,16987320.0,0.81543,8.592848,230.686087,338729.9,0.231661,20.985772
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,0.0,42.0,4.768372e-07,1.0,9.165151,0.0,0.0,0.0,1.0
25%,0.0,54.0,6.0,64.0,2.092067,2.092067,0.0,0.0,0.0,0.0,...,50.0,0.0,50.0,83068550.0,9.5,10.0,0.0,0.0,0.0,141.55
50%,0.0,54.0,6.0,64.0,15.73803,15.73803,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,83124510.0,9.5,10.392305,0.0,0.0,0.0,141.55
75%,0.104165,247.66,13.8,64.0,115.9475,115.9475,0.0,0.0,0.0,0.0,...,54.046126,0.359288,54.06,83343900.0,9.5,10.396174,0.505921,1.313486,0.08,141.55
max,47871.07856,9618315.0,47.0,255.0,7340032.0,7340032.0,2.059601,1.0,1.0,1.0,...,7822.769286,6687.263193,4702.9,167639400.0,14.0,120.977793,9476.110538,45163320.0,1.0,244.6


# Preprocessing
## Encoding Labels

In [3]:
# Replace the class names in 'label' with integer codes.
# NOTE: this overwrites full_data['label'] in place, so running the cell a second
# time fits the encoder on the integer codes instead of the class names.
label_encoder = LabelEncoder()
full_data['label'] = label_encoder.fit_transform(full_data['label'])

# Store label mappings: integer code -> class name
label_mapping = dict(enumerate(label_encoder.classes_))
print("Label mappings:", label_mapping)

# Reverse lookup: class name -> integer code, obtained with one transform() call
class_names = label_encoder.classes_
class_codes = dict(zip(class_names, label_encoder.transform(class_names)))

# Print the first row of each class after label encoding
print("After encoding:")
for label, code in class_codes.items():
    print(f"First instance of {label} (code {code}):")
    first_row = full_data.loc[full_data['label'] == code].iloc[0]
    print(first_row)

Label mappings: {0: 'Backdoor_Malware', 1: 'BenignTraffic', 2: 'BrowserHijacking', 3: 'CommandInjection', 4: 'DDoS-ACK_Fragmentation', 5: 'DDoS-HTTP_Flood', 6: 'DDoS-ICMP_Flood', 7: 'DDoS-ICMP_Fragmentation', 8: 'DDoS-PSHACK_Flood', 9: 'DDoS-RSTFINFlood', 10: 'DDoS-SYN_Flood', 11: 'DDoS-SlowLoris', 12: 'DDoS-SynonymousIP_Flood', 13: 'DDoS-TCP_Flood', 14: 'DDoS-UDP_Flood', 15: 'DDoS-UDP_Fragmentation', 16: 'DNS_Spoofing', 17: 'DictionaryBruteForce', 18: 'DoS-HTTP_Flood', 19: 'DoS-SYN_Flood', 20: 'DoS-TCP_Flood', 21: 'DoS-UDP_Flood', 22: 'MITM-ArpSpoofing', 23: 'Mirai-greeth_flood', 24: 'Mirai-greip_flood', 25: 'Mirai-udpplain', 26: 'Recon-HostDiscovery', 27: 'Recon-OSScan', 28: 'Recon-PingSweep', 29: 'Recon-PortScan', 30: 'SqlInjection', 31: 'Uploading_Attack', 32: 'VulnerabilityScan', 33: 'XSS'}
After encoding:
First instance of Backdoor_Malware (code 0):
flow_duration      6.500640e-02
Header_Length      1.962000e+02
Protocol Type      8.100000e+00
Duration           6.530000e+01
Rate

Save a copy of the label-encoded (not yet scaled) dataset for later comparison with the resampled data

In [4]:
# Independent snapshot of full_data as it is at this point; it is used further down
# as the 'Original Data' side of the profiling comparison.
original_data = full_data.copy(deep=True)
original_data.head(n=5)

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
67720,0.0,54.0,6.0,64.0,331.570448,331.570448,0.0,0.0,0.0,0.0,...,0.0,54.0,83313710.0,9.5,10.392305,0.0,0.0,0.0,141.55,8
135264,2.397592,108.0,6.0,64.0,0.878495,0.878495,0.0,0.0,1.0,0.0,...,0.0,54.0,83362570.0,9.5,10.392305,0.0,0.0,0.0,141.55,12
43905,26.076783,16852.8,8.2,100.9,5.195949,5.195949,0.0,0.0,0.0,0.0,...,336.169649,392.3,166612600.0,13.5,20.679959,476.700042,128421.00025,1.0,244.6,22
149469,0.033693,21225.0,17.0,64.0,12586.401367,12586.401367,0.0,0.0,0.0,0.0,...,0.0,50.0,83011560.0,9.5,10.0,0.0,0.0,0.0,141.55,21
154644,0.121074,27930.0,17.0,64.0,4499.648197,4499.648197,0.0,0.0,0.0,0.0,...,0.0,50.0,83015700.0,9.5,10.0,0.0,0.0,0.0,141.55,21


## Scaling Numeric Features

In [5]:
# feature scaling
scaler = StandardScaler()

# Accumulate the scaling statistics over every selected file. `fit` discards whatever
# an earlier call learned, so calling it in a loop kept only the last file's
# statistics; `partial_fit` updates the running statistics instead.
for data_set in tqdm(data_sets):
    data_path = os.path.join(DATASET_DIRECTORY, data_set)
    scaler.partial_fit(pd.read_csv(data_path)[num_cols])

# Scale the numeric features in the dataframe; columns outside num_cols are untouched.
# NOTE: full_data is modified in place, so re-running this cell scales it a second time.
full_data[num_cols] = scaler.transform(full_data[num_cols])

# Show the scaled frame to verify the changes
print(full_data)

# prep the data to be inputted into model
data = full_data

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.28it/s]

        flow_duration  Header_Length  Protocol Type  Duration      Rate  \
67720       -0.023879      -0.165754           6.00 -0.165803 -0.089485   
135264      -0.011876      -0.165636           6.00 -0.165803 -0.092879   
43905        0.106661      -0.129041           8.20  2.496330 -0.092835   
149469      -0.023710      -0.119485          17.00 -0.165803  0.036320   
154644      -0.023273      -0.104831          17.00 -0.165803 -0.046696   
...               ...            ...            ...       ...       ...   
229119      -0.023879      -0.165757           5.88 -0.258148 -0.092751   
5192        -0.023879      -0.165872           1.00 -0.165803 -0.091683   
208780       0.127273      -0.156863           7.10 -1.146968 -0.092725   
229611      -0.023879      -0.165865           1.31 -0.126845 -0.092875   
128037      -0.023879      -0.165872           1.00 -0.165803 -0.092800   

           Srate  Drate  fin_flag_number  syn_flag_number  rst_flag_number  \
67720  -0.089485    0




## Preprocessed DataFrame

In [6]:
# Show the first five rows of full_data as the cell output.
full_data.head()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
67720,-0.023879,-0.165754,6.0,-0.165803,-0.089485,-0.089485,0.0,0.0,0.0,0.0,...,-0.205241,-0.291016,0.006133,0.000461,-0.314817,-0.205072,-0.093862,-0.411954,0.000492,8
135264,-0.011876,-0.165636,6.0,-0.165803,-0.092879,-0.092879,0.0,0.0,1.0,0.0,...,-0.205241,-0.291016,0.00901,0.000461,-0.314817,-0.205072,-0.093862,-0.411954,0.000492,12
43905,0.106661,-0.129041,8.2,2.49633,-0.092835,-0.092835,0.0,0.0,0.0,0.0,...,1.856127,1.114355,4.909739,4.90586,0.88242,1.861377,0.285264,3.904715,4.910972,22
149469,-0.02371,-0.119485,17.0,-0.165803,0.03632,0.03632,0.0,0.0,0.0,0.0,...,-0.205241,-0.307633,-0.011653,0.000461,-0.360472,-0.205072,-0.093862,-0.411954,0.000492,21
154644,-0.023273,-0.104831,17.0,-0.165803,-0.046696,-0.046696,0.0,0.0,0.0,0.0,...,-0.205241,-0.307633,-0.01141,0.000461,-0.360472,-0.205072,-0.093862,-0.411954,0.000492,21


# Sampling (SMOTE Over-Sampling)

In [9]:
# Size of the rarest class; it limits how many same-class neighbours SMOTE can use.
min_class_size = full_data['label'].value_counts().min()
if min_class_size < 2:
    raise ValueError(
        "SMOTENC needs at least 2 samples in every class; "
        f"the smallest class has {min_class_size}"
    )

# SMOTE interpolates between a sample and its k nearest same-class neighbours, so k
# can be at most (class size - 1). Keep the library default of 5 whenever the data
# allows it and shrink it only for very small classes.
k_neighbors = int(min(5, min_class_size - 1))

# Apply SMOTE-NC over-sampling (SMOTE for mixed nominal and continuous features);
# the columns in cat_cols are treated as categorical.
smote_os = SMOTENC(categorical_features=cat_cols, k_neighbors=k_neighbors, random_state=42)

X = full_data.drop('label', axis=1)
y = full_data['label']
X_resampled, y_res = smote_os.fit_resample(X, y)

# Combine the resampled features and labels back into a single DataFrame
full_data_resampled = pd.DataFrame(X_resampled, columns=X.columns)
full_data_resampled['label'] = y_res

print("Resampled Data (SCALED):")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(full_data_resampled[full_data_resampled['label'] == code].iloc[0])

full_data_resampled.head()

smote built
smote fit
Resampled Data (SCALED):
First instance of Backdoor_Malware (code 0):
flow_duration     -0.023553
Header_Length     -0.165444
Protocol Type      8.100000
Duration          -0.072015
Rate              -0.092082
Srate             -0.092082
Drate              0.000000
fin_flag_number    0.000000
syn_flag_number    0.000000
rst_flag_number    0.000000
psh_flag_number    0.000000
ack_flag_number    0.000000
ece_flag_number    0.000000
cwr_flag_number    0.000000
ack_count          1.784888
syn_count          1.015885
fin_count         -0.332220
urg_count         -0.084721
rst_count         -0.111528
HTTP               0.000000
HTTPS              0.000000
DNS                0.000000
Telnet             0.000000
SMTP               0.000000
SSH                0.000000
IRC                0.000000
TCP                0.000000
UDP                0.000000
DHCP               0.000000
ARP                0.000000
ICMP               0.000000
IPv                1.000000
LLC         

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.023879,-0.165754,6.0,-0.165803,-0.089485,-0.089485,0.0,0.0,0.0,0.0,...,-0.205241,-0.291016,0.006133,0.000461,-0.314817,-0.205072,-0.093862,-0.411954,0.000492,8
1,-0.011876,-0.165636,6.0,-0.165803,-0.092879,-0.092879,0.0,0.0,1.0,0.0,...,-0.205241,-0.291016,0.00901,0.000461,-0.314817,-0.205072,-0.093862,-0.411954,0.000492,12
2,0.106661,-0.129041,8.2,2.49633,-0.092835,-0.092835,0.0,0.0,0.0,0.0,...,1.856127,1.114355,4.909739,4.90586,0.88242,1.861377,0.285264,3.904715,4.910972,22
3,-0.02371,-0.119485,17.0,-0.165803,0.03632,0.03632,0.0,0.0,0.0,0.0,...,-0.205241,-0.307633,-0.011653,0.000461,-0.360472,-0.205072,-0.093862,-0.411954,0.000492,21
4,-0.023273,-0.104831,17.0,-0.165803,-0.046696,-0.046696,0.0,0.0,0.0,0.0,...,-0.205241,-0.307633,-0.01141,0.000461,-0.360472,-0.205072,-0.093862,-0.411954,0.000492,21


# Post-Processing
## Inverse-Scaling

In [10]:
# Map the scaled numeric columns back to their original units with the fitted scaler.
# NOTE: full_data_resampled is modified in place, so re-running this cell applies the
# inverse transform a second time.
full_data_resampled[num_cols] = scaler.inverse_transform(full_data_resampled[num_cols], copy=None)

print("Resampled Data (UNSCALED):")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(full_data_resampled[full_data_resampled['label'] == code].iloc[0])

# Count negative cells with a single vectorised comparison rather than a Python-level
# loop over every cell of the frame.
print("Number of negative values in DataFrame: ")
negative_count = int((full_data_resampled.to_numpy() < 0).sum())
print(negative_count)

full_data_resampled.head()

Resampled Data (UNSCALED):
First instance of Backdoor_Malware (code 0):
flow_duration      6.500640e-02
Header_Length      1.962000e+02
Protocol Type      8.100000e+00
Duration           6.530000e+01
Rate               7.857334e+01
Srate              7.857334e+01
Drate              0.000000e+00
fin_flag_number    0.000000e+00
syn_flag_number    0.000000e+00
rst_flag_number    0.000000e+00
psh_flag_number    0.000000e+00
ack_flag_number    0.000000e+00
ece_flag_number    0.000000e+00
cwr_flag_number    0.000000e+00
ack_count          6.000000e-01
syn_count          1.000000e+00
fin_count          0.000000e+00
urg_count          0.000000e+00
rst_count          2.000000e+00
HTTP               0.000000e+00
HTTPS              0.000000e+00
DNS                0.000000e+00
Telnet             0.000000e+00
SMTP               0.000000e+00
SSH                0.000000e+00
IRC                0.000000e+00
TCP                0.000000e+00
UDP                0.000000e+00
DHCP               0.000000e+00


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.0,54.0,6.0,64.0,331.570448,331.570448,0.0,0.0,0.0,0.0,...,0.0,54.0,83313710.0,9.5,10.392305,0.0,0.0,0.0,141.55,8
1,2.397592,108.0,6.0,64.0,0.878495,0.878495,0.0,0.0,1.0,0.0,...,0.0,54.0,83362570.0,9.5,10.392305,0.0,0.0,0.0,141.55,12
2,26.076783,16852.8,8.2,100.9,5.195949,5.195949,0.0,0.0,0.0,0.0,...,336.169649,392.3,166612600.0,13.5,20.679959,476.700042,128421.00025,1.0,244.6,22
3,0.033693,21225.0,17.0,64.0,12586.401367,12586.401367,0.0,0.0,0.0,0.0,...,0.0,50.0,83011560.0,9.5,10.0,0.0,0.0,0.0,141.55,21
4,0.121074,27930.0,17.0,64.0,4499.648197,4499.648197,0.0,0.0,0.0,0.0,...,0.0,50.0,83015700.0,9.5,10.0,0.0,0.0,0.0,141.55,21


## Synthetic Data Analysis

In [13]:
# Summary statistics of the resampled frame, shown as the cell output.
full_data_resampled.describe()

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
count,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0,...,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0,1234064.0
mean,96.45946,219690.6,10.45673,79.02295,5088.202,5088.202,2.004325e-06,0.02938583,0.1140014,0.04794079,...,191.9804,334.411,86582800.0,9.652706,21.61972,271.4309,162936.3,0.5507287,145.4883,16.5
std,752.6567,774267.7,9.83422,28.78547,68110.89,68110.89,0.001861726,0.1688856,0.3178131,0.2136411,...,308.4139,408.9665,52877620.0,2.534764,13.29819,436.36,547244.6,0.4374627,65.28038,9.810712
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,42.0,4.768372e-07,1.0,9.165151,0.0,0.0,0.0,1.0,0.0
25%,0.005556011,58.06,6.0,64.0,4.91328,4.91328,0.0,0.0,0.0,0.0,...,0.0,54.12,82999090.0,9.5,10.39989,0.0,0.0,0.0,141.55,8.0
50%,2.044092,4014.606,6.11,64.7524,25.64525,25.64525,0.0,0.0,0.0,0.0,...,49.1107,126.6503,83314530.0,9.5,15.65987,69.30086,3889.396,0.8272346,141.55,16.5
75%,37.51861,36738.96,11.5,85.47141,115.3511,115.3511,0.0,0.0,0.0,0.0,...,277.863,554.6838,83759060.0,9.5,33.26893,392.5482,142064.3,0.95,141.55,25.0
max,47871.08,9618315.0,47.0,255.0,7340032.0,7340032.0,2.059601,1.0,1.0,1.0,...,6687.263,4702.9,167639400.0,14.0,120.9778,9476.111,45163320.0,1.0,244.6,33.0


In [16]:
from ydata_profiling import ProfileReport

# Destination of the rendered comparison report.
REPORT_PATH = './profile_reports/smote_original_vs_resampled.html'

# Make sure the output folder exists before any profiling work starts: to_file() is
# not relied upon to create it, and a missing folder should not cost a full run.
os.makedirs(os.path.dirname(REPORT_PATH), exist_ok=True)

# Profile the snapshot taken before scaling/resampling and the resampled frame
# (minimal mode), then render both side by side in a single HTML report.
original_report = ProfileReport(original_data, title='Original Data', minimal=True)
resampled_report = ProfileReport(full_data_resampled, title='Resampled Data', minimal=True)
comparison_report = original_report.compare(resampled_report)
comparison_report.to_file(REPORT_PATH)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]