## Imports

In [1]:
from imblearn.over_sampling import RandomOverSampler  # naive random over-sampling (re-draws existing minority-class rows; not SMOTE)
import numpy as np
import os
import pandas as pd
import random
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle
from tqdm import tqdm

# Loading

In [2]:
DATASET_DIRECTORY = '../../dataset/'  # If your dataset is within your python project directory, change this to the relative path to your dataset

# os.listdir() returns entries in arbitrary order; sort first so that the seeded
# sampling below selects the same files on every run and on every machine.
csv_filepaths = sorted(filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv'))

print(csv_filepaths)

# If there are more than X CSV files, randomly select X files from the list
sample_size = 1
SAMPLE_SEED = 1  # fixed seed so the file selection is reproducible across runs

if len(csv_filepaths) > sample_size:
    # A private Random instance keeps the global `random` state untouched.
    csv_filepaths = random.Random(SAMPLE_SEED).sample(csv_filepaths, sample_size)
    print(csv_filepaths)

csv_filepaths.sort()

# list of csv files used
data_sets = csv_filepaths

if not data_sets:
    # Fail early with a clear message instead of a KeyError on 'label' further down.
    raise FileNotFoundError(f"No CSV files found in {DATASET_DIRECTORY!r}")

# Columns that get standardised later on.
num_cols = [
    'flow_duration', 'Header_Length', 'Duration', 'Rate', 'Srate', 'ack_count', 'syn_count',
    'fin_count', 'urg_count', 'rst_count', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size',
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight'
]
# Columns that are left unscaled.
cat_cols = [
    'Protocol Type', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
    'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number', 'HTTP', 'HTTPS',
    'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC'
]

# Read every selected file, then concatenate once: growing a DataFrame with
# pd.concat inside the loop re-copies all previously loaded rows on each iteration.
frames = []
for position, data_set in enumerate(data_sets, start=1):
    print(f"data set {position} out of {len(data_sets)}: {data_set} \n")
    data_path = os.path.join(DATASET_DIRECTORY, data_set)
    df = pd.read_csv(data_path)
    frames.append(df)
full_data = pd.concat(frames)  # per-file row indices are kept, as before

# prints an instance of each class
print("Before encoding:")
unique_labels = full_data['label'].unique()
for label in unique_labels:
    print(f"First instance of {label}:")
    print(full_data[full_data['label'] == label].iloc[0])

# Shuffle data
full_data = shuffle(full_data, random_state=1)

# prove if the data is loaded properly
print("Real data:")
print(full_data[:2])
print(full_data.shape)

# 'label' is the column holding the class labels in `full_data`.
# Note: `unique_labels` is rebound here from the array of labels to their count.
unique_labels = full_data['label'].nunique()

# Print the number of unique labels
print(f"There are {unique_labels} unique labels in the dataset.")

class_counts = full_data['label'].value_counts()
print(class_counts)

# Summary statistics of the loaded data
full_data.describe()

['part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00014-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00015-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00016-363d1ba3-8ab5-4f96-bc25-4d5

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
count,231907.0,231907.0,231907.0,231907.0,231907.0,231907.0,231907.0,231907.0,231907.0,231907.0,...,231907.0,231907.0,231907.0,231907.0,231907.0,231907.0,231907.0,231907.0,231907.0,231907.0
mean,5.444153,77261.18,9.064799,66.333007,8898.707,8898.707,1e-06,0.085978,0.208006,0.090153,...,124.764024,33.436894,124.832625,83200760.0,9.499281,13.126537,47.248926,31044.44,0.096515,141.532313
std,266.687177,464312.8,8.924316,13.971176,96109.83,96109.83,0.000392,0.280333,0.405882,0.286401,...,240.285567,161.275642,240.963043,16995250.0,0.816364,8.629896,228.101589,314622.7,0.232806,21.001456
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0
25%,0.0,54.0,6.0,64.0,2.094615,2.094615,0.0,0.0,0.0,0.0,...,50.0,0.0,50.0,83071570.0,9.5,10.0,0.0,0.0,0.0,141.55
50%,0.0,54.0,6.0,64.0,15.79442,15.79442,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,83124540.0,9.5,10.392305,0.0,0.0,0.0,141.55
75%,0.107725,313.62,14.57,64.0,119.3428,119.3428,0.0,0.0,0.0,0.0,...,54.050113,0.384222,54.06,83344010.0,9.5,10.396725,0.517043,1.491975,0.08,141.55
max,61593.854157,9656095.0,47.0,255.0,6291456.0,6291456.0,0.180698,1.0,1.0,1.0,...,4850.548254,5471.876023,5744.5,167639400.0,14.5,96.991257,7750.386473,30146330.0,1.0,244.6


# Preprocessing
## Encoding Labels

In [3]:
# Replace the string class names in 'label' with integer codes.
label_encoder = LabelEncoder()
full_data['label'] = label_encoder.fit_transform(full_data['label'])

# code -> class name, in the encoder's class order
label_mapping = dict(enumerate(label_encoder.classes_))
print("Label mappings:", label_mapping)

# class name -> code, obtained with a single transform() over all known classes
class_codes = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Show one encoded row per class as a sanity check
print("After encoding:")
for label, code in class_codes.items():
    print(f"First instance of {label} (code {code}):")
    rows_of_class = full_data[full_data['label'] == code]
    print(rows_of_class.iloc[0])

Label mappings: {0: 'Backdoor_Malware', 1: 'BenignTraffic', 2: 'BrowserHijacking', 3: 'CommandInjection', 4: 'DDoS-ACK_Fragmentation', 5: 'DDoS-HTTP_Flood', 6: 'DDoS-ICMP_Flood', 7: 'DDoS-ICMP_Fragmentation', 8: 'DDoS-PSHACK_Flood', 9: 'DDoS-RSTFINFlood', 10: 'DDoS-SYN_Flood', 11: 'DDoS-SlowLoris', 12: 'DDoS-SynonymousIP_Flood', 13: 'DDoS-TCP_Flood', 14: 'DDoS-UDP_Flood', 15: 'DDoS-UDP_Fragmentation', 16: 'DNS_Spoofing', 17: 'DictionaryBruteForce', 18: 'DoS-HTTP_Flood', 19: 'DoS-SYN_Flood', 20: 'DoS-TCP_Flood', 21: 'DoS-UDP_Flood', 22: 'MITM-ArpSpoofing', 23: 'Mirai-greeth_flood', 24: 'Mirai-greip_flood', 25: 'Mirai-udpplain', 26: 'Recon-HostDiscovery', 27: 'Recon-OSScan', 28: 'Recon-PingSweep', 29: 'Recon-PortScan', 30: 'SqlInjection', 31: 'Uploading_Attack', 32: 'VulnerabilityScan', 33: 'XSS'}
After encoding:
First instance of Backdoor_Malware (code 0):
flow_duration        32.146720
Header_Length      5884.900000
Protocol Type         8.200000
Duration            181.200000
Rate    

Save a copy of the label-encoded (not yet scaled) dataset for later comparison with the resampled data.

In [4]:
# Independent snapshot of the frame as it is at this point (labels encoded,
# features not yet scaled); used later as the baseline in the comparison report.
original_data = full_data.copy(deep=True)
original_data.head(n=5)

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
133359,0.0,54.0,6.0,64.0,8.851011,8.851011,0.0,0.0,0.0,0.0,...,0.0,54.0,82951110.0,9.5,10.392305,0.0,0.0,0.0,141.55,20
117276,0.015947,11095.0,17.0,64.0,95686.873845,95686.873845,0.0,0.0,0.0,0.0,...,0.0,50.0,83106620.0,9.5,10.0,0.0,0.0,0.0,141.55,14
118279,0.024685,145.49,44.07,67.43,27.940284,27.940284,0.0,0.0,0.0,0.0,...,70.96383,556.59,83681370.0,9.5,33.265566,100.500185,13852.124497,0.39,141.55,23
82515,0.0,54.0,6.0,64.0,0.881997,0.881997,0.0,0.0,1.0,0.0,...,0.0,54.0,83093180.0,9.5,10.392305,0.0,0.0,0.0,141.55,10
183199,0.0,182.0,17.0,64.0,1.675658,1.675658,0.0,0.0,0.0,0.0,...,0.0,182.0,83007040.0,9.5,19.078784,0.0,0.0,0.0,141.55,21


## Scaling Numeric Features

In [5]:
# feature scaling
scaler = StandardScaler()

for data_set in tqdm(data_sets):
    scaler.fit(pd.read_csv(DATASET_DIRECTORY + data_set)[num_cols])
    
# Scale the features in the dataframe
full_data[num_cols] = scaler.transform(full_data[num_cols])

# Display the first few entries to verify the changes
print(full_data)

# prep the data to be inputted into model
data = full_data

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.07it/s]

        flow_duration  Header_Length  Protocol Type  Duration      Rate  \
133359      -0.020414      -0.166283           6.00 -0.166988 -0.092497   
117276      -0.020354      -0.142504          17.00 -0.166988  0.903012   
118279      -0.020321      -0.166086          44.07  0.078518 -0.092298   
82515       -0.020414      -0.166283           6.00 -0.166988 -0.092580   
183199      -0.020414      -0.166007          17.00 -0.166988 -0.092572   
...               ...            ...            ...       ...       ...   
229119      -0.020414      -0.166283           6.00 -0.166988 -0.092574   
5192         0.099701       1.183294           8.20  0.262469 -0.091841   
208780      -0.020054      -0.112330          17.00 -0.166988  0.019649   
229611      -0.019302      -0.114194          17.00 -0.166988 -0.066236   
128037      -0.017954       0.000325           6.00 -0.166988 -0.090694   

           Srate  Drate  fin_flag_number  syn_flag_number  rst_flag_number  \
133359 -0.092497    0




## Preprocessed DataFrame

In [6]:
# First rows after label encoding and scaling of the numeric columns
full_data.head(n=5)

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
133359,-0.020414,-0.166283,6.0,-0.166988,-0.092497,-0.092497,0.0,0.0,0.0,0.0,...,-0.207328,-0.293957,-0.01469,0.000881,-0.316833,-0.20714,-0.098672,-0.414572,0.000842,20
117276,-0.020354,-0.142504,17.0,-0.166988,0.903012,0.903012,0.0,0.0,0.0,0.0,...,-0.207328,-0.310557,-0.005539,0.000881,-0.362292,-0.20714,-0.098672,-0.414572,0.000842,14
118279,-0.020321,-0.166086,44.07,0.078518,-0.092298,-0.092298,0.0,0.0,0.0,0.0,...,0.232689,1.791803,0.028279,0.000881,2.33364,0.233455,-0.054644,1.260648,0.000842,23
82515,-0.020414,-0.166283,6.0,-0.166988,-0.09258,-0.09258,0.0,0.0,1.0,0.0,...,-0.207328,-0.293957,-0.00633,0.000881,-0.316833,-0.20714,-0.098672,-0.414572,0.000842,10
183199,-0.020414,-0.166007,17.0,-0.166988,-0.092572,-0.092572,0.0,0.0,0.0,0.0,...,-0.207328,0.237246,-0.011399,0.000881,0.689726,-0.20714,-0.098672,-0.414572,0.000842,21


# Sampling (Naive-Random Over-Sampling)

In [8]:
# Size of the rarest class (kept for reference; not used by the sampler below)
label_counts = full_data['label'].value_counts()
min_class_size = label_counts.min()

# Split into features / target
X = full_data.drop(columns='label')
y = full_data['label']

# Apply random over-sampling with a fixed seed
ros = RandomOverSampler(random_state=42)
X_resampled, y_res = ros.fit_resample(X, y)

# Re-attach the resampled labels to the resampled features in one DataFrame
full_data_resampled = pd.DataFrame(X_resampled, columns=X.columns).assign(label=y_res)

# Show one (still scaled) row per class
print("Resampled Data (SCALED):")
for label, code in class_codes.items():
    print(f"First instance of {label} (code {code}):")
    rows_of_class = full_data_resampled[full_data_resampled['label'] == code]
    print(rows_of_class.iloc[0])

full_data_resampled.head()

Resampled Data (SCALED):
First instance of Backdoor_Malware (code 0):
flow_duration      0.100127
Header_Length     -0.153725
Protocol Type      8.200000
Duration           8.221730
Rate              -0.087261
Srate             -0.087261
Drate              0.000000
fin_flag_number    0.000000
syn_flag_number    0.000000
rst_flag_number    0.000000
psh_flag_number    1.000000
ack_flag_number    1.000000
ece_flag_number    0.000000
cwr_flag_number    0.000000
ack_count         -0.315163
syn_count         -0.499627
fin_count         -0.329063
urg_count          0.051572
rst_count         -0.069568
HTTP               0.000000
HTTPS              0.000000
DNS                0.000000
Telnet             0.000000
SMTP               0.000000
SSH                0.000000
IRC                0.000000
TCP                1.000000
UDP                0.000000
DHCP               0.000000
ARP                0.000000
ICMP               0.000000
IPv                1.000000
LLC                1.000000
Tot su

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,-0.020414,-0.166283,6.0,-0.166988,-0.092497,-0.092497,0.0,0.0,0.0,0.0,...,-0.207328,-0.293957,-0.01469,0.000881,-0.316833,-0.20714,-0.098672,-0.414572,0.000842,20
1,-0.020354,-0.142504,17.0,-0.166988,0.903012,0.903012,0.0,0.0,0.0,0.0,...,-0.207328,-0.310557,-0.005539,0.000881,-0.362292,-0.20714,-0.098672,-0.414572,0.000842,14
2,-0.020321,-0.166086,44.07,0.078518,-0.092298,-0.092298,0.0,0.0,0.0,0.0,...,0.232689,1.791803,0.028279,0.000881,2.33364,0.233455,-0.054644,1.260648,0.000842,23
3,-0.020414,-0.166283,6.0,-0.166988,-0.09258,-0.09258,0.0,0.0,1.0,0.0,...,-0.207328,-0.293957,-0.00633,0.000881,-0.316833,-0.20714,-0.098672,-0.414572,0.000842,10
4,-0.020414,-0.166007,17.0,-0.166988,-0.092572,-0.092572,0.0,0.0,0.0,0.0,...,-0.207328,0.237246,-0.011399,0.000881,0.689726,-0.20714,-0.098672,-0.414572,0.000842,21


# Post-Processing
## Inverse-Scaling

In [9]:
# Map the standardised numeric columns back to their original units using the
# scaler fitted earlier; the remaining columns were never scaled.
full_data_resampled[num_cols] = scaler.inverse_transform(full_data_resampled[num_cols], copy=None)

print("Resampled Data (UNSCALED):")
for label, code in class_codes.items():
    # Print the first instance of each class
    print(f"First instance of {label} (code {code}):")
    print(full_data_resampled[full_data_resampled['label'] == code].iloc[0])

# Count negatives with one vectorised comparison instead of a Python-level loop
# over every cell of the (rows x columns) table.
print("Number of negative values in DataFrame: ")
print(int((full_data_resampled.to_numpy() < 0).sum()))

full_data_resampled.head()

Resampled Data (UNSCALED):
First instance of Backdoor_Malware (code 0):
flow_duration        32.146720
Header_Length      5884.900000
Protocol Type         8.200000
Duration            181.200000
Rate                512.130305
Srate               512.130305
Drate                 0.000000
fin_flag_number       0.000000
syn_flag_number       0.000000
rst_flag_number       0.000000
psh_flag_number       1.000000
ack_flag_number       1.000000
ece_flag_number       0.000000
cwr_flag_number       0.000000
ack_count             0.000000
syn_count             0.000000
fin_count             0.000000
urg_count             9.800000
rst_count            15.500000
HTTP                  0.000000
HTTPS                 0.000000
DNS                   0.000000
Telnet                0.000000
SMTP                  0.000000
SSH                   0.000000
IRC                   0.000000
TCP                   1.000000
UDP                   0.000000
DHCP                  0.000000
ARP                   0.00000

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.0,54.0,6.0,64.0,8.851011,8.851011,0.0,0.0,0.0,0.0,...,0.0,54.0,82951110.0,9.5,10.392305,0.0,0.0,0.0,141.55,20
1,0.015947,11095.0,17.0,64.0,95686.873845,95686.873845,0.0,0.0,0.0,0.0,...,0.0,50.0,83106620.0,9.5,10.0,0.0,0.0,0.0,141.55,14
2,0.024685,145.49,44.07,67.43,27.940284,27.940284,0.0,0.0,0.0,0.0,...,70.96383,556.59,83681370.0,9.5,33.265566,100.500185,13852.124497,0.39,141.55,23
3,0.0,54.0,6.0,64.0,0.881997,0.881997,0.0,0.0,1.0,0.0,...,0.0,54.0,83093180.0,9.5,10.392305,0.0,0.0,0.0,141.55,10
4,0.0,182.0,17.0,64.0,1.675658,1.675658,0.0,0.0,0.0,0.0,...,0.0,182.0,83007040.0,9.5,19.078784,0.0,0.0,0.0,141.55,21


## Synthetic Data Analysis

In [11]:
# Summary statistics of the over-sampled data (default quartiles spelled out)
full_data_resampled.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
count,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0,...,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0,1210570.0
mean,148.8601,219029.8,10.38558,79.60601,6009.604,6009.604,5.616543e-07,0.02986362,0.1133243,0.04994094,...,178.0459,328.8881,85718220.0,9.611568,21.18187,251.6979,143466.0,0.549659,144.4265,16.5
std,1179.789,800611.5,9.810462,32.15884,67607.33,67607.33,0.0002437147,0.1702111,0.3169889,0.217823,...,297.6414,425.8133,53529040.0,2.565652,13.45449,421.0627,517770.0,0.4399838,66.08589,9.810712
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,42.0,0.0,1.0,9.165151,0.0,0.0,0.0,1.0,0.0
25%,0.002677339,58.0,6.0,64.0,3.041885,3.041885,0.0,0.0,0.0,0.0,...,0.0,54.06,82985950.0,9.5,10.39618,0.0,0.0,0.0,141.55,8.0
50%,1.241577,3172.2,6.16,64.0,20.87201,20.87201,0.0,0.0,0.0,0.0,...,41.47535,113.1,83314050.0,9.5,15.15516,58.57179,2703.756,0.84,141.55,16.5
75%,29.1492,33347.0,11.5,83.1,108.3257,108.3257,0.0,0.0,0.0,0.0,...,232.0932,554.0,83707270.0,9.5,33.02609,327.9464,92788.66,0.95,141.55,25.0
max,61593.85,9656095.0,47.0,255.0,6291456.0,6291456.0,0.1806978,1.0,1.0,1.0,...,5471.876,5744.5,167639400.0,14.5,96.99126,7750.386,30146330.0,1.0,244.6,33.0


In [10]:
from ydata_profiling import ProfileReport

# Destination of the HTML comparison report.
REPORT_PATH = './profile_reports/random_original_vs_resampled.html'
# Make sure the output directory exists so the export cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(REPORT_PATH), exist_ok=True)

# Profile the snapshot taken before scaling and the over-sampled data, then
# render both side by side in a single report.
original_report = ProfileReport(original_data, title='Original Data', minimal=True)
resampled_report = ProfileReport(full_data_resampled, title='Resampled Data', minimal=True)
comparison_report = original_report.compare(resampled_report)
comparison_report.to_file(REPORT_PATH)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]