In [17]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

Python version: 3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]
Version info: sys.version_info(major=3, minor=10, micro=11, releaselevel='final', serial=0)


In [18]:
# Load the manifest that lists each capture CSV together with its labels.
file_list = pd.read_csv(filepath_or_buffer='../Dataset/file_list.csv')
file_list.head(n=5)

Unnamed: 0,File,Category,Attack,Class
0,ARP_Spoofing_train.pcap.csv,SPOOFING,ARP Spoofing,1
1,Benign_train.pcap.csv,BENIGN,Benign,0
2,MQTT-DDoS-Connect_Flood_train.pcap.csv,MQTT,DDoS Connect Flood,1
3,MQTT-DDoS-Publish_Flood_train.pcap.csv,MQTT,DDoS Publish Flood,1
4,MQTT-DoS-Connect_Flood_train.pcap.csv,MQTT,DoS Connect Flood,1


In [19]:
# Import the dataset
data_dir = "../Dataset"

# Resolve every listed file up front so that missing files are detected before
# anything is read.
listed_paths = [os.path.join(data_dir, name) for name in file_list['File']]
existing_paths = [path for path in listed_paths if os.path.exists(path)]
if not existing_paths:
    raise FileNotFoundError(
        f"None of the {len(listed_paths)} files listed in file_list exist under {data_dir!r}"
    )

# The shared column names are taken from the header of the first file that is
# actually present, so a missing first entry does not abort the whole load.
first_file_path = existing_paths[0]
column_names = pd.read_csv(first_file_path, nrows=0).columns.tolist()

datasets_list = []
for _, row in file_list.iterrows():
    file_path = os.path.join(data_dir, row['File'])
    if not os.path.exists(file_path):
        # Report and skip; the remaining files are still loaded.
        print(f"File not found: {file_path}")
        continue
    # Skip the file's own header row and apply the shared column names instead.
    data = pd.read_csv(file_path, header=None, skiprows=1, names=column_names)
    # Tag every row with the labels recorded for its source file.
    data['Category'] = row['Category']
    data['Attack'] = row['Attack']
    data['Class'] = row['Class']
    datasets_list.append(data)

# Combine all datasets into a single DataFrame
dataset = pd.concat(datasets_list, ignore_index=True)
dataset

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,Category,Attack,Class
0,866.600000,10.4,64.0,45722.390222,45722.390222,0.0,0.0,0.0,0.0,0.3,...,1.694026e+08,5.500000,16.963645,213.095221,66236.076476,0.900000,38.500000,SPOOFING,ARP Spoofing,1
1,3934.300000,12.6,131.2,35708.799475,35708.799475,0.0,0.0,0.0,0.0,0.3,...,1.694026e+08,13.500000,30.885371,708.919620,251721.126817,1.000000,244.600000,SPOOFING,ARP Spoofing,1
2,5592.800000,12.6,97.6,66.403506,66.403506,0.0,0.0,0.0,0.0,0.3,...,1.361110e-02,5.500000,21.787095,290.694475,84028.647525,0.900000,38.500000,SPOOFING,ARP Spoofing,1
3,9303.600000,14.8,80.8,51.201280,51.201280,0.0,0.0,0.0,0.0,0.1,...,1.694026e+08,13.500000,26.954506,597.046005,178453.001691,1.000000,244.600000,SPOOFING,ARP Spoofing,1
4,8592.400000,12.6,98.6,42.706455,42.706455,0.0,0.0,0.0,0.0,0.2,...,1.393099e-02,5.500000,24.255148,375.324132,80115.110731,0.900000,38.500000,SPOOFING,ARP Spoofing,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7160826,13234.910000,17.0,64.0,29460.045053,29460.045053,0.0,0.0,0.0,0.0,0.0,...,8.467524e+07,9.500000,10.164175,8.453393,279.410488,0.130000,141.550000,DoS,DoS UDP,1
7160827,18136.530000,17.0,64.0,29318.535060,29318.535060,0.0,0.0,0.0,0.0,0.0,...,8.467524e+07,9.500000,10.060924,3.012129,35.475423,0.130000,141.550000,DoS,DoS UDP,1
7160828,23038.040000,17.0,64.0,29363.556139,29363.556139,0.0,0.0,0.0,0.0,0.0,...,8.467524e+07,9.500000,10.075262,3.287396,34.963050,0.160000,141.550000,DoS,DoS UDP,1
7160829,58526.160000,17.0,64.0,26168.955034,26168.955034,0.0,0.0,0.0,0.0,0.0,...,8.467524e+07,9.500000,18.631072,205.641671,70932.249555,0.350000,141.550000,DoS,DoS UDP,1


In [20]:
# Number of rows per traffic category in the combined dataset.
dataset.loc[:, 'Category'].value_counts()

Category
DDoS        4779859
DoS         1805529
MQTT         262938
BENIGN       192732
RECON        103726
SPOOFING      16047
Name: count, dtype: int64

In [21]:
# Select 'Category' to perform multiclass classification.
# The label-based slice keeps every column up to and including 'Category', so the
# trailing 'Attack' and 'Class' columns are left out without hard-coding how many
# feature columns precede them.
selected_data = dataset.loc[:, :'Category']
selected_data.head()

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,Category
0,866.6,10.4,64.0,45722.390222,45722.390222,0.0,0.0,0.0,0.0,0.3,...,150.681076,431.8,169402600.0,5.5,16.963645,213.095221,66236.076476,0.9,38.5,SPOOFING
1,3934.3,12.6,131.2,35708.799475,35708.799475,0.0,0.0,0.0,0.0,0.3,...,500.702909,406.3,169402600.0,13.5,30.885371,708.91962,251721.126817,1.0,244.6,SPOOFING
2,5592.8,12.6,97.6,66.403506,66.403506,0.0,0.0,0.0,0.0,0.3,...,205.552035,386.6,0.0136111,5.5,21.787095,290.694475,84028.647525,0.9,38.5,SPOOFING
3,9303.6,14.8,80.8,51.20128,51.20128,0.0,0.0,0.0,0.0,0.1,...,421.68366,300.2,169402600.0,13.5,26.954506,597.046005,178453.001691,1.0,244.6,SPOOFING
4,8592.4,12.6,98.6,42.706455,42.706455,0.0,0.0,0.0,0.0,0.2,...,265.394239,209.2,0.01393099,5.5,24.255148,375.324132,80115.110731,0.9,38.5,SPOOFING


In [22]:
# Over sampling
samples = 100000
categories = ['SPOOFING', 'BENIGN', 'MQTT', 'RECON', 'DDoS', 'DoS']

# Every column except the last is a feature; the last column is the class label.
X = selected_data.iloc[:, :-1]
y = selected_data.iloc[:, -1]
class_counts = Counter(y)
print("Class distribution:", class_counts)

# Sampling strategies: classes below the target size are grown with SMOTE,
# classes above it are cut down; a class already at the target is left alone.
smote_strategy = {
    label: samples for label, n_rows in class_counts.items() if n_rows < samples
}
undersample_strategy = {
    label: samples for label, n_rows in class_counts.items() if n_rows > samples
}
print("SMOTE strategy:", smote_strategy)
print("Undersample strategy:", undersample_strategy)

# SMOTE
if not smote_strategy:
    X_smote, y_smote = X, y
else:
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_smote, y_smote = smote.fit_resample(X, y)

smote_data = pd.DataFrame(X_smote, columns=X.columns)
smote_data['Label'] = y_smote

# Random under sampling: start from the SMOTE output and only resample it
# when at least one class is over the target size.
X_balanced, y_balanced = smote_data.iloc[:, :-1], smote_data['Label']
if undersample_strategy:
    undersampler = RandomUnderSampler(sampling_strategy=undersample_strategy, random_state=42)
    X_balanced, y_balanced = undersampler.fit_resample(X_balanced, y_balanced)

processed_data = pd.DataFrame(X_balanced, columns=X.columns)
processed_data['Label'] = y_balanced

Class distribution: Counter({'DDoS': 4779859, 'DoS': 1805529, 'MQTT': 262938, 'BENIGN': 192732, 'RECON': 103726, 'SPOOFING': 16047})
SMOTE strategy: {'SPOOFING': 100000}
Undersample strategy: {'BENIGN': 100000, 'MQTT': 100000, 'RECON': 100000, 'DDoS': 100000, 'DoS': 100000}


In [23]:
# Number of rows per class after resampling.
processed_data.loc[:, 'Label'].value_counts()

Label
BENIGN      100000
DDoS        100000
DoS         100000
MQTT        100000
RECON       100000
SPOOFING    100000
Name: count, dtype: int64

In [24]:
# Assign clients ID
# Each row is assigned uniformly at random to one of the clients 1..n_clients.
# A seeded generator keeps the assignment identical across kernel restarts.
n_clients = 5
client_rng = np.random.default_rng(42)
# The upper bound of Generator.integers is exclusive, hence n_clients + 1.
processed_data["Client_ID"] = client_rng.integers(1, n_clients + 1, size=len(processed_data))
processed_data["Client_ID"].value_counts()

Client_ID
2    120216
3    120139
4    120055
5    120050
1    119540
Name: count, dtype: int64

In [25]:
# Encode the class labels
category_mapping = {
    'BENIGN': 0,
    'DDoS': 1,
    'DoS': 2,
    'MQTT': 3,
    'RECON': 4,
    'SPOOFING': 5
}

# Map into a temporary Series first so a failed mapping never overwrites 'Label'.
encoded_labels = processed_data['Label'].map(category_mapping)

# Series.map yields NaN for any value missing from the mapping. That happens for
# an unexpected category name, and also for every row if this cell is run a second
# time (the labels are then already integers), so stop instead of storing NaN labels.
unmapped_mask = encoded_labels.isna()
if unmapped_mask.any():
    unmapped_values = processed_data.loc[unmapped_mask, 'Label'].unique().tolist()
    raise ValueError(
        f"Labels not found in category_mapping: {unmapped_values}. "
        "If they are already integers, this cell has been run before."
    )

processed_data['Label'] = encoded_labels
print(processed_data['Label'].unique())

[0 1 2 3 4 5]


In [26]:
# Preview the first rows of processed_data.
processed_data.head()

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,Label,Client_ID
172764,163172.2,6.0,64.0,1.772551,1.772551,0.0,0.0,0.0,0.0,0.5,...,80.7,0.0001121998,5.5,12.652344,18.369118,202.737444,0.9,38.5,0,2
173803,266693.5,6.0,64.0,2.03362,2.03362,0.0,0.0,0.0,0.0,0.5,...,80.5,169465500.0,13.5,12.610831,23.221657,270.507686,1.0,244.6,0,1
105350,263942.0,9.3,82.0,26.90114,26.90114,0.0,0.0,0.0,0.0,0.3,...,162.1,0.007022214,5.5,19.139165,361.675101,83374.267649,0.9,38.5,0,2
106123,1462.4,6.0,64.0,10073.426205,10073.426205,0.0,0.0,0.0,0.0,0.5,...,80.7,169461900.0,13.5,12.405708,18.97892,182.7442,1.0,244.6,0,3
139945,157695.6,6.0,64.0,1.250658,1.250658,0.0,0.0,0.0,0.0,0.5,...,81.0,9.951591e-05,5.5,12.488825,20.610944,252.745802,0.9,38.5,0,1


In [27]:
# Normalize the data
# Min-max scale every feature column. 'Label' and 'Client_ID' are excluded by name
# rather than by position, so the selection stays correct if columns are added or
# reordered, and a KeyError is raised if either column is missing.
# NOTE(review): the scaler is fit on all rows; if a train/test split is made later,
# confirm that fitting on the full data is intended.
scaler = MinMaxScaler()
normalize_data = processed_data.columns.drop(['Label', 'Client_ID'])
processed_data[normalize_data] = scaler.fit_transform(processed_data[normalize_data])
processed_data.head()

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,Label,Client_ID
172764,0.016489,0.352941,0.25098,8.452181e-07,8.452181e-07,0.0,0.0,0.0,0.0,0.5,...,0.026291,7.56581e-09,0.321429,0.076036,0.018005,0.00039,0.9,0.153941,0,2
173803,0.026951,0.352941,0.25098,9.697056e-07,9.697056e-07,0.0,0.0,0.0,0.0,0.5,...,0.026155,0.9999767,0.892857,0.075131,0.022761,0.00052,1.0,1.0,0,1
105350,0.026673,0.547059,0.321569,1.282746e-05,1.282746e-05,0.0,0.0,0.0,0.0,0.3,...,0.08159,7.606584e-09,0.321429,0.217478,0.354503,0.1602,0.9,0.153941,0,2
106123,0.000148,0.352941,0.25098,0.004803384,0.004803384,0.0,0.0,0.0,0.0,0.5,...,0.026291,0.9999554,0.892857,0.070659,0.018603,0.000351,1.0,1.0,0,3
139945,0.015936,0.352941,0.25098,5.963604e-07,5.963604e-07,0.0,0.0,0.0,0.0,0.5,...,0.026495,7.565735e-09,0.321429,0.072471,0.020202,0.000486,0.9,0.153941,0,1


In [28]:
# Shuffle the dataset
# frac=1 returns every row in a random order; the fixed random_state makes that
# order (and therefore the exported file) reproducible from a fresh kernel.
processed_data = processed_data.sample(frac=1, random_state=42).reset_index(drop=True)
processed_data.head()

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,Label,Client_ID
0,5.5e-05,0.352941,0.25098,5e-06,5e-06,0.0,0.16,0.09,0.33,0.15,...,0.015836,0.499601,0.607143,0.04948,0.011763,0.00016,0.94,0.57697,3,1
1,9e-06,0.352941,0.25098,8e-06,8e-06,0.0,0.0,1.0,0.0,0.0,...,0.009864,0.499774,0.607143,0.031875,0.000465,1e-06,0.19,0.57697,1,1
2,0.004105,1.0,0.25098,0.011914,0.011914,0.0,0.0,0.0,0.0,0.0,...,0.005435,0.49967,0.607143,0.018203,0.0,0.0,0.0,0.57697,2,4
3,0.001876,0.280896,0.186642,8e-06,8e-06,0.0,0.0,0.0,0.0,0.357585,...,0.037572,0.999606,0.892857,0.135256,0.211664,0.046826,1.0,1.0,5,3
4,0.004661,0.352941,0.25098,3.9e-05,3.9e-05,0.0,0.0,0.0,0.0,0.71,...,0.08591,0.499597,0.607143,0.19866,0.059434,0.005015,0.74,0.57697,3,5


In [29]:
# Print column dtypes and non-null counts for processed_data.
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 47 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Header_Length    600000 non-null  float64
 1   Protocol Type    600000 non-null  float64
 2   Duration         600000 non-null  float64
 3   Rate             600000 non-null  float64
 4   Srate            600000 non-null  float64
 5   Drate            600000 non-null  float64
 6   fin_flag_number  600000 non-null  float64
 7   syn_flag_number  600000 non-null  float64
 8   rst_flag_number  600000 non-null  float64
 9   psh_flag_number  600000 non-null  float64
 10  ack_flag_number  600000 non-null  float64
 11  ece_flag_number  600000 non-null  float64
 12  cwr_flag_number  600000 non-null  float64
 13  ack_count        600000 non-null  float64
 14  syn_count        600000 non-null  float64
 15  fin_count        600000 non-null  float64
 16  rst_count        600000 non-null  floa

In [30]:
# Summary statistics for the numeric columns of processed_data.
processed_data.describe()

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,Label,Client_ID
count,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,...,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0
mean,0.050926,0.453399,0.270554,0.002799,0.002799,0.0,0.023437,0.158319,0.110421,0.143435,...,0.08498,0.4997047,0.607021,0.128803,0.073177,0.039158,0.461635,0.576791,2.5,3.001432
std,0.145736,0.274197,0.114028,0.012767,0.012767,0.0,0.070064,0.321652,0.26336,0.211294,...,0.198401,0.3546444,0.202708,0.215099,0.164838,0.11789,0.443954,0.300082,1.707827,1.413407
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1.1e-05,0.352941,0.25098,2e-06,2e-06,0.0,0.0,0.0,0.0,0.0,...,0.008152,8.156627e-09,0.321429,0.026757,0.0,0.0,0.0,0.153941,1.0,2.0
50%,0.000488,0.352941,0.25098,7e-06,7e-06,0.0,0.0,0.0,0.0,0.0,...,0.016114,0.4996491,0.607143,0.050033,0.003556,3e-05,0.4,0.57697,2.5,3.0
75%,0.010811,0.417647,0.25098,7.1e-05,7.1e-05,0.0,0.0,0.11,0.0,0.27,...,0.032948,0.9991383,0.892857,0.091894,0.028886,0.001585,0.9,1.0,4.0,4.0
max,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,5.0


In [31]:
# Write processed_data to disk with a header row and without the row index.
processed_data.to_csv("multi.csv", index=False)