In [None]:
import pandas as pd
import os

# Directory the .parquet dataset files are read from
dataset_path = '../../Datasets/CIC-DDOS2019/Kaggle Preprocessed/'

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the combined dataset (and everything derived from it) stable between runs.
file_list = sorted(os.listdir(dataset_path))

# Read every parquet file found in the directory into its own DataFrame
dataframes = [
    pd.read_parquet(os.path.join(dataset_path, file_name))
    for file_name in file_list
    if file_name.endswith('.parquet')
]

# Fail here with a clear message: an empty list would otherwise only surface
# later, when the frames are combined.
if not dataframes:
    raise FileNotFoundError(f"No .parquet files found in {dataset_path!r}")

In [None]:
# Merge the per-file frames into a single frame, then normalise label spellings
df = pd.concat(dataframes)
del dataframes  # drop the reference to the per-file list once merged

# (substring to find, replacement) pairs, applied to 'Label' in this order.
# NOTE(review): 'DrDoS-SSDP' is the only key written with a hyphen instead of an
# underscore — confirm it matches the spelling used in the dataset.
label_fixes = [
    ('DrDoS_DNS', 'DNS'),
    ('DrDoS_LDAP', 'LDAP'),
    ('DrDoS_MSSQL', 'MSSQL'),
    ('DrDoS_NTP', 'NTP'),
    ('DrDoS_NetBIOS', 'NetBIOS'),
    ('DrDoS_SNMP', 'SNMP'),
    ('DrDoS-SSDP', 'SSDP'),
    ('DrDoS_UDP', 'UDP'),
    ('UDPLag', 'UDP-lag'),
]
for old_name, new_name in label_fixes:
    df['Label'] = df['Label'].str.replace(old_name, new_name)

In [None]:
# Remove classes that have few samples in the graphs approach for conformity
excluded_labels = [
    'DNS', 'LDAP', 'MSSQL', 'NTP', 'NetBIOS',
    'SNMP', 'SSDP', 'Portmap', 'WebDDoS',
]
# A single membership test keeps exactly the rows that nine chained `!=`
# filters would keep, without copying the whole frame once per label.
df = df[~df['Label'].isin(excluded_labels)]

In [None]:
# Binarise the target: every label other than 'Benign' becomes 'Malicious'
is_attack = df['Label'] != 'Benign'
df.loc[is_attack, 'Label'] = 'Malicious' # type: ignore

In [None]:
# Features are every column except the last one; the target is the 'Label'
# column converted to a categorical dtype.
# NOTE(review): this assumes 'Label' is the final column of the frame — confirm.
feature_columns = df.columns[:-1]
data = df[feature_columns] # type: ignore
labels = df['Label']
categories = labels.astype('category')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature into the [0, 1] range; the scaled values replace `data`.
# NOTE(review): the scaler is fitted on all rows, before any train/test split.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data)
data = scaler.transform(data)

In [None]:
# Stratified 10-fold cross validation: for each fold, scale and oversample the
# training part only, then train and evaluate the models on the held-out part.
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from run_models.run_models import run_models
from imblearn.over_sampling import SMOTE

n_splits = 10
seed = 101  # shared by the fold shuffling and SMOTE so a re-run gives the same folds
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
results = []
for current_split, (train_idx, test_idx) in enumerate(kf.split(data, categories), start=1):
    print(f"Fold {current_split}/{n_splits}")
    data_train, data_test = data[train_idx], data[test_idx]
    categories_train, categories_test = categories.iloc[train_idx], categories.iloc[test_idx] # type: ignore
    # Fit the scaler on the training fold only, so the held-out fold is scaled
    # with training statistics and its min/max never influence training.
    # Min-max scaling cancels any earlier per-feature rescale, so this holds
    # even when `data` has already been min-max scaled as a whole. Held-out
    # values may therefore fall slightly outside [0, 1].
    fold_scaler = MinMaxScaler(feature_range=(0, 1))
    data_train = fold_scaler.fit_transform(data_train)
    data_test = fold_scaler.transform(data_test)
    # Oversample infrequent data using SMOTE (training fold only; the test fold
    # keeps its original class distribution)
    smote = SMOTE(random_state=seed)
    data_train, categories_train = smote.fit_resample(data_train, categories_train) # type: ignore
    results.append(run_models(data_train, categories_train, data_test, categories_test))

In [None]:
from classifiers.get_mean_performance import get_mean_performance

# Summarise the collected fold results (presumably their mean, going by the
# helper's name — confirm) and render the outcome in the notebook.
mean_performance = get_mean_performance(results, n_splits)
display(mean_performance) # type: ignore