# Introduction
The aim is to implement an ML solution that performs better than the provided example Jupyter notebook, which uses basic logistic regression from scikit-learn. In that baseline, the data was scaled and then used to fit the model. The following baseline results were obtained:
# LogisticRegression
## LogisticRegression (34 classes)
accuracy_score:  0.802\
recall_score:  0.595\
precision_score:  0.487\
f1_score:  0.494

## LogisticRegression (8 classes)
accuracy_score =  0.832\
recall_score =  0.696\
precision_score =  0.512\
f1_score =  0.539

## LogisticRegression (2 classes)
accuracy_score:  0.989\
recall_score:  0.890\
precision_score:  0.864\
f1_score:  0.877

# SGD
## SGD (34 classes)
accuracy_score:  0.7860381477619159\
recall_score:  0.5179118961920882\
precision_score:  0.42184457529453795\
f1_score:  0.4256465870416849

## SGD (8 classes)
accuracy_score =  0.8226369976250854\
recall_score =  0.68058577451911\
precision_score =  0.4617525150801485\
f1_score =  0.49719100438594754

## SGD (2 classes)
accuracy_score:  0.9868773803425305\
recall_score:  0.8686111308207715\
precision_score:  0.8334084807767805\
f1_score:  0.8500759333973384


It is worth noting that accuracy is not a good metric in this case, because the dataset contains far more malicious packet data than benign data. It is therefore better to look at metrics such as recall, precision and F1. Duplicating (oversampling) the benign entries is another option, but it might not be appropriate, because real DDoS attacks do produce a very large number of packets compared to normal benign traffic.

# Coding
## Importing packages and dataset

In [40]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
DATASET_DIR = r'../CICIoT2023'

In [41]:
# Collect the dataset's CSV files in a deterministic (sorted) order so that a
# given seed always selects the same files.
df_sets = sorted(k for k in os.listdir(DATASET_DIR) if k.endswith('.csv'))
# Data is too large to parse as a single DataFrame, so files are addressed by index.
index_range = range(len(df_sets))
import random
SEED = 42  # Set to None for randomness
# Compare against None (not truthiness) so that SEED = 0 is honoured as a seed.
if SEED is not None:
    random.seed(SEED)
    print(f"INFO: Using seed {SEED}")
else:
    print("Using random seed")
CLASSES = 7 ## Valid values are 34, 7 or 2
if CLASSES not in (34, 7, 2):
    # Raise instead of calling exit(): exit() inside a notebook tears down the
    # kernel rather than just reporting the configuration error.
    raise ValueError("Please set a valid number of classes (34, 7, 2)")
print(f"INFO: Using {CLASSES} classes")
# Manual train/test splitting: 20% of the files (rounded down) are held out for
# testing, every remaining file is used for training.
test_indexes_count = int(len(df_sets)*0.2)
# Derived from the test count so it always equals len(train_indexes);
# int(len * 0.8) can be off by one because both counts were rounded down.
train_indexes_count = len(df_sets) - test_indexes_count
test_indexes = random.sample(index_range, test_indexes_count)
# Set lookup keeps the membership test O(1) per file.
_test_index_set = set(test_indexes)
train_indexes = [i for i in index_range if i not in _test_index_set]
test_data, train_data = [], []
print(test_indexes)
# Feature columns selected from every CSV file; these are the columns passed to
# the scaler and used as the model input.
# NOTE(review): 'Magnitue' presumably mirrors the spelling of the CSV header —
# confirm against the dataset before "correcting" it.
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
    'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
    'ece_flag_number', 'cwr_flag_number', 'ack_count',
    'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
    'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
    'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
    'Radius', 'Covariance', 'Variance', 'Weight', 
]
# Name of the column holding the class label of each row.
y_column = 'label'

INFO: Using seed 42
INFO: Using 7 classes
[163, 28, 6, 70, 62, 57, 35, 26, 139, 22, 151, 108, 8, 7, 23, 55, 59, 129, 166, 143, 50, 160, 107, 56, 114, 71, 1, 40, 157, 87, 149, 39, 153]


## Scaling the data

In [42]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import joblib
scaler_filename = f"precomputed/scaler-{SEED}.save"

# The cache file is keyed by SEED only. Without a seed the train/test split
# changes on every run, so a cached scaler would belong to a different split
# (and could have seen files that are now in the test set) — refit instead.
# NOTE: cache files written before the partial_fit fix below were fit on a
# single file only; delete them so the scaler is recomputed.
use_scaler_cache = SEED is not None

if use_scaler_cache and os.path.exists(scaler_filename):
    scaler = joblib.load(scaler_filename)
else:
    scaler = StandardScaler()
    for train_set in tqdm(train_indexes):
        # partial_fit accumulates the running mean/variance across files;
        # fit() would reset the statistics on every call and leave the scaler
        # describing only the last training file.
        scaler.partial_fit(pd.read_csv(DATASET_DIR + '/' + df_sets[train_set])[X_columns])

    if use_scaler_cache:
        # Make sure the cache directory exists before writing into it.
        os.makedirs(os.path.dirname(scaler_filename), exist_ok=True)
        joblib.dump(scaler, scaler_filename)

## Initialise pytorch and check if GPU is available

In [43]:
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
# Seed PyTorch's random number generator for reproducible weight initialisation
# and dropout. Compare against None so that SEED = 0 is still applied.
if SEED is not None:
    torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Detected calculation device: {DEVICE}")

Detected calculation device: cuda


## Define the custom neural network

In [44]:
class CustomClassifier(nn.Module):
    """Small fully connected classifier that returns raw class logits.

    Each hidden block is Linear -> LeakyReLU -> BatchNorm1d -> Dropout(0.5).
    ``forward`` applies ``layer1`` and ``layer2`` and then ``output_layer``.
    ``layer3`` is constructed but is not part of the forward pass.

    Args:
        input_dim: Number of input features per sample.
        hidden_size: Width of the first hidden block; the following blocks
            use ``hidden_size // 2`` and ``hidden_size // 4``.
        output_dim: Number of output logits (one per class).
    """

    def __init__(self, input_dim, hidden_size, output_dim):
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(input_dim, hidden_size),
            nn.LeakyReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(0.5)
        )
        self.layer2 = nn.Sequential(
            nn.Linear(hidden_size, hidden_size//2),
            nn.LeakyReLU(),
            nn.BatchNorm1d(hidden_size//2),
            nn.Dropout(0.5)
        )
        # layer3 consumes the output of layer2, so its input width must be
        # hidden_size // 2 (it was hidden_size, which cannot be chained after
        # layer2). It is currently bypassed by forward(); wiring it in also
        # requires output_layer to take hidden_size // 4 inputs.
        self.layer3 = nn.Sequential(
            nn.Linear(hidden_size//2, hidden_size//4),
            nn.LeakyReLU(),
            nn.BatchNorm1d(hidden_size//4),
            nn.Dropout(0.5)
        )
        self.output_layer = nn.Linear(hidden_size//2, output_dim)

    def forward(self, x):
        """Return the logits produced by layer1 -> layer2 -> output_layer for ``x``."""
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.output_layer(x)
        return x

# Determine the model sizes
dim_check_dataset = pd.read_csv(DATASET_DIR + '/' + df_sets[0])
input_size = dim_check_dataset[X_columns].shape[1]  # ANN input size is count of variables
hidden_size = 64
output_size = len(dim_check_dataset[y_column].unique())  # Output size is number of possible labels
del dim_check_dataset

# Define the model, loss function, and optimizer
model = CustomClassifier(input_size, hidden_size, output_size).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)
label_encoder = LabelEncoder()
pass

## Main training loop
The data is re-read from disk on every iteration because the whole dataset is too large to fit in memory at once.

In [None]:
# Define class count overrides
# Coarse traffic category -> raw dataset labels that belong to it.
# The order of the groups and of the labels inside each group determines the
# insertion order of both lookup dictionaries below.
_label_groups = (
    ('DDoS', (
        'DDoS-RSTFINFlood', 'DDoS-PSHACK_Flood', 'DDoS-SYN_Flood',
        'DDoS-UDP_Flood', 'DDoS-TCP_Flood', 'DDoS-ICMP_Flood',
        'DDoS-SynonymousIP_Flood', 'DDoS-ACK_Fragmentation',
        'DDoS-UDP_Fragmentation', 'DDoS-ICMP_Fragmentation',
        'DDoS-SlowLoris', 'DDoS-HTTP_Flood',
    )),
    ('DoS', (
        'DoS-UDP_Flood', 'DoS-SYN_Flood', 'DoS-TCP_Flood', 'DoS-HTTP_Flood',
    )),
    ('Mirai', (
        'Mirai-greeth_flood', 'Mirai-greip_flood', 'Mirai-udpplain',
    )),
    ('Recon', (
        'Recon-PingSweep', 'Recon-OSScan', 'Recon-PortScan',
        'VulnerabilityScan', 'Recon-HostDiscovery',
    )),
    ('Spoofing', (
        'DNS_Spoofing', 'MITM-ArpSpoofing',
    )),
    ('Benign', (
        'BenignTraffic',
    )),
    ('Web', (
        'BrowserHijacking', 'Backdoor_Malware', 'XSS', 'Uploading_Attack',
        'SqlInjection', 'CommandInjection',
    )),
    ('BruteForce', (
        'DictionaryBruteForce',
    )),
)

# Raw label -> coarse category (used when CLASSES == 7).
dict_7classes = {
    raw_label: category
    for category, raw_labels in _label_groups
    for raw_label in raw_labels
}

# Raw label -> 'Benign' / 'Attack' (used when CLASSES == 2): only the benign
# traffic label stays benign, every other raw label is an attack.
dict_2classes = {
    raw_label: ('Benign' if raw_label == 'BenignTraffic' else 'Attack')
    for raw_label in dict_7classes
}

num_epochs = len(df_sets)

# Early stopping: stop once the loss has failed to improve on the best value
# seen so far by at least MIN_LOSS_DELTA for EARLY_STOP_PATIENCE consecutive
# steps. (The reference loss was previously never updated, so the stop
# condition could never become true.)
MIN_LOSS_DELTA = 0.01
EARLY_STOP_PATIENCE = 10
prev_loss = float('inf')  # best loss seen so far
steps_without_improvement = 0

# Progress is logged roughly ten times per run; max() avoids a modulo by zero
# when there are fewer than ten training files.
log_every = max(1, len(train_indexes) // 10)

# Select the label mapping once and fit the encoder on the full, fixed set of
# class names. Re-fitting on every file would assign different integer ids to
# the same class whenever a file lacks one of the classes.
if CLASSES == 7:
    label_map = dict_7classes
elif CLASSES == 2:
    label_map = dict_2classes
else:
    label_map = None  # use the raw dataset labels as they are
if label_map is None:
    known_classes = sorted(dict_7classes)  # the keys are the raw labels
else:
    known_classes = sorted(set(label_map.values()))
label_encoder.fit(known_classes)

model.train()  # make sure dropout / batch-norm are in training mode

# One optimisation step per training file; each file is used as a single batch.
for epoch, t_i in enumerate(tqdm(train_indexes)):
    iter_data = pd.read_csv(DATASET_DIR + '/' + df_sets[t_i])
    optimizer.zero_grad()  # Reset the gradients before iteration

    # Scale the features of this file with the pre-fitted scaler
    X = scaler.transform(iter_data[X_columns])
    if label_map is None:
        y = iter_data[y_column].values
    else:
        y = np.array([label_map[k] for k in iter_data[y_column]])
    X_tensor = torch.tensor(X, dtype=torch.float32).to(DEVICE)  # Convert numpy array to torch tensor

    # Encode labels (convert them from string); raises on an unknown label
    y_encoded = label_encoder.transform(y)
    y_tensor = torch.tensor(y_encoded, dtype=torch.long).to(DEVICE)
    outputs = model(X_tensor)
    loss = criterion(outputs, y_tensor)
    loss.backward()
    optimizer.step()

    loss_value = loss.item()
    if prev_loss - loss_value >= MIN_LOSS_DELTA:
        prev_loss = loss_value
        steps_without_improvement = 0
    else:
        steps_without_improvement += 1
        if steps_without_improvement >= EARLY_STOP_PATIENCE:
            tqdm.write(f'Ending training on epoch {epoch+1}, Loss: {loss_value:.4f}')
            break
    if (epoch+1) % log_every == 0:
        tqdm.write(f'Epoch {epoch+1}, Loss: {loss_value:.4f}')



  4%|▍         | 6/136 [01:14<32:33, 15.02s/it]

## Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
model.eval()  # Set the model to evaluation mode
val_losses = []
true_labels = []
pred_labels = []

# Select the label mapping once and fit a single encoder on the full, fixed set
# of class names. Fitting a new encoder per test file would give the same class
# different integer ids in different files, and label_encoder.classes_ would
# only describe the last file.
if CLASSES == 7:
    label_map = dict_7classes
elif CLASSES == 2:
    label_map = dict_2classes
else:
    label_map = None  # use the raw dataset labels as they are
if label_map is None:
    known_classes = sorted(dict_7classes)  # the keys are the raw labels
else:
    known_classes = sorted(set(label_map.values()))
label_encoder = LabelEncoder().fit(known_classes)
class_ids = list(range(len(label_encoder.classes_)))

with torch.no_grad():
    for test_set in tqdm(test_indexes):
        iter_data = pd.read_csv(DATASET_DIR + '/' + df_sets[test_set])
        X = scaler.transform(iter_data[X_columns])
        if label_map is None:
            y = iter_data[y_column].values
        else:
            y = np.array([label_map[k] for k in iter_data[y_column]])
        X_tensor = torch.tensor(X, dtype=torch.float32).to(DEVICE)  # Convert numpy array to tensor
        y_encoded = label_encoder.transform(y)
        y_tensor = torch.tensor(y_encoded, dtype=torch.long).to(DEVICE)

        outputs = model(X_tensor)
        loss = criterion(outputs, y_tensor)
        val_losses.append(loss.item())

        # Convert logits to class predictions
        predicted = outputs.argmax(dim=1)

        true_labels.extend(y_encoded.tolist())
        pred_labels.extend(predicted.tolist())

accuracy = accuracy_score(true_labels, pred_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='macro')
# Pin the label set so the matrix always has one row/column per known class, in
# the same order as label_encoder.classes_. Samples whose predicted id lies
# outside that set (possible only if the model has more outputs than classes)
# are left out of the matrix.
conf_matrix = confusion_matrix(true_labels, pred_labels, labels=class_ids)

# The reported loss is the unweighted mean of the per-file losses.
print(f'Validation Loss: {np.mean(val_losses):.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print('Confusion Matrix:')
print(conf_matrix)

from uuid import uuid4
import inspect
# Each run is written to its own file, named by a random UUID.
# Only the metrics and the model class source are written; the trained
# weights are not saved here.
current_uuid = str(uuid4())
print(f"Saving model with uuid {current_uuid}")
# TODO - improve the writeout (include seed if set, include parameters e.g. hidden_layer sizes etc etc)
os.makedirs('results', exist_ok=True)  # the output directory may not exist yet
with open(f'results/{current_uuid}', 'w', encoding='utf-8') as f:
    # Compare against None so that a seed of 0 is recorded as well.
    if SEED is not None:
        f.write(f'SEED was set: {SEED}\n')
    f.write(f'Input, Hidden and Output: {input_size}, {hidden_size}, {output_size}')
    f.write(f'\n\nValidation Loss: {np.mean(val_losses):.4f}\nAccuracy: {accuracy:.4f}\nPrecision: {precision:.4f}\nRecall: {recall:.4f}\nF1 Score: {f1:.4f}\n\n')
    # Hacky way of saving the model class for reuse: write the source of every
    # function defined directly on the model's class under a fixed class header.
    v = vars(model.__class__)
    methods = [name for name, attr in v.items() if inspect.isfunction(attr)]
    f.write('class CustomClassifier(nn.Module):\n')
    for m in methods:
        f.write(inspect.getsource(getattr(model, m)))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Class names in encoder order; used as tick labels and as CSV row/column names.
class_names = label_encoder.classes_

# Draw the confusion matrix as an annotated heatmap and save it as a PNG.
plt.figure(figsize=(12, 9))
heatmap_ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.savefig('confusion_matrix_seaborn.png', dpi=500, bbox_inches='tight')
plt.show()

# Write the same matrix to CSV, labelled with the class names on both axes.
cm_df = pd.DataFrame(conf_matrix, index=class_names, columns=class_names)
cm_df.to_csv('confusion_matrix.csv')