<a href="https://colab.research.google.com/github/Jumabek/net_intrusion_detection/blob/transformer/Ablation_Experiment_Transformer_Settings_NF_ToN_IoT_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview
## Ablation Study of Transformer Architecture Settings

An Ablation Study of Transformer Architecture Settings is a systematic approach to evaluating the impact of various components and configurations within a transformer model. By incrementally adding or removing specific settings or layers, and measuring the effect on performance, this study aims to identify the most influential factors in the model's accuracy and efficiency. The end goal is to fine-tune the transformer architecture to achieve the optimal balance of speed and predictive power, culminating in a well-tuned model with validated settings.

# Data Preparation for Model Training


In [None]:
import numpy as np
import pandas as pd
import io
import os
import sys

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset,Subset

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, brier_score_loss, roc_auc_score, average_precision_score

## Set a fixed random seed


In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook can touch.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), then switches cuDNN to deterministic mode.

    Parameters:
    - seed: Integer seed applied to all generators.
    """
    # Local import: `random` is not among the notebook's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ensuring that PyTorch's convolution operations are deterministic
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(42)

## Loading and Preprocessing of Dataset:

In [None]:
# Downloading the Dataset
!gdown --id 1JNsyqlCwT8IVudqj3yZ1y6wgbG7me5Pq
# -n: never overwrite files that already exist, so re-running this cell does not
# stop at unzip's interactive "replace?" prompt.
!unzip -n /content/NF-ToN-IoT-v2.zip

Downloading...
From (original): https://drive.google.com/uc?id=1JNsyqlCwT8IVudqj3yZ1y6wgbG7me5Pq
From (redirected): https://drive.google.com/uc?id=1JNsyqlCwT8IVudqj3yZ1y6wgbG7me5Pq&confirm=t&uuid=03a07d63-25bf-4432-82d1-11c7b8631e2b
To: /content/NF-ToN-IoT-v2.zip
100% 185M/185M [00:01<00:00, 99.4MB/s]
Archive:  /content/NF-ToN-IoT-v2.zip
  inflating: NF-ToN-IoT-v2/NetFlow_v2_Features.csv  
  inflating: NF-ToN-IoT-v2/NF-ToN-IoT-v2.csv  


In [None]:
# Loading the NF-ToN-IoT-v2 dataset from a CSV file into a DataFrame. This process takes approximately 46 seconds.
# The full frame is kept under its own name (df_whole); the next cell derives the working frame `df` from it.
df_whole = pd.read_csv("/content/NF-ToN-IoT-v2/NF-ToN-IoT-v2.csv")

In [None]:
# Dropping specific columns from the DataFrame that are not required for the analysis.
# These include 'Attack', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', and certain byte-related columns.
df = df_whole.drop(columns=["Attack","IPV4_SRC_ADDR", "IPV4_DST_ADDR","SRC_TO_DST_SECOND_BYTES","DST_TO_SRC_SECOND_BYTES"])

# Converting all columns in the DataFrame to float type for consistency and to facilitate numerical operations.
df = df.astype(float)

# Reducing the dataset size by randomly sampling 1% of the rows (frac=0.01) to make the dataset more manageable
# and speed up computations.
# The random state is set to 42 for reproducibility.
df = df.sample(frac=0.01, random_state=42)

# Extract feature matrix (X) and target vector (y) from the dataframe:
# every column except the last is a feature, the last column is the target.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Splitting the data: 80% train / 20% test, stratified on y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

def normalize(data,  mean_i, min_i , max_i ):
    """
    Centre the data on the given mean and scale it by the given value range.

    Computes ``(data - mean_i) / (max_i - min_i + 1e-15)``. The small constant
    keeps the division finite when ``max_i == min_i``.

    Parameters:
    - data: The data to be normalized
    - mean_i: The mean subtracted from the data
    - min_i: The minimum value used for the range
    - max_i: The maximum value used for the range

    Returns:
    - The centred and range-scaled data
    """
    value_range = (max_i - min_i) + 1e-15
    centred = data - mean_i
    return centred / value_range

# Normalisation statistics are computed on the training split only and then
# applied to both splits, so the test split does not influence preprocessing.
mean_i = np.mean(X_train, axis=0)
min_i = np.min(X_train, axis=0)
max_i = np.max(X_train, axis=0)

X_train = normalize(X_train,  mean_i, min_i , max_i )
X_test = normalize(X_test,  mean_i, min_i , max_i )

# Data Conversion to PyTorch Tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Create datasets from the tensors built above (no second copy of the data).
# The DataLoaders are created per experiment, because batch_size is one of the ablated settings.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

## Defining necessary functions

In [None]:
def evaluate_model(test_loader, model, device, name):
    """Score ``model`` on ``test_loader`` and return the metrics keyed by ``name``.

    The model is moved to ``device`` and put in eval mode. Each batch is run
    without gradient tracking. The arg-max over the model outputs gives the
    predicted label, and a softmax over the outputs gives class probabilities.
    Column 1 of the probabilities is passed to the Brier, PR-AUC and ROC-AUC
    metrics (assumes a binary task where class 1 is the positive class).

    Parameters:
    - test_loader: Iterable yielding ``(data, labels)`` batches
    - model: The network to evaluate
    - device: Device the model and the batches are moved to
    - name: Key under which the metrics are returned

    Returns:
    - ``{name: {'accuracy', 'brier_score', 'roc_auc', 'pr_auc', 'model_size_kb'}}``
    """
    model.to(device)
    model.eval()
    y_true, y_pred, y_prob = [], [], []

    with torch.no_grad():  # inference only, no autograd bookkeeping
        for batch_x, batch_y in test_loader:
            # Move the batch to the same device as the model
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)
            hard_pred = torch.max(logits, 1)[1]  # index of the largest logit per row
            y_true.extend(batch_y.cpu().numpy())
            y_pred.extend(hard_pred.cpu().numpy())
            y_prob.extend(torch.softmax(logits, dim=1).cpu().numpy())

    y_prob = np.array(y_prob)
    accuracy = accuracy_score(y_true, y_pred)

    positive_prob = y_prob[:, 1]
    brier_score = brier_score_loss(y_true, positive_prob)
    pr_auc = average_precision_score(y_true, positive_prob)
    roc_auc = roc_auc_score(y_true, positive_prob)

    # Model size = number of bytes the serialised state_dict occupies in memory
    serialized = io.BytesIO()
    torch.save(model.state_dict(), serialized)
    model_size_kb = serialized.tell() / 1024  # bytes -> kilobytes

    scores = {
        'accuracy': accuracy,
        'brier_score': brier_score,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'model_size_kb': model_size_kb
    }
    return {name: scores}


In [None]:
def train_model(model, train_loader, criterion, optimizer, num_epochs, device):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Parameters:
    - model: The network to train; it is moved to ``device``
    - train_loader: Iterable yielding ``(data, labels)`` batches
    - criterion: Loss function called as ``criterion(outputs, labels)``
    - optimizer: Optimizer already bound to ``model``'s parameters
    - num_epochs: Number of passes over ``train_loader``
    - device: Device used for the model and every batch (e.g. 'cuda' or 'cpu')

    Returns:
    - List with the average training loss of each epoch (``nan`` for an epoch
      in which the loader yielded no batches).
    """
    model = model.to(device)
    epoch_losses = []

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0

        for data, labels in train_loader:
            # Use the requested device instead of a hard-coded .cuda(), so the
            # batches always live where the model lives (CPU runs included).
            data, labels = data.to(device), labels.to(device)
            outputs = model(data)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # Calculate and log average metrics for the epoch
        average_loss = running_loss / num_batches if num_batches else float('nan')
        epoch_losses.append(average_loss)

        print(f"Epoch [{epoch + 1}/{num_epochs}], Average Loss: {average_loss:.4f}")

    return epoch_losses


In [None]:
# Accumulates the result of every experiment: each experiment cell appends the
# {name: metrics} dict returned by evaluate_model. Re-running this cell clears the history.
all_metrics = []

# Experiment

## Experiment 1 on default transformer (PyTorch)

In [None]:
class TransformerModel(nn.Module):
    """Encoder-decoder transformer classifier for flat feature vectors.

    Each input row is projected to ``d_model`` dimensions, passed through an
    ``nn.Transformer`` as a sequence of length 1 (the same tensor is used as
    source and target), and mapped to ``num_classes`` logits.

    Parameters:
    - input_dim: Number of input features per sample
    - num_classes: Number of output logits
    - d_model: Embedding width used inside the transformer
    - nhead: Number of attention heads
    - num_layers: Number of encoder layers and of decoder layers
    - dim_feedforward: Hidden width of the transformer feed-forward blocks
    - dropout: Dropout probability inside the transformer
    """

    def __init__(self, input_dim, num_classes, d_model, nhead, num_layers, dim_feedforward, dropout):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for x of shape (batch, input_dim)."""
        x = self.embedding(x)
        # nn.Transformer defaults to batch_first=False, i.e. it expects
        # (sequence, batch, d_model). Insert the length-1 sequence axis at
        # position 0 so the mini-batch stays on the batch axis. With
        # unsqueeze(1) the mini-batch becomes the sequence and samples attend
        # to each other, making predictions depend on batch composition.
        x = x.unsqueeze(0)
        x = self.transformer(x, x)  # same tensor as encoder input and decoder target
        x = x.squeeze(0)
        x = self.fc(x)
        return x

In [None]:
# Set hyperparameters (baseline: the sizes nn.Transformer uses by default)
hyperparams = {
    'batch_size': 512,
    'num_epochs': 5,
    'learning_rate': 0.001,
    'd_model': 512,
    'nhead': 8,
    'num_layers': 6,
    'dim_feedforward': 2048,
    'dropout': 0.1,
    # Fall back to CPU when no GPU is available instead of failing on a hard-coded 'cuda'.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

train_loader = DataLoader(train_dataset, hyperparams['batch_size'])
test_loader = DataLoader(test_dataset, hyperparams['batch_size'] )

# Initialize model
model = TransformerModel(
    input_dim = X_train.shape[1],
    num_classes = len(np.unique(y_train)),  # one logit per distinct training label
    d_model=hyperparams['d_model'],
    nhead=hyperparams['nhead'],
    num_layers=hyperparams['num_layers'],
    dim_feedforward=hyperparams['dim_feedforward'],
    dropout=hyperparams['dropout']
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=hyperparams['learning_rate'])

# Train and evaluate
train_model(model, train_loader, criterion, optimizer, hyperparams['num_epochs'], hyperparams["device"] )
metrics = evaluate_model(test_loader, model, hyperparams["device"], name="Default")
all_metrics.append(metrics)
print(all_metrics)



Epoch [1/5], Average Loss: 1.0334
Epoch [2/5], Average Loss: 0.6610
Epoch [3/5], Average Loss: 0.6540
Epoch [4/5], Average Loss: 0.6534
Epoch [5/5], Average Loss: 0.6544
[{'Default': {'accuracy': 0.6440377804014168, 'brier_score': 0.23105694884769903, 'roc_auc': 0.5057505399978416, 'pr_auc': 0.6467899725862387, 'model_size_kb': 172578.056640625}}]


## After changing num_layers = 2


In [None]:
# Set hyperparameters (ablation step: num_layers 6 -> 2, everything else as in the baseline)
hyperparams = {
    'batch_size': 512,
    'num_epochs': 5,
    'learning_rate': 0.001,
    'd_model': 512,
    'nhead': 8,
    'num_layers': 2,
    'dim_feedforward': 2048,
    'dropout': 0.1,
    # Fall back to CPU when no GPU is available instead of failing on a hard-coded 'cuda'.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

train_loader = DataLoader(train_dataset, hyperparams['batch_size'])
test_loader = DataLoader(test_dataset, hyperparams['batch_size'] )

# Initialize model
model1 = TransformerModel(
    input_dim = X_train.shape[1],
    num_classes = len(np.unique(y_train)),  # one logit per distinct training label
    d_model=hyperparams['d_model'],
    nhead=hyperparams['nhead'],
    num_layers=hyperparams['num_layers'],
    dim_feedforward=hyperparams['dim_feedforward'],
    dropout=hyperparams['dropout']
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model1.parameters(), lr=hyperparams['learning_rate'])

# Train and evaluate
train_model(model1, train_loader, criterion, optimizer, hyperparams['num_epochs'], hyperparams["device"] )
metrics = evaluate_model(test_loader, model1, hyperparams["device"], name="num_layers = 2")
all_metrics.append(metrics)




Epoch [1/5], Average Loss: 1.0501
Epoch [2/5], Average Loss: 0.6573
Epoch [3/5], Average Loss: 0.6546
Epoch [4/5], Average Loss: 0.6547
Epoch [5/5], Average Loss: 0.6546


## After changing d_model = 16

In [None]:
# Set hyperparameters (ablation step: d_model 512 -> 16, other settings carried over)
hyperparams = {
    'batch_size': 512,
    'num_epochs': 5,
    'learning_rate': 0.001,
    'd_model': 16,
    'nhead': 8,
    'num_layers': 2,
    'dim_feedforward': 2048,
    'dropout': 0.1,
    # Fall back to CPU when no GPU is available instead of failing on a hard-coded 'cuda'.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

train_loader = DataLoader(train_dataset, hyperparams['batch_size'])
test_loader = DataLoader(test_dataset, hyperparams['batch_size'] )

# Initialize model
model2 = TransformerModel(
    input_dim = X_train.shape[1],
    num_classes = len(np.unique(y_train)),  # one logit per distinct training label
    d_model=hyperparams['d_model'],
    nhead=hyperparams['nhead'],
    num_layers=hyperparams['num_layers'],
    dim_feedforward=hyperparams['dim_feedforward'],
    dropout=hyperparams['dropout']
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters(), lr=hyperparams['learning_rate'])

# Train and evaluate
train_model(model2, train_loader, criterion, optimizer, hyperparams['num_epochs'], hyperparams["device"] )
metrics = evaluate_model(test_loader, model2, hyperparams["device"], name="d_model=16")
all_metrics.append(metrics)
# Show this run next to the previous one for a side-by-side comparison
print(all_metrics[-1])
print(all_metrics[-2])



Epoch [1/5], Average Loss: 0.5771
Epoch [2/5], Average Loss: 0.4561
Epoch [3/5], Average Loss: 0.3237
Epoch [4/5], Average Loss: 0.2718
Epoch [5/5], Average Loss: 0.2146
{'d_model=16': {'accuracy': 0.9217827626918536, 'brier_score': 0.060095591776708895, 'roc_auc': 0.9681971292331498, 'pr_auc': 0.9762898567029166, 'model_size_kb': 1110.697265625}}
{'num_layers = 2': {'accuracy': 0.6440377804014168, 'brier_score': 0.22950178484192701, 'roc_auc': 0.7099174156714138, 'pr_auc': 0.8262721489127085, 'model_size_kb': 57589.009765625}}


## After changing dim_feedforward = 256

In [None]:
# Set hyperparameters (ablation step: dim_feedforward 2048 -> 256, other settings carried over)
hyperparams = {
    'batch_size': 512,
    'num_epochs': 5,
    'learning_rate': 0.001,
    'd_model': 16,
    'nhead': 8,
    'num_layers': 2,
    'dim_feedforward': 256,
    'dropout': 0.1,
    # Fall back to CPU when no GPU is available instead of failing on a hard-coded 'cuda'.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

train_loader = DataLoader(train_dataset, hyperparams['batch_size'])
test_loader = DataLoader(test_dataset, hyperparams['batch_size'] )

# Initialize model
model3 = TransformerModel(
    input_dim = X_train.shape[1],
    num_classes = len(np.unique(y_train)),  # one logit per distinct training label
    d_model=hyperparams['d_model'],
    nhead=hyperparams['nhead'],
    num_layers=hyperparams['num_layers'],
    dim_feedforward=hyperparams['dim_feedforward'],
    dropout=hyperparams['dropout']
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model3.parameters(), lr=hyperparams['learning_rate'])

# Train and evaluate
train_model(model3, train_loader, criterion, optimizer, hyperparams['num_epochs'], hyperparams["device"] )
metrics = evaluate_model(test_loader, model3, hyperparams["device"], name="dim_feedforward = 256")
all_metrics.append(metrics)
# Show this run next to the previous one for a side-by-side comparison
print(all_metrics[-1])
print(all_metrics[-2])



Epoch [1/5], Average Loss: 0.5533
Epoch [2/5], Average Loss: 0.4256
Epoch [3/5], Average Loss: 0.3669
Epoch [4/5], Average Loss: 0.3099
Epoch [5/5], Average Loss: 0.2505
{'dim_feedforward = 256': {'accuracy': 0.9297520661157025, 'brier_score': 0.05585872371649252, 'roc_auc': 0.9663859893930896, 'pr_auc': 0.9734770802281659, 'model_size_kb': 186.697265625}}
{'d_model=16': {'accuracy': 0.9217827626918536, 'brier_score': 0.060095591776708895, 'roc_auc': 0.9681971292331498, 'pr_auc': 0.9762898567029166, 'model_size_kb': 1110.697265625}}


## After changing batch_size = 128

In [None]:
# Set hyperparameters (ablation step: batch_size 512 -> 128, other settings carried over)
hyperparams = {
    'batch_size': 128,
    'num_epochs': 5,
    'learning_rate': 0.001,
    'd_model': 16,
    'nhead': 8,
    'num_layers': 2,
    'dim_feedforward': 256,
    'dropout': 0.1,
    # Fall back to CPU when no GPU is available instead of failing on a hard-coded 'cuda'.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

train_loader = DataLoader(train_dataset, hyperparams['batch_size'])
test_loader = DataLoader(test_dataset, hyperparams['batch_size'] )

# Initialize model
model4 = TransformerModel(
    input_dim = X_train.shape[1],
    num_classes = len(np.unique(y_train)),  # one logit per distinct training label
    d_model=hyperparams['d_model'],
    nhead=hyperparams['nhead'],
    num_layers=hyperparams['num_layers'],
    dim_feedforward=hyperparams['dim_feedforward'],
    dropout=hyperparams['dropout']
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model4.parameters(), lr=hyperparams['learning_rate'])

# Train and evaluate
train_model(model4, train_loader, criterion, optimizer, hyperparams['num_epochs'], hyperparams["device"] )
metrics = evaluate_model(test_loader, model4, hyperparams["device"], name="batch_size=128")
all_metrics.append(metrics)
# Show this run next to the previous one for a side-by-side comparison
print(all_metrics[-1])
print(all_metrics[-2])



Epoch [1/5], Average Loss: 0.4512
Epoch [2/5], Average Loss: 0.3012
Epoch [3/5], Average Loss: 0.2319
Epoch [4/5], Average Loss: 0.1706
Epoch [5/5], Average Loss: 0.1256
{'batch_size=128': {'accuracy': 0.9527744982290437, 'brier_score': 0.03950341036614989, 'roc_auc': 0.9837860042895819, 'pr_auc': 0.987048561740365, 'model_size_kb': 186.697265625}}
{'dim_feedforward = 256': {'accuracy': 0.9297520661157025, 'brier_score': 0.05585872371649252, 'roc_auc': 0.9663859893930896, 'pr_auc': 0.9734770802281659, 'model_size_kb': 186.697265625}}


## After changing nhead = 2


In [None]:
# Hyperparameters for this ablation step: nhead 8 -> 2, other settings carried over.
hyperparams = dict(
    batch_size=128,
    num_epochs=5,
    learning_rate=0.001,
    d_model=16,
    nhead=2,
    num_layers=2,
    dim_feedforward=256,
    dropout=0.1,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

# Loaders are rebuilt here so they use this step's batch size.
train_loader = DataLoader(train_dataset, batch_size=hyperparams['batch_size'])
test_loader = DataLoader(test_dataset, batch_size=hyperparams['batch_size'])

# Build the model; the architecture settings are taken straight from hyperparams.
model5 = TransformerModel(
    input_dim=X_train.shape[1],
    num_classes=np.unique(y_train).size,
    **{key: hyperparams[key] for key in ('d_model', 'nhead', 'num_layers', 'dim_feedforward', 'dropout')}
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model5.parameters(), lr=hyperparams['learning_rate'])

# Fit, score on the test split, and record the result.
train_model(model5, train_loader, criterion, optimizer, hyperparams['num_epochs'], hyperparams['device'])
metrics = evaluate_model(test_loader, model5, hyperparams['device'], name="nhead=2")
all_metrics.append(metrics)

# Print this run, then the previous one, for a side-by-side comparison.
print(all_metrics[-1], all_metrics[-2], sep="\n")



Epoch [1/5], Average Loss: 0.4577
Epoch [2/5], Average Loss: 0.2941
Epoch [3/5], Average Loss: 0.2014
Epoch [4/5], Average Loss: 0.1518
Epoch [5/5], Average Loss: 0.1243
{'nhead=2': {'accuracy': 0.9504132231404959, 'brier_score': 0.04024276975882614, 'roc_auc': 0.9826326661832907, 'pr_auc': 0.9880936945251475, 'model_size_kb': 186.697265625}}
{'batch_size=128': {'accuracy': 0.9527744982290437, 'brier_score': 0.03950341036614989, 'roc_auc': 0.9837860042895819, 'pr_auc': 0.987048561740365, 'model_size_kb': 186.697265625}}


## Encoder Only

In [None]:
class TransformerModelEncoder(nn.Module):
    """Encoder-only transformer classifier for flat feature vectors.

    Each input row is projected to ``d_model`` dimensions, passed through an
    ``nn.TransformerEncoder`` as a sequence of length 1, and mapped to
    ``num_classes`` logits.

    Parameters:
    - input_dim: Number of input features per sample
    - num_classes: Number of output logits
    - d_model: Embedding width used inside the encoder
    - nhead: Number of attention heads
    - num_layers: Number of stacked encoder layers
    - dim_feedforward: Hidden width of the encoder feed-forward blocks
    - dropout: Dropout probability inside the encoder layers
    """

    def __init__(self, input_dim, num_classes, d_model, nhead, num_layers, dim_feedforward, dropout):
        super(TransformerModelEncoder, self).__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout
            ),
            num_layers=num_layers
        )
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for x of shape (batch, input_dim)."""
        x = self.embedding(x)
        # The encoder layer defaults to batch_first=False, i.e. it expects
        # (sequence, batch, d_model). Insert the length-1 sequence axis at
        # position 0 so the mini-batch stays on the batch axis. With
        # unsqueeze(1) the mini-batch becomes the sequence and samples attend
        # to each other, making predictions depend on batch composition.
        x = x.unsqueeze(0)
        x = self.transformer_encoder(x)  # Pass through the encoder
        x = x.squeeze(0)
        x = self.fc(x)
        return x

In [None]:
# Set hyperparameters (same settings as the nhead=2 run; only the architecture changes to encoder-only)
hyperparams = {
    'batch_size': 128,
    'num_epochs': 5,
    'learning_rate': 0.001,
    'd_model': 16,
    'nhead': 2,
    'num_layers': 2,
    'dim_feedforward': 256,
    'dropout': 0.1,
    # Fall back to CPU when no GPU is available instead of failing on a hard-coded 'cuda'.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

train_loader = DataLoader(train_dataset, hyperparams['batch_size'])
test_loader = DataLoader(test_dataset, hyperparams['batch_size'] )

# Initialize model
model6 = TransformerModelEncoder(
    input_dim = X_train.shape[1],
    num_classes = len(np.unique(y_train)),  # one logit per distinct training label
    d_model=hyperparams['d_model'],
    nhead=hyperparams['nhead'],
    num_layers=hyperparams['num_layers'],
    dim_feedforward=hyperparams['dim_feedforward'],
    dropout=hyperparams['dropout']
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model6.parameters(), lr=hyperparams['learning_rate'])

# Train and evaluate
train_model(model6, train_loader, criterion, optimizer, hyperparams['num_epochs'], hyperparams["device"] )
metrics = evaluate_model(test_loader, model6, hyperparams["device"], name="Encoder_only")
all_metrics.append(metrics)
# Show this run next to the previous one for a side-by-side comparison
print(all_metrics[-1])
print(all_metrics[-2])



Epoch [1/5], Average Loss: 0.4706
Epoch [2/5], Average Loss: 0.2588
Epoch [3/5], Average Loss: 0.1641
Epoch [4/5], Average Loss: 0.1374
Epoch [5/5], Average Loss: 0.1188
{'Encoder_only': {'accuracy': 0.961038961038961, 'brier_score': 0.03294386407522342, 'roc_auc': 0.9834118819285789, 'pr_auc': 0.9861840391596766, 'model_size_kb': 87.751953125}}
{'nhead=2': {'accuracy': 0.9504132231404959, 'brier_score': 0.04024276975882614, 'roc_auc': 0.9826326661832907, 'pr_auc': 0.9880936945251475, 'model_size_kb': 186.697265625}}


# Evaluation and plots