##Importing the necessary libraries

In [3]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, brier_score_loss, roc_auc_score, average_precision_score

In [4]:
pip install tab-transformer-pytorch

Collecting tab-transformer-pytorch
  Downloading tab_transformer_pytorch-0.3.0-py3-none-any.whl (6.9 kB)
Collecting einops>=0.3 (from tab-transformer-pytorch)
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: einops, tab-transformer-pytorch
Successfully installed einops-0.7.0 tab-transformer-pytorch-0.3.0


## Set a fixed random seed for reproducibility across all relevant libraries and operations.

In [5]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ensuring that PyTorch's convolution operations are deterministic
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(42)

## Loading and Preprocessing the NF Ton IoT v2 Dataset:

In [6]:
# Downloading the Dataset
!gdown --id 1JNsyqlCwT8IVudqj3yZ1y6wgbG7me5Pq
!unzip /content/NF-ToN-IoT-v2.zip

Downloading...
From (original): https://drive.google.com/uc?id=1JNsyqlCwT8IVudqj3yZ1y6wgbG7me5Pq
From (redirected): https://drive.google.com/uc?id=1JNsyqlCwT8IVudqj3yZ1y6wgbG7me5Pq&confirm=t&uuid=bc656982-f12d-4fe0-b3fc-bf74432b0734
To: /content/NF-ToN-IoT-v2.zip
100% 185M/185M [00:03<00:00, 53.3MB/s]
Archive:  /content/NF-ToN-IoT-v2.zip
  inflating: NF-ToN-IoT-v2/NetFlow_v2_Features.csv  
  inflating: NF-ToN-IoT-v2/NF-ToN-IoT-v2.csv  


In [7]:
# Loading the NF-ToN-IoT-v2 dataset from a CSV file into a DataFrame. This process takes approximately 46 seconds.
df_whole = pd.read_csv("/content/NF-ToN-IoT-v2/NF-ToN-IoT-v2.csv")

# Dropping specific columns from the DataFrame that are not required for the analysis.
# These include 'Attack', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', and certain byte-related columns.
df_whole = df_whole.drop(columns=["Attack","IPV4_SRC_ADDR", "IPV4_DST_ADDR","SRC_TO_DST_SECOND_BYTES","DST_TO_SRC_SECOND_BYTES"])

# Converting all columns in the DataFrame to float type for consistency and to facilitate numerical operations.
df = df_whole.astype(float)

# Reducing the dataset size by randomly sampling 0.1% of the data to make the dataset more manageable and speed up computations.
# The random state is set to 42 for reproducibility.
df = df.sample(frac=0.001, random_state=42)

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

# Assuming df is your loaded DataFrame
categorical_columns = ['PROTOCOL', 'ICMP_TYPE', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE', 'FTP_COMMAND_RET_CODE']
continuous_columns = [col for col in df.columns if col not in categorical_columns + ['Label']]

# Convert categorical columns to type 'category'
for col in categorical_columns:
    df[col] = df[col].astype('category')

# Extract features and target
X_categorical = df[categorical_columns].apply(lambda x: x.cat.codes).values
X_continuous = df[continuous_columns].values
y = df['Label'].values

# Normalization for continuous features only
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_continuous = scaler.fit_transform(X_continuous)

# Splitting the data
X_cat_train, X_cat_test, X_cont_train, X_cont_test, y_train, y_test = train_test_split(X_categorical, X_continuous, y, test_size=0.2, random_state=42, stratify=y)

# Convert to PyTorch tensors
X_cat_train_tensor = torch.tensor(X_cat_train, dtype=torch.long)
X_cont_train_tensor = torch.tensor(X_cont_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X_cat_test_tensor = torch.tensor(X_cat_test, dtype=torch.long)
X_cont_test_tensor = torch.tensor(X_cont_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)


In [9]:
def get_device():
    return 'cuda' if torch.cuda.is_available() else 'cpu'

device = get_device()

In [16]:
class TabTransformer(nn.Module):
    def __init__(self, num_continuous, num_classes, num_embeddings_list, embedding_dim=32, num_transformer_layers=6, transformer_heads=8, transformer_forward_dim=128, dropout_rate=0.1):
        super(TabTransformer, self).__init__()

        # Embeddings for categorical features
        self.embeddings = nn.ModuleList([nn.Embedding(n_e, embedding_dim) for n_e in num_embeddings_list])

        # Continuous feature linear layer
        self.continuous_linear = nn.Linear(num_continuous, embedding_dim * len(num_embeddings_list))

        # Transformer
        transformer_layer = nn.TransformerEncoderLayer(d_model=embedding_dim * len(num_embeddings_list), nhead=transformer_heads, dim_feedforward=transformer_forward_dim, dropout=dropout_rate)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=num_transformer_layers)

        # Final classifier
        self.fc = nn.Linear(embedding_dim * len(num_embeddings_list), num_classes)

    def forward(self, x_cat, x_cont):
        embeddings = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embeddings)]
        x = torch.cat(embeddings, dim=1) + self.continuous_linear(x_cont)

        x = self.transformer_encoder(x.unsqueeze(1)).squeeze(1)
        x = self.fc(x)

        return x

# Model instantiation
num_embeddings_list = [df[col].nunique() for col in categorical_columns]
embedding_dim = 32
num_continuous = len(continuous_columns)
num_classes = len(df['Label'].unique())

# model = TabTransformer(num_continuous, num_classes, num_embeddings_list, embedding_dim)
# model = model.to(device)

# Example setup for DataLoader
train_dataset = TensorDataset(X_cat_train_tensor, X_cont_train_tensor, y_train_tensor)  # Ensure tensors are correctly prepared
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_dataset = TensorDataset(X_cat_test_tensor, X_cont_test_tensor, y_test_tensor)  # Ensure tensors are correctly prepared
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=True)


In [17]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch
from tab_transformer_pytorch import TabTransformer

# Assuming the device setup and tensors have been defined as per the previous steps
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model Configuration and Instantiation
# Note: Adjust categories based on your actual dataset
categories = tuple([int(df[col].nunique()) for col in categorical_columns])
num_continuous = X_cont_train_tensor.shape[1]  # Number of continuous features in your data

model = TabTransformer(
    categories=categories,               # Tuple with the number of unique values per categorical feature
    num_continuous=num_continuous,       # Number of continuous values
    dim=32,                               # Dimension, paper set at 32
    dim_out=1,                            # Output dimension, 1 for binary classification
    depth=6,                              # Depth, paper recommended 6
    heads=8,                              # Heads, paper recommends 8
    attn_dropout=0.1,                     # Post-attention dropout
    ff_dropout=0.1,                       # Feed-forward dropout
    mlp_hidden_mults=(4, 2),              # Relative multiples of each hidden dimension of the last MLP to logits
    mlp_act=nn.ReLU(),                    # Activation for final MLP
).to(device)

loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    for i, (cat_features, cont_features, labels) in enumerate(train_loader):
        cat_features, cont_features, labels = cat_features.to(device), cont_features.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(cat_features, cont_features)

        # Compute loss
        loss = loss_function(outputs, labels.float().unsqueeze(1))  # Ensure labels are the correct shape and type
        loss.backward()  # Compute gradient of the loss with respect to model parameters
        optimizer.step()  # Perform a single optimization step

        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')


Epoch [1/10], Loss: 0.4551
Epoch [2/10], Loss: 0.2716
Epoch [3/10], Loss: 0.1997
Epoch [4/10], Loss: 0.1494
Epoch [5/10], Loss: 0.1233
Epoch [6/10], Loss: 0.1041
Epoch [7/10], Loss: 0.0868
Epoch [8/10], Loss: 0.0833
Epoch [9/10], Loss: 0.0790
Epoch [10/10], Loss: 0.0669


In [18]:
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, brier_score_loss
import numpy as np
import torch

model.eval()  # Set the model to evaluation mode
true_labels = []
scores = []

with torch.no_grad():  # Disable gradient computation
    for cat_features, cont_features, labels in test_loader:
        cat_features, cont_features, labels = cat_features.to(device), cont_features.to(device), labels.to(device)
        outputs = model(cat_features, cont_features)

        # Assuming the output is a logit for binary classification, apply sigmoid to get probabilities
        probs = torch.sigmoid(outputs).squeeze().cpu().numpy()
        scores.extend(probs)

        true_labels.extend(labels.cpu().numpy())

# Convert probabilities to binary predictions based on a 0.5 threshold
predictions = np.array(scores) > 0.5

print("Classification Report:")
print(classification_report(true_labels, predictions, zero_division=0))

roc_auc = roc_auc_score(true_labels, scores)
print(f"AUC-ROC: {roc_auc:.4f}")

precision, recall, _ = precision_recall_curve(true_labels, scores)
auc_pr = auc(recall, precision)
print(f"AUC-PR: {auc_pr:.4f}")

brier_score = brier_score_loss(true_labels, scores)
print(f"Brier score: {brier_score:.4f}")


Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95      1206
           1       0.96      0.99      0.97      2182

    accuracy                           0.97      3388
   macro avg       0.97      0.96      0.96      3388
weighted avg       0.97      0.97      0.97      3388

AUC-ROC: 0.9825
AUC-PR: 0.9837
Brier score: 0.0285


In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tab_transformer_pytorch import FTTransformer

# Example setup for DataLoader
train_dataset = TensorDataset(X_cat_train_tensor, X_cont_train_tensor, y_train_tensor)  # Ensure tensors are correctly prepared
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = FTTransformer(
    categories = categories,      # tuple containing the number of unique values within each category
    num_continuous = num_continuous,                # number of continuous values
    dim = 32,                           # dimension, paper set at 32
    dim_out = 1,                        # binary prediction, but could be anything
    depth = 6,                          # depth, paper recommended 6
    heads = 8,                          # heads, paper recommends 8
    attn_dropout = 0.1,                 # post-attention dropout
    ff_dropout = 0.1                    # feed forward dropout
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

loss_function = nn.BCEWithLogitsLoss()  # Assuming binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        x_categ, x_numer, y = [item.to(device) for item in batch]
        optimizer.zero_grad()
        outputs = model(x_categ, x_numer).squeeze(1)
        loss = loss_function(outputs, y.float())
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')

Epoch 1, Loss: 0.5710726689409327
Epoch 2, Loss: 0.4314321356791037
Epoch 3, Loss: 0.4529719253381093
Epoch 4, Loss: 0.37532821628782487
Epoch 5, Loss: 0.3593967612142916
Epoch 6, Loss: 0.30663899580637616
Epoch 7, Loss: 0.1947062247329288
Epoch 8, Loss: 0.14395135309961107
Epoch 9, Loss: 0.12496594808719776
Epoch 10, Loss: 0.11667687914989612


In [22]:
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, brier_score_loss

model.eval()
true_labels = []
scores = []

with torch.no_grad():
    for batch in test_loader:
        x_categ, x_numer, labels = [item.to(device) for item in batch]
        outputs = model(x_categ, x_numer).squeeze(1)
        probs = torch.sigmoid(outputs).cpu().numpy()
        scores.extend(probs)
        true_labels.extend(labels.cpu().numpy())

predictions = np.array(scores) > 0.5
print("Classification Report:")
print(classification_report(true_labels, predictions, zero_division=0))

roc_auc = roc_auc_score(true_labels, scores)
print(f"AUC-ROC: {roc_auc:.4f}")

precision, recall, _ = precision_recall_curve(true_labels, scores)
auc_pr = auc(recall, precision)
print(f"AUC-PR: {auc_pr:.4f}")

brier_score = brier_score_loss(true_labels, scores)
print(f"Brier score: {brier_score:.4f}")


Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.90      0.93      1206
           1       0.95      0.98      0.96      2182

    accuracy                           0.95      3388
   macro avg       0.96      0.94      0.95      3388
weighted avg       0.95      0.95      0.95      3388

AUC-ROC: 0.9830
AUC-PR: 0.9886
Brier score: 0.0362
