In [1]:
!git clone https://github.com/Jumabek/net_intrusion_detection.git

fatal: destination path 'net_intrusion_detection' already exists and is not an empty directory.


In [2]:
%cd /content/net_intrusion_detection

/content/net_intrusion_detection


In [3]:
!git checkout transformer

Already on 'transformer'
Your branch is up to date with 'origin/transformer'.


In [4]:
!gdown --id 1-t3RdDpmqMs4ABt9oobSapeNYTZJ9tpu

Downloading...
From: https://drive.google.com/uc?id=1-t3RdDpmqMs4ABt9oobSapeNYTZJ9tpu
To: /content/net_intrusion_detection/MachineLearningCSV.zip
100% 235M/235M [00:01<00:00, 165MB/s]


In [5]:
!unzip MachineLearningCSV.zip

Archive:  MachineLearningCSV.zip
replace MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [6]:
from preprocessing import read_data

def load_data(dataroot):
    data = read_data(dataroot, '*.pcap_ISCX.csv')
    num_records, num_features = data.shape
    print(f"There are {num_records} flow records with {num_features} feature dimensions")

    data = data.rename(columns=lambda x: x.strip())  # Strip whitespace from column names
    return data

In [7]:
data = load_data('MachineLearningCVE/')

MachineLearningCVE/*.pcap_ISCX.csv
['MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv', 'MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv', 'MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv', 'MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv']
There are 2830743 flow records with 79 feature dimensions


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import  LabelEncoder
import numpy as np

In [9]:
non_numeric_columns = data.columns.drop('Label')  
data[non_numeric_columns] = data[non_numeric_columns].apply(pd.to_numeric, errors='coerce')

data.replace([np.inf, -np.inf], np.nan, inplace=True)
data = data.dropna()

num_columns = non_numeric_columns.tolist()

X = data.drop("Label", axis=1)
y = data["Label"]

In [10]:
from preprocessing import balance_data, normalize
X = normalize(X)
X = X.to_numpy()

In [11]:
# Preprocess the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [12]:
import numpy as np

unique_classes, counts = np.unique(y, return_counts=True)

counts

array([2271320,    1956,  128025,   10293,  230124,    5499,    5796,
          7935,      11,      36,  158804,    5897,    1507,      21,
           652])

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [14]:
class IDS2017Dataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [15]:
class IDS2017Transformer(nn.Module):
    def __init__(self, input_dim, d_model, nhead, num_layers, num_classes):
        super(IDS2017Transformer, self).__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers)
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer.encoder(x.unsqueeze(1))
        x = x.squeeze(1)
        x = self.classifier(x)
        return x

In [16]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import StratifiedKFold

dataset = IDS2017Dataset(X,y_encoded)

In [17]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from sklearn.metrics import balanced_accuracy_score,classification_report, f1_score, recall_score, accuracy_score, precision_score

num_epochs = 20

results = {}

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded)):
    train_dataset = Subset(dataset, train_idx)
    test_dataset = Subset(dataset, test_idx)

    train_loader = DataLoader(train_dataset, batch_size=8*1024, shuffle=True, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=8*1024, shuffle=False, num_workers=4)

    model = IDS2017Transformer(input_dim=X.shape[1], d_model=64, nhead=4, num_layers=2, num_classes=len(y.unique())).to(device)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    print(f"Fold {fold + 1}:")
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    test_acc = balanced_accuracy_score(y_true,y_pred)*100
    print('balanced test acc: ',test_acc)
    results[fold]= (test_acc)i
    

Fold 1:
Epoch [1/20], Loss: 0.2177
Epoch [2/20], Loss: 0.0886
Epoch [3/20], Loss: 0.0660
Epoch [4/20], Loss: 0.0535
Epoch [5/20], Loss: 0.0470
Epoch [6/20], Loss: 0.0838
Epoch [7/20], Loss: 0.0475
Epoch [8/20], Loss: 0.0430
Epoch [9/20], Loss: 0.0449
Epoch [10/20], Loss: 0.0408
Epoch [11/20], Loss: 0.0398
Epoch [12/20], Loss: 0.0390
Epoch [13/20], Loss: 0.0399
Epoch [14/20], Loss: 0.0396
Epoch [15/20], Loss: 0.0373
Epoch [16/20], Loss: 0.0367
Epoch [17/20], Loss: 0.0364
Epoch [18/20], Loss: 0.0358
Epoch [19/20], Loss: 0.0412
Epoch [20/20], Loss: 0.0421
balanced test acc:  45.28964682854525
Fold 2:
Epoch [1/20], Loss: 0.2097
Epoch [2/20], Loss: 0.0865
Epoch [3/20], Loss: 0.0646
Epoch [4/20], Loss: 0.0530
Epoch [5/20], Loss: 0.0501
Epoch [6/20], Loss: 0.0455
Epoch [7/20], Loss: 0.0428
Epoch [8/20], Loss: 0.0412
Epoch [9/20], Loss: 0.0744
Epoch [10/20], Loss: 0.0488
Epoch [11/20], Loss: 0.0431
Epoch [12/20], Loss: 0.0410
Epoch [13/20], Loss: 0.0396
Epoch [14/20], Loss: 0.0390
Epoch [15/20