In [1]:
!git clone https://github.com/Jumabek/net_intrusion_detection.git

Cloning into 'net_intrusion_detection'...
remote: Enumerating objects: 248, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 248 (delta 16), reused 19 (delta 7), pack-reused 212[K
Receiving objects: 100% (248/248), 2.35 MiB | 7.97 MiB/s, done.
Resolving deltas: 100% (149/149), done.


In [2]:
%cd /content/net_intrusion_detection

/content/net_intrusion_detection


In [3]:
!gdown --id 1-t3RdDpmqMs4ABt9oobSapeNYTZJ9tpu

Downloading...
From: https://drive.google.com/uc?id=1-t3RdDpmqMs4ABt9oobSapeNYTZJ9tpu
To: /content/net_intrusion_detection/MachineLearningCSV.zip
100% 235M/235M [00:05<00:00, 40.3MB/s]


In [4]:
!unzip MachineLearningCSV.zip

Archive:  MachineLearningCSV.zip
   creating: MachineLearningCVE/
  inflating: MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv  


In [5]:
from preprocessing import read_data

def load_data(dataroot):
    data = read_data(dataroot, '*.pcap_ISCX.csv')
    num_records, num_features = data.shape
    print(f"There are {num_records} flow records with {num_features} feature dimensions")

    data = data.rename(columns=lambda x: x.strip())  # Strip whitespace from column names
    return data

In [6]:
data = load_data('MachineLearningCVE/')

MachineLearningCVE/*.pcap_ISCX.csv
['MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv', 'MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv', 'MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv', 'MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv']
There are 2830743 flow records with 79 feature dimensions


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import numpy as np

In [8]:
non_numeric_columns = data.columns.drop('Label')  # Excluding the label column
data[non_numeric_columns] = data[non_numeric_columns].apply(pd.to_numeric, errors='coerce')

# Replace infinite values with NaN and drop NaN values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data = data.dropna()

# Update num_columns after converting non-Label columns to float
num_columns = non_numeric_columns.tolist()

X = data.drop("Label", axis=1)
y = data["Label"]

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X[num_columns])

In [9]:
# Preprocess the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [11]:
class IDS2017Dataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [12]:
class IDS2017Transformer(nn.Module):
    def __init__(self, input_dim, d_model, nhead, num_layers, num_classes):
        super(IDS2017Transformer, self).__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers)
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer.encoder(x.unsqueeze(1))
        x = x.squeeze(1)
        x = self.classifier(x)
        return x

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

In [22]:
train_dataset = IDS2017Dataset(X_train, y_train)
test_dataset = IDS2017Dataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=5*2014, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=5*1024, shuffle=False)

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = IDS2017Transformer(input_dim=X_train.shape[1], d_model=64, nhead=4, num_layers=2, num_classes=len(y.unique())).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [24]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')

Epoch [1/10], Loss: 0.2660
Epoch [2/10], Loss: 0.0975
Epoch [3/10], Loss: 0.0711
Epoch [4/10], Loss: 0.0571
Epoch [5/10], Loss: 0.0505
Epoch [6/10], Loss: 0.0464
Epoch [7/10], Loss: 0.0449
Epoch [8/10], Loss: 0.0428
Epoch [9/10], Loss: 0.0413
Epoch [10/10], Loss: 0.0417


In [25]:
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

print(classification_report(y_true, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99    454141
           1       0.99      0.35      0.52       372
           2       1.00      0.98      0.99     25563
           3       0.98      0.99      0.99      2090
           4       0.95      0.99      0.97     46048
           5       0.90      0.98      0.94      1091
           6       0.98      0.97      0.98      1174
           7       1.00      0.79      0.88      1649
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         8
          10       0.89      0.96      0.92     31913
          11       0.95      0.98      0.97      1111
          12       1.00      0.07      0.13       304
          13       0.00      0.00      0.00         3
          14       0.00      0.00      0.00       108

    accuracy                           0.98    565576
   macro avg       0.71      0.60      0.62    565576
weighted avg       0.98   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
