In [None]:
# Fetch the helper repository; presumably it provides the `preprocessing` module imported a few cells below.
!git clone https://github.com/Jumabek/net_intrusion_detection.git

Cloning into 'net_intrusion_detection'...
remote: Enumerating objects: 251, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 251 (delta 17), reused 23 (delta 8), pack-reused 212[K
Receiving objects: 100% (251/251), 2.35 MiB | 24.59 MiB/s, done.
Resolving deltas: 100% (150/150), done.


In [None]:
# Work from inside the clone so the relative paths and imports used below resolve there.
# The absolute path assumes a Colab-style /content workspace.
%cd /content/net_intrusion_detection

/content/net_intrusion_detection


In [None]:
# Download the dataset archive (MachineLearningCSV.zip) from Google Drive.
# The full URL is passed instead of `--id <file-id>`: the `--id` flag has been
# deprecated since gdown 4.3.1, while the URL form works on old and new versions.
!gdown "https://drive.google.com/uc?id=1-t3RdDpmqMs4ABt9oobSapeNYTZJ9tpu"

Downloading...
From: https://drive.google.com/uc?id=1-t3RdDpmqMs4ABt9oobSapeNYTZJ9tpu
To: /content/net_intrusion_detection/MachineLearningCSV.zip
100% 235M/235M [00:04<00:00, 56.7MB/s]


In [None]:
# Extract the CSV files into MachineLearningCVE/.
# `-o` overwrites existing files without asking, so re-running this cell does not
# stall on unzip's interactive "replace?" prompt.
!unzip -o MachineLearningCSV.zip

Archive:  MachineLearningCSV.zip
   creating: MachineLearningCVE/
  inflating: MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv  


In [None]:
from preprocessing import read_data

def load_data(dataroot):
    """Read the flow-record CSV files under ``dataroot`` and tidy the header.

    Args:
        dataroot: Directory prefix handed to ``read_data`` together with the
            ``'*.pcap_ISCX.csv'`` file pattern.

    Returns:
        The frame produced by ``read_data`` with leading/trailing whitespace
        removed from every column name.

    Side effects:
        Prints the number of records and feature dimensions that were read.
    """
    frame = read_data(dataroot, '*.pcap_ISCX.csv')

    n_rows, n_cols = frame.shape
    print(f"There are {n_rows} flow records with {n_cols} feature dimensions")

    # Raw headers carry stray spaces; map each name to its stripped form.
    stripped_names = {name: name.strip() for name in frame.columns}
    return frame.rename(columns=stripped_names)

In [None]:
# Load the flow records matched by load_data's '*.pcap_ISCX.csv' pattern under the
# extracted folder; the returned frame has whitespace-stripped column names.
data = load_data('MachineLearningCVE/')

MachineLearningCVE/*.pcap_ISCX.csv
['MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv', 'MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv', 'MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv', 'MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv']
There are 2830743 flow records with 79 feature dimensions


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import numpy as np

In [None]:
# Everything except the target column is a feature. Despite its name,
# `non_numeric_columns` lists the columns that are coerced *to* numeric here;
# entries that cannot be parsed become NaN.
non_numeric_columns = data.columns.drop('Label')
data[non_numeric_columns] = data[non_numeric_columns].apply(pd.to_numeric, errors='coerce')

# Map +/-inf onto NaN, then discard every row that contains a NaN.
data = data.replace([np.inf, -np.inf], np.nan).dropna()

# Plain-list form of the feature column names.
num_columns = list(non_numeric_columns)

# Separate the target from the features.
y = data["Label"]
X = data.drop(columns="Label")

# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# cross-validation split done later, so test folds influence the min/max.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X.loc[:, num_columns])

In [None]:
# Turn the string labels into integer class ids. The fitted encoder is kept so
# the ids can be mapped back to label names via `label_encoder.classes_`.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [None]:
# Per-class record counts on the integer-encoded labels (before any resampling).
# `unique_classes` holds the encoded class ids, `counts` the matching frequencies.
unique_classes, counts = np.unique(y_encoded, return_counts=True)
counts

array([2271320,    1956,  128025,   10293,  230124,    5499,    5796,
          7935,      11,      36,  158804,    5897,    1507,      21,
           652])

In [None]:
# Target size per class: keep the observed count, but lift any class that has
# fewer than 1100 records up to 1100.
desired_samples = np.maximum(counts, 1100)

# Encoded class id -> target number of samples, passed to RandomOverSampler below.
sampling_strategy = {cls: target for cls, target in zip(unique_classes, desired_samples)}

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate rows of the under-represented classes until every class
# reaches the size requested in `sampling_strategy`; the fixed seed keeps the
# draw repeatable.
# NOTE(review): this runs on the full dataset, before the cross-validation split,
# so copies of one minority row can land in both a training and a test fold.
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_scaled, y_encoded)

In [None]:
import numpy as np

# Recompute the per-class counts on the oversampled labels to check the result.
# This rebinds `unique_classes` and `counts` to the post-resampling values.
unique_classes, counts = np.unique(y_resampled, return_counts=True)

counts

array([2271320,    1956,  128025,   10293,  230124,    5499,    5796,
          7935,    1100,    1100,  158804,    5897,    1507,    1100,
          1100])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
class IDS2017Dataset(Dataset):
    """In-memory dataset pairing feature rows with integer class labels.

    Args:
        X: Array-like of feature rows; stored as a ``float32`` tensor.
        y: Array-like of class ids; stored as an ``int64`` (``long``) tensor.
    """

    def __init__(self, X, y):
        # Convert once up front so __getitem__ is a plain tensor index.
        self.y = torch.tensor(y, dtype=torch.long)
        self.X = torch.tensor(X, dtype=torch.float32)

    def __len__(self):
        """Number of records (rows of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [None]:
class IDS2017Transformer(nn.Module):
    """Transformer-encoder classifier for individual flow records.

    Each record is linearly projected to ``d_model`` dimensions, passed through
    a stack of Transformer encoder layers as a sequence of length 1, and mapped
    to class logits by a linear head.

    Args:
        input_dim: Number of input features per record.
        d_model: Width of the embedding / encoder layers.
        nhead: Number of attention heads (must divide ``d_model``).
        num_layers: Number of encoder layers.
        num_classes: Number of output classes (size of the logit vector).
    """

    def __init__(self, input_dim, d_model, nhead, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Linear(input_dim, d_model)

        # Encoder-only stack. The previous nn.Transformer(d_model, nhead, num_layers)
        # also built a default 6-layer decoder that forward() never used but whose
        # parameters were still handed to the optimizer.
        # batch_first=True makes the layout (batch, seq, feature); with the default
        # (seq, batch, feature) layout, unsqueeze(1) turned the mini-batch into one
        # long sequence, so records attended to each other and a prediction
        # depended on which other records happened to share its batch.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, batch_first=True
        )
        # The final LayerNorm mirrors the encoder that nn.Transformer constructs.
        self.transformer = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers, norm=nn.LayerNorm(d_model)
        )
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x`` of shape ``(batch, input_dim)``."""
        x = self.embedding(x)                  # (batch, d_model)
        x = self.transformer(x.unsqueeze(1))   # (batch, 1, d_model): one token per record
        x = x.squeeze(1)                       # (batch, d_model)
        return self.classifier(x)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import StratifiedKFold

# Wrap the resampled arrays once; the per-fold train/test views are later taken
# from this dataset by index.
dataset = IDS2017Dataset(X_resampled,y_resampled)

In [None]:
# train_dataset = IDS2017Dataset(X_train, y_train)
# test_dataset = IDS2017Dataset(X_test, y_test)

# train_loader = DataLoader(train_dataset, batch_size=5*2014, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=5*1024, shuffle=False)
# 5-fold stratified split, shuffled with a fixed seed so fold membership is repeatable.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [None]:
# import numpy as np
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# # Calculate the number of instances for each class
# unique_classes, counts = np.unique(y_train, return_counts=True)

# # Calculate the inverse of the class counts (you can also use other methods to calculate class weights)
# class_weights = 1 / counts

# # Normalize the class weights so that they sum up to 1
# class_weights = class_weights / np.sum(class_weights)

# # Convert the class weights to a PyTorch tensor
# class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

In [None]:
from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score, precision_score

num_epochs = 5

# One entry per fold, appended in fold order; the summary cell averages these lists.
f1_scores = []
recall_scores = []
accuracy_scores = []
precision_scores = []

# NOTE(review): X_resampled was min-max scaled and randomly oversampled on the
# full dataset before this split, so duplicated minority rows can appear in both
# the train and the test part of a fold. Treat minority-class scores as optimistic.
for fold, (train_idx, test_idx) in enumerate(skf.split(X_resampled, y_resampled)):
    train_dataset = Subset(dataset, train_idx)
    test_dataset = Subset(dataset, test_idx)

    train_loader = DataLoader(train_dataset, batch_size=7*1024, shuffle=True, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=7*1024, shuffle=False, num_workers=4)

    # Fresh model and optimizer per fold so no weights carry over between folds.
    # The output size comes from the fitted label encoder, which defines the class
    # ids found in y_resampled.
    model = IDS2017Transformer(
        input_dim=X_resampled.shape[1],
        d_model=64,
        nhead=4,
        num_layers=2,
        num_classes=len(label_encoder.classes_),
    ).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    print(f"Fold {fold + 1}:")
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')

    # Evaluate on the held-out part of the fold. Predictions are gathered as one
    # tensor per batch and concatenated once, instead of growing Python lists
    # element by element.
    model.eval()
    true_batches, pred_batches = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs.to(device))
            pred_batches.append(outputs.argmax(dim=1).cpu())
            true_batches.append(labels)
    y_true = torch.cat(true_batches).numpy()
    y_pred = torch.cat(pred_batches).numpy()

    # zero_division=0: a class the model never predicts scores 0 precision rather
    # than 1, which would otherwise inflate the weighted average.
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)

    f1_scores.append(f1)
    recall_scores.append(recall)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)

    print(f"Metrics for Fold {fold + 1}:")
    print(f"F1 score: {f1:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")

Fold 1:
Epoch [1/5], Loss: 0.2301
Epoch [2/5], Loss: 0.0917
Epoch [3/5], Loss: 0.0686
Epoch [4/5], Loss: 0.0688
Epoch [5/5], Loss: 0.0514
Metrics for Fold 1:
F1 score: 0.8002
Recall: 0.8365
Accuracy: 0.8365
Precision: 0.8203
Fold 2:
Epoch [1/5], Loss: 0.2226
Epoch [2/5], Loss: 0.0875
Epoch [3/5], Loss: 0.0624
Epoch [4/5], Loss: 0.0558
Epoch [5/5], Loss: 0.0485
Metrics for Fold 2:
F1 score: 0.8119
Recall: 0.8397
Accuracy: 0.8397
Precision: 0.8093
Fold 3:
Epoch [1/5], Loss: 0.2172
Epoch [2/5], Loss: 0.0859
Epoch [3/5], Loss: 0.0631
Epoch [4/5], Loss: 0.0535
Epoch [5/5], Loss: 0.0505
Metrics for Fold 3:
F1 score: 0.8179
Recall: 0.8470
Accuracy: 0.8470
Precision: 0.7973
Fold 4:
Epoch [1/5], Loss: 0.2256
Epoch [2/5], Loss: 0.0957
Epoch [3/5], Loss: 0.0696
Epoch [4/5], Loss: 0.0553
Epoch [5/5], Loss: 0.0502
Metrics for Fold 4:
F1 score: 0.8813
Recall: 0.8889
Accuracy: 0.8889
Precision: 0.8815
Fold 5:
Epoch [1/5], Loss: 0.2165
Epoch [2/5], Loss: 0.0934
Epoch [3/5], Loss: 0.0695
Epoch [4/5], L

In [None]:
# Mean of each metric over the per-fold values collected by the cross-validation loop.
avg_f1, avg_recall, avg_accuracy, avg_precision = (
    np.mean(per_fold)
    for per_fold in (f1_scores, recall_scores, accuracy_scores, precision_scores)
)

print("Average metrics:")
print(f"Average F1 score: {avg_f1:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")

Average metrics:
Average F1 score: 0.8354
Average Recall: 0.8579
Average Accuracy: 0.8579
Average Precision: 0.8362


In [None]:
# import numpy as np

# unique_classes, counts = np.unique(y_resampled, return_counts=True)
# decoded_labels = label_encoder.inverse_transform(unique_classes)
# proportions = counts / len(y_resampled)

# for cls, prop in zip(decoded_labels, proportions):
#     print(f"Class {cls}: {prop:.4f}")