In [None]:
import pandas as pd

# Load the cleaned CICIDS2017 table here so the cell runs on a fresh kernel
# instead of relying on a `df` left over from an earlier session.
csv_path = "data/cicids2017_cleaned.csv"
df = pd.read_csv(csv_path)

# List every column, then show the value counts of each column whose name
# contains "label" (case-insensitive) to inspect the class balance.
print(df.columns.tolist())
label_cols = [c for c in df.columns if "label" in c.lower()]
print(label_cols)
for col in label_cols:
    print(df[col].value_counts())


['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Length of Fwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'Average Packet Size', 'Subflow Fwd Bytes', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward', 'Active Mean', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Max', 'Id

In [2]:
# Cell 1: Imports

import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# PyTorch Geometric
from torch_geometric.nn import SAGEConv

# Report the installed PyTorch build and whether a CUDA device is usable.
print(f"Torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")



  from .autonotebook import tqdm as notebook_tqdm


Torch version: 2.9.1+cu130
CUDA available: True


In [3]:
# Cell 2: Config & paths

# Build the input paths with os.path.join so they resolve on every OS; a
# literal backslash is a path separator on Windows only.
DATA_DIR = "processed"
X_PATH = os.path.join(DATA_DIR, "preprocessed_X_seq.npy")
Y_PATH = os.path.join(DATA_DIR, "preprocessed_y_seq.npy")

# Training hyperparameters.
BATCH_SIZE = 32
EPOCHS = 15
LR = 1e-3
SEQ_LEN = 32

TRAIN_RATIO = 0.7
VAL_RATIO = 0.15   # remaining 0.15 -> test

# Seed NumPy and PyTorch once, up front, so weight initialisation and
# DataLoader shuffling are repeatable across Restart & Run All.
# (GPU kernels may still introduce small nondeterminism.)
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAVE_DIR = "saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

print("Using device:", DEVICE)


Using device: cuda


In [4]:
# Cell 3: Load preprocessed windows & split into train/val/test

X_seq = np.load(X_PATH)   # expected shape: (N_windows, SEQ_LEN, num_features)
y_seq = np.load(Y_PATH)   # expected shape: (N_windows, SEQ_LEN)

# Fail fast if the files on disk do not match each other or the configured
# window length; SEQ_LEN also sizes the model's positional embedding table.
assert X_seq.ndim == 3, f"X_seq must be 3-D, got shape {X_seq.shape}"
assert y_seq.shape == X_seq.shape[:2], (
    f"y_seq shape {y_seq.shape} does not match X_seq windows {X_seq.shape[:2]}"
)
assert X_seq.shape[1] == SEQ_LEN, (
    f"window length {X_seq.shape[1]} differs from SEQ_LEN={SEQ_LEN}"
)

print("X_seq shape:", X_seq.shape)
print("y_seq shape:", y_seq.shape)

def create_splits(X_seq, y_seq, train_ratio=0.7, val_ratio=0.15):
    """
    Split windows into contiguous train / validation / test partitions.

    The split is positional (no shuffling): the first `train_ratio` share of
    windows is the training set, the next `val_ratio` share is validation, and
    everything left over is the test set. The returned arrays are slices of
    the inputs, not copies.

    Args:
        X_seq: array whose first axis indexes windows.
        y_seq: label array with the same first-axis length as `X_seq`.
        train_ratio: fraction of windows assigned to training.
        val_ratio: fraction of windows assigned to validation.

    Returns:
        ((X_train, y_train), (X_val, y_val), (X_test, y_test))

    Raises:
        ValueError: if X/y disagree on the number of windows, or if the
            ratios are negative or sum to more than 1.
    """
    if X_seq.shape[0] != y_seq.shape[0]:
        raise ValueError(
            f"X_seq has {X_seq.shape[0]} windows but y_seq has {y_seq.shape[0]}"
        )
    # Small tolerance so ratio pairs that sum to 1 up to float rounding pass.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got {train_ratio} and {val_ratio}"
        )

    N = X_seq.shape[0]
    n_train = int(N * train_ratio)
    val_end = n_train + int(N * val_ratio)

    # The test split absorbs the rounding remainder of the two int() casts.
    return (
        (X_seq[:n_train], y_seq[:n_train]),
        (X_seq[n_train:val_end], y_seq[n_train:val_end]),
        (X_seq[val_end:], y_seq[val_end:]),
    )

train_split, val_split, test_split = create_splits(
    X_seq, y_seq, TRAIN_RATIO, VAL_RATIO
)
X_train, y_train = train_split
X_val, y_val = val_split
X_test, y_test = test_split

print(f"Train windows: {X_train.shape[0]}")
print(f"Val windows: {X_val.shape[0]}")
print(f"Test windows: {X_test.shape[0]}")

# Class balance of the training split, counted per flow (labels flattened
# across windows).
unique, counts = np.unique(y_train.ravel(), return_counts=True)
print("Train label distribution (per-flow):", dict(zip(unique, counts)))


X_seq shape: (78773, 32, 52)
y_seq shape: (78773, 32)
Train windows: 55141
Val windows: 11815
Test windows: 11817
Train label distribution (per-flow): {np.int64(0): np.int64(1532563), np.int64(1): np.int64(231949)}


In [5]:
# Cell 4: PyTorch Dataset wrapping (X_seq, y_seq)

class FlowSequenceDatasetFromArrays(Dataset):
    """
    Map-style dataset over pre-windowed flow arrays.

    Indexing returns one window as a pair of tensors:
      x: (seq_len, num_features), float32
      y: (seq_len,), int64
    """
    def __init__(self, X_seq, y_seq):
        # Features and labels must describe the same number of windows.
        assert X_seq.shape[0] == y_seq.shape[0]
        self.X, self.y = X_seq, y_seq
        self.num_windows, self.seq_len, self.num_features = X_seq.shape

    def __len__(self):
        return self.num_windows

    def __getitem__(self, idx):
        # torch.tensor copies, so the returned tensors never alias the arrays.
        features = torch.tensor(self.X[idx], dtype=torch.float32)  # (L, F)
        labels = torch.tensor(self.y[idx], dtype=torch.long)       # (L,)
        return features, labels

# Wrap each split in a dataset.
train_ds, val_ds, test_ds = (
    FlowSequenceDatasetFromArrays(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Only the training loader shuffles; validation and test keep window order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
eval_loader_kwargs = dict(batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_ds, **eval_loader_kwargs)
test_loader = DataLoader(test_ds, **eval_loader_kwargs)

# Sanity check on one batch: expect (B, L, F) features and (B, L) labels.
xb, yb = next(iter(train_loader))
print("One batch shapes:", xb.shape, yb.shape)


One batch shapes: torch.Size([32, 32, 52]) torch.Size([32, 32])


In [6]:
# Cell 5: GraphSAGEEncoder (GNN branch) using a complete graph per window

class GraphSAGEEncoder(nn.Module):
    """
    Stack of SAGEConv layers applied to each window as a complete graph.

    Every window of L flows is treated as L nodes with a directed edge between
    each ordered pair of distinct nodes. Windows in a batch stay disjoint
    because their node ids are offset by `b * L`.

    Args:
        emb_dim: size of the incoming node embeddings and of the output.
        hidden_dim: output width of every layer except the last.
        num_layers: number of SAGEConv layers; the last maps back to emb_dim.
    """
    def __init__(self, emb_dim, hidden_dim=128, num_layers=2):
        super().__init__()

        layers = []
        in_dim = emb_dim
        for i in range(num_layers):
            # The final layer returns to emb_dim so forward() can restore (B, L, D).
            out_dim = hidden_dim if i < num_layers - 1 else emb_dim
            layers.append(SAGEConv(in_dim, out_dim))
            in_dim = out_dim

        self.convs = nn.ModuleList(layers)
        self.act = nn.ReLU()

    def build_complete_edge_index(self, B, L, device):
        """
        Build a complete directed graph (no self-loops)
        for each of B windows, each with L nodes.

        Edges are ordered window by window, and within a window row-major by
        (src, dst). Returns an int64 edge_index of shape (2, B * L * (L - 1)),
        which is (2, 0) when B == 0 or L <= 1.
        """
        idx = torch.arange(L, device=device)                 # (L,)
        src, dst = torch.meshgrid(idx, idx, indexing="ij")   # (L, L)

        # Drop the diagonal (self loops); boolean indexing flattens row-major.
        mask = src != dst
        base = torch.stack([src[mask], dst[mask]], dim=0)    # (2, E_per_window)

        # Shift node ids of window b by b * L in one broadcasted add instead of
        # a Python loop over windows: (2, 1, E) + (1, B, 1) -> (2, B, E).
        offsets = torch.arange(B, device=device) * L         # (B,)
        edge_index = base.unsqueeze(1) + offsets.view(1, B, 1)

        return edge_index.reshape(2, -1)                     # (2, E_total)

    def forward(self, x):
        """
        x: (B, L, emb_dim)
        returns: (B, L, emb_dim)
        """
        B, L, D = x.shape
        device = x.device

        x_flat = x.reshape(B * L, D)  # (N, D) where N = B*L
        edge_index = self.build_complete_edge_index(B, L, device)  # (2, E)

        h = x_flat
        for conv in self.convs:
            # ReLU follows every layer, including the last one.
            h = conv(h, edge_index)
            h = self.act(h)

        h = h.reshape(B, L, D)
        return h


In [7]:
# Cell 6: Hybrid Transformer + GNN IDS model

class HybridTransformerGNN_IDS(nn.Module):
    """
    Per-flow classifier that fuses a Transformer branch with a GraphSAGE branch.

    Both branches consume the same flow embedding. The Transformer branch adds
    learned positional embeddings first; the GraphSAGE branch does not. Their
    outputs are concatenated, passed through a fusion MLP and classified per flow.

    Args:
        in_dim: number of input features per flow.
        emb_dim: width of the shared flow embedding and of both branch outputs.
        n_heads: attention heads per Transformer encoder layer.
        n_layers: number of Transformer encoder layers.
        num_classes: number of output classes per flow.
        max_seq_len: size of the positional embedding table, i.e. the longest
            window the model accepts.
        gnn_layers: number of SAGEConv layers in the graph branch.
    """
    def __init__(self, in_dim, emb_dim=128, n_heads=4, n_layers=2,
                 num_classes=2, max_seq_len=32, gnn_layers=2):
        super().__init__()

        # Shared flow embedding
        self.embed = nn.Sequential(
            nn.Linear(in_dim, 128),
            nn.ReLU(),
            nn.Linear(128, emb_dim)
        )

        # Positional embeddings
        self.pos_emb = nn.Embedding(max_seq_len, emb_dim)

        # Transformer encoder (temporal branch)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim,
            nhead=n_heads,
            dim_feedforward=256,
            dropout=0.1,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        # GraphSAGE encoder (graph branch)
        self.gnn = GraphSAGEEncoder(
            emb_dim=emb_dim,
            hidden_dim=128,
            num_layers=gnn_layers
        )

        # Fusion MLP with dropout for better generalization
        self.fusion = nn.Sequential(
            nn.Linear(emb_dim * 2, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        self.classifier = nn.Linear(128, num_classes)

    def forward(self, x):
        """
        x: (B, L, in_dim)
        returns: (B, L, num_classes)

        Raises:
            ValueError: if L exceeds the positional embedding table size.
        """
        B, L, _ = x.shape
        device = x.device

        # Guard the positional lookup: an out-of-range index would otherwise
        # surface as an opaque indexing failure inside nn.Embedding.
        max_len = self.pos_emb.num_embeddings
        if L > max_len:
            raise ValueError(
                f"sequence length {L} exceeds max_seq_len={max_len} "
                "of the positional embedding"
            )

        # Shared flow embedding
        x_emb = self.embed(x)  # (B, L, emb_dim)

        # Transformer branch (temporal)
        positions = torch.arange(L, device=device).unsqueeze(0).expand(B, L)
        temp_in = x_emb + self.pos_emb(positions)
        z_temp = self.transformer(temp_in)  # (B, L, emb_dim)

        # GNN branch (relational); receives the embedding without positions
        z_graph = self.gnn(x_emb)          # (B, L, emb_dim)

        # Fusion
        z = torch.cat([z_temp, z_graph], dim=-1)  # (B, L, 2*emb_dim)
        z = self.fusion(z)                        # (B, L, 128)

        logits = self.classifier(z)               # (B, L, num_classes)
        return logits

# Instantiate once to see param count
# The feature width of the loaded windows fixes the model's input size.
in_dim = X_seq.shape[-1]
model = HybridTransformerGNN_IDS(
    in_dim=in_dim,
    emb_dim=128,
    n_heads=4,
    n_layers=2,
    num_classes=2,
    max_seq_len=SEQ_LEN,
    gnn_layers=2,
).to(DEVICE)

# Count all parameter elements.
total_params = sum(param.numel() for param in model.parameters())
print("Hybrid model parameters:", total_params)


Hybrid model parameters: 457090


In [8]:
# Cell 7: Helper functions – accuracy, class weights

def accuracy_from_logits(logits, targets):
    """
    Fraction of positions whose arg-max class equals the target.

    logits:  (B, L, C)
    targets: (B, L)
    Returns a Python float.
    """
    hits = logits.argmax(dim=-1).eq(targets)  # (B, L) bool
    return hits.float().mean().item()

def compute_class_weights_from_train(y_train, device):
    """
    Compute inverse-frequency ("balanced") class weights from training labels.

    weight[c] = total_flows / (n_classes * count[c])

    Args:
        y_train: (N_windows, L) array of non-negative integer class labels.
        device: device on which to create the returned tensor.

    Returns:
        float32 torch tensor of shape (n_classes,), where n_classes is at
        least 2 and otherwise the largest label + 1. For 0/1 labels this is
        the usual shape-(2,) result.

    Note:
        A class with zero occurrences receives a very large weight because
        only the 1e-8 term keeps the denominator non-zero.
    """
    flat = np.asarray(y_train).reshape(-1)
    counts = np.bincount(flat, minlength=2).astype(np.float32)
    # Take the class count from the data rather than assuming exactly 2.
    n_classes = counts.size
    total = counts.sum()
    # Simple inverse-frequency weights, normalized
    weights = total / (n_classes * counts + 1e-8)
    w_tensor = torch.tensor(weights, device=device, dtype=torch.float32)
    print("Class counts (0,1):", counts, " -> class weights:", weights)
    return w_tensor

# Derive the per-class loss weights from the training split only and create
# the tensor on DEVICE.
class_weights = compute_class_weights_from_train(y_train, DEVICE)


Class counts (0,1): [1532563.  231949.]  -> class weights: [0.5756736 3.8036637]


In [9]:
# Cell 8: Train hybrid model with class-weighted loss

def _run_epoch(model, loader, criterion, optimizer=None):
    """
    Run one full pass over `loader` and return (mean_loss, accuracy).

    With an optimizer the model is put in train mode and updated after every
    batch; without one it is put in eval mode and gradients are disabled.

    Both metrics are aggregated over flows rather than averaged per batch, so
    a shorter final batch does not get the same weight as a full one.
    Accuracy is the exact fraction of correctly classified flows; the loss is
    the per-batch criterion value weighted by the number of flows in the batch.
    """
    is_train = optimizer is not None
    model.train(is_train)

    loss_sum = 0.0
    n_correct = 0
    n_flows = 0

    with torch.set_grad_enabled(is_train):
        for xb, yb in loader:
            xb = xb.to(DEVICE)  # (B, L, F)
            yb = yb.to(DEVICE)  # (B, L)

            logits = model(xb)  # (B, L, C)
            loss = criterion(
                logits.reshape(-1, logits.size(-1)),  # (B*L, C)
                yb.reshape(-1)                        # (B*L,)
            )

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_flows = yb.numel()
            loss_sum += loss.item() * batch_flows
            n_correct += (logits.argmax(dim=-1) == yb).sum().item()
            n_flows += batch_flows

    return loss_sum / n_flows, n_correct / n_flows


hybrid_model = HybridTransformerGNN_IDS(
    in_dim=in_dim,
    emb_dim=128,
    n_heads=4,
    n_layers=2,
    num_classes=2,
    max_seq_len=SEQ_LEN,
    gnn_layers=2,
).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(hybrid_model.parameters(), lr=LR)

# LR scheduler: halve the learning rate when validation loss stops improving.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2
)

best_val_acc = -1.0
best_model_path = os.path.join(SAVE_DIR, "hybrid_transformer_gnn_best_weighted.pth")

for epoch in range(1, EPOCHS + 1):
    avg_train_loss, avg_train_acc = _run_epoch(
        hybrid_model, train_loader, criterion, optimizer
    )
    avg_val_loss, avg_val_acc = _run_epoch(hybrid_model, val_loader, criterion)

    scheduler.step(avg_val_loss)

    # Checkpoint whenever validation accuracy improves; the evaluation cell
    # reloads the weights from best_model_path.
    if avg_val_acc > best_val_acc:
        best_val_acc = avg_val_acc
        torch.save(hybrid_model.state_dict(), best_model_path)
        print(f"💾 New best hybrid model saved (epoch {epoch}) with Val Acc = {best_val_acc:.4f}")

    print(
        f"Epoch {epoch}/{EPOCHS} | "
        f"Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.4f} | "
        f"Val Loss: {avg_val_loss:.4f}, Val Acc: {avg_val_acc:.4f}"
    )

print("\n✅ Hybrid training finished.")
print("Best validation accuracy:", best_val_acc)


üíæ New best hybrid model saved (epoch 1) with Val Acc = 0.8267
Epoch 1/15 | Train Loss: 0.0446, Train Acc: 0.9877 | Val Loss: 2.1831, Val Acc: 0.8267
Epoch 2/15 | Train Loss: 0.0245, Train Acc: 0.9938 | Val Loss: 1.3798, Val Acc: 0.8171
Epoch 3/15 | Train Loss: 0.0208, Train Acc: 0.9948 | Val Loss: 1.7646, Val Acc: 0.7642
Epoch 4/15 | Train Loss: 0.0173, Train Acc: 0.9954 | Val Loss: 2.5332, Val Acc: 0.6777
Epoch 5/15 | Train Loss: 0.0153, Train Acc: 0.9957 | Val Loss: 2.2169, Val Acc: 0.7407
üíæ New best hybrid model saved (epoch 6) with Val Acc = 0.8369
Epoch 6/15 | Train Loss: 0.0129, Train Acc: 0.9965 | Val Loss: 1.6175, Val Acc: 0.8369
Epoch 7/15 | Train Loss: 0.0119, Train Acc: 0.9966 | Val Loss: 2.5539, Val Acc: 0.7349
üíæ New best hybrid model saved (epoch 8) with Val Acc = 0.8732
Epoch 8/15 | Train Loss: 0.0113, Train Acc: 0.9967 | Val Loss: 1.4894, Val Acc: 0.8732
Epoch 9/15 | Train Loss: 0.0097, Train Acc: 0.9970 | Val Loss: 2.0873, Val Acc: 0.8136
Epoch 10/15 | Train Lo

In [10]:
# Cell 9: Test metrics at 0.5 threshold + threshold sweep

# Reload best model
# Restore the checkpoint stored at best_model_path and switch to eval mode.
best_state = torch.load(best_model_path, map_location=DEVICE)
hybrid_model.load_state_dict(best_state)
hybrid_model.to(DEVICE)
hybrid_model.eval()

prob_chunks, target_chunks = [], []

with torch.no_grad():
    for xb, yb in test_loader:
        logits = hybrid_model(xb.to(DEVICE))          # (B, L, 2)
        attack_prob = logits.softmax(dim=-1)[..., 1]  # P(attack)

        # Flatten each batch to flow level before collecting it.
        prob_chunks.append(attack_prob.cpu().numpy().ravel())
        target_chunks.append(yb.numpy().ravel())

all_probs = np.concatenate(prob_chunks)      # shape: (N_flows,)
all_targets = np.concatenate(target_chunks)

def eval_at_threshold(th):
    """
    Score the collected test predictions at decision threshold `th`.

    A flow is predicted as class 1 when its probability in `all_probs` is
    >= th. Reads the module-level `all_probs` and `all_targets`.

    Returns:
        (accuracy, precision, recall, f1); undefined ratios are reported as 0.
    """
    hard_preds = (all_probs >= th).astype(int)
    accuracy = accuracy_score(all_targets, hard_preds)
    precision, recall, f1 = (
        scorer(all_targets, hard_preds, zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1

# Default 0.5 threshold (for comparison)
# Metrics at the default 0.5 threshold.
acc_05, prec_05, rec_05, f1_05 = eval_at_threshold(0.5)
print("✅ TEST METRICS at threshold 0.5 (Hybrid + GNN, weighted loss):")
print(f"Accuracy : {acc_05:.4f}")
print(f"Precision: {prec_05:.4f}")
print(f"Recall   : {rec_05:.4f}")
print(f"F1-score : {f1_05:.4f}")

# Sweep thresholds and track the one with the best F1.
# NOTE(review): the sweep is scored on the test set itself, so the "best"
# threshold is an optimistic diagnostic; choose a deployment threshold on
# the validation split instead.
thresholds = np.linspace(0.1, 0.9, 9)
print("\n🔍 Threshold sweep:")
best_f1 = -1
best_f1_th = None

for th in thresholds:
    acc, prec, rec, f1 = eval_at_threshold(th)
    print(f"th={th:.2f} | Acc={acc:.4f}, Prec={prec:.4f}, Rec={rec:.4f}, F1={f1:.4f}")
    if f1 > best_f1:
        best_f1 = f1
        best_f1_th = th

print(f"\n⭐ Best F1-score {best_f1:.4f} achieved at threshold {best_f1_th:.2f}")


‚úÖ TEST METRICS at threshold 0.5 (Hybrid + GNN, weighted loss):
Accuracy : 0.9674
Precision: 0.9753
Recall   : 0.7778
F1-score : 0.8654

üîç Threshold sweep:
th=0.10 | Acc=0.9661, Prec=0.9064, Rec=0.8344, F1=0.8689
th=0.20 | Acc=0.9686, Prec=0.9389, Rec=0.8203, F1=0.8756
th=0.30 | Acc=0.9691, Prec=0.9555, Rec=0.8079, F1=0.8755
th=0.40 | Acc=0.9683, Prec=0.9673, Rec=0.7912, F1=0.8704
th=0.50 | Acc=0.9674, Prec=0.9753, Rec=0.7778, F1=0.8654
th=0.60 | Acc=0.9663, Prec=0.9812, Rec=0.7641, F1=0.8591
th=0.70 | Acc=0.9654, Prec=0.9860, Rec=0.7536, F1=0.8542
th=0.80 | Acc=0.9615, Prec=0.9894, Rec=0.7220, F1=0.8348
th=0.90 | Acc=0.9506, Prec=0.9925, Rec=0.6382, F1=0.7768

‚≠ê Best F1-score 0.8756 achieved at threshold 0.20


In [11]:
import pandas as pd
import os

model_name = "Hybrid"  # change per notebook

# One-row summary of the test metrics at the 0.5 threshold. It uses its own
# names so it does not overwrite the dataset frame `df` or the input
# `csv_path` bound earlier in the notebook.
metrics_df = pd.DataFrame(
    [[model_name, acc_05, prec_05, rec_05, f1_05]],
    columns=["Model", "Accuracy", "Precision", "Recall", "F1"],
)

os.makedirs("results", exist_ok=True)
metrics_csv_path = f"results/{model_name}.csv"

metrics_df.to_csv(metrics_csv_path, index=False)
print(f"Exported metrics to {metrics_csv_path}")

metrics_df


Exported metrics to results/Hybrid.csv


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Hybrid,0.96743,0.9753,0.777815,0.865434
