In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# 1. Data Preprocessing
def load_and_preprocess_data(file_path):
    # Load CICIDS2017 dataset (assuming CSV format)
    df = pd.read_csv(file_path)
    
    # Remove duplicates and handle missing values
    df = df.drop_duplicates()
    df = df.fillna(df.mean(numeric_only=True))
    
    # Select features (excluding non-numeric and label columns)
    feature_cols = [col for col in df.columns if col != 'Label' and df[col].dtype != 'object']
    X = df[feature_cols].values
    y = df['Label'].values
    
    # For binary classification: Convert labels to binary (Benign: 0, Attack: 1)
    y_binary = np.where(y == 'BENIGN', 0, 1)
    
    # For multiclass classification: Encode labels
    unique_labels = np.unique(y)
    label_map = {label: idx for idx, label in enumerate(unique_labels)}
    y_multiclass = np.array([label_map[label] for label in y])
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, y_binary, y_multiclass, unique_labels

# 2. Graph Construction
def build_graph(X, threshold=0.9):
    num_nodes = X.shape[0]
    edge_index = []
    
    # Compute cosine similarity for edges
    for i in range(num_nodes):
        for j in range(i + 1, num_nodes):
            similarity = 1 - cosine(X[i], X[j])
            if similarity > threshold:
                edge_index.append([i, j])
                edge_index.append([j, i])
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    x = torch.tensor(X, dtype=torch.float)
    
    return edge_index, x

# 3. GAT Model Definition
class GATModel(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads=8):
        super(GATModel, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.6)
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1, concat=False, dropout=0.6)
    
    def forward(self, x, edge_index):
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# 4. Training and Evaluation
def train_and_evaluate(model, data, y, num_epochs, is_binary=True):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()
    
    train_mask = data.train_mask
    val_mask = data.val_mask
    
    history = defaultdict(list)
    
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = criterion(out[train_mask], y[train_mask])
        loss.backward()
        optimizer.step()
        
        # Validation
        model.eval()
        with torch.no_grad():
            val_out = model(data.x, data.edge_index)
            val_loss = criterion(val_out[val_mask], y[val_mask])
            pred = out.argmax(dim=1)
            val_pred = pred[val_mask]
            val_true = y[val_mask]
            
            acc = accuracy_score(val_true, val_pred)
            prec = precision_score(val_true, val_pred, average='weighted', zero_division=0)
            rec = recall_score(val_true, val_pred, average='weighted', zero_division=0)
            f1 = f1_score(val_true, val_pred, average='weighted', zero_division=0)
            
            if is_binary:
                probs = F.softmax(out, dim=1)[:, 1]
                auc = roc_auc_score(val_true, probs[val_mask].numpy())
            else:
                auc = 0  # AUC for multiclass requires one-vs-rest, simplified here
            
            history['train_loss'].append(loss.item())
            history['val_loss'].append(val_loss.item())
            history['val_acc'].append(acc)
            history['val_prec'].append(prec)
            history['val_rec'].append(rec)
            history['val_f1'].append(f1)
            history['val_auc'].append(auc)
        
        print(f'Epoch {epoch+1}: Loss={loss.item():.4f}, Val Loss={val_loss.item():.4f}, Val Acc={acc:.4f}')
    
    return history, pred, F.softmax(out, dim=1)

# 5. Plotting
def plot_metrics(history, title_prefix):
    plt.figure(figsize=(12, 5))
    
    # Accuracy Plot
    plt.subplot(1, 2, 1)
    plt.plot(history['val_acc'], label='Validation Accuracy')
    plt.title(f'{title_prefix} Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    
    # Loss Plot
    plt.subplot(1, 2, 2)
    plt.plot(history['train_loss'], label='Training Loss')
    plt.plot(history['val_loss'], label='Validation Loss')
    plt.title(f'{title_prefix} Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

# 6. ROC Curve
def plot_roc_curve(y_true, y_probs, is_binary, unique_labels):
    plt.figure()
    if is_binary:
        fpr, tpr, _ = roc_curve(y_true, y_probs[:, 1])
        auc = roc_auc_score(y_true, y_probs[:, 1])
        plt.plot(fpr, tpr, label=f'ROC curve (AUC = {auc:.2f})')
    else:
        for i, label in enumerate(unique_labels):
            fpr, tpr, _ = roc_curve(y_true == i, y_probs[:, i])
            auc = roc_auc_score(y_true == i, y_probs[:, i])
            plt.plot(fpr, tpr, label=f'{label} (AUC = {auc:.2f})')
    
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

# Main Execution
def main():
    # Load and preprocess data
    file_path = '../dataset/CICIDS2017/CICIDS-2017.csv'  # Update with actual path
    X, y_binary, y_multiclass, unique_labels = load_and_preprocess_data(file_path)
    
    # Build graph
    edge_index, x = build_graph(X)
    
    # Create train/test masks
    train_idx, val_idx = train_test_split(range(X.shape[0]), test_size=0.2, random_state=42)
    train_mask = torch.zeros(X.shape[0], dtype=torch.bool)
    val_mask = torch.zeros(X.shape[0], dtype=torch.bool)
    train_mask[train_idx] = True
    val_mask[val_idx] = True
    
    # Binary Classification
    data = Data(x=x, edge_index=edge_index, train_mask=train_mask, val_mask=val_mask)
    model_binary = GATModel(in_channels=X.shape[1], hidden_channels=16, out_channels=2)
    history_binary, pred_binary, probs_binary = train_and_evaluate(model_binary, data, torch.tensor(y_binary, dtype=torch.long), num_epochs=50, is_binary=True)
    
    # Evaluate Binary Classification
    print("\nBinary Classification Metrics:")
    print(f"Accuracy: {history_binary['val_acc'][-1]:.4f}")
    print(f"Precision: {history_binary['val_prec'][-1]:.4f}")
    print(f"Recall: {history_binary['val_rec'][-1]:.4f}")
    print(f"F1 Score: {history_binary['val_f1'][-1]:.4f}")
    print(f"AUC: {history_binary['val_auc'][-1]:.4f}")
    
    plot_metrics(history_binary, "Binary Classification")
    plot_roc_curve(y_binary[val_mask], probs_binary[val_mask].numpy(), is_binary=True, unique_labels=['Benign', 'Malicious'])
    
    # Multiclass Classification
    data = Data(x=x, edge_index=edge_index, train_mask=train_mask, val_mask=val_mask)
    model_multiclass = GATModel(in_channels=X.shape[1], hidden_channels=16, out_channels=len(unique_labels))
    history_multiclass, pred_multiclass, probs_multiclass = train_and_evaluate(model_multiclass, data, torch.tensor(y_multiclass, dtype=torch.long), num_epochs=50, is_binary=False)
    
    # Evaluate Multiclass Classification
    print("\nMulticlass Classification Metrics:")
    print(f"Accuracy: {history_multiclass['val_acc'][-1]:.4f}")
    print(f"Precision: {history_multiclass['val_prec'][-1]:.4f}")
    print(f"Recall: {history_multiclass['val_rec'][-1]:.4f}")
    print(f"F1 Score: {history_multiclass['val_f1'][-1]:.4f}")
    
    plot_metrics(history_multiclass, "Multiclass Classification")
    plot_roc_curve(y_multiclass[val_mask], probs_multiclass[val_mask].numpy(), is_binary=False, unique_labels=unique_labels)

if __name__ == "__main__":
    main()

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, 
                            roc_curve, precision_recall_curve)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool
from torch_geometric.utils import to_dense_adj, dense_to_sparse
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

## Step 1: Data Loading and Preprocessing
print("Loading and preprocessing data...")

# Load all dataset files
data_files = [
    '../dataset/data/Monday-WorkingHours.pcap_ISCX.csv',
    '../dataset/data/Tuesday-WorkingHours.pcap_ISCX.csv',
    '../dataset/data/Wednesday-workingHours.pcap_ISCX.csv',
    '../dataset/data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    '../dataset/data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    '../dataset/data/Friday-WorkingHours-Morning.pcap_ISCX.csv',
    '../dataset/data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    '../dataset/data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
]

# Load and concatenate all datasets
data_list = []
for file in data_files:
    try:
        df = pd.read_csv(file)
        data_list.append(df)
        print(f"Loaded {file} with shape {df.shape}")
    except Exception as e:
        print(f"Error loading {file}: {e}")

data = pd.concat(data_list, axis=0)
print(f"Final dataset shape: {data.shape}")

# Clean up memory
del data_list
del df

# Strip whitespace from column names
data.columns = data.columns.str.strip()

# Handle missing and infinite values
print("\nHandling missing and infinite values...")
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill missing values with median
for col in data.columns:
    if data[col].dtype in [np.float64, np.int64]:
        med = data[col].median()
        data[col].fillna(med, inplace=True)

# Check for remaining missing values
missing_values = data.isna().sum()
print(f"Missing values after imputation:\n{missing_values[missing_values > 0]}")

# Label encoding for categorical features and target
print("\nEncoding labels...")
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'])

# Binary classification (normal vs attack)
data['Label'] = data['Label'].apply(lambda x: 0 if x == 0 else 1)  # 0: normal, 1: attack

# Show class distribution
class_dist = data['Label'].value_counts(normalize=True)
print("\nClass distribution:")
print(class_dist)

# Feature selection - remove constant and duplicate features
print("\nRemoving constant and duplicate features...")
constant_features = [col for col in data.columns if data[col].nunique() == 1]
duplicate_features = []
for i, col1 in enumerate(data.columns):
    for col2 in data.columns[i+1:]:
        if data[col1].equals(data[col2]):
            duplicate_features.append(col2)

features_to_drop = list(set(constant_features + duplicate_features))
data.drop(features_to_drop, axis=1, inplace=True)
print(f"Dropped {len(features_to_drop)} features")

# Split features and target
X = data.drop('Label', axis=1)
y = data['Label']

# Standardize features
print("\nStandardizing features...")
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into train/test sets (stratified to maintain class distribution)
# After train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Convert to numpy arrays first if they're pandas Series
if hasattr(y_train, 'values'):
    y_train = y_train.values
    y_test = y_test.values

# Then convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train).to(device)
y_train_tensor = torch.LongTensor(y_train).to(device)
X_test_tensor = torch.FloatTensor(X_test).to(device)
y_test_tensor = torch.LongTensor(y_test).to(device)

## Step 2: Graph Construction
print("\nConstructing graph...")

def create_graph_data(X, y, k=5):
    """Create graph data from tabular data using k-NN"""
    X_tensor = torch.FloatTensor(X)
    
    # Compute pairwise distances
    distances = torch.cdist(X_tensor, X_tensor)
    
    # Create k-NN adjacency matrix
    _, indices = torch.topk(distances, k=k, largest=False)
    
    # Create edge_index
    edge_index = []
    for i in range(len(indices)):
        for j in indices[i]:
            edge_index.append([i, j])
    
    edge_index = torch.LongTensor(edge_index).t().contiguous()
    
    # Create edge attributes (optional)
    edge_attr = distances[edge_index[0], edge_index[1]].unsqueeze(1)
    
    # Create graph data
    graph_data = Data(
        x=X_tensor,
        edge_index=edge_index,
        edge_attr=edge_attr,
        y=torch.LongTensor(y)
    )
    
    return graph_data

# Create graph data for training and testing
train_graph = create_graph_data(X_train, y_train, k=5)
test_graph = create_graph_data(X_test, y_test, k=5)

# Move graphs to device
train_graph = train_graph.to(device)
test_graph = test_graph.to(device)

## Step 3: Attention-based GNN Model with Class Weighting
class GATModel(nn.Module):
    def __init__(self, num_features, num_classes, hidden_dim=64, heads=4, dropout=0.2):
        super(GATModel, self).__init__()
        self.conv1 = GATConv(num_features, hidden_dim, heads=heads, dropout=dropout)
        self.conv2 = GATConv(hidden_dim * heads, hidden_dim, heads=heads, dropout=dropout)
        self.conv3 = GATConv(hidden_dim * heads, hidden_dim, heads=1, dropout=dropout)
        self.lin = nn.Linear(hidden_dim, num_classes)
        self.dropout = dropout
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv2(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv3(x, edge_index))
        
        # Global mean pooling
        x = global_mean_pool(x, batch=None)
        
        x = self.lin(x)
        return F.log_softmax(x, dim=1)

# Initialize model
num_features = X_train.shape[1]
num_classes = len(np.unique(y_train))
model = GATModel(num_features, num_classes, hidden_dim=64, heads=4).to(device)

# Calculate class weights for imbalanced data
class_counts = np.bincount(y_train)
class_weights = 1. / torch.tensor(class_counts, dtype=torch.float32)
class_weights = class_weights.to(device)
criterion = nn.NLLLoss(weight=class_weights)

optimizer = Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5, verbose=True)

## Step 4: Training and Evaluation
def train(model, graph):
    model.train()
    optimizer.zero_grad()
    out = model(graph)
    loss = criterion(out, graph.y)
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate(model, graph):
    model.eval()
    with torch.no_grad():
        out = model(graph)
        pred = out.argmax(dim=1)
        correct = (pred == graph.y).sum().item()
        acc = correct / len(graph.y)
        loss = criterion(out, graph.y).item()
    return acc, loss

def test(model, graph):
    model.eval()
    with torch.no_grad():
        out = model(graph)
        pred = out.argmax(dim=1)
        prob = torch.exp(out)[:, 1]  # Probability of class 1 (attack)
        
        # Calculate metrics
        acc = accuracy_score(graph.y.cpu(), pred.cpu())
        precision = precision_score(graph.y.cpu(), pred.cpu())
        recall = recall_score(graph.y.cpu(), pred.cpu())
        f1 = f1_score(graph.y.cpu(), pred.cpu())
        auc = roc_auc_score(graph.y.cpu(), prob.cpu())
        
        # Confusion matrix
        cm = confusion_matrix(graph.y.cpu(), pred.cpu())
        
        # ROC curve data
        fpr, tpr, _ = roc_curve(graph.y.cpu(), prob.cpu())
        
        # Precision-recall curve
        precision_curve, recall_curve, _ = precision_recall_curve(graph.y.cpu(), prob.cpu())
        
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'confusion_matrix': cm,
        'fpr': fpr,
        'tpr': tpr,
        'precision_curve': precision_curve,
        'recall_curve': recall_curve
    }

# Training loop
print("\nTraining model...")
num_epochs = 100
train_losses = []
val_losses = []
train_accs = []
val_accs = []

best_val_acc = 0
best_model = None

for epoch in range(1, num_epochs + 1):
    # Train
    loss = train(model, train_graph)
    train_losses.append(loss)
    
    # Evaluate on training set
    train_acc, _ = evaluate(model, train_graph)
    train_accs.append(train_acc)
    
    # Evaluate on test set
    val_acc, val_loss = evaluate(model, test_graph)
    val_accs.append(val_acc)
    val_losses.append(val_loss)
    
    # Update learning rate
    scheduler.step(val_acc)
    
    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model = model.state_dict().copy()
    
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Train Loss: {loss:.4f}, Val Loss: {val_loss:.4f}, '
              f'Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

# Load best model
model.load_state_dict(best_model)

# Plot training curves
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(train_accs, label='Train Accuracy')
plt.plot(val_accs, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()
plt.tight_layout()
plt.show()

## Step 5: Final Evaluation
print("\nFinal evaluation on test set...")
test_results = test(model, test_graph)

print(f"\nTest Accuracy: {test_results['accuracy']:.4f}")
print(f"Test Precision: {test_results['precision']:.4f}")
print(f"Test Recall: {test_results['recall']:.4f}")
print(f"Test F1 Score: {test_results['f1']:.4f}")
print(f"Test AUC: {test_results['auc']:.4f}")

# Plot confusion matrix
plt.figure(figsize=(6, 6))
sns.heatmap(test_results['confusion_matrix'], annot=True, fmt='d', cmap='Blues',
            xticklabels=['Normal', 'Attack'], 
            yticklabels=['Normal', 'Attack'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Plot ROC curve
plt.figure(figsize=(6, 6))
plt.plot(test_results['fpr'], test_results['tpr'], label=f'ROC Curve (AUC = {test_results["auc"]:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()

# Plot Precision-Recall curve
plt.figure(figsize=(6, 6))
plt.plot(test_results['recall_curve'], test_results['precision_curve'], label='Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

Using device: cpu
Loading subset of data...


FileNotFoundError: [Errno 2] No such file or directory: '../dataset/Monday-WorkingHours.pcap_ISCX.csv'

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix)

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

## Step 1: Load and Preprocess a Subset of Data
def load_subset(sample_size=50000, random_state=42):
    """Load a subset of the CICIDS2017 data"""
    data_files = [
        '../dataset/data/Monday-WorkingHours.pcap_ISCX.csv',
        '../dataset/data/Tuesday-WorkingHours.pcap_ISCX.csv',
        '../dataset/data/Wednesday-workingHours.pcap_ISCX.csv',
        '../dataset/data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
        '../dataset/data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
        '../dataset/data/Friday-WorkingHours-Morning.pcap_ISCX.csv',
        '../dataset/data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
        '../dataset/data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
    ]
    
    # Load first file to get columns
    sample_df = pd.read_csv(data_files[0], nrows=1)
    
    # Load subset of data
    chunks = []
    for file in data_files:
        for chunk in pd.read_csv(file, chunksize=sample_size//len(data_files)):
            chunks.append(chunk)
            if len(chunks) >= len(data_files):
                break
    
    data = pd.concat(chunks, axis=0)
    data = data.sample(min(sample_size, len(data)), random_state=random_state)
    
    # Preprocessing
    data.columns = data.columns.str.strip()
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Fill missing values
    for col in data.columns:
        if data[col].dtype in [np.float64, np.int64]:
            med = data[col].median()
            data[col].fillna(med, inplace=True)
    
    # Binary classification
    label_encoder = LabelEncoder()
    data['Label'] = label_encoder.fit_transform(data['Label'])
    data['Label'] = data['Label'].apply(lambda x: 0 if x == 0 else 1)
    
    # Remove constant features
    nunique = data.nunique()
    constant_cols = nunique[nunique == 1].index
    data.drop(constant_cols, axis=1, inplace=True)
    
    return data

print("Loading subset of data...")
data = load_subset(sample_size=50000)  # Adjust sample_size based on your memory
print(f"Loaded data shape: {data.shape}")

## Step 2: Feature Engineering and Splitting
X = data.drop('Label', axis=1)
y = data['Label']

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Step 3: Efficient Graph Construction
def create_mini_batch_graphs(X, y, batch_size=1000, k=5):
    """Create multiple smaller graphs instead of one large graph"""
    graphs = []
    for i in range(0, len(X), batch_size):
        X_batch = X[i:i+batch_size]
        y_batch = y[i:i+batch_size]
        
        X_tensor = torch.FloatTensor(X_batch)
        distances = torch.cdist(X_tensor, X_tensor)
        
        # Create k-NN edges
        _, indices = torch.topk(distances, k=k, largest=False)
        edge_index = []
        for i in range(len(indices)):
            for j in indices[i]:
                edge_index.append([i, j])
        
        edge_index = torch.LongTensor(edge_index).t().contiguous()
        
        graphs.append(Data(
            x=X_tensor,
            edge_index=edge_index,
            y=torch.LongTensor(y_batch.values if hasattr(y_batch, 'values') else y_batch)
        ))
    return graphs

print("Creating mini-batch graphs...")
train_graphs = create_mini_batch_graphs(X_train, y_train, batch_size=1000)
test_graphs = create_mini_batch_graphs(X_test, y_test, batch_size=1000)

# Move to device
train_graphs = [g.to(device) for g in train_graphs]
test_graphs = [g.to(device) for g in test_graphs]

## Step 4: Attention-based GNN Model
class MiniBatchGAT(nn.Module):
    def __init__(self, num_features, num_classes, hidden_dim=64, heads=4, dropout=0.2):
        super(MiniBatchGAT, self).__init__()
        self.conv1 = GATConv(num_features, hidden_dim, heads=heads, dropout=dropout)
        self.conv2 = GATConv(hidden_dim*heads, hidden_dim, heads=heads, dropout=dropout)
        self.conv3 = GATConv(hidden_dim*heads, hidden_dim, heads=1, dropout=dropout)
        self.lin = nn.Linear(hidden_dim, num_classes)
        self.dropout = dropout
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv2(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv3(x, edge_index))
        
        x = global_mean_pool(x, batch=None)
        x = self.lin(x)
        return F.log_softmax(x, dim=1)

# Initialize model
num_features = X_train.shape[1]
model = MiniBatchGAT(num_features, 2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

# Class weights for imbalanced data
class_counts = np.bincount(y_train)
class_weights = torch.FloatTensor([1./class_counts[0], 1./class_counts[1]]).to(device)
criterion = nn.NLLLoss(weight=class_weights)

## Step 5: Training and Evaluation
def train_mini_batch(model, graphs, optimizer):
    model.train()
    total_loss = 0
    for graph in graphs:
        optimizer.zero_grad()
        out = model(graph)
        loss = criterion(out, graph.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(graphs)

def evaluate_mini_batch(model, graphs):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for graph in graphs:
            out = model(graph)
            pred = out.argmax(dim=1)
            correct += (pred == graph.y).sum().item()
            total += len(graph.y)
    return correct / total

print("Training model...")
for epoch in range(1, 101):
    loss = train_mini_batch(model, train_graphs, optimizer)
    train_acc = evaluate_mini_batch(model, train_graphs)
    test_acc = evaluate_mini_batch(model, test_graphs)
    
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

## Step 6: Final Evaluation
def evaluate_metrics(model, graphs):
    model.eval()
    all_preds = []
    all_probs = []
    all_labels = []
    
    with torch.no_grad():
        for graph in graphs:
            out = model(graph)
            pred = out.argmax(dim=1)
            prob = torch.exp(out)[:, 1]
            
            all_preds.extend(pred.cpu().numpy())
            all_probs.extend(prob.cpu().numpy())
            all_labels.extend(graph.y.cpu().numpy())
    
    return {
        'accuracy': accuracy_score(all_labels, all_preds),
        'precision': precision_score(all_labels, all_preds),
        'recall': recall_score(all_labels, all_preds),
        'f1': f1_score(all_labels, all_preds),
        'auc': roc_auc_score(all_labels, all_probs),
        'confusion_matrix': confusion_matrix(all_labels, all_preds)
    }

print("\nFinal Evaluation:")
results = evaluate_metrics(model, test_graphs)
print(f"Accuracy: {results['accuracy']:.4f}")
print(f"Precision: {results['precision']:.4f}")
print(f"Recall: {results['recall']:.4f}")
print(f"F1 Score: {results['f1']:.4f}")
print(f"AUC: {results['auc']:.4f}")
print("\nConfusion Matrix:")
print(results['confusion_matrix'])

Using device: cpu
Loading subset of data...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(med, inplace=True)


Loaded data shape: (50000, 69)
Creating mini-batch graphs...
Training model...


ValueError: Expected input batch_size (1) to match target batch_size (1000).

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA
import time

In [2]:
# Load datasets (replace with your actual paths)
data1 = pd.read_csv('../dataset/data/Monday-WorkingHours.pcap_ISCX.csv')
data2 = pd.read_csv('../dataset/data/Tuesday-WorkingHours.pcap_ISCX.csv')
data3 = pd.read_csv('../dataset/data/Wednesday-workingHours.pcap_ISCX.csv')
data4 = pd.read_csv('../dataset/data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv')
data5 = pd.read_csv('../dataset/data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv')
data6 = pd.read_csv('../dataset/data/Friday-WorkingHours-Morning.pcap_ISCX.csv')
data7 = pd.read_csv('../dataset/data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv')
data8 = pd.read_csv('../dataset/data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Data Loading
data1 = pd.read_csv('../dataset/data/Monday-WorkingHours.pcap_ISCX.csv')
data2 = pd.read_csv('../dataset/data/Tuesday-WorkingHours.pcap_ISCX.csv')
data3 = pd.read_csv('../dataset/data/Wednesday-workingHours.pcap_ISCX.csv')
data4 = pd.read_csv('../dataset/data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv')
data5 = pd.read_csv('../dataset/data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv')
data6 = pd.read_csv('../dataset/data/Friday-WorkingHours-Morning.pcap_ISCX.csv')
data7 = pd.read_csv('../dataset/data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv')
data8 = pd.read_csv('../dataset/data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
data_list = [data1, data2, data3, data4, data5, data6, data7, data8]

print('Data dimensions: ')
for i, data in enumerate(data_list, start=1):
    rows, cols = data.shape
    print(f'Data{i} -> {rows} rows, {cols} columns')

# Concatenate datasets
data = pd.concat(data_list)
rows, cols = data.shape
print('New dimension:')
print(f'Number of rows: {rows}')
print(f'Number of columns: {cols}')
print(f'Total cells: {rows * cols}')

Data dimensions: 
Data1 -> 529918 rows, 79 columns
Data2 -> 445909 rows, 79 columns
Data3 -> 692703 rows, 79 columns
Data4 -> 170366 rows, 79 columns
Data5 -> 288602 rows, 79 columns
Data6 -> 191033 rows, 79 columns
Data7 -> 286467 rows, 79 columns
Data8 -> 225745 rows, 79 columns
New dimension:
Number of rows: 2830743
Number of columns: 79
Total cells: 223628697


In [2]:
# Free memory
for d in data_list:
    del d

In [3]:
data.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,49188,4,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,49486,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [4]:
dups = data[data.duplicated()]
print(f'Number of duplicates: {len(dups)}')

Number of duplicates: 308381


In [5]:
# Clean column names
data.columns = data.columns.str.strip()
data.drop_duplicates(inplace = True)
data.shape

(2522362, 79)

In [6]:
missing_val = data.isna().sum()
print(missing_val.loc[missing_val > 0])

Flow Bytes/s    353
dtype: int64


In [7]:
# Checking for infinity values
numeric_cols = data.select_dtypes(include = np.number).columns
inf_count = np.isinf(data[numeric_cols]).sum()
print(inf_count[inf_count > 0])

Flow Bytes/s      1211
Flow Packets/s    1564
dtype: int64


In [8]:
# Replacing any infinite values (positive or negative) with NaN (not a number)
print(f'Initial missing values: {data.isna().sum().sum()}')

data.replace([np.inf, -np.inf], np.nan, inplace = True)

print(f'Missing values after processing infinite values: {data.isna().sum().sum()}')

Initial missing values: 353
Missing values after processing infinite values: 3128


In [9]:
# Handle missing and infinite values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
for col in data.select_dtypes(include=[np.float64, np.float32]).columns:
    data[col].fillna(data[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)


In [10]:
data['Label'].unique()

array(['BENIGN', 'FTP-Patator', 'SSH-Patator', 'DoS slowloris',
       'DoS Slowhttptest', 'DoS Hulk', 'DoS GoldenEye', 'Heartbleed',
       'Web Attack � Brute Force', 'Web Attack � XSS',
       'Web Attack � Sql Injection', 'Infiltration', 'Bot', 'PortScan',
       'DDoS'], dtype=object)

In [11]:
# Types of attacks & normal instances (BENIGN)
data['Label'].value_counts()

Label
BENIGN                        2096484
DoS Hulk                       172849
DDoS                           128016
PortScan                        90819
DoS GoldenEye                   10286
FTP-Patator                      5933
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
Bot                              1953
Web Attack � Brute Force         1470
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64

In [12]:
# Map attack types
attack_map = {
    'BENIGN': 'BENIGN',
    'DDoS': 'DDoS',
    'DoS Hulk': 'DoS',
    'DoS GoldenEye': 'DoS',
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'PortScan': 'Port Scan',
    'FTP-Patator': 'Brute Force',
    'SSH-Patator': 'Brute Force',
    'Bot': 'Botnet',
    'Web Attack � Brute Force': 'Web Attack',
    'Web Attack � XSS': 'Web Attack',
    'Web Attack � Sql Injection': 'Web Attack',
    'Infiltration': 'Infiltration',
    'Heartbleed': 'Heartbleed'
}
data['Attack Type'] = data['Label'].map(attack_map)
data.drop('Label', axis=1, inplace=True)

In [13]:
# Encode attack types to integers
unique_labels = data['Attack Type'].unique()
label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
data['Attack Type'] = data['Attack Type'].map(label_to_idx)
num_classes = len(unique_labels)

# Select numerical features (exclude non-numerical columns)
feature_cols = [col for col in data.columns if col != 'Attack Type' and data[col].dtype in [np.float64, np.float32, np.int64]]
X = data[feature_cols].values
y = data['Attack Type'].values

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [14]:
# Step 3: Graph Construction
# Create adjacency matrix based on feature similarity (cosine similarity)
num_nodes = X.shape[0]
adj_matrix = np.zeros((num_nodes, num_nodes))
for i in range(num_nodes):
    for j in range(i + 1, num_nodes):
        dot_product = np.sum(X[i] * X[j])
        norm_i = np.sqrt(np.sum(X[i] ** 2))
        norm_j = np.sqrt(np.sum(X[j] ** 2))
        similarity = dot_product / (norm_i * norm_j + 1e-8)
        if similarity > 0.8:  # Threshold for edge creation
            adj_matrix[i, j] = 1
            adj_matrix[j, i] = 1

# Convert to torch tensors
X_tensor = torch.FloatTensor(X)
y_tensor = torch.LongTensor(y)
adj_tensor = torch.FloatTensor(adj_matrix)

# Step 4: Train-Test Split
train_idx, test_idx = train_test_split(range(num_nodes), test_size=0.2, stratify=y, random_state=42)
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_idx] = True
test_mask[test_idx] = True

MemoryError: Unable to allocate 46.3 TiB for an array with shape (2522362, 2522362) and data type float64

In [4]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Data Loading
data1 = pd.read_csv('../dataset/data/Monday-WorkingHours.pcap_ISCX.csv')
data2 = pd.read_csv('../dataset/data/Tuesday-WorkingHours.pcap_ISCX.csv')
data3 = pd.read_csv('../dataset/data/Wednesday-workingHours.pcap_ISCX.csv')
data4 = pd.read_csv('../dataset/data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv')
data5 = pd.read_csv('../dataset/data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv')
data6 = pd.read_csv('../dataset/data/Friday-WorkingHours-Morning.pcap_ISCX.csv')
data7 = pd.read_csv('../dataset/data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv')
data8 = pd.read_csv('../dataset/data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
data_list = [data1, data2, data3, data4, data5, data6, data7, data8]

print('Data dimensions: ')
for i, data in enumerate(data_list, start=1):
    rows, cols = data.shape
    print(f'Data{i} -> {rows} rows, {cols} columns')

# Concatenate datasets
data = pd.concat(data_list)
rows, cols = data.shape
print('New dimension:')
print(f'Number of rows: {rows}')
print(f'Number of columns: {cols}')
print(f'Total cells: {rows * cols}')

# Free memory
for d in data_list:
    del d

# Step 2: Preprocessing
# Handle missing and infinite values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
for col in data.select_dtypes(include=[np.float64, np.float32]).columns:
    data[col].fillna(data[col].median(), inplace=True)

# Map attack types
attack_map = {
    'BENIGN': 'BENIGN',
    'DDoS': 'DDoS',
    'DoS Hulk': 'DoS',
    'DoS GoldenEye': 'DoS',
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'PortScan': 'Port Scan',
    'FTP-Patator': 'Brute Force',
    'SSH-Patator': 'Brute Force',
    'Bot': 'Botnet',
    'Web Attack � Brute Force': 'Web Attack',
    'Web Attack � XSS': 'Web Attack',
    'Web Attack � Sql Injection': 'Web Attack',
    'Infiltration': 'Infiltration',
    'Heartbleed': 'Heartbleed'
}
data['Attack Type'] = data['Label'].map(attack_map)
data.drop('Label', axis=1, inplace=True)

# Encode attack types to integers
unique_labels = data['Attack Type'].unique()
label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
data['Attack Type'] = data['Attack Type'].map(label_to_idx)
num_classes = len(unique_labels)

# Select numerical features (exclude non-numerical columns)
feature_cols = [col for col in data.columns if col != 'Attack Type' and data[col].dtype in [np.float64, np.float32, np.int64]]
X = data[feature_cols].values
y = data['Attack Type'].values

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Step 3: Graph Construction
# Create adjacency matrix based on feature similarity (cosine similarity)
num_nodes = X.shape[0]
adj_matrix = np.zeros((num_nodes, num_nodes))
for i in range(num_nodes):
    for j in range(i + 1, num_nodes):
        dot_product = np.sum(X[i] * X[j])
        norm_i = np.sqrt(np.sum(X[i] ** 2))
        norm_j = np.sqrt(np.sum(X[j] ** 2))
        similarity = dot_product / (norm_i * norm_j + 1e-8)
        if similarity > 0.8:  # Threshold for edge creation
            adj_matrix[i, j] = 1
            adj_matrix[j, i] = 1

# Convert to torch tensors
X_tensor = torch.FloatTensor(X)
y_tensor = torch.LongTensor(y)
adj_tensor = torch.FloatTensor(adj_matrix)

# Step 4: Train-Test Split
train_idx, test_idx = train_test_split(range(num_nodes), test_size=0.2, stratify=y, random_state=42)
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_idx] = True
test_mask[test_idx] = True

# Step 5: GNN with Attention Mechanism
class GNNAttention(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNNAttention, self).__init__()
        self.W1 = torch.nn.Parameter(torch.randn(input_dim, hidden_dim))
        self.W2 = torch.nn.Parameter(torch.randn(hidden_dim, hidden_dim))
        self.W_att = torch.nn.Parameter(torch.randn(hidden_dim, 1))
        self.W_out = torch.nn.Parameter(torch.randn(hidden_dim, output_dim))
        self.relu = torch.nn.ReLU()

    def forward(self, X, adj):
        # First GNN layer
        h1 = torch.matmul(X, self.W1)
        h1 = self.relu(torch.matmul(adj, h1))

        # Attention mechanism
        att_scores = torch.matmul(h1, self.W_att)
        att_weights = torch.nn.functional.softmax(att_scores, dim=0)
        h1_att = h1 * att_weights

        # Second GNN layer
        h2 = torch.matmul(h1_att, self.W2)
        h2 = self.relu(torch.matmul(adj, h2))

        # Output layer
        out = torch.matmul(h2, self.W_out)
        return out

# Initialize model
input_dim = X.shape[1]
hidden_dim = 64
model = GNNAttention(input_dim, hidden_dim, num_classes)
optimizer = torch.optim.Adam([{'params': model.parameters(), 'lr': 0.01}])
criterion = torch.nn.CrossEntropyLoss()

# Step 6: Training
num_epochs = 100
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    out = model(X_tensor, adj_tensor)
    loss = criterion(out[train_mask], y_tensor[train_mask])
    loss.backward()
    optimizer.step()
    train_losses.append(loss.item())

    # Training accuracy
    _, pred = torch.max(out[train_mask], 1)
    train_acc = accuracy_score(y_tensor[train_mask].numpy(), pred.numpy())
    train_accuracies.append(train_acc)

    # Validation (using test set as validation here)
    model.eval()
    with torch.no_grad():
        out = model(X_tensor, adj_tensor)
        val_loss = criterion(out[test_mask], y_tensor[test_mask])
        val_losses.append(val_loss.item())
        _, pred = torch.max(out[test_mask], 1)
        val_acc = accuracy_score(y_tensor[test_mask].numpy(), pred.numpy())
        val_accuracies.append(val_acc)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}, '
          f'Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

# Step 7: Evaluation
model.eval()
with torch.no_grad():
    out = model(X_tensor, adj_tensor)
    _, pred = torch.max(out[test_mask], 1)
    y_true = y_tensor[test_mask].numpy()
    y_pred = pred.numpy()

# Compute metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
cm = confusion_matrix(y_true, y_pred)

print(f'Final Test Metrics:')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print('Confusion Matrix:')
print(cm)

# Plot training and validation loss
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

# Plot training and validation accuracy
plt.figure(figsize=(10, 5))
plt.plot(train_accuracies, label='Train Accuracy')
plt.plot(val_accuracies, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()
plt.show()

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=unique_labels, yticklabels=unique_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

Data dimensions: 
Data1 -> 529918 rows, 79 columns
Data2 -> 445909 rows, 79 columns
Data3 -> 692703 rows, 79 columns
Data4 -> 170366 rows, 79 columns
Data5 -> 288602 rows, 79 columns
Data6 -> 191033 rows, 79 columns
Data7 -> 286467 rows, 79 columns
Data8 -> 225745 rows, 79 columns
New dimension:
Number of rows: 2830743
Number of columns: 79
Total cells: 223628697


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)


KeyError: 'Label'

In [1]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
import os

# Set random seed
torch.manual_seed(42)
np.random.seed(42)

# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Data files
data_files = [
    '../dataset/data/Monday-WorkingHours.pcap_ISCX.csv',
    '../dataset/data/Tuesday-WorkingHours.pcap_ISCX.csv',
    '../dataset/data/Wednesday-workingHours.pcap_ISCX.csv',
    '../dataset/data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    '../dataset/data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    '../dataset/data/Friday-WorkingHours-Morning.pcap_ISCX.csv',
    '../dataset/data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    '../dataset/data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
]

# Process data in chunks
chunk_size = 100000  # Adjust based on memory
data_chunks = []

for file in data_files:
    print(f"Processing {file}...")
    try:
        # Read in chunks
        for chunk in pd.read_csv(file, chunksize=chunk_size, low_memory=False):
            # Strip whitespace from columns
            chunk.columns = chunk.columns.str.strip()
            # Replace inf/nan
            chunk.replace([np.inf, -np.inf], np.nan, inplace=True)
            # Fill numeric missing values with median
            for col in chunk.select_dtypes(include=[np.float64, np.int64]).columns:
                chunk[col].fillna(chunk[col].median(), inplace=True)
            # Map attack types
            attack_map = {
                'BENIGN': 'BENIGN', 'DDoS': 'DDoS', 'DoS Hulk': 'DoS', 'DoS GoldenEye': 'DoS',
                'DoS slowloris': 'DoS', 'DoS Slowhttptest': 'DoS', 'PortScan': 'Port Scan',
                'FTP-Patator': 'Brute Force', 'SSH-Patator': 'Brute Force', 'Bot': 'Botnet',
                'Web Attack � Brute Force': 'Web Attack', 'Web Attack � XSS': 'Web Attack',
                'Web Attack � Sql Injection': 'Web Attack', 'Infiltration': 'Infiltration',
                'Heartbleed': 'Heartbleed'
            }
            chunk['Attack Type'] = chunk['Label'].map(attack_map)
            chunk.drop('Label', axis=1, inplace=True)
            # Downsample to reduce memory (e.g., take 10% of each chunk)
            chunk = chunk.sample(frac=0.1, random_state=42)
            data_chunks.append(chunk)
    except Exception as e:
        print(f"Error processing {file}: {e}")

# Concatenate chunks
data = pd.concat(data_chunks, axis=0)
print(f"Final dataset shape: {data.shape}")

# Free memory
del data_chunks

Using device: cpu
Processing ../dataset/data/Monday-WorkingHours.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Tuesday-WorkingHours.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Wednesday-workingHours.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)


Processing ../dataset/data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Friday-WorkingHours-Morning.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)


Processing ../dataset/data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)


Final dataset shape: (283074, 79)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)


In [2]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Select features (example: choose a subset to reduce memory)
features = [
    'Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max',
    'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std',
    'min_seg_size_forward', 'Active Mean', 'Active Std', 'Active Max', 'Active Min',
    'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min'
]

# Ensure features exist in data
features = [f for f in features if f in data.columns]
X = data[features]
y = data['Attack Type']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert to torch tensors
X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y_encoded, dtype=torch.long).to(device)

print(f"Feature matrix shape: {X_tensor.shape}")
print(f"Label tensor shape: {y_tensor.shape}")

Feature matrix shape: torch.Size([283074, 19])
Label tensor shape: torch.Size([283074])


In [5]:
from sklearn.neighbors import kneighbors_graph
from torch_geometric.utils import to_undirected, dense_to_sparse
import scipy.sparse as sp

# Create adjacency matrix using k-nearest neighbors (k=5 for sparsity)
n_samples = X_scaled.shape[0]
k = 5  # Number of neighbors
adj_matrix = kneighbors_graph(X_scaled, n_neighbors=k, mode='connectivity', include_self=False)

# Convert sparse matrix to COO format and extract edge indices
adj_matrix = adj_matrix.tocoo()  # Convert to COO format for easy edge extraction
row = torch.tensor(adj_matrix.row, dtype=torch.long)
col = torch.tensor(adj_matrix.col, dtype=torch.long)
edge_index = torch.stack([row, col], dim=0)  # Shape: [2, num_edges]

# Make graph undirected
edge_index = to_undirected(edge_index)
edge_index = edge_index.to(device)

# Create PyTorch Geometric Data object
graph_data = Data(x=X_tensor, edge_index=edge_index, y=y_tensor).to(device)
print(f"Graph created with {graph_data.num_nodes} nodes and {graph_data.num_edges} edges")

Graph created with 283074 nodes and 1844152 edges


In [8]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn  # Ensure this import is present
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool
from torch_geometric.utils import to_undirected
import scipy.sparse as sp
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
import gc
# Step 4: Define GAT Model
class GATNetwork(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads=4):
        super(GATNetwork, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.2)
        self.conv2 = GATConv(hidden_channels * heads, hidden_channels, heads=1, dropout=0.2)
        self.fc = nn.Linear(hidden_channels, out_channels)
        self.dropout = nn.Dropout(0.2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.elu(self.conv1(x, edge_index))
        x = self.dropout(x)
        x = F.elu(self.conv2(x, edge_index))
        x = global_mean_pool(x, torch.zeros(x.size(0), dtype=torch.long, device=x.device))
        x = self.dropout(x)
        x = self.fc(x)
        return x

In [9]:
# Step 5: Training
num_features = X_tensor.shape[1]
num_classes = len(label_encoder.classes_)
model = GATNetwork(in_channels=num_features, hidden_channels=16, out_channels=num_classes).to(device)
optimizer = Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
criterion = nn.CrossEntropyLoss()

epochs = 20
batch_size = 32
model.train()

for epoch in range(epochs):
    total_loss = 0
    for i in range(0, graph_data.num_nodes, batch_size):
        batch_mask = torch.arange(i, min(i + batch_size, graph_data.num_nodes), device=device)
        batch_data = Data(
            x=graph_data.x[batch_mask],
            edge_index=graph_data.edge_index,
            y=graph_data.y[batch_mask]
        ).to(device)
        optimizer.zero_grad()
        out = model(batch_data)
        loss = criterion(out, batch_data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    with torch.no_grad():
        val_mask = torch.randperm(graph_data.num_nodes)[:1000].to(device)
        val_data = Data(
            x=graph_data.x[val_mask],
            edge_index=graph_data.edge_index,
            y=graph_data.y[val_mask]
        ).to(device)
        val_out = model(val_data)
        val_loss = criterion(val_out, val_data.y)
        val_pred = val_out.argmax(dim=1).cpu().numpy()
        val_true = val_data.y.cpu().numpy()
        val_acc = accuracy_score(val_true, val_pred)
    model.train()

    scheduler.step(val_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

IndexError: Found indices in 'edge_index' that are larger than 31 (got 283073). Please ensure that all indices in 'edge_index' point to valid indices in the interval [0, 32) in your node feature matrix and try again.

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch_geometric.utils import to_undirected, subgraph
import scipy.sparse as sp
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
import gc

# Set random seeds
torch.manual_seed(42)
np.random.seed(42)

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Step 1: Data Loading and Preprocessing
data_files = [
    '../dataset/data/Monday-WorkingHours.pcap_ISCX.csv',
    '../dataset/data/Tuesday-WorkingHours.pcap_ISCX.csv',
    '../dataset/data/Wednesday-workingHours.pcap_ISCX.csv',
    '../dataset/data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    '../dataset/data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    '../dataset/data/Friday-WorkingHours-Morning.pcap_ISCX.csv',
    '../dataset/data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    '../dataset/data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
]

chunk_size = 50000
data_chunks = []
for file in data_files:
    print(f"Processing {file}...")
    try:
        for chunk in pd.read_csv(file, chunksize=chunk_size, low_memory=False):
            chunk.columns = chunk.columns.str.strip()
            chunk.replace([np.inf, -np.inf], np.nan, inplace=True)
            for col in chunk.select_dtypes(include=[np.float64, np.int64]).columns:
                chunk[col].fillna(chunk[col].median(), inplace=True)
            attack_map = {
                'BENIGN': 'BENIGN', 'DDoS': 'DDoS', 'DoS Hulk': 'DoS', 'DoS GoldenEye': 'DoS',
                'DoS slowloris': 'DoS', 'DoS Slowhttptest': 'DoS', 'PortScan': 'Port Scan',
                'FTP-Patator': 'Brute Force', 'SSH-Patator': 'Brute Force', 'Bot': 'Botnet',
                'Web Attack � Brute Force': 'Web Attack', 'Web Attack � XSS': 'Web Attack',
                'Web Attack � Sql Injection': 'Web Attack', 'Infiltration': 'Infiltration',
                'Heartbleed': 'Heartbleed'
            }
            chunk['Attack Type'] = chunk['Label'].map(attack_map)
            chunk.drop('Label', axis=1, inplace=True)
            chunk = chunk.sample(frac=0.05, random_state=42)
            data_chunks.append(chunk)
    except Exception as e:
        print(f"Error processing {file}: {e}")

data = pd.concat(data_chunks, axis=0)
print(f"Final dataset shape: {data.shape}")
del data_chunks
gc.collect()

# Step 2: Feature Selection and Encoding
features = [
    'Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max',
    'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std',
    'min_seg_size_forward', 'Active Mean', 'Active Std', 'Active Max', 'Active Min',
    'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min'
]
features = [f for f in features if f in data.columns]
X = data[features]
y = data['Attack Type']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y_encoded, dtype=torch.long).to(device)
print(f"Feature matrix shape: {X_tensor.shape}")
print(f"Label tensor shape: {y_tensor.shape}")

# Free memory
del data, X, y, X_scaled, y_encoded
gc.collect()

# Step 3: Graph Construction
n_samples = X_tensor.shape[0]
k = 3
adj_matrix = kneighbors_graph(X_tensor.cpu().numpy(), n_neighbors=k, mode='connectivity', include_self=False)
adj_matrix = adj_matrix.tocoo()
row = torch.tensor(adj_matrix.row, dtype=torch.long)
col = torch.tensor(adj_matrix.col, dtype=torch.long)
edge_index = torch.stack([row, col], dim=0)
edge_index = to_undirected(edge_index)
edge_index = edge_index.to(device)
graph_data = Data(x=X_tensor, edge_index=edge_index, y=y_tensor).to(device)
print(f"Graph created with {graph_data.num_nodes} nodes and {graph_data.num_edges} edges")

# Step 4: Define GAT Model
class GATNetwork(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads=4):
        super(GATNetwork, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.2)
        self.conv2 = GATConv(hidden_channels * heads, hidden_channels, heads=1, dropout=0.2)
        self.fc = nn.Linear(hidden_channels, out_channels)
        self.dropout = nn.Dropout(0.2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.elu(self.conv1(x, edge_index))
        x = self.dropout(x)
        x = F.elu(self.conv2(x, edge_index))
        x = self.dropout(x)
        x = self.fc(x)  # Node-level predictions, no global pooling
        return x

# Step 5: Training
num_features = X_tensor.shape[1]
num_classes = len(label_encoder.classes_)
model = GATNetwork(in_channels=num_features, hidden_channels=16, out_channels=num_classes).to(device)
optimizer = Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
criterion = nn.CrossEntropyLoss()

epochs = 20
batch_size = 16
model.train()

for epoch in range(epochs):
    total_loss = 0
    for i in range(0, graph_data.num_nodes, batch_size):
        batch_mask = torch.arange(i, min(i + batch_size, graph_data.num_nodes), device=device)
        batch_edge_index, _ = subgraph(batch_mask, graph_data.edge_index, relabel_nodes=True, num_nodes=graph_data.num_nodes)
        batch_data = Data(
            x=graph_data.x[batch_mask],
            edge_index=batch_edge_index,
            y=graph_data.y[batch_mask]
        ).to(device)
        optimizer.zero_grad()
        out = model(batch_data)
        loss = criterion(out, batch_data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    with torch.no_grad():
        val_mask = torch.randperm(graph_data.num_nodes)[:1000].to(device)
        val_edge_index, _ = subgraph(val_mask, graph_data.edge_index, relabel_nodes=True, num_nodes=graph_data.num_nodes)
        val_data = Data(
            x=graph_data.x[val_mask],
            edge_index=val_edge_index,
            y=graph_data.y[val_mask]
        ).to(device)
        val_out = model(val_data)
        val_loss = criterion(val_out, val_data.y)
        val_pred = val_out.argmax(dim=1).cpu().numpy()
        val_true = val_data.y.cpu().numpy()
        val_acc = accuracy_score(val_true, val_pred)
    model.train()

    scheduler.step(val_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

# Final evaluation
model.eval()
with torch.no_grad():
    # Use a subset for final evaluation to save memory
    eval_mask = torch.randperm(graph_data.num_nodes)[:10000].to(device)
    eval_edge_index, _ = subgraph(eval_mask, graph_data.edge_index, relabel_nodes=True, num_nodes=graph_data.num_nodes)
    eval_data = Data(
        x=graph_data.x[eval_mask],
        edge_index=eval_edge_index,
        y=graph_data.y[eval_mask]
    ).to(device)
    out = model(eval_data)
    pred = out.argmax(dim=1).cpu().numpy()
    true = eval_data.y.cpu().numpy()
    acc = accuracy_score(true, pred)
    prec, rec, f1, _ = precision_recall_fscore_support(true, pred, average='weighted')
    print(f"Final Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")

Using device: cpu
Processing ../dataset/data/Monday-WorkingHours.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Tuesday-WorkingHours.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Wednesday-workingHours.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Friday-WorkingHours-Morning.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Processing ../dataset/data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chunk[col].fillna(chunk[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Final dataset shape: (141536, 79)
Feature matrix shape: torch.Size([141536, 19])
Label tensor shape: torch.Size([141536])
Graph created with 141536 nodes and 566330 edges
Epoch 1/20, Loss: 1049.1195, Val Loss: 3.3755, Val Acc: 0.8120
Epoch 2/20, Loss: 976.9966, Val Loss: 3.1597, Val Acc: 0.8070
Epoch 3/20, Loss: 991.6417, Val Loss: 2.7330, Val Acc: 0.7950
