# LSTM Classifier Training on CICIDS2018

This notebook trains a Long Short-Term Memory (LSTM) network on the CICIDS2018 dataset.

**Model:** LSTMClassifier (PyTorch)  
**Dataset:** CICIDS2018  
**Task:** Binary Classification (Benign vs Attack)

## 1. Setup and Imports

In [None]:
import sys
import os
from pathlib import Path

# Add the directory two levels up to sys.path so the `src` package can be imported
sys.path.append(str(Path("../../").resolve()))

import numpy as np
import pandas as pd
import glob
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from src.models.classical.lstm import LSTMClassifier
import time
import gc

# Select the compute device: CUDA when PyTorch reports it as available, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
# The GPU name is only reported when the CUDA device was selected above
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Load and Preprocess Data

In [None]:
# Load all CSV files from the CICIDS2018 raw directory
DATA_PATH = '../../data/raw/cicids2018/'
# Process files from smallest to largest
all_files = sorted(glob.glob(os.path.join(DATA_PATH, "*.csv")), key=os.path.getsize)

# Metadata columns removed from every file; 'Label' is deliberately kept
# because the preprocessing step derives the binary target from it.
drop_cols = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp']

print(f"Found {len(all_files)} files.")
li = []

for filename in all_files:
    file_size_mb = os.path.getsize(filename) / (1024 * 1024)
    # Files above 1000 MB are skipped rather than loaded
    if file_size_mb > 1000:
        print(f"Skipping {os.path.basename(filename)} ({file_size_mb:.0f}MB - too large)")
        continue
    print(f"Loading {os.path.basename(filename)}...")
    # Best effort: a file that fails to load is reported and skipped so the
    # remaining files are still processed.
    try:
        df_temp = pd.read_csv(filename, index_col=None, header=0, low_memory=True)
        # Basic cleaning per file to save memory
        df_temp.columns = df_temp.columns.str.strip()

        # A file without a 'Label' column cannot contribute training targets
        if 'Label' not in df_temp.columns:
            print(f"Error loading {filename}: 'Label' column not found")
            continue

        # Keep label, drop metadata (only the columns actually present)
        df_temp = df_temp.drop(columns=[c for c in drop_cols if c in df_temp.columns])

        li.append(df_temp)
    except Exception as e:
        print(f"Error loading {filename}: {e}")

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing could be loaded.
if not li:
    raise ValueError(f"No usable CSV files were loaded from {DATA_PATH!r}")

# Concatenate
df = pd.concat(li, axis=0, ignore_index=True)
print(f"Total raw samples: {len(df):,}")
del li
gc.collect()

In [None]:
# Preprocessing

# 1. Create Binary Label
def create_binary_label(label):
    """Map a raw label value to a binary class id.

    Returns 0 when ``label`` is a string containing "BENIGN"
    (case-insensitive substring match) and 1 for everything else,
    including non-string values such as ``None`` or NaN.
    """
    is_benign = isinstance(label, str) and 'BENIGN' in label.upper()
    return 0 if is_benign else 1

# Derive the 0/1 target from the raw label, then discard the raw column
df['binary_label'] = df['Label'].apply(create_binary_label)
df.drop(columns=['Label'], inplace=True, errors='ignore')

# 2. Convert to numeric
# Every column except the target is coerced; unparseable values become NaN
feature_columns = [name for name in df.columns if name != 'binary_label']
for name in feature_columns:
    df[name] = pd.to_numeric(df[name], errors='coerce')

# 3. Clean NaNs/Infs
# Infinities are turned into NaN first so one dropna() removes both kinds of row
df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
df.dropna(inplace=True)

print("Binary class distribution:", df['binary_label'].value_counts(), sep="\n")

In [None]:
# Sample data (LSTM is slow, so we sample)
SAMPLE_SIZE_PER_CLASS = 200000

# Draw at most SAMPLE_SIZE_PER_CLASS rows from each class (0 first, then 1),
# using the same fixed seed for both draws; a smaller class is taken whole.
benign, attack = (
    class_rows.sample(n=min(len(class_rows), SAMPLE_SIZE_PER_CLASS), random_state=42)
    for class_rows in (df[df['binary_label'] == class_id] for class_id in (0, 1))
)

df_sampled = pd.concat((benign, attack), ignore_index=True)
print(f"Sampled to {len(df_sampled):,} samples")

# Targets as-is; every remaining column becomes a float32 feature
y = df_sampled['binary_label'].to_numpy()
X = df_sampled.drop(columns='binary_label').to_numpy().astype(np.float32)

# The DataFrames are no longer needed once the arrays exist
del df, df_sampled, benign, attack
gc.collect()

In [None]:
# Split Data
# 70% train; the held-out 30% is halved into validation and test.
# Both splits are stratified on the label and use a fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Scale Features
# Statistics are fitted on the training split only and reused for val/test
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (scaler.transform(part) for part in (X_train, X_val, X_test))

# Create DataLoaders
BATCH_SIZE = 512


def _as_loader(features, labels, shuffle):
    """Wrap a feature array and a label array in a batched DataLoader."""
    dataset = TensorDataset(torch.FloatTensor(features), torch.LongTensor(labels))
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle)


# Only the training data is shuffled
train_loader = _as_loader(X_train, y_train, shuffle=True)
val_loader = _as_loader(X_val, y_val, shuffle=False)
test_loader = _as_loader(X_test, y_test, shuffle=False)

print(f"Train batches: {len(train_loader)}")

## 3. LSTM Model

In [None]:
# Build the classifier; input_dim is the number of feature columns in X_train.
# NOTE(review): lstm_units / dense_units presumably list per-layer sizes —
# confirm against LSTMClassifier's definition.
model = LSTMClassifier(
    input_dim=X_train.shape[1],
    num_classes=2,
    lstm_units=[128, 64],
    dense_units=[64],
    dropout_rate=0.3,
)
# Move the model to the selected device before training
model = model.to(device)

print(model)

In [None]:
# Loss over the two class outputs
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Halve the learning rate after 2 epochs without improvement; mode='max'
# because the training loop steps this scheduler with validation accuracy.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2
)

def train_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader``.

    Puts ``model`` in training mode, and for every batch moves the tensors to
    the module-level ``device``, back-propagates ``criterion(outputs, targets)``
    and takes one ``optimizer`` step.

    Returns:
        ``(loss, accuracy)`` where ``loss`` is the sum of each batch loss
        multiplied by its batch size, divided by the number of samples seen,
        and ``accuracy`` is the fraction of samples whose highest-scoring
        output index equals the target.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    for features, targets in loader:
        features = features.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        scores = model(features)
        batch_loss = criterion(scores, targets)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch is averaged correctly
        loss_sum += batch_loss.item() * features.size(0)
        n_seen += targets.size(0)
        n_correct += (scores.argmax(dim=1) == targets).sum().item()
    return loss_sum / n_seen, n_correct / n_seen

def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Puts ``model`` in eval mode and runs every batch (moved to the
    module-level ``device``) under ``torch.no_grad()``.

    Returns:
        ``(loss, accuracy)`` where ``loss`` is the sum of each batch loss
        multiplied by its batch size, divided by the number of samples seen,
        and ``accuracy`` is the fraction of samples whose highest-scoring
        output index equals the target.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for features, targets in loader:
            features = features.to(device)
            targets = targets.to(device)

            scores = model(features)
            batch_loss = criterion(scores, targets)

            # Weight by batch size so a smaller final batch is averaged correctly
            loss_sum += batch_loss.item() * features.size(0)
            n_seen += targets.size(0)
            n_correct += (scores.argmax(dim=1) == targets).sum().item()
    return loss_sum / n_seen, n_correct / n_seen

In [None]:
EPOCHS = 20
best_val_acc = 0
MODEL_PATH = "../../results/models/best_lstm_cicids2018.pth"

# Make sure the checkpoint directory exists up front: torch.save does not
# create missing parent directories, and failing here is far cheaper than
# failing after the first full training epoch.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

for epoch in range(EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc = validate(model, val_loader, criterion)
    # The scheduler was created with mode='max', so it is driven by accuracy
    scheduler.step(val_acc)

    print(f"Epoch {epoch+1} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    # Checkpoint only when validation accuracy strictly improves, so the file
    # on disk always holds the best weights seen so far.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), MODEL_PATH)

In [None]:
# Evaluation
# Restore the best checkpoint written by the training loop. map_location
# remaps the stored tensors onto the current device, so a checkpoint saved on
# a GPU can still be loaded when this cell runs on a CPU-only machine.
model.load_state_dict(
    torch.load("../../results/models/best_lstm_cicids2018.pth", map_location=device)
)
model.eval()

# Collect predictions and ground-truth labels for the whole test set
all_preds, all_labels = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        # Predicted class = index of the highest output per sample
        predicted = outputs.argmax(dim=1)
        all_preds.extend(predicted.cpu().numpy())
        # Labels were never moved to the device, so no .cpu() is needed here
        all_labels.extend(y_batch.numpy())

# Class 0 is reported as 'Benign' and class 1 as 'Attack'
print(classification_report(all_labels, all_preds, target_names=['Benign', 'Attack']))
print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds))