# Autoencoder Training on CICIDS2018

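This notebook trains an **Autoencoder** for unsupervised anomaly detection on the CICIDS2018 dataset: the model is fit on benign traffic only, a detection threshold is calibrated on held-out benign data, and the model is then evaluated on a mix of benign and attack flows.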

**Model:** Autoencoder (PyTorch)  
**Dataset:** CICIDS2018  
**Task:** Anomaly Detection (Train on Benign, Detect Attacks)

## 1. Setup and Imports

In [None]:
import sys
import os
from pathlib import Path

# Add src to path
sys.path.append(str(Path("../../").resolve()))

import numpy as np
import pandas as pd
import glob
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from src.models.classical.autoencoder import Autoencoder
import time
import gc

# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f"Using device: {device}")
if has_cuda:
    # Report the name of the first visible GPU (index 0).
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Load and Preprocess Data

In [None]:
# Load all CSV files from the CICIDS2018 raw directory
DATA_PATH = '../../data/raw/cicids2018/'
# Files larger than this (in MiB) are skipped instead of being read into memory.
MAX_FILE_SIZE_MB = 1000
# Identifier/metadata columns that are dropped from every file when present.
# 'Label' is deliberately NOT listed here: it is needed later to build the target.
METADATA_COLS = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp']

# Process the files in ascending size order.
all_files = sorted(glob.glob(os.path.join(DATA_PATH, "*.csv")), key=os.path.getsize)

print(f"Found {len(all_files)} files.")
li = []

for filename in all_files:
    file_size_mb = os.path.getsize(filename) / (1024 * 1024)
    if file_size_mb > MAX_FILE_SIZE_MB:
        print(f"Skipping {os.path.basename(filename)} ({file_size_mb:.0f}MB - too large)")
        continue
    print(f"Loading {os.path.basename(filename)}...")
    try:
        df_temp = pd.read_csv(filename, index_col=None, header=0, low_memory=True)
        # Basic cleaning per file to save memory
        df_temp.columns = df_temp.columns.str.strip()
        if 'Label' not in df_temp.columns:
            # A file without a label column cannot be used downstream; skip it
            # explicitly rather than relying on a KeyError being swallowed.
            print(f"Error loading {filename}: missing 'Label' column")
            continue

        # Keep label, drop metadata
        df_temp = df_temp.drop(columns=METADATA_COLS, errors='ignore')

        li.append(df_temp)
    except Exception as e:
        # Best-effort loading: one unreadable file must not abort the whole run.
        print(f"Error loading {filename}: {e}")

if not li:
    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # fail with a message that points at the actual cause instead.
    raise ValueError(
        f"No usable CSV files were loaded from {DATA_PATH!r} "
        f"(found {len(all_files)} file(s); files over {MAX_FILE_SIZE_MB}MB are skipped)."
    )

# Concatenate
df = pd.concat(li, axis=0, ignore_index=True)
print(f"Total raw samples: {len(df):,}")
# Drop every remaining reference to the per-file frames so they can be freed.
del li, df_temp
gc.collect()

In [None]:
# Preprocessing

# 1. Create Binary Label
def create_binary_label(label):
    """Map a raw traffic label to a binary class.

    Returns 0 when ``label`` is a string containing "BENIGN" (compared
    case-insensitively) and 1 for everything else, including non-string
    values such as NaN or None.
    """
    is_benign = isinstance(label, str) and 'BENIGN' in label.upper()
    return 0 if is_benign else 1

# Build the 0/1 target from the textual label, then discard the raw column.
df['binary_label'] = df['Label'].apply(create_binary_label)
df.drop(columns=['Label'], inplace=True, errors='ignore')

# 2. Convert to numeric
# Every column except the target is coerced; values that cannot be parsed
# become NaN and are removed together with the other NaNs below.
feature_columns = [column for column in df.columns if column != 'binary_label']
for column in feature_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# 3. Clean NaNs/Infs
# Infinities are first turned into NaN so a single dropna removes both kinds.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

print("Binary class distribution:", df['binary_label'].value_counts(), sep="\n")

In [None]:
# Sample data
# The autoencoder is trained on BENIGN flows, so they get the larger budget.
NORMAL_SAMPLES = 500_000
ATTACK_SAMPLES = 200_000

# Row masks per class; the class sizes come from the masks themselves.
benign_mask = df['binary_label'] == 0
attack_mask = df['binary_label'] == 1

# Each class is capped at its budget; a class smaller than the budget is taken whole.
n_benign = min(int(benign_mask.sum()), NORMAL_SAMPLES)
n_attack = min(int(attack_mask.sum()), ATTACK_SAMPLES)
benign = df[benign_mask].sample(n=n_benign, random_state=42)
attack = df[attack_mask].sample(n=n_attack, random_state=42)

df_sampled = pd.concat([benign, attack], ignore_index=True)
print(f"Sampled to {len(df_sampled):,} samples")

# Feature matrix as float32 and the matching 0/1 target vector.
X = df_sampled.drop(columns=['binary_label']).to_numpy().astype(np.float32)
y = df_sampled['binary_label'].to_numpy()

# The DataFrames are no longer needed once X and y exist.
del df, df_sampled, benign, attack, benign_mask, attack_mask
gc.collect()

In [None]:
# Prepare Data/Splits for Anomaly Detection
# Only benign rows are used for fitting; attack rows appear in the test set only.
normal_indices = np.flatnonzero(y == 0)
attack_indices = np.flatnonzero(y == 1)

X_normal = X[normal_indices]
X_attack = X[attack_indices]

# Benign split: 70% train, then the remaining 30% halved into val / test (15% each).
X_train, X_temp = train_test_split(X_normal, test_size=0.3, random_state=42)
X_val, X_test_normal = train_test_split(X_temp, test_size=0.5, random_state=42)

# Test Set = Normal Test + All Attacks (labels: 0 for benign rows, 1 for attack rows)
X_test = np.concatenate([X_test_normal, X_attack])
y_test = np.concatenate([np.zeros(len(X_test_normal)), np.ones(len(X_attack))])

# Scale Features (MinMax is often better for Autoencoders)
# The scaler is fitted on the training split only and then applied to all splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Create DataLoaders
BATCH_SIZE = 512


def _make_loader(features, targets, shuffle):
    """Wrap a feature array and a target tensor in a DataLoader of BATCH_SIZE batches."""
    dataset = TensorDataset(torch.FloatTensor(features), targets)
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle)


# Train/val targets are all-zero placeholders: the training code ignores them (unsupervised).
train_loader = _make_loader(X_train, torch.zeros(len(X_train)), shuffle=True)
val_loader = _make_loader(X_val, torch.zeros(len(X_val)), shuffle=False)
test_loader = _make_loader(X_test, torch.LongTensor(y_test), shuffle=False)

print(f"Train samples: {len(X_train):,} (Normal)")
print(f"Val samples: {len(X_val):,} (Normal)")
print(f"Test samples: {len(X_test):,} (Mixed)")

## 3. Autoencoder Model

In [None]:
# Layer widths for the encoder; the decoder uses the same widths in reverse order.
hidden_layers = [64, 32, 16]

# NOTE(review): argument semantics are defined by the project's Autoencoder
# class (src.models.classical.autoencoder) - confirm there.
model = Autoencoder(
    input_dim=X_train.shape[1],  # one input unit per feature column
    encoder_units=list(hidden_layers),
    latent_dim=8,
    decoder_units=hidden_layers[::-1],
    dropout_rate=0.2,
)
model = model.to(device)

print(model)

In [None]:
# Reconstruction objective: mean squared error between the model output and its input.
criterion = nn.MSELoss()
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def train_epoch(model, loader, optimizer):
    """Run one training pass over ``loader`` and return the mean loss per sample.

    Relies on the module-level ``criterion`` and ``device``. The second item
    of every batch (the placeholder target) is ignored: the model is trained
    to reconstruct its own input.
    """
    model.train()
    loss_sum = 0
    for inputs, _ in loader:
        inputs = inputs.to(device)
        optimizer.zero_grad()
        # The model returns a pair; only the first item (the reconstruction) is used.
        reconstruction, _ = model(inputs)
        batch_loss = criterion(reconstruction, inputs)
        batch_loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so that a shorter final
        # batch does not skew the per-sample average.
        loss_sum += batch_loss.item() * inputs.size(0)
    return loss_sum / len(loader.dataset)

def validate(model, loader):
    """Return the mean reconstruction loss per sample over ``loader``.

    Puts the model in eval mode and runs without gradient tracking. Relies on
    the module-level ``criterion`` and ``device``; the second item of every
    batch is ignored.
    """
    model.eval()
    loss_sum = 0
    with torch.no_grad():
        for inputs, _ in loader:
            inputs = inputs.to(device)
            reconstruction, _ = model(inputs)
            batch_loss = criterion(reconstruction, inputs)
            # Weighted by batch size, then normalised by the dataset size below.
            loss_sum += batch_loss.item() * inputs.size(0)
    return loss_sum / len(loader.dataset)

In [None]:
EPOCHS = 30
best_val_loss = float('inf')

# Where the weights of the best epoch (lowest validation MSE) are written.
BEST_MODEL_PATH = "../../results/models/best_autoencoder_cicids2018.pth"
# torch.save does not create missing parent directories; create the folder
# up front so the first checkpoint cannot fail after an epoch of training.
os.makedirs(os.path.dirname(BEST_MODEL_PATH), exist_ok=True)

for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer)
    val_loss = validate(model, val_loader)

    print(f"Epoch {epoch+1} | Train MSE: {train_loss:.6f} | Val MSE: {val_loss:.6f}")

    # Checkpoint only on a strict improvement of the validation loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), BEST_MODEL_PATH)

In [None]:
# Evaluation
# Restore the best checkpoint. map_location remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU also loads in a CPU-only session.
model.load_state_dict(
    torch.load("../../results/models/best_autoencoder_cicids2018.pth", map_location=device)
)
model.eval()

# 1. Set Threshold on Validation Set
# NOTE(review): the meaning of `percentile` is defined by Autoencoder.set_threshold
# (presumably a percentile of the validation reconstruction errors) - confirm there.
model.set_threshold(torch.FloatTensor(X_val).to(device), percentile=95)
print(f"Anomaly Threshold: {model.threshold:.6f}")

# 2. Predict on Test Set
# Collect, batch by batch, the predictions, the true labels and the
# reconstruction errors (the errors are the scores used for ROC AUC below).
all_preds, all_labels, all_errors = [], [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        preds = model.predict(X_batch)
        errors = model.reconstruction_error(X_batch)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y_batch.numpy())
        all_errors.extend(errors.cpu().numpy())

print(classification_report(all_labels, all_preds, target_names=['Benign', 'Attack']))
print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds))

# ROC AUC is computed from the continuous error scores rather than from the
# thresholded predictions, so it does not depend on the chosen threshold.
roc_auc = roc_auc_score(all_labels, all_errors)
print(f"ROC AUC Score: {roc_auc:.4f}")