In [None]:
## Cell 1: Setup and Import Libraries

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import os # <-- THIS LINE FIXES THE ERROR
import re

# Prefer the GPU when one is present: enable on-demand memory growth, restrict
# TensorFlow to the first GPU, and switch Keras to the mixed_float16 policy.
# The whole section is best effort; any failure falls back to default settings.
try:
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            try:
                tf.config.experimental.set_memory_growth(gpu, True)
            except Exception as growth_err:
                # Still best effort, but report the problem instead of hiding it
                # so unexpected memory behaviour can be traced back to this step.
                print(f"⚠️ Could not enable memory growth on {gpu.name}: {growth_err}")
        tf.config.set_visible_devices(gpus[0], 'GPU')
        from tensorflow.keras import mixed_precision
        # Global policy: layers created after this point follow mixed_float16.
        mixed_precision.set_global_policy('mixed_float16')
        print(f"✅ GPU in use: {gpus[0].name} | Mixed precision ON")
    else:
        print("ℹ️ No GPU detected. Running on CPU.")
except Exception as e:
    print(f"⚠️ GPU setup skipped: {e}")

print("✅ Libraries imported successfully.")
print(f"TensorFlow Version: {tf.__version__}")

In [None]:
## Cell 2: Configuration and Hyperparameters

# --- Dataset locations ---
# All paths are relative, so the files must sit next to the notebook.
file_paths = dict(
    csic='csic_2010.csv',
    sqli='SQL_Injection_Dataset.csv',
    xss='XSS_dataset.csv',
    url='malicious_urls.csv',
    access_log='access.log',  # Apache/Nginx combined-format log
)

# --- access.log parsing ---
# Labeling is a heuristic: a line counts as malicious when its status code is
# in `malicious_status_codes` (5xx) or the line contains a WAF-like keyword.
access_log_config = dict(
    combined_regex=r'^(\S+) (\S+) (\S+) \[(.*?)\] "(\S+) (.*?) (HTTP\/\d\.\d)" (\d{3}) (\S+) "(.*?)" "(.*?)"$',
    malicious_status_codes=set(range(500, 600)),
    waf_keywords=['attack', 'blocked', 'malicious', 'sql injection', 'xss'],
)

# --- Sampling / balancing ---
RANDOM_STATE = 42
BALANCE_PER_SOURCE = True
PER_SOURCE_MAX = 50000   # row cap applied to each CSV source
ACCESS_LOG_MAX = 25000   # row cap applied to access.log

# --- Model hyperparameters ---
MAX_FEATURES, MAX_LEN = 10000, 250
EMBEDDING_DIM = GRU_UNITS = 128

# --- Training parameters ---
EPOCHS = 5
BATCH_SIZE = 1024             # large batches for GPU throughput
EARLY_STOP_PATIENCE = 2
TRAIN_MAX_SAMPLES = 200000    # upper bound on training rows; None disables the cap

print("✅ Configuration loaded.")

In [None]:
## Cell 3: Load Data and Split First, Then Preprocess (FIXED - No Data Leakage)

# Helpers for access.log parsing

def parse_access_log_line(line, cfg):
    """Parse one access-log line into a labelled record.

    Args:
        line: A single log line (the caller strips surrounding whitespace).
        cfg: Mapping with the keys 'combined_regex' (pattern with 11 groups),
            'malicious_status_codes' (container of ints) and 'waf_keywords'
            (iterable of lower-case substrings).

    Returns:
        None when the line does not match the pattern; otherwise a dict with
        'payload' (the request path), 'label' (1 = malicious, 0 = benign),
        'status', 'method', 'referer' and 'ua'.
    """
    match = re.match(cfg['combined_regex'], line)
    if match is None:
        return None

    method, path = match.group(5, 6)
    status = int(match.group(8))
    referer, ua = match.group(10, 11)

    # Keyword search covers the entire line (path, referer and user agent),
    # case-insensitively.
    lowered = line.lower()
    waf_hit = False
    for keyword in cfg['waf_keywords']:
        if keyword in lowered:
            waf_hit = True
            break

    # Heuristic label: listed status code or any keyword hit => malicious.
    is_malicious = status in cfg['malicious_status_codes'] or waf_hit

    return {
        'payload': path or '',
        'label': int(is_malicious),
        'status': status,
        'method': method,
        'referer': referer,
        'ua': ua,
    }

# Build (or reload) the master dataset of labelled payloads.
# `df_master` stays None when preparation fails, which the next step checks.
df_master = None
try:
    master_dataset_path = 'prepared_dataset/master_web_attack_dataset.csv'
    if os.path.exists(master_dataset_path):
        # A cached copy from an earlier run exists: reuse it instead of rebuilding.
        print(f"\n🔄 Loading prepared master dataset from {master_dataset_path}...")
        df_master = pd.read_csv(master_dataset_path)
        print(f"✅ Master dataset loaded successfully!")
    else:
        print("\n🔄 Loading and processing individual datasets...")
        df_csic = pd.read_csv(file_paths['csic'])
        df_sqli = pd.read_csv(file_paths['sqli'])
        df_xss = pd.read_csv(file_paths['xss'])
        df_url = pd.read_csv(file_paths['url'])
        print("... CSV datasets loaded successfully.")

        # Optionally include access.log if present (reading stops at ACCESS_LOG_MAX rows)
        access_rows = []
        if os.path.exists(file_paths['access_log']):
            print("\n🔄 Parsing access.log...")
            with open(file_paths['access_log'], 'r', encoding='utf-8', errors='ignore') as f:
                for line in f:
                    row = parse_access_log_line(line.strip(), access_log_config)
                    # Skip unparseable lines and payloads of length <= 1.
                    if row and len(str(row['payload'])) > 1:
                        access_rows.append(row)
                        if len(access_rows) >= ACCESS_LOG_MAX:
                            break
            print(f"Parsed {len(access_rows):,} log lines (capped at {ACCESS_LOG_MAX}).")
        # Passing `columns=` keeps only payload/label and, unlike selecting the
        # columns afterwards, also works when no line was parsed (an empty frame
        # has no 'payload' column, so indexing it would raise KeyError and abort
        # the whole build with a misleading message).
        df_access = pd.DataFrame(access_rows, columns=['payload', 'label'])

        # Standardize CSIC 2010: 'anomalous' (case-insensitive) -> 1, anything else -> 0
        df1 = df_csic[['content', 'classification']].copy()
        df1.rename(columns={'content': 'payload', 'classification': 'label'}, inplace=True)
        df1['label'] = df1['label'].apply(lambda x: 1 if str(x).lower() == 'anomalous' else 0)

        # Standardize SQL Injection (labels are used as provided)
        df2 = df_sqli[['Query', 'Label']].copy()
        df2.rename(columns={'Query': 'payload', 'Label': 'label'}, inplace=True)

        # Standardize Cross-Site Scripting (XSS) (labels are used as provided)
        df3 = df_xss[['Sentence', 'Label']].copy()
        df3.rename(columns={'Sentence': 'payload', 'Label': 'label'}, inplace=True)

        # Standardize Malicious URLs: 'benign' -> 0, every other type -> 1
        df4 = df_url[['url', 'type']].copy()
        df4.rename(columns={'url': 'payload', 'type': 'label'}, inplace=True)
        df4['label'] = df4['label'].apply(lambda x: 0 if x == 'benign' else 1)

        # Balance per source if enabled
        if BALANCE_PER_SOURCE:
            def cap(df, maxn):
                """Return a seeded random sample of at most `maxn` rows; empty frames pass through."""
                if df.empty:
                    return df
                n = min(len(df), maxn)
                return df.sample(n=n, random_state=RANDOM_STATE).reset_index(drop=True)

            # Step 1: cap every source independently.
            df1, df2, df3, df4 = (cap(src, PER_SOURCE_MAX) for src in (df1, df2, df3, df4))
            df_access = cap(df_access, min(ACCESS_LOG_MAX, PER_SOURCE_MAX))

            # Step 2: shrink every source to the size of the smallest non-empty one.
            sizes = [len(src) for src in (df1, df2, df3, df4, df_access) if len(src) > 0]
            if sizes:
                target = min(sizes)
                df1, df2, df3, df4, df_access = (
                    cap(src, target) for src in (df1, df2, df3, df4, df_access)
                )

        # Combine, Clean, and Shuffle
        df_combined = pd.concat([df1, df2, df3, df4, df_access], ignore_index=True)
        df_combined.dropna(inplace=True)
        df_combined = df_combined[df_combined['payload'].astype(str).str.len() > 1]
        df_master = df_combined.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

        print("\n--- ✅ Master Dataset Created Successfully (with access.log if available) ---")
        # Cache the result so later runs take the fast path above.
        os.makedirs('prepared_dataset', exist_ok=True)
        df_master.to_csv(master_dataset_path, index=False)
        print(f"\n💾 Master dataset saved to: '{master_dataset_path}'")

except (FileNotFoundError, KeyError) as e:
    # Missing input file or unexpected column name: report it and leave df_master as None.
    print(f"\n❌ ERROR during data preparation: {e}")
    print("Please check that all CSV files exist and that the column names are correct.")

# Split first, then fit the tokenizer on the training split only, so no
# information from the test split leaks into preprocessing.
if df_master is not None and not df_master.empty:
    print("\n🔄 Splitting data into training and testing sets FIRST...")

    payloads_raw = df_master['payload'].astype(str).values
    # Force an integer dtype: labels merged from several sources (or re-read
    # from CSV) are not guaranteed to be numeric, and the model needs 0/1 targets.
    labels_raw = df_master['label'].astype(int).values

    # Use the shared RANDOM_STATE from the configuration cell so the split
    # follows the same seed as the sampling steps instead of a separate literal.
    X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
        payloads_raw, labels_raw,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=labels_raw
    )

    print(f"Raw training samples: {len(X_train_raw):,}")
    print(f"Raw testing samples:  {len(X_test_raw):,}")

    print("\n🔄 Preprocessing data: Converting text to numerical sequences...")

    # Character-level tokenizer; the vocabulary is learned from training text only.
    tokenizer = Tokenizer(num_words=MAX_FEATURES, char_level=True, oov_token='<OOV>')
    tokenizer.fit_on_texts(X_train_raw)

    X_train_seq = tokenizer.texts_to_sequences(X_train_raw)
    X_test_seq = tokenizer.texts_to_sequences(X_test_raw)

    # Pad/truncate every sequence to exactly MAX_LEN tokens.
    X_train = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train = np.asarray(y_train_raw)
    y_test = np.asarray(y_test_raw)

    # Optional cap for quicker training without leakage (applies to training split only)
    if TRAIN_MAX_SAMPLES:
        n = min(TRAIN_MAX_SAMPLES, X_train.shape[0])
        rng = np.random.default_rng(RANDOM_STATE)
        idx = rng.choice(X_train.shape[0], size=n, replace=False)
        X_train = X_train[idx]
        y_train = y_train[idx]
        print(f"Capped training samples to: {n}")

    print(f"Training tensor shape (X_train): {X_train.shape}")
    print(f"Training tensor shape (y_train): {y_train.shape}")
    print(f"Testing tensor shape (X_test): {X_test.shape}")
    print(f"Testing tensor shape (y_test): {y_test.shape}")

    # Keep a handle on the untokenized test payloads.
    X_test_raw_stored = X_test_raw

else:
    print("Skipping preprocessing because the master dataframe was not created successfully.")

In [None]:
## Cell 4: Train-Test Split Already Done (REMOVED - No longer needed)

# This cell is no longer needed since we already split the data in Cell 3
# The train-test split is now done BEFORE tokenization to prevent data leakage

# Status check only: reports whether Cell 3 has produced the train/test arrays.
if not {'X_train', 'X_test'}.issubset(locals()):
    print("⚠️ Data preprocessing not completed yet. Please run Cell 3 first.")
else:
    print("✅ Train-test split already completed in Cell 3 with proper data leakage prevention.")
    n_train_rows, n_test_rows = len(X_train), len(X_test)
    print(f"Final training samples: {n_train_rows:,}")
    print(f"Final testing samples:  {n_test_rows:,}")

In [None]:
## Cell 5: Data Split Already Completed (REMOVED - No longer needed)

# This cell is no longer needed since we already split the data in Cell 3
# The train-test split is now done BEFORE tokenization to prevent data leakage

# Status check only: confirms that the arrays created in Cell 3 are available.
if not {'X_train', 'X_test'}.issubset(locals()):
    print("⚠️ Data preprocessing not completed yet. Please run Cell 3 first.")
else:
    print("✅ Data split already completed in Cell 3 with proper data leakage prevention.")
    n_train_rows, n_test_rows = len(X_train), len(X_test)
    print(f"Training samples: {n_train_rows:,}")
    print(f"Testing samples:  {n_test_rows:,}")

In [None]:
## Cell 6: Build the GRU Model Architecture (Modern Practice)

print("\n🧠 Building the GRU model...")

# Sequential, Embedding, SpatialDropout1D, GRU, Dense and tf are already
# imported in Cell 1; the hyperparameters come from Cell 2.

# -----------------------------
# ✅ Build the GRU model
# -----------------------------
model = Sequential(name="GRU_Web_Threat_Detector")

# 1. Explicit input: one padded sequence of MAX_LEN token ids per sample
model.add(tf.keras.Input(shape=(MAX_LEN,)))

# 2. Embedding: token id -> EMBEDDING_DIM-dimensional vector
model.add(Embedding(input_dim=MAX_FEATURES, output_dim=EMBEDDING_DIM))

# 3. Regularised recurrent encoder
model.add(SpatialDropout1D(0.4))
# NOTE(review): Keras documents that a non-zero recurrent_dropout rules out the
# fused cuDNN GRU kernel; confirm the GPU speed trade-off is intended.
model.add(GRU(units=GRU_UNITS, dropout=0.4, recurrent_dropout=0.4))

# 4. Binary classifier head. Cell 1 may have enabled the mixed_float16 policy;
# the output layer is pinned to float32 so the sigmoid probability and the loss
# are computed in full precision (a no-op under the default float32 policy).
model.add(Dense(1, activation='sigmoid', dtype='float32'))

# -----------------------------
# ✅ Compile the model
# -----------------------------
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

# -----------------------------
# ✅ Model summary
# -----------------------------
model.summary()


In [None]:
## Cell 7: Train the Model

print("\n🚀 Starting optimized model training on Apple M4...")

import time  # only dependency that Cell 1 does not already import

# Ask TensorFlow to JIT-compile graphs with XLA where possible.
tf.config.optimizer.set_jit(True)
# The mixed-precision policy is deliberately NOT set here: Cell 1 enables it
# when a GPU is found, and changing the global policy after the model has been
# built does not affect existing layers; it would only alter later rebuilds.

# ✅ Sanity check: list devices
print("📦 Available devices:", tf.config.list_physical_devices())

# Ensure required globals exist
if 'X_train' in globals() and 'y_train' in globals() and 'model' in globals():
    # 🔀 Hold out a stratified validation set explicitly. Keras only supports
    # `validation_split` for array inputs, not for tf.data.Dataset inputs, so
    # the split is made here and passed to fit() as `validation_data`.
    VALIDATION_FRACTION = 0.1
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train,
        test_size=VALIDATION_FRACTION,
        random_state=globals().get('RANDOM_STATE', 42),
        stratify=y_train,
    )
    print(f"Fitting on {len(y_fit):,} samples, validating on {len(y_val):,}.")

    # ⚖️ Balanced class weights: total / (2 * class_count), computed on the
    # samples that are actually used for fitting.
    unique, counts = np.unique(y_fit, return_counts=True)
    total = float(len(y_fit))
    class_weight = {int(k): total / (2.0 * float(v)) for k, v in zip(unique, counts)}
    print(f"✅ Class weights: {class_weight}")

    # ⚙️ Re-compile with a tuned optimizer (lower learning rate, gradient clipping)
    opt = tf.keras.optimizers.Adam(learning_rate=3e-4, clipnorm=1.0)
    model.compile(
        loss='binary_crossentropy',
        optimizer=opt,
        metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    )

    # 🧠 tf.data pipelines; only the training data is shuffled
    BATCH_SIZE = globals().get('BATCH_SIZE', 64)
    train_ds = (
        tf.data.Dataset.from_tensor_slices((X_fit, y_fit))
        .shuffle(buffer_size=len(y_fit))
        .batch(BATCH_SIZE)
        .prefetch(tf.data.AUTOTUNE)
    )
    val_ds = (
        tf.data.Dataset.from_tensor_slices((X_val, y_val))
        .batch(BATCH_SIZE)
        .prefetch(tf.data.AUTOTUNE)
    )

    # 🕒 Custom callback to measure time and print metrics per epoch
    class EpochTimer(tf.keras.callbacks.Callback):
        """Print the duration, a remaining-time estimate and key metrics after each epoch."""

        def on_train_begin(self, logs=None):
            self.epoch_times = []

        def on_epoch_begin(self, epoch, logs=None):
            self._epoch_start = time.perf_counter()

        def on_epoch_end(self, epoch, logs=None):
            logs = logs or {}  # the callback API allows logs to be None
            duration = time.perf_counter() - self._epoch_start
            self.epoch_times.append(duration)
            avg = np.mean(self.epoch_times)
            # ETA assumes the remaining epochs take the average time so far.
            remaining = (self.params['epochs'] - (epoch + 1)) * avg
            loss = logs.get('loss', 0)
            acc = logs.get('accuracy', 0)
            val_loss = logs.get('val_loss', 0)
            val_acc = logs.get('val_accuracy', 0)
            print(f"Epoch {epoch+1}/{self.params['epochs']} "
                  f"| ⏱ {duration:.2f}s | ETA: {remaining/60:.1f}m "
                  f"| loss: {loss:.4f} | acc: {acc:.4f} "
                  f"| val_loss: {val_loss:.4f} | val_acc: {val_acc:.4f}")

    epoch_timer = EpochTimer()

    # 🧩 Callbacks: all three monitor val_loss, which exists because
    # validation data is passed to fit() below.
    EARLY_STOP_PATIENCE = globals().get('EARLY_STOP_PATIENCE', 3)
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=EARLY_STOP_PATIENCE, restore_best_weights=True
    )
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=1, min_lr=1e-6, verbose=1
    )
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath='best_model.keras',
        monitor='val_loss', save_best_only=True, mode='min', verbose=1
    )

    # 🚀 Train the model
    EPOCHS = globals().get('EPOCHS', 25)
    t0 = time.perf_counter()
    history = model.fit(
        train_ds,
        epochs=EPOCHS,
        validation_data=val_ds,
        callbacks=[early_stop, reduce_lr, checkpoint, epoch_timer],
        class_weight=class_weight,
        verbose=1
    )
    t1 = time.perf_counter()

    total_time = t1 - t0
    epochs_run = len(history.history.get('loss', []))
    print(f"\n✅ Training complete in {total_time/60:.1f} min "
          f"({total_time/max(1,epochs_run):.2f}s/epoch over {epochs_run} epochs).")

else:
    print("⚠️ Missing data or model. Please ensure Cells 1–3 and 6 have been executed.")


In [None]:
## Cell 8: (Optional) Hybrid Model — Skipped by Default

# Placeholder cell: it only prints a notice. The hybrid (text + categorical)
# model below is kept as a commented-out template and is never executed.
print("\nℹ️ Skipping hybrid model build (requires categorical features).")

# Template for a two-input model. Before enabling it:
#   - a categorical feature matrix `X_cat_train` must exist (no cell shown in
#     this notebook creates it);
#   - note that the last lines rebind the global `model`, replacing the GRU
#     model built in Cell 6.
# If you later add categorical features (X_cat_train), uncomment and import:
# from tensorflow.keras import backend as K
# from tensorflow.keras.layers import Input, concatenate
# from tensorflow.keras.models import Model
# K.clear_session()
# text_input = Input(shape=(MAX_LEN,), name='text_input')
# categorical_input = Input(shape=(X_cat_train.shape[1],), name='categorical_input')
# embedding_layer = Embedding(input_dim=MAX_FEATURES, output_dim=EMBEDDING_DIM)(text_input)
# dropout_layer = SpatialDropout1D(0.4)(embedding_layer)
# gru_layer = GRU(units=GRU_UNITS, dropout=0.4, recurrent_dropout=0.4)(dropout_layer)
# dense_branch = Dense(16, activation='relu')(categorical_input)
# merged = concatenate([gru_layer, dense_branch])
# final_dense = Dense(64, activation='relu')(merged)
# output_layer = Dense(1, activation='sigmoid', name='output')(final_dense)
# model = Model(inputs=[text_input, categorical_input], outputs=output_layer)
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
# model.summary()


In [None]:
## Cell 9: Evaluate and Plot Confusion Matrix

# classification_report, confusion_matrix, sns and plt are imported in Cell 1.

if 'model' in globals() and 'X_test' in globals() and 'y_test' in globals():
    print("\n🧪 Evaluating on test set...")
    # return_dict=True lets Keras pair each value with its own metric name.
    # Zipping `model.metrics_names` with the returned list is unreliable: on
    # Keras 3 that attribute does not list the individual compiled metrics, so
    # values end up mislabeled or dropped.
    eval_metrics = model.evaluate(X_test, y_test, verbose=0, return_dict=True)
    print({name: float(value) for name, value in eval_metrics.items()})

    print("\n🤔 Confusion Matrix:\n")
    # Sigmoid probabilities -> hard labels with a 0.5 decision threshold.
    y_prob = model.predict(X_test, verbose=0).ravel()
    y_pred = (y_prob > 0.5).astype(int)
    cm = confusion_matrix(y_test, y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['Benign', 'Malicious'],
        yticklabels=['Benign', 'Malicious'],
        ax=ax
    )
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title('Confusion Matrix')
    plt.show()

    print("\n📋 Classification Report:\n")
    print(classification_report(y_test, y_pred, target_names=['Benign', 'Malicious']))
else:
    print("⚠️ Missing model or test data. Ensure previous cells have been run.")


In [None]:
## Cell 10: Visualize Training History

if 'history' in globals():
    history_dict = history.history

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # One panel per metric: (key in the history dict, label used in the plot).
    for ax, (metric_key, metric_label) in zip(axes, (('accuracy', 'Accuracy'), ('loss', 'Loss'))):
        train_values = history_dict.get(metric_key, [])
        val_values = history_dict.get(f'val_{metric_key}', [])
        epochs_range = range(1, len(train_values) + 1)

        ax.plot(epochs_range, train_values, 'bo-', label=f'Training {metric_label}')
        # Validation values are missing when training ran without validation
        # data; plotting an empty list against the epoch range would raise a
        # length-mismatch error, so the curve is drawn only when it lines up.
        if val_values and len(val_values) == len(train_values):
            ax.plot(epochs_range, val_values, 'ro-', label=f'Validation {metric_label}')
        ax.set_title(f'Training and Validation {metric_label}')
        ax.set_xlabel('Epochs')
        ax.set_ylabel(metric_label)
        ax.legend()

    plt.show()
else:
    print("⚠️ No training history found. Run the training cell first.")

In [None]:
## Cell 11: Real-Time Prediction Function (UPDATED)

if 'tokenizer' in globals() and 'model' in globals():
    def predict_threat(payload_string, threshold=0.5):
        """Score one raw payload with the trained model and print the verdict.

        The text is encoded with the tokenizer fitted on the training split,
        padded to MAX_LEN and passed to `model`.

        Args:
            payload_string: Raw request payload or URL to analyse.
            threshold: Probability above which the payload is reported as a
                threat. Defaults to 0.5.

        Returns:
            The predicted probability that the payload is malicious, as a
            float, so callers can use the score instead of parsing the output.
        """
        seq = tokenizer.texts_to_sequences([payload_string])
        padded_seq = pad_sequences(seq, maxlen=MAX_LEN)
        # predict() returns one row per input; take the single sigmoid output.
        prediction_prob = float(model.predict(padded_seq, verbose=0)[0][0])
        verdict = "🚨 THREAT DETECTED 🚨" if prediction_prob > threshold else "✅ BENIGN ✅"
        print(f"\nInput: '{payload_string}'")
        print(f"Malicious Probability: {prediction_prob:.4f}")
        print(f"Verdict: {verdict}")
        return prediction_prob

    print("\n🕵️--- Real-Time Threat Detection Test ---")
    # Smoke test: XSS, SQL injection, a normal search URL and path traversal.
    for sample_payload in (
        "<script>alert('xss attack');</script>",
        "1' OR '1'='1'; --",
        "https://www.google.com/search?q=normal+search",
        "../../../etc/passwd",
    ):
        predict_threat(sample_payload)
else:
    print("⚠️ Missing tokenizer or model. Ensure Cells 1–3 and 6–7 have been run.")

In [None]:
## Cell 12: User Testing (Interactive Inference)

# Prompt for one payload and classify it with the trained model.
if not ('tokenizer' in globals() and 'model' in globals()):
    print("⚠️ Missing tokenizer or model. Ensure training is completed (Cells 1–7).")
else:
    try:
        user_text = input("Enter a payload/URL to analyze: ")
        if not user_text:
            print("No input provided.")
        else:
            # Encode and pad exactly like the training data, then score.
            padded_seq = pad_sequences(
                tokenizer.texts_to_sequences([user_text]),
                maxlen=MAX_LEN,
            )
            prob = float(model.predict(padded_seq, verbose=0)[0][0])
            if prob > 0.5:
                verdict = "🚨 THREAT DETECTED"
            else:
                verdict = "✅ SAFE"
            print(
                "\n--- Analysis Result ---",
                f"Input: {user_text}",
                f"Malicious Probability: {prob:.4f}",
                f"Verdict: {verdict}",
                sep="\n",
            )
    except EOFError:
        # input() raises EOFError when no interactive stdin is attached.
        print("Interactive input not available in this environment.")