
# ==========================================
# 1. Imports & Configuration
# ==========================================

In [None]:

from pathlib import Path
import json
import joblib
import time

import numpy as np
import pandas as pd



# ==========================================
# 2. Paths & Artifacts
# ==========================================

In [None]:
def get_project_root() -> Path:
    """Return the project root inferred from the current working directory.

    The working directory is resolved to an absolute path. If its final
    component is ``notebooks`` (compared case-insensitively), the parent
    directory is returned; otherwise the working directory itself is.
    """
    here = Path.cwd().resolve()
    if here.name.lower() != "notebooks":
        return here
    return here.parent

# Anchor every artifact location to the detected project root.
PROJECT_ROOT = get_project_root()

# results/ holds the exported outputs; trained models live in results/models.
RESULTS_DIR   = PROJECT_ROOT.joinpath("results")
MODELS_DIR    = RESULTS_DIR.joinpath("models")
PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")

print(f"[✓] Project Root: {PROJECT_ROOT}")



# ==========================================
# 3. Load Models & Metadata
# ==========================================

In [None]:
print("\n[1] Loading trained models and metadata...")

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model artifacts that come from a trusted source.
binary_model = joblib.load(MODELS_DIR / "binary_model.pkl")

# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# read does not depend on the platform's locale default.
with open(MODELS_DIR / "model_metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Decision threshold applied to the attack probabilities further down.
BINARY_THRESHOLD = metadata["binary_threshold"]

print(f"[✓] Binary threshold loaded: {BINARY_THRESHOLD:.4f}")


# ==========================================
# 4. Load Data for Inference (Test Set)
# ==========================================

In [None]:
print("\n[2] Loading processed test data...")

# Read the cleaned test split from the processed-data directory.
test_df = pd.read_csv(PROCESSED_DIR.joinpath("test_cleaned.csv"))

# Columns removed before inference (presumably targets / bookkeeping fields
# rather than model features — confirm against the training notebook).
DROP_COLS = [
    "label",
    "attack_class",
    "binary_target",
    "attack_category",
    "level",
    "id",
    "difficulty",
]

# errors="ignore" skips any listed column that is absent from the CSV.
X_test = test_df.drop(columns=DROP_COLS, errors="ignore")

print(f"[✓] Test samples loaded: {len(X_test)}")


# ==========================================
# 5. Probability Inference
# ==========================================

In [None]:
def predict_proba(pipe, X_raw):
    """Return column 1 of the model's ``predict_proba`` output for ``X_raw``.

    ``pipe`` must expose ``named_steps`` containing a ``"prep"`` step (with
    ``transform``) and a ``"model"`` step (with ``predict_proba``). The raw
    input is transformed by the prep step and then scored by the model.

    If the model has a non-``None`` ``best_iteration`` attribute, prediction
    is restricted to ``iteration_range=(0, best_iteration + 1)``; otherwise
    ``predict_proba`` is called without that argument.
    """
    steps = pipe.named_steps
    preprocessor = steps["prep"]
    estimator = steps["model"]

    encoded = preprocessor.transform(X_raw)

    # Only pass iteration_range when the model reports a best iteration.
    extra_kwargs = {}
    n_best = getattr(estimator, "best_iteration", None)
    if n_best is not None:
        extra_kwargs["iteration_range"] = (0, n_best + 1)

    return estimator.predict_proba(encoded, **extra_kwargs)[:, 1]


print("\n[3] Running inference...")

# One probability per test row (column 1 of the model's predict_proba output),
# used downstream as the attack probability.
attack_probs = predict_proba(pipe=binary_model, X_raw=X_test)


# ==========================================
# 6. Alert Decision Logic
# ==========================================

In [None]:
# Build the alert table: one row per scored test sample.
alerts_df = pd.DataFrame({
    "sample_id": np.arange(len(X_test)),
    "attack_probability": attack_probs,
})

# A sample is an ALERT when its probability meets or exceeds the threshold.
# The 0/1 decision is kept as a local Series rather than a throwaway column,
# since only the label is part of the exported table.
decision = (alerts_df["attack_probability"] >= BINARY_THRESHOLD).astype(int)
alerts_df["decision_label"] = decision.map({0: "NORMAL", 1: "ALERT"})

# Add simulated timestamps (for SOC realism): one second apart, starting at
# the current epoch second. Vectorized instead of a Python-level loop; int64
# is requested explicitly so the result does not depend on the platform's
# default integer width.
base_time = int(time.time())
alerts_df["timestamp"] = base_time + np.arange(len(alerts_df), dtype=np.int64)

# Fix the column order of the exported table.
alerts_df = alerts_df[
    ["timestamp", "sample_id", "attack_probability", "decision_label"]
]

print(alerts_df.head())


# ==========================================
# 7. Alert Statistics
# ==========================================

In [None]:
print("\n[4] Alert statistics:")

# Fraction of rows labelled ALERT (mean of a boolean mask).
is_alert_mask = alerts_df["decision_label"].eq("ALERT")
alert_rate = is_alert_mask.mean()

print(f"    Total samples: {len(alerts_df)}")
print(f"    ALERT rate:    {alert_rate:.2%}")
print(f"    Threshold:     {BINARY_THRESHOLD:.4f}")


# ==========================================
# 8. Export Alerts
# ==========================================

In [None]:
# Write the alert table under RESULTS_DIR as CSV, omitting the index column.
alerts_path = RESULTS_DIR.joinpath("alerts_simulation.csv")
alerts_df.to_csv(path_or_buf=alerts_path, index=False)

print(f"\n[✓] Alerts exported to: {alerts_path}")
