# Week 3 — Univariate Z-Score Anomaly Detection (NSL-KDD)

**Objective.** Detect anomalies using z-scores on a single feature following the CLT intuition.
We simulate rarity by selecting only rows with `label ∈ {normal, teardrop}`; `normal=0`, `teardrop=1`.
We operate on `wrong_fragment`, compute z-scores, plot their density, and evaluate via a confusion matrix.

In [8]:
# Bootstrap: ensure project root (containing src/) is on sys.path.
from pathlib import Path
import sys

def ensure_project_root():
    here = Path.cwd().resolve()
    for parent in [here] + list(here.parents):
        if (parent / 'src' / '__init__.py').exists():
            if str(parent) not in sys.path:
                sys.path.insert(0, str(parent))
            print('✅ Project root:', parent)
            return parent
    raise RuntimeError('Could not find project root containing src/__init__.py')

_PROJECT_ROOT = ensure_project_root()

✅ Project root: C:\Users\mehra\Final_Project


In [9]:
# Imports and paths.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.neighbors import KernelDensity

from src.utils import RANDOM_STATE, DATA_RAW, FIGURES, ensure_dir
from src.io import load_nsl_kdd_raw

FIG_DIR = ensure_dir(FIGURES)
RAW_FILE = DATA_RAW / 'NSL-KDD.raw'
print('Raw file:', RAW_FILE)
print('Figures dir:', FIG_DIR)

Raw file: C:\Users\mehra\Final_Project\data\raw\NSL-KDD.raw
Figures dir: C:\Users\mehra\Final_Project\notebooks\figures


In [10]:
# Filter to {normal, teardrop} and compute z-scores using the entire subset.
from src.io import load_nsl_kdd_raw
import numpy as np
import pandas as pd

df = load_nsl_kdd_raw(RAW_FILE)
if "teardrop" not in set(df["label"].astype(str).unique()):
    raise ValueError("The dataset does not contain 'teardrop'. Please use the file that includes it.")

df_bin = df[df["label"].isin(["normal", "teardrop"])].copy()
df_bin["wrong_fragment"] = pd.to_numeric(df_bin["wrong_fragment"], errors="coerce").fillna(0.0)
df_bin["y"] = (df_bin["label"] == "teardrop").astype(int)  # normal=0, teardrop=1

# Compute mean/std on the entire filtered dataset (as specified).
mu = float(df_bin["wrong_fragment"].mean())
sd = float(df_bin["wrong_fragment"].std(ddof=0)) if df_bin["wrong_fragment"].std(ddof=0) > 0 else 1e-9

# Z-scores for all rows (univariate).
z_all = (df_bin["wrong_fragment"].to_numpy() - mu) / sd
y_all = df_bin["y"].to_numpy()
print("Counts:", dict(pd.Series(y_all).value_counts()))
print("Mean/Std (wrong_fragment):", mu, sd)


ValueError: The dataset does not contain 'teardrop'. Please use the file that includes it.

In [None]:
# Choose threshold. Prefer a class-separating midpoint if separable; else 3-sigma; else 99.5th pct of normals.
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt

z_norm = z_all[y_all == 0]
z_anom = z_all[y_all == 1]

# If the two classes are strictly separable on z, take the midpoint.
thr = 3.0  # default 3-sigma
if z_norm.size and z_anom.size:
    max_norm = np.max(z_norm)
    min_anom = np.min(z_anom)
    if min_anom > max_norm:
        thr = 0.5 * (max_norm + min_anom)
    else:
        # If not perfectly separable, tighten to the high tail of normals.
        thr = max(thr, float(np.percentile(z_norm, 99.5)))

y_pred = (z_all >= thr).astype(int)

# Confusion matrix over the entire filtered set (to match your figure).
cm = confusion_matrix(y_all, y_pred, labels=[0, 1])
print("Threshold (z >=):", thr)
print("Confusion matrix [[TN, FP],[FN, TP]]:\n", cm)
print("Classification report:\n", classification_report(y_all, y_pred, target_names=["normal","teardrop"], zero_division=0))

# Plot confusion matrix.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["0","1"])
fig, ax = plt.subplots(figsize=(5,4))
disp.plot(ax=ax, cmap="YlGnBu", colorbar=True, values_format="d")
plt.title("Confusion matrix — Z-score on wrong_fragment")
plt.tight_layout()
cm_path = FIG_DIR / "week3_confusion_matrix_zscore_wrong_fragment_fullset.png"
plt.savefig(cm_path, dpi=150)
plt.show()
print("Saved confusion matrix to:", cm_path)
