In [None]:
# Ensure project root (contains src/) is on sys.path.
from pathlib import Path
import sys
def ensure_project_root():
    """Locate the project root and make it importable.

    Starting at the resolved current working directory and moving upward
    through its ancestors, pick the first directory that contains
    ``src/__init__.py``. That directory is inserted at the front of
    ``sys.path`` unless it is already listed, its location is printed,
    and it is returned.

    Returns:
        pathlib.Path: The first directory (closest to the working
        directory) that contains ``src/__init__.py``.

    Raises:
        RuntimeError: If neither the working directory nor any of its
            ancestors contains ``src/__init__.py``.
    """
    start = Path.cwd().resolve()
    # Closest directory first, filesystem root last.
    candidates = (start, *start.parents)
    root = next(
        (folder for folder in candidates if (folder / 'src' / '__init__.py').exists()),
        None,
    )
    if root is None:
        raise RuntimeError('Could not find project root containing src/__init__.py')

    root_entry = str(root)
    if root_entry not in sys.path:
        # Front of the list so this checkout wins over any installed copy.
        sys.path.insert(0, root_entry)
    print('✅ Project root:', root)
    return root
# Resolve the root now so the `from src...` imports in the following cells can
# succeed; the returned path is kept in _PROJECT_ROOT.
_PROJECT_ROOT = ensure_project_root()

# Week 3 — Univariate Z-Score Outlier Detection on `wrong_fragment`

We keep only the rows labelled `normal` plus a single target attack class: `teardrop` if it is present, otherwise the rarest non-`normal` label. We then compute z-scores for `wrong_fragment` (using the mean and standard deviation of this whole subset), plot their density, and evaluate a z-score threshold with a confusion matrix.

In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.neighbors import KernelDensity
from src.utils import RANDOM_STATE, DATA_RAW, FIGURES, ensure_dir
from src.io import load_nsl_kdd_raw

# Input file and figure output directory, both derived from the constants
# imported from src.utils.
RAW_FILE = DATA_RAW / 'NSL-KDD.raw'
# NOTE(review): ensure_dir presumably creates the directory if needed and
# returns its path — confirm in src.utils.
FIG_DIR = ensure_dir(FIGURES)

print('Raw file:', RAW_FILE)
print('Figures dir:', FIG_DIR)

In [None]:
# Read the raw table, then narrow it to `normal` plus one attack class.
df = load_nsl_kdd_raw(RAW_FILE)
labels = df['label'].astype(str).unique().tolist()
if 'wrong_fragment' not in df.columns:
    raise ValueError('The feature wrong_fragment is not present in this file.')

# Anomaly class: prefer 'teardrop'; otherwise fall back to the least frequent
# label other than 'normal'.
if 'teardrop' in labels:
    target_attack = 'teardrop'
    print("Using 'teardrop' as anomaly class.")
else:
    vc = df['label'].value_counts()
    vc = vc[vc.index != 'normal']
    if vc.empty:
        raise ValueError('No attack labels present in this file. Cannot build normal vs anomaly subset.')
    target_attack = vc.sort_values(ascending=True).index[0]
    print(f"⚠️ Using rarest available attack as anomaly class: '{target_attack}'.")

# Keep the two classes of interest, force the feature to numeric (values that
# cannot be parsed become 0.0), and add a binary target: 1 = attack, 0 = normal.
df_sub = df.loc[df['label'].isin(['normal', target_attack])].assign(
    wrong_fragment=lambda frame: pd.to_numeric(frame['wrong_fragment'], errors='coerce').fillna(0.0),
    y=lambda frame: frame['label'].eq(target_attack).astype(int),
)
y_all = df_sub['y'].to_numpy()
print('Counts (normal=0, anomaly=1):', df_sub['y'].value_counts().to_dict())

In [None]:
# Standardise wrong_fragment with the mean and population standard deviation
# (ddof=0) taken over the whole normal+anomaly subset.
wrong_fragment_values = df_sub['wrong_fragment']
mu = float(wrong_fragment_values.mean())
sd = float(wrong_fragment_values.std(ddof=0))
if not sd > 0:
    # Zero (or NaN) spread: substitute a tiny positive value so the division
    # below stays defined.
    sd = 1e-9

z = np.nan_to_num(
    (wrong_fragment_values.to_numpy() - mu) / sd,
    nan=0.0,
    posinf=0.0,
    neginf=0.0,
)
print('Mean/Std:', mu, sd)

In [None]:
# Gaussian kernel density estimate of the z-scores, drawn on a fixed z-range.
z2d = z.reshape(-1, 1)  # column vector: one sample per row for KernelDensity

# Bandwidth: 1.06 * std * n^(-1/5), with 0.3 as the fallback when that value
# is not a finite positive number.
n_obs = len(z2d)
std_z = float(np.std(z2d))
shrink = n_obs ** (-1/5) if n_obs > 1 else 1.0
bw = 1.06 * std_z * shrink
if not (np.isfinite(bw) and bw > 0):
    bw = 0.3

kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(z2d)
z_lo, z_hi = -5.0, 15.0
xs = np.linspace(z_lo, z_hi, 800)[:, None]
dens = np.exp(kde.score_samples(xs))  # score_samples returns log-density

fig_density, ax_density = plt.subplots(figsize=(6,4))
ax_density.plot(xs, dens)
ax_density.set_xlabel('z-score')
ax_density.set_ylabel('Density')
ax_density.set_title('Density plot of z-scores')
ax_density.set_xlim(z_lo, z_hi)
fig_density.tight_layout()

out_density = FIG_DIR / 'week3_zscore_density_wrong_fragment.png'
fig_density.savefig(out_density, dpi=150)
plt.show()
print('Saved density plot to:', out_density)

In [None]:
# Threshold and confusion matrix over the full subset.
#
# NOTE: the threshold is picked using the ground-truth labels (y_all) and is
# then scored on the same rows, so the numbers below describe how separable
# the two classes are on this feature, not out-of-sample performance.
DEFAULT_Z_THRESHOLD = 3.0   # starting value, and floor when the classes overlap
NORMAL_PERCENTILE = 99.5    # percentile of normal z-scores used when the classes overlap
CLASS_IDS = [0, 1]          # 0 = normal, 1 = anomaly; shared by every metric below

z_norm = z[y_all == 0]
z_anom = z[y_all == 1]
thr = DEFAULT_Z_THRESHOLD
if z_norm.size and z_anom.size:
    max_norm = float(np.max(z_norm))
    min_anom = float(np.min(z_anom))
    if min_anom > max_norm:
        # Every anomaly lies above every normal row: cut halfway between them.
        thr = 0.5 * (max_norm + min_anom)
    else:
        # Classes overlap: use the high percentile of the normal rows, but
        # never go below the default threshold.
        thr = max(DEFAULT_Z_THRESHOLD, float(np.percentile(z_norm, NORMAL_PERCENTILE)))
y_pred = (z >= thr).astype(int)

# The same explicit label list goes to both metrics so the matrix layout and
# the report rows always agree, even if one class is missing from y_pred.
cm = confusion_matrix(y_all, y_pred, labels=CLASS_IDS)
print('Threshold (z >=):', thr)
print('Confusion matrix [[TN, FP],[FN, TP]]:\n', cm)
print('Classification report:\n', classification_report(
    y_all, y_pred, labels=CLASS_IDS, target_names=['normal','anomaly'], zero_division=0))

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['0','1'])
fig, ax = plt.subplots(figsize=(5,4))
disp.plot(ax=ax, cmap='YlGnBu', colorbar=True, values_format='d')
# Set the title on the matrix axes explicitly rather than via pyplot's
# "current axes", which is ambiguous once a colorbar axes has been added.
ax.set_title('Confusion matrix — Z-score on wrong_fragment')
fig.tight_layout()
out_cm = FIG_DIR / 'week3_confusion_matrix_zscore_wrong_fragment_fullset.png'
fig.savefig(out_cm, dpi=150)
plt.show()
print('Saved confusion matrix to:', out_cm)