# Week 1 â€” Data Preparation (NSL-KDD)

This notebook loads, validates, encodes, scales, splits, and saves the NSL-KDD dataset.

In [27]:
from pathlib import Path
import sys

def ensure_project_root():
    # Start from the current working directory of the notebook.
    here = Path.cwd().resolve()
    # Walk up until we find a folder that contains src/__init__.py.
    for parent in [here] + list(here.parents):
        src_dir = parent / "src"
        if src_dir.is_dir() and (src_dir / "__init__.py").exists():
            if str(parent) not in sys.path:
                sys.path.insert(0, str(parent))
            return parent
    raise RuntimeError("Could not find a project root containing src/__init__.py.")

_ = ensure_project_root()

In [28]:
# Import project modules.
from pathlib import Path
import pandas as pd
from src.utils import set_all_seeds, RANDOM_STATE, DATA_RAW, DATA_PROCESSED, ensure_dir
from src.io import load_nsl_kdd_raw, save_numpy, save_joblib
from src.prep import fit_transform_split

In [30]:
# Set seeds for reproducibility.
set_all_seeds(RANDOM_STATE)

In [31]:
# Define data paths.
raw_file = DATA_RAW / 'NSL-KDD.raw'
processed_dir = ensure_dir(DATA_PROCESSED)

print('Raw file:', raw_file)
print('Processed output:', processed_dir)

Raw file: C:\Users\mehra\Final_Project\data\raw\NSL-KDD.raw
Processed output: C:\Users\mehra\Final_Project\data\processed


In [32]:
# Load dataset.
df = load_nsl_kdd_raw(raw_file)
print('Shape:', df.shape)
for c in ['protocol_type', 'service', 'flag', 'label']:
    assert c in df.columns, f'Missing column: {c}'
print('Verified schema successfully.')

Shape: (494021, 42)
Verified schema successfully.


In [33]:
# Convert numeric columns safely.
categorical = ['protocol_type', 'service', 'flag']
numeric = [c for c in df.columns if c not in categorical + ['label']]
for c in numeric:
    df[c] = pd.to_numeric(df[c], errors='coerce')
print('Missing numeric values:', int(df[numeric].isna().sum().sum()))

Missing numeric values: 0


In [34]:
# Inspect label distribution.
print(df['label'].value_counts().head(10))

label
smurf.          280790
neptune.        107201
normal.          97278
back.             2203
satan.            1589
ipsweep.          1247
portsweep.        1040
warezclient.      1020
teardrop.          979
pod.               264
Name: count, dtype: int64


In [36]:
# Split data, fit preprocessor, and transform.
X_train, X_test, y_train, y_test, arts = fit_transform_split(df, random_state=RANDOM_STATE)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)
print('Train classes:', {k:int(v) for k,v in pd.Series(y_train).value_counts().items()})
print('Test classes:',  {k:int(v) for k,v in pd.Series(y_test).value_counts().items()})

Train shape: (395216, 115) Test shape: (98805, 115)
Train classes: {'other': 395216}
Test classes: {'other': 98805}


In [37]:
# Save processed arrays and preprocessor.
save_numpy(X_train, processed_dir / 'X_train.npy')
save_numpy(X_test, processed_dir / 'X_test.npy')
save_numpy(y_train, processed_dir / 'y_train.npy')
save_numpy(y_test, processed_dir / 'y_test.npy')
save_joblib(arts.preprocessor, processed_dir / 'preprocessor.joblib')

print('All processed artifacts saved to:', processed_dir)

All processed artifacts saved to: C:\Users\mehra\Final_Project\data\processed


In [38]:
# Summarize completion.
print('Data preparation complete.')
for f in ['X_train.npy','X_test.npy','y_train.npy','y_test.npy','preprocessor.joblib']:
    print('  -', processed_dir / f)

Data preparation complete.
  - C:\Users\mehra\Final_Project\data\processed\X_train.npy
  - C:\Users\mehra\Final_Project\data\processed\X_test.npy
  - C:\Users\mehra\Final_Project\data\processed\y_train.npy
  - C:\Users\mehra\Final_Project\data\processed\y_test.npy
  - C:\Users\mehra\Final_Project\data\processed\preprocessor.joblib


## Label Normalization and Robust Saving (Week 01)

In [None]:

# --- Normalize labels and save processed arrays robustly ---
import numpy as np, json
from pathlib import Path
from src.utils import ensure_dir

# Binary label mapping: normal -> 0, others -> 1
def to_binary_anomaly(y):
    y = np.asarray(y)
    try:
        u = np.unique(y)
        if set(u.tolist()).issubset({0, 1}) or set(u.astype(int).tolist()).issubset({0, 1}):
            return y.astype(np.int8, copy=False)
    except Exception:
        pass
    y_str = np.char.lower(y.astype(str))
    y_bin = np.ones_like(y_str, dtype=np.int8)
    y_bin[(y_str == "normal") | (y_str == "0")] = 0
    return y_bin

# Apply normalization
y_train_bin = to_binary_anomaly(y_train)
y_test_bin  = to_binary_anomaly(y_test)

# Sanity counts
def counts(name, arr):
    u, c = np.unique(arr, return_counts=True)
    print(f"{name} counts:", dict(zip(u.tolist(), c.tolist())))

counts("y_train_bin", y_train_bin)
counts("y_test_bin",  y_test_bin)

# Define processed dir
try:
    from src.utils import DATA_PROCESSED
    PROCESSED_DIR = Path(DATA_PROCESSED)
except Exception:
    PROCESSED_DIR = Path(r"C:\Users\mehra\Final_Project\data\processed")

ensure_dir(PROCESSED_DIR)

# Convert + save as numeric
np.save(str(PROCESSED_DIR / "X_train.npy"), np.asarray(X_train, dtype=np.float32, order="C"))
np.save(str(PROCESSED_DIR / "X_test.npy"),  np.asarray(X_test,  dtype=np.float32, order="C"))
np.save(str(PROCESSED_DIR / "y_train.npy"), y_train_bin)
np.save(str(PROCESSED_DIR / "y_test.npy"),  y_test_bin)

# Save label mapping
with open(PROCESSED_DIR / "label_mapping.json", "w") as f:
    json.dump({"scheme": "binary", "normal->0, else->1": True}, f, indent=2)

print("Saved processed arrays to:", PROCESSED_DIR.resolve())
for fpath in ["X_train.npy","X_test.npy","y_train.npy","y_test.npy"]:
    p = PROCESSED_DIR / fpath
    print(f" - {fpath}: {'OK' if p.exists() else 'MISSING'}")
