In [None]:
# Ensure the project root is on sys.path so `import src.*` works.
from pathlib import Path
import sys


def find_project_root(start: Path, marker: str = 'src') -> Path:
    """Return the directory that contains the `marker` sub-directory.

    The parent of `start` is checked first (the usual layout, with the
    notebook living one level below the project root), then `start` itself
    (kernel launched from the project root), then the remaining ancestors.

    Parameters:
        start: Directory to search from, typically the resolved working directory.
        marker: Name of the sub-directory that identifies the project root.

    Returns:
        The first candidate containing `marker`; `start.parent` when none does.
    """
    candidates = [start.parent, start, *start.parent.parents]
    for candidate in candidates:
        if (candidate / marker).is_dir():
            return candidate
    # Nothing matched: fall back to the parent directory.
    return start.parent


PROJECT_ROOT = find_project_root(Path.cwd().resolve())
if str(PROJECT_ROOT) not in sys.path:
    # Prepend so the project's own `src` package wins over same-named packages.
    sys.path.insert(0, str(PROJECT_ROOT))
print('Using project root:', PROJECT_ROOT)
print('In sys.path:', str(PROJECT_ROOT) in sys.path)

# Week 1 — Data Preparation (NSL-KDD)

This notebook loads, validates, encodes, scales, splits, and saves the NSL-KDD dataset using the code in `src/`.

In [None]:
# Import project modules.
import pandas as pd
from src.utils import set_all_seeds, RANDOM_STATE, DATA_RAW, DATA_PROCESSED, ensure_dir
from src.io import load_nsl_kdd_raw, save_numpy, save_joblib
from src.prep import fit_transform_split

In [None]:
# Set seeds for reproducibility, before any data is loaded or split.
# RANDOM_STATE is the shared constant from src.utils; the same value is later
# passed to fit_transform_split.
# NOTE(review): which libraries `set_all_seeds` actually seeds is defined in
# src.utils and not visible here — confirm there.
set_all_seeds(RANDOM_STATE)

In [None]:
# Resolve the input file and the output directory used by the rest of the notebook.
raw_file = DATA_RAW / 'NSL-KDD.raw'
# NOTE(review): ensure_dir presumably creates the directory if needed and returns it — confirm in src.utils.
processed_dir = ensure_dir(DATA_PROCESSED)
for caption, location in (('Raw file:', raw_file), ('Processed output:', processed_dir)):
    print(caption, location)

In [None]:
# Read the raw NSL-KDD file, then confirm the columns later cells rely on are present.
df = load_nsl_kdd_raw(raw_file)
print('Shape:', df.shape)
required_columns = ('protocol_type', 'service', 'flag', 'label')
# `c` is the first required column absent from the frame, or None when all are present.
c = next((name for name in required_columns if name not in df.columns), None)
assert c is None, f'Missing column: {c}'
print('Verified schema successfully.')

In [None]:
# Coerce every non-categorical, non-label column to a numeric dtype.
# Values that cannot be parsed become NaN (errors='coerce') instead of raising.
categorical = ['protocol_type', 'service', 'flag']
non_numeric = categorical + ['label']
numeric = [col for col in df.columns if col not in non_numeric]
for col in numeric:
    df[col] = pd.to_numeric(df[col], errors='coerce')
# Total NaN count across the numeric columns after coercion.
nan_per_column = df[numeric].isna().sum()
print('Missing numeric values:', int(nan_per_column.sum()))

In [None]:
# Show the ten most frequent labels (value_counts sorts by descending count).
label_counts = df['label'].value_counts()
print(label_counts.head(10))

In [None]:
# Split the frame, fit the preprocessor, and transform both partitions in one call.
# `arts` carries the fitted artifacts; its `preprocessor` attribute is saved later.
X_train, X_test, y_train, y_test, arts = fit_transform_split(df, random_state=RANDOM_STATE)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)
# Report per-class counts for each partition as plain-int dictionaries.
for caption, targets in (('Train classes:', y_train), ('Test classes:', y_test)):
    class_counts = pd.Series(targets).value_counts()
    print(caption, {cls: int(count) for cls, count in class_counts.items()})

In [None]:
# Persist the four arrays and the fitted preprocessor under processed_dir.
arrays_by_filename = {
    'X_train.npy': X_train,
    'X_test.npy': X_test,
    'y_train.npy': y_train,
    'y_test.npy': y_test,
}
for filename, array in arrays_by_filename.items():
    save_numpy(array, processed_dir / filename)
save_joblib(arts.preprocessor, processed_dir / 'preprocessor.joblib')
print('All processed artifacts saved to:', processed_dir)

In [None]:
# Announce completion and list the expected output paths.
# This only prints the paths; it does not check that the files exist.
print('Data preparation complete.')
artifact_names = ('X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy', 'preprocessor.joblib')
artifact_paths = [processed_dir / name for name in artifact_names]
for path in artifact_paths:
    print('  -', path)