In [None]:

# Detect project root (folder containing src/) and add it to sys.path.
from pathlib import Path
import sys

def get_project_root():
    """Locate the project root by walking upward from the working directory.

    The current working directory is resolved to an absolute path; it and each
    of its ancestors are then checked, nearest first, for ``src/__init__.py``.

    Returns:
        Path: the first directory in that walk containing ``src/__init__.py``.

    Raises:
        RuntimeError: if no directory in the walk contains ``src/__init__.py``.
    """
    start = Path.cwd().resolve()
    marker = Path('src', '__init__.py')
    # The walk starts at the working directory itself, so the nearest match wins.
    located = next(
        (folder for folder in (start, *start.parents) if (folder / marker).exists()),
        None,
    )
    if located is None:
        raise RuntimeError('Could not find project root containing src/__init__.py')
    return located

# Put the detected root at the front of sys.path (only if it is not already
# listed) so the `src` package located under it can be imported.
ROOT = get_project_root()
if not any(entry == str(ROOT) for entry in sys.path):
    sys.path.insert(0, str(ROOT))
print('✅ Project root:', ROOT)


In [None]:

# Import utilities and define canonical paths.
from pathlib import Path
import numpy as np
import pandas as pd

from src.utils import DATA_RAW, FIGURES, ensure_dir, RANDOM_STATE
from src.io import load_nsl_kdd_raw, save_joblib
from src.prep import build_preprocessor

# Input file and output folder used by the rest of the notebook.
RAW_FILE = DATA_RAW / 'NSL-KDD.raw'
DATA_PROCESSED = ROOT.joinpath('data', 'processed')

# NOTE(review): ensure_dir presumably creates the directory when missing and
# returns its path — confirm in src.utils. Its result for FIGURES is kept as
# FIG_DIR; its result for DATA_PROCESSED is not used.
FIG_DIR = ensure_dir(FIGURES)
ensure_dir(DATA_PROCESSED)

# Echo the resolved locations.
print('RAW_FILE:', RAW_FILE)
print('DATA_PROCESSED:', DATA_PROCESSED)
print('FIG_DIR:', FIG_DIR)


In [None]:

# Load raw dataset and validate required columns.
df = load_nsl_kdd_raw(RAW_FILE)

# Fail fast if the columns the later cells rely on are absent: the target
# column first, then each categorical feature in turn.
required_categoricals = ('protocol_type', 'service', 'flag')
assert 'label' in df.columns, 'The raw file must include the label column.'
for name in required_categoricals:
    assert name in df.columns, f'Missing categorical feature: {name}'

n_rows, n_cols = df.shape
print('Rows:', n_rows, '| Columns:', n_cols)
# Bare last expression so the notebook renders the first three rows.
df.head(3)


In [None]:

# Normalize labels and coerce numerics. Keep categoricals as-is.
# Cast labels to str and strip trailing '.' characters so that a value such as
# 'normal.' matches the 'normal' key of family_map below.
df['label'] = df['label'].astype(str).str.rstrip('.')

categorical = ['protocol_type','service','flag']
# 'family' is derived at the end of this cell. Excluding it here keeps a re-run
# of the cell from pushing that derived text column through numeric coercion.
non_numeric = set(categorical) | {'label', 'family'}
numeric = [c for c in df.columns if c not in non_numeric]

# Values that cannot be parsed as numbers become NaN and are then set to 0.0.
for c in numeric:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df[numeric] = df[numeric].fillna(0.0)

# Map attack labels to high-level family.
# 'mailbomb' is a DoS attack that occurs in NSL-KDD test data; without an entry
# here its rows would be assigned to 'other'.
family_map = {
    'normal':'normal','neptune':'dos','smurf':'dos','back':'dos','pod':'dos','teardrop':'dos','land':'dos','apache2':'dos','udpstorm':'dos','processtable':'dos','worm':'dos','mailbomb':'dos',
    'ipsweep':'probe','nmap':'probe','portsweep':'probe','satan':'probe','mscan':'probe','saint':'probe',
    'guess_passwd':'r2l','ftp_write':'r2l','imap':'r2l','phf':'r2l','multihop':'r2l','warezmaster':'r2l','warezclient':'r2l','spy':'r2l','xlock':'r2l','xsnoop':'r2l','snmpguess':'r2l','snmpgetattack':'r2l','httptunnel':'r2l','named':'r2l','sendmail':'r2l',
    'buffer_overflow':'u2r','loadmodule':'u2r','perl':'u2r','rootkit':'u2r','ps':'u2r','sqlattack':'u2r','xterm':'u2r'
}

# Labels missing from family_map still fall back to 'other', but are reported
# so that a gap in the mapping is visible instead of silent.
unmapped_labels = sorted(set(df['label']) - set(family_map))
if unmapped_labels:
    print("⚠️ Labels without a family mapping (assigned to 'other'):", unmapped_labels)

df['family'] = df['label'].map(family_map).fillna('other')
print(df['family'].value_counts().to_dict())


In [None]:

# Stratified train/test split by family.
from sklearn.model_selection import train_test_split

# The target is the attack family; features are everything except the raw
# label and the family derived from it.
y = df['family']
X = df.drop(columns=['label', 'family'])

# 80/20 split, seeded with RANDOM_STATE and stratified on the family so both
# parts keep the same class proportions.
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

print('Train/Test:', X_train_df.shape, X_test_df.shape)
print('Train families:', y_train.value_counts().to_dict())
print('Test families:', y_test.value_counts().to_dict())


In [None]:

# Build ColumnTransformer and fit-transform.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical = ['protocol_type','service','flag']
# Every remaining feature column is treated as numeric and standardized.
numeric = [c for c in X_train_df.columns if c not in categorical]

# Categorical features are one-hot encoded with every level kept. With
# handle_unknown='ignore' a category unseen during fit is encoded as all zeros;
# if a level were dropped (drop='first'), that all-zero row would be
# indistinguishable from the dropped level, so no level is dropped here.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical),
    ('num', StandardScaler(with_mean=True, with_std=True), numeric),
], remainder='drop', verbose_feature_names_out=False)

# Fit on the training split only; the test split is transformed with the
# categories and scaling statistics learned from the training data.
X_train = preprocessor.fit_transform(X_train_df)
X_test  = preprocessor.transform(X_test_df)
print('X_train shape:', X_train.shape, '| X_test shape:', X_test.shape)


In [None]:

# Save artifacts to data/processed with absolute repo path.
# Feature matrices are written exactly as produced by the preprocessor.
np.save(DATA_PROCESSED / 'X_train.npy', X_train)
np.save(DATA_PROCESSED / 'X_test.npy',  X_test)

# The family labels are Python strings, so Series.to_numpy() yields an
# object-dtype array. np.save pickles object arrays, and such files cannot be
# read back with np.load(..., allow_pickle=False). Casting to a fixed-width
# unicode dtype stores plain text that loads without pickle.
np.save(DATA_PROCESSED / 'y_train.npy', y_train.to_numpy().astype(np.str_))
np.save(DATA_PROCESSED / 'y_test.npy',  y_test.to_numpy().astype(np.str_))

# Persist the fitted transformer next to the arrays it produced.
from joblib import dump
dump(preprocessor, DATA_PROCESSED / 'preprocessor.joblib')
print('✅ Saved: X_train.npy, X_test.npy, y_train.npy, y_test.npy, preprocessor.joblib')


In [None]:

# Demonstrate robust loading (fixes 'Invalid argument' errors).
# Read the saved test arrays back with pickle loading disabled.
# NOTE(review): np.load with allow_pickle=False rejects object-dtype arrays, so
# both files must hold a plain (non-object) dtype for this cell to succeed.
Xt = np.load(DATA_PROCESSED.joinpath('X_test.npy'), allow_pickle=False)
yt = np.load(DATA_PROCESSED.joinpath('y_test.npy'), allow_pickle=False)
print('Loaded shapes:', Xt.shape, yt.shape)
