# Week 01 — Problem Definition & Data Preparation

In [1]:
import os
import sys

# Make the project root importable so that `src.*` resolves.
# This relies on the kernel's working directory being notebooks/ (one level
# below the project root); ".." is resolved against the current directory.
project_root = os.path.abspath("..")
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.utils import set_global_seed, Paths

# Smoke test: call the seeding helper with its default argument and show the
# Paths class to confirm the project package imported correctly.
set_global_seed()
print("Import OK.", Paths)


Import OK. <class 'src.utils.Paths'>


In [2]:
# Environment setup: imports, reproducibility seed, and project directories.
from pathlib import Path

import joblib
import pandas as pd

from src.utils import set_global_seed, Paths
from src.io import load_raw_nsl_kdd, map_attack_family, save_numpy
from src.prep import split_and_fit

# Single seed used for this notebook's run.
SEED = 42
set_global_seed(SEED)

# `paths` exposes the project folders used below (data_raw, data_proc).
# NOTE(review): ensure() presumably creates missing folders — confirm in src.utils.
paths = Paths().ensure()


In [3]:
# Read the raw dataset file and derive the attack-family labelling from it.
raw_path = paths.data_raw / "NSL-KDD.raw"
print(raw_path)  # shows the fully resolved location of the input file

# Load, then map in one step; the resulting frame carries a `family` column
# next to the original `label` column (see the preview below).
df = map_attack_family(load_raw_nsl_kdd(raw_path))

display(df.head())
df.shape


C:\Users\mehra\Final_Project\data\raw\NSL-KDD.raw


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,family
0,0,tcp,http,SF,181,5450,0,0,0,0,...,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal,normal
1,0,tcp,http,SF,239,486,0,0,0,0,...,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal,normal
2,0,tcp,http,SF,235,1337,0,0,0,0,...,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal,normal
3,0,tcp,http,SF,219,1337,0,0,0,0,...,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal,normal
4,0,tcp,http,SF,217,2032,0,0,0,0,...,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal,normal


(494021, 43)

In [4]:
# Tabulate the rows per attack family to make the class imbalance visible.
fam_counts = df['family'].value_counts()

# Note: despite the column name, 'percent' is a fraction of all rows in [0, 1]
# (rounded to 4 decimals), not a value out of 100.
fam_perc = fam_counts.div(len(df)).round(4)

family_table = pd.DataFrame({'count': fam_counts, 'percent': fam_perc})
display(family_table)


Unnamed: 0_level_0,count,percent
family,Unnamed: 1_level_1,Unnamed: 2_level_1
DoS,391458,0.7924
normal,97278,0.1969
Probe,4107,0.0083
R2L,1126,0.0023
U2R,52,0.0001


In [5]:
# Train/test split plus feature encoding and scaling, all delegated to
# split_and_fit. Per this notebook's own note the preprocessor is fitted on
# the training rows only; the implementation lives in src.prep.
split_result = split_and_fit(df, paths)
X_train, X_test, y_train, y_test, pre = split_result

# Show both feature-matrix shapes as the cell output.
(X_train.shape, X_test.shape)


((395216, 115), (98805, 115))

In [6]:
# Convert the family labels in y into integer class codes.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# The encoder learns its classes from the training labels only; the test
# labels are then mapped with that same fitted encoder.
# Caution: y_train / y_test are overwritten here, so re-running this cell
# without first re-running the split would re-fit the encoder on the integer
# codes instead of the original labels.
y_train, y_test = le.fit_transform(y_train), le.transform(y_test)

print("Label classes:", le.classes_)


Label classes: ['DoS' 'Probe' 'R2L' 'U2R' 'normal']


In [7]:
# Persist artifacts: the four processed arrays, the fitted preprocessor, and
# the fitted label encoder, all under paths.data_proc.
processed_arrays = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for array_name, array in processed_arrays.items():
    # One .npy file per array, named after the variable it came from.
    save_numpy(paths.data_proc / f'{array_name}.npy', array)

joblib.dump(pre, paths.data_proc / 'preprocessor.joblib')

# y_train / y_test are stored as integer codes, so the fitted encoder is saved
# as well; without it the codes cannot be mapped back to family names
# (le.classes_ / le.inverse_transform) once this kernel is gone.
joblib.dump(le, paths.data_proc / 'label_encoder.joblib')

print('Saved processed arrays and preprocessor to', paths.data_proc)


Saved processed arrays and preprocessor to C:\Users\mehra\Final_Project\data\processed


In [8]:
# Wrap-up report: dataset size, split shapes, per-class label counts for each
# split, and the total number of missing cells in the loaded frame.
n_rows, n_cols = df.shape
train_label_counts = pd.Series(y_train).value_counts()
test_label_counts = pd.Series(y_test).value_counts()
missing_per_column = df.isna().sum()

summary = dict(
    rows=n_rows,
    cols=n_cols,
    X_train_shape=X_train.shape,
    X_test_shape=X_test.shape,
    # Keys are the encoded class labels as stored in y; values are row counts.
    y_train_counts={label: int(count) for label, count in train_label_counts.items()},
    y_test_counts={label: int(count) for label, count in test_label_counts.items()},
    missing_total=int(missing_per_column.sum()),
)
summary


{'rows': 494021,
 'cols': 43,
 'X_train_shape': (395216, 115),
 'X_test_shape': (98805, 115),
 'y_train_counts': {0: 313166, 4: 77822, 1: 3285, 2: 901, 3: 42},
 'y_test_counts': {0: 78292, 4: 19456, 1: 822, 2: 225, 3: 10},
 'missing_total': 0}