In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
import os


# --- 1) Load raw NSL-KDD data ---
# Locations of the two dataset splits, relative to the working directory.
train_path = "../data/nsl-kdd/KDDTrain+.txt"
test_path = "../data/nsl-kdd/KDDTest+.txt"

# The files are parsed without a header row, so columns are labelled 0..n-1.
df_train = pd.read_csv(filepath_or_buffer=train_path, header=None)
df_test = pd.read_csv(filepath_or_buffer=test_path, header=None)

# When a frame has 43 columns, keep only the first 42 and discard the last.
# NOTE(review): the trailing column is treated as disposable here; in the
# published NSL-KDD files it may be a difficulty score rather than an empty
# column -- confirm against the actual data.
df_train = df_train.iloc[:, :42] if len(df_train.columns) == 43 else df_train
df_test = df_test.iloc[:, :42] if len(df_test.columns) == 43 else df_test

def split_X_y(df):
    """Split a frame into its feature columns and its label column.

    The last column is taken as the label; every column before it is a
    feature.

    Args:
        df: DataFrame whose final column holds the label.

    Returns:
        A ``(X, y)`` tuple. ``X`` is an independent copy of all columns but
        the last. ``y`` is the last column converted to strings with
        surrounding whitespace stripped.
    """
    label_pos = df.shape[1] - 1
    features = df.iloc[:, :label_pos].copy()
    labels = df.iloc[:, label_pos].astype(str).str.strip()
    return features, labels

# Separate features from labels for each split.
X_train_raw, y_train_raw = split_X_y(df_train)
X_test_raw, y_test_raw = split_X_y(df_test)

# Merge train + test: train rows first, then test rows, with a fresh
# 0..n-1 row index.
X_all = pd.concat([X_train_raw, X_test_raw], ignore_index=True)
y_all = pd.concat([y_train_raw, y_test_raw], ignore_index=True)

# --- 2) Convert label → binary (normal=0 / attack=1) ---
# A label counts as normal when it contains "normal", ignoring case;
# every other label is treated as an attack.
y_all_binary = y_all.str.lower().map(lambda label: 0 if "normal" in label else 1)

# --- 3) Assign NSL-KDD feature names ---
# One name per feature column, in file order (41 names in total).
kdd_columns = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
]
# Replaces the positional integer labels; pandas raises if the number of
# names does not match the number of columns in X_all.
X_all.columns = kdd_columns

# --- 4) Select the 18 features for training ---
# "service" and "flag" are the two categorical columns that get one-hot
# encoded later in this notebook.
selected_features = [
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "logged_in",
    "count",
    "serror_rate",
    "srv_serror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
]

# Work on an independent copy so later steps never touch X_all.
X_sel = X_all.loc[:, selected_features].copy()

# --- 5) One-hot encoding (categorical to numeric) ---
# drop_first=True drops one dummy level for each of the two categorical
# columns.
X_sel = pd.get_dummies(X_sel, columns=["service","flag"], drop_first=True)

# --- 6) Convert everything to numeric ---
# Values that cannot be parsed as numbers become NaN and are then replaced
# with 0.0.
X_sel = X_sel.apply(pd.to_numeric, errors='coerce').fillna(0.0)

# --- 7) Scale features ---
# NOTE(review): X_sel holds the merged train + test rows, so the scaler's
# statistics are computed over both splits. Confirm this is intended if the
# test split is later used as a held-out evaluation set.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_sel)

# Save the scaler and the encoded column order for Notebook 2.
# Create the output directory first: joblib.dump does not create missing
# parent directories, so a fresh checkout without ../models would fail here.
os.makedirs("../models", exist_ok=True)
joblib.dump(scaler, "../models/scaler.save")
joblib.dump(X_sel.columns.tolist(), "../models/columns.save")


# --- 8) Save data for Notebook 2 ---
# Scaled feature matrix and the 0/1 label vector, written as .npy files.
np.save("../data/X.npy", X_scaled)
np.save("../data/y.npy", y_all_binary.to_numpy())

# Status report, one message per output line.
print(
    "✔ Preprocessing done!",
    "Saved:",
    " -> data/X.npy",
    " -> data/y.npy",
    " -> models/scaler.save",
    sep="\n",
)


✔ Preprocessing done!
Saved:
 -> data/X.npy
 -> data/y.npy
 -> models/scaler.save
