In [34]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

# Paths
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
RAW_DIR = os.path.join(BASE_DIR, 'data', 'raw')
PROC_DIR = os.path.join(BASE_DIR, 'data', 'processed')
os.makedirs(PROC_DIR, exist_ok=True)

In [35]:
# 1. Load raw training data
train_path = os.path.join(RAW_DIR, 'UNSW_NB15_training-set.csv')
df = pd.read_csv(train_path)


In [36]:
# 2. Drop integrity columns if present
df = df.drop(columns=[c for c in ['row_hash', 'attack_cat'] if c in df.columns])

In [37]:
# 3. Separate features and label
X = df.drop(columns=['label'])
y = df['label']

In [38]:
# 4. Train/test split
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [39]:
# 5. Define feature groups
# Continuous ratelike features
continuous = ['dur', 'rate', 'sload']
# Count-like and depth features
counts = ['spkts','dpkts','sbytes','dbytes','trans_depth','response_body_len']
# Window counts
window_ct = ['ct_src_dport_ltm','ct_dst_sport_ltm']
# Numeric other
numeric_other = ['ct_ftp_cmd']
# Categorical
categorical = ['proto','service','state','ct_flw_http_mthd']
# Binary flags
binary = ['is_ftp_login','is_sm_ips_ports']

In [40]:
# 6. Build preprocessing pipelines
log_tf = FunctionTransformer(np.log1p, validate=False)
cont_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])
cnt_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('log',   log_tf),
    ('scale', StandardScaler())
])
cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Remove sparse parameter
])
bin_pipe = Pipeline([('passthrough', 'passthrough')])

preprocessor = ColumnTransformer([
    ('cont', cont_pipe, continuous),
    ('cnt',  cnt_pipe, counts + window_ct + numeric_other),
    ('cat',  cat_pipe, categorical),
    ('bin',  bin_pipe, binary)
])

In [41]:

# 7. Fit and transform training data
# Ensure we only pass the expected feature columns in the correct order
feature_cols = continuous + counts + window_ct + numeric_other + categorical + binary
X_train_proc = preprocessor.fit_transform(X_train_raw[feature_cols])

In [42]:
# 8. Balance training set if needed Balance training set if needed
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train_proc, y_train)

In [43]:
# 9. Fit PCA to reduce dimensionality
pca = PCA(n_components=10, random_state=42)  # Specify exact number of components
X_train_pca = pca.fit_transform(X_train_bal)

In [44]:
# 10. Persist artifacts
joblib.dump(preprocessor, os.path.join(PROC_DIR, 'preprocessor.pkl'))
joblib.dump(pca,         os.path.join(PROC_DIR, 'pca.pkl'))

['c:\\Users\\dorai\\OneDrive\\Documents\\Documents\\SEM6\\Computer Security\\Project_cs\\IDS-binary-classification\\data\\processed\\pca.pkl']

In [45]:
# 11. Optionally, transform test set and save for Module 3
df_test_proc = preprocessor.transform(X_test_raw)
np.save(os.path.join(PROC_DIR, 'X_test_proc.npy'), df_test_proc)
pd.DataFrame({'label': y_test}).to_csv(os.path.join(PROC_DIR, 'y_test.csv'), index=False)

print("Module 2 complete: preprocessor.pkl and pca.pkl saved.")

Module 2 complete: preprocessor.pkl and pca.pkl saved.


In [46]:
import sklearn
print(sklearn.__version__)

1.6.1
