In [13]:
"""
Module 3: Binary Model Training & Evaluation (Single-Pipeline Bundling)

Steps:
1. Load the raw training CSV, drop integrity columns (row_hash, attack_cat), and split into train/test
2. Load and apply preprocessing + PCA artifacts
3. Train multiple classifiers with hyperparameter tuning
4. Evaluate each on train/validation and test sets
5. Assemble a final VotingClassifier ensemble
6. Bundle into a single Pipeline: preprocessor → PCA → ensemble
7. Save all individual models and the full pipeline
"""
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier

# Filesystem layout: every location is resolved from the parent of the
# current working directory.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
RAW_PATH = os.path.join(BASE_DIR, 'data', 'raw', 'UNSW_NB15_training-set.csv')
PROC_DIR = os.path.join(BASE_DIR, 'data', 'processed')
MODELS_DIR = os.path.join(BASE_DIR, 'src', 'models')

# Make sure the model output folder exists before anything is saved into it.
os.makedirs(MODELS_DIR, exist_ok=True)

In [14]:
# 1) Read the raw CSV and discard the integrity columns when they are present
#    (errors='ignore' skips any of them that the file does not contain).
df = pd.read_csv(RAW_PATH).drop(columns=['row_hash', 'attack_cat'], errors='ignore')

# Target is the 'label' column; every remaining column is a feature.
y = df['label']
X = df.drop(columns=['label'])

In [15]:
# 2) Hold out 20% of the rows for testing; stratifying on y keeps the class
#    ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

In [16]:
# 3) Restore the fitted preprocessor and PCA objects from disk
preprocessor_path = os.path.join(PROC_DIR, 'preprocessor.pkl')
pca_path = os.path.join(PROC_DIR, 'pca.pkl')
preprocessor = joblib.load(preprocessor_path)
pca = joblib.load(pca_path)

# Run both splits through preprocessor -> PCA. Only transform() is called,
# so nothing is re-fitted in this cell.
X_train_proc, X_test_proc = (
    preprocessor.transform(part) for part in (X_train_raw, X_test_raw)
)
X_train_pca, X_test_pca = (
    pca.transform(part) for part in (X_train_proc, X_test_proc)
)

In [17]:
# 4) Define models & search spaces
# Base estimators, keyed by the short name that is reused for the search
# grids, the log lines and the saved file names.
models = {
    'lr': LogisticRegression(solver='liblinear', random_state=42),
    'rf': RandomForestClassifier(n_jobs=-1, random_state=42),
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports 'Parameters: { "use_label_encoder" } are not used.' on every fit.
    'xgb': xgb.XGBClassifier(eval_metric='logloss', n_jobs=-1, random_state=42),
    'lgb': LGBMClassifier(n_jobs=-1, random_state=42)
}
# Candidate hyperparameter values per model (same keys as `models`).
grids = {
    'lr': {'C': [0.01, 0.1, 1, 10]},
    'rf': {'n_estimators': [100,200], 'max_depth': [None,10,20]},
    'xgb': {'n_estimators':[100,200], 'learning_rate':[0.05,0.1], 'max_depth':[3,6]},
    'lgb': {'n_estimators':[100,200], 'learning_rate':[0.05,0.1], 'num_leaves':[31,63]}
}

In [18]:
# 5) Hyperparameter tuning
# One randomized search per model, scored by ROC-AUC with 3-fold stratified CV.
# The refit best estimator is kept in `best_models` and also saved to MODELS_DIR.
N_SEARCH_ITER = 5  # upper bound on sampled candidates per model
best_models = {}
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
for name, clf in models.items():
    param_space = grids[name]
    # Every search space in `grids` is a finite grid of lists. Never ask for
    # more candidates than the grid holds (the 'lr' grid has only 4), otherwise
    # scikit-learn emits a warning and trims the search on its own.
    space_size = int(np.prod([len(values) for values in param_space.values()]))
    search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_space,
        n_iter=min(N_SEARCH_ITER, space_size),
        cv=skf,
        scoring='roc_auc',
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train_pca, y_train)
    best = search.best_estimator_
    best_models[name] = best
    print(f"{name} best params: {search.best_params_}, AUC: {search.best_score_:.4f}")
    joblib.dump(best, os.path.join(MODELS_DIR, f"{name}_model.pkl"))



lr best params: {'C': 10}, AUC: 0.8802
rf best params: {'n_estimators': 200, 'max_depth': 20}, AUC: 0.9897


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgb best params: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1}, AUC: 0.9890
[LightGBM] [Info] Number of positive: 95472, number of negative: 44800
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006705 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 140272, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.680621 -> initscore=0.756625
[LightGBM] [Info] Start training from score 0.756625
lgb best params: {'num_leaves': 63, 'n_estimators': 200, 'learning_rate': 0.1}, AUC: 0.9897


In [19]:
# 6) Report held-out test metrics for every tuned model
for model_name, fitted_model in best_models.items():
    y_pred = fitted_model.predict(X_test_pca)
    # Column 1 of predict_proba is used as the score for ROC-AUC.
    y_proba = fitted_model.predict_proba(X_test_pca)[:, 1]

    header = f"\n=== {model_name.upper()} Evaluation ==="
    report = classification_report(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    matrix = confusion_matrix(y_test, y_pred)

    print(header)
    print(report)
    print(f"ROC-AUC: {auc:.4f}")
    print("Confusion Matrix:\n", matrix)


=== LR Evaluation ===
              precision    recall  f1-score   support

           0       0.66      0.74      0.70     11200
           1       0.87      0.82      0.84     23869

    accuracy                           0.79     35069
   macro avg       0.76      0.78      0.77     35069
weighted avg       0.80      0.79      0.80     35069

ROC-AUC: 0.8816
Confusion Matrix:
 [[ 8301  2899]
 [ 4361 19508]]

=== RF Evaluation ===
              precision    recall  f1-score   support

           0       0.94      0.90      0.92     11200
           1       0.95      0.97      0.96     23869

    accuracy                           0.95     35069
   macro avg       0.95      0.94      0.94     35069
weighted avg       0.95      0.95      0.95     35069

ROC-AUC: 0.9906
Confusion Matrix:
 [[10096  1104]
 [  664 23205]]

=== XGB Evaluation ===
              precision    recall  f1-score   support

           0       0.95      0.88      0.91     11200
           1       0.94      0.98  




=== LGB Evaluation ===
              precision    recall  f1-score   support

           0       0.94      0.89      0.91     11200
           1       0.95      0.97      0.96     23869

    accuracy                           0.95     35069
   macro avg       0.95      0.93      0.94     35069
weighted avg       0.95      0.95      0.95     35069

ROC-AUC: 0.9903
Confusion Matrix:
 [[ 9970  1230]
 [  627 23242]]


In [20]:
# 7) Soft-voting ensemble over all four tuned models
ensemble_path = os.path.join(MODELS_DIR, 'voting_ensemble.pkl')
ensemble = VotingClassifier(
    estimators=list(best_models.items()),  # (name, estimator) pairs
    voting='soft',
    n_jobs=-1,
)
ensemble.fit(X_train_pca, y_train)
joblib.dump(ensemble, ensemble_path)

['c:\\Users\\dorai\\OneDrive\\Documents\\Documents\\SEM6\\Computer Security\\Project_cs\\IDS-binary-classification\\src\\models\\voting_ensemble.pkl']

In [21]:
# Held-out test metrics for the voting ensemble
y_e = ensemble.predict(X_test_pca)
p_e = ensemble.predict_proba(X_test_pca)[:, 1]  # column 1 is the ROC-AUC score

ensemble_report = classification_report(y_test, y_e)
ensemble_auc = roc_auc_score(y_test, p_e)
ensemble_matrix = confusion_matrix(y_test, y_e)

print("\n=== ENSEMBLE Evaluation ===")
print(ensemble_report)
print(f"ROC-AUC: {ensemble_auc:.4f}")
print("Confusion Matrix:\n", ensemble_matrix)




=== ENSEMBLE Evaluation ===
              precision    recall  f1-score   support

           0       0.95      0.88      0.91     11200
           1       0.95      0.98      0.96     23869

    accuracy                           0.95     35069
   macro avg       0.95      0.93      0.94     35069
weighted avg       0.95      0.95      0.95     35069

ROC-AUC: 0.9890
Confusion Matrix:
 [[ 9846  1354]
 [  507 23362]]




In [22]:
!pip install skl2onnx onnxruntime





[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
# ONNX export of the full pipeline (preprocessor -> PCA -> ensemble).
# skl2onnx is installed by the preceding cell, so its imports stay in this cell.
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Resolve artifacts through MODELS_DIR / RAW_PATH like the rest of the notebook;
# the previous cwd-relative paths ("src/models/...", "data/raw/...") pointed at a
# different directory than the one the models were saved to.
FULL_PIPE_PATH = os.path.join(MODELS_DIR, 'ids_full_pipeline.pkl')
ONNX_PATH = os.path.join(MODELS_DIR, 'ids_full_pipeline.onnx')

# Load the bundled sklearn Pipeline. If it has not been written yet, assemble it
# from the fitted objects already in memory and save it next to the other models.
if os.path.exists(FULL_PIPE_PATH):
    full_pipe = joblib.load(FULL_PIPE_PATH)
else:
    full_pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('pca', pca),
        ('ensemble', ensemble),
    ])
    joblib.dump(full_pipe, FULL_PIPE_PATH)

# Determine number of raw features (p). Only the CSV header is read (nrows=0),
# so the data is not loaded a second time and `df` is left untouched.
raw_columns = pd.read_csv(RAW_PATH, nrows=0).columns
feature_cols = [c for c in raw_columns if c not in ("label", "attack_cat", "row_hash")]
p = len(feature_cols)

# Convert to ONNX using one vector input named "input" with p columns.
# NOTE(review): a single float tensor assumes every raw feature column is
# numeric — confirm against what the preprocessor expects as input.
initial_types = [("input", FloatTensorType([None, p]))]
onnx_model = convert_sklearn(
    full_pipe,
    initial_types=initial_types,
    options={id(full_pipe): {"zipmap": False}}
)
with open(ONNX_PATH, "wb") as f:
    f.write(onnx_model.SerializeToString())


ImportError: cannot import name 'split_complex_to_pairs' from 'onnx.helper' (c:\Users\dorai\OneDrive\Documents\Documents\SEM6\Computer Security\Project_cs\csproject\Lib\site-packages\onnx\helper.py)