In [1]:
# 02_training_models.ipynb


"""Objective: Train, tune, and compare multiple models for intrusion detection.
Contents:
- Train baseline models: LogisticRegression, RandomForest
- Optionally: XGBoost/LightGBM, SVM, MLP
- Cross-validation and metrics
- GridSearch example and saving models
"""

'Objective: Train, tune, and compare multiple models for intrusion detection.\nContents:\n- Train baseline models: LogisticRegression, RandomForest\n- Optionally: XGBoost/LightGBM, SVM, MLP\n- Cross-validation and metrics\n- GridSearch example and saving models\n'

In [3]:
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [19]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.1.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Project locations. Set the IDS_PROJECT_ROOT environment variable to run this
# notebook on another machine; the default keeps the original author's layout.
PROJECT_ROOT = os.environ.get(
    "IDS_PROJECT_ROOT",
    "C:\\Users\\HarshaSri\\Desktop\\IDS_PROJECT",
)
# Directory the data-loading cell reads 'data_splits.joblib' from.
PREP_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
# Directory every fitted model is dumped into; created up front if missing.
MODELS_DIR = os.path.join(PROJECT_ROOT, "models")
os.makedirs(MODELS_DIR, exist_ok=True)

In [7]:
# Load the train/test splits stored under PREP_DIR.
# The file unpacks into four objects: balanced train X/y, then test X/y.
splits_path = os.path.join(PREP_DIR, 'data_splits.joblib')
X_train_bal, y_train_bal, X_test_t, y_test = joblib.load(splits_path)
print('Loaded shapes:', X_train_bal.shape, y_train_bal.shape)

Loaded shapes: (21518, 256) (21518,)


In [9]:
# CV helper
from sklearn.model_selection import StratifiedKFold


def eval_cv(estimator, X, y, cv=5):
    """Cross-validate ``estimator`` and return its mean test metrics.

    Parameters
    ----------
    estimator
        Estimator handed unchanged to ``cross_validate``.
    X, y
        Features and labels handed unchanged to ``cross_validate``.
        NOTE(review): the 'precision' / 'recall' / 'f1' / 'roc_auc' scorers
        presumably require binary labels -- confirm against the dataset.
    cv : int or splitter object, default 5
        An integer is used as the number of folds of a shuffled
        ``StratifiedKFold`` seeded with 42 (the original behaviour).
        Any other value is forwarded to ``cross_validate`` as-is, so a
        pre-built splitter can be supplied by the caller.

    Returns
    -------
    pandas.Series
        Mean over the folds of ``test_precision``, ``test_recall``,
        ``test_f1`` and ``test_roc_auc``, in that order.
    """
    scoring = ['precision', 'recall', 'f1', 'roc_auc']
    # Keep the caller's argument intact instead of rebinding `cv`.
    if isinstance(cv, (int, np.integer)):
        splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    else:
        splitter = cv
    res = cross_validate(
        estimator, X, y,
        scoring=scoring, cv=splitter, n_jobs=-1, return_train_score=False,
    )
    # cross_validate also reports timing columns; keep only the test scores.
    test_cols = [f'test_{metric}' for metric in scoring]
    return pd.DataFrame(res)[test_cols].mean()

In [11]:
# 1) Logistic Regression
print('Training LogisticRegression...')
lr = LogisticRegression(class_weight='balanced', max_iter=1000)

# Cross-validated metrics first, then a fit on the full training set for saving.
res_lr = eval_cv(lr, X_train_bal, y_train_bal)
print('Logistic results:', res_lr)

lr.fit(X_train_bal, y_train_bal)
joblib.dump(lr, os.path.join(MODELS_DIR, 'logistic.joblib'))

Training LogisticRegression...
Logistic results: test_precision    0.987754
test_recall       0.981597
test_f1           0.984664
test_roc_auc      0.998598
dtype: float64


['C:\\Users\\HarshaSri\\Desktop\\IDS_PROJECT\\models\\logistic.joblib']

In [13]:
# 2) Random Forest
print('Training RandomForest...')
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# Cross-validated metrics first, then a fit on the full training set for saving.
res_rf = eval_cv(rf, X_train_bal, y_train_bal)
print('RandomForest results:', res_rf)

rf.fit(X_train_bal, y_train_bal)
rf_path = os.path.join(MODELS_DIR, 'rf.joblib')
joblib.dump(rf, rf_path)

Training RandomForest...
RandomForest results: test_precision    0.998976
test_recall       0.996096
test_f1           0.997533
test_roc_auc      0.999947
dtype: float64


['C:\\Users\\HarshaSri\\Desktop\\IDS_PROJECT\\models\\rf.joblib']

In [23]:
# 3) XGBoost (optional dependency)
# The cell stays best-effort: a missing package or a failed run is reported
# and skipped rather than aborting the notebook. The two cases are handled
# separately so the printed message says which one happened and why.
try:
    import xgboost as xgb
except Exception as exc:
    # Broad on purpose: the original cell treated any import-time failure as "skip".
    xgb = None
    print(f'XGBoost not available, skipping: {exc!r}')

if xgb is not None:
    try:
        xg = xgb.XGBClassifier(eval_metric='logloss', n_jobs=-1, random_state=42)
        print('Training XGBoost...')
        res_xg = eval_cv(xg, X_train_bal, y_train_bal)
        print('XGBoost results:', res_xg)
        # Fit on the full training set before persisting the model.
        xg.fit(X_train_bal, y_train_bal)
        joblib.dump(xg, os.path.join(MODELS_DIR, 'xgboost.joblib'))
    except Exception as exc:
        print(f'XGBoost failed to train: {exc!r}')

Training XGBoost...
XGBoost results: test_precision    0.998419
test_recall       0.997397
test_f1           0.997908
test_roc_auc      0.999920
dtype: float64


In [25]:
# Example grid search for RandomForest (small grid)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
}
rf_base = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)
gs = GridSearchCV(rf_base, param_grid, scoring='f1', cv=3, n_jobs=-1, verbose=1)

print('Starting GridSearch (RF) ...')
# Search on at most the first 50000 entries of the training data.
X_gs, y_gs = X_train_bal[:50000], y_train_bal[:50000]
gs.fit(X_gs, y_gs)
print('Best params:', gs.best_params_)

# Persist the best estimator found by the search.
best_rf_path = os.path.join(MODELS_DIR, 'rf_best.joblib')
joblib.dump(gs.best_estimator_, best_rf_path)
print('Saved rf_best.joblib')

Starting GridSearch (RF) ...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best params: {'max_depth': 20, 'n_estimators': 200}
Saved rf_best.joblib
