# Hybrid IDS: Two-Stage Bagging Ensemble with XAI

## Architecture
- **Bag 1 (Supervised)**: Random Forest + XGBoost
- **Bag 2 (Unsupervised)**: Autoencoder + Isolation Forest (boosted by Bag 1)
- **XAI Layer**: Fast SHAP-based explanations
- **Goal**: Maximize TPR while maintaining FPR < 5%

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Our custom modules
from autoencoder_trainer import AutoencoderTrainer
from isolation_forest_trainer import IsolationForestTrainer
from two_stage_ensemble import TwoStageEnsemble
from fast_xai_explainer import FastXAIExplainer

# Sklearn and other ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import joblib
import torch

# Seed NumPy and PyTorch with one shared value so repeated runs are comparable
SEED = 42
for seed_fn in (np.random.seed, torch.manual_seed):
    seed_fn(SEED)

print("âœ“ All libraries imported successfully!")

## 1. Data Loading and Preparation

In [None]:
# Read the combined CICIDS 2017 export into a DataFrame (adjust path as needed)
print("Loading dataset...")

# Placeholder location: point this at your own combined CSV, or swap in the
# loading logic from the existing notebook
dataset_path = 'data/cicids_2017_combined.csv'  # Adjust path
data = pd.read_csv(dataset_path)

# Quick sanity check: size, the first ten column names, and the class balance
preview_columns = data.columns.tolist()[:10]
print(f"Dataset shape: {data.shape}")
print(f"\nColumns: {preview_columns}...")
print("\nLabel distribution:")
print(data['outcome'].value_counts())

In [None]:
# Separate features and labels
# Note: Adjust column names based on your actual dataset
label_column = 'outcome'  # or 'Label' depending on your dataset

# Partition rows by class. The comparison is against the integers 0 and 1, so
# the label column must already be binary-encoded (0 = normal, 1 = attack);
# rows carrying any other label value end up in neither partition.
normal_data = data[data[label_column] == 0].copy()
attack_data = data[data[label_column] == 1].copy()

print(f"Normal samples: {len(normal_data):,}")
print(f"Attack samples: {len(attack_data):,}")

# Fail fast when either class is missing. Without this check n_samples would
# be 0 and the failure would only surface later, inside train_test_split,
# with an error that does not point back at the label encoding.
if normal_data.empty or attack_data.empty:
    raise ValueError(
        f"Column '{label_column}' yielded {len(normal_data)} normal (== 0) and "
        f"{len(attack_data)} attack (== 1) rows; both classes are required. "
        "Check that the labels are encoded as 0/1 before running this cell."
    )

# Handle class imbalance by drawing the same number of rows from each class,
# capped at 100,000 per class to limit run time.
n_samples = min(len(normal_data), len(attack_data), 100000)

# The fixed random_state keeps the drawn subset identical across runs
normal_sampled = normal_data.sample(n=n_samples, random_state=42)
attack_sampled = attack_data.sample(n=n_samples, random_state=42)

print(f"\nSampled {n_samples:,} from each class")

In [None]:
# Build the feature tables: every column except the label
normal_features = normal_sampled.drop(columns=[label_column])
attack_features = attack_sampled.drop(columns=[label_column])

# Column order of the normal table defines the feature order used downstream
feature_names = normal_features.columns.tolist()
X_normal = normal_features.values
X_attack = attack_features.values

# Normal traffic: 70% for training; the remaining 30% is halved into
# validation and test (15% each of the sampled normal rows)
X_normal_train, X_normal_temp = train_test_split(
    X_normal, test_size=0.3, random_state=42
)
X_normal_val, X_normal_test = train_test_split(
    X_normal_temp, test_size=0.5, random_state=42
)

# Attack traffic gets no training partition here: half validation, half test
X_attack_val, X_attack_test = train_test_split(
    X_attack, test_size=0.5, random_state=42
)

print(f"Normal - Train: {len(X_normal_train):,}, Val: {len(X_normal_val):,}, Test: {len(X_normal_test):,}")
print(f"Attack - Val: {len(X_attack_val):,}, Test: {len(X_attack_test):,}")
print(f"\nFeature count: {len(feature_names)}")

## 2. Training Autoencoder

In [None]:
# Build the autoencoder trainer: one input unit per feature, a 32-unit latent
# layer, running on the GPU when PyTorch can see one and on the CPU otherwise
n_features = len(feature_names)
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'

ae_trainer = AutoencoderTrainer(
    input_dim=n_features,
    latent_dim=32,
    device=device_name,
)

print(f"Using device: {ae_trainer.device}")
# NOTE(review): the layer widths printed here are hard-coded text, not read
# from the model - confirm they match AutoencoderTrainer's real architecture
print(f"Model architecture: {n_features} â†’ 64 â†’ 48 â†’ 32 (latent) â†’ 48 â†’ 64 â†’ {n_features}")

In [None]:
# Prepare data (scale and create DataLoaders)
# Only the normal-traffic training partition is passed in; 20% of it is
# requested as the trainer's internal validation split, with a batch size of
# 256. The scaling / DataLoader behavior lives in autoencoder_trainer and is
# not visible in this notebook.
X_train_ae, X_val_ae = ae_trainer.prepare_data(
    X_normal_train,
    validation_split=0.2,
    batch_size=256
)

In [None]:
# Train autoencoder
# Runs for up to 50 epochs at a learning rate of 0.001.
# NOTE(review): `patience=10` presumably configures early stopping inside
# AutoencoderTrainer.train - confirm against that module.
ae_trainer.train(
    epochs=50,
    learning_rate=0.001,
    patience=10
)

In [None]:
# Plot training history
# Called with no arguments, so any file name or output location is whatever
# AutoencoderTrainer.plot_training_history uses by default.
ae_trainer.plot_training_history()

In [None]:
# Tune the autoencoder's decision threshold on the validation partitions,
# asking the trainer to target a false-positive rate of 5%
ae_threshold_result = ae_trainer.optimize_threshold_for_fpr(
    X_normal_val, X_attack_val, target_fpr=0.05
)
ae_threshold, ae_fpr, ae_tpr = ae_threshold_result

# Report the rates the trainer returned for the chosen threshold
print("\nâœ“ Autoencoder threshold optimized!")
print(f"  FPR: {ae_fpr*100:.2f}% | TPR: {ae_tpr*100:.2f}%")

In [None]:
# Save autoencoder model
# The ensemble cell in section 4 loads the model back from this same path
# (ae_path='autoencoder_model.pth'), so keep the two in sync if you rename it.
ae_trainer.save_model('autoencoder_model.pth')
print("âœ“ Autoencoder model saved!")

## 3. Training Isolation Forest

In [None]:
# Initialize Isolation Forest trainer
if_trainer = IsolationForestTrainer()

# Prepare data
# Uses the same normal-traffic training partition as the autoencoder, but
# requests a 30% internal validation split (the autoencoder cell uses 20%).
X_train_if, X_val_if = if_trainer.prepare_data(
    X_normal_train,
    validation_split=0.3
)

In [None]:
# Grid-search the Isolation Forest over three candidate contamination values;
# the value returned by the trainer is kept for the final summary report
contamination_candidates = [0.01, 0.03, 0.05]
best_params_if = if_trainer.train_with_grid_search(
    X_train_if, contamination_range=contamination_candidates
)

print("\nâœ“ Isolation Forest training complete!")

In [None]:
# Tune the Isolation Forest's decision threshold on the validation partitions,
# asking the trainer to target a false-positive rate of 5%
if_threshold_result = if_trainer.optimize_threshold_for_fpr(
    X_normal_val, X_attack_val, target_fpr=0.05
)
if_threshold, if_fpr, if_tpr = if_threshold_result

# Report the rates the trainer returned for the chosen threshold
print("\nâœ“ Isolation Forest threshold optimized!")
print(f"  FPR: {if_fpr*100:.2f}% | TPR: {if_tpr*100:.2f}%")

In [None]:
# Plot score distributions
# Uses the validation partitions for both classes; no save path is passed, so
# any file output is decided inside IsolationForestTrainer.
if_trainer.plot_score_distribution(X_normal_val, X_attack_val)

In [None]:
# Evaluate on test set
# Uses the held-out test partitions, which were not passed to training or
# threshold tuning above. `if_results` is kept for inspection; nothing later
# in this notebook reads it.
if_results = if_trainer.evaluate(X_normal_test, X_attack_test)

# Save model
# The ensemble cell in section 4 loads the model back from this same path
# (if_path='isolation_forest_model.joblib').
if_trainer.save_model('isolation_forest_model.joblib')
print("âœ“ Isolation Forest model saved!")

## 4. Two-Stage Ensemble Integration

In [None]:
# Assemble the two-stage ensemble from the four model files on disk.
# The autoencoder and Isolation Forest files are written by earlier cells;
# the Random Forest and XGBoost files are not produced in this notebook and
# must already exist in the working directory.
ensemble_model_paths = {
    'rf_path': 'random_forest_model.joblib',
    'xgb_path': 'xgboost_model_intrusion_detection.joblib',
    'ae_path': 'autoencoder_model.pth',
    'if_path': 'isolation_forest_model.joblib',
}

ensemble = TwoStageEnsemble()
ensemble.load_models(**ensemble_model_paths)

print("âœ“ All models loaded into ensemble!")

In [None]:
# Tune the ensemble's fusion thresholds on the validation partitions, asking
# for a false-positive rate of 5%; the call returns the resulting FPR and TPR
fusion_rates = ensemble.optimize_fusion_thresholds(
    X_normal_val, X_attack_val, target_fpr=0.05
)
ensemble_fpr, ensemble_tpr = fusion_rates

print("\nâœ“ Ensemble thresholds optimized!")
print(f"  Final FPR: {ensemble_fpr*100:.2f}% | TPR: {ensemble_tpr*100:.2f}%")

In [None]:
# Save ensemble configuration
# Written after threshold optimization so the saved state follows that step.
# NOTE(review): whether the file holds only configuration or the models as
# well depends on TwoStageEnsemble.save_ensemble - confirm.
ensemble.save_ensemble('two_stage_ensemble.joblib')
print("âœ“ Ensemble configuration saved!")

## 5. Comprehensive Evaluation

In [None]:
# Stack the held-out partitions into one test matrix: all normal rows first,
# then all attack rows. The ground-truth vector follows the same order
# (0 = normal, 1 = attack).
n_normal_test = len(X_normal_test)
n_attack_test = len(X_attack_test)

X_test_combined = np.vstack([X_normal_test, X_attack_test])
y_test_true = np.concatenate([np.zeros(n_normal_test), np.ones(n_attack_test)])

# Ask the ensemble for category labels, confidences and per-model scores
predictions, confidence, all_scores = ensemble.predict(
    X_test_combined, return_all_scores=True
)

# Collapse the category labels to binary: anything other than "Normal" is an attack
is_attack = predictions != "Normal"
y_pred = is_attack.astype(int)

print("âœ“ Predictions generated on test set!")

In [None]:
# Score the binary predictions against the ground truth
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

accuracy = accuracy_score(y_test_true, y_pred)
precision = precision_score(y_test_true, y_pred)
recall = recall_score(y_test_true, y_pred)
f1 = f1_score(y_test_true, y_pred)

# Unpack the 2x2 confusion matrix (row-major: TN, FP, FN, TP) and derive the
# two rates the project targets
tn, fp, fn, tp = confusion_matrix(y_test_true, y_pred).ravel()
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)

rule = '=' * 60
fpr_verdict = 'âœ“ MEETS TARGET' if fpr < 0.05 else 'âœ— EXCEEDS TARGET'

print(f"\n{rule}")
print("FINAL ENSEMBLE EVALUATION RESULTS")
print(rule)
# Labels are left-padded to a common width so the values line up in a column
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{metric_name + ':':<11}{metric_value:.4f} ({metric_value*100:.2f}%)")
print(f"\nFPR: {fpr*100:.2f}% {fpr_verdict}")
print(f"TPR: {tpr*100:.2f}%")
print("\nConfusion Matrix:")
print(f"  TN = {tn:,}  |  FP = {fp:,}")
print(f"  FN = {fn:,}  |  TP = {tp:,}")
print(f"{rule}\n")

In [None]:
# Draw the confusion matrix as a heat map and save it next to the notebook
from sklearn.metrics import ConfusionMatrixDisplay

class_names = ['Normal', 'Attack']
cm_title = f'Two-Stage Ensemble Confusion Matrix\nFPR: {fpr*100:.2f}% | TPR: {tpr*100:.2f}%'

fig, ax = plt.subplots(figsize=(8, 6))
cm_display = ConfusionMatrixDisplay.from_predictions(
    y_test_true, y_pred, display_labels=class_names, cmap='Blues', ax=ax
)
ax.set_title(cm_title)

# Save before show() so the written file contains the rendered figure
plt.tight_layout()
plt.savefig('ensemble_confusion_matrix.png', dpi=150)
plt.show()

In [None]:
# ROC curve of the ensemble on the held-out test set
from sklearn.metrics import roc_curve, auc

# NOTE(review): roc_curve expects scores that increase with the likelihood of
# the positive (attack) class. This assumes `confidence` behaves that way
# rather than being confidence in whichever label was predicted - confirm
# against TwoStageEnsemble.predict.
fpr_roc, tpr_roc, thresholds = roc_curve(y_test_true, confidence)
roc_auc = auc(fpr_roc, tpr_roc)

roc_label = f'ROC curve (AUC = {roc_auc:.4f})'
diagonal = [0, 1]

plt.figure(figsize=(10, 6))
plt.plot(fpr_roc, tpr_roc, color='darkorange', lw=2, label=roc_label)
# Chance-level reference line
plt.plot(diagonal, diagonal, color='navy', lw=2, linestyle='--', label='Random')
# Vertical marker at the project's 5% false-positive budget
plt.axvline(x=0.05, color='red', linestyle='--', label='FPR Target (5%)')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Two-Stage Ensemble')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('ensemble_roc_curve.png', dpi=150)
plt.show()

## 6. XAI Layer - Explainability Demonstrations

In [None]:
# Set up the explainer on the two supervised models (the same files the
# ensemble loads) and tell it the feature order used by the matrices
supervised_model_paths = {
    'rf_path': 'random_forest_model.joblib',
    'xgb_path': 'xgboost_model_intrusion_detection.joblib',
}

explainer = FastXAIExplainer()
explainer.load_models(**supervised_model_paths, feature_names=feature_names)

# SHAP background: the first 1,000 normal training rows are offered, with
# n_background=100 passed alongside
# NOTE(review): presumably the explainer subsamples to 100 rows - confirm
shap_background = X_normal_train[:1000]
explainer.initialize_shap(background_data=shap_background, n_background=100)

print("âœ“ XAI Explainer initialized!")

In [None]:
# Select diverse examples for explanation. Each example_* value is a row index
# into X_test_combined / predictions.

# 1. True Positive (correctly detected attack)
# Raises IndexError if the ensemble detected no attack at all.
tp_indices = np.where((y_test_true == 1) & (y_pred == 1))[0]
example_tp = tp_indices[0]

# 2. False Positive (normal flagged as attack); None when there is none
fp_indices = np.where((y_test_true == 0) & (y_pred == 1))[0]
example_fp = fp_indices[0] if len(fp_indices) > 0 else None

# 3. True Negative (correctly classified normal)
# Raises IndexError if no normal row was classified as normal.
tn_indices = np.where((y_test_true == 0) & (y_pred == 0))[0]
example_tn = tn_indices[0]

print(f"Selected examples:")
print(f"  True Positive (Attack detected): Index {example_tp}")
# Compare against None explicitly: normal rows come first in the test matrix,
# so index 0 is a valid false-positive position and a plain truthiness test
# would wrongly treat it as "no false positive".
if example_fp is not None:
    print(f"  False Positive (Normal as Attack): Index {example_fp}")
print(f"  True Negative (Normal): Index {example_tn}")

In [None]:
# Explain the selected true positive (an attack the ensemble caught)
instance_tp = X_test_combined[example_tp]
pred_tp = predictions[example_tp]

# Keep each per-model score as a length-1 slice rather than a scalar
row_tp = slice(example_tp, example_tp + 1)
scores_tp = {model: values[row_tp] for model, values in all_scores.items()}

explanation_tp = explainer.explain_instance(instance_tp, pred_tp, scores_tp, top_n=5)

banner_tp = "=" * 70
print("\n" + banner_tp)
print("EXAMPLE 1: TRUE POSITIVE (Attack Correctly Detected)")
print(banner_tp)
explainer.print_explanation(explanation_tp)
explainer.plot_waterfall(explanation_tp, save_path='explanation_true_positive.png')

In [None]:
# Explain False Positive (if any)
# example_fp is None when the ensemble raised no false alarm. The check must
# be an identity test against None: index 0 is a legitimate row position (and
# a likely one, since normal rows are stacked first), and a truthiness test
# would silently skip the explanation for it.
if example_fp is not None:
    instance_fp = X_test_combined[example_fp]
    pred_fp = predictions[example_fp]
    # Keep each per-model score as a length-1 slice rather than a scalar
    scores_fp = {k: v[example_fp:example_fp+1] for k, v in all_scores.items()}

    explanation_fp = explainer.explain_instance(
        instance_fp,
        pred_fp,
        scores_fp,
        top_n=5
    )

    print("\n" + "="*70)
    print("EXAMPLE 2: FALSE POSITIVE (Normal Traffic Flagged as Attack)")
    print("="*70)
    explainer.print_explanation(explanation_fp)
    explainer.plot_waterfall(explanation_fp, save_path='explanation_false_positive.png')

In [None]:
# Explain the selected true negative (normal traffic left alone)
instance_tn = X_test_combined[example_tn]
pred_tn = predictions[example_tn]

# Keep each per-model score as a length-1 slice rather than a scalar
row_tn = slice(example_tn, example_tn + 1)
scores_tn = {model: values[row_tn] for model, values in all_scores.items()}

explanation_tn = explainer.explain_instance(instance_tn, pred_tn, scores_tn, top_n=5)

banner_tn = "=" * 70
print("\n" + banner_tn)
print("EXAMPLE 3: TRUE NEGATIVE (Normal Traffic)")
print(banner_tn)
explainer.print_explanation(explanation_tn)
explainer.plot_waterfall(explanation_tn, save_path='explanation_true_negative.png')

In [None]:
# Horizontal bar chart of the Random Forest's 15 highest-ranked features.
# Each entry of rf_importance is indexed as (feature name, importance value).
rf_importance = explainer.get_rf_feature_importance(top_n=15)

features = [entry[0] for entry in rf_importance]
importances = [entry[1] for entry in rf_importance]
bar_positions = range(len(features))

plt.figure(figsize=(10, 6))
plt.barh(bar_positions, importances, color='steelblue')
plt.yticks(bar_positions, features)
plt.xlabel('Gini Importance')
plt.title('Random Forest - Top 15 Feature Importance')
plt.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.savefig('rf_feature_importance.png', dpi=150)
plt.show()

## 7. Summary and Model Saving

In [None]:
# Create summary report
# Gathers the per-model validation rates, the ensemble's test-set metrics and
# the artifact names into one text block, prints it, and writes it to disk.
# NOTE(review): the autoencoder / Isolation Forest image names listed under
# VISUALIZATIONS GENERATED are not set anywhere in this notebook (both plot
# calls run without a save path) - confirm the trainers really write them.
summary = f"""
{'='*70}
TWO-STAGE BAGGING ENSEMBLE - FINAL SUMMARY
{'='*70}

MODELS TRAINED:
  1. Autoencoder (Unsupervised)
     - Architecture: {len(feature_names)} â†’ 64 â†’ 48 â†’ 32 â†’ 48 â†’ 64 â†’ {len(feature_names)}
     - FPR: {ae_fpr*100:.2f}% | TPR: {ae_tpr*100:.2f}%
  
  2. Isolation Forest (Unsupervised)
     - Best Params: {best_params_if}
     - FPR: {if_fpr*100:.2f}% | TPR: {if_tpr*100:.2f}%
  
  3. Random Forest + XGBoost (Supervised) - Pre-trained

ENSEMBLE PERFORMANCE:
  Accuracy:  {accuracy*100:.2f}%
  Precision: {precision*100:.2f}%
  Recall:    {recall*100:.2f}%
  F1-Score:  {f1*100:.2f}%
  
  FALSE POSITIVE RATE: {fpr*100:.2f}% {'âœ“ MEETS TARGET (<5%)' if fpr < 0.05 else 'âœ— EXCEEDS TARGET'}
  TRUE POSITIVE RATE:  {tpr*100:.2f}%
  
  ROC-AUC: {roc_auc:.4f}

SAVED MODELS:
  - autoencoder_model.pth
  - isolation_forest_model.joblib
  - two_stage_ensemble.joblib (configuration)

VISUALIZATIONS GENERATED:
  - autoencoder_training_history.png
  - isolation_forest_scores.png
  - ensemble_confusion_matrix.png
  - ensemble_roc_curve.png
  - rf_feature_importance.png
  - explanation_*.png (XAI waterfall plots)

{'='*70}
"""

print(summary)

# Save summary to file
# The report contains non-ASCII characters, so the encoding is pinned to
# UTF-8. With the platform default, the write can raise UnicodeEncodeError or
# produce different bytes depending on the machine's locale.
with open('training_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)

print("\nâœ“ Summary saved to training_summary.txt")

In [None]:
# Closing banner: completion notice followed by suggested follow-up work.
# Each tuple entry is emitted with its own print call, top to bottom.
divider = "=" * 70
closing_lines = (
    "\n" + divider,
    "ðŸŽ‰ TRAINING COMPLETE! ðŸŽ‰",
    divider,
    "\nAll models trained, optimized, and evaluated.",
    "You can now use the ensemble for real-time intrusion detection.",
    "\nNext steps:",
    "  1. Deploy models to production environment",
    "  2. Integrate with live network traffic capture",
    "  3. Build dashboard for real-time monitoring",
    divider,
)
for closing_line in closing_lines:
    print(closing_line)