In [None]:
# Import required libraries
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# AI-NIDS modules
from ml.preprocessing.preprocessor import DataPreprocessor, FeatureEngineer
from ml.models.xgboost_classifier import XGBoostClassifier
from ml.models.autoencoder import AnomalyAutoencoder
from ml.models.lstm_detector import LSTMDetector
from ml.models.ensemble import EnsembleDetector
from ml.explainability.shap_explainer import SHAPExplainer
from ml.training import ModelTrainer

# Settings
# Switch every figure in this notebook to matplotlib's 'dark_background' style.
plt.style.use('dark_background')
# Use seaborn's 'husl' palette as the default colour cycle.
sns.set_palette('husl')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

# Reaching this line means every import above succeeded.
print('✅ Libraries loaded successfully')

## 1. Data Loading & Exploration

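Load your dataset (CICIDS2017 or UNSW-NB15) and explore its structure. As written, the cell below generates a synthetic demo dataset; uncomment one of the `pd.read_csv` options to use real data instead.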

In [None]:
# Data source
# Option 1: CICIDS2017
# df = pd.read_csv('../data/raw/cicids2017.csv')

# Option 2: UNSW-NB15
# df = pd.read_csv('../data/raw/unsw_nb15.csv')

# Demo default: a synthetic table drawn from the seeded global NumPy RNG.
np.random.seed(42)
n_samples = 10000

# One sampler per column. The dict is walked in insertion order, so the RNG is
# consumed column by column in exactly this order on every run.
# Note that 'label' is drawn independently of every feature column.
column_samplers = {
    'duration': lambda size: np.random.exponential(10, size),
    'protocol_type': lambda size: np.random.choice(['tcp', 'udp', 'icmp'], size),
    'src_bytes': lambda size: np.random.exponential(1000, size),
    'dst_bytes': lambda size: np.random.exponential(1000, size),
    'count': lambda size: np.random.poisson(10, size),
    'srv_count': lambda size: np.random.poisson(5, size),
    'serror_rate': lambda size: np.random.beta(2, 10, size),
    'rerror_rate': lambda size: np.random.beta(2, 10, size),
    'same_srv_rate': lambda size: np.random.beta(8, 2, size),
    'diff_srv_rate': lambda size: np.random.beta(2, 8, size),
    'label': lambda size: np.random.choice([0, 1], size, p=[0.9, 0.1]),
}
df = pd.DataFrame(
    {column: sample(n_samples) for column, sample in column_samplers.items()}
)

print(f'Dataset shape: {df.shape}')
df.head()

In [None]:
# Data exploration: headline numbers, then the class balance as a bar and a pie chart.
print('Dataset Info:')
print(f'  Samples: {len(df):,}')
print(f'  Features: {len(df.columns) - 1}')  # every column except 'label'
print(f'  Target distribution:')
print(df['label'].value_counts(normalize=True))

# Count each class once and order the result by class label (0, then 1).
# value_counts() alone orders by frequency, which would hand the first colour to
# whichever class happens to be larger; sorting by label keeps the first colour
# on class 0 and the second on class 1 regardless of the class balance.
class_counts = df['label'].value_counts().sort_index()
class_colors = ['#00ff88', '#ff4444']

# Visualize class distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

class_counts.plot(kind='bar', ax=axes[0], color=class_colors)
axes[0].set_title('Class Distribution')
axes[0].set_xlabel('Class')
axes[0].set_ylabel('Count')

class_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', colors=class_colors)
axes[1].set_title('Class Proportion')

plt.tight_layout()
plt.show()

## 2. Preprocessing & Feature Engineering

In [None]:
# Target vector and feature matrix, both taken from the same frame.
y = df['label']
X = df.drop(columns=['label'])

# Fit the preprocessor and transform the features in a single call.
# NOTE(review): this runs on every row, before the train/validation/test split
# further down. If DataPreprocessor learns statistics from the data it sees,
# the held-out rows contribute to them — confirm whether that is intended.
preprocessor = DataPreprocessor()
X_processed = preprocessor.fit_transform(X)

print(f'Processed features shape: {X_processed.shape}')
print(f'Feature names: {X_processed.columns.tolist()}')

In [None]:
# Two-stage stratified split: 30% is held out first, then that hold-out is
# halved into validation and test. Both stages share the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_processed,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Left-align the split names in a 15-character column so the counts line up.
for split_label, split_features in (
    ('Training set:', X_train),
    ('Validation set:', X_val),
    ('Test set:', X_test),
):
    print(f'{split_label:<15} {len(split_features):,} samples')

## 3. Model Training

### 3.1 XGBoost Classifier

In [None]:
# Hyperparameters handed to the XGBoost wrapper.
xgb_params = dict(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    objective='binary:logistic',
    eval_metric='auc',
)

# Build the classifier and train it, passing the validation split alongside the training split.
xgb_model = XGBoostClassifier(params=xgb_params)
xgb_model.train(X_train, y_train, X_val=X_val, y_val=y_val)
print('✅ XGBoost training complete')

In [None]:
# Score XGBoost on the held-out test split.
from sklearn.metrics import classification_report, confusion_matrix

y_pred_xgb = xgb_model.predict(X_test)
# Keep only the second column of predict_proba
# (presumably the attack-class probability — confirm against XGBoostClassifier).
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

print('XGBoost Classification Report:')
xgb_report = classification_report(y_test, y_pred_xgb)
print(xgb_report)

### 3.2 Autoencoder

In [None]:
# The autoencoder is fitted on normal traffic only, i.e. training rows labelled 0.
is_normal = y_train == 0
X_train_normal = X_train.loc[is_normal]

# Input width equals the number of processed feature columns.
autoencoder = AnomalyAutoencoder(input_dim=X_train.shape[1], encoding_dim=16)

ae_history = autoencoder.train(X_train_normal.values, epochs=50, batch_size=128)
print('✅ Autoencoder training complete')

In [None]:
# Autoencoder loss curves, drawn through an explicit Axes object.
loss_fig, loss_ax = plt.subplots(figsize=(10, 4))

loss_ax.plot(ae_history['train_loss'], label='Training Loss')
# The validation curve is optional: draw it only when the history carries one.
if 'val_loss' in ae_history:
    loss_ax.plot(ae_history['val_loss'], label='Validation Loss')

loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Autoencoder Training History')
loss_ax.legend()
loss_ax.grid(True, alpha=0.3)
plt.show()

### 3.3 LSTM Detector

In [None]:
# Sequence length handed to the LSTM detector.
sequence_length = 10

lstm_detector = LSTMDetector(
    sequence_length=sequence_length,
    input_size=X_train.shape[1],  # one input per processed feature column
    hidden_size=64,
    num_layers=2,
)

# NOTE(review): X_train comes from train_test_split, which shuffles rows by
# default, so neighbouring rows are no longer in their original order. If
# LSTMDetector builds its sequences from consecutive rows, confirm that this
# is acceptable.
lstm_history = lstm_detector.train(
    X_train.values, y_train.values, epochs=30, batch_size=64
)
print('✅ LSTM training complete')

## 4. Ensemble Creation

In [None]:
# Create, train and persist the ensemble detector.
import os

# Make sure the target directory exists before anything is written to it.
# The notebook otherwise only creates '../models' in its final cell, which
# runs after the save below.
ensemble_model_dir = '../models'
os.makedirs(ensemble_model_dir, exist_ok=True)

ensemble = EnsembleDetector(model_dir=ensemble_model_dir)

# Train all models
ensemble.train(X_train, y_train, X_val, y_val)

# Save models
ensemble.save()
print('✅ Ensemble training and saving complete')

In [None]:
# Ensemble predictions on the test split. predict() hands back two values:
# the predicted labels and a per-sample confidence.
ensemble_output = ensemble.predict(X_test)
y_pred_ensemble, confidence = ensemble_output

print('Ensemble Classification Report:')
ensemble_report = classification_report(y_test, y_pred_ensemble)
print(ensemble_report)

## 5. Evaluation & Visualization

In [None]:
# Side-by-side metric table for every model evaluated so far.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# name -> (hard predictions, score used for ROC-AUC)
# NOTE(review): the ensemble's `confidence` is used as its ROC score. That is
# only meaningful if it ranks samples by attack likelihood — confirm what
# EnsembleDetector.predict returns.
models = {
    'XGBoost': (y_pred_xgb, y_prob_xgb),
    'Ensemble': (y_pred_ensemble, confidence),
}

results = []
for name, (y_pred, y_prob) in models.items():
    row = {'Model': name}
    row['Accuracy'] = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    row['Precision'] = precision_score(y_test, y_pred, zero_division=0)
    row['Recall'] = recall_score(y_test, y_pred, zero_division=0)
    row['F1'] = f1_score(y_test, y_pred, zero_division=0)
    # ROC-AUC needs a score; models without one get None.
    row['ROC-AUC'] = None if y_prob is None else roc_auc_score(y_test, y_prob)
    results.append(row)

results_df = pd.DataFrame(results).set_index('Model')
results_df.style.format('{:.4f}').background_gradient(cmap='Greens')

In [None]:
# Plot one confusion matrix per entry in `models`.
# The panel count follows the dict, so adding a model adds a panel instead of
# being silently dropped by zip(); the figure stays 7 inches wide per panel.
n_models = len(models)
fig, axes = plt.subplots(1, n_models, figsize=(7 * n_models, 5), squeeze=False)

# squeeze=False always returns a 2-D array of Axes; row 0 holds every panel.
for ax, (name, (y_pred, _)) in zip(axes[0], models.items()):
    # Pin the matrix to classes 0 and 1 so it is always 2x2 and lines up with
    # the Normal/Attack tick labels, even if one class is absent.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=['Normal', 'Attack'],
                yticklabels=['Normal', 'Attack'])
    ax.set_title(f'{name} Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()

In [None]:
# Plot ROC curves for every model that provides a score.
from sklearn.metrics import roc_curve

plt.figure(figsize=(10, 6))

for name, (_, y_prob) in models.items():
    if y_prob is None:
        continue  # no score for this model, so there is no curve to draw
    fpr, tpr, _thresholds = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.4f})', linewidth=2)

# Chance diagonal. Its colour is taken from the active style's text colour
# instead of being hard-coded to black: this notebook switches matplotlib to
# the 'dark_background' style, on which a black line cannot be seen.
baseline_color = plt.rcParams['text.color']
plt.plot([0, 1], [0, 1], linestyle='--', color=baseline_color, label='Random', alpha=0.5)
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.show()

## 6. SHAP Explainability

In [None]:
# SHAP explainer for the trained XGBoost model. The first 1000 training rows
# are passed as its second argument (presumably background data — confirm
# against SHAPExplainer).
shap_background = X_train.iloc[:1000]
explainer = SHAPExplainer(xgb_model.model, shap_background)

print('✅ SHAP explainer initialized')

In [None]:
# Explain one test sample, chosen by its position in the test split.
sample_idx = 0
sample = X_test.iloc[[sample_idx]]  # double brackets keep it a one-row frame

explanation = explainer.explain(sample)

# Index 0 -> 'Normal', index 1 -> 'Attack'; any truthy label counts as an attack.
class_names = ('Normal', 'Attack')
predicted_name = class_names[bool(y_pred_xgb[sample_idx])]
actual_name = class_names[bool(y_test.iloc[sample_idx])]

print(f'\nPrediction: {predicted_name}')
print(f'Actual: {actual_name}')
print('\nTop contributing features:')
for feature_name, contribution in explanation['top_features']:
    # Signed, four-decimal value for each listed feature.
    print(f'  {feature_name}: {contribution:+.4f}')

In [None]:
# SHAP summary plot
import shap

# The first 500 test rows serve twice: to compute the SHAP values and as the
# feature values shown in the plot.
X_shap = X_test.iloc[:500]
shap_values = explainer.shap_values(X_shap)

plt.figure(figsize=(10, 8))
# show=False keeps the figure open so a title can still be added below.
shap.summary_plot(shap_values, X_shap, show=False)
plt.title('SHAP Feature Importance')
plt.tight_layout()
plt.show()

## 7. Save Models

In [None]:
# Persist the individually trained models and the fitted preprocessor.
import os
import joblib

model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)

# (object, file name) pairs, written in this order through each object's own save().
for artifact, file_name in (
    (xgb_model, 'xgboost_model.json'),
    (autoencoder, 'autoencoder.pt'),
    (lstm_detector, 'lstm_detector.pt'),
):
    artifact.save(f'{model_dir}/{file_name}')

# The preprocessor has no save() call here; it is serialised with joblib instead.
joblib.dump(preprocessor, f'{model_dir}/preprocessor.pkl')

print('✅ All models saved successfully')
print(f'\nModels saved to: {os.path.abspath(model_dir)}')

---

## Summary

This notebook demonstrated the complete AI-NIDS training pipeline:

1. **Data Loading**: Generated a synthetic network traffic dataset (a stand-in for CICIDS2017 or UNSW-NB15) and explored it
2. **Preprocessing**: Applied feature engineering and normalization
3. **Model Training**: Trained XGBoost, Autoencoder, and LSTM models
4. **Ensemble**: Combined models for improved detection
5. **Evaluation**: Compared model performance with metrics and visualizations
6. **Explainability**: Used SHAP to understand model decisions
7. **Saving**: Saved the trained models and the fitted preprocessor to `../models`

### Next Steps
- Replace synthetic data with real CICIDS2017 or UNSW-NB15 dataset
- Fine-tune hyperparameters for better performance
- Deploy models to the AI-NIDS system