In [None]:
# IDS Model Training on Google Colab

This notebook trains the ML classification model using data from MongoDB.

## Prerequisites
- MongoDB database with labeled training data
- MongoDB connection string (with network access from Colab)
- Sufficient Colab runtime (High-RAM recommended for 8M+ samples)

# IDS Model Training on Google Colab

This notebook trains the ML classification model using data from MongoDB.

## Prerequisites
- MongoDB database with labeled training data
- MongoDB connection string (with network access from Colab)
- Sufficient Colab runtime (High-RAM recommended for 8M+ samples)

## 1. Setup and Installation

In [None]:
# Install required packages
!pip install -q pymongo>=4.6.1 pandas>=2.2.0 numpy>=1.26.0 scikit-learn>=1.4.0 tqdm>=4.66.1 matplotlib>=3.8.2 seaborn>=0.13.0 imbalanced-learn>=0.11.0

In [None]:
# Import libraries
import sys
import os
from pathlib import Path
import logging
from datetime import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm

# Configure logging.
# force=True replaces any handlers already attached to the root logger.
# Without it, basicConfig() silently does nothing when handlers exist
# (e.g. installed by the hosting environment, or by a previous run of this cell),
# and INFO-level messages may never be shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

print("✓ Libraries imported successfully")

## 2. Configuration

In [None]:
# @title MongoDB Configuration
# Enter your MongoDB connection string here
# For MongoDB Atlas, format: mongodb+srv://username:password@cluster.mongodb.net/
# For local MongoDB with port forwarding: mongodb://localhost:27017/
#
# SECURITY: the connection string normally embeds a password. Clear this field
# (and the cell output) before sharing or committing the notebook.

MONGODB_URI = "mongodb://your-connection-string"  # @param {type:"string"}
MONGODB_DATABASE_NAME = "ids_db"  # @param {type:"string"}

# Training configuration
HYPERPARAMETER_TUNING_ENABLED = True  # @param {type:"boolean"}
HYPERPARAMETER_TUNING_N_ITER = 20  # @param {type:"integer"}
HYPERPARAMETER_TUNING_CV = 3  # @param {type:"integer"}
MODEL_TYPE = "random_forest"  # @param ["random_forest", "svm", "logistic_regression"]
BATCH_LOADING_ENABLED = True  # @param {type:"boolean"}
MAX_TRAINING_SAMPLES = None  # @param {type:"raw"} Set to None for all samples, or specify a number

# Surface obvious misconfiguration here instead of as a connection error
# several cells later. These checks only warn; they never stop execution.
_PLACEHOLDER_URI = "mongodb://your-connection-string"
if MONGODB_URI.strip() in ("", _PLACEHOLDER_URI):
    print("⚠ MONGODB_URI is still the placeholder value. Enter your real connection string before continuing.")
elif not MONGODB_URI.startswith(("mongodb://", "mongodb+srv://")):
    print("⚠ MONGODB_URI should start with 'mongodb://' or 'mongodb+srv://'.")

if MAX_TRAINING_SAMPLES is not None and (
    not isinstance(MAX_TRAINING_SAMPLES, int) or MAX_TRAINING_SAMPLES <= 0
):
    print("⚠ MAX_TRAINING_SAMPLES should be None or a positive integer.")

print("✓ Configuration loaded")

## 3. Load Training Module

Upload `train_colab.py` to Colab using the file browser and run the "Option 1" cell, or paste its contents into the "Option 2" cell below.

In [None]:
# Option 1: Import from uploaded file.
# The failure modes are reported separately so that a missing dependency or a
# broken/outdated train_colab.py is not misreported as "file not found".
# All of them only print a warning, so the paste-in fallback cell can still be used.
try:
    from train_colab import (
        ColabConfig, ColabDataCollector, ColabDataPreprocessor,
        ColabClassifier, ColabModelTrainer
    )
    print("✓ Training module loaded from file")
except ModuleNotFoundError as exc:
    if exc.name == "train_colab":
        # The module itself could not be located.
        print("⚠ train_colab.py not found. Please upload it or paste the code in the next cell.")
    else:
        # train_colab.py exists, but one of its own imports is not installed.
        print(f"⚠ train_colab.py was found but requires a module that is not installed: {exc.name!r}")
except ImportError as exc:
    # e.g. one of the expected classes is not defined in train_colab.py.
    print(f"⚠ train_colab.py was found but could not be imported: {exc}")

In [None]:
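# Option 2: Use this cell only if the import in the previous cell failed.
# Copy the entire contents of backend/colab/train_colab.py, paste them into
# this cell, and run it.
#
# Alternatively, if train_colab.py has been uploaded to the Colab working
# directory, uncomment the following line to execute the file directly:

# exec(open('train_colab.py').read())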

## 4. Initialize Services

In [None]:
# Collect the values chosen in the configuration cell and build the config object.
config_kwargs = dict(
    mongodb_uri=MONGODB_URI,
    mongodb_database_name=MONGODB_DATABASE_NAME,
    CLASSIFICATION_MODEL_TYPE=MODEL_TYPE,
    HYPERPARAMETER_TUNING_ENABLED=HYPERPARAMETER_TUNING_ENABLED,
    HYPERPARAMETER_TUNING_N_ITER=HYPERPARAMETER_TUNING_N_ITER,
    HYPERPARAMETER_TUNING_CV=HYPERPARAMETER_TUNING_CV,
    BATCH_LOADING_ENABLED=BATCH_LOADING_ENABLED,
    MAX_TRAINING_SAMPLES=MAX_TRAINING_SAMPLES,
)
config = ColabConfig(**config_kwargs)

# Build the service objects; every one of them receives the same config.
# NOTE(review): the collector presumably opens the MongoDB connection — confirm in train_colab.py.
print("Connecting to MongoDB...")
data_collector = ColabDataCollector(config)
preprocessor = ColabDataPreprocessor(config)
classifier = ColabClassifier(config)

# The trainer is wired to the three services created above.
model_trainer = ColabModelTrainer(config, classifier, preprocessor, data_collector)

print("✓ Services initialized successfully")

## 5. Check Database Statistics

In [None]:
# Get database statistics
stats = data_collector.get_statistics()

print("="*60)
print("Database Statistics")
print("="*60)
print(f"Total samples: {stats.get('total_samples', 0):,}")
print(f"Labeled samples: {stats.get('labeled_samples', 0):,}")
print(f"Benign: {stats.get('benign_count', 0):,}")
print(f"Malicious: {stats.get('malicious_count', 0):,}")
print("="*60)

# Check memory requirements.
# Rough estimate only: one 8-byte value per feature per sample.
ESTIMATED_FEATURE_COUNT = 81
BYTES_PER_FEATURE_VALUE = 8
MEMORY_WARNING_FRACTION = 0.8  # warn when the estimate exceeds this share of free RAM
BYTES_PER_GB = 1024**3

try:
    # psutil is optional; without it the memory check is skipped.
    import psutil

    # Take a single snapshot so the available/total figures are consistent.
    memory_snapshot = psutil.virtual_memory()
    available_memory_gb = memory_snapshot.available / BYTES_PER_GB
    total_memory_gb = memory_snapshot.total / BYTES_PER_GB

    labeled_samples = stats.get('labeled_samples', 0)
    # When a sample cap is configured, size the estimate on the capped count.
    # NOTE(review): assumes MAX_TRAINING_SAMPLES limits how many rows are loaded — confirm in train_colab.py.
    samples_to_load = labeled_samples
    if MAX_TRAINING_SAMPLES is not None:
        samples_to_load = min(labeled_samples, MAX_TRAINING_SAMPLES)
    estimated_memory_gb = (
        samples_to_load * ESTIMATED_FEATURE_COUNT * BYTES_PER_FEATURE_VALUE
    ) / BYTES_PER_GB

    print(f"\nAvailable RAM: {available_memory_gb:.2f} GB")
    print(f"Total RAM: {total_memory_gb:.2f} GB")
    print(f"Estimated memory needed: {estimated_memory_gb:.2f} GB")

    if estimated_memory_gb > available_memory_gb * MEMORY_WARNING_FRACTION:
        print("\n⚠ WARNING: Estimated memory usage exceeds 80% of available RAM")
        print("Batch loading is recommended (BATCH_LOADING_ENABLED=True)")
    else:
        print("\n✓ Memory requirements look good")
except ImportError:
    print("\n⚠ psutil not available. Cannot check memory.")

## 6. Load Training Data

In [None]:
# Pull the training set through the trainer and summarise what came back.
for notice in (
    "Loading training data from MongoDB...",
    "This may take several minutes for large datasets...",
):
    print(notice)

df = model_trainer.load_training_data()

sample_count = len(df)
column_count = len(df.columns)

print(f"\n✓ Loaded {sample_count:,} samples")
# One column is subtracted to leave out the label column.
print(f"Features: {column_count - 1}")
print(f"\nDataset shape: {df.shape}")
print("\nClass distribution:")
print(df['label'].value_counts())

## 7. Preprocess Data

In [None]:
# Stage 1: cleaning
print("Cleaning data...")
df_clean = preprocessor.clean_data(df)
print(f"✓ Cleaned data: {len(df_clean):,} samples")

# Stage 2: feature engineering on the cleaned frame
print("\nEngineering features...")
df_eng = preprocessor.engineer_features(df_clean)
print("✓ Feature engineering complete")

# Stage 3: train/validation/test split
print("\nSplitting data into train/val/test sets...")
# The test share is whatever the configured train ratio leaves over.
test_size = 1.0 - config.TRAIN_TEST_SPLIT_RATIO
val_size = 0.15

split_options = {"test_size": test_size, "val_size": val_size, "stratify": True}
train_df, val_df, test_df = preprocessor.split_data(df_eng, **split_options)

print("\n✓ Data split complete:")
for split_name, split_df in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"  {split_name}: {len(split_df):,} samples")

## 8. Train Model

In [None]:
# Run training and report how long it took.
banner = "=" * 60

print(banner)
print("Starting Model Training")
print(banner)

if not HYPERPARAMETER_TUNING_ENABLED:
    print("Hyperparameter tuning: DISABLED")
    print("Using default hyperparameters")
else:
    print("Hyperparameter tuning: ENABLED")
    print(f"  Iterations: {HYPERPARAMETER_TUNING_N_ITER}")
    print(f"  CV folds: {HYPERPARAMETER_TUNING_CV}")
    print("\n⚠ This will take several hours for large datasets...")

print("\nStarting training...")
start_time = datetime.now()

training_results = model_trainer.train_model(
    hyperparameter_tuning=HYPERPARAMETER_TUNING_ENABLED
)

end_time = datetime.now()
elapsed = end_time - start_time
# Wall-clock duration, converted from seconds to hours.
training_duration = elapsed.total_seconds() / 3600

print("\n" + banner)
print("Training Complete!")
print(banner)
print(f"Training time: {training_duration:.2f} hours")
for split_label, result_key in (
    ("Training", 'training_samples'),
    ("Validation", 'validation_samples'),
    ("Test", 'test_samples'),
):
    print(f"{split_label} samples: {training_results[result_key]:,}")

## 9. Evaluate Model

In [None]:
# Display evaluation metrics
test_metrics = training_results['test_metrics']

print("="*60)
print("Test Set Evaluation Metrics")
print("="*60)
print(f"Accuracy:  {test_metrics.get('accuracy', 0):.4f}")
print(f"Precision: {test_metrics.get('precision', 0):.4f}")
print(f"Recall:    {test_metrics.get('recall', 0):.4f}")
print(f"F1-Score:  {test_metrics.get('f1_score', 0):.4f}")
print(f"ROC-AUC:   {test_metrics.get('roc_auc', 0):.4f}")

# Display confusion matrix.
# The matrix is normalised to an ndarray first: it may arrive as a nested list
# or as a NumPy array, and truth-testing a multi-element array (`if cm:`)
# raises ValueError. A missing or None entry becomes an empty array.
raw_confusion_matrix = test_metrics.get('confusion_matrix')
cm = np.asarray([] if raw_confusion_matrix is None else raw_confusion_matrix)

if cm.shape == (2, 2):
    # Binary case: rows are the true class, columns the predicted class.
    true_neg, false_pos, false_neg, true_pos = cm.ravel().tolist()
    print("\nConfusion Matrix:")
    print(f"  True Negatives (Benign):  {true_neg:,}")
    print(f"  False Positives:          {false_pos:,}")
    print(f"  False Negatives:          {false_neg:,}")
    print(f"  True Positives (Malicious): {true_pos:,}")
elif cm.size:
    # Not a 2x2 matrix: the TN/FP/FN/TP labels would be wrong, so show it raw.
    print("\nConfusion Matrix:")
    print(cm)

# Display hyperparameters if tuning was performed
if training_results.get('hyperparameters'):
    print("\nBest Hyperparameters:")
    for param, value in training_results['hyperparameters'].items():
        print(f"  {param}: {value}")

print("="*60)

## 10. Save Model

In [None]:
# Persist the trained model and report where it went and how big it is.
model_filename = 'classification_model.pkl'
print(f"Saving model to {model_filename}...")

# save_model returns the path of the written file; it is used for the size check below.
model_path = model_trainer.save_model(model_filename)

# Convert the on-disk size from bytes to megabytes.
bytes_per_mb = 1024 * 1024
file_size_mb = os.path.getsize(model_path) / bytes_per_mb

print(
    "\n✓ Model saved successfully",
    f"File: {model_path}",
    f"Size: {file_size_mb:.2f} MB",
    sep="\n",
)

## 11. Download Model

In [None]:
# Download model file
from google.colab import files

print("Downloading model file...")
# Download from the path returned by save_model() rather than the bare file
# name: the bare name only resolves if the model was written to the current
# working directory. str() covers the case where the path is a pathlib.Path.
files.download(str(model_path))

print("\n✓ Model download initiated")
print("\nAfter downloading, replace your local classification_model.pkl file")
print("in the backend/ directory with this downloaded file.")

## 12. Optional: Save to Google Drive

In [None]:
# Optional: Save model to Google Drive
# Uncomment and run this cell if you want to save to Drive

# from google.colab import drive
# drive.mount('/content/drive')
# 
# import shutil
# drive_path = '/content/drive/MyDrive/IDS_Model'
# os.makedirs(drive_path, exist_ok=True)
# 
# shutil.copy(model_filename, drive_path)
# print(f"✓ Model saved to Google Drive: {drive_path}/{model_filename}")

## Summary

If every cell above ran without errors, training is complete and the model file has been downloaded.

### Next Steps:
1. Replace `backend/classification_model.pkl` with the downloaded file
2. Restart your Flask backend to load the new model
3. Test the model using the Analysis page in the frontend

### Notes:
- Training time depends on dataset size and hyperparameter tuning settings
- For 8M+ samples with hyperparameter tuning, expect 4-8 hours
- Colab free tier has 12-hour session limits
- Consider using Colab Pro for longer training sessions