# Bayesian Hyperparameter Optimization

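This notebook demonstrates systematic hyperparameter tuning using Bayesian optimization with Gaussian Processes. Because each new trial is chosen using the results of the earlier ones, this approach typically needs fewer model evaluations than grid search or random search to find good settings.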

In [None]:
import sys
from pathlib import Path

# Make the project root importable before the project-local import below
sys.path.insert(0, '../')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from models.bayesian_optimizer import BayesianHyperparameterOptimizer

# Fix NumPy's global RNG so stochastic steps that draw from it repeat
# across fresh-kernel runs
np.random.seed(42)

# Notebook banner: title line followed by a 70-character rule
print("Bayesian Optimization for Hyperparameter Tuning", "=" * 70, sep="\n")

## 1. Generate Synthetic Training Data

Create a classification dataset representative of audio features.

In [None]:
# Generate synthetic dataset (representative of audio features):
# 2000 samples, 20 features (15 informative + 5 redundant), 8 classes
X, y = make_classification(
    n_samples=2000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=8,
    random_state=42
)

# Train-test split on the RAW features first, so the held-out rows never
# contribute to the preprocessing statistics. Stratifying on y keeps the
# class proportions the same in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize features: learn mean/variance from the training rows only,
# then apply that same transform to the test rows (avoids test-set leakage)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaling; kept so any later cell
# that still reads `X_scaled` continues to work
X_scaled = scaler.transform(X)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Classes: {len(np.unique(y_train))}")

## 2. Optimize Random Forest Hyperparameters

Use Bayesian optimization to find optimal Random Forest parameters.

In [None]:
# Build the Random Forest tuner on the training split only.
# NOTE(review): cv_folds=5 presumably selects 5-fold cross-validation as the
# scoring scheme; confirm in models.bayesian_optimizer.
optimizer_rf = BayesianHyperparameterOptimizer(
    X_train,
    y_train,
    model_type='random_forest',
    cv_folds=5,
)

# Section banner (blank line, rule, title, rule, blank line)
print(
    "\n" + "=" * 70,
    "OPTIMIZING RANDOM FOREST HYPERPARAMETERS",
    "=" * 70 + "\n",
    sep="\n",
)

# Run the search. The result is used below as a mapping of parameter name to
# value. NOTE(review): n_init_points / n_iter look like "initial probes" and
# "guided iterations" respectively; verify against the optimizer.
best_rf_params = optimizer_rf.optimize_random_forest(n_init_points=5, n_iter=15)

# Report the selected configuration, one parameter per line
print("\nBest Random Forest Parameters:")
for param_name, param_value in best_rf_params.items():
    print(f"  {param_name}: {param_value}")

## 3. Optimize Gradient Boosting Hyperparameters

Apply Bayesian optimization to the Gradient Boosting model.

In [None]:
# Build the Gradient Boosting tuner on the training split only.
# NOTE(review): cv_folds=5 presumably selects 5-fold cross-validation as the
# scoring scheme; confirm in models.bayesian_optimizer.
optimizer_gb = BayesianHyperparameterOptimizer(
    X_train,
    y_train,
    model_type='gradient_boosting',
    cv_folds=5,
)

# Section banner (blank line, rule, title, rule, blank line)
print(
    "\n" + "=" * 70,
    "OPTIMIZING GRADIENT BOOSTING HYPERPARAMETERS",
    "=" * 70 + "\n",
    sep="\n",
)

# Run the search. The result is used below as a mapping of parameter name to
# value. NOTE(review): n_init_points / n_iter look like "initial probes" and
# "guided iterations" respectively; verify against the optimizer.
best_gb_params = optimizer_gb.optimize_gradient_boosting(n_init_points=5, n_iter=15)

# Report the selected configuration, one parameter per line
print("\nBest Gradient Boosting Parameters:")
for param_name, param_value in best_gb_params.items():
    print(f"  {param_name}: {param_value}")

## 4. Visualize Optimization History

Plot the Bayesian optimization process.

In [None]:
# Show the search history of each tuner, announcing which one is being drawn.
# The plotting itself is delegated to the optimizer objects.
for announcement, tuner in (
    ("\nVisualizing Random Forest optimization history...", optimizer_rf),
    ("\nVisualizing Gradient Boosting optimization history...", optimizer_gb),
):
    print(announcement)
    tuner.plot_optimization_history()

## 5. Train Final Models with Optimized Parameters

Train models using the optimized hyperparameters.

In [None]:
# Stage banner (blank line, rule, title, rule)
print(
    "\n" + "=" * 70,
    "TRAINING FINAL MODELS WITH OPTIMIZED PARAMETERS",
    "=" * 70,
    sep="\n",
)

# --- Random Forest ---------------------------------------------------------
# Refit on the whole training split with the tuned settings, then score once
# on the held-out test split.
rf_final = RandomForestClassifier(**best_rf_params).fit(X_train, y_train)
rf_pred = rf_final.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_pred)
# 'weighted' averages the per-class F1 scores by class support
rf_f1 = f1_score(y_test, rf_pred, average='weighted')

print("\nRandom Forest (Optimized):")
print(f"  Accuracy:   {rf_accuracy:.4f}")
print(f"  F1-Score:   {rf_f1:.4f}")

# --- Gradient Boosting -----------------------------------------------------
# Same procedure with the Gradient Boosting settings.
gb_final = GradientBoostingClassifier(**best_gb_params).fit(X_train, y_train)
gb_pred = gb_final.predict(X_test)

gb_accuracy = accuracy_score(y_test, gb_pred)
gb_f1 = f1_score(y_test, gb_pred, average='weighted')

print("\nGradient Boosting (Optimized):")
print(f"  Accuracy:   {gb_accuracy:.4f}")
print(f"  F1-Score:   {gb_f1:.4f}")

## 6. Performance Summary

Compare optimized models and visualize results.

In [None]:
# Performance comparison: one row per tuned model, scored on the test split
results_df = pd.DataFrame({
    'Model': ['Random Forest', 'Gradient Boosting'],
    'Accuracy': [
        accuracy_score(y_test, rf_pred),
        accuracy_score(y_test, gb_pred)
    ],
    'F1-Score': [
        f1_score(y_test, rf_pred, average='weighted'),
        f1_score(y_test, gb_pred, average='weighted')
    ]
})

print("\n" + "="*70)
print("OPTIMIZED MODEL PERFORMANCE")
print("="*70)
print(results_df.to_string(index=False))

# Visualization: grouped bars, accuracy and F1 side by side for each model
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(results_df))
width = 0.35

# Offset each series by half a bar width so the pair is centred on its tick
ax.bar(x - width/2, results_df['Accuracy'], width, label='Accuracy', alpha=0.8)
ax.bar(x + width/2, results_df['F1-Score'], width, label='F1-Score', alpha=0.8)

ax.set_ylabel('Score')
ax.set_title('Optimized Model Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(results_df['Model'])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
# Both metrics are bounded by [0, 1]; pin the axis so figures are comparable
ax.set_ylim([0, 1.0])

plt.tight_layout()

# Ensure the output directory exists; on a fresh checkout savefig would
# otherwise fail with FileNotFoundError
output_path = Path('../outputs/optimized_model_comparison.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

# Only the comparison figure is written to disk in this notebook (no model is
# persisted), so report exactly that
print(f"\nOptimization complete! Comparison plot saved to {output_path}")