# Model Experiments — AQI Predictor

Interactive model comparison, hyperparameter tuning, and residual analysis.

In [None]:
import sys
# Put the parent directory at the front of the import path. This has to run
# before the `src.*` imports below (presumably the notebook lives one level
# under the project root — confirm).
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Project-local helpers: data access, evaluation utilities and shared config.
from src.feature_pipeline.feature_store import get_training_data
from src.training_pipeline.evaluate import (
    compute_metrics, compare_models, plot_predictions_vs_actual
)
from src.config import RANDOM_STATE, TEST_SIZE

# Use the 'seaborn-v0_8-whitegrid' style sheet for every matplotlib figure below.
# NOTE(review): the 'seaborn-v0_8-*' style names require matplotlib >= 3.6 — confirm
# the pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
print('Libraries loaded!')

In [None]:
# Fetch the feature matrix and target (called with use_hopsworks=False).
X, y = get_training_data(use_hopsworks=False)
print(f'Features: {X.shape}, Target: {y.shape}')

# Positional hold-out without shuffling: the leading (1 - TEST_SIZE) share of
# rows is used for training and the remaining tail for testing.
# NOTE(review): this is only a time-based split if the rows arrive in
# chronological order — confirm against get_training_data.
split_idx = int(len(X) * (1 - TEST_SIZE))
train_rows = slice(None, split_idx)
test_rows = slice(split_idx, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]
print(f'Train: {len(X_train)}, Test: {len(X_test)}')

## 1. Ridge Regression

In [None]:
from src.training_pipeline.models.ridge_model import (
    train as train_ridge,
    predict as predict_ridge,
)

# Pull the raw arrays out of the pandas objects once, up front.
X_train_arr, y_train_arr = X_train.values, y_train.values
X_test_arr, y_test_arr = X_test.values, y_test.values

# Fit on the training rows (tune=True), predict the held-out rows, then score.
ridge_model = train_ridge(X_train_arr, y_train_arr, tune=True)
ridge_pred = predict_ridge(ridge_model, X_test_arr)
ridge_metrics = compute_metrics(y_test_arr, ridge_pred)
print(f'Ridge: {ridge_metrics}')

## 2. Random Forest

In [None]:
from src.training_pipeline.models.random_forest import (
    train as train_rf,
    predict as predict_rf,
)

# Pull the raw arrays out of the pandas objects once, up front.
X_train_arr, y_train_arr = X_train.values, y_train.values
X_test_arr, y_test_arr = X_test.values, y_test.values

# Fit on the training rows (tune=False), predict the held-out rows, then score.
rf_model = train_rf(X_train_arr, y_train_arr, tune=False)
rf_pred = predict_rf(rf_model, X_test_arr)
rf_metrics = compute_metrics(y_test_arr, rf_pred)
print(f'Random Forest: {rf_metrics}')

## 3. XGBoost

In [None]:
from src.training_pipeline.models.xgboost_model import train as train_xgb, predict as predict_xgb

# Carve the validation set out of the tail of the *training* window.
# The test rows must never reach the trainer: whatever it does with
# X_val / y_val (presumably early stopping — confirm in xgboost_model.train)
# would otherwise be fitted to the very rows the metrics below are computed
# on, giving XGBoost an unfair edge over Ridge / Random Forest.
# Trade-off: XGBoost is fitted on slightly fewer rows than the other models.
val_idx = int(len(X_train) * (1 - TEST_SIZE))
X_fit, X_val = X_train.iloc[:val_idx], X_train.iloc[val_idx:]
y_fit, y_val = y_train.iloc[:val_idx], y_train.iloc[val_idx:]

xgb_model = train_xgb(X_fit.values, y_fit.values,
                      X_val=X_val.values, y_val=y_val.values, tune=False)

# The test set is touched only here, for the final out-of-sample evaluation.
xgb_pred = predict_xgb(xgb_model, X_test.values)
xgb_metrics = compute_metrics(y_test.values, xgb_pred)
print(f'XGBoost: {xgb_metrics}')

## 4. Model Comparison

In [None]:
# Pair each display name with the metrics computed for that model; insertion
# order (Ridge, Random Forest, XGBoost) is preserved.
model_names = ('Ridge Regression', 'Random Forest', 'XGBoost')
model_metrics = (ridge_metrics, rf_metrics, xgb_metrics)
results = dict(zip(model_names, model_metrics))

# Build the comparison object and leave it as the cell's last expression so
# the notebook renders it.
comparison = compare_models(results)
comparison

In [None]:
# Predicted-vs-actual scatter, one panel per model, each with a dashed red
# y = x reference line spanning the observed range of the test target.
preds = {'Ridge': ridge_pred, 'Random Forest': rf_pred, 'XGBoost': xgb_pred}
actual = y_test.values
diag = [y_test.min(), y_test.max()]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (name, pred) in zip(axes, preds.items()):
    ax.scatter(actual, pred, alpha=0.4, s=10)
    ax.plot(diag, diag, 'r--')
    ax.set_xlabel('Actual AQI')
    ax.set_ylabel('Predicted AQI')
    ax.set_title(name)

plt.tight_layout()
plt.show()

## 5. Residual Analysis

In [None]:
from scipy import stats

# Residuals for best model (lowest RMSE in `results`).
best_name = min(results, key=lambda k: results[k]['rmse'])

# Look predictions up by the exact names used as keys in `results`.
# Matching on the first word of the name fails for 'Random Forest'
# ('Random' is not a key anywhere) and would silently plot XGBoost's
# residuals under a Random Forest title.
preds_by_model = {
    'Ridge Regression': ridge_pred,
    'Random Forest': rf_pred,
    'XGBoost': xgb_pred,
}
best_pred = preds_by_model[best_name]  # KeyError here means the two dicts drifted apart

residuals = y_test.values - best_pred

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Panel 1: histogram of the residuals.
axes[0].hist(residuals, bins=40, color='steelblue', edgecolor='white')
axes[0].set_title(f'{best_name} — Residual Distribution')
axes[0].set_xlabel('Residual')

# Panel 2: residuals against predictions, with a zero reference line.
axes[1].scatter(best_pred, residuals, alpha=0.4, s=10)
axes[1].axhline(0, color='red', linestyle='--')
axes[1].set_title('Residuals vs Predicted')
axes[1].set_xlabel('Predicted AQI')
axes[1].set_ylabel('Residual')

# Panel 3: probability plot of the residuals (probplot's default reference
# distribution is the normal).
stats.probplot(residuals, plot=axes[2])
axes[2].set_title('Q-Q Plot')

plt.tight_layout()
plt.show()