# 02 Model Experiments

Compare multiple regression algorithms on the biohacking dataset, using a held-out test split and cross-validation.

## 1. Imports and Data Loading

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from scripts.model_experiments import (
    cross_validate_model,
    evaluate_model,
    get_experiment_models,
    load_experiment_data,
    train_model,
)

# Load the features (X) and target (y) through the project helper.
X, y = load_experiment_data()

# Reserve 20% of the rows as a test split; the fixed seed keeps the
# split reproducible between notebook runs.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Cell output: shapes of the full (unsplit) features and target.
(X.shape, y.shape)

## 2. Train, Evaluate, and Cross-Validate All 3 Models

In [None]:
# Fit each candidate on the training split, score it on the test split,
# and gather cross-validation scores. The three collections filled here
# feed the comparison tables in the following cells.
models = get_experiment_models(random_state=42)
trained_models, metrics_rows, cv_rows = {}, [], []

for name, model in models.items():
    # Keep whatever train_model returns so the fitted model can be reused.
    trained_model = train_model(model, X_train, y_train)
    trained_models[name] = trained_model

    # evaluate_model's result is unpacked into the row, so it must be a
    # mapping of metric name -> value.
    metrics = evaluate_model(trained_model, X_test, y_test)
    metrics_row = {"Model": name, **metrics}
    metrics_rows.append(metrics_row)

    # NOTE(review): cross-validation receives the full X, y (which includes
    # the rows held out in X_test) and `model` rather than `trained_model`.
    # Whether `model` is already fitted here depends on train_model —
    # confirm this is intended.
    cv_scores = cross_validate_model(model, X, y, cv=5)
    cv_summary = {
        "Model": name,
        "CV Mean R2": cv_scores.mean(),
        "CV Std R2": cv_scores.std(),
        "Folds": len(cv_scores),
    }
    cv_rows.append(cv_summary)

# Cell output: names of the models that were trained.
list(trained_models)

## 3. Cross-Validation Results (Table)

In [None]:
# One row per model, ordered from highest to lowest mean CV R2.
cv_results_df = pd.DataFrame(cv_rows)
cv_results_df = cv_results_df.sort_values(by="CV Mean R2", ascending=False)

# Cell output: a renumbered copy for display only; cv_results_df itself
# keeps the index it had after sorting.
cv_results_df.reset_index(drop=True)

## 4. Test-Set Metrics Comparison (MSE, MAE, R²)

In [None]:
# One row per model with the metrics collected in metrics_rows.
metrics_df = pd.DataFrame(metrics_rows)

# Upper-case the metric column labels for presentation.
# NOTE(review): assumes evaluate_model returns the keys "mse", "mae" and
# "r2" — confirm against scripts.model_experiments.
metrics_df = metrics_df.rename(columns={"mse": "MSE", "mae": "MAE", "r2": "R2"})

# Best R2 first.
metrics_df = metrics_df.sort_values(by="R2", ascending=False)

# Cell output: a renumbered copy for display only; metrics_df itself keeps
# the index it had after sorting.
metrics_df.reset_index(drop=True)

## 5. Model Selection Rationale

Select the model with the strongest combination of:
- Highest test-set R²
- Lowest MSE and MAE
- Stable cross-validation performance (higher mean R² with lower standard deviation)

In practice, prefer the model that balances predictive accuracy and generalization rather than only maximizing a single test split metric.