# Regression example: predict RPM from audio+encoder recordings

This notebook demonstrates how to build a regression dataset from CSVs produced by `audio_and_encoder.record_snippet` (columns: `time_seconds`, `audio_signal`, `rpm`) and train a baseline RPM regressor using the `regression_orchestrator.py` helper.

Objectives:
- Build train/test feature matrices using the same rich feature extractor used by the pipeline.
- Compute labels as the average RPM over each time window.
- Train a simple baseline regressor with scaling and cross-validation.
- Evaluate performance and visualize predictions vs ground truth.

Run cells sequentially. If you edited `regression_orchestrator.py`, re-run the import cell to pick up changes.

In [None]:
# Standard imports
import importlib.util
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme(style='whitegrid')
%matplotlib inline

In [None]:
# Dynamically import the regression orchestrator (keeps the notebook flexible).
# Search order: everything below the current working directory first; if nothing is
# found there and an enclosing directory contains a `.git` entry, that repository
# root is searched as well, so the notebook also works when run from a sub-folder.
HELPER_FILENAME = 'regression_orchestrator.py'
HELPER_MODULE_NAME = 'regression_orch'

search_roots = [Path.cwd()]
repo_root = next((p for p in Path.cwd().parents if (p / '.git').exists()), None)
if repo_root is not None:
    search_roots.append(repo_root)

candidates = []
for root in search_roots:
    # rglob yields matches in filesystem order; sort (shallowest path first, then
    # alphabetically) so the same file is picked on every run and every machine.
    candidates = sorted(root.rglob(HELPER_FILENAME), key=lambda p: (len(p.parts), str(p)))
    if candidates:
        break
if not candidates:
    raise FileNotFoundError('Could not locate regression_orchestrator.py in the repository. Ensure the file is present under Feature_extraction_pipeline.')

reg_path = candidates[0].resolve()
spec = importlib.util.spec_from_file_location(HELPER_MODULE_NAME, str(reg_path))
if spec is None or spec.loader is None:
    raise ImportError(f'Could not build an import spec for {reg_path}')
regmod = importlib.util.module_from_spec(spec)

# Register the module before executing it (as in the importlib "import a source file
# directly" recipe) so code in the helper that looks its own module up in sys.modules
# keeps working. Re-running this cell replaces the entry with a freshly loaded module.
sys.modules[HELPER_MODULE_NAME] = regmod
try:
    spec.loader.exec_module(regmod)
except BaseException:
    # Do not leave a half-initialised module registered if the helper fails to load.
    sys.modules.pop(HELPER_MODULE_NAME, None)
    raise

run_regression_on_dataset = regmod.run_regression_on_dataset
print(f'Imported regression_orchestrator OK from {reg_path}')

In [None]:
# Assemble the regression dataset from the recordings under `data_root`
# (point `data_root` at another folder if your data lives elsewhere).
data_root = Path('regression_data')

# Options forwarded unchanged to the orchestrator helper.
dataset_options = dict(
    segment_seconds=10.0,
    overlap=0.5,
    train_fraction=0.8,
    buffer_seconds=0.5,
    feature_level='standard',
)
res = run_regression_on_dataset(data_root, **dataset_options)

# Unpack the train split: features are used as returned, labels become a float array.
train_split = res['train']
X_train = train_split['X']
y_train = np.asarray(train_split['y'], dtype=float)

# Same treatment for the held-out split.
test_split = res['test']
X_test = test_split['X']
y_test = np.asarray(test_split['y'], dtype=float)

print('Train shape:', X_train.shape, 'Train samples:', y_train.shape[0])
print('Test shape :', X_test.shape, 'Test samples :', y_test.shape[0])

In [None]:
# Quick inspection of the label distribution: train labels on the left panel,
# test labels (when there are any) on the right panel.
if y_train.size:
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    sns.histplot(y_train, bins=30, kde=True, ax=ax[0])
    ax[0].set_title('Train RPM distribution')
    ax[0].set_xlabel('RPM')
    if y_test.size:
        sns.histplot(y_test, bins=30, kde=True, ax=ax[1], color='orange')
        ax[1].set_title('Test RPM distribution')
        ax[1].set_xlabel('RPM')
    else:
        # No test labels: hide the unused panel rather than showing an empty frame.
        ax[1].set_axis_off()
    plt.tight_layout()
else:
    # Name the directory that was actually read so the hint matches the dataset cell.
    print(f'No labels found in dataset. Ensure CSVs under {data_root} contain rpm values.')

In [None]:
# Baseline model: scaling + RandomForestRegressor with 5-fold CV (if enough samples)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Reset first: if this cell is re-run after the dataset became empty, a model fitted
# on earlier data must not stay in the namespace for the visualization cell to use.
model = None

if X_train.size == 0 or y_train.size == 0:
    print('No training data. Skipping model training.')
else:
    model = make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators=100, random_state=42))
    n_samples = y_train.shape[0]
    # KFold needs at least as many samples as folds, so skip CV on tiny datasets.
    # NOTE(review): the dataset cell requests overlapping windows (overlap=0.5). If
    # neighbouring windows share audio, shuffled K-fold can place near-duplicates in
    # different folds and make the CV score optimistic; consider grouping folds by
    # recording (e.g. GroupKFold) once group ids are available. Confirm against the
    # orchestrator's windowing.
    cv = KFold(n_splits=5, shuffle=True, random_state=42) if n_samples >= 5 else None
    if cv is not None:
        # The scorer returns negated MSE (higher is better); flip the sign for reporting.
        scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
        print('CV MSE (mean):', float(-np.mean(scores)), 'std:', float(np.std(scores)))
    else:
        print('Not enough samples for 5-fold CV; train on full set instead')
    # Fit on the full training split and evaluate on the held-out test split.
    model.fit(X_train, y_train)
    if X_test.size and y_test.size:
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f'Test MSE: {mse:.4f}, R2: {r2:.4f}')
    else:
        print('No test set available for evaluation')

In [None]:
# Visualization: predicted vs true values, residual distribution and (when the
# pipeline contains a RandomForest) the top feature importances.
# Runs only when a fitted-looking model and a non-empty test split are available.
if 'model' in globals() and hasattr(model, 'predict') and X_test.size and y_test.size:
    y_pred = model.predict(X_test)

    # Predicted vs True, with the identity line as the "perfect prediction" reference
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
    axs[0].scatter(y_test, y_pred, alpha=0.6)
    axs[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    axs[0].set_xlabel('True RPM')
    axs[0].set_ylabel('Predicted RPM')
    axs[0].set_title('Predicted vs True')

    # Residuals distribution
    residuals = y_test - y_pred
    sns.histplot(residuals, bins=30, kde=True, ax=axs[1])
    axs[1].set_xlabel('Residual (RPM)')
    axs[1].set_title('Residuals (y_true - y_pred)')
    plt.tight_layout()

    # Feature importances (if RandomForest is present in the pipeline).
    # Best-effort: any failure here is reported but must not hide the plots above.
    try:
        rf = None
        if hasattr(model, 'named_steps'):
            # make_pipeline names each step after its lower-cased class name
            rf = model.named_steps.get('randomforestregressor')
        if rf is None and hasattr(model, 'steps'):
            # Fall back to scanning the steps for a RandomForestRegressor instance
            for _, step_obj in model.steps:
                if step_obj.__class__.__name__ == 'RandomForestRegressor':
                    rf = step_obj
                    break

        if rf is not None:
            importances = rf.feature_importances_
            fnames = None
            if isinstance(res, dict) and isinstance(res.get('train'), dict):
                fnames = res['train'].get('feature_names')
            # feature_names may be a list, a NumPy array or similar: `not array` raises
            # for multi-element arrays, so normalise to a list and compare lengths.
            fnames = list(fnames) if fnames is not None else []
            if len(fnames) != importances.size:
                # Missing or mismatched names: use positional placeholders instead
                fnames = [f'feat{i}' for i in range(importances.size)]
            # Indices of the (up to) 20 largest importances, largest first
            idx = np.argsort(importances)[::-1][:20]
            fig, ax = plt.subplots(figsize=(8, 6))
            # barh draws bottom-up, so reverse to show the most important feature on top
            ax.barh([fnames[i] for i in idx[::-1]], importances[idx[::-1]])
            ax.set_xlabel('Importance')
            ax.set_title('Top feature importances (RandomForest)')
            plt.tight_layout()
        else:
            print('RandomForestRegressor not found in pipeline; skipping feature importances.')
    except Exception as e:
        print('Could not compute feature importances:', e)
else:
    print('Model not found or no test data available for visualization.')

## Teaching notes and extensions
- This example trains a basic RandomForest regressor. In a teaching setting, you can show the effect of the different feature levels (`basic`/`standard`/`advanced`) by re-running the dataset build cell with a different `feature_level`.
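- To use RPM as an input feature (for example, when predicting a different target), compute the average RPM per segment and append it as a column in `X` before training. Do not do this while RPM is still the target: the model would simply read the label from its input (label leakage).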
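- Try regression metrics beyond MSE, such as MAE or RMSE (both expressed in RPM), or train a quantile regressor and score it with the pinball loss for an evaluation that is more robust to outliers.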
- For time-series aware models, preserve temporal order and use rolling/sequence models (LSTM/TemporalConv).

Possible follow-ups: an additional notebook cell that shows hyperparameter tuning with `RandomizedSearchCV`, or a small Keras/TensorFlow example as a neural baseline.