# Train and Apply Models

In [1]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd
import math, re, itertools
from ML.model_training import omit_patient_video, train_random_forest, train_knn
from ML import utils
import sys
from IPython.display import clear_output

## Split Training and Testing Data

In [2]:
X_train, X_test, arousal_train, arousal_test = omit_patient_video(target="arousal")

Generate all subsets of columns for parameters.

In [3]:
subsets = utils.generate_all_subsets(X_train.columns)

seen = set()
unique = []
for s in subsets:
    key = tuple(s)
    if key not in seen:
        seen.add(key)
        unique.append(s)


print(f"Generated {len(unique)} unique")

Generated 59535 unique


## KNN Model

Train KNN to predict arousal.

In [None]:
best_model = None
best_mse = math.inf
best_keep = None
best_metrics = {"PCC": 0.0, "MSE": math.inf, "RMSE": math.inf, "MAE": math.inf, "size": 0}

best_n = 1
n = 21

bar_len = 30
def safe_pcc(y_true, y_pred):
    yt = np.asarray(y_true, dtype=float).ravel()
    yp = np.asarray(y_pred, dtype=float).ravel()
    if yt.size < 2 or np.std(yt) == 0 or np.std(yp) == 0:
        return 0.0
    return float(np.corrcoef(yt, yp)[0, 1])

def render(bar_str: str, status_str: str):
    clear_output(wait=True)
    print(bar_str)
    print(status_str, end="")
    sys.stdout.flush()

status = "Best: size=0 | PCC=0.0000 | MSE=∞ | RMSE=∞ | MAE=∞"

start_at = 1

total_full = len(unique)
iter_unique = unique[start_at-1:]

for idx, keep in enumerate(iter_unique, start_at):
    filled = int(bar_len * idx / total_full) if total_full else 0
    bar = "█" * filled + "-" * (bar_len - filled)
    pct = (idx / total_full * 100) if total_full else 100
    bar_str = f"[{bar}] {idx}/{total_full} ({pct:5.1f}%)"

    X_train_sub = X_train.loc[:, keep]
    X_test_sub  = X_test.loc[:, keep]

    knn, X_test_eval, y_test_eval = train_knn(
        X_train_sub, X_test_sub, arousal_train, arousal_test, neighbors=n
    )

    arousal_pred = knn.predict(X_test_eval)
    mse  = mean_squared_error(y_test_eval, arousal_pred)
    rmse = math.sqrt(mse)
    mae  = mean_absolute_error(y_test_eval, arousal_pred)
    pcc  = safe_pcc(y_test_eval, arousal_pred)

    if mse < best_mse:
        best_mse   = mse
        best_model = knn
        best_keep  = keep
        best_metrics = {"PCC": pcc, "MSE": mse, "RMSE": rmse, "MAE": mae, "size": len(keep)}
        status = (f"Best: size={len(keep)} | "
                  f"PCC={pcc:.4f} | MSE={mse:.6f} | RMSE={rmse:.6f} | MAE={mae:.6f}")

    render(bar_str, status)
render(bar_str, status)



[████████████------------------] 24076/59535 ( 40.4%)
Best: size=64 | PCC=0.8739 | MSE=0.137217 | RMSE=0.370428 | MAE=0.184614

In [10]:
y_pred = best_model.predict(X_test.loc[:, best_keep])

# --- metrics ---
r2   = r2_score(y_test_eval, y_pred)
mae  = mean_absolute_error(y_test_eval, y_pred)
mse  = mean_squared_error(y_test_eval, y_pred)
rmse = float(np.sqrt(mse))

# Pearson (with safe fallback if SciPy isn't available)
try:
    from scipy.stats import pearsonr
    pcc, pval = pearsonr(np.asarray(y_test_eval).ravel(), np.asarray(y_pred).ravel())
except Exception:
    yt = np.asarray(y_test_eval, dtype=float).ravel()
    yp = np.asarray(y_pred, dtype=float).ravel()
    if yt.size < 2 or np.std(yt) == 0 or np.std(yp) == 0:
        pcc, pval = 0.0, float("nan")
    else:
        pcc = float(np.corrcoef(yt, yp)[0, 1])
        pval = float("nan")

n_used = getattr(best_model, "n_neighbors", None)
if n_used is None:
    n_used = best_n if "best_n" in globals() else "N/A"

print("KNN Regression Performance")
print("--------------------------")
print(f"N_Neighbors: {n_used}")
print(f"PCC:  {pcc:.4f} (p={pval:.3g})")
print(f"R²:   {r2:.4f}")
print(f"MAE:  {mae:.6f}")
print(f"MSE:  {mse:.6f}")
print(f"RMSE: {rmse:.6f}")

print("\nBest feature subset")
print("-------------------")
print(f"Count: {len(best_keep)}")
wrap = 4 
for i in range(0, len(best_keep), wrap):
    print(", ".join(best_keep[i:i+wrap]))

importances = getattr(best_model, "feature_importances_", None)
if importances is not None:
    top = min(10, len(importances))
    order = np.argsort(importances)[::-1][:top]
    print("\nTop feature importances:")
    for i in order:
        print(f"{X_test.columns[i]}: {importances[i]:.4f}")
else:
    # KNN doesn't expose importances; mention permutation importance as an option
    print("\nNote: This estimator does not expose `feature_importances_` "
          "(typical for KNN). Consider permutation importance if you need ranks.")


KNN Regression Performance
--------------------------
N_Neighbors: 21
PCC:  0.7715 (p=0)
R²:   0.5578
MAE:  0.318917
MSE:  0.248130
RMSE: 0.498126

Best feature subset
-------------------
Count: 48
F7_alpha_activity, F7_alpha_mobility, F7_betaL_activity, F7_betaL_mobility
F7_betaH_activity, F7_betaH_mobility, F7_gamma_activity, F7_gamma_mobility
F3_alpha_activity, F3_alpha_mobility, F3_betaL_activity, F3_betaL_mobility
F3_betaH_activity, F3_betaH_mobility, F3_gamma_activity, F3_gamma_mobility
F4_alpha_activity, F4_alpha_mobility, F4_betaL_activity, F4_betaL_mobility
F4_betaH_activity, F4_betaH_mobility, F4_gamma_activity, F4_gamma_mobility
F8_alpha_activity, F8_alpha_mobility, F8_betaL_activity, F8_betaL_mobility
F8_betaH_activity, F8_betaH_mobility, F8_gamma_activity, F8_gamma_mobility
F7_alpha_entropy, F7_betaL_entropy, F7_betaH_entropy, F7_gamma_entropy
F3_alpha_entropy, F3_betaL_entropy, F3_betaH_entropy, F3_gamma_entropy
F4_alpha_entropy, F4_betaL_entropy, F4_betaH_entropy, F4_gam

### Train RF Regressor

In [None]:
rf, X_test, y_test = train_random_forest(X_train, X_test, arousal_train, arousal_test)