# Train and Apply Models

In [1]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd
import math, re, itertools
from ML.model_training import random_train_test_split, omit_patient_video, train_random_forest, train_knn
from ML import utils
import sys
from IPython.display import clear_output

## Split Training and Testing Data

In [2]:
X_train, X_test, arousal_train, arousal_test = omit_patient_video(target="arousal")
# X_train, X_test, arousal_train, arousal_test = random_train_test_split(target="arousal")

Generate all subsets of columns for parameters.

In [4]:
subsets = utils.generate_all_subsets(X_train.columns)
# subsets = [utils.generate_one_subset(X_train.columns)]

seen = set()
unique = []
for s in subsets:
    key = tuple(s)
    if key not in seen:
        seen.add(key)
        unique.append(s)
print(unique)


print(f"Generated {len(unique)} unique")

[['AF3_delta', 'AF3_theta', 'AF3_alpha', 'AF3_gamma', 'F7_delta', 'F7_theta', 'F7_alpha', 'F7_gamma', 'F3_delta', 'F3_theta', 'F3_alpha', 'F3_gamma', 'FC5_delta', 'FC5_theta', 'FC5_alpha', 'FC5_gamma', 'T7_delta', 'T7_theta', 'T7_alpha', 'T7_gamma', 'P7_delta', 'P7_theta', 'P7_alpha', 'P7_gamma', 'O1_delta', 'O1_theta', 'O1_alpha', 'O1_gamma', 'O2_delta', 'O2_theta', 'O2_alpha', 'O2_gamma', 'P8_delta', 'P8_theta', 'P8_alpha', 'P8_gamma', 'T8_delta', 'T8_theta', 'T8_alpha', 'T8_gamma', 'FC6_delta', 'FC6_theta', 'FC6_alpha', 'FC6_gamma', 'F4_delta', 'F4_theta', 'F4_alpha', 'F4_gamma', 'F8_delta', 'F8_theta', 'F8_alpha', 'F8_gamma', 'AF4_delta', 'AF4_theta', 'AF4_alpha', 'AF4_gamma', 'AF3_entropy', 'F7_entropy', 'F3_entropy', 'FC5_entropy', 'T7_entropy', 'P7_entropy', 'O1_entropy', 'O2_entropy', 'P8_entropy', 'T8_entropy', 'FC6_entropy', 'F4_entropy', 'F8_entropy', 'AF4_entropy', 'AF4_AF3_delta_da', 'AF4_AF3_delta_ra', 'AF4_AF3_gamma_da', 'AF4_AF3_gamma_ra', 'AF4_AF3_theta_da', 'AF4_AF3_t

## KNN Model

Train KNN to predict arousal.

In [5]:
best_model = None
best_mae = math.inf
best_keep = None
best_metrics = {"PCC": 0.0, "MSE": math.inf, "RMSE": math.inf, "MAE": math.inf, "size": 0}

best_n = 1
# n = 1

bar_len = 30
def safe_pcc(y_true, y_pred):
    yt = np.asarray(y_true, dtype=float).ravel()
    yp = np.asarray(y_pred, dtype=float).ravel()
    if yt.size < 2 or np.std(yt) == 0 or np.std(yp) == 0:
        return 0.0
    return float(np.corrcoef(yt, yp)[0, 1])

def render(bar_str: str, status_str: str, curr_mae):
    clear_output(wait=True)
    print(bar_str)
    print(curr_mae)
    print(status_str, end="")
    sys.stdout.flush()

status = "Best: size=0 | PCC=0.0000 | MSE=∞ | RMSE=∞ | MAE=∞"

start_at = 1

total_full = len(unique)
iter_unique = unique[start_at-1:]

for idx, keep in enumerate(iter_unique, start_at):
    filled = int(bar_len * idx / total_full) if total_full else 0
    bar = "█" * filled + "-" * (bar_len - filled)
    pct = (idx / total_full * 100) if total_full else 100
    bar_str = f"[{bar}] {idx}/{total_full} ({pct:5.1f}%)"

    X_train_sub = X_train.loc[:, keep]
    X_test_sub  = X_test.loc[:, keep]

    for n in [1, 11, 21, 51]:
        knn, X_test_eval, y_test_eval = train_knn(
            X_train_sub, X_test_sub, arousal_train, arousal_test, neighbors=n
        )

        arousal_pred = knn.predict(X_test_eval)
        mse  = mean_squared_error(y_test_eval, arousal_pred)
        rmse = math.sqrt(mse)
        mae  = mean_absolute_error(y_test_eval, arousal_pred)
        pcc  = safe_pcc(y_test_eval, arousal_pred)

        if mae < best_mae:
            best_mae   = mae
            best_model = knn
            best_keep  = keep
            best_n = n
            best_metrics = {"PCC": pcc, "MSE": mse, "RMSE": rmse, "MAE": mae, "size": len(keep)}
            status = (f"Best: index={idx} size={len(keep)} | "
                    f"PCC={pcc:.4f} | MSE={mse:.6f} | RMSE={rmse:.6f} | MAE={mae:.6f} | K={n}")

    render(bar_str, status, mae)
render(bar_str, status, mae)
# 40076!! low mse
# 40418!! low mae



[███████████████████-----------] 74/113 ( 65.5%)
0.38257542128669797
Best: index=14 size=84 | PCC=0.7604 | MSE=0.255260 | RMSE=0.505233 | MAE=0.166844 | K=1

KeyboardInterrupt: 

In [5]:

pcc_list   = []
pval_list  = []
r2_list    = []
mae_list   = []
mse_list   = []
rmse_list  = []
n_list     = []

best_cross_val_mae = math.inf

num_folds = 10

for fold in range(num_folds):
    X_train, X_test, arousal_train, arousal_test = omit_patient_video(
        target="arousal",
    )

    X_train_sub = X_train.loc[:, best_keep]
    X_test_sub  = X_test.loc[:, best_keep]

    knn, X_test_eval, y_test_eval = train_knn(
        X_train_sub,
        X_test_sub,
        arousal_train,
        arousal_test,
        neighbors=1,
    )

    y_pred = knn.predict(X_test_eval)

    r2   = r2_score(y_test_eval, y_pred)
    mae  = mean_absolute_error(y_test_eval, y_pred)
    mse  = mean_squared_error(y_test_eval, y_pred)
    rmse = float(np.sqrt(mse))

    try:
        from scipy.stats import pearsonr
        pcc, pval = pearsonr(
            np.asarray(y_test_eval).ravel(),
            np.asarray(y_pred).ravel()
        )
    except Exception:
        yt = np.asarray(y_test_eval, dtype=float).ravel()
        yp = np.asarray(y_pred, dtype=float).ravel()
        if yt.size < 2 or np.std(yt) == 0 or np.std(yp) == 0:
            pcc, pval = 0.0, float("nan")
        else:
            pcc = float(np.corrcoef(yt, yp)[0, 1])
            pval = float("nan")

    n_used = getattr(knn, "n_neighbors", None)
    if n_used is None:
        if "best_n" in globals():
            n_used = best_n
        else:
            n_used = "N/A"

    pcc_list.append(float(pcc))
    pval_list.append(float(pval))
    r2_list.append(float(r2))
    mae_list.append(float(mae))
    mse_list.append(float(mse))
    rmse_list.append(float(rmse))
    n_list.append(n_used)

avg_pcc  = float(np.mean(pcc_list))
avg_pval = float(np.mean(pval_list))
avg_r2   = float(np.mean(r2_list))
avg_mae  = float(np.mean(mae_list))
avg_mse  = float(np.mean(mse_list))
avg_rmse = float(np.mean(rmse_list))

if len(n_list) > 0:
    final_n = n_list[0]
else:
    final_n = "N/A"

print("KNN Regression Performance (10 random splits)")
print("--------------------------------------------")
print(f"N_Neighbors: {final_n}")
print(f"PCC:  {avg_pcc:.4f} (avg p={avg_pval:.3g})")
print(f"R²:   {avg_r2:.4f}")
print(f"MAE:  {avg_mae:.6f}")
print(f"MSE:  {avg_mse:.6f}")
print(f"RMSE: {avg_rmse:.6f}")

print("\nBest feature subset")
print("-------------------")
print(f"Count: {len(best_keep)}")
wrap = 4
for i in range(0, len(best_keep), wrap):
    print(", ".join(best_keep[i:i+wrap]))


KNN Regression Performance (10 random splits)
--------------------------------------------
N_Neighbors: 1
PCC:  0.5277 (avg p=5.26e-146)
R²:   0.0641
MAE:  0.386979
MSE:  0.539028
RMSE: 0.730691

Best feature subset
-------------------
Count: 84
AF3_theta, AF3_alpha, AF3_gamma, F7_theta
F7_alpha, F7_gamma, F3_theta, F3_alpha
F3_gamma, FC5_theta, FC5_alpha, FC5_gamma
T7_theta, T7_alpha, T7_gamma, P7_theta
P7_alpha, P7_gamma, O1_theta, O1_alpha
O1_gamma, O2_theta, O2_alpha, O2_gamma
P8_theta, P8_alpha, P8_gamma, T8_theta
T8_alpha, T8_gamma, FC6_theta, FC6_alpha
FC6_gamma, F4_theta, F4_alpha, F4_gamma
F8_theta, F8_alpha, F8_gamma, AF4_theta
AF4_alpha, AF4_gamma, AF3_theta_entropy, AF3_alpha_entropy
AF3_gamma_entropy, F7_theta_entropy, F7_alpha_entropy, F7_gamma_entropy
F3_theta_entropy, F3_alpha_entropy, F3_gamma_entropy, FC5_theta_entropy
FC5_alpha_entropy, FC5_gamma_entropy, T7_theta_entropy, T7_alpha_entropy
T7_gamma_entropy, P7_theta_entropy, P7_alpha_entropy, P7_gamma_entropy
O1_thet

### Train RF Regressor

In [None]:
best_model = None
best_mae = math.inf
best_metrics = {
    "PCC": 0.0,
    "MSE": math.inf,
    "RMSE": math.inf,
    "MAE": math.inf,
    "size": 0,
}

best_n = 1
bar_len = 30


def safe_pcc(y_true, y_pred):
    yt = np.asarray(y_true, dtype=float).ravel()
    yp = np.asarray(y_pred, dtype=float).ravel()
    if yt.size < 2 or np.std(yt) == 0 or np.std(yp) == 0:
        return 0.0
    return float(np.corrcoef(yt, yp)[0, 1])


def render(bar_str: str, status_str: str):
    clear_output(wait=True)
    print(bar_str)
    print(status_str, end="")
    sys.stdout.flush()


status = "Best: size=0 | PCC=0.0000 | MSE=∞ | RMSE=∞ | MAE=∞"

print(best_keep)
keep = best_keep
X_train_sub = X_train.loc[:, keep]
X_test_sub = X_test.loc[:, keep]


rf, X_test_eval, y_test_eval = train_random_forest(
    X_train_sub, X_test_sub, arousal_train, arousal_test, estimators=100, max_depth=8
)

arousal_pred = rf.predict(X_test_eval)
mse = mean_squared_error(y_test_eval, arousal_pred)
rmse = math.sqrt(mse)
mae = mean_absolute_error(y_test_eval, arousal_pred)
pcc = safe_pcc(y_test_eval, arousal_pred)

if mae < best_mae:
    best_mae = mae
    best_model = rf
    best_keep = keep
    best_metrics = {"PCC": pcc, "MSE": mse, "RMSE": rmse, "MAE": mae, "size": len(keep)}
    status = (
        f"Best: index={idx} size={len(keep)} | "
        f"PCC={pcc:.4f} | MSE={mse:.6f} | RMSE={rmse:.6f} | MAE={mae:.6f}"
    )

    render(bar_str, status)

None


KeyError: None