In [10]:
import os
import os
import pickle
import numpy as np
import pandas as pd
from glob import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# ─── 1) Train on variable‐length segments ──────────────────────────────────
# Load your pre‐extracted features for variable‐length segments
df_var = pd.read_csv('./features_csvs/features_dataset_without_duration.csv')
X_var = df_var.drop(columns=['modality','video_id','segment','participant','label'])
y_var = df_var['label']

# Train a Random Forest on ALL variable‐length data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_var, y_var)

print("✅ Trained on variable‐length segments (n_samples = {})".format(len(df_var)))

# ─── 2) Prepare fixed‐window test sets ────────────────────────────────────
window_sizes = [10, 20, 50]  # in seconds
fps          = 25               # annotation frame rate
ann_csv      = './annotations_csvs/annotations_with_audio.csv'
pkls_dir     = './pkls'         # directory containing pid.pkl files

filtered_cols = ["accelX_filtered", "accelY_filtered", "accelZ_filtered"]

# Load union‐voted annotations for With_Audio
ann_df = pd.read_csv(ann_csv)
ann_df = ann_df.sort_values(['video_id','segment','participant','frame'])

# Pre‐load signals
signals = {}
for pid in ann_df['participant'].unique():
    pkl_path = os.path.join(pkls_dir, f"{pid}.pkl")
    if os.path.exists(pkl_path):
        signals[pid] = pickle.load(open(pkl_path,'rb'))

# ─── 3. FEATURE EXTRACTION FUNCTION ────────────────────────────────────────
def extract_features_segment(df_sig, start_t, end_t):
    seg = df_sig[(df_sig["time"] >= start_t) & (df_sig["time"] <= end_t)]
    X = seg[filtered_cols].values
    feats = {}
    # time-domain stats per axis
    for i, col in enumerate(filtered_cols):
        arr = seg[col].values
        feats[f"{col}_mean"]   = arr.mean()   if len(arr)>0 else 0
        feats[f"{col}_var"]    = arr.var()    if len(arr)>0 else 0
        feats[f"{col}_energy"] = (arr**2).sum() if len(arr)>0 else 0
    # derivatives
    if X.shape[0] > 1:
        d = np.diff(X, axis=0)
        for i, col in enumerate(filtered_cols):
            darr = d[:,i]
            feats[f"{col}_deriv_mean"] = darr.mean()
            feats[f"{col}_deriv_std"]  = darr.std()
    else:
        for col in filtered_cols:
            feats[f"{col}_deriv_mean"] = 0
            feats[f"{col}_deriv_std"]  = 0
    # SMA
    feats["SMA"] = np.abs(X).sum()
    # inter-axis correlations
    if X.shape[0] > 1:
        feats["corr_xy"] = np.corrcoef(X[:,0], X[:,1])[0,1]
        feats["corr_xz"] = np.corrcoef(X[:,0], X[:,2])[0,1]
        feats["corr_yz"] = np.corrcoef(X[:,1], X[:,2])[0,1]
    else:
        feats.update({"corr_xy":0, "corr_xz":0, "corr_yz":0})
    # duration
    # feats["duration"] = end_t - start_t
    return feats

# ─── 3) Evaluate on each fixed window size ────────────────────────────────
results = []
for w in window_sizes:
    frames_per_win = int(w * fps)
    test_recs = []
    print(f"Window size: {w}")

    # group by video, segment, participant
    for (vid, seg, pid), grp in ann_df.groupby(['video_id','segment','participant']):
        if pid not in signals:
            continue
        timeline = grp['label'].values
        num_wins = int(np.ceil(len(timeline) / frames_per_win))
        df_sig   = signals[pid]

        step = int(5 * fps)   # avanza 5 s
        for start_f in range(0, len(timeline), step):
            end_f = start_f + frames_per_win
            slice_lab = timeline[start_f:end_f]
            # etiqueta Union Vote
            int(slice_lab.mean() >= 0.3)
            start_t = start_f / fps
            end_t   = end_f   / fps
            feats = extract_features_segment(df_sig, start_t, end_t)
            test_recs.append({**feats, 'label': win_label})

    df_test = pd.DataFrame(test_recs)
    X_test  = df_test.drop(columns=['label'])
    y_test  = df_test['label']

    # Predict and compute metrics
    y_pred   = clf.predict(X_test)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall    = recall_score(y_test, y_pred, zero_division=0)
    f1        = f1_score(y_test, y_pred, zero_division=0)
    acc       = accuracy_score(y_test, y_pred)

    results.append({
        'window_s':      w,
        'n_test_windows': len(df_test),
        'precision':     precision,
        'recall':        recall,
        'f1_macro':      f1,
        'accuracy':      acc
    })
    print(f"→ Window {w}s: precision={precision:.3f}, recall={recall:.3f}, f1={f1:.3f}, acc={acc:.3f}")

# 4) Summary table
res_df = pd.DataFrame(results)
print("\nPerformance by Window Size:")
print(res_df)


✅ Trained on variable‐length segments (n_samples = 36935)
Window size: 10
→ Window 10s: precision=0.000, recall=0.000, f1=0.000, acc=0.988
Window size: 20
→ Window 20s: precision=0.000, recall=0.000, f1=0.000, acc=1.000
Window size: 50
→ Window 50s: precision=0.000, recall=0.000, f1=0.000, acc=1.000

Performance by Window Size:
   window_s  n_test_windows  precision  recall  f1_macro  accuracy
0        10           63312        0.0     0.0       0.0  0.987885
1        20           63312        0.0     0.0       0.0  1.000000
2        50           63312        0.0     0.0       0.0  1.000000


In [None]:
"""✅ Trained on variable‐length segments (n_samples = 36935)
Window size: 1.0
→ Window 1.0s: precision=0.037, recall=0.999, f1=0.071, acc=0.038
Window size: 3.0
→ Window 3.0s: precision=0.033, recall=0.969, f1=0.064, acc=0.073
Window size: 5.0
→ Window 5.0s: precision=0.030, recall=0.581, f1=0.057, acc=0.448

Performance by Window Size:
   window_s  n_test_windows  precision    recall  f1_macro  accuracy
0       1.0          313488   0.037068  0.998968  0.071484  0.037647
1       3.0          104976   0.033287  0.968750  0.064363  0.072750
2       5.0           63312   0.030008  0.581319  0.057070  0.447798


Performance by Window Size:
   window_s  n_test_windows  precision    recall  f1_macro  accuracy
0        10           63312   0.179922  0.015504  0.028548  0.851655
1        20           63312   0.000000  0.000000  0.000000  0.787181
2        50           63312   0.000000  0.000000  0.000000  0.645991
"""