In [7]:
# build_tcn_data_future_multihorizon_missteps.ipynb
%run J_preprocess_shared_UPDATED.ipynb  # UPDATED: adds optional KIN acc-mag RMS features (Pelvis/T8/Head)

import os
import numpy as np
import pandas as pd
from tqdm import tqdm

# ========================= USER CONFIG =========================
# Absolute input locations used by main():
#   MISSTEP_XLSX -> read by load_missteps_from_excel
#   XDF_ROOT_CF / XDF_ROOT_SD -> forwarded to PreprocConfig as xdf_root_cf / xdf_root_sd
#   B_CSV_PATH -> passed to compute_keep_files and build_mos_dict
MISSTEP_XLSX = r"R:\Research Projects\NASA_Full\Other side projects\859 Class project\MateoCode\video frames1.xlsx"
XDF_ROOT_CF  = r"R:\Research Projects\NASA_Full\NASA Data Extraction_CF\Input Data\xdf_RawData"
XDF_ROOT_SD  = r"R:\Research Projects\NASA_Full\NASA Data Extraction_SD\Input Data\xdf_RawData"
B_CSV_PATH   = r"R:\Research Projects\NASA_Full\Other side projects\859 Class project\MateoCode\all_b_values_all_xdf.csv"

# The output NPZ is written into the current working directory of the kernel,
# not next to the input files.
SCRIPT_DIR = os.getcwd()
NPZ_OUT = os.path.join(SCRIPT_DIR, "tcn_dataset_b_emg_com_future_multihorizon_missteps.npz")


# ========================= FEATURE SELECTION (ONE PLACE) =========================
# Use EXACT feature names that appear in the feature_names printout.
# These three values are passed to select_features() from resample_and_label_events().
FEATURE_DROP = ["b_m"]   # names removed from the feature matrix; e.g. ["ACCmag_Pelvis", "ACCmag_T8", "ACCmag_Head"] to drop the KIN features
FEATURE_KEEP = None # or a list of names to keep only these, in this order; when set, FEATURE_DROP is ignored

# Safety
FEATURE_SELECTION_STRICT = True  # if True, raise when a requested feature name isn't present

# Windowing / horizons
# build_sequences() converts these to sample counts with int(round(sec * cfg.target_fs)).
# The stride is additionally clamped to at least 1 sample, so a STRIDE_SEC shorter than
# one sample period still advances one sample per window.
INPUT_WINDOW_SEC = 0.10
STRIDE_SEC = 0.01
HORIZONS_SEC = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]

# Shared preproc params (forwarded unchanged into PreprocConfig below)
TARGET_FS = 50.0
MIN_TRIAL_DURATION_SEC = 5.0
B_CM_ABS_THRESH = 100.0
OUTLIER_RATIO_MAX = 0.1
EVENT_TOL_PRE_SEC = 0.05
EVENT_TOL_POST_SEC = 0.10

# Streams (must match inference)
EMG_STREAM_NAME = "EMG"
COM_STREAM_NAME = "CenterOfMass1"

# IMPORTANT:
# - No toggles here. This notebook follows cfg.include_kin_acc_features automatically.
# - Set include_kin_acc_features in cfg below to True/False depending on what you want saved.

# Seeds NumPy's global RNG. No random draw is visible in this cell;
# NOTE(review): confirm whether helpers from the %run notebook rely on it.
SEED = 42
np.random.seed(SEED)

# Single configuration object handed to every shared preprocessing helper.
cfg = PreprocConfig(
    xdf_root_cf=XDF_ROOT_CF,
    xdf_root_sd=XDF_ROOT_SD,
    emg_stream_name=EMG_STREAM_NAME,
    com_stream_name=COM_STREAM_NAME,

    # ---- KIN settings live here (NOT as notebook toggles) ----
    # If True -> saves ACCmag_Pelvis/T8/Head (16 features)
    # If False -> old behavior (13 features)
    # (feature counts are before FEATURE_DROP / FEATURE_KEEP are applied)
    kin_stream_name="LinearSegmentKinematicsDatagram1",
    include_kin_acc_features=True,
    kin_strict=True,

    target_fs=TARGET_FS,
    min_trial_duration_sec=MIN_TRIAL_DURATION_SEC,
    b_cm_abs_thresh=B_CM_ABS_THRESH,
    outlier_ratio_max=OUTLIER_RATIO_MAX,
    event_tol_pre_sec=EVENT_TOL_PRE_SEC,
    event_tol_post_sec=EVENT_TOL_POST_SEC,
)

def select_features(feat: np.ndarray, feature_names: list, keep=None, drop=None, strict=True):
    """
    Pick a subset of feature columns by name.

    feat: (T, C) array whose columns line up with feature_names
    feature_names: list[str] length C (entries are converted with str())
    keep: list[str] or None; explicit ordered whitelist. When given, `drop` is not consulted.
    drop: list[str] or None; blacklist applied in the original column order
    strict: when True, a requested name that is absent raises RuntimeError;
            when False, absent names are skipped silently
    Returns: feat_sel, feature_names_sel
             (with keep=None and drop=None the input array is returned as-is)
    """
    names = [str(entry) for entry in feature_names]
    column_of = {name: col for col, name in enumerate(names)}

    if keep is not None:
        # Whitelist path: output follows the order of `keep`.
        absent = [name for name in keep if name not in column_of]
        if strict and absent:
            raise RuntimeError(f"FEATURE_KEEP contains names not present: {absent}\nAvailable: {names}")
        columns = [column_of[name] for name in keep if name in column_of]
    elif drop is None:
        # Nothing requested: hand the data back untouched.
        return feat, names
    else:
        # Blacklist path: output keeps the original column order.
        absent = [name for name in drop if name not in column_of]
        if strict and absent:
            raise RuntimeError(f"FEATURE_DROP contains names not present: {absent}\nAvailable: {names}")
        banned = {name for name in drop if name in column_of}
        columns = [col for col, name in enumerate(names) if name not in banned]

    return feat[:, columns], [names[col] for col in columns]




# ========================= builder-specific helpers =========================
def resample_and_label_events(
    mos_df, emg_df, com_df, video_df, missteps, trial_key, xdf_path, cfg: PreprocConfig
):
    """
    Build the uniform-grid feature matrix for one trial and mark misstep samples.

    The KIN frame is loaded from xdf_path only when cfg.include_kin_acc_features
    is truthy. Feature selection (FEATURE_KEEP / FEATURE_DROP /
    FEATURE_SELECTION_STRICT) is applied here via select_features.
    trial_key is accepted for interface symmetry but is not used in this function.

    Returns:
      feat: (T,C) float32 on uniform grid
      t_uniform: (T,)
      label_u: (T,) int8 where 1 indicates misstep band
      feature_names: list[str] (C)
    All four are None when build_uniform_features returns no time grid.
    """
    # --- KIN follows cfg.include_kin_acc_features automatically ---
    wants_kin = getattr(cfg, "include_kin_acc_features", False)
    kin_df = load_kin_df(xdf_path, cfg) if wants_kin else None

    t_uniform, feat_df, feature_names = build_uniform_features(mos_df, emg_df, com_df, cfg, kin_df=kin_df)
    if t_uniform is None:
        return None, None, None, None

    # Everything except the timestamp column becomes the (T,C) matrix.
    all_feat = feat_df.drop(columns=["timestamp"]).values.astype(np.float32)

    # ---- APPLY FEATURE SELECTION HERE (ONE PLACE) ----
    feat, feature_names = select_features(
        all_feat,
        feature_names,
        keep=FEATURE_KEEP,
        drop=FEATURE_DROP,
        strict=FEATURE_SELECTION_STRICT,
    )

    intervals = map_frames_to_event_intervals(video_df, mos_ts0=float(t_uniform[0]), missteps=missteps, cfg=cfg)

    label_u = np.zeros(len(t_uniform), dtype=np.int8)
    for (start, stop, _kind) in intervals:
        # Tolerate intervals delivered in reversed order.
        lo, hi = (stop, start) if stop < start else (start, stop)
        label_u[(t_uniform >= lo) & (t_uniform <= hi)] = 1

    return feat, t_uniform, label_u, feature_names


def build_sequences(trial_keys, trial_paths, mos_dict, emg_dict, com_dict, video_dict, misstep_dict, cfg: PreprocConfig):
    """
    Slice every usable trial into fixed-length input windows with multi-horizon targets.

    For a window covering samples [s, e) with last = e - 1:
      - X:            feat[s:e]
      - Y[h]:         b_cm (interpolated onto the uniform grid) at last + steps_h
      - labels_cur:   1 if any sample inside the window carries the misstep label
      - labels_multi[h]: 1 if any sample in (last, last + steps_h] carries the label
    where steps_h = round(HORIZONS_SEC[h] * cfg.target_fs). A window is emitted
    only while the farthest horizon target still lies inside the trial.

    A trial is skipped when its MoS/EMG/COM/video frame or XDF path is missing,
    or when resample_and_label_events returns no features.

    Returns:
      X (N,L,C) float32, Y (N,H) float32, labels_cur (N,) int8,
      labels_multi (N,H) int8, trial_ids (N,) object, feature_names list[str]
    Raises:
      ValueError: INPUT_WINDOW_SEC rounds to fewer than one sample at cfg.target_fs
      RuntimeError: feature names differ between trials, or no window was produced
    """
    X_all, Y_all = [], []
    L_cur_all, L_multi_all = [], []
    TRIAL_ID_all = []

    feature_names_global = None

    input_len = int(round(INPUT_WINDOW_SEC * cfg.target_fs))
    if input_len < 1:
        # An empty window would otherwise surface as an opaque NumPy
        # "zero-size array" error deep inside the loop.
        raise ValueError(
            f"INPUT_WINDOW_SEC={INPUT_WINDOW_SEC} is shorter than one sample at target_fs={cfg.target_fs}"
        )
    stride = max(1, int(round(STRIDE_SEC * cfg.target_fs)))
    max_future = int(round(max(HORIZONS_SEC) * cfg.target_fs))
    # Horizon offsets in samples; identical for every window, so computed once.
    horizon_steps = np.array(
        [int(round(h_sec * cfg.target_fs)) for h_sec in HORIZONS_SEC], dtype=np.intp
    )

    for key in tqdm(trial_keys, desc="Building windows"):
        mos_df = mos_dict.get(key)
        emg_df = emg_dict.get(key)
        com_df = com_dict.get(key)
        video_df = video_dict.get(key)
        missteps = misstep_dict.get(key, [])

        if mos_df is None or emg_df is None or com_df is None or video_df is None:
            continue

        xdf_path = trial_paths.get(key, None)
        if not xdf_path:
            continue

        feat, t_uniform, label_u, feature_names = resample_and_label_events(
            mos_df, emg_df, com_df, video_df, missteps, key, xdf_path, cfg
        )
        if feat is None:
            continue

        # Enforce global feature consistency across trials (critical!)
        if feature_names_global is None:
            feature_names_global = feature_names
        elif list(feature_names) != list(feature_names_global):
            raise RuntimeError(
                f"Feature name mismatch in trial {key}\nExpected: {feature_names_global}\nGot:      {feature_names}"
            )

        # Regression target source: b_cm resampled onto the uniform feature grid.
        bcm = np.interp(t_uniform, mos_df["timestamp"].values, mos_df["b_cm"].values).astype(np.float32)
        T = len(t_uniform)

        # Largest start index whose farthest horizon target (last + max_future)
        # is still a valid sample index, i.e. last + max_future <= T - 1.
        last_start = T - input_len - max_future

        for s in range(0, last_start + 1, stride):
            e = s + input_len
            last = e - 1

            # One b_cm target per horizon.
            y_vec = bcm[last + horizon_steps]

            # Future label per horizon: any misstep sample strictly after the
            # window and up to (inclusive) the horizon sample. The slice cannot
            # run past the trial because last + steps <= last + max_future < T.
            lbl_vec = np.array(
                [
                    int(label_u[last + 1:last + 1 + int(steps)].max()) if steps >= 1 else 0
                    for steps in horizon_steps
                ],
                dtype=np.int8,
            )

            X_all.append(feat[s:e].astype(np.float32))  # (L,C) copy
            Y_all.append(y_vec)
            L_cur_all.append(int(label_u[s:e].max()))
            L_multi_all.append(lbl_vec)
            TRIAL_ID_all.append(key)

    if len(X_all) == 0:
        raise RuntimeError("No windows created. Check paths/streams/Excel list.")

    X_all = np.array(X_all, dtype=np.float32)
    Y_all = np.array(Y_all, dtype=np.float32)
    labels_cur = np.array(L_cur_all, dtype=np.int8)
    labels_multi = np.stack(L_multi_all, axis=0).astype(np.int8)
    trial_ids = np.array(TRIAL_ID_all, dtype=object)

    return X_all, Y_all, labels_cur, labels_multi, trial_ids, feature_names_global


# ========================= MAIN =========================
def main():
    """
    Build the multi-horizon misstep dataset end to end and write it to NPZ_OUT.

    Steps: read the misstep workbook, locate each trial's XDF file, apply the
    b_cm outlier filter, build the MoS dict, load EMG/COM/video streams, keep
    only trials that have all of them, window the data with build_sequences,
    then save arrays plus provenance metadata with np.savez.

    Raises RuntimeError as soon as any stage leaves no trials to work with.
    """
    misstep_dict, trial_list = load_missteps_from_excel(MISSTEP_XLSX)
    if len(trial_list) == 0:
        raise RuntimeError(f"No trials in Excel: {MISSTEP_XLSX}")

    # Resolve XDF paths; trials without a file on disk are left out.
    located = ((trial, find_xdf_path(trial, cfg)) for trial in trial_list)
    trial_paths = {trial: path for trial, path in located if path}
    valid_trials = sorted(trial_paths)
    if not valid_trials:
        raise RuntimeError("None of the Excel trials found in CF/SD roots.")

    print(f"[INFO] Trials in Excel: {len(trial_list)}")
    print(f"[INFO] Trials found on disk: {len(valid_trials)}")

    # Outlier filter by b_cm
    keep_files = compute_keep_files(B_CSV_PATH, cfg)
    valid_trials = [name for name in valid_trials if name in keep_files]
    print(f"[INFO] Trials after b_cm outlier filtering: {len(valid_trials)}")
    if not valid_trials:
        raise RuntimeError("All trials filtered out by b_cm outlier rule.")

    # MoS dict
    mos_dict = build_mos_dict(B_CSV_PATH, valid_trials, cfg)
    if len(mos_dict) == 0:
        raise RuntimeError("No MoS rows found for selected trials.")

    # Load streams; a trial is recorded only when all three streams are present.
    emg_dict, com_dict, video_dict = {}, {}, {}
    for trial in tqdm(valid_trials, desc="Loading XDF"):
        path = trial_paths[trial]
        streams = (
            load_stream_df(path, cfg.emg_stream_name, "EMG_ch"),
            load_stream_df(path, cfg.com_stream_name, "COM_ch"),
            load_video_df(path, cfg),
        )
        if any(stream is None for stream in streams):
            continue

        raw_emg, com_df, vid_df = streams
        emg_dict[trial] = clean_emg_envelope(raw_emg, cfg)
        com_dict[trial] = com_df
        video_dict[trial] = vid_df

    final_trials = sorted(
        set(valid_trials).intersection(mos_dict, emg_dict, com_dict, video_dict)
    )
    print(f"[INFO] Trials with MoS+EMG+COM+video: {len(final_trials)}")
    if not final_trials:
        raise RuntimeError("No trials have complete MoS/EMG/COM/video data.")

    X, Y, labels_cur, labels_multi, trial_ids, feature_names = build_sequences(
        final_trials, trial_paths, mos_dict, emg_dict, com_dict, video_dict, misstep_dict, cfg
    )

    print(f"[INFO] X.shape = {X.shape} (N,L,C)")
    print(f"[INFO] labels_multi.shape = {labels_multi.shape} (N,H)")
    print(f"[INFO] feature_names (C={len(feature_names)}): {feature_names}")

    # Arrays first, then the settings and paths needed to reproduce the build.
    payload = {
        "X": X,
        "Y": Y,
        "labels_cur": labels_cur,
        "labels_multi": labels_multi,
        "horizons": np.array(HORIZONS_SEC, dtype=np.float32),
        "feature_names": np.array(feature_names, dtype=object),
        "target_fs": float(cfg.target_fs),
        "input_window_sec": float(INPUT_WINDOW_SEC),
        "stride_sec": float(STRIDE_SEC),
        "event_tol_pre_sec": float(cfg.event_tol_pre_sec),
        "event_tol_post_sec": float(cfg.event_tol_post_sec),
        "misstep_xlsx": str(MISSTEP_XLSX),
        "b_csv_path": str(B_CSV_PATH),
        "xdf_root_cf": str(cfg.xdf_root_cf),
        "xdf_root_sd": str(cfg.xdf_root_sd),
        "trial_ids": trial_ids,
    }
    np.savez(NPZ_OUT, **payload)

    print(f"[SAVED] {NPZ_OUT}")

# Run the full dataset build when this cell is executed.
main()


[OK] preprocess_shared loaded.
[INFO] Trials in Excel: 19
[INFO] Trials found on disk: 19
[INFO] Trials after b_cm outlier filtering: 19


Stream 4: Calculated effective sampling rate 88.9661 Hz is different from specified rate 50.0000 Hz.
Stream 12: Calculated effective sampling rate 199.0655 Hz is different from specified rate 100.0000 Hz.
Stream 23: Calculated effective sampling rate 7.7475 Hz is different from specified rate 1024.0000 Hz.
Stream 4: Calculated effective sampling rate 88.9661 Hz is different from specified rate 50.0000 Hz.
Stream 12: Calculated effective sampling rate 199.0655 Hz is different from specified rate 100.0000 Hz.
Stream 23: Calculated effective sampling rate 7.7475 Hz is different from specified rate 1024.0000 Hz.
Stream 4: Calculated effective sampling rate 88.9661 Hz is different from specified rate 50.0000 Hz.
Stream 12: Calculated effective sampling rate 199.0655 Hz is different from specified rate 100.0000 Hz.
Stream 23: Calculated effective sampling rate 7.7475 Hz is different from specified rate 1024.0000 Hz.
Stream 7: Calculated effective sampling rate 199.1575 Hz is different from s

[INFO] Trials with MoS+EMG+COM+video: 19


Stream 4: Calculated effective sampling rate 88.9661 Hz is different from specified rate 50.0000 Hz.
Stream 12: Calculated effective sampling rate 199.0655 Hz is different from specified rate 100.0000 Hz.
Stream 23: Calculated effective sampling rate 7.7475 Hz is different from specified rate 1024.0000 Hz.
Stream 7: Calculated effective sampling rate 199.1575 Hz is different from specified rate 100.0000 Hz.
Stream 19: Calculated effective sampling rate 89.6179 Hz is different from specified rate 50.0000 Hz.
Stream 22: Calculated effective sampling rate 7.6587 Hz is different from specified rate 1024.0000 Hz.
Stream 18: Calculated effective sampling rate 89.4609 Hz is different from specified rate 50.0000 Hz.
Stream 7: Calculated effective sampling rate 199.1426 Hz is different from specified rate 100.0000 Hz.
Stream 21: Calculated effective sampling rate 7.6745 Hz is different from specified rate 1024.0000 Hz.
Stream 11: Calculated effective sampling rate 89.6929 Hz is different from s

[INFO] X.shape = (11367, 5, 15) (N,L,C)
[INFO] labels_multi.shape = (11367, 10) (N,H)
[INFO] feature_names (C=15): ['b_cm', 'EMG_ch1_env', 'EMG_ch2_env', 'EMG_ch3_env', 'EMG_ch4_env', 'EMG_ch5_env', 'EMG_ch6_env', 'EMG_ch7_env', 'EMG_ch8_env', 'ACCmag_Pelvis', 'ACCmag_T8', 'ACCmag_Head', 'COM_ch1', 'COM_ch2', 'COM_ch3']
[SAVED] R:\Research Projects\NASA_Full\Other side projects\859 Class project\MateoCode\tcn_dataset_b_emg_com_future_multihorizon_missteps.npz





In [6]:
import os
import numpy as np
import pandas as pd

# Export the windowed NPZ dataset into Excel/CSV-friendly files:
# metadata.csv, horizons.csv, window_labels.csv, one CSV per window, and an
# example workbook with the first few windows.

# ----------------------------
# Paths
# ----------------------------
NPZ_PATH = "tcn_dataset_b_emg_com_future_multihorizon_missteps.npz"

EXPORT_ROOT = "npz_export"
WINDOW_CSV_DIR = os.path.join(EXPORT_ROOT, "windows_csv")

os.makedirs(EXPORT_ROOT, exist_ok=True)
os.makedirs(WINDOW_CSV_DIR, exist_ok=True)

# ----------------------------
# Load NPZ
# ----------------------------
# allow_pickle=True is needed because the builder stores feature_names and
# trial_ids as object arrays.
# NOTE(review): unpickling can execute arbitrary code; only load NPZ files
# produced by this project.
# The `with` block closes the archive's file handle once every array has been
# read into memory; `data` must not be indexed after this block.
with np.load(NPZ_PATH, allow_pickle=True) as data:
    X = data["X"]
    labels_cur = data["labels_cur"]
    labels_multi = data["labels_multi"]
    horizons = data["horizons"]
    target_fs = float(data["target_fs"])
    input_window_sec = float(data["input_window_sec"])
    trial_ids = data["trial_ids"]
    feature_names = data["feature_names"]

N, L, C = X.shape
H = labels_multi.shape[1]
feature_cols = list(feature_names)

# Fail before writing anything if column names cannot label X's last axis.
if len(feature_cols) != C:
    raise ValueError(
        f"feature_names has {len(feature_cols)} entries but X has {C} feature columns"
    )

print(f"Loaded NPZ: N={N}, L={L}, C={C}, H={H}")


def _window_frame(idx):
    """Return window `idx` as a DataFrame with a leading sample_idx column."""
    frame = pd.DataFrame(X[idx], columns=feature_cols)
    frame.insert(0, "sample_idx", range(L))
    return frame


# ----------------------------
# 1) Save metadata
# ----------------------------
metadata_df = pd.DataFrame({
    "target_fs": [target_fs],
    "input_window_sec": [input_window_sec],
    "num_windows": [N],
    "window_length_samples": [L],
    "num_features": [C],
    "num_horizons": [H]
})

metadata_df.to_csv(os.path.join(EXPORT_ROOT, "metadata.csv"), index=False)

# ----------------------------
# 2) Save horizons
# ----------------------------
horizons_df = pd.DataFrame({
    "horizon_index": range(H),
    "horizon_sec": horizons
})

horizons_df.to_csv(os.path.join(EXPORT_ROOT, "horizons.csv"), index=False)

# ----------------------------
# 3) Save window-level labels (Excel-friendly)
# ----------------------------
labels_df = pd.DataFrame({
    "window_idx": range(N),
    "trial_id": trial_ids,      # ties every window back to its source trial
    "label_cur": labels_cur
})

# One 0/1 column per prediction horizon.
for i, h in enumerate(horizons):
    labels_df[f"unstable_within_{h:.2f}s"] = labels_multi[:, i]

labels_df.to_csv(os.path.join(EXPORT_ROOT, "window_labels.csv"), index=False)

# ----------------------------
# 4) Save ALL windows as individual CSVs
# ----------------------------
for i in range(N):
    out_path = os.path.join(WINDOW_CSV_DIR, f"window_{i:05d}.csv")
    _window_frame(i).to_csv(out_path, index=False)

print(f"Saved {N} window CSV files.")

# ----------------------------
# 5) OPTIONAL: Excel workbook with a few example windows
# ----------------------------
EXAMPLE_EXCEL = os.path.join(EXPORT_ROOT, "example_windows.xlsx")

n_examples = min(10, N)  # change if you want more
with pd.ExcelWriter(EXAMPLE_EXCEL, engine="openpyxl") as writer:
    metadata_df.to_excel(writer, sheet_name="metadata", index=False)
    horizons_df.to_excel(writer, sheet_name="horizons", index=False)
    labels_df.head(500).to_excel(writer, sheet_name="labels_preview", index=False)

    for i in range(n_examples):
        _window_frame(i).to_excel(writer, sheet_name=f"window_{i}", index=False)

print("Export complete.")


Loaded NPZ: N=789, L=5, C=15, H=10
Saved 789 window CSV files.
Export complete.
