In [13]:
# %% Copy keypoint CSVs grouped by (PD_or_C, turning_angle, type_of_turn)
import os
import shutil
from pathlib import Path
import pandas as pd

# ===== CONFIG (your exact paths) =====
# Absolute Windows paths, written as raw strings so the backslashes are taken literally.
# FEATURES_CSV: features table that is loaded with pd.read_csv further down.
FEATURES_CSV = r"D:\Courses\thesis\data\21h9f9e30v9cl2fapjggz4q1x7\Turning\Data\cleaned_turning_project_2d_features.csv"
# BASE_TURNING_DIR: root that is searched for <folder>/input_2D/keypoints.csv.
BASE_TURNING_DIR = r"D:\Courses\thesis\data\21h9f9e30v9cl2fapjggz4q1x7\Turning\Data\turning_2D3D_skeletons_coarsened\Turning_coarsen_CSV"
# OUTPUT_DIR: destination root for the grouped copies and the two manifest CSVs.
OUTPUT_DIR = r"D:\Courses\thesis\data\turning_keypoints_grouped"

# ===== Expected columns in FEATURES_CSV =====
# All five names are checked after loading; a missing one raises ValueError.
COL_TURN_ID = "Turn ID"                # last component of the source folder name
COL_SUBJ_ID = "Participant ID number"  # becomes the "Pt<ID>" part of the folder name
COL_PD_OR_C = "PD_or_C"                # part of the folder name and first grouping level
COL_ANGLE   = "turning_angle"          # second grouping level
COL_TURNTYP = "type_of_turn"           # third grouping level

# --- Load features table ---
df = pd.read_csv(FEATURES_CSV)

# Basic checks: fail before any file is touched if the table lacks a needed column
for col in [COL_TURN_ID, COL_SUBJ_ID, COL_PD_OR_C, COL_ANGLE, COL_TURNTYP]:
    if col not in df.columns:
        raise ValueError(f"Required column '{col}' not found in {FEATURES_CSV}")

# Keep only needed columns and drop rows missing essentials.
# The explicit .copy() yields an independent frame, so the column assignments
# below do not write into a slice of the loaded table.
df = df[[COL_TURN_ID, COL_SUBJ_ID, COL_PD_OR_C, COL_ANGLE, COL_TURNTYP]].dropna(
    subset=[COL_TURN_ID, COL_SUBJ_ID, COL_PD_OR_C]
).copy()


# Normalize strings
def norm(x):
    """Return *x* as a whitespace-stripped string; missing values become ""."""
    # str(nan) would produce the truthy text "nan", which defeats the later
    # `or "UNK"` fallback and ends up as a literal "nan" group directory.
    return "" if pd.isna(x) else str(x).strip()


df[COL_PD_OR_C] = df[COL_PD_OR_C].map(norm)
df[COL_ANGLE]   = df[COL_ANGLE].map(norm)
df[COL_TURNTYP] = df[COL_TURNTYP].map(norm)

# Safe int cast for IDs used in folder names
def to_int_safe(x):
    """Convert an ID-like value to ``int``, or return ``None`` if it cannot be.

    Accepts ints, floats and numeric strings such as ``"12"`` or ``"12.0"``.
    Fractional values are truncated toward zero. ``None``, NaN, infinities
    and non-numeric text yield ``None``.
    """
    # Parse ints and integer strings directly: going through float() would
    # silently change values above 2**53.
    if isinstance(x, (int, str)):
        try:
            return int(x)
        except ValueError:
            pass  # e.g. "12.0" or "abc" -- let the float path decide
    try:
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / non-numeric objects; ValueError: bad text or NaN;
        # OverflowError: +/- infinity.
        return None

# Derive the integer ID helper columns (turn first, then participant) that the
# folder-name builder reads.
for _int_col, _raw_col in (
    ("__turn_id_int", COL_TURN_ID),
    ("__subj_id_int", COL_SUBJ_ID),
):
    df[_int_col] = df[_raw_col].apply(to_int_safe)

# Build folder: Pt{ID}_{PD_or_C}_n_{turn_id}
def make_folder_name(row):
    """Return the source folder name ``Pt{ID}_{PD_or_C}_n_{turn_id}`` for *row*.

    Reads ``__subj_id_int``, ``__turn_id_int`` and the PD/C label from the
    row. Returns ``None`` when either ID is missing or the label is empty, so
    the caller can drop such rows.
    """
    pid = row["__subj_id_int"]
    tid = row["__turn_id_int"]
    lab = row[COL_PD_OR_C]
    # pd.isna covers None as well as NaN: pandas may store a column that mixes
    # ints and None as floats, turning the failed conversions into NaN.
    if pd.isna(pid) or pd.isna(tid) or not lab:
        return None
    # int() keeps the name free of a ".0" suffix if the IDs arrive as floats.
    return f"Pt{int(pid)}_{lab}_n_{int(tid)}"

# One folder name per row; rows for which no name could be built are excluded.
df["__folder"] = df.apply(make_folder_name, axis=1)
df_valid = df[df["__folder"].notna()].copy()

# Switch the configured path strings to Path objects and make sure the
# output root exists before anything is copied.
OUTPUT_DIR = Path(OUTPUT_DIR)
BASE_TURNING_DIR = Path(BASE_TURNING_DIR)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Audit records: one dict per copied file / per file that could not be copied.
copied = []
missing = []

def safe(s: str) -> str:
    """Return *s* as text usable as a single directory name.

    Each of the characters ``< > : " / \\ | ? *`` is replaced by ``_`` and
    surrounding whitespace is removed. If nothing is left, ``"UNK"`` is
    returned instead.
    """
    reserved = r'<>:"/\|?*'
    table = str.maketrans(reserved, "_" * len(reserved))
    cleaned = str(s).translate(table).strip()
    if not cleaned:
        return "UNK"
    return cleaned

# Copy each row's keypoints.csv into OUTPUT_DIR/<PD_or_C>/<angle>/<turn type>/,
# recording every success in `copied` and every failure in `missing`.
for _, row in df_valid.iterrows():
    folder = row["__folder"]
    src = BASE_TURNING_DIR / folder / "input_2D" / "keypoints.csv"

    # Empty labels fall back to "UNK" so every file lands in a named group.
    pd_or_c = row[COL_PD_OR_C] or "UNK"
    angle   = row[COL_ANGLE]   or "UNK"
    ttype   = row[COL_TURNTYP] or "UNK"

    # Fields shared by both manifests, built once per row.
    meta = {
        COL_PD_OR_C: pd_or_c, COL_ANGLE: angle, COL_TURNTYP: ttype,
        COL_SUBJ_ID: row[COL_SUBJ_ID], COL_TURN_ID: row[COL_TURN_ID],
    }

    if not src.exists():
        missing.append({"folder": folder, "src": str(src), "reason": "not_found", **meta})
        continue

    # Create the group directory only when there is a file to put in it, so
    # missing sources do not leave empty folders behind.
    dst_dir = OUTPUT_DIR / safe(pd_or_c) / safe(angle) / safe(ttype)
    dst_dir.mkdir(parents=True, exist_ok=True)
    dst = dst_dir / f"{folder}.csv"

    try:
        shutil.copy2(src, dst)
    except OSError as e:
        # A failed copy is recorded and the run continues with the next row.
        missing.append({"folder": folder, "src": str(src), "reason": f"copy_error: {e}", **meta})
    else:
        copied.append({"folder": folder, "src": str(src), "dst": str(dst), **meta})

# Save manifests for audit
# Explicit column lists keep the header row in each CSV even when the
# corresponding record list is empty (e.g. a run with nothing missing).
manifest_meta_cols = [COL_PD_OR_C, COL_ANGLE, COL_TURNTYP, COL_SUBJ_ID, COL_TURN_ID]
copied_df = pd.DataFrame(copied, columns=["folder", "src", "dst", *manifest_meta_cols])
missing_df = pd.DataFrame(missing, columns=["folder", "src", "reason", *manifest_meta_cols])
copied_csv  = OUTPUT_DIR / "_manifest_copied.csv"
missing_csv = OUTPUT_DIR / "_manifest_missing.csv"
copied_df.to_csv(copied_csv, index=False)
missing_df.to_csv(missing_csv, index=False)

print(f"Done. Copied: {len(copied)} | Missing: {len(missing)}")
print(f"Copied manifest : {copied_csv}")
print(f"Missing manifest: {missing_csv}")

if not copied_df.empty:
    # Largest (PD_or_C, angle, turn type) groups first, capped at 20 lines.
    print("\nTop groups by count:")
    print(copied_df.groupby([COL_PD_OR_C, COL_ANGLE, COL_TURNTYP]).size().sort_values(ascending=False).head(20))
else:
    print("\nNo files copied. Double-check paths and that folder names match 'Pt<ID>_<PD/C>_n_<turnID>'.")


Done. Copied: 1681 | Missing: 0
Copied manifest : D:\Courses\thesis\data\turning_keypoints_grouped\_manifest_copied.csv
Missing manifest: D:\Courses\thesis\data\turning_keypoints_grouped\_manifest_missing.csv

Top groups by count:
PD_or_C  turning_angle  type_of_turn
PD       90_degrees     pivot_turn      404
C        90_degrees     pivot_turn      392
         180_degrees    pivot_turn      234
PD       180_degrees    pivot_turn      209
         135_degrees    pivot_turn       97
         90_degrees     step_turn        90
         180_degrees    step_turn        76
C        135_degrees    pivot_turn       74
PD       135_degrees    step_turn        33
         90_degrees     -                18
C        90_degrees     step_turn        13
         180_degrees    step_turn        12
         90_degrees     -                10
PD       135_degrees    -                 4
         180_degrees    -                 4
C        225_degrees    pivot_turn        3
         135_degrees    step