In [None]:
from pathlib import Path
import subprocess
import shlex
import re
import time

# ---------------- Configuration ----------------
# Dataset root. main() treats every immediate subdirectory (other than the
# output folder below) as one mouse folder.
ROOT_DIR = Path(r"E:\trainingAggression_MR")
# Name of the folder, created directly under ROOT_DIR, that receives the
# compressed videos. It is excluded from the input scan.
OUTPUT_ROOT_NAME = "training sessions"
# Full path of the ffmpeg executable; used as the first element of the command.
FFMPEG = r"C:\Users\Admin\Desktop\ffmpeg-2025-09-28-git-0fdb5829e3-full_build\bin\ffmpeg.exe"
# Default value passed to ffmpeg's -preset option.
PRESET = "slow"
# Default for compress_with_ffmpeg(overwrite=...); when true, -y is passed so
# ffmpeg overwrites an existing output file.
OVERWRITE = True
# True = print the ffmpeg commands without executing them.
DRY_RUN = False
# ------------------------------------------------


def find_mouse_id(mouse_folder_name: str) -> str:
    """Extract the mouse ID from a mouse folder name.

    Patterns are tried from most to least specific:

    1. ``_<mouseNNN>_trainingSessions`` (case-insensitive), e.g.
       ``20250825_mouse975826_trainingSessions`` -> ``mouse975826``.
    2. Any ``mouse<digits>`` token anywhere in the name.
    3. The first run of five or more digits, prefixed with ``mouse``.

    Args:
        mouse_folder_name: Name (not full path) of the mouse folder.

    Returns:
        The mouse ID, or ``mouse_folder_name`` unchanged if nothing matched.
    """
    canonical = re.search(
        r"_(mouse\d+)_?trainingSessions", mouse_folder_name, flags=re.IGNORECASE
    )
    if canonical:
        return canonical.group(1)

    # An explicit "mouse<digits>" token must win over the bare-digit fallback:
    # otherwise a leading date such as "20250825" is mistaken for the ID.
    token = re.search(r"mouse\d+", mouse_folder_name, flags=re.IGNORECASE)
    if token:
        return token.group(0)

    digits = re.search(r"\d{5,}", mouse_folder_name)
    if digits:
        return f"mouse{digits.group(0)}"

    return mouse_folder_name


def detect_view(folder_name: str) -> str:
    """Classify a folder name as ``'topView'`` or ``'frontView'``.

    The match is a case-insensitive substring test: any name containing
    ``top`` is a top view; every other name (including names with neither
    ``top`` nor ``front``) is a front view.
    """
    lowered = folder_name.lower()
    return "topView" if "top" in lowered else "frontView"


def is_optogenetic(path: Path) -> bool:
    """Tell whether *path* belongs to an optogenetic recording.

    The whole path string (every folder component plus the file name) is
    lower-cased and searched for the substring ``optogen``.
    """
    lowered_path = str(path).lower()
    return "optogen" in lowered_path


def should_skip_input(src_path: Path, output_root: Path) -> bool:
    """Decide whether an input video must be ignored.

    An input is skipped when either:

    - its file name ends with ``_comp.avi`` (case-insensitive), i.e. it is
      already a compressed output, or
    - it lives inside the output tree rooted at ``output_root``.

    Args:
        src_path: Candidate input file.
        output_root: Root directory of the compressed outputs.

    Returns:
        True if the file should be skipped, False if it should be processed.
    """
    if src_path.name.lower().endswith("_comp.avi"):
        return True
    try:
        src_path.relative_to(output_root)
    except ValueError:
        # relative_to() raises ValueError when src_path is not located under
        # output_root; any other exception is a real error and propagates.
        return False
    return True


# Helper to decide whether the compressed output is already in place.
def output_already_exists(dst: Path) -> bool:
    """Return True if a usable output file is already present at *dst*.

    "Usable" means a regular file with a non-zero size. A zero-byte file
    (typically left behind by a failed or interrupted ffmpeg run) and a
    directory at that path are both reported as not existing, so the
    caller produces the output again instead of skipping it.

    Args:
        dst: Path of the compressed output file.

    Returns:
        True when *dst* is a non-empty regular file, False otherwise.
    """
    try:
        return dst.is_file() and dst.stat().st_size > 0
    except OSError:
        # The file vanished or became unreadable between the two calls.
        return False


def compress_with_ffmpeg(src: Path, dst: Path, crf: int, preset: str = PRESET, overwrite: bool = OVERWRITE):
    """Re-encode *src* to *dst* with libx264 through ffmpeg and report timing.

    The command is always printed. When the module-level ``DRY_RUN`` flag is
    true, nothing is executed.

    Args:
        src: Input video file.
        dst: Output video file.
        crf: Value passed to ffmpeg's ``-crf`` option.
        preset: Value passed to ffmpeg's ``-preset`` option.
        overwrite: If true pass ``-y`` (overwrite an existing output);
            otherwise pass ``-n`` (fail instead of overwriting).

    Returns:
        True if ffmpeg exited with status 0, False if it failed, and None
        when the call was a dry run and nothing was executed.
    """
    cmd = [
        FFMPEG,
        # Always state the overwrite policy explicitly: with neither -y nor
        # -n ffmpeg asks interactively when the output exists.
        "-y" if overwrite else "-n",
        "-i", str(src),
        "-c:v", "libx264",
        "-preset", preset,
        "-crf", str(crf),
        str(dst),
    ]

    # For display only; the process is started from the argument list.
    cmd_str = " ".join(shlex.quote(part) for part in cmd)
    print(f"\n[COMMAND] {cmd_str}")

    if DRY_RUN:
        print("[DRY RUN] Skipping execution.")
        return None

    # Remember whether the output pre-dates this run so that a failure never
    # deletes a file this call did not create.
    existed_before = dst.exists()

    started = time.perf_counter()
    result = subprocess.run(
        cmd,
        stdin=subprocess.DEVNULL,  # ffmpeg must never wait for keyboard input
        capture_output=True,
        text=True,
        errors="replace",  # undecodable bytes in ffmpeg's log must not raise
    )
    elapsed = time.perf_counter() - started

    if result.returncode != 0:
        print(f"[ERROR] ffmpeg failed for {src.name}")
        print(result.stderr)
        # Remove the partial output so a later run does not mistake it for a
        # finished file.
        if not existed_before and dst.exists():
            try:
                dst.unlink()
            except OSError as exc:
                print(f"[WARN] could not remove partial output {dst}: {exc}")
        return False

    print(f"[OK] {dst.name}  |  CRF={crf}  |  Time: {elapsed/60:.2f} min ({elapsed:.1f} s)")
    return True


def main():
    """Compress every raw ``.avi`` under ``ROOT_DIR`` into the output tree.

    Layout walked: ``ROOT_DIR/<mouse folder>/Day*/**/*.avi``. Each input is
    written to ``ROOT_DIR/OUTPUT_ROOT_NAME/<mouse folder>/<Day folder>/`` as
    ``<mouseID>_<Day folder>_<view>_comp.avi``. The view comes from the name
    of the input's parent folder; the CRF is 16 when the input path contains
    ``optogen`` and 22 otherwise.

    Inputs are skipped when they are already compressed, live inside the
    output tree, already have an output file, or map to the same output
    name as an earlier input of this run. A summary is printed at the end.
    In dry-run mode no directory is created.
    """
    output_root = ROOT_DIR / OUTPUT_ROOT_NAME
    if not DRY_RUN:
        output_root.mkdir(parents=True, exist_ok=True)

    total = skipped = processed = failed = 0
    # Output path -> first input of this run that maps to it. Used to detect
    # two different inputs that would produce the same output file.
    claimed = {}

    # Sorted everywhere so the processing order (and therefore which input
    # wins a name collision) does not depend on the filesystem.
    mouse_dirs = sorted(
        p for p in ROOT_DIR.iterdir() if p.is_dir() and p.name != OUTPUT_ROOT_NAME
    )
    for mouse_dir in mouse_dirs:
        mouse_id = find_mouse_id(mouse_dir.name)

        for day_dir in sorted(d for d in mouse_dir.glob("Day*") if d.is_dir()):
            for avi in sorted(day_dir.rglob("*.avi")):
                total += 1

                # 1) skip inputs that are already _comp or live in output tree
                if should_skip_input(avi, output_root):
                    skipped += 1
                    continue

                view = detect_view(avi.parent.name)
                crf = 16 if is_optogenetic(avi) else 22  # CRF decision

                # mirrored output dir and filename
                out_dir = output_root / mouse_dir.name / day_dir.name
                out_path = out_dir / f"{mouse_id}_{day_dir.name}_{view}_comp.avi"

                # 2) never let two inputs write to the same output file
                first_source = claimed.setdefault(out_path, avi)
                if first_source != avi:
                    print(
                        f"[WARN name collision] {avi} maps to the same output "
                        f"as {first_source}: {out_path}"
                    )
                    skipped += 1
                    continue

                # 3) if output file already exists, assume it's done and skip
                if output_already_exists(out_path):
                    print(f"[SKIP existing] {out_path}")
                    skipped += 1
                    continue

                if not DRY_RUN:
                    out_dir.mkdir(parents=True, exist_ok=True)

                # Only an explicit False marks a failure; any other return
                # value (including None) is counted as processed.
                outcome = compress_with_ffmpeg(avi, out_path, crf)
                if outcome is False:
                    failed += 1
                else:
                    processed += 1

    print("\n=== Summary ===")
    print(f"Total .avi files found : {total}")
    print(f"Skipped                 : {skipped}")
    print(f"Compressed              : {processed}")
    print(f"Failed                  : {failed}")


# Run the main process when executed as a script or notebook cell, but not
# when this file is imported as a module.
if __name__ == "__main__":
    main()


[SKIP existing] E:\trainingAggression_MR\training sessions\20250825_mouse975826_trainingSessions\Day02\mouse975826_Day02_frontView_comp.avi
[SKIP existing] E:\trainingAggression_MR\training sessions\20250825_mouse975826_trainingSessions\Day02\mouse975826_Day02_topView_comp.avi
[SKIP existing] E:\trainingAggression_MR\training sessions\20250825_mouse975826_trainingSessions\Day03\mouse975826_Day03_frontView_comp.avi
[SKIP existing] E:\trainingAggression_MR\training sessions\20250825_mouse975826_trainingSessions\Day03\mouse975826_Day03_topView_comp.avi
[SKIP existing] E:\trainingAggression_MR\training sessions\20250825_mouse975826_trainingSessions\Day04\mouse975826_Day04_frontView_comp.avi
[SKIP existing] E:\trainingAggression_MR\training sessions\20250825_mouse975826_trainingSessions\Day04\mouse975826_Day04_topView_comp.avi
[SKIP existing] E:\trainingAggression_MR\training sessions\20250825_mouse975826_trainingSessions\Day05\mouse975826_Day05_frontView_comp.avi
[SKIP existing] E:\trainin