In [3]:
import pyautogui
import time

# Pause for two seconds, then issue a right-click through pyautogui.
# NOTE(review): no coordinates are passed, so this presumably clicks wherever
# the pointer currently is — confirm before re-running this cell.
time.sleep(2.0)
pyautogui.rightClick()

In [4]:
# === CONFIG ==============================================================
from pathlib import Path
import tempfile
# Encoder & quality controls (consumed by build_encoder_flags / beta_captions_v3)
ENCODER      = "libx264"   # "libx264" (best quality); any other value selects h264_videotoolbox (fast on Mac)
CRF          = 16          # 16–18 = visually lossless; bigger = smaller files (passed as -crf)
X264_PRESET  = "slow"      # "slow" (quality), "medium", "fast" (libx264 only)
PROFILE      = "high"      # h264 profile
LEVEL        = "4.2"       # OK for 1080p30/60; adjust if needed (libx264 only)
COPY_AUDIO   = True        # True: stream-copy the audio; False: re-encode to AAC at 192k


# Where to save the rendered video (created on demand):
OUTPUT_DIR = Path.home() / "Downloads" / "reddit1_captioned"   # <— change me

# Filename pattern (placeholders: {stem}=sanitized input filename stem, {ts}=timestamp, {anim}=ANIM)
FILENAME_TEMPLATE = "exported_{ts}.mp4"


# The input video is not configured here; it is passed to beta_captions_v3() as an argument.
MODEL_NAME        = "small.en"       # tiny/base/small/medium/large-v3; *.en faster for English

# Caption look
FONT_SIZE         = 210              # Fontsize field of the ASS style
UPPERCASE         = True             # upper-case every caption before rendering

# Outline controls
BORDER_PX         = 12.0             # base outline thickness (px)
SHADOW_PX         = 2.0              # shadow strength
BLUR_PX           = 0.0              # small blur reduces shimmer; try 0.3–0.8 if edges flicker
STABILIZE_OUTLINE = True             # animate \bord together with the scale ramp (scaling animations only)

# Timing hyperparams (seconds)
MIN_CAPTION_SEC   = 0.30             # minimum time a caption is kept on screen
CUT_AHEAD_SEC     = 0.00             # end a caption this long before the next one starts
TAIL_HOLD_SEC     = 1.20             # longest a caption lingers after its last word

# Grouping hyperparams (control words/characters per caption)
MAX_WORDS_PER_CAP = 1                # start a new caption once this many words are collected
MAX_CHARS_PER_CAP = None             # None disables the character limit
MAX_GAP_SEC       = 1.20             # a silence longer than this always starts a new caption

# Animation
# ANIM choices: "none","fade","pop","zoom","bounce","slide_up","slide_down",
#               "slide_left","slide_right","rotate","inflate","inflate_soft"
ANIM              = "inflate"
ANIM_IN_MS        = 20000   # intro ramp length in ms; 20000 is a deliberately long ramp, try 200–400 for a subtle pop
ANIM_OUT_MS       = 50      # fade-out length in ms (only the "fade" animation uses it)

# Fonts
CUSTOM_FONT_DIR   = Path.home() / "Documents" / "mrbeast_caps" / "fonts"   # scanned for .ttf/.otf files

# Rendering safety
AUTO_PLAYRES      = True             # match ASS PlayRes to actual video resolution (False assumes 1920x1080)
YUV444_RENDER     = True             # render subs in 4:4:4 to stabilize edges, then downsample
# ========================================================================

def probe_color_metadata(video_path: Path):
    """
    Ask ffprobe for the colour tags of the first video stream.

    Returns a dict keyed by ffmpeg output options (``-color_primaries``,
    ``-color_trc``, ``-colorspace``, ``-color_range``) whose values are the
    strings ffprobe reported. Tags that are absent or empty are left out.
    Any failure (ffprobe missing, non-zero exit, unparsable output) gives ``{}``
    so the caller simply passes no colour flags.
    """
    import json, subprocess

    # (ffprobe stream field, ffmpeg option); tuple order fixes the result order.
    field_to_flag = (
        ("color_primaries", "-color_primaries"),
        ("color_transfer",  "-color_trc"),
        ("color_space",     "-colorspace"),
        ("color_range",     "-color_range"),   # "tv" or "pc"
    )
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "stream=color_primaries,color_transfer,color_space,color_range",
        "-of", "json",
        str(video_path),
    ]
    try:
        report = json.loads(subprocess.check_output(probe_cmd, text=True))
        # A missing or empty "streams" list degrades to one empty stream record.
        first_stream = (report.get("streams") or [{}])[0]
        return {
            flag: first_stream.get(field)
            for field, flag in field_to_flag
            if first_stream.get(field)
        }
    except Exception:
        # Best effort only: colour passthrough is optional.
        return {}

def build_encoder_flags():
    """
    Assemble the ffmpeg video-encoder arguments from the module-level config.

    ``ENCODER == "libx264"`` yields the software x264 settings (preset, profile,
    level, CRF and a fixed ``-x264-params`` string). Every other value yields
    the ``h264_videotoolbox`` hardware settings.

    Returns:
        list[str]: flat argument list ready to splice into an ffmpeg command.
    """
    quality = str(CRF)

    if ENCODER == "libx264":
        return [
            "-c:v", "libx264",
            "-preset", X264_PRESET,
            "-profile:v", PROFILE,
            "-level:v", LEVEL,
            "-pix_fmt", "yuv420p",
            "-crf", quality,
            # Reasonable x264 params for detail retention without going crazy:
            "-x264-params", "aq-mode=2:aq-strength=1.0:ref=5:bframes=5:me=umh:subme=7",
        ]

    # "videotoolbox" (fast) path.
    # NOTE(review): -crf is an x264-style option; confirm h264_videotoolbox
    # actually honours it rather than ignoring it.
    return [
        "-c:v", "h264_videotoolbox",
        "-profile:v", PROFILE,
        "-pix_fmt", "yuv420p",
        "-crf", quality,
        "-allow_sw", "1",
    ]



import os, subprocess, shutil, platform, re, sys
from datetime import timedelta
from faster_whisper import WhisperModel
import re
from datetime import datetime 

def timestamp(fmt: str = "%Y%m%d_%H%M%S") -> str:
    """Render the current local time with the strftime pattern ``fmt`` (used in filenames)."""
    moment = datetime.now()
    return moment.strftime(fmt)

def sanitize_stem(stem: str) -> str:
    """
    Turn ``stem`` into a filesystem-safe name.

    Each run of characters outside ``A-Z a-z 0-9 _ . -`` collapses into a single
    underscore, then leading and trailing underscores are trimmed.
    """
    collapsed = re.sub(r'[^A-Za-z0-9_.-]+', '_', stem)
    return collapsed.strip('_')

def ensure_dir(p: Path) -> Path:
    """Make sure directory ``p`` exists (creating missing parents) and hand the same path back."""
    p.mkdir(exist_ok=True, parents=True)
    return p


# ---------- probe video resolution ----------
def probe_resolution(video_path: Path) -> tuple[int, int]:
    """
    Read the width and height of the first video stream with ffprobe.

    Returns:
        ``(width, height)`` in pixels, or ``(1920, 1080)`` when ffprobe cannot
        be run or its output is not of the form ``<int>x<int>``.
    """
    fallback = (1920, 1080)
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "stream=width,height",
        "-of", "csv=s=x:p=0",
        str(video_path),
    ]
    try:
        dims = subprocess.check_output(probe_cmd, text=True).strip()
        # Unpacking fails (-> fallback) unless there are exactly two integer fields.
        width, height = (int(part) for part in dims.split("x"))
    except Exception:
        return fallback
    return width, height

# Optional caption-center override in pixels. Both stay None here, in which case
# beta_captions_v3 falls back to the midpoint of the probed frame.
# NOTE(review): in the code visible here the override only reaches the log line
# of beta_captions_v3; build_center_caption_events derives its own center from
# play_w/play_h.
CENTER_X, CENTER_Y = None, None

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    """
    Build a ``WhisperModel`` using the first compute type the backend accepts.

    The candidate order depends on the OS: macOS ("Darwin") tries float16, int8,
    float32; everything else tries int8_float16, int8, float16, float32.
    A ``ValueError`` from the constructor is logged and the next candidate is
    tried. If every candidate is rejected, the last ``ValueError`` is re-raised.
    """
    if _pf.system() == "Darwin":
        precision_order = ["float16", "int8", "float32"]
    else:
        precision_order = ["int8_float16", "int8", "float16", "float32"]

    failure = None
    for precision in precision_order:
        try:
            print(f"[info] trying compute_type={precision} …")
            return WhisperModel(model_name, compute_type=precision, device="auto")
        except ValueError as err:
            # Unsupported compute type on this device: remember why and move on.
            print(f"[skip] {err}")
            failure = err
    raise failure

# ---------- font: use any .ttf/.otf in CUSTOM_FONT_DIR ----------
def pick_custom_font(font_dir: Path):
    """
    Pick a font file from ``font_dir`` and work out its family name.

    The directory is created if missing. ``.ttf`` files are preferred over
    ``.otf`` files, and within each group the alphabetically first file is
    chosen, so the pick does not depend on directory listing order.

    The family name comes from the font's ``name`` table (ID 1, else ID 4) when
    the optional ``fontTools`` package is available and can parse the file.
    Otherwise, or when the table holds neither entry, the file stem is used,
    so the returned family is never ``None``.

    Returns:
        tuple[Path, str]: ``(font_file, family)``.

    Raises:
        SystemExit: if the directory contains no ``.ttf``/``.otf`` file
            (setup instructions are printed first).
    """
    font_dir.mkdir(parents=True, exist_ok=True)
    candidates = sorted(font_dir.glob("*.ttf")) + sorted(font_dir.glob("*.otf"))
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download any .ttf or .otf font.")
        print(f"2) Place it here: {font_dir}")
        print("3) Re-run.")
        raise SystemExit("[exit] No font found yet.")
    font_file = candidates[0]

    family = None
    try:
        from fontTools.ttLib import TTFont  # optional (pip install fonttools)
        tt = TTFont(font_file)
        try:
            names = {}
            for record in tt["name"].names:
                decoded = record.toUnicode()
                if decoded:
                    names[record.nameID] = decoded
            family = names.get(1) or names.get(4)
        finally:
            # Release the file handle fontTools keeps open.
            tt.close()
    except Exception:
        # fontTools missing or font unreadable: fall through to the stem.
        family = None
    if not family:
        family = font_file.stem

    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS header template (filled later with resolution) ----------
# str.format placeholders: {play_w}/{play_h} = script resolution, {font} = font
# family, {size} = font size, {border}/{shadow} = outline and shadow widths.
# It defines a single style named "Beast", which the Dialogue lines reference.
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: {play_w}
PlayResY: {play_h}
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,{border},{shadow},5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

# compute start/end scales & animated \bord when stabilizing
def _scale_spec_for_anim(name: str):
    name = (name or "none").lower()
    # start_scale, end_scale
    if name in ("inflate", "inflate_soft", "pop"):
        return 80, 100
    if name == "zoom":
        return 60, 100
    # slide/rotate/fade/none: no scale change
    return 100, 100

def _fmt_float(x: float) -> str:
    # compact float formatting for ASS tags
    return f"{x:.2f}".rstrip("0").rstrip(".")

def anim_tag(cx: int, cy: int, name: str, in_ms: int, out_ms: int,
             border_px: float, stabilize_outline: bool, blur_px: float) -> str:
    """
    Build the ASS override block (``{...}``) that positions and animates a caption.

    Args:
        cx, cy: anchor point; rounded to whole pixels to reduce subpixel shimmer.
        name: animation name, case-insensitive; ``None``/empty means "none".
            Unknown names render like "none".
        in_ms: length of the intro ramp (scale, move or rotate) in milliseconds.
        out_ms: fade-out length in milliseconds; only the "fade" animation uses it.
        border_px: outline width at 100% scale.
        stabilize_outline: when true and the animation changes scale, the
            outline starts at ``border_px * start_scale / 100`` and ramps to
            ``border_px * end_scale / 100`` over ``in_ms``, so it grows in
            proportion to the text.
        blur_px: optional edge blur; omitted when zero, negative or falsy.

    Returns:
        The override block including the surrounding braces.
    """
    x, y = int(round(cx)), int(round(cy))
    start_scale, end_scale = _scale_spec_for_anim(name)

    if stabilize_outline and (start_scale != end_scale):
        border_start = rf"\bord{_fmt_float(border_px * (start_scale / 100.0))}"
        border_ramp = rf"\t(0,{in_ms},\bord{_fmt_float(border_px * (end_scale / 100.0))})"
    else:
        border_start = rf"\bord{_fmt_float(border_px)}"
        border_ramp = ""

    blur = rf"\blur{_fmt_float(blur_px)}" if blur_px and blur_px > 0 else ""
    look = border_start + blur

    # Static placement, shared by every animation that does not move.
    anchored = rf"\an5\pos({x},{y})" + look

    def slide_from(from_x, from_y):
        """Move from (from_x, from_y) to the anchor over the intro ramp."""
        return rf"\an5\move({from_x},{from_y},{x},{y},0,{in_ms})" + look

    scale_to_full = rf"\t(0,{in_ms},\fscx100\fscy100)"
    grow_from_80 = anchored + r"\fscx80\fscy80" + scale_to_full + border_ramp

    bodies = {
        "none": anchored,
        "fade": anchored + rf"\fad({in_ms},{out_ms})",
        "pop": grow_from_80,
        "inflate": grow_from_80,
        "zoom": anchored + r"\fscx60\fscy60" + scale_to_full + border_ramp,
        "bounce": (slide_from(x, y - 40)
                   + rf"\fscx120\fscy120\t(0,120,\fscx95\fscy95)\t(120,{in_ms},\fscx100\fscy100)"
                   + border_ramp),
        "slide_up": slide_from(x, y + 60),
        "slide_down": slide_from(x, y - 60),
        "slide_left": slide_from(x - 140, y),
        "slide_right": slide_from(x + 140, y),
        "rotate": anchored + rf"\frz-12\t(0,{in_ms},\frz0)",
        "inflate_soft": (anchored
                         + r"\fscx80\fscy80\alpha&H20&\blur2"
                         + rf"\t(0,{in_ms},\fscx100\fscy100\alpha&H00&\blur0)"
                         + border_ramp),
    }
    kind = (name or "none").lower()
    return "{" + bodies.get(kind, anchored) + "}"

# ---------- Group words into captions ----------
def clean_token(s: str) -> str:
    """
    Reduce a raw transcript token to its bare word.

    Surrounding whitespace and any leading/trailing non-word characters
    (punctuation, quotes, ...) are removed; characters inside the word, such as
    the apostrophe in "don't", are kept. ``None`` or empty input gives ``""``.
    """
    trimmed = (s or "").strip()
    # One pass covers both ends; whitespace is itself a non-word character.
    return re.sub(r"^\W+|\W+$", "", trimmed)

def group_words_to_captions(words,
                            max_words=1,
                            max_chars=None,
                            max_gap_s=0.6):
    """
    Split a timed word stream into caption groups.

    Args:
        words: iterable of mappings with ``"start"``, ``"end"`` (seconds) and
            ``"text"`` keys. Tokens that are empty after ``clean_token`` are
            skipped entirely.
        max_words: maximum number of words per caption; ``None`` means no word
            limit (mirrors ``max_chars``).
        max_chars: maximum length of the space-joined caption text; ``None``
            disables the check.
        max_gap_s: a pause longer than this between the previous word's end and
            the next word's start always begins a new caption.

    Returns:
        list[list[dict]]: captions in order, each a non-empty list of
        ``{"start": float, "end": float, "text": str}`` records.
    """
    lines, cur = [], []
    last_end = None
    for w in words:
        token = clean_token(w["text"])
        if not token:
            continue
        start, end = float(w["start"]), float(w["end"])

        # A break is only meaningful once the current caption holds something.
        if cur:
            paused_too_long = (start - last_end) > max_gap_s
            word_limit_hit = max_words is not None and len(cur) >= max_words
            char_limit_hit = False
            if max_chars is not None and not (paused_too_long or word_limit_hit):
                joined = " ".join([x["text"] for x in cur] + [token]).strip()
                char_limit_hit = len(joined) > max_chars
            if paused_too_long or word_limit_hit or char_limit_hit:
                lines.append(cur)
                cur = []

        cur.append({"start": start, "end": end, "text": token})
        last_end = end

    if cur:
        lines.append(cur)
    return lines

def build_center_caption_events(lines,
                                play_w: int, play_h: int,
                                uppercase=True,
                                min_caption=0.30,
                                cut_ahead=0.00,
                                tail_hold=1.20,
                                anim="none",
                                in_ms=220,
                                out_ms=100,
                                border_px=12.0,
                                stabilize_outline=True,
                                blur_px=0.0):
    """
    Turn grouped captions into ASS ``Dialogue:`` lines centred on the frame.

    Timing per caption:
      * it starts at its first word and lasts at least ``min_caption`` seconds;
      * before another caption it may linger up to ``tail_hold`` seconds into
        the pause, but always ends ``cut_ahead`` seconds before the next
        caption starts;
      * the last caption lingers ``tail_hold`` seconds after its final word;
      * if that leaves a non-positive duration, the end is forced to start + 0.05 s.

    Args:
        lines: captions as produced by ``group_words_to_captions``.
        play_w, play_h: script resolution; the anchor is its midpoint.
        uppercase: upper-case the caption text.
        anim, in_ms, out_ms, border_px, stabilize_outline, blur_px: forwarded
            to ``anim_tag``.

    Returns:
        list[str]: one ``Dialogue`` line per caption, using the "Beast" style.
    """
    center_x = int(round(play_w / 2))
    center_y = int(round(play_h / 2))
    last_index = len(lines) - 1

    dialogue = []
    for idx, caption in enumerate(lines):
        begin = float(caption[0]["start"])
        spoken_end = float(caption[-1]["end"])
        earliest_end = begin + min_caption

        if idx < last_index:
            upcoming = float(lines[idx + 1][0]["start"])
            room = max(0.0, upcoming - spoken_end - cut_ahead)
            held_end = spoken_end + min(tail_hold, room)
            # Longest of the candidate ends, capped so the next caption is never overlapped.
            finish = min(max(spoken_end, earliest_end, held_end), upcoming - cut_ahead)
        else:
            finish = max(spoken_end + tail_hold, earliest_end)

        if finish <= begin:
            finish = begin + 0.05

        caption_text = " ".join([word["text"] for word in caption]).strip()
        if uppercase:
            caption_text = caption_text.upper()

        override = anim_tag(center_x, center_y, anim, in_ms, out_ms,
                            border_px, stabilize_outline, blur_px)
        dialogue.append(
            f"Dialogue: 0,{_fmt_time(begin)},{_fmt_time(finish)},Beast,,0,0,0,,{override}{caption_text}"
        )
    return dialogue
import subprocess, shutil, platform, tempfile, inspect


import subprocess, shutil, platform, tempfile, inspect

def _escape_filter_value(value: str) -> str:
    """
    Escape ``value`` for use as an option value inside an ffmpeg ``-vf`` filtergraph.

    Two escaping levels apply, as described in ffmpeg's filtergraph escaping notes:
    first the filter-option level (``\\``, ``:``, ``'``), then the filtergraph
    level (``\\``, ``'``, ``,``, ``;``, ``[``, ``]``). Values without any of
    these characters come back unchanged.
    """
    for special in ("\\", ":", "'"):            # level 1: option value
        value = value.replace(special, "\\" + special)
    for special in ("\\", "'", ",", ";", "[", "]"):   # level 2: filtergraph
        value = value.replace(special, "\\" + special)
    return value


def beta_captions_v3(INPUT_VIDEO) -> str:
    """
    Transcribe ``INPUT_VIDEO`` with Whisper and burn centred, animated captions into a copy.

    Steps: check cheap preconditions (video exists, ffmpeg on PATH, a font is
    available), probe the resolution, transcribe with word timestamps, group
    words into captions, write a temporary ``.ass`` script, and run ffmpeg
    with the ``ass`` filter. The result is rendered to ``*.tmp.mp4`` and only
    renamed to the final name after ffmpeg succeeds. All tuning comes from the
    module-level config constants.

    Args:
        INPUT_VIDEO: path (str or Path) of the source video; ``~`` is expanded.

    Returns:
        str: POSIX-style path of the rendered video inside ``OUTPUT_DIR``.

    Raises:
        AssertionError: the input video does not exist.
        SystemExit: ffmpeg is not on PATH, or no font file is available.
        RuntimeError: ffmpeg exited with a non-zero status (stderr is printed).
        FileNotFoundError: the final file is missing after a successful run.
    """
    print(f"[debug] using {inspect.currentframe().f_code.co_name}")

    video_path = Path(INPUT_VIDEO).expanduser().resolve()
    assert video_path.exists(), f"Video not found: {video_path}"
    print("[info] video:", video_path)

    # Fail fast: verify the cheap prerequisites before the slow model load
    # and transcription, so a missing tool or font does not waste a full run.
    if not shutil.which("ffmpeg"):
        raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")
    _, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)

    # Resolution & center
    PLAY_W, PLAY_H = probe_resolution(video_path) if AUTO_PLAYRES else (1920, 1080)
    # NOTE(review): cx/cy (and so CENTER_X/CENTER_Y) are only logged here; the
    # event builder below derives its own center from PLAY_W/PLAY_H.
    cx, cy = (PLAY_W // 2, PLAY_H // 2) if (CENTER_X is None or CENTER_Y is None) else (CENTER_X, CENTER_Y)
    print(f"[info] PlayRes set to: {PLAY_W}x{PLAY_H} | Center=({cx},{cy})")

    # Transcribe
    print("[info] loading Whisper model …")
    model = load_whisper_auto(MODEL_NAME)
    print("[info] transcribing (word timestamps) …")
    segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

    words = []
    for seg in segments:
        if seg.words:
            for w in seg.words:
                tok = (w.word or "").strip()
                if tok:
                    words.append({"start": float(w.start), "end": float(w.end), "text": tok})
    print(f"[info] words captured: {len(words)}")

    # Build ASS in-memory
    ASS_HEADER = ASS_HEADER_TMPL.format(
        play_w=PLAY_W, play_h=PLAY_H, font=FONT_NAME, size=FONT_SIZE,
        border=_fmt_float(BORDER_PX), shadow=_fmt_float(SHADOW_PX)
    )
    caption_lines = group_words_to_captions(words, MAX_WORDS_PER_CAP, MAX_CHARS_PER_CAP, MAX_GAP_SEC)
    ass_events = build_center_caption_events(
        caption_lines,
        play_w=PLAY_W, play_h=PLAY_H,
        uppercase=UPPERCASE,
        min_caption=MIN_CAPTION_SEC,
        cut_ahead=CUT_AHEAD_SEC,
        tail_hold=TAIL_HOLD_SEC,
        anim=ANIM,
        in_ms=ANIM_IN_MS,
        out_ms=ANIM_OUT_MS,
        border_px=BORDER_PX,
        stabilize_outline=STABILIZE_OUTLINE,
        blur_px=BLUR_PX
    )
    ass_text = ASS_HEADER + "\n".join(ass_events)

    # Output path (timestamped)
    out_dir  = ensure_dir(Path(OUTPUT_DIR))
    ts       = timestamp()
    out_name = FILENAME_TEMPLATE.format(stem=sanitize_stem(video_path.stem), ts=ts, anim=ANIM)
    out_video = (out_dir / out_name).resolve()
    out_tmp   = out_video.with_suffix(".tmp.mp4")

    print("[info] OUTPUT_DIR:", out_dir)
    print("[info] Will write FINAL:", out_video)

    # Colorspace passthrough to prevent gamma/contrast shifts
    color_flags = probe_color_metadata(video_path)
    color_list  = [kv for pair in color_flags.items() for kv in pair]
    if color_list:
        print("[info] Color metadata passthrough:", dict(zip(color_list[::2], color_list[1::2])))

    # ffmpeg command pieces
    # Paths are escaped so characters such as ':' or ',' cannot be mistaken
    # for filtergraph syntax.
    fontsdir_arg = f":fontsdir={_escape_filter_value(CUSTOM_FONT_DIR.as_posix())}"

    enc_flags   = build_encoder_flags()
    audio_flags = ["-c:a","copy"] if COPY_AUDIO else ["-c:a","aac","-b:a","192k"]

    tmp_ass = None
    try:
        # Write the temp .ass first so its real path can go into -vf.
        with tempfile.NamedTemporaryFile("w", suffix=".ass", delete=False, encoding="utf-8") as tmp:
            tmp.write(ass_text)
            tmp.flush()
            tmp_ass = Path(tmp.name)

        ass_path_arg = _escape_filter_value(tmp_ass.as_posix())
        if YUV444_RENDER:
            # Render the subtitles in 4:4:4, then downsample back to 4:2:0.
            vf_arg_final = f"format=yuv444p,ass={ass_path_arg}{fontsdir_arg},format=yuv420p"
        else:
            vf_arg_final = f"ass={ass_path_arg}{fontsdir_arg}"

        cmd = (["ffmpeg","-y","-i",str(video_path),
                "-vf", vf_arg_final]
               + enc_flags
               + audio_flags
               + color_list
               + ["-movflags","+faststart",  # web-friendly
                  str(out_tmp)])

        print("[info] ffmpeg cmd:\n ", " ".join(cmd))
        run = subprocess.run(cmd, check=False, text=True, capture_output=True)
        if run.returncode != 0:
            print(run.stderr)
            raise RuntimeError(f"ffmpeg failed (code {run.returncode})")

        # Only a successful render is promoted to the final name.
        out_tmp.replace(out_video)
        print("[done] saved:", out_video)

    finally:
        if tmp_ass and tmp_ass.exists():
            try:
                tmp_ass.unlink()
            except Exception as e:
                print(f"[warn] could not delete temp ASS: {e}")
        # Still present only when the render failed before the rename.
        if out_tmp.exists():
            try:
                out_tmp.unlink()
            except Exception as e:
                print(f"[warn] could not delete temp output: {e}")

    if not out_video.exists():
        raise FileNotFoundError(f"Expected output not found: {out_video}")

    return out_video.as_posix()


In [5]:
from pathlib import Path

# Run the captioning pipeline on one clip; the value displayed below the cell
# is the returned path of the rendered video.
beta_captions_v3(Path.home() / "Downloads" / "reddit1_filmora_clipstore" / "tester88888.mp4")

[debug] using beta_captions_v3
[info] video: /Users/marcus/Downloads/reddit1_filmora_clipstore/tester88888.mp4
[info] PlayRes set to: 1080x1920 | Center=(540,960)
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 22
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] OUTPUT_DIR: /Users/marcus/Downloads/reddit1_captioned
[info] Will write FINAL: /Users/marcus/Downloads/reddit1_captioned/exported_20250905_034133.mp4
[info] Color metadata passthrough: {'-color_primaries': 'bt709', '-color_trc': 'bt709', '-colorspace': 'bt709', '-color_range': 'tv'}
[info] ffmpeg cmd:
  ffmpeg -y -i /Users/marcus/Downloads/reddit1_filmora_clipstore/tester88888.mp4 -vf format=yuv444p,ass=/var/folders/n2/rstg0c3

'/Users/marcus/Downloads/reddit1_captioned/exported_20250905_034133.mp4'