In [3]:
import pyautogui
import time

time.sleep(2.0)
pyautogui.rightClick()

In [4]:
# === CONFIG ==============================================================
from pathlib import Path
import tempfile
# Encoder & quality controls
ENCODER      = "libx264"   # "libx264" (best quality) or "videotoolbox" (fast on Mac)
CRF          = 16          # 16–18 = visually lossless; bigger = smaller files
X264_PRESET  = "slow"      # "slow" (quality), "medium", "fast"
PROFILE      = "high"      # h264 profile
LEVEL        = "4.2"       # OK for 1080p30/60; adjust if needed
COPY_AUDIO   = True        # keep original audio


# Where to save the rendered video:
OUTPUT_DIR = Path.home() / "Downloads" / "reddit1_captioned"   # <— change me

# Filename pattern (placeholders: {stem}=input filename stem, {ts}=timestamp, {anim}=ANIM)
FILENAME_TEMPLATE = "exported_{ts}.mp4"


#INPUT_VIDEO       = Path.home() / "Downloads" / "reddit1_filmora_clipstore" / "tester88888.mp4"
MODEL_NAME        = "small.en"       # tiny/base/small/medium/large-v3; *.en faster for English

# Caption look
FONT_SIZE         = 210
UPPERCASE         = True

# Outline controls
BORDER_PX         = 12.0             # base outline thickness (px)
SHADOW_PX         = 2.0              # shadow strength
BLUR_PX           = 0.0              # small blur reduces shimmer; try 0.3–0.8 if edges flicker
STABILIZE_OUTLINE = True             # <<< turn this on to animate \bord with scale

# Timing hyperparams
MIN_CAPTION_SEC   = 0.30
CUT_AHEAD_SEC     = 0.00
TAIL_HOLD_SEC     = 1.20

# Grouping hyperparams (control words/characters per caption)
MAX_WORDS_PER_CAP = 1
MAX_CHARS_PER_CAP = None
MAX_GAP_SEC       = 1.20

# Animation
# ANIM choices: "none","fade","pop","zoom","bounce","slide_up","slide_down",
#               "slide_left","slide_right","rotate","inflate","inflate_soft"
ANIM              = "inflate"
ANIM_IN_MS        = 20000   # your original long ramp; try 200–400 for subtle pop
ANIM_OUT_MS       = 50

# Fonts
CUSTOM_FONT_DIR   = Path.home() / "Documents" / "mrbeast_caps" / "fonts"

# Rendering safety
AUTO_PLAYRES      = True             # match ASS PlayRes to actual video resolution
YUV444_RENDER     = True             # render subs in 4:4:4 to stabilize edges, then downsample
# ========================================================================

def probe_color_metadata(video_path: Path):
    """
    Return dict with color_primaries, color_trc, colorspace, color_range (tv/pc) if present.
    Prevents gamma/contrast shifts after burn-in.
    """
    import json, subprocess
    cmd = [
        "ffprobe","-v","error","-select_streams","v:0",
        "-show_entries","stream=color_primaries,color_transfer,color_space,color_range",
        "-of","json", str(video_path)
    ]
    try:
        data = json.loads(subprocess.check_output(cmd, text=True))
        st = (data.get("streams") or [{}])[0]
        prim = st.get("color_primaries")
        trc  = st.get("color_transfer")
        spc  = st.get("color_space")
        rng  = st.get("color_range")  # "tv" or "pc"
        # Map ffprobe keys to ffmpeg options
        out = {}
        if prim: out["-color_primaries"] = prim
        if trc:  out["-color_trc"]       = trc
        if spc:  out["-colorspace"]      = spc
        if rng:  out["-color_range"]     = rng
        return out
    except Exception:
        return {}

def build_encoder_flags():
    """
    Choose encoder args for best visual fidelity.
    """
    flags = []
    if ENCODER == "libx264":
        flags += ["-c:v","libx264",
                  "-preset", X264_PRESET,
                  "-profile:v", PROFILE,
                  "-level:v", LEVEL,
                  "-pix_fmt","yuv420p",
                  "-crf", str(CRF)]
        # Reasonable x264 params for detail retention without going crazy:
        flags += ["-x264-params", "aq-mode=2:aq-strength=1.0:ref=5:bframes=5:me=umh:subme=7"]
    else:  # "videotoolbox" (fast; OK quality if CRF ~18)
        flags += ["-c:v","h264_videotoolbox",
                  "-profile:v", PROFILE,
                  "-pix_fmt","yuv420p",
                  "-crf", str(CRF),
                  "-allow_sw","1"]
    return flags



import os, subprocess, shutil, platform, re, sys
from datetime import timedelta
from faster_whisper import WhisperModel
import re
from datetime import datetime 

def timestamp(fmt: str = "%Y%m%d_%H%M%S") -> str:
    """Current local time formatted for filenames."""
    return datetime.now().strftime(fmt)

def sanitize_stem(stem: str) -> str:
    """Make a filesystem-safe stem."""
    return re.sub(r'[^A-Za-z0-9_.-]+', '_', stem).strip('_')

def ensure_dir(p: Path) -> Path:
    """Create directory if needed and return it."""
    p.mkdir(parents=True, exist_ok=True)
    return p


# ---------- probe video resolution ----------
def probe_resolution(video_path: Path) -> tuple[int, int]:
    cmd = [
        "ffprobe","-v","error","-select_streams","v:0",
        "-show_entries","stream=width,height","-of","csv=s=x:p=0", str(video_path)
    ]
    try:
        out = subprocess.check_output(cmd, text=True).strip()
        w, h = map(int, out.split("x"))
        return w, h
    except Exception:
        # fallback
        return 1920, 1080

# centers derived later once we know resolution
CENTER_X, CENTER_Y = None, None

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    osname = _pf.system()
    candidates = (["float16", "int8", "float32"] if osname == "Darwin"
                  else ["int8_float16", "int8", "float16", "float32"])
    last = None
    for ct in candidates:
        try:
            print(f"[info] trying compute_type={ct} …")
            return WhisperModel(model_name, compute_type=ct, device="auto")
        except ValueError as e:
            print(f"[skip] {e}")
            last = e
    raise last

# ---------- font: use any .ttf/.otf in CUSTOM_FONT_DIR ----------
def pick_custom_font(font_dir: Path):
    font_dir.mkdir(parents=True, exist_ok=True)
    candidates = list(font_dir.glob("*.ttf")) + list(font_dir.glob("*.otf"))
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download any .ttf or .otf font.")
        print(f"2) Place it here: {font_dir}")
        print("3) Re-run.")
        raise SystemExit("[exit] No font found yet.")
    font_file = candidates[0]
    family = None
    try:
        from fontTools.ttLib import TTFont  # optional (pip install fonttools)
        tt = TTFont(font_file)
        names = {n.nameID: n.toUnicode() for n in tt["name"].names if n.toUnicode()}
        family = names.get(1) or names.get(4)
    except Exception:
        family = font_file.stem
    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS header template (filled later with resolution) ----------
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: {play_w}
PlayResY: {play_h}
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,{border},{shadow},5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

# compute start/end scales & animated \bord when stabilizing
def _scale_spec_for_anim(name: str):
    name = (name or "none").lower()
    # start_scale, end_scale
    if name in ("inflate", "inflate_soft", "pop"):
        return 80, 100
    if name == "zoom":
        return 60, 100
    # slide/rotate/fade/none: no scale change
    return 100, 100

def _fmt_float(x: float) -> str:
    # compact float formatting for ASS tags
    return f"{x:.2f}".rstrip("0").rstrip(".")

def anim_tag(cx: int, cy: int, name: str, in_ms: int, out_ms: int,
             border_px: float, stabilize_outline: bool, blur_px: float) -> str:
    # snap to pixel to reduce subpixel shimmer
    cx, cy = int(round(cx)), int(round(cy))
    s0, s1 = _scale_spec_for_anim(name)
    # outline start/end if stabilizing (inverse proportional to scale)
    if stabilize_outline and (s0 != s1):
        bord0 = border_px * (s0 / 100.0)
        bord1 = border_px * (s1 / 100.0)
        bord_tag0 = rf"\bord{_fmt_float(bord0)}"
        bord_anim = rf"\t(0,{in_ms},\bord{_fmt_float(bord1)})"
    else:
        bord0 = border_px
        bord_tag0 = rf"\bord{_fmt_float(bord0)}"
        bord_anim = ""

    blur_tag = (rf"\blur{_fmt_float(blur_px)}" if blur_px and blur_px > 0 else "")

    name = (name or "none").lower()
    if name == "none":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}}}"
    if name == "fade":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fad({in_ms},{out_ms})}}"
    if name == "pop":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx80\fscy80\t(0,{in_ms},\fscx100\fscy100){bord_anim}}}"
    if name == "zoom":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx60\fscy60\t(0,{in_ms},\fscx100\fscy100){bord_anim}}}"
    if name == "bounce":
        return (rf"{{\an5\move({cx},{cy-40},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}"
                rf"\fscx120\fscy120\t(0,120,\fscx95\fscy95)\t(120,{in_ms},\fscx100\fscy100){bord_anim}}}")
    if name == "slide_up":
        return rf"{{\an5\move({cx},{cy+60},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "slide_down":
        return rf"{{\an5\move({cx},{cy-60},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "slide_left":
        return rf"{{\an5\move({cx-140},{cy},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "slide_right":
        return rf"{{\an5\move({cx+140},{cy},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "rotate":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\frz-12\t(0,{in_ms},\frz0)}}"
    if name == "inflate":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx80\fscy80\t(0,{in_ms},\fscx100\fscy100){bord_anim}}}"
    if name == "inflate_soft":
        return (rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx80\fscy80\alpha&H20&\blur2"
                rf"\t(0,{in_ms},\fscx100\fscy100\alpha&H00&\blur0){bord_anim}}}")
    return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}}}"

# ---------- Group words into captions ----------
def clean_token(s: str) -> str:
    s = (s or "").strip()
    s = re.sub(r"^\s+|\s+$", "", s)
    return re.sub(r"^\W+|\W+$", "", s)

def group_words_to_captions(words,
                            max_words=1,
                            max_chars=None,
                            max_gap_s=0.6):
    lines, cur = [], []
    last_end = None
    for w in words:
        token = clean_token(w["text"])
        if not token:
            continue
        gap = (w["start"] - last_end) if last_end is not None else 0.0

        join_len = len((" ".join([x["text"] for x in cur] + [token])).strip())
        need_new = False
        if last_end is not None and gap > max_gap_s:
            need_new = True
        if cur and len(cur) >= max_words:
            need_new = True
        if (not need_new) and (max_chars is not None) and (join_len > max_chars):
            need_new = True

        if need_new and cur:
            lines.append(cur)
            cur = []

        cur.append({"start": float(w["start"]), "end": float(w["end"]), "text": token})
        last_end = float(w["end"])

    if cur:
        lines.append(cur)
    return lines

def build_center_caption_events(lines,
                                play_w: int, play_h: int,
                                uppercase=True,
                                min_caption=0.30,
                                cut_ahead=0.00,
                                tail_hold=1.20,
                                anim="none",
                                in_ms=220,
                                out_ms=100,
                                border_px=12.0,
                                stabilize_outline=True,
                                blur_px=0.0):
    cx = int(round(play_w / 2))
    cy = int(round(play_h / 2))
    events = []
    n = len(lines)
    for i, ln in enumerate(lines):
        t0 = float(ln[0]["start"])
        natural_end = float(ln[-1]["end"])
        t1 = max(natural_end, t0 + min_caption)

        if i + 1 < n:
            next_start = float(lines[i+1][0]["start"])
            gap_after = max(0.0, next_start - natural_end - cut_ahead)
            t1 = min(max(t1, natural_end + min(tail_hold, gap_after)), next_start - cut_ahead)
        else:
            t1 = max(natural_end + tail_hold, t0 + min_caption)

        if t1 <= t0:
            t1 = t0 + 0.05

        text = " ".join([w["text"] for w in ln]).strip()
        if uppercase:
            text = text.upper()

        ov = anim_tag(cx, cy, anim, in_ms, out_ms, border_px, stabilize_outline, blur_px)
        events.append(f"Dialogue: 0,{_fmt_time(t0)},{_fmt_time(t1)},Beast,,0,0,0,,{ov}{text}")
    return events
import subprocess, shutil, platform, tempfile, inspect


import subprocess, shutil, platform, tempfile, inspect

def beta_captions_v3(INPUT_VIDEO) -> str:
    print(f"[debug] using {inspect.currentframe().f_code.co_name}")

    video_path = Path(INPUT_VIDEO).expanduser().resolve()
    assert video_path.exists(), f"Video not found: {video_path}"
    print("[info] video:", video_path)

    # Resolution & center
    PLAY_W, PLAY_H = probe_resolution(video_path) if AUTO_PLAYRES else (1920, 1080)
    cx, cy = (PLAY_W // 2, PLAY_H // 2) if (CENTER_X is None or CENTER_Y is None) else (CENTER_X, CENTER_Y)
    print(f"[info] PlayRes set to: {PLAY_W}x{PLAY_H} | Center=({cx},{cy})")

    # Transcribe
    print("[info] loading Whisper model …")
    model = load_whisper_auto(MODEL_NAME)
    print("[info] transcribing (word timestamps) …")
    segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

    words = []
    for seg in segments:
        if seg.words:
            for w in seg.words:
                tok = (w.word or "").strip()
                if tok:
                    words.append({"start": float(w.start), "end": float(w.end), "text": tok})
    print(f"[info] words captured: {len(words)}")

    # Build ASS in-memory
    _, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)
    ASS_HEADER = ASS_HEADER_TMPL.format(
        play_w=PLAY_W, play_h=PLAY_H, font=FONT_NAME, size=FONT_SIZE,
        border=_fmt_float(BORDER_PX), shadow=_fmt_float(SHADOW_PX)
    )
    caption_lines = group_words_to_captions(words, MAX_WORDS_PER_CAP, MAX_CHARS_PER_CAP, MAX_GAP_SEC)
    ass_events = build_center_caption_events(
        caption_lines,
        play_w=PLAY_W, play_h=PLAY_H,
        uppercase=UPPERCASE,
        min_caption=MIN_CAPTION_SEC,
        cut_ahead=CUT_AHEAD_SEC,
        tail_hold=TAIL_HOLD_SEC,
        anim=ANIM,
        in_ms=ANIM_IN_MS,
        out_ms=ANIM_OUT_MS,
        border_px=BORDER_PX,
        stabilize_outline=STABILIZE_OUTLINE,
        blur_px=BLUR_PX
    )
    ass_text = ASS_HEADER + "\n".join(ass_events)

    # Output path (timestamped)
    out_dir  = ensure_dir(Path(OUTPUT_DIR))
    ts       = timestamp()
    out_name = FILENAME_TEMPLATE.format(stem=sanitize_stem(video_path.stem), ts=ts, anim=ANIM)
    out_video = (out_dir / out_name).resolve()
    out_tmp   = out_video.with_suffix(".tmp.mp4")

    print("[info] OUTPUT_DIR:", out_dir)
    print("[info] Will write FINAL:", out_video)

    # Colorspace passthrough to prevent gamma/contrast shifts
    color_flags = probe_color_metadata(video_path)
    color_list  = [kv for pair in color_flags.items() for kv in pair]
    if color_list:
        print("[info] Color metadata passthrough:", dict(zip(color_list[::2], color_list[1::2])))

    # ffmpeg command
    if not shutil.which("ffmpeg"):
        raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")
    fontsdir_arg = f":fontsdir={CUSTOM_FONT_DIR.as_posix()}"

    enc_flags   = build_encoder_flags()
    audio_flags = ["-c:a","copy"] if COPY_AUDIO else ["-c:a","aac","-b:a","192k"]

    tmp_ass = None
    try:
        # write temp .ass first so we can insert the real path into -vf
        with tempfile.NamedTemporaryFile("w", suffix=".ass", delete=False, encoding="utf-8") as tmp:
            tmp.write(ass_text)
            tmp.flush()
            tmp_ass = Path(tmp.name)

        # ✅ build the filtergraph *now* with the actual ass path — no f-string placeholder
        if YUV444_RENDER:
            vf_arg_final = f"format=yuv444p,ass={tmp_ass.as_posix()}{fontsdir_arg},format=yuv420p"
        else:
            vf_arg_final = f"ass={tmp_ass.as_posix()}{fontsdir_arg}"

        cmd = (["ffmpeg","-y","-i",str(video_path),
                "-vf", vf_arg_final]
               + enc_flags
               + audio_flags
               + color_list
               + ["-movflags","+faststart",  # web-friendly
                  str(out_tmp)])

        print("[info] ffmpeg cmd:\n ", " ".join(cmd))
        run = subprocess.run(cmd, check=False, text=True, capture_output=True)
        if run.returncode != 0:
            print(run.stderr)
            raise RuntimeError(f"ffmpeg failed (code {run.returncode})")

        out_tmp.replace(out_video)
        print("[done] saved:", out_video)

    finally:
        if tmp_ass and tmp_ass.exists():
            try: tmp_ass.unlink()
            except Exception as e: print(f"[warn] could not delete temp ASS: {e}")
        if out_tmp.exists():
            try: out_tmp.unlink()
            except Exception: pass

    if not out_video.exists():
        raise FileNotFoundError(f"Expected output not found: {out_video}")

    return out_video.as_posix()


In [5]:
from pathlib import Path

beta_captions_v3(Path.home() / "Downloads" / "reddit1_filmora_clipstore" / "tester88888.mp4")

[debug] using beta_captions_v3
[info] video: /Users/marcus/Downloads/reddit1_filmora_clipstore/tester88888.mp4
[info] PlayRes set to: 1080x1920 | Center=(540,960)
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 22
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] OUTPUT_DIR: /Users/marcus/Downloads/reddit1_captioned
[info] Will write FINAL: /Users/marcus/Downloads/reddit1_captioned/exported_20250905_034133.mp4
[info] Color metadata passthrough: {'-color_primaries': 'bt709', '-color_trc': 'bt709', '-colorspace': 'bt709', '-color_range': 'tv'}
[info] ffmpeg cmd:
  ffmpeg -y -i /Users/marcus/Downloads/reddit1_filmora_clipstore/tester88888.mp4 -vf format=yuv444p,ass=/var/folders/n2/rstg0c3

'/Users/marcus/Downloads/reddit1_captioned/exported_20250905_034133.mp4'

In [6]:
# === CONFIG ==============================================================

from pathlib import Path
import tempfile

# Where to save the rendered video:
OUTPUT_DIR = Path.home() / "Downloads" / "reddit1_filmora_captioned"

# Filename pattern (placeholders: {stem}=input filename stem, {ts}=timestamp, {anim}=ANIM)
FILENAME_TEMPLATE = "exported_{ts}.mp4"

MODEL_NAME        = "small.en"       # tiny/base/small/medium/large-v3; *.en faster for English

# Caption look
FONT_SIZE         = 210
UPPERCASE         = True

# Outline controls
BORDER_PX         = 12.0
SHADOW_PX         = 2.0
BLUR_PX           = 0.0
STABILIZE_OUTLINE = True

# Timing hyperparams
MIN_CAPTION_SEC   = 0.30
CUT_AHEAD_SEC     = 0.00
TAIL_HOLD_SEC     = 1.20

# Grouping hyperparams
MAX_WORDS_PER_CAP = 1
MAX_CHARS_PER_CAP = None
MAX_GAP_SEC       = 1.20

# Animation
ANIM              = "inflate"
ANIM_IN_MS        = 20000
ANIM_OUT_MS       = 50

# Fonts
CUSTOM_FONT_DIR   = Path.home() / "Documents" / "mrbeast_caps" / "fonts"

# Rendering safety
AUTO_PLAYRES      = True             # match ASS PlayRes to actual video resolution
YUV444_RENDER     = True             # render subs in 4:4:4 to stabilize edges, then downsample

# --- NEW: Reddit intro card overlay --------------------------------------
# You can set this to a FILE or a FOLDER. If it's a folder, the first image is used.
# Example folder you mentioned:
INTRO_CARD_SRC     = Path("/Users/marcus/Downloads/Thumb_shorts_white")   # or None to disable
INTRO_ENABLED      = True            # quick master toggle
INTRO_SECS         = 3.0             # how long the card sits on top
INTRO_FADE         = 0.30            # fade-out duration (must be <= INTRO_SECS)
INTRO_SCALE        = 0.92            # fraction of main video width
INTRO_CROP_BOTTOM  = 0.12            # crop bottom % of card (0.0..1.0); 0.12 = remove 12% from bottom
INTRO_OFFSET_X     = 0               # px offset from centered X (positive => right)
INTRO_OFFSET_Y     = 0               # px offset from centered Y (positive => down)
# ========================================================================

import os, subprocess, shutil, platform, re, sys
from datetime import timedelta
from faster_whisper import WhisperModel
import re
from datetime import datetime

def timestamp(fmt: str = "%Y%m%d_%H%M%S") -> str:
    return datetime.now().strftime(fmt)

def sanitize_stem(stem: str) -> str:
    return re.sub(r'[^A-Za-z0-9_.-]+', '_', stem).strip('_')

def ensure_dir(p: Path) -> Path:
    p.mkdir(parents=True, exist_ok=True)
    return p

# ---------- probe video resolution ----------
def probe_resolution(video_path: Path) -> tuple[int, int]:
    cmd = [
        "ffprobe","-v","error","-select_streams","v:0",
        "-show_entries","stream=width,height","-of","csv=s=x:p=0", str(video_path)
    ]
    try:
        out = subprocess.check_output(cmd, text=True).strip()
        w, h = map(int, out.split("x"))
        return w, h
    except Exception:
        return 1920, 1080

CENTER_X, CENTER_Y = None, None

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    osname = _pf.system()
    candidates = (["float16", "int8", "float32"] if osname == "Darwin"
                  else ["int8_float16", "int8", "float16", "float32"])
    last = None
    for ct in candidates:
        try:
            print(f"[info] trying compute_type={ct} …")
            return WhisperModel(model_name, compute_type=ct, device="auto")
        except ValueError as e:
            print(f"[skip] {e}")
            last = e
    raise last

# ---------- font: use any .ttf/.otf in CUSTOM_FONT_DIR ----------
def pick_custom_font(font_dir: Path):
    font_dir.mkdir(parents=True, exist_ok=True)
    candidates = list(font_dir.glob("*.ttf")) + list(font_dir.glob("*.otf"))
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download any .ttf or .otf font.")
        print(f"2) Place it here: {font_dir}")
        print("3) Re-run.")
        raise SystemExit("[exit] No font found yet.")
    font_file = candidates[0]
    family = None
    try:
        from fontTools.ttLib import TTFont  # optional (pip install fonttools)
        tt = TTFont(font_file)
        names = {n.nameID: n.toUnicode() for n in tt["name"].names if n.toUnicode()}
        family = names.get(1) or names.get(4)
    except Exception:
        family = font_file.stem
    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS header ----------
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: {play_w}
PlayResY: {play_h}
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,{border},{shadow},5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def _scale_spec_for_anim(name: str):
    name = (name or "none").lower()
    if name in ("inflate", "inflate_soft", "pop"):
        return 80, 100
    if name == "zoom":
        return 60, 100
    return 100, 100

def _fmt_float(x: float) -> str:
    return f"{x:.2f}".rstrip("0").rstrip(".")

def anim_tag(cx: int, cy: int, name: str, in_ms: int, out_ms: int,
             border_px: float, stabilize_outline: bool, blur_px: float) -> str:
    cx, cy = int(round(cx)), int(round(cy))
    s0, s1 = _scale_spec_for_anim(name)
    if stabilize_outline and (s0 != s1):
        bord0 = border_px * (s0 / 100.0)
        bord1 = border_px * (s1 / 100.0)
        bord_tag0 = rf"\bord{_fmt_float(bord0)}"
        bord_anim = rf"\t(0,{in_ms},\bord{_fmt_float(bord1)})"
    else:
        bord_tag0 = rf"\bord{_fmt_float(border_px)}"
        bord_anim = ""
    blur_tag = (rf"\blur{_fmt_float(blur_px)}" if blur_px and blur_px > 0 else "")
    name = (name or "none").lower()
    if name == "none":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}}}"
    if name == "fade":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fad({in_ms},{out_ms})}}"
    if name == "pop":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx80\fscy80\t(0,{in_ms},\fscx100\fscy100){bord_anim}}}"
    if name == "zoom":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx60\fscy60\t(0,{in_ms},\fscx100\fscy100){bord_anim}}}"
    if name == "bounce":
        return (rf"{{\an5\move({cx},{cy-40},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}"
                rf"\fscx120\fscy120\t(0,120,\fscx95\fscy95)\t(120,{in_ms},\fscx100\fscy100){bord_anim}}}")
    if name == "slide_up":
        return rf"{{\an5\move({cx},{cy+60},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "slide_down":
        return rf"{{\an5\move({cx},{cy-60},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "slide_left":
        return rf"{{\an5\move({cx-140},{cy},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "slide_right":
        return rf"{{\an5\move({cx+140},{cy},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "rotate":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\frz-12\t(0,{in_ms},\frz0)}}"
    if name == "inflate":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx80\fscy80\t(0,{in_ms},\fscx100\fscy100){bord_anim}}}"
    if name == "inflate_soft":
        return (rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx80\fscy80\alpha&H20&\blur2"
                rf"\t(0,{in_ms},\fscx100\fscy100\alpha&H00&\blur0){bord_anim}}}")
    return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}}}"

# ---------- Group words into captions ----------
def clean_token(s: str) -> str:
    s = (s or "").strip()
    s = re.sub(r"^\s+|\s+$", "", s)
    return re.sub(r"^\W+|\W+$", "", s)

def group_words_to_captions(words,
                            max_words=1,
                            max_chars=None,
                            max_gap_s=0.6):
    lines, cur = [], []
    last_end = None
    for w in words:
        token = clean_token(w["text"])
        if not token:
            continue
        gap = (w["start"] - last_end) if last_end is not None else 0.0

        join_len = len((" ".join([x["text"] for x in cur] + [token])).strip())
        need_new = False
        if last_end is not None and gap > max_gap_s:
            need_new = True
        if cur and len(cur) >= max_words:
            need_new = True
        if (not need_new) and (max_chars is not None) and (join_len > max_chars):
            need_new = True

        if need_new and cur:
            lines.append(cur)
            cur = []

        cur.append({"start": float(w["start"]), "end": float(w["end"]), "text": token})
        last_end = float(w["end"])

    if cur:
        lines.append(cur)
    return lines

def build_center_caption_events(lines,
                                play_w: int, play_h: int,
                                uppercase=True,
                                min_caption=0.30,
                                cut_ahead=0.00,
                                tail_hold=1.20,
                                anim="none",
                                in_ms=220,
                                out_ms=100,
                                border_px=12.0,
                                stabilize_outline=True,
                                blur_px=0.0):
    cx = int(round(play_w / 2))
    cy = int(round(play_h / 2))
    events = []
    n = len(lines)
    for i, ln in enumerate(lines):
        t0 = float(ln[0]["start"])
        natural_end = float(ln[-1]["end"])
        t1 = max(natural_end, t0 + min_caption)

        if i + 1 < n:
            next_start = float(lines[i+1][0]["start"])
            gap_after = max(0.0, next_start - natural_end - cut_ahead)
            t1 = min(max(t1, natural_end + min(tail_hold, gap_after)), next_start - cut_ahead)
        else:
            t1 = max(natural_end + tail_hold, t0 + min_caption)

        if t1 <= t0:
            t1 = t0 + 0.05

        text = " ".join([w["text"] for w in ln]).strip()
        if uppercase:
            text = text.upper()

        ov = anim_tag(cx, cy, anim, in_ms, out_ms, border_px, stabilize_outline, blur_px)
        events.append(f"Dialogue: 0,{_fmt_time(t0)},{_fmt_time(t1)},Beast,,0,0,0,,{ov}{text}")
    return events

# ---------- NEW: resolve an intro image from a file OR folder -------------
def resolve_intro_image(src: Path | None) -> Path | None:
    if not src:
        return None
    src = Path(src).expanduser()
    if not src.exists():
        return None
    if src.is_file():
        return src
    # folder: pick first image
    for ext in ("*.png","*.jpg","*.jpeg","*.webp"):
        files = sorted(src.glob(ext))
        if files:
            return files[0]
    return None

# ---------- MAIN: captions + optional reddit intro overlay ----------------
def beta_captions(INPUT_VIDEO: str | Path,
                  intro_card_src: Path | None = INTRO_CARD_SRC,
                  intro_enabled: bool = INTRO_ENABLED,
                  intro_secs: float = INTRO_SECS,
                  intro_fade: float = INTRO_FADE,
                  intro_scale: float = INTRO_SCALE,
                  intro_crop_bottom: float = INTRO_CROP_BOTTOM,
                  intro_offset_x: int = INTRO_OFFSET_X,
                  intro_offset_y: int = INTRO_OFFSET_Y) -> str:

    # ---------- 1) Transcribe ----------
    video_path = Path(INPUT_VIDEO).expanduser().resolve()
    assert video_path.exists(), f"Video not found: {video_path}"
    print("[info] video:", video_path)

    PLAY_W, PLAY_H = probe_resolution(video_path) if AUTO_PLAYRES else (1920, 1080)
    if CENTER_X is None or CENTER_Y is None:
        cx, cy = PLAY_W // 2, PLAY_H // 2
    else:
        cx, cy = CENTER_X, CENTER_Y
    print(f"[info] PlayRes set to: {PLAY_W}x{PLAY_H} | Center=({cx},{cy})")

    print("[info] loading Whisper model …")
    model = load_whisper_auto(MODEL_NAME)

    print("[info] transcribing (word timestamps) …")
    segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

    words = []
    for seg in segments:
        if seg.words:
            for w in seg.words:
                tok = (w.word or "").strip()
                if tok:
                    words.append({"start": float(w.start), "end": float(w.end), "text": tok})

    print(f"[info] words captured: {len(words)}")

    # ---------- 2) Build ASS ----------
    font_file, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)
    ASS_HEADER = ASS_HEADER_TMPL.format(
        play_w=PLAY_W, play_h=PLAY_H, font=FONT_NAME, size=FONT_SIZE,
        border=_fmt_float(BORDER_PX), shadow=_fmt_float(SHADOW_PX)
    )

    caption_lines = group_words_to_captions(
        words,
        max_words=MAX_WORDS_PER_CAP,
        max_chars=MAX_CHARS_PER_CAP,
        max_gap_s=MAX_GAP_SEC
    )
    print(f"[info] caption groups built: {len(caption_lines)} "
          f"(max_words={MAX_WORDS_PER_CAP}, max_chars={MAX_CHARS_PER_CAP}, max_gap_s={MAX_GAP_SEC})")

    ass_events = build_center_caption_events(
        caption_lines,
        play_w=PLAY_W, play_h=PLAY_H,
        uppercase=UPPERCASE,
        min_caption=MIN_CAPTION_SEC,
        cut_ahead=CUT_AHEAD_SEC,
        tail_hold=TAIL_HOLD_SEC,
        anim=ANIM,
        in_ms=ANIM_IN_MS,
        out_ms=ANIM_OUT_MS,
        border_px=BORDER_PX,
        stabilize_outline=STABILIZE_OUTLINE,
        blur_px=BLUR_PX
    )
    ass_text = ASS_HEADER + "\n".join(ass_events)

    # ---------- 3) Decide output path ----------
    out_dir = ensure_dir(Path(OUTPUT_DIR))
    safe_stem = sanitize_stem(video_path.stem)
    ts = timestamp()
    out_name = FILENAME_TEMPLATE.format(stem=safe_stem, ts=ts, anim=ANIM)
    out_video = (out_dir / out_name).resolve()
    out_tmp = out_video.with_suffix(".tmp.mp4")

    print("[info] OUTPUT_DIR:", out_dir)
    print("[info] Output file:", out_video)

    # ---------- 4) FFmpeg (write temp .ass and burn) ----------
    if not shutil.which("ffmpeg"):
        raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")

    fontsdir_arg = f":fontsdir={CUSTOM_FONT_DIR.as_posix()}"

    tmp_path = None
    try:
        # write ASS to temp file
        with tempfile.NamedTemporaryFile("w", suffix=".ass", delete=False, encoding="utf-8") as tmp:
            tmp.write(ass_text)
            tmp.flush()
            tmp_path = Path(tmp.name)

        vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"

        # Try to resolve an intro image (file or from folder)
        intro_img = resolve_intro_image(intro_card_src) if intro_enabled else None
        if intro_img:
            print(f"[info] intro card: {intro_img}")

        if intro_img and intro_secs > 0:
            # --- subtitles chain for the base video ---
            if YUV444_RENDER:
                base_chain = f"format=yuv444p,ass={tmp_path.as_posix()}{fontsdir_arg}"
            else:
                base_chain = f"ass={tmp_path.as_posix()}{fontsdir_arg}"

            # Safe clamps
            fade_d     = max(0.0, min(float(intro_fade), float(intro_secs)))
            crop_keep  = max(0.0, min(1.0, 1.0 - float(intro_crop_bottom)))
            scale_frac = max(0.05, min(2.0, float(intro_scale)))

            # Compute target card width in pixels (even number for H.264)
            scaled_w = int(round(PLAY_W * scale_frac))
            if scaled_w % 2:
                scaled_w -= 1
            if scaled_w < 2:
                scaled_w = 2  # safety

            # Filter graph:
            # [0:v] -> subtitles -> [base]
            # [1:v] -> crop -> scale to scaled_w -> fade alpha -> [cardf]
            # [base][cardf] overlay centered for first intro_secs -> [vout]
            fc = (
                f"[0:v]{base_chain}[base];"
                f"[1:v]format=rgba,crop=iw:ih*{crop_keep}:0:0[cardc];"
                f"[cardc]scale={scaled_w}:-1[cards];"
                f"[cards]fade=t=out:st={intro_secs - fade_d}:d={fade_d}:alpha=1[cardf];"
                f"[base][cardf]overlay="
                f"x=(main_w-overlay_w)/2+{int(intro_offset_x)}:"
                f"y=(main_h-overlay_h)/2+{int(intro_offset_y)}:"
                f"enable=between(t,0,{intro_secs})"
                f"[v];"
                f"[v]format=yuv420p[vout]"
            )

            cmd = [
                "ffmpeg", "-y",
                "-i", str(video_path),
                "-loop", "1", "-t", f"{intro_secs + 0.5}", "-i", str(intro_img),
                "-filter_complex", fc,
                "-map", "[vout]", "-map", "0:a?",
                "-c:v", vcodec, "-preset", "veryfast", "-crf", "18",
                "-c:a", "copy",
                "-movflags", "+faststart",
                str(out_tmp)
            ]

        else:
            # --- subtitles only with -vf ---
            if YUV444_RENDER:
                vf_arg = f"format=yuv444p,ass={tmp_path.as_posix()}{fontsdir_arg},format=yuv420p"
            else:
                vf_arg = f"ass={tmp_path.as_posix()}{fontsdir_arg}"

            cmd = [
                "ffmpeg", "-y",
                "-i", str(video_path),
                "-vf", vf_arg,
                "-c:v", vcodec, "-preset", "veryfast", "-crf", "18",
                "-c:a", "copy",
                "-movflags", "+faststart",
                str(out_tmp)
            ]

        print("[info] ffmpeg cmd:", " ".join(cmd))
        run = subprocess.run(cmd, check=False, text=True, capture_output=True)
        if run.returncode != 0:
            print(run.stderr)
            raise RuntimeError(f"ffmpeg failed (code {run.returncode})")

        out_tmp.replace(out_video)
        print("[done] saved:", out_video)

    finally:
        if tmp_path and tmp_path.exists():
            try:
                tmp_path.unlink()
            except Exception as e:
                print(f"[warn] could not delete temp ASS: {e}")
        if out_tmp.exists():
            try:
                out_tmp.unlink()
            except Exception:
                pass

    if not out_video.exists():
        raise FileNotFoundError(f"Expected output not found: {out_video}")
    return out_video.as_posix()


In [7]:
# Or override per-call:
beta_captions(
    "/Users/marcus/Downloads/reddit1_filmora_captioned/tester12345.mp4",
    intro_card_src=Path("/Users/marcus/Downloads/Thumb_shorts_white.png"),
    intro_secs=3.0,
    intro_fade=0.3,
    intro_scale=0.92,
    intro_crop_bottom=0.12,
    intro_offset_x=0,
    intro_offset_y=0
)


[info] video: /Users/marcus/Downloads/reddit1_filmora_captioned/tester12345.mp4
[info] PlayRes set to: 1080x1920 | Center=(540,960)
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 22
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] caption groups built: 22 (max_words=1, max_chars=None, max_gap_s=1.2)
[info] OUTPUT_DIR: /Users/marcus/Downloads/reddit1_filmora_captioned
[info] Output file: /Users/marcus/Downloads/reddit1_filmora_captioned/exported_20250906_020900.mp4
[info] intro card: /Users/marcus/Downloads/Thumb_shorts_white.png
[info] ffmpeg cmd: ffmpeg -y -i /Users/marcus/Downloads/reddit1_filmora_captioned/tester12345.mp4 -loop 1 -t 3.5 -i /Users/marcus/Downloads/Thumb_shorts_whit

RuntimeError: ffmpeg failed (code 8)

In [11]:
# === CONFIG ==============================================================

from pathlib import Path
import tempfile

# Where to save the rendered video:
OUTPUT_DIR = Path.home() / "Downloads" / "reddit1_filmora_captioned"

# Filename pattern (placeholders: {stem}=input filename stem, {ts}=timestamp, {anim}=ANIM)
FILENAME_TEMPLATE = "exported_{ts}.mp4"

MODEL_NAME        = "small.en"       # tiny/base/small/medium/large-v3; *.en faster for English

# Caption look
FONT_SIZE         = 210
UPPERCASE         = True

# Outline controls
BORDER_PX         = 12.0
SHADOW_PX         = 2.0
BLUR_PX           = 0.0
STABILIZE_OUTLINE = True

# Timing hyperparams
MIN_CAPTION_SEC   = 0.30
CUT_AHEAD_SEC     = 0.00
TAIL_HOLD_SEC     = 1.20

# Grouping hyperparams
MAX_WORDS_PER_CAP = 1
MAX_CHARS_PER_CAP = None
MAX_GAP_SEC       = 1.20

# Animation
ANIM              = "inflate"
ANIM_IN_MS        = 20000
ANIM_OUT_MS       = 50

# Fonts
CUSTOM_FONT_DIR   = Path.home() / "Documents" / "mrbeast_caps" / "fonts"

# Rendering safety
AUTO_PLAYRES      = True             # match ASS PlayRes to actual video resolution
YUV444_RENDER     = True             # render subs in 4:4:4 to stabilize edges, then downsample

# --- NEW: Reddit intro card overlay --------------------------------------
INTRO_CARD_SRC     = Path("/Users/marcus/Downloads/Thumb_shorts_white")   # file OR folder; None disables
INTRO_ENABLED      = True
INTRO_SECS         = 3.0
INTRO_FADE         = 0.30
INTRO_SCALE        = 0.92
INTRO_CROP_BOTTOM  = 0.12
INTRO_OFFSET_X     = 0
INTRO_OFFSET_Y     = 0
INTRO_ROUND_PX     = 40   
# ========================================================================

import os, subprocess, shutil, platform, re, sys
from datetime import timedelta
from faster_whisper import WhisperModel
from datetime import datetime

def timestamp(fmt: str = "%Y%m%d_%H%M%S") -> str:
    return datetime.now().strftime(fmt)

def sanitize_stem(stem: str) -> str:
    return re.sub(r'[^A-Za-z0-9_.-]+', '_', stem).strip('_')

def ensure_dir(p: Path) -> Path:
    p.mkdir(parents=True, exist_ok=True)
    return p

# ---------- probe video resolution ----------
def probe_resolution(video_path: Path) -> tuple[int, int]:
    cmd = [
        "ffprobe","-v","error","-select_streams","v:0",
        "-show_entries","stream=width,height","-of","csv=s=x:p=0", str(video_path)
    ]
    try:
        out = subprocess.check_output(cmd, text=True).strip()
        w, h = map(int, out.split("x"))
        print(f"[info] probed resolution: {w}x{h}")
        return w, h
    except Exception:
        print("[warn] ffprobe failed, falling back to 1920x1080")
        return 1920, 1080

CENTER_X, CENTER_Y = None, None

# ---------- load Whisper ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    osname = _pf.system()
    candidates = (["float16", "int8", "float32"] if osname == "Darwin"
                  else ["int8_float16", "int8", "float16", "float32"])
    last = None
    for ct in candidates:
        try:
            print(f"[info] trying compute_type={ct} …")
            return WhisperModel(model_name, compute_type=ct, device="auto")
        except ValueError as e:
            print(f"[skip] {e}")
            last = e
    raise last

# ---------- font ----------
def pick_custom_font(font_dir: Path):
    font_dir.mkdir(parents=True, exist_ok=True)
    candidates = list(font_dir.glob("*.ttf")) + list(font_dir.glob("*.otf"))
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download any .ttf or .otf font.")
        print(f"2) Place it here: {font_dir}")
        print("3) Re-run.")
        raise SystemExit("[exit] No font found yet.")
    font_file = candidates[0]
    family = None
    try:
        from fontTools.ttLib import TTFont
        tt = TTFont(font_file)
        names = {n.nameID: n.toUnicode() for n in tt["name"].names if n.toUnicode()}
        family = names.get(1) or names.get(4)
    except Exception:
        family = font_file.stem
    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS header ----------
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: {play_w}
PlayResY: {play_h}
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,{border},{shadow},5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def _scale_spec_for_anim(name: str):
    name = (name or "none").lower()
    if name in ("inflate", "inflate_soft", "pop"):
        return 80, 100
    if name == "zoom":
        return 60, 100
    return 100, 100

def _fmt_float(x: float) -> str:
    return f"{x:.2f}".rstrip("0").rstrip(".")

def anim_tag(cx: int, cy: int, name: str, in_ms: int, out_ms: int,
             border_px: float, stabilize_outline: bool, blur_px: float) -> str:
    cx, cy = int(round(cx)), int(round(cy))
    s0, s1 = _scale_spec_for_anim(name)
    if stabilize_outline and (s0 != s1):
        bord0 = border_px * (s0 / 100.0)
        bord1 = border_px * (s1 / 100.0)
        bord_tag0 = rf"\bord{_fmt_float(bord0)}"
        bord_anim = rf"\t(0,{in_ms},\bord{_fmt_float(bord1)})"
    else:
        bord_tag0 = rf"\bord{_fmt_float(border_px)}"
        bord_anim = ""
    blur_tag = (rf"\blur{_fmt_float(blur_px)}" if blur_px and blur_px > 0 else "")
    name = (name or "none").lower()
    if name == "none":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}}}"
    if name == "fade":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fad({in_ms},{out_ms})}}"
    if name == "pop":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx80\fscy80\t(0,{in_ms},\fscx100\fscy100){bord_anim}}}"
    if name == "zoom":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx60\fscy60\t(0,{in_ms},\fscx100\fscy100){bord_anim}}}"
    if name == "bounce":
        return (rf"{{\an5\move({cx},{cy-40},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}"
                rf"\fscx120\fscy120\t(0,120,\fscx95\fscy95)\t(120,{in_ms},\fscx100\fscy100){bord_anim}}}")
    if name == "slide_up":
        return rf"{{\an5\move({cx},{cy+60},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "slide_down":
        return rf"{{\an5\move({cx},{cy-60},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "slide_left":
        return rf"{{\an5\move({cx-140},{cy},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "slide_right":
        return rf"{{\an5\move({cx+140},{cy},{cx},{cy},0,{in_ms}){bord_tag0}{blur_tag}}}"
    if name == "rotate":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\frz-12\t(0,{in_ms},\frz0)}}"
    if name == "inflate":
        return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx80\fscy80\t(0,{in_ms},\fscx100\fscy100){bord_anim}}}"
    if name == "inflate_soft":
        return (rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}\fscx80\fscy80\alpha&H20&\blur2"
                rf"\t(0,{in_ms},\fscx100\fscy100\alpha&H00&\blur0){bord_anim}}}")
    return rf"{{\an5\pos({cx},{cy}){bord_tag0}{blur_tag}}}"

# ---------- Group words into captions ----------
def clean_token(s: str) -> str:
    s = (s or "").strip()
    s = re.sub(r"^\s+|\s+$", "", s)
    return re.sub(r"^\W+|\W+$", "", s)

def group_words_to_captions(words,
                            max_words=1,
                            max_chars=None,
                            max_gap_s=0.6):
    lines, cur = [], []
    last_end = None
    for w in words:
        token = clean_token(w["text"])
        if not token:
            continue
        gap = (w["start"] - last_end) if last_end is not None else 0.0

        join_len = len((" ".join([x["text"] for x in cur] + [token])).strip())
        need_new = False
        if last_end is not None and gap > max_gap_s:
            need_new = True
        if cur and len(cur) >= max_words:
            need_new = True
        if (not need_new) and (max_chars is not None) and (join_len > max_chars):
            need_new = True

        if need_new and cur:
            lines.append(cur)
            cur = []

        cur.append({"start": float(w["start"]), "end": float(w["end"]), "text": token})
        last_end = float(w["end"])

    if cur:
        lines.append(cur)
    return lines

def build_center_caption_events(lines,
                                play_w: int, play_h: int,
                                uppercase=True,
                                min_caption=0.30,
                                cut_ahead=0.00,
                                tail_hold=1.20,
                                anim="none",
                                in_ms=220,
                                out_ms=100,
                                border_px=12.0,
                                stabilize_outline=True,
                                blur_px=0.0):
    cx = int(round(play_w / 2))
    cy = int(round(play_h / 2))
    events = []
    n = len(lines)
    for i, ln in enumerate(lines):
        t0 = float(ln[0]["start"])
        natural_end = float(ln[-1]["end"])
        t1 = max(natural_end, t0 + min_caption)

        if i + 1 < n:
            next_start = float(lines[i+1][0]["start"])
            gap_after = max(0.0, next_start - natural_end - cut_ahead)
            t1 = min(max(t1, natural_end + min(tail_hold, gap_after)), next_start - cut_ahead)
        else:
            t1 = max(natural_end + tail_hold, t0 + min_caption)

        if t1 <= t0:
            t1 = t0 + 0.05

        text = " ".join([w["text"] for w in ln]).strip()
        if uppercase:
            text = text.upper()

        ov = anim_tag(cx, cy, anim, in_ms, out_ms, border_px, stabilize_outline, blur_px)
        events.append(f"Dialogue: 0,{_fmt_time(t0)},{_fmt_time(t1)},Beast,,0,0,0,,{ov}{text}")
    return events

# ---------- resolve an intro image (file OR folder) -------------
def resolve_intro_image(src: Path | None) -> Path | None:
    if not src:
        print("[debug] resolve_intro_image: src=None")
        return None
    src = Path(src).expanduser()
    print(f"[debug] resolve_intro_image: candidate='{src}' exists={src.exists()} is_dir={src.is_dir()}")
    if src.exists():
        if src.is_file():
            return src
        # folder: pick first image
        allowed = {".png",".jpg",".jpeg",".webp",".bmp"}
        imgs = [f for f in sorted(src.iterdir()) if f.is_file() and f.suffix.lower() in allowed]
        return imgs[0] if imgs else None
    return None

# ---------- MAIN ----------------------------------------------------------
def beta_captions(INPUT_VIDEO: str | Path,
                  intro_card_src: Path | None = INTRO_CARD_SRC,
                  intro_enabled: bool = INTRO_ENABLED,
                  intro_secs: float = INTRO_SECS,
                  intro_fade: float = INTRO_FADE,
                  intro_scale: float = INTRO_SCALE,
                  intro_crop_bottom: float = INTRO_CROP_BOTTOM,
                  intro_offset_x: int = INTRO_OFFSET_X,
                  intro_offset_y: int = INTRO_OFFSET_Y,
                  intro_round_px: int = INTRO_ROUND_PX) -> str:

    video_path = Path(INPUT_VIDEO).expanduser().resolve()
    assert video_path.exists(), f"Video not found: {video_path}"
    print("[info] video:", video_path)

    PLAY_W, PLAY_H = probe_resolution(video_path) if AUTO_PLAYRES else (1920, 1080)
    if CENTER_X is None or CENTER_Y is None:
        cx, cy = PLAY_W // 2, PLAY_H // 2
    else:
        cx, cy = CENTER_X, CENTER_Y
    print(f"[info] PlayRes set to: {PLAY_W}x{PLAY_H} | Center=({cx},{cy})")

    print("[info] loading Whisper model …")
    model = load_whisper_auto(MODEL_NAME)

    print("[info] transcribing (word timestamps) …")
    segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

    words = []
    for seg in segments:
        if seg.words:
            for w in seg.words:
                tok = (w.word or "").strip()
                if tok:
                    words.append({"start": float(w.start), "end": float(w.end), "text": tok})

    print(f"[info] words captured: {len(words)}")

    # ---------- Build ASS ----------
    _, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)
    ASS_HEADER = ASS_HEADER_TMPL.format(
        play_w=PLAY_W, play_h=PLAY_H, font=FONT_NAME, size=FONT_SIZE,
        border=_fmt_float(BORDER_PX), shadow=_fmt_float(SHADOW_PX)
    )

    caption_lines = group_words_to_captions(
        words,
        max_words=MAX_WORDS_PER_CAP,
        max_chars=MAX_CHARS_PER_CAP,
        max_gap_s=MAX_GAP_SEC
    )
    print(f"[info] caption groups built: {len(caption_lines)} "
          f"(max_words={MAX_WORDS_PER_CAP}, max_chars={MAX_CHARS_PER_CAP}, max_gap_s={MAX_GAP_SEC})")

    ass_events = build_center_caption_events(
        caption_lines,
        play_w=PLAY_W, play_h=PLAY_H,
        uppercase=UPPERCASE,
        min_caption=MIN_CAPTION_SEC,
        cut_ahead=CUT_AHEAD_SEC,
        tail_hold=TAIL_HOLD_SEC,
        anim=ANIM,
        in_ms=ANIM_IN_MS,
        out_ms=ANIM_OUT_MS,
        border_px=BORDER_PX,
        stabilize_outline=STABILIZE_OUTLINE,
        blur_px=BLUR_PX
    )
    ass_text = ASS_HEADER + "\n".join(ass_events)

    # ---------- Output path ----------
    out_dir = ensure_dir(Path(OUTPUT_DIR))
    ts = timestamp()
    out_name = FILENAME_TEMPLATE.format(stem=sanitize_stem(video_path.stem), ts=ts, anim=ANIM)
    out_video = (out_dir / out_name).resolve()
    out_tmp = out_video.with_suffix(".tmp.mp4")
    print("[info] OUTPUT_DIR:", out_dir)
    print("[info] Output file:", out_video)

    if not shutil.which("ffmpeg"):
        raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")

    fontsdir_arg = f":fontsdir={CUSTOM_FONT_DIR.as_posix()}"

    tmp_path = None
    try:
        # write ASS to temp file
        with tempfile.NamedTemporaryFile("w", suffix=".ass", delete=False, encoding="utf-8") as tmp:
            tmp.write(ass_text)
            tmp.flush()
            tmp_path = Path(tmp.name)

        vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"

        intro_img = resolve_intro_image(intro_card_src) if intro_enabled else None
        print(f"[debug] intro_enabled={intro_enabled} intro_secs={intro_secs} intro_fade={intro_fade}")
        print(f"[debug] resolved intro image: {intro_img}")

        if intro_img and intro_secs > 0:
            # subtitles chain for base video
            base_chain = (f"format=yuv444p,ass={tmp_path.as_posix()}{fontsdir_arg}"
                        if YUV444_RENDER else
                        f"ass={tmp_path.as_posix()}{fontsdir_arg}")

            fade_d     = max(0.0, min(float(intro_fade), float(intro_secs)))
            crop_keep  = max(0.0, min(1.0, 1.0 - float(intro_crop_bottom)))
            scale_frac = max(0.05, min(2.0, float(intro_scale)))
            scaled_w   = int(round(PLAY_W * scale_frac))
            if scaled_w % 2: scaled_w -= 1
            if scaled_w < 2: scaled_w = 2

            enable_expr = f"between(t\\,0\\,{intro_secs})"  # escape commas

            round_px = max(0, int(intro_round_px))
            print(f"[debug] overlay params: crop_keep={crop_keep} scale_frac={scale_frac} "
                f"scaled_w={scaled_w} fade_d={fade_d} offsets=({intro_offset_x},{intro_offset_y}) "
                f"round_px={round_px}")

            # Build the card-processing chain; if rounding requested, compute an alpha mask via geq()
            if round_px > 0:
                # alpha expression: 255 inside rounded-rect, 0 outside
                aexpr = (
                    f"if(lte(hypot("
                    f"if(lt(X,{round_px}),{round_px}-X,if(lt(W-X,{round_px}),{round_px}-(W-X),0)),"
                    f"if(lt(Y,{round_px}),{round_px}-Y,if(lt(H-Y,{round_px}),{round_px}-(H-Y),0))"
                    f"),{round_px}),255,0)"
                )
                card_chain = (
                    f"[cardc]scale={scaled_w}:-1[card_s];"
                    f"[card_s]format=rgba,"
                    f"geq=r='r(X,Y)':g='g(X,Y)':b='b(X,Y)':a='{aexpr}'[card_r];"
                    f"[card_r]fade=t=out:st={intro_secs - fade_d}:d={fade_d}:alpha=1[cardf];"
                )
            else:
                card_chain = (
                    f"[cardc]scale={scaled_w}:-1[cards];"
                    f"[cards]fade=t=out:st={intro_secs - fade_d}:d={fade_d}:alpha=1[cardf];"
                )

            fc = (
                f"[0:v]{base_chain}[base];"
                f"[1:v]format=rgba,crop=iw:ih*{crop_keep}:0:0[cardc];"
                f"{card_chain}"
                f"[base][cardf]overlay="
                f"x=(main_w-overlay_w)/2+{int(intro_offset_x)}:"
                f"y=(main_h-overlay_h)/2+{int(intro_offset_y)}:"
                f"enable='{enable_expr}'[v];"
                f"[v]format=yuv420p[vout]"
            )
            print("[debug] filter_complex >>>\n" + fc + "\n<<< end filter_complex")


            cmd = [
                "ffmpeg","-y","-hide_banner","-loglevel","info",
                "-i", str(video_path),
                "-loop","1","-t", f"{intro_secs + 0.5}","-i", str(intro_img),
                "-filter_complex", fc,
                "-map","[vout]","-map","0:a?",
                "-c:v", vcodec, "-preset","veryfast","-crf","18",
                "-c:a","copy",
                "-movflags","+faststart",
                str(out_tmp)
            ]
        else:
            vf_arg = (f"format=yuv444p,ass={tmp_path.as_posix()}{fontsdir_arg},format=yuv420p"
                      if YUV444_RENDER else
                      f"ass={tmp_path.as_posix()}{fontsdir_arg}")
            cmd = [
                "ffmpeg","-y","-hide_banner","-loglevel","info",
                "-i", str(video_path),
                "-vf", vf_arg,
                "-c:v", vcodec, "-preset","veryfast","-crf","18",
                "-c:a","copy",
                "-movflags","+faststart",
                str(out_tmp)
            ]

        print("[info] ffmpeg cmd:", " ".join(cmd))
        run = subprocess.run(cmd, check=False, text=True, capture_output=True)
        if run.returncode != 0:
            # show head/tail to diagnose
            err = run.stderr or ""
            head = "\n".join(err.splitlines()[:20])
            tail = "\n".join(err.splitlines()[-20:])
            print("\n[ffmpeg stderr — head]\n" + head + "\n")
            print("[ffmpeg stderr — tail]\n" + tail + "\n")
            print("[ffmpeg stderr — end]")
            raise RuntimeError(f"ffmpeg failed (code {run.returncode})")

        out_tmp.replace(out_video)
        print("[done] saved:", out_video)

    finally:
        if tmp_path and tmp_path.exists():
            try: tmp_path.unlink()
            except Exception as e: print(f"[warn] could not delete temp ASS: {e}")
        if out_tmp.exists():
            try: out_tmp.unlink()
            except Exception: pass

    if not out_video.exists():
        raise FileNotFoundError(f"Expected output not found: {out_video}")
    return out_video.as_posix()


In [18]:
# Or override per-call:
pathing = beta_captions(
    "/Users/marcus/Downloads/reddit1_filmora_captioned/tester12345.mp4",
    intro_card_src=Path("/Users/marcus/Downloads/Shorts_thumbv2w.png"),
    intro_secs=4.0,
    intro_fade=0.1,
    intro_scale=0.80,
    intro_crop_bottom=0.25,
    intro_offset_x=0,
    intro_offset_y=0,
    intro_round_px=45
)


[info] video: /Users/marcus/Downloads/reddit1_filmora_captioned/tester12345.mp4
[info] probed resolution: 1080x1920
[info] PlayRes set to: 1080x1920 | Center=(540,960)
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 22
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] caption groups built: 22 (max_words=1, max_chars=None, max_gap_s=1.2)
[info] OUTPUT_DIR: /Users/marcus/Downloads/reddit1_filmora_captioned
[info] Output file: /Users/marcus/Downloads/reddit1_filmora_captioned/exported_20250906_023447.mp4
[debug] resolve_intro_image: candidate='/Users/marcus/Downloads/Shorts_thumbv2w.png' exists=True is_dir=False
[debug] intro_enabled=True intro_secs=4.0 intro_fade=0.1
[debug] resolved int

In [19]:
pathing

'/Users/marcus/Downloads/reddit1_filmora_captioned/exported_20250906_023447.mp4'

In [25]:
from __future__ import annotations
from pathlib import Path
from datetime import datetime
from PIL import Image, ImageDraw, ImageFont

def _timestamp(fmt="%Y%m%d_%H%M%S") -> str:
    return datetime.now().strftime(fmt)

def _ensure_dir(p: Path) -> Path:
    p = Path(p)
    p.mkdir(parents=True, exist_ok=True)
    return p

def _find_arial_path() -> Path | None:
    """Try common system paths for 'Arial.ttf' (macOS/Windows)."""
    candidates = [
        # macOS
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/System/Library/Fonts/Arial.ttf",
        "/Library/Fonts/Arial.ttf",
        # Windows
        "C:/Windows/Fonts/arial.ttf",
        "C:/Windows/Fonts/Arial.ttf",
    ]
    for c in candidates:
        p = Path(c)
        if p.exists():
            return p
    return None

def render_sentence_on_thumbnail(
    image_path: str | Path,
    sentence: str,
    box: tuple[int, int, int, int],           # (x, y, w, h) in pixels
    out_dir: str | Path = "/Users/marcus/Downloads/shorts_thumbnails_storage",
    # Font controls
    font_path: str | Path | None = None,       # None => auto-resolve Arial
    min_font: int = 16,
    max_font: int = 420,
    line_spacing: float = 1.08,                # line height multiplier
    letter_spacing_px: int = 0,                # NEW: tracking in pixels
    # Color & effects
    fill=(255, 255, 255),                      # text fill color
    stroke_fill=(0, 0, 0),                     # outline color
    stroke_ratio: float = 0.08,                # outline thickness as % of font size
    bold_strength_px: int = 0,                 # NEW: thicken fill by Npx (draw extra passes)
    padding_px: int = 12,                      # inner padding inside the box
) -> str:
    """
    Draw `sentence` into `box=(x,y,w,h)` on `image_path`, choosing the largest uniform font size
    that fits both width and height (with wrapping). Supports letter spacing and adjustable fill
    boldness (thickness). Saves a PNG to `out_dir` and returns its path.
    """

    image_path = Path(image_path)
    out_dir = _ensure_dir(Path(out_dir))
    assert image_path.exists(), f"Image not found: {image_path}"

    # Resolve font
    if font_path is None:
        arial = _find_arial_path()
        if not arial:
            raise FileNotFoundError(
                "Arial.ttf not found on this system. Pass a valid `font_path` to a .ttf file."
            )
        font_path = arial
    font_path = Path(font_path)
    assert font_path.exists(), f"Font not found: {font_path}"

    # Load image + drawer
    img = Image.open(image_path).convert("RGBA")
    draw = ImageDraw.Draw(img)

    x, y, bw, bh = box
    inner_w = max(1, bw - 2 * padding_px)
    inner_h = max(1, bh - 2 * padding_px)

    # --- measurement helpers (respect letter spacing) ---------------------
    def char_width(ch: str, font: ImageFont.FreeTypeFont, stroke_w: int) -> int:
        bbox = draw.textbbox((0, 0), ch, font=font, stroke_width=stroke_w)
        return bbox[2] - bbox[0]

    def line_width(line: str, font: ImageFont.FreeTypeFont, stroke_w: int) -> int:
        if not line:
            return 0
        total = 0
        for i, ch in enumerate(line):
            total += char_width(ch, font, stroke_w)
            if i < len(line) - 1:
                total += max(0, letter_spacing_px)
        return total

    def wrap_lines(font: ImageFont.FreeTypeFont, text: str, max_w: int, stroke_w: int) -> list[str]:
        """Greedy word wrap using width with tracking."""
        words = text.split()
        lines: list[str] = []
        cur = ""
        for w in words:
            attempt = (cur + " " + w).strip() if cur else w
            if line_width(attempt, font, stroke_w) <= max_w:
                cur = attempt
            else:
                if cur:
                    lines.append(cur)
                cur = w
        if cur:
            lines.append(cur)
        return lines

    def block_size(font: ImageFont.FreeTypeFont, lines: list[str], stroke_w: int) -> tuple[int, int]:
        """Return (max_width, total_height) for wrapped lines with spacing and stroke."""
        max_w_px, total_h = 0, 0
        ascent, descent = font.getmetrics()
        # Add a tiny headroom for stroke top/bottom
        base_line_h = ascent + descent + stroke_w
        line_h = int(round(base_line_h * line_spacing))
        for ln in lines:
            w_px = line_width(ln, font, stroke_w)
            max_w_px = max(max_w_px, w_px)
            total_h += line_h
        # a little extra breathing space
        total_h += max(1, stroke_w)
        return max_w_px, total_h

    # --- search best font size -------------------------------------------
    best = None
    lo, hi = int(min_font), int(max_font)
    while lo <= hi:
        mid = (lo + hi) // 2
        font_mid = ImageFont.truetype(str(font_path), mid)
        stroke_w = max(0, int(round(mid * stroke_ratio)))
        lines = wrap_lines(font_mid, sentence, inner_w, stroke_w)
        w_px, h_px = block_size(font_mid, lines, stroke_w)
        if w_px <= inner_w and h_px <= inner_h:
            best = (mid, stroke_w, lines)
            lo = mid + 1   # try bigger
        else:
            hi = mid - 1   # too big

    if best is None:
        # fallback: smallest size
        size = int(min_font)
        lines = wrap_lines(ImageFont.truetype(str(font_path), size), sentence, inner_w, 0)
        stroke_w = max(0, int(round(size * stroke_ratio)))
        fnt = ImageFont.truetype(str(font_path), size)
    else:
        size, stroke_w, lines = best
        fnt = ImageFont.truetype(str(font_path), size)

    # --- compute vertical centering ---------------------------------------
    ascent, descent = fnt.getmetrics()
    base_line_h = ascent + descent + stroke_w
    line_h = int(round(base_line_h * line_spacing))
    total_h = len(lines) * line_h + max(1, stroke_w)
    start_y = y + padding_px + (inner_h - total_h) // 2
    center_x = x + bw // 2

    # --- draw with tracking + optional bold fill --------------------------
    def draw_line(ln: str, top_y: int):
        # center horizontally using measured width with tracking
        ln_w = line_width(ln, fnt, stroke_w)
        left_x = int(center_x - ln_w / 2)

        # helper: draw one pass of the whole line (char-by-char)
        def _draw_pass(dx: int, dy: int, use_stroke: bool, color):
            cx = left_x + dx
            for i, ch in enumerate(ln):
                draw.text((cx, top_y + dy), ch, font=fnt, fill=color,
                          stroke_width=stroke_w if use_stroke else 0,
                          stroke_fill=stroke_fill if use_stroke else None)
                cx += char_width(ch, fnt, stroke_w) + max(0, letter_spacing_px)

        # Thicken fill first (no stroke) by drawing nearby offsets
        r = max(0, int(bold_strength_px))
        if r > 0:
            offsets = {
                ( r, 0), (-r, 0), (0,  r), (0, -r),
                ( r,  r), ( r, -r), (-r,  r), (-r, -r),
            }
            for dx, dy in offsets:
                _draw_pass(dx, dy, use_stroke=False, color=fill)

        # Main pass: with outline for crisp edges
        _draw_pass(0, 0, use_stroke=True, color=fill)

    cur_y = start_y
    for ln in lines:
        draw_line(ln, cur_y)
        cur_y += line_h

    out_path = out_dir / f"thumb_{_timestamp()}.png"
    img.save(out_path, format="PNG", optimize=True)
    return str(out_path)


In [41]:
out_png = render_sentence_on_thumbnail(
    image_path="/Users/marcus/Downloads/Shorts_thumbv2w.png",
    sentence="Doctors lied about my brother’s condition after he went into a coma.",
    box=(40, 60, 1080, 500),  # (x, y, w, h) tuned for your template
    out_dir="/Users/marcus/Downloads/shorts_thumbnails_storage",
    font_path=None,                 # None => auto-detect Arial; or pass a .ttf path explicitly
    min_font=28,
    max_font=28,
    line_spacing=1.06,
    letter_spacing_px=0.0,            # NEW: tweak tracking
    bold_strength_px=1,             # NEW: thicken fill by ~1px (set 0 to disable)
    fill=(255, 255, 255),
    stroke_fill=(0, 0, 0),
    stroke_ratio=1.00,              # outline ≈ 9% of size
    padding_px=0,
)
print("Saved:", out_png)


Saved: /Users/marcus/Downloads/shorts_thumbnails_storage/thumb_20250906_030330.png


In [44]:
from pathlib import Path
from datetime import datetime
from PIL import Image, ImageDraw, ImageFont

def _ts(fmt="%Y%m%d_%H%M%S"): 
    return datetime.now().strftime(fmt)

def _ensure_dir(p: str | Path) -> Path:
    p = Path(p); p.mkdir(parents=True, exist_ok=True); return p

def _find_arial() -> Path | None:
    # Common macOS / Windows locations
    for p in [
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/System/Library/Fonts/Arial.ttf",
        "/Library/Fonts/Arial.ttf",
        "C:/Windows/Fonts/arial.ttf",
        "C:/Windows/Fonts/Arial.ttf",
    ]:
        if Path(p).exists(): return Path(p)
    return None

def render_black_topleft(
    image_path: str | Path,
    text: str,
    box: tuple[int, int, int, int],      # (x, y, w, h) area to fill
    out_dir: str | Path = "/Users/marcus/Downloads/shorts_thumbnails_storage",
    font_size: int = 160,                 # target size; will shrink-to-fit
    min_font: int = 24,                   # shrink floor
    line_spacing: float = 1.08,           # line height multiplier
    letter_spacing_px: int = 0,           # tracking between all characters
    bold_px: int = 0,                     # thickness (0=normal). Uses stroke to “embolden”
    padding_px: int = 8,                  # inner padding inside the box
    font_path: str | Path | None = None,  # None => auto-find Arial
    color=(0, 0, 0),                      # solid black
) -> str:
    """
    Draws plain black Arial from the TOP-LEFT of `box`, word-wrapped.
    Uses `font_size` but shrinks if needed to fit width & height.
    Boldness is simulated via stroke_width (thickness in pixels).
    Returns the saved PNG path.
    """
    image_path = Path(image_path); assert image_path.exists(), f"Image not found: {image_path}"
    out_dir = _ensure_dir(out_dir)

    if font_path is None:
        font_path = _find_arial()
        if not font_path:
            raise FileNotFoundError("Arial.ttf not found. Pass `font_path` to a valid .ttf.")
    font_path = Path(font_path); assert font_path.exists(), f"Font not found: {font_path}"

    img = Image.open(image_path).convert("RGBA")
    draw = ImageDraw.Draw(img)

    x, y, bw, bh = box
    max_w = max(1, bw - 2 * padding_px)
    max_h = max(1, bh - 2 * padding_px)

    # ---------- measurement helpers (respect stroke & letter spacing) ----------
    def char_size(ch: str, fnt: ImageFont.FreeTypeFont) -> tuple[int, int]:
        b = draw.textbbox((0, 0), ch, font=fnt, stroke_width=bold_px)
        return b[2] - b[0], b[3] - b[1]

    def string_width(s: str, fnt: ImageFont.FreeTypeFont) -> int:
        if not s: return 0
        w = 0
        for i, ch in enumerate(s):
            cw, _ = char_size(ch, fnt)
            w += cw
            if i < len(s) - 1:
                w += max(0, letter_spacing_px)
        return w

    def single_line_height(fnt: ImageFont.FreeTypeFont) -> int:
        # bbox includes stroke; multiply by line_spacing
        b = draw.textbbox((0, 0), "Hg", font=fnt, stroke_width=bold_px)
        raw = max(1, b[3] - b[1])
        return int(round(raw * line_spacing))

    def wrap_words(fnt: ImageFont.FreeTypeFont, s: str, maxw: int) -> list[str]:
        words = s.split()
        lines, cur = [], ""
        for w in words:
            cand = (cur + " " + w).strip() if cur else w
            if string_width(cand, fnt) <= maxw:
                cur = cand
            else:
                if cur: lines.append(cur)
                cur = w   # assume no single-word outliers per your guarantee
        if cur: lines.append(cur)
        return lines

    def block_dims(fnt: ImageFont.FreeTypeFont, lines: list[str]) -> tuple[int, int]:
        lh = single_line_height(fnt)
        H = len(lines) * lh
        W = 0
        for ln in lines:
            W = max(W, string_width(ln, fnt))
        return W, H

    # ---------- fit loop ----------
    size = max(min_font, int(font_size))
    fnt = ImageFont.truetype(str(font_path), size)
    lines = wrap_words(fnt, text, max_w)
    W, H = block_dims(fnt, lines)

    while (W > max_w or H > max_h) and size > min_font:
        size -= 2
        fnt = ImageFont.truetype(str(font_path), size)
        lines = wrap_words(fnt, text, max_w)
        W, H = block_dims(fnt, lines)

    # ---------- draw top-left (no centering) ----------
    tx = x + padding_px
    ty = y + padding_px
    lh = single_line_height(fnt)

    for ln in lines:
        cx = tx
        for i, ch in enumerate(ln):
            draw.text(
                (cx, ty),
                ch,
                font=fnt,
                fill=color,
                stroke_width=max(0, bold_px),
                stroke_fill=color,   # stroke same color = thicker glyph
            )
            cw, _ = char_size(ch, fnt)
            cx += cw + max(0, letter_spacing_px)
        ty += lh

    out_path = out_dir / f"thumb_black_tl_{_ts()}.png"
    img.save(out_path, "PNG", optimize=True)
    return str(out_path)


In [None]:
saved_path = render_black_topleft(
    image_path="/Users/marcus/Downloads/Shorts_thumbv2w.png",
    text="Have you ever accidentally saved a life? I like playing clash royale and reaching",
    box=(40, 235, 1200, 200),              # (x, y, w, h) — the white card area
    out_dir="/Users/marcus/Downloads/shorts_thumbnails_storage",
    font_size=70,                         # target “X size”
    min_font=48,
    line_spacing=1.42,
    letter_spacing_px=0,                   # normal spacing
    bold_px=1.6,                             # thickness (0=normal, try 2–4 for bold)
    # font_path=None,                      # leave None to auto-find Arial
)
print("Saved to:", saved_path)


Saved to: /Users/marcus/Downloads/shorts_thumbnails_storage/thumb_black_tl_20250906_032007.png


In [69]:
from pathlib import Path
from datetime import datetime
from PIL import Image, ImageDraw, ImageFont

def _ts(fmt="%Y%m%d_%H%M%S"): 
    return datetime.now().strftime(fmt)

def _ensure_dir(p: str | Path) -> Path:
    p = Path(p); p.mkdir(parents=True, exist_ok=True); return p

def _find_arial() -> Path | None:
    # Common macOS / Windows locations
    for p in [
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/System/Library/Fonts/Arial.ttf",
        "/Library/Fonts/Arial.ttf",
        "C:/Windows/Fonts/arial.ttf",
        "C:/Windows/Fonts/Arial.ttf",
    ]:
        if Path(p).exists(): return Path(p)
    return None

def render_black_topleft(
    image_path: str | Path,
    text: str,
    box: tuple[int, int, int, int],      # (x, y, w, h)
    out_dir: str | Path = "/Users/marcus/Downloads/shorts_thumbnails_storage",
    font_size: int = 160,                 # will shrink-to-fit
    min_font: int = 24,
    line_spacing: float = 1.08,
    letter_spacing_px: int = 0,           # can be negative (tighten letters)
    space_extra_px: int = 0,              # added to spaces between words
    bold_px: int = 0,                     # thickness via stroke
    padding_px: int = 8,
    font_path: str | Path | None = None,  # None => auto-find Arial
    color=(0, 0, 0),
) -> str:
    """
    Draws plain black Arial from the TOP-LEFT of `box`, word-wrapped.
    `letter_spacing_px` applies between letters inside words (can be negative).
    `space_extra_px` adds extra pixels to spaces (word gaps).
    Returns saved PNG path.
    """
    image_path = Path(image_path); assert image_path.exists(), f"Image not found: {image_path}"
    out_dir = _ensure_dir(out_dir)

    if font_path is None:
        font_path = _find_arial()
        if not font_path:
            raise FileNotFoundError("Arial.ttf not found. Pass `font_path` to a valid .ttf.")
    font_path = Path(font_path); assert font_path.exists(), f"Font not found: {font_path}"

    img = Image.open(image_path).convert("RGBA")
    draw = ImageDraw.Draw(img)

    x, y, bw, bh = box
    max_w = max(1, bw - 2 * padding_px)
    max_h = max(1, bh - 2 * padding_px)

    # ---------- measurement helpers ----------
    def char_bbox(s: str, fnt: ImageFont.FreeTypeFont):
        return draw.textbbox((0, 0), s, font=fnt, stroke_width=bold_px)

    def char_size(ch: str, fnt: ImageFont.FreeTypeFont) -> tuple[int, int]:
        b = char_bbox(ch, fnt)
        return b[2] - b[0], b[3] - b[1]

    def string_width(s: str, fnt: ImageFont.FreeTypeFont) -> int:
        if not s: return 0
        w = 0
        for i, ch in enumerate(s):
            cw, _ = char_size(ch, fnt)
            w += cw
            if i < len(s) - 1:
                nxt = s[i + 1]
                if ch == ' ':
                    # widen spaces themselves
                    w += max(0, space_extra_px)
                else:
                    # letter spacing only if the next thing isn't a space
                    if nxt != ' ':
                        w += letter_spacing_px
        return max(0, w)

    def line_height(fnt: ImageFont.FreeTypeFont) -> int:
        b = char_bbox("Hg", fnt)
        raw = max(1, b[3] - b[1])
        return int(round(raw * line_spacing))

    def wrap_words(fnt: ImageFont.FreeTypeFont, s: str, maxw: int) -> list[str]:
        words = s.split()
        lines, cur = [], ""
        for w in words:
            cand = (cur + " " + w).strip() if cur else w
            if string_width(cand, fnt) <= maxw:
                cur = cand
            else:
                if cur: lines.append(cur)
                cur = w
        if cur: lines.append(cur)
        return lines

    def block_dims(fnt: ImageFont.FreeTypeFont, lines: list[str]) -> tuple[int, int]:
        lh = line_height(fnt)
        H = len(lines) * lh
        W = 0
        for ln in lines:
            W = max(W, string_width(ln, fnt))
        return W, H

    # ---------- fit loop ----------
    size = max(min_font, int(font_size))
    fnt = ImageFont.truetype(str(font_path), size)
    lines = wrap_words(fnt, text, max_w)
    W, H = block_dims(fnt, lines)

    while (W > max_w or H > max_h) and size > min_font:
        size -= 2
        fnt = ImageFont.truetype(str(font_path), size)
        lines = wrap_words(fnt, text, max_w)
        W, H = block_dims(fnt, lines)

    # ---------- draw top-left ----------
    tx = x + padding_px
    ty = y + padding_px
    lh = line_height(fnt)

    for ln in lines:
        cx = tx
        for i, ch in enumerate(ln):
            draw.text(
                (cx, ty),
                ch,
                font=fnt,
                fill=color,
                stroke_width=max(0, bold_px),
                stroke_fill=color,
            )
            cw, _ = char_size(ch, fnt)
            advance = cw
            if i < len(ln) - 1:
                nxt = ln[i + 1]
                if ch == ' ':
                    advance += max(0, space_extra_px)
                else:
                    if nxt != ' ':
                        advance += letter_spacing_px
            cx += advance
        ty += lh

    out_path = out_dir / f"thumb_black_tl_{_ts()}.png"
    img.save(out_path, "PNG", optimize=True)
    return str(out_path)


In [None]:
saved_path = render_black_topleft(
    image_path="/Users/marcus/Downloads/Shorts_thumbv2w.png",
    text="Have you ever accidentally saved a life? I like playing clash royale and reaching",
    box=(40, 235, 1200, 200),              # (x, y, w, h) — the white card area
    out_dir="/Users/marcus/Downloads/shorts_thumbnails_storage",
    font_size=70,                         # target “X size”
    min_font=48,
    line_spacing=1.42,
    letter_spacing_px=0,                   # normal spacing
    space_extra_px=4,                     # widen spaces a bit
    bold_px=1.55,                             # thickness (0=normal, try 2–4 for bold)
    # font_path=None,                      # leave None to auto-find Arial
)
print("Saved to:", saved_path)


Saved to: /Users/marcus/Downloads/shorts_thumbnails_storage/thumb_black_tl_20250906_032420.png
