In [3]:
# --- CONFIG (edit these two lines as needed) -------------------------------
from pathlib import Path
INPUT_VIDEO = Path.home() / "Downloads" / "My Video.mp4"   # path to your video
MODEL_NAME  = "small.en"   # tiny/base/small/medium/large-v3 ; *.en is faster for English
# ---------------------------------------------------------------------------

import subprocess, shutil, platform
from pathlib import Path
from datetime import timedelta
from faster_whisper import WhisperModel

# ---------- helpers: ASS (karaoke) writer ----------
ASS_HEADER = """[Script Info]
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; Colors are BGR (&HAABBGGRR). SecondaryColour is the karaoke highlight color.
Style: Beast,Montserrat Black,84,&H00FFFFFF,&H001CBEFF,&H00000000,&H7F000000,-1,0,0,0,100,100,0,0,1,6,0,2,60,60,96,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _to_cs(seconds: float) -> int:
    # ASS \k uses centiseconds
    return max(1, int(round(seconds * 100)))

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    total_cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(total_cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def write_animated_ass(lines, out_path="subs.ass", y=900, offset=0.0, rate=1.0):
    """
    Write an ASS file with per-word karaoke highlight + a quick 'pop' (bouncy) at line start.

    lines:    list[list[{'start','end','text'}]] (one inner list per caption line)
    out_path: destination .ass file (str or Path)
    y:        vertical coordinate of the caption anchor, in script pixels
    offset:   global shift (s) if you added a preroll
    rate:     time scale (e.g., if final video is 1.05x faster, set rate=1.05)

    Returns out_path as a str. Raises ValueError when rate is not positive.
    """
    if rate <= 0:
        raise ValueError(f"rate must be > 0, got {rate!r}")

    def map_t(t):
        return offset + (t / rate)

    def rel_cs(t, origin):
        # Centiseconds elapsed since the line start, after time scaling.
        return int(round((t - origin) / rate * 100))

    ev = []
    for ln in lines:
        if not ln:
            continue  # an empty line has nothing to show (and ln[0] would raise)
        origin = ln[0]["start"]
        t0, t1 = map_t(origin), map_t(ln[-1]["end"])
        # A \k highlight begins when the previous \k durations have elapsed, so each
        # duration must run until the NEXT word starts (the last one until the line
        # ends). Using only the spoken length would drop the pauses between words
        # and let the highlight run ahead of the audio.
        marks = [rel_cs(w["start"], origin) for w in ln]
        marks.append(rel_cs(ln[-1]["end"], origin))
        parts = []
        for w, begin_cs, end_cs in zip(ln, marks, marks[1:]):
            dur_cs = max(1, end_cs - begin_cs)  # keep every \k at least 1 cs long
            parts.append(rf"{{\k{dur_cs}}}{w['text']} ")
        karaoke = "".join(parts).rstrip()
        # bouncy pop: overshoot to 120%, settle to 100% in 250 ms
        ov = r"{\an2\pos(960,%d)\fscx120\fscy120\t(0,250,\fscx100\fscy100)}" % y
        ev.append(f"Dialogue: 0,{_fmt_time(t0)},{_fmt_time(t1)},Beast,,0,0,0,,{ov}{karaoke}")
    Path(out_path).write_text(ASS_HEADER + "\n".join(ev), encoding="utf-8")
    return str(out_path)

# ---------- 1) Transcribe to word timestamps ----------
video_path = Path(INPUT_VIDEO).expanduser().resolve()
# Explicit check rather than `assert`, which is skipped when Python runs with -O.
if not video_path.exists():
    raise FileNotFoundError(f"Video not found: {video_path}")

print("Transcribing with faster-whisper… (this downloads the model on first run)")
# int8 is used as the default here; try 'auto' if you prefer:
COMPUTE_TYPE = "int8"

model = WhisperModel(MODEL_NAME, compute_type=COMPUTE_TYPE)
segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

# Flatten every segment's words into one list of {'start','end','text'} dicts,
# dropping tokens that are empty once surrounding whitespace is stripped.
words = []
for seg in segments:
    for w in seg.words or ():
        tok = (w.word or "").strip()
        if tok:
            words.append({"start": float(w.start), "end": float(w.end), "text": tok})

print(f"Got {len(words)} words with timestamps.")

# ---------- 2) Group words into timed lines (tune these) ----------
def group_words(words, max_words_per_line=6, max_gap_s=0.6):
    """
    Split a flat list of timed words into caption lines.

    A new line starts when the pause since the previous word exceeds
    max_gap_s, or when the current line already holds max_words_per_line words.
    Returns a list of lines, each a list of the original word dicts.
    """
    grouped = []
    bucket = []
    prev_end = None
    for word in words:
        paused = prev_end is not None and (word["start"] - prev_end) > max_gap_s
        full = bool(bucket) and len(bucket) >= max_words_per_line
        if paused or full:
            grouped.append(bucket)
            bucket = []
        bucket.append(word)
        prev_end = word["end"]
    if bucket:
        grouped.append(bucket)
    return grouped

lines = group_words(words, max_words_per_line=6, max_gap_s=0.6)
print(f"Built {len(lines)} caption lines.")

# ---------- 3) Write animated ASS (MrBeast-style) ----------
# Same folder and base name as the video, with "_auto.ass" instead of the extension.
ass_path = video_path.with_suffix("")  # strip .mp4
ass_path = Path(str(ass_path) + "_auto.ass")
ass_file = write_animated_ass(lines, out_path=ass_path, y=900, offset=0.0, rate=1.0)
print("Wrote ASS:", ass_file)

# ---------- 4) Burn captions with FFmpeg ----------
if not shutil.which("ffmpeg"):
    raise SystemExit("FFmpeg not found on PATH. Install it (brew/winget/apt) and rerun.")

out_video = video_path.with_suffix("")
out_video = Path(str(out_video) + "_captions.mp4")


def _ffmpeg_filter_escape(value: str) -> str:
    """Backslash-escape a filter option value for use inside a -vf filtergraph string."""
    # Level 1 (option value): backslash, quote and the ':' option separator.
    for ch in "\\':":
        value = value.replace(ch, "\\" + ch)
    # Level 2 (filtergraph): backslash, quote and the graph separators.
    for ch in "\\'[],;":
        value = value.replace(ch, "\\" + ch)
    return value


# Choose fast encoder for your OS
vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"
# The command is passed as a list (no shell), so only FFmpeg's own two escaping
# levels apply. A backslash inside '...' is literal to FFmpeg, so the previous
# "\'" replacement could not protect a quote in the path.
ass_for_filter = _ffmpeg_filter_escape(Path(ass_file).as_posix())
vf_arg = f"ass={ass_for_filter}"

# NOTE(review): -preset/-crf are libx264 options; presumably they have no effect
# with h264_videotoolbox — confirm and pick encoder-specific quality flags if needed.
cmd = [
    "ffmpeg", "-y",
    "-i", str(video_path),
    "-vf", vf_arg,
    "-c:v", vcodec, "-preset", "veryfast", "-crf", "18",
    "-c:a", "copy",
    str(out_video)
]
print("Running FFmpeg…")
subprocess.run(cmd, check=True)
print(f"Done. Saved: {out_video}")


Transcribing with faster-whisper… (this downloads the model on first run)
Got 9 words with timestamps.
Built 2 caption lines.
Wrote ASS: /Users/marcus/Downloads/My Video_auto.ass
Running FFmpeg…


ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/8.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

Done. Saved: /Users/marcus/Downloads/My Video_captions.mp4


[out#0/mp4 @ 0x1356053e0] video:2220KiB audio:71KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.142595%
frame=   75 fps=0.0 q=-0.0 Lsize=    2295KiB time=00:00:02.96 bitrate=6350.9kbits/s speed=4.49x elapsed=0:00:00.65    


In [6]:
# === CONFIG ==============================================================
from pathlib import Path
INPUT_VIDEO = Path.home() / "Downloads" / "My Video.mp4"
MODEL_NAME  = "small.en"       # tiny/base/small/medium/large-v3; *.en is faster for English
FONT_NAME   = "Komika Axis"    # must be installed; code will check and print findings
FONT_SIZE   = 168              # big (~2×)
CENTER_X, CENTER_Y = 960, 540  # 1920x1080 center; change for other resolutions
UPPERCASE   = True             # ALL CAPS
MIN_WORD_SEC = 0.38            # minimum on-screen time per word for readability
# ========================================================================

import subprocess, shutil, platform, re, os, sys
from datetime import timedelta
from faster_whisper import WhisperModel

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    """
    Build a WhisperModel with the first compute type the backend accepts.

    Candidates are tried in a per-OS preference order. A ValueError from the
    constructor is printed and the next candidate is tried; if every candidate
    is rejected, the last ValueError is re-raised.
    """
    if _pf.system() == "Darwin":
        compute_types = ["float16", "int8", "float32"]
    else:
        compute_types = ["int8_float16", "int8", "float16", "float32"]

    failure = None
    for compute_type in compute_types:
        try:
            print(f"[info] trying compute_type={compute_type} …")
            return WhisperModel(model_name, compute_type=compute_type, device="auto")
        except ValueError as err:
            print(f"[skip] {err}")
            failure = err
    raise failure

# ---------- font discovery (print checks + return fontsdir if found) ----------
def find_komika_axis_dir():
    """
    Look for an installed Komika Axis font and return the directory that holds it.

    Search order:
      1. the per-OS font folders, scanned recursively for .ttf/.otf files whose
         name contains "komika axis" (spaces, '_' and '-' are ignored, so
         "KomikaAxis.ttf" and "komika_axis.otf" match too);
      2. the output of `fc-list`, when that command can be run.
    Findings are printed. Returns a Path for use as fontsdir, or None when the
    font could not be confirmed.
    """
    name_q = "komika axis"
    name_key = name_q.replace(" ", "")

    def _name_matches(file_name):
        # Compare with separators removed so common file-name spellings are found.
        squashed = "".join(ch for ch in file_name.lower() if ch not in " _-")
        return name_key in squashed

    sysname = platform.system()
    if sysname == "Darwin":
        cand_dirs = [
            Path.home() / "Library/Fonts",
            Path("/Library/Fonts"),
            Path("/System/Library/Fonts"),
        ]
    elif sysname == "Windows":
        cand_dirs = [Path("C:/Windows/Fonts")]
    else:  # Linux
        cand_dirs = [
            Path.home() / ".local/share/fonts",
            Path("/usr/local/share/fonts"),
            Path("/usr/share/fonts"),
        ]
    found_dir = None
    found_files = []
    for d in cand_dirs:
        if not d.exists():
            continue
        for p in d.rglob("*"):
            if p.suffix.lower() in {".ttf", ".otf"} and _name_matches(p.name):
                found_files.append(p)
                found_dir = p.parent  # the directory of the last match wins
    if found_files:
        print("[font] Found Komika Axis files:")
        for p in found_files:
            print("       -", p)
        print("[font] Using fontsdir:", found_dir)
        return found_dir
    # try fontconfig if available
    try:
        out = subprocess.run(["fc-list"], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                             text=True, errors="replace")
    except OSError as exc:
        # fc-list is optional: being unable to launch it just means this check is skipped.
        print("[font] fc-list not available:", exc)
    else:
        matches = [ln for ln in out.stdout.splitlines() if name_q in ln.lower()]
        if matches:
            print("[font] fc-list detected Komika Axis:")
            for ln in matches[:6]:
                print("       -", ln)
            # best effort: get dir of first path before colon
            first_path = matches[0].partition(":")[0]
            fdir = Path(first_path).parent
            print("[font] Using fontsdir:", fdir)
            return fdir
    print("[font] WARNING: Could not confirm Komika Axis via file scan or fc-list.")
    print("        Ensure the font is installed system-wide. Using font name:", FONT_NAME)
    return None

# ---------- ASS helpers (centered, big, no bounce) ----------
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,12,2,5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def build_center_word_events(words, center_xy=(960,540), uppercase=True, min_dur=0.38):
    """
    One Dialogue per word; static, centered (no bounce).

    words:     iterable of {'start','end','text'} dicts in playback order
    center_xy: (x, y) anchor passed to the position override
    uppercase: upper-case every token when True
    min_dur:   short words are padded to this many seconds, but the padding never
               runs past the next word's start (all words share one position, so
               overlapping events would be drawn on top of each other)

    Returns the list of "Dialogue: ..." lines; tokens that are empty after
    stripping leading/trailing non-word characters are skipped.
    """
    cx, cy = center_xy
    words = list(words)  # indexable, so the following word can be inspected
    events = []
    for i, w in enumerate(words):
        t0, spoken_end = float(w["start"]), float(w["end"])
        t1 = max(spoken_end, t0 + min_dur)
        if i + 1 < len(words):
            # Only the padding is trimmed; the spoken interval itself is kept.
            next_start = float(words[i + 1]["start"])
            t1 = min(t1, max(spoken_end, next_start))
        token = (w["text"] or "").strip()
        token = re.sub(r"^\W+|\W+$", "", token)
        if not token:
            continue
        if uppercase:
            token = token.upper()
        # Center (an=5) with fixed position at (cx,cy); NO animation
        ov = r"{\an5\pos(" + f"{cx},{cy}" + r")}"
        events.append(f"Dialogue: 0,{_fmt_time(t0)},{_fmt_time(t1)},Beast,,0,0,0,,{ov}{token}")
    return events

# ---------- 1) Transcribe (word timestamps) ----------
video_path = Path(INPUT_VIDEO).expanduser().resolve()
# Explicit check rather than `assert`, which is skipped when Python runs with -O.
if not video_path.exists():
    raise FileNotFoundError(f"Video not found: {video_path}")
print("[info] video:", video_path)

print("[info] loading Whisper model …")
model = load_whisper_auto(MODEL_NAME)

print("[info] transcribing (word timestamps) …")
segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

# Flatten every segment's words into one list of {'start','end','text'} dicts,
# dropping tokens that are empty once surrounding whitespace is stripped.
words = []
for seg in segments:
    for w in seg.words or ():
        tok = (w.word or "").strip()
        if tok:
            words.append({"start": float(w.start), "end": float(w.end), "text": tok})

print(f"[info] words captured: {len(words)}")

# ---------- 2) Build ASS (one word per card, centered & big, white/black) ----------
ass_header = ASS_HEADER_TMPL.format(font=FONT_NAME, size=FONT_SIZE)
ass_events = build_center_word_events(words, center_xy=(CENTER_X, CENTER_Y),
                                      uppercase=UPPERCASE, min_dur=MIN_WORD_SEC)
ass_text = ass_header + "\n".join(ass_events)

# Sanitize the output file name; the directory part is escaped for FFmpeg below.
safe_stem = re.sub(r'[^A-Za-z0-9_.-]+', '_', video_path.stem)
ass_path = video_path.with_name(f"{safe_stem}_auto.ass")
ass_path.write_text(ass_text, encoding="utf-8")
print("[info] wrote ASS:", ass_path)
print("[check] ASS style uses Fontname =", FONT_NAME)

# ---------- 3) Burn captions with FFmpeg --------------------------------------
if not shutil.which("ffmpeg"):
    raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")


def _ffmpeg_filter_escape(value: str) -> str:
    """Backslash-escape a filter option value for use inside a -vf filtergraph string."""
    # Level 1 (option value): backslash, quote and the ':' option separator.
    for ch in "\\':":
        value = value.replace(ch, "\\" + ch)
    # Level 2 (filtergraph): backslash, quote and the graph separators.
    for ch in "\\'[],;":
        value = value.replace(ch, "\\" + ch)
    return value


# Try to locate Komika Axis and pass fontsdir to libass for certainty
fontsdir = find_komika_axis_dir()
fontsdir_arg = f":fontsdir={_ffmpeg_filter_escape(fontsdir.as_posix())}" if fontsdir else ""

out_video = video_path.with_name(f"{safe_stem}_mrbeast_static.mp4")
vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"
# Only the stem was sanitized above; the folders in both paths may still contain
# ':', ',', quotes or backslashes, which are filtergraph syntax and must be escaped.
vf_arg = f"ass={_ffmpeg_filter_escape(ass_path.as_posix())}{fontsdir_arg}"

# NOTE(review): -preset/-crf are libx264 options; presumably they have no effect
# with h264_videotoolbox — confirm and pick encoder-specific quality flags if needed.
cmd = [
    "ffmpeg", "-y",
    "-i", str(video_path),
    "-vf", vf_arg,
    "-c:v", vcodec, "-preset", "veryfast", "-crf", "18",
    "-c:a", "copy",
    str(out_video)
]

print("[info] running FFmpeg with filter:", vf_arg)
subprocess.run(cmd, check=True)
print("[done] saved:", out_video)


[info] video: /Users/marcus/Downloads/My Video.mp4
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 9
[info] wrote ASS: /Users/marcus/Downloads/My_Video_auto.ass
[check] ASS style uses Fontname = Komika Axis
        Ensure the font is installed system-wide. Using font name: Komika Axis
[info] running FFmpeg with filter: ass=/Users/marcus/Downloads/My_Video_auto.ass


ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/8.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

[done] saved: /Users/marcus/Downloads/My_Video_mrbeast_static.mp4


[out#0/mp4 @ 0x146f0c780] video:2223KiB audio:71KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.142404%
frame=   75 fps=0.0 q=-0.0 Lsize=    2298KiB time=00:00:02.96 bitrate=6359.5kbits/s speed=3.64x elapsed=0:00:00.81    


In [8]:
# === CONFIG ==============================================================
from pathlib import Path
INPUT_VIDEO = Path.home() / "Downloads" / "My Video.mp4"
MODEL_NAME  = "small.en"          # tiny/base/small/medium/large-v3; *.en is faster for English
FONT_SIZE   = 168                 # big (~2×)
CENTER_X, CENTER_Y = 960, 540     # 1920x1080 center; change for other resolutions
UPPERCASE   = True                # ALL CAPS
MIN_WORD_SEC = 0.38               # minimum on-screen time per word for readability

# Where YOU will place the downloaded font (no system install needed):
CUSTOM_FONT_DIR = Path.home() / "Documents" / "mrbeast_caps" / "fonts"
# ========================================================================

import subprocess, shutil, platform, re, os, sys
from datetime import timedelta
from faster_whisper import WhisperModel

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    """
    Return a WhisperModel built with the first compute type that is accepted.

    The candidate list depends on the OS. Each ValueError raised by the
    constructor is printed and skipped; when no candidate works, the most
    recent ValueError is raised again.
    """
    on_mac = _pf.system() == "Darwin"
    options = (["float16", "int8", "float32"] if on_mac
               else ["int8_float16", "int8", "float16", "float32"])

    most_recent_error = None
    for option in options:
        try:
            print(f"[info] trying compute_type={option} …")
            return WhisperModel(model_name, compute_type=option, device="auto")
        except ValueError as exc:
            print(f"[skip] {exc}")
            most_recent_error = exc
    raise most_recent_error

# ---------- custom font discovery in CUSTOM_FONT_DIR ----------
def pick_custom_font(font_dir: Path):
    """
    Returns (font_file_path, font_family_name).

    - Creates font_dir if it is missing.
    - Picks a .ttf/.otf from font_dir: extensions are matched case-insensitively
      (so "FONT.TTF" is found), .ttf is preferred over .otf, and ties are broken
      by file name so the choice is the same on every run.
    - Tries to read the true family name via fontTools (if installed).
    - Falls back to the file name stem if that is not possible.

    Raises SystemExit (after printing setup instructions) when no font is present.
    """
    font_dir.mkdir(parents=True, exist_ok=True)
    candidates = sorted(
        (p for p in font_dir.iterdir()
         if p.is_file() and p.suffix.lower() in {".ttf", ".otf"}),
        key=lambda p: (p.suffix.lower() != ".ttf", p.name.lower()),
    )
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download a .ttf or .otf font from dafont.com (or anywhere).")
        print(f"2) Place the file here:\n   {font_dir}\n")
        print("3) Re-run this cell — it will use the font automatically.")
        raise SystemExit("[exit] No font found yet. Put a .ttf/.otf in the folder above.")
    font_file = candidates[0]
    # Try to get the internal family name (best for libass matching)
    family = None
    try:
        from fontTools.ttLib import TTFont  # optional dependency
    except ImportError:
        TTFont = None
    if TTFont is not None:
        try:
            tt = TTFont(font_file)
            try:
                names = {}
                for record in tt["name"].names:
                    text = record.toUnicode()
                    if text:
                        names[record.nameID] = text
            finally:
                tt.close()  # release the font file handle
            family = names.get(1) or names.get(4)  # 1=Family, 4=Full name
        except Exception as exc:
            # Best effort: any read/parse failure falls back to the file name below,
            # but the reason is reported instead of being silently dropped.
            print(f"[warn] fontTools could not read {font_file.name}: {exc}")
    if not family:
        family = font_file.stem
        print("[warn] Could not determine font family automatically; using file name stem.")
        print("       (Optional) pip install fonttools for automatic detection.")
    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS helpers (centered, big, no bounce) ----------
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,12,2,5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def build_center_word_events(words, center_xy=(960,540), uppercase=True, min_dur=0.38):
    """
    One Dialogue per word; static, centered (no bounce).

    words:     iterable of {'start','end','text'} dicts in playback order
    center_xy: (x, y) anchor passed to the position override
    uppercase: upper-case every token when True
    min_dur:   short words are padded to this many seconds, but the padding never
               runs past the next word's start (all words share one position, so
               overlapping events would be drawn on top of each other)

    Returns the list of "Dialogue: ..." lines; tokens that are empty after
    stripping leading/trailing non-word characters are skipped.
    """
    cx, cy = center_xy
    words = list(words)  # indexable, so the following word can be inspected
    events = []
    for i, w in enumerate(words):
        t0, spoken_end = float(w["start"]), float(w["end"])
        t1 = max(spoken_end, t0 + min_dur)
        if i + 1 < len(words):
            # Only the padding is trimmed; the spoken interval itself is kept.
            next_start = float(words[i + 1]["start"])
            t1 = min(t1, max(spoken_end, next_start))
        token = (w["text"] or "").strip()
        token = re.sub(r"^\W+|\W+$", "", token)
        if not token:
            continue
        if uppercase:
            token = token.upper()
        ov = r"{\an5\pos(" + f"{cx},{cy}" + r")}"  # static, centered
        events.append(f"Dialogue: 0,{_fmt_time(t0)},{_fmt_time(t1)},Beast,,0,0,0,,{ov}{token}")
    return events

# ---------- 1) Transcribe (word timestamps) ----------
video_path = Path(INPUT_VIDEO).expanduser().resolve()
# Explicit check rather than `assert`, which is skipped when Python runs with -O.
if not video_path.exists():
    raise FileNotFoundError(f"Video not found: {video_path}")
print("[info] video:", video_path)

print("[info] loading Whisper model …")
model = load_whisper_auto(MODEL_NAME)

print("[info] transcribing (word timestamps) …")
segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

# Flatten every segment's words into one list of {'start','end','text'} dicts,
# dropping tokens that are empty once surrounding whitespace is stripped.
words = []
for seg in segments:
    for w in seg.words or ():
        tok = (w.word or "").strip()
        if tok:
            words.append({"start": float(w.start), "end": float(w.end), "text": tok})

print(f"[info] words captured: {len(words)}")

# ---------- 2) Choose your downloaded font & build ASS ------------------------
font_file, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)

ass_header = ASS_HEADER_TMPL.format(font=FONT_NAME, size=FONT_SIZE)
ass_events = build_center_word_events(words, center_xy=(CENTER_X, CENTER_Y),
                                      uppercase=UPPERCASE, min_dur=MIN_WORD_SEC)
ass_text = ass_header + "\n".join(ass_events)

# Sanitize the output file name; the directory part is escaped for FFmpeg below.
safe_stem = re.sub(r'[^A-Za-z0-9_.-]+', '_', video_path.stem)
ass_path = video_path.with_name(f"{safe_stem}_auto.ass")
ass_path.write_text(ass_text, encoding="utf-8")
print("[info] wrote ASS:", ass_path)
print("[check] ASS style uses Fontname =", FONT_NAME)

# ---------- 3) Burn captions with FFmpeg --------------------------------------
if not shutil.which("ffmpeg"):
    raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")


def _ffmpeg_filter_escape(value: str) -> str:
    """Backslash-escape a filter option value for use inside a -vf filtergraph string."""
    # Level 1 (option value): backslash, quote and the ':' option separator.
    for ch in "\\':":
        value = value.replace(ch, "\\" + ch)
    # Level 2 (filtergraph): backslash, quote and the graph separators.
    for ch in "\\'[],;":
        value = value.replace(ch, "\\" + ch)
    return value


# Tell libass to look right in your font folder (no system install needed)
fontsdir_arg = f":fontsdir={_ffmpeg_filter_escape(CUSTOM_FONT_DIR.as_posix())}"

out_video = video_path.with_name(f"{safe_stem}_mrbeast_static.mp4")
vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"
# Only the stem was sanitized above; the folders in both paths may still contain
# ':', ',', quotes or backslashes, which are filtergraph syntax and must be escaped.
vf_arg = f"ass={_ffmpeg_filter_escape(ass_path.as_posix())}{fontsdir_arg}"

# NOTE(review): -preset/-crf are libx264 options; presumably they have no effect
# with h264_videotoolbox — confirm and pick encoder-specific quality flags if needed.
cmd = [
    "ffmpeg", "-y",
    "-i", str(video_path),
    "-vf", vf_arg,
    "-c:v", vcodec, "-preset", "veryfast", "-crf", "18",
    "-c:a", "copy",
    str(out_video)
]

print("[info] running FFmpeg with filter:", vf_arg)
subprocess.run(cmd, check=True)
print("[done] saved:", out_video)


[info] video: /Users/marcus/Downloads/My Video.mp4
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 9
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] wrote ASS: /Users/marcus/Downloads/My_Video_auto.ass
[check] ASS style uses Fontname = Komika Axis
[info] running FFmpeg with filter: ass=/Users/marcus/Downloads/My_Video_auto.ass:fontsdir=/Users/marcus/Documents/mrbeast_caps/fonts


ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/8.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

[done] saved: /Users/marcus/Downloads/My_Video_mrbeast_static.mp4


[out#0/mp4 @ 0x15a00f3b0] video:2187KiB audio:71KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.144722%
frame=   75 fps=0.0 q=-0.0 Lsize=    2261KiB time=00:00:02.96 bitrate=6257.7kbits/s speed=4.42x elapsed=0:00:00.66    


In [11]:
# === CONFIG ==============================================================
from pathlib import Path
INPUT_VIDEO    = Path.home() / "Downloads" / "My Video.mp4"
MODEL_NAME     = "small.en"      # tiny/base/small/medium/large-v3; *.en is faster for English
FONT_SIZE      = 264             # <— hyperparameter (make bigger/smaller)
CENTER_X, CENTER_Y = 960, 540    # 1920x1080 center; change for other resolutions
UPPERCASE      = True            # ALL CAPS
MIN_WORD_SEC   = 0.38            # min readable time per word
CUT_AHEAD_SEC  = 0.05            # remove the old word this many seconds before the next shows
CUSTOM_FONT_DIR = Path.home() / "Documents" / "mrbeast_caps" / "fonts"  # put your .ttf/.otf here
# ========================================================================

import subprocess, shutil, platform, re, sys
from datetime import timedelta
from faster_whisper import WhisperModel

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    """
    Create a WhisperModel, falling back through compute types until one is accepted.

    The order of compute types is chosen per OS. A ValueError from the
    constructor is printed and the next type is tried; if all of them fail,
    the last ValueError seen is raised.
    """
    mac_order = ["float16", "int8", "float32"]
    other_order = ["int8_float16", "int8", "float16", "float32"]
    order = mac_order if _pf.system() == "Darwin" else other_order

    last_error = None
    for kind in order:
        try:
            print(f"[info] trying compute_type={kind} …")
            return WhisperModel(model_name, compute_type=kind, device="auto")
        except ValueError as problem:
            print(f"[skip] {problem}")
            last_error = problem
    raise last_error

# ---------- custom font discovery in CUSTOM_FONT_DIR ----------
def pick_custom_font(font_dir: Path):
    """
    Returns (font_file_path, font_family_name).

    - Creates font_dir if it is missing.
    - Picks a .ttf/.otf from font_dir: extensions are matched case-insensitively
      (so "FONT.TTF" is found), .ttf is preferred over .otf, and ties are broken
      by file name so the choice is the same on every run.
    - Tries to read the true family name via fontTools (if installed).
    - Falls back to the file name stem if that is not possible.

    Raises SystemExit (after printing setup instructions) when no font is present.
    """
    font_dir.mkdir(parents=True, exist_ok=True)
    candidates = sorted(
        (p for p in font_dir.iterdir()
         if p.is_file() and p.suffix.lower() in {".ttf", ".otf"}),
        key=lambda p: (p.suffix.lower() != ".ttf", p.name.lower()),
    )
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download a .ttf or .otf font (e.g., from dafont.com).")
        print(f"2) Place the file here:\n   {font_dir}\n")
        print("3) Re-run this cell — it will use the font automatically.")
        raise SystemExit("[exit] No font found yet. Put a .ttf/.otf in the folder above.")
    font_file = candidates[0]
    # Try to get the internal family name (best for libass matching)
    family = None
    try:
        from fontTools.ttLib import TTFont  # optional dependency (pip install fonttools)
    except ImportError:
        TTFont = None
    if TTFont is not None:
        try:
            tt = TTFont(font_file)
            try:
                names = {}
                for record in tt["name"].names:
                    text = record.toUnicode()
                    if text:
                        names[record.nameID] = text
            finally:
                tt.close()  # release the font file handle
            family = names.get(1) or names.get(4)  # 1=Family, 4=Full name
        except Exception as exc:
            # Best effort: any read/parse failure falls back to the file name below,
            # but the reason is reported instead of being silently dropped.
            print(f"[warn] fontTools could not read {font_file.name}: {exc}")
    if not family:
        family = font_file.stem
        print("[warn] Could not determine font family automatically; using file name stem.")
        print("       (Optional) pip install fonttools for automatic detection.")
    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS helpers (centered, big, one-at-a-time) ----------
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,12,2,5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def build_center_word_events_one_at_a_time(words, center_xy=(960,540),
                                           uppercase=True, min_dur=0.38, cut_ahead=0.05):
    """
    One Dialogue per word; static, centered.
    Each word ends slightly BEFORE the next begins (cut_ahead), guaranteeing only one
    subtitle is visible at a time.

    words:     iterable of {'start','end','text'} dicts in playback order
    center_xy: (x, y) anchor passed to the position override
    uppercase: upper-case every token when True
    min_dur:   minimum display time in seconds, applied before trimming
    cut_ahead: gap in seconds left before the next word starts

    Returns the list of "Dialogue: ..." lines; tokens that are empty after
    stripping leading/trailing non-word characters are skipped.
    """
    cx, cy = center_xy
    words = list(words)  # indexable, so the following word can be inspected
    events = []
    n = len(words)
    for i, w in enumerate(words):
        t0 = float(w["start"])
        # base end: at least spoken end or min_dur, whichever is longer
        t1 = max(float(w["end"]), t0 + min_dur)
        next_start = None
        # trim to just before the next word starts
        if i + 1 < n:
            next_start = float(words[i+1]["start"])
            t1 = min(t1, next_start - cut_ahead)
        # ensure positive duration
        if t1 <= t0:
            t1 = t0 + 0.05
            # The fallback must not reach into the next word either, otherwise two
            # words would be visible at once; give up the cut-ahead gap instead.
            if next_start is not None and next_start > t0:
                t1 = min(t1, next_start)
        token = (w["text"] or "").strip()
        token = re.sub(r"^\W+|\W+$", "", token)
        if not token:
            continue
        if uppercase:
            token = token.upper()
        ov = r"{\an5\pos(" + f"{cx},{cy}" + r")}"  # static, centered
        events.append(f"Dialogue: 0,{_fmt_time(t0)},{_fmt_time(t1)},Beast,,0,0,0,,{ov}{token}")
    return events

# ---------- 1) Transcribe (word timestamps) ----------
video_path = Path(INPUT_VIDEO).expanduser().resolve()
# Explicit check rather than `assert`, which is skipped when Python runs with -O.
if not video_path.exists():
    raise FileNotFoundError(f"Video not found: {video_path}")
print("[info] video:", video_path)

print("[info] loading Whisper model …")
model = load_whisper_auto(MODEL_NAME)

print("[info] transcribing (word timestamps) …")
segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

# Flatten every segment's words into one list of {'start','end','text'} dicts,
# dropping tokens that are empty once surrounding whitespace is stripped.
words = []
for seg in segments:
    for w in seg.words or ():
        tok = (w.word or "").strip()
        if tok:
            words.append({"start": float(w.start), "end": float(w.end), "text": tok})

print(f"[info] words captured: {len(words)}")

# ---------- 2) Choose your downloaded font & build ASS ------------------------
font_file, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)

ass_header = ASS_HEADER_TMPL.format(font=FONT_NAME, size=FONT_SIZE)
ass_events = build_center_word_events_one_at_a_time(
    words,
    center_xy=(CENTER_X, CENTER_Y),
    uppercase=UPPERCASE,
    min_dur=MIN_WORD_SEC,
    cut_ahead=CUT_AHEAD_SEC
)
ass_text = ass_header + "\n".join(ass_events)

# write ASS next to output (Downloads root)
safe_stem = re.sub(r'[^A-Za-z0-9_.-]+', '_', video_path.stem)
downloads = Path.home() / "Downloads"
ass_path = downloads / f"{safe_stem}_auto.ass"
ass_path.write_text(ass_text, encoding="utf-8")
print("[info] wrote ASS:", ass_path)
print("[check] ASS style uses Fontname =", FONT_NAME, "| FONT_SIZE =", FONT_SIZE)

# ---------- 3) Burn captions with FFmpeg (output to Downloads) ----------------
if not shutil.which("ffmpeg"):
    raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")


def _ffmpeg_filter_escape(value: str) -> str:
    """Backslash-escape a filter option value for use inside a -vf filtergraph string."""
    # Level 1 (option value): backslash, quote and the ':' option separator.
    for ch in "\\':":
        value = value.replace(ch, "\\" + ch)
    # Level 2 (filtergraph): backslash, quote and the graph separators.
    for ch in "\\'[],;":
        value = value.replace(ch, "\\" + ch)
    return value


# Point libass to your custom font folder (no system install needed)
fontsdir_arg = f":fontsdir={_ffmpeg_filter_escape(CUSTOM_FONT_DIR.as_posix())}"

out_video = downloads / f"{safe_stem}_mrbeast_static.mp4"
vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"
# Only the stem was sanitized above; the folders in both paths may still contain
# ':', ',', quotes or backslashes, which are filtergraph syntax and must be escaped.
vf_arg = f"ass={_ffmpeg_filter_escape(ass_path.as_posix())}{fontsdir_arg}"

# NOTE(review): -preset/-crf are libx264 options; presumably they have no effect
# with h264_videotoolbox — confirm and pick encoder-specific quality flags if needed.
cmd = [
    "ffmpeg", "-y",
    "-i", str(video_path),
    "-vf", vf_arg,
    "-c:v", vcodec, "-preset", "veryfast", "-crf", "18",
    "-c:a", "copy",
    str(out_video)
]

print("[info] running FFmpeg with filter:", vf_arg)
subprocess.run(cmd, check=True)
print("[done] saved:", out_video)


[info] video: /Users/marcus/Downloads/My Video.mp4
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 9
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] wrote ASS: /Users/marcus/Downloads/My_Video_auto.ass
[check] ASS style uses Fontname = Komika Axis | FONT_SIZE = 264
[info] running FFmpeg with filter: ass=/Users/marcus/Downloads/My_Video_auto.ass:fontsdir=/Users/marcus/Documents/mrbeast_caps/fonts


ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/8.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

[done] saved: /Users/marcus/Downloads/My_Video_mrbeast_static.mp4


[out#0/mp4 @ 0x123e04230] video:2235KiB audio:71KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.141662%
frame=   75 fps=0.0 q=-0.0 Lsize=    2310KiB time=00:00:02.96 bitrate=6392.7kbits/s speed=4.36x elapsed=0:00:00.67    


In [27]:
# === CONFIG ==============================================================
from pathlib import Path

# Source clip and Whisper model (tiny/base/small/medium/large-v3; *.en is faster for English).
INPUT_VIDEO = Path.home().joinpath("Downloads", "My Video-1.mp4")
MODEL_NAME = "small.en"

# Caption look: font size (hyperparameter) and anchor point, centered for 1920x1080.
FONT_SIZE = 224
CENTER_X = 960
CENTER_Y = 540
UPPERCASE = True  # render captions in ALL CAPS

# Per-word timing, in seconds.
MIN_WORD_SEC = 0.990  # minimum readable duration per word
CUT_AHEAD_SEC = 0.05  # end each word slightly before the next starts (one-at-a-time)

# Folder that holds the .ttf/.otf caption font.
CUSTOM_FONT_DIR = Path.home().joinpath("Documents", "mrbeast_caps", "fonts")

# Animation hyperparameters
# ANIM choices: "none", "fade", "pop", "zoom", "bounce", "slide_up", "slide_down", "slide_left", "slide_right", "rotate"
ANIM = "bounce"
ANIM_IN_MS = 9500  # main appear time (ms) for transform/move
ANIM_OUT_MS = 0    # fade-out tail (ms); used by 'fade', ignored by the others
# ========================================================================

import subprocess, shutil, platform, re, sys
from datetime import timedelta
from faster_whisper import WhisperModel

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    """
    Build a WhisperModel using the first compute type that is accepted.

    Compute types are tried in a platform-dependent order. A ValueError from
    the constructor is printed as a "[skip]" line and the next type is tried.
    If every candidate fails, the last ValueError is re-raised.
    """
    if _pf.system() == "Darwin":
        compute_types = ["float16", "int8", "float32"]
    else:
        compute_types = ["int8_float16", "int8", "float16", "float32"]

    failure = None
    for compute_type in compute_types:
        try:
            print(f"[info] trying compute_type={compute_type} …")
            return WhisperModel(model_name, compute_type=compute_type, device="auto")
        except ValueError as exc:
            print(f"[skip] {exc}")
            failure = exc
    raise failure

# ---------- custom font discovery in CUSTOM_FONT_DIR ----------
def pick_custom_font(font_dir: Path):
    """
    Choose a caption font from `font_dir` and report its family name.

    Returns (font_file_path, font_family_name).
    - Creates `font_dir` if it is missing.
    - Picks the alphabetically first .ttf, or the alphabetically first .otf
      when there is no .ttf, so repeated runs select the same file.
    - Tries to read the internal family name via fontTools (optional);
      falls back to the file name stem when that is not possible.

    Raises SystemExit (after printing setup steps) when the folder holds no font.
    """
    font_dir.mkdir(parents=True, exist_ok=True)
    # glob() yields entries in filesystem order; sort so the pick is reproducible.
    candidates = sorted(font_dir.glob("*.ttf")) + sorted(font_dir.glob("*.otf"))
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download any .ttf or .otf font.")
        print(f"2) Place it here: {font_dir}")
        print("3) Re-run this cell.")
        raise SystemExit("[exit] No font found yet.")
    font_file = candidates[0]
    family = None
    try:
        from fontTools.ttLib import TTFont  # optional (pip install fonttools)
        tt = TTFont(font_file)
        names = {n.nameID: n.toUnicode() for n in tt["name"].names if n.toUnicode()}
        family = names.get(1) or names.get(4)  # 1=Family, 4=Full name
    except Exception:
        # Best effort: a missing fontTools or an unreadable font falls through
        # to the file-name fallback (with a warning) below.
        pass
    if not family:
        family = font_file.stem
        print("[warn] Could not determine font family automatically; using file name stem.")
    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS helpers (centered, big, one-at-a-time, with animation) ----------
# Template for everything in the .ass file that precedes the Dialogue lines.
# {font} and {size} are str.format placeholders, so any other literal brace
# added to this text would have to be doubled.
# The "Style: Beast" row follows the column order of the Format row above it:
# Bold=-1, BorderStyle=1, Outline=12, Shadow=2, Alignment=5, all margins 60.
# The text ends with a newline, so event lines can be appended directly.
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,12,2,5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def anim_tag(cx: int, cy: int, name: str, in_ms: int, out_ms: int) -> str:
    """
    Build the ASS override block (braces included) for one caption.

    Every result starts with \\an5. Static and scaling animations pin the
    text at (cx, cy) with \\pos; moving animations end at (cx, cy) via \\move.
    `name` is case-insensitive; None, "" and unknown names give the plain
    static tag. `in_ms` is the transform/move duration, `out_ms` is only
    used by "fade".
    """
    key = (name or "none").lower()
    # Opening brace + center alignment + fixed position; callers append the rest.
    pinned = rf"{{\an5\pos({cx},{cy})"

    def move_from(x0, y0):
        # Opening brace + center alignment + travel from (x0, y0) to (cx, cy).
        return rf"{{\an5\move({x0},{y0},{cx},{cy},0,{in_ms})"

    def scale_in(start_pct):
        return pinned + rf"\fscx{start_pct}\fscy{start_pct}\t(0,{in_ms},\fscx100\fscy100)}}"

    if key == "fade":
        return pinned + rf"\fad({in_ms},{out_ms})}}"
    if key == "pop":
        return scale_in(80)
    if key == "zoom":
        return scale_in(60)
    if key == "rotate":
        return pinned + rf"\frz-12\t(0,{in_ms},\frz0)}}"
    if key == "bounce":
        # move from slightly above + overshoot then settle
        return (move_from(cx, cy - 40)
                + rf"\fscx120\fscy120\t(0,120,\fscx95\fscy95)\t(120,{in_ms},\fscx100\fscy100)}}")
    if key == "slide_up":
        return move_from(cx, cy + 60) + "}"
    if key == "slide_down":
        return move_from(cx, cy - 60) + "}"
    if key == "slide_left":
        return move_from(cx - 140, cy) + "}"
    if key == "slide_right":
        return move_from(cx + 140, cy) + "}"
    # "none" and anything unrecognized: static centered text
    return pinned + "}"

def build_center_word_events_one_at_a_time(words, center_xy=(960,540),
                                           uppercase=True, min_dur=0.38, cut_ahead=0.05,
                                           anim="none", in_ms=220, out_ms=100):
    """
    Turn word dicts ({'start', 'end', 'text'}) into ASS Dialogue lines, one per word.

    Each word is shown at `center_xy` for at least `min_dur` seconds, but is
    cut `cut_ahead` seconds before the next word starts so only one word is
    visible at a time. Words that are empty after trimming surrounding
    non-word characters produce no event.
    """
    cx, cy = center_xy
    last_index = len(words) - 1
    dialogue = []
    for idx, word in enumerate(words):
        start = float(word["start"])
        end = max(float(word["end"]), start + min_dur)
        if idx < last_index:
            upcoming = float(words[idx + 1]["start"])
            end = min(end, upcoming - cut_ahead)
        if end <= start:
            # The trim above can collapse the window; keep a 50 ms minimum.
            end = start + 0.05

        label = re.sub(r"^\W+|\W+$", "", (word["text"] or "").strip())
        if not label:
            continue
        if uppercase:
            label = label.upper()

        tag = anim_tag(cx, cy, anim, in_ms, out_ms)
        dialogue.append(f"Dialogue: 0,{_fmt_time(start)},{_fmt_time(end)},Beast,,0,0,0,,{tag}{label}")
    return dialogue

# ---------- 1) Transcribe (word timestamps) ----------
video_path = Path(INPUT_VIDEO).expanduser().resolve()
# Explicit check instead of `assert`, which is skipped when Python runs with -O.
if not video_path.exists():
    raise FileNotFoundError(f"Video not found: {video_path}")
print("[info] video:", video_path)

print("[info] loading Whisper model …")
model = load_whisper_auto(MODEL_NAME)

print("[info] transcribing (word timestamps) …")
segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

# Flatten the segments into one list of {"start", "end", "text"} dicts,
# dropping words whose text is empty after stripping whitespace.
words = []
for seg in segments:
    for w in seg.words or ():
        tok = (w.word or "").strip()
        if tok:
            words.append({"start": float(w.start), "end": float(w.end), "text": tok})

print(f"[info] words captured: {len(words)}")

# ---------- 2) Choose your downloaded font & build ASS ------------------------
def pick_custom_font(font_dir: Path):
    """
    Choose a caption font from `font_dir` and report its family name.

    Returns (font_file_path, font_family_name). Creates `font_dir` if missing.
    The alphabetically first .ttf wins (or the first .otf when there is no
    .ttf), so repeated runs select the same file. The family name comes from
    the font's name table via fontTools when available; otherwise the file
    name stem is used.

    Raises SystemExit (after printing setup steps) when the folder holds no font.
    """
    font_dir.mkdir(parents=True, exist_ok=True)
    # glob() yields entries in filesystem order; sort so the pick is reproducible.
    candidates = sorted(font_dir.glob("*.ttf")) + sorted(font_dir.glob("*.otf"))
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download any .ttf/.otf font.")
        print(f"2) Place it here: {font_dir}")
        print("3) Re-run this cell.")
        raise SystemExit("[exit] No font found yet.")
    font_file = candidates[0]
    family = None
    try:
        from fontTools.ttLib import TTFont
        tt = TTFont(font_file)
        names = {n.nameID: n.toUnicode() for n in tt["name"].names if n.toUnicode()}
        family = names.get(1) or names.get(4)  # 1=Family, 4=Full name
    except Exception:
        # Best effort: fontTools missing or font unreadable -> fallback below.
        family = None
    if not family:
        # Also covers a readable font whose name table has neither ID 1 nor 4;
        # without this the literal "None" would end up as the ASS Fontname.
        family = font_file.stem
    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# Pick the caption font; FONT_NAME is the family name written into the ASS style.
font_file, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)

# Fill the {font}/{size} placeholders of the header template.
ASS_HEADER = ASS_HEADER_TMPL.format(font=FONT_NAME, size=FONT_SIZE)
# One Dialogue line per transcribed word, all anchored at (CENTER_X, CENTER_Y).
ass_events = build_center_word_events_one_at_a_time(
    words,
    center_xy=(CENTER_X, CENTER_Y),
    uppercase=UPPERCASE,
    min_dur=MIN_WORD_SEC,
    cut_ahead=CUT_AHEAD_SEC,
    anim=ANIM,
    in_ms=ANIM_IN_MS,
    out_ms=ANIM_OUT_MS
)
# The header template ends with a newline, so the events are appended directly.
ass_text = ASS_HEADER + "\n".join(ass_events)

# write ASS + output into Downloads
# Every run of characters outside [A-Za-z0-9_.-] in the video stem becomes "_"
# (presumably to keep the name safe inside the FFmpeg filter string — confirm).
safe_stem = re.sub(r'[^A-Za-z0-9_.-]+', '_', video_path.stem)
downloads = Path.home() / "Downloads"
ass_path = downloads / f"{safe_stem}_auto.ass"
ass_path.write_text(ass_text, encoding="utf-8")
print("[info] wrote ASS:", ass_path)
print("[check] ASS Fontname =", FONT_NAME, "| FONT_SIZE =", FONT_SIZE, "| ANIM =", ANIM)

# ---------- 3) Burn captions with FFmpeg (output to Downloads) ----------------
if not shutil.which("ffmpeg"):
    raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")

fontsdir_arg = f":fontsdir={CUSTOM_FONT_DIR.as_posix()}"  # point libass to your font folder
out_video = downloads / f"{safe_stem}_mrbeast_{ANIM}.mp4"
vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"
vf_arg = f"ass={ass_path.as_posix()}{fontsdir_arg}"

# -preset/-crf are private options of libx264. h264_videotoolbox does not
# consume them, so they are only passed when libx264 is the encoder.
# NOTE(review): to control VideoToolbox quality, add an option that encoder
# understands (e.g. -b:v) — confirm the desired setting.
quality_args = ["-preset", "veryfast", "-crf", "18"] if vcodec == "libx264" else []

cmd = [
    "ffmpeg", "-y",                # overwrite an existing output file
    "-i", str(video_path),
    "-vf", vf_arg,                 # burn the ASS subtitles into the picture
    "-c:v", vcodec, *quality_args,
    "-c:a", "copy",                # audio stream is copied, not re-encoded
    str(out_video)
]

print("[info] running FFmpeg with filter:", vf_arg)
subprocess.run(cmd, check=True)    # check=True raises CalledProcessError on failure
print("[done] saved:", out_video)


[info] video: /Users/marcus/Downloads/My Video-1.mp4
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 88
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] wrote ASS: /Users/marcus/Downloads/My_Video-1_auto.ass
[check] ASS Fontname = Komika Axis | FONT_SIZE = 224 | ANIM = bounce
[info] running FFmpeg with filter: ass=/Users/marcus/Downloads/My_Video-1_auto.ass:fontsdir=/Users/marcus/Documents/mrbeast_caps/fonts


ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/8.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

[done] saved: /Users/marcus/Downloads/My_Video-1_mrbeast_bounce.mp4


[out#0/mp4 @ 0x1398067f0] video:49290KiB audio:667KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.038686%
frame=  710 fps=182 q=-0.0 Lsize=   49976KiB time=00:00:28.36 bitrate=14436.0kbits/s speed=7.28x elapsed=0:00:03.89    


In [41]:
# === CONFIG ==============================================================
from pathlib import Path

# Source clip and Whisper model (tiny/base/small/medium/large-v3; *.en faster for English).
INPUT_VIDEO = Path.home().joinpath("Downloads", "My Video-1.mp4")
MODEL_NAME = "small.en"

# Caption look: size (hyperparameter) and anchor point, centered for 1920x1080.
FONT_SIZE = 200
CENTER_X = 960
CENTER_Y = 540
UPPERCASE = True  # ALL CAPS for captions

# Timing hyperparams (seconds)
MIN_CAPTION_SEC = 0.30  # minimum on-screen time per caption (readability)
CUT_AHEAD_SEC = 0.00    # end each caption this long before the next starts (one-at-a-time)

# Grouping hyperparams (control words/characters per caption)
MAX_WORDS_PER_CAP = 1     # e.g., 1 = one word per card; set 2, 3, ... to show more
MAX_CHARS_PER_CAP = None  # e.g., 18; or None to ignore char limit
MAX_GAP_SEC = 9.90        # start a new caption if silence/gap exceeds this

# Font: drop your .ttf/.otf here (no system install needed)
CUSTOM_FONT_DIR = Path.home().joinpath("Documents", "mrbeast_caps", "fonts")

# Animation hyperparameters
# ANIM choices: "none", "fade", "pop", "zoom", "bounce", "slide_up", "slide_down", "slide_left", "slide_right", "rotate"
ANIM = "inflate"
ANIM_IN_MS = 500  # main appear time (ms) for transform/move
ANIM_OUT_MS = 50  # fade-out tail (ms); used by 'fade', ignored by the others
# ========================================================================

import subprocess, shutil, platform, re, sys
from datetime import timedelta
from faster_whisper import WhisperModel

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    """
    Build a WhisperModel using the first compute type that is accepted.

    Compute types are tried in a platform-dependent order. A ValueError from
    the constructor is printed as a "[skip]" line and the next type is tried.
    If every candidate fails, the last ValueError is re-raised.
    """
    on_mac = _pf.system() == "Darwin"
    compute_types = (
        ["float16", "int8", "float32"]
        if on_mac
        else ["int8_float16", "int8", "float16", "float32"]
    )

    failure = None
    for compute_type in compute_types:
        try:
            print(f"[info] trying compute_type={compute_type} …")
            return WhisperModel(model_name, compute_type=compute_type, device="auto")
        except ValueError as exc:
            print(f"[skip] {exc}")
            failure = exc
    raise failure

# ---------- font: use any .ttf/.otf in CUSTOM_FONT_DIR ----------
def pick_custom_font(font_dir: Path):
    """
    Choose a caption font from `font_dir` and report its family name.

    Returns (font_file_path, font_family_name). Creates `font_dir` if missing.
    The alphabetically first .ttf wins (or the first .otf when there is no
    .ttf), so repeated runs select the same file. The family name comes from
    the font's name table via fontTools when available; otherwise the file
    name stem is used.

    Raises SystemExit (after printing setup steps) when the folder holds no font.
    """
    font_dir.mkdir(parents=True, exist_ok=True)
    # glob() yields entries in filesystem order; sort so the pick is reproducible.
    candidates = sorted(font_dir.glob("*.ttf")) + sorted(font_dir.glob("*.otf"))
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download any .ttf or .otf font.")
        print(f"2) Place it here: {font_dir}")
        print("3) Re-run this cell.")
        raise SystemExit("[exit] No font found yet.")
    font_file = candidates[0]
    family = None
    try:
        from fontTools.ttLib import TTFont  # optional (pip install fonttools)
        tt = TTFont(font_file)
        names = {n.nameID: n.toUnicode() for n in tt["name"].names if n.toUnicode()}
        family = names.get(1) or names.get(4)  # 1=Family, 4=Full name
    except Exception:
        # Best effort: fontTools missing or font unreadable -> fallback below.
        family = None
    if not family:
        # Also covers a readable font whose name table has neither ID 1 nor 4;
        # without this the literal "None" would end up as the ASS Fontname.
        family = font_file.stem
    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS header (centered, white text, thick black outline) ----------
# Template for everything in the .ass file that precedes the Dialogue lines.
# {font} and {size} are str.format placeholders, so any other literal brace
# added to this text would have to be doubled.
# The "Style: Beast" row follows the column order of the Format row above it:
# Bold=-1, BorderStyle=1, Outline=12, Shadow=2, Alignment=5, all margins 60.
# The text ends with a newline, so event lines can be appended directly.
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,12,2,5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def anim_tag(cx: int, cy: int, name: str, in_ms: int, out_ms: int) -> str:
    """
    Build the ASS override block (braces included) for one caption.

    Supported names (case-insensitive): "none", "fade", "pop", "zoom",
    "bounce", "slide_up", "slide_down", "slide_left", "slide_right",
    "rotate", "inflate", "inflate_soft". None, "" and unknown names give the
    plain static tag at (cx, cy).

    `in_ms` is the transform/move duration in milliseconds; `out_ms` is only
    used by "fade".
    """
    name = (name or "none").lower()
    # Opening brace + center alignment + fixed position; branches append the rest.
    pinned = rf"{{\an5\pos({cx},{cy})"
    # Shared tail: animate the scale back to 100% over in_ms.
    to_full_size = rf"\t(0,{in_ms},\fscx100\fscy100)}}"

    if name == "fade":
        return pinned + rf"\fad({in_ms},{out_ms})}}"
    if name in ("pop", "inflate"):
        # "inflate" is a clean blow-up 80% -> 100% with no move or overshoot;
        # it produces the same tag as "pop".
        return pinned + r"\fscx80\fscy80" + to_full_size
    if name == "zoom":
        return pinned + r"\fscx60\fscy60" + to_full_size
    if name == "inflate_soft":
        # blow-up with a slight blur/alpha ramp for smoother edges
        return (pinned + r"\fscx80\fscy80\blur2\alpha&H20&"
                + rf"\t(0,{in_ms},\fscx100\fscy100\blur0\alpha&H00&)}}")
    if name == "bounce":
        # move from slightly above + overshoot then settle
        return (rf"{{\an5\move({cx},{cy-40},{cx},{cy},0,{in_ms})"
                + rf"\fscx120\fscy120\t(0,120,\fscx95\fscy95)\t(120,{in_ms},\fscx100\fscy100)}}")
    if name == "slide_up":
        return rf"{{\an5\move({cx},{cy+60},{cx},{cy},0,{in_ms})}}"
    if name == "slide_down":
        return rf"{{\an5\move({cx},{cy-60},{cx},{cy},0,{in_ms})}}"
    if name == "slide_left":
        return rf"{{\an5\move({cx-140},{cy},{cx},{cy},0,{in_ms})}}"
    if name == "slide_right":
        return rf"{{\an5\move({cx+140},{cy},{cx},{cy},0,{in_ms})}}"
    if name == "rotate":
        return pinned + rf"\frz-12\t(0,{in_ms},\frz0)}}"
    # "none" and anything unrecognized: static centered text
    return pinned + "}"

# ---------- Group words into captions (by count/chars and pauses) ----------
def clean_token(s: str) -> str:
    """
    Strip leading and trailing non-word characters (whitespace and punctuation).

    Characters inside the token are kept, e.g. the apostrophe in "don't".
    None and empty input give "".
    """
    # \W already matches whitespace, so a single pass covers both the
    # whitespace trim and the punctuation trim.
    return re.sub(r"^\W+|\W+$", "", s or "")

def group_words_to_captions(words,
                            max_words=1,
                            max_chars=None,
                            max_gap_s=0.6):
    """
    Returns a list of caption groups, where each group is a list of word dicts
    ({"start", "end", "text"}, with "text" already cleaned by clean_token).

    Grouping rules:
      - Start new caption on long silence (> max_gap_s)
      - Limit by max_words per caption
      - (Optional) Limit by max_chars (space-joined), without splitting words

    Words whose text is empty after cleaning are dropped. A single word that
    is longer than max_chars still gets a caption of its own.
    """
    lines, cur = [], []
    last_end = None
    for w in words:
        token = clean_token(w["text"])
        if not token:
            continue

        # Long pause since the previous kept word?
        need_new = last_end is not None and (w["start"] - last_end) > max_gap_s
        # Current caption already holds its quota of words?
        if cur and len(cur) >= max_words:
            need_new = True
        # Only measure the joined text when a char limit is actually set.
        if not need_new and max_chars is not None:
            join_len = len(" ".join([x["text"] for x in cur] + [token]).strip())
            need_new = join_len > max_chars

        if need_new and cur:
            lines.append(cur)
            cur = []

        cur.append({"start": float(w["start"]), "end": float(w["end"]), "text": token})
        last_end = float(w["end"])

    if cur:
        lines.append(cur)
    return lines

def build_center_caption_events(lines,
                                center_xy=(960,540),
                                uppercase=True,
                                min_caption=0.30,
                                cut_ahead=0.05,
                                anim="none",
                                in_ms=220,
                                out_ms=100):
    """
    Convert caption groups into ASS Dialogue lines, one per group.

    A caption starts with its first word and lasts until its last word ends
    or `min_caption` seconds have passed, whichever is later. It is then cut
    `cut_ahead` seconds before the next caption starts so that only one
    caption is on screen at a time.
    """
    cx, cy = center_xy
    final_index = len(lines) - 1
    dialogue = []
    for idx, group in enumerate(lines):
        start = float(group[0]["start"])
        end = max(float(group[-1]["end"]), start + min_caption)
        if idx < final_index:
            upcoming = float(lines[idx + 1][0]["start"])
            end = min(end, upcoming - cut_ahead)
        if end <= start:
            # The trim above can collapse the window; keep a 50 ms minimum.
            end = start + 0.05

        caption = " ".join([word["text"] for word in group]).strip()
        if uppercase:
            caption = caption.upper()

        tag = anim_tag(cx, cy, anim, in_ms, out_ms)
        dialogue.append(f"Dialogue: 0,{_fmt_time(start)},{_fmt_time(end)},Beast,,0,0,0,,{tag}{caption}")
    return dialogue

# ---------- 1) Transcribe (word timestamps) ----------
video_path = Path(INPUT_VIDEO).expanduser().resolve()
# Explicit check instead of `assert`, which is skipped when Python runs with -O.
if not video_path.exists():
    raise FileNotFoundError(f"Video not found: {video_path}")
print("[info] video:", video_path)

print("[info] loading Whisper model …")
model = load_whisper_auto(MODEL_NAME)

print("[info] transcribing (word timestamps) …")
segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

# Flatten the segments into one list of {"start", "end", "text"} dicts,
# dropping words whose text is empty after stripping whitespace.
words = []
for seg in segments:
    for w in seg.words or ():
        tok = (w.word or "").strip()
        if tok:
            words.append({"start": float(w.start), "end": float(w.end), "text": tok})

print(f"[info] words captured: {len(words)}")

# ---------- 2) Choose your downloaded font & build ASS ------------------------
# FONT_NAME is the family name written into the ASS style row.
font_file, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)
# Fill the {font}/{size} placeholders of the header template.
ASS_HEADER = ASS_HEADER_TMPL.format(font=FONT_NAME, size=FONT_SIZE)

# Group words into captions based on your hyperparams
caption_lines = group_words_to_captions(
    words,
    max_words=MAX_WORDS_PER_CAP,
    max_chars=MAX_CHARS_PER_CAP,
    max_gap_s=MAX_GAP_SEC
)
print(f"[info] caption groups built: {len(caption_lines)} "
      f"(max_words={MAX_WORDS_PER_CAP}, max_chars={MAX_CHARS_PER_CAP}, max_gap_s={MAX_GAP_SEC})")

# One Dialogue line per caption group, all anchored at (CENTER_X, CENTER_Y).
ass_events = build_center_caption_events(
    caption_lines,
    center_xy=(CENTER_X, CENTER_Y),
    uppercase=UPPERCASE,
    min_caption=MIN_CAPTION_SEC,
    cut_ahead=CUT_AHEAD_SEC,
    anim=ANIM,
    in_ms=ANIM_IN_MS,
    out_ms=ANIM_OUT_MS
)
# The header template ends with a newline, so the events are appended directly.
ass_text = ASS_HEADER + "\n".join(ass_events)

# write ASS + output into Downloads
# Every run of characters outside [A-Za-z0-9_.-] in the video stem becomes "_"
# (presumably to keep the name safe inside the FFmpeg filter string — confirm).
safe_stem = re.sub(r'[^A-Za-z0-9_.-]+', '_', video_path.stem)
downloads = Path.home() / "Downloads"
ass_path = downloads / f"{safe_stem}_auto.ass"
ass_path.write_text(ass_text, encoding="utf-8")
print("[info] wrote ASS:", ass_path)
print("[check] FONT =", FONT_NAME, "| SIZE =", FONT_SIZE, "| ANIM =", ANIM)

# ---------- 3) Burn captions with FFmpeg (output to Downloads) ----------------
if not shutil.which("ffmpeg"):
    raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")

fontsdir_arg = f":fontsdir={CUSTOM_FONT_DIR.as_posix()}"  # point libass to your font folder
out_video = downloads / f"{safe_stem}_mrbeast_{ANIM}.mp4"
vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"
vf_arg = f"ass={ass_path.as_posix()}{fontsdir_arg}"

# -preset/-crf are private options of libx264. h264_videotoolbox does not
# consume them, so they are only passed when libx264 is the encoder.
# NOTE(review): to control VideoToolbox quality, add an option that encoder
# understands (e.g. -b:v) — confirm the desired setting.
quality_args = ["-preset", "veryfast", "-crf", "18"] if vcodec == "libx264" else []

cmd = [
    "ffmpeg", "-y",                # overwrite an existing output file
    "-i", str(video_path),
    "-vf", vf_arg,                 # burn the ASS subtitles into the picture
    "-c:v", vcodec, *quality_args,
    "-c:a", "copy",                # audio stream is copied, not re-encoded
    str(out_video)
]

print("[info] running FFmpeg with filter:", vf_arg)
subprocess.run(cmd, check=True)    # check=True raises CalledProcessError on failure
print("[done] saved:", out_video)


[info] video: /Users/marcus/Downloads/My Video-1.mp4
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 88
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] caption groups built: 88 (max_words=1, max_chars=None, max_gap_s=9.9)
[info] wrote ASS: /Users/marcus/Downloads/My_Video-1_auto.ass
[check] FONT = Komika Axis | SIZE = 200 | ANIM = inflate
[info] running FFmpeg with filter: ass=/Users/marcus/Downloads/My_Video-1_auto.ass:fontsdir=/Users/marcus/Documents/mrbeast_caps/fonts


ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/8.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

[done] saved: /Users/marcus/Downloads/My_Video-1_mrbeast_inflate.mp4


[out#0/mp4 @ 0x124922b70] video:49280KiB audio:667KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.038694%
frame=  710 fps=187 q=-0.0 Lsize=   49966KiB time=00:00:28.36 bitrate=14433.1kbits/s speed=7.48x elapsed=0:00:03.79    


In [39]:
#Hold after display

In [43]:
# === CONFIG ==============================================================
from pathlib import Path

# Source clip and Whisper model (tiny/base/small/medium/large-v3; *.en faster for English).
INPUT_VIDEO = Path.home().joinpath("Downloads", "My Video-1.mp4")
MODEL_NAME = "small.en"

# Caption look: size (hyperparameter) and anchor point, centered for 1920x1080.
FONT_SIZE = 200
CENTER_X = 960
CENTER_Y = 540
UPPERCASE = True  # ALL CAPS for captions

# Timing hyperparams (seconds)
MIN_CAPTION_SEC = 0.30  # minimum on-screen time per caption (readability)
CUT_AHEAD_SEC = 0.00    # end each caption this long before the next starts (one-at-a-time)
TAIL_HOLD_SEC = 1.20    # NEW: extra hold after caption, capped to avoid overlap with next

# Grouping hyperparams (control words/characters per caption)
MAX_WORDS_PER_CAP = 1     # e.g., 1 = one word per card; set 2, 3, ... to show more
MAX_CHARS_PER_CAP = None  # e.g., 18; or None to ignore char limit
MAX_GAP_SEC = 1.20        # start a new caption if silence/gap exceeds this

# Font: drop your .ttf/.otf here (no system install needed)
CUSTOM_FONT_DIR = Path.home().joinpath("Documents", "mrbeast_caps", "fonts")

# Animation hyperparameters
# ANIM choices: "none", "fade", "pop", "zoom", "bounce", "slide_up", "slide_down",
#               "slide_left", "slide_right", "rotate", "inflate", "inflate_soft"
ANIM = "inflate"
ANIM_IN_MS = 20000  # main appear time (ms) for transform/move
ANIM_OUT_MS = 50    # fade-out tail (ms); used by 'fade', ignored by the others
# ========================================================================

import subprocess, shutil, platform, re, sys
from datetime import timedelta
from faster_whisper import WhisperModel

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    """
    Build a WhisperModel using the first compute type that is accepted.

    Compute types are tried in a platform-dependent order. A ValueError from
    the constructor is printed as a "[skip]" line and the next type is tried.
    If every candidate fails, the last ValueError is re-raised.
    """
    mac_order = ["float16", "int8", "float32"]
    default_order = ["int8_float16", "int8", "float16", "float32"]
    compute_types = mac_order if _pf.system() == "Darwin" else default_order

    failure = None
    for compute_type in compute_types:
        try:
            print(f"[info] trying compute_type={compute_type} …")
            return WhisperModel(model_name, compute_type=compute_type, device="auto")
        except ValueError as exc:
            print(f"[skip] {exc}")
            failure = exc
    raise failure

# ---------- font: use any .ttf/.otf in CUSTOM_FONT_DIR ----------
def pick_custom_font(font_dir: Path):
    """
    Choose a caption font from `font_dir` and report its family name.

    Returns (font_file_path, font_family_name). Creates `font_dir` if missing.
    The alphabetically first .ttf wins (or the first .otf when there is no
    .ttf), so repeated runs select the same file. The family name comes from
    the font's name table via fontTools when available; otherwise the file
    name stem is used.

    Raises SystemExit (after printing setup steps) when the folder holds no font.
    """
    font_dir.mkdir(parents=True, exist_ok=True)
    # glob() yields entries in filesystem order; sort so the pick is reproducible.
    candidates = sorted(font_dir.glob("*.ttf")) + sorted(font_dir.glob("*.otf"))
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download any .ttf or .otf font.")
        print(f"2) Place it here: {font_dir}")
        print("3) Re-run this cell.")
        raise SystemExit("[exit] No font found yet.")
    font_file = candidates[0]
    family = None
    try:
        from fontTools.ttLib import TTFont  # optional (pip install fonttools)
        tt = TTFont(font_file)
        names = {n.nameID: n.toUnicode() for n in tt["name"].names if n.toUnicode()}
        family = names.get(1) or names.get(4)  # 1=Family, 4=Full name
    except Exception:
        # Best effort: fontTools missing or font unreadable -> fallback below.
        family = None
    if not family:
        # Also covers a readable font whose name table has neither ID 1 nor 4;
        # without this the literal "None" would end up as the ASS Fontname.
        family = font_file.stem
    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS header (centered, white text, thick black outline) ----------
# Template for everything in the .ass file that precedes the Dialogue lines.
# {font} and {size} look like str.format placeholders — presumably filled
# where the header is built; confirm at the call site.
# The "Style: Beast" row follows the column order of the Format row above it:
# Bold=-1, BorderStyle=1, Outline=12, Shadow=2, Alignment=5, all margins 60.
# The text ends with a newline, so event lines can be appended directly.
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,12,2,5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def anim_tag(cx: int, cy: int, name: str, in_ms: int, out_ms: int) -> str:
    """
    Build the ASS override block (braces included) for one caption.

    `name` is matched case-insensitively against the recipe table below;
    None, "" and unknown names give the plain static tag at (cx, cy).
    `in_ms` is the transform/move duration in milliseconds; `out_ms` is only
    used by "fade".
    """
    key = (name or "none").lower()
    static = rf"\an5\pos({cx},{cy})"

    def grow_from(pct):
        # Start scaled to pct% and animate to 100% over in_ms.
        return static + rf"\fscx{pct}\fscy{pct}\t(0,{in_ms},\fscx100\fscy100)"

    def slide_from(x0, y0):
        # Travel from (x0, y0) to the anchor over in_ms.
        return rf"\an5\move({x0},{y0},{cx},{cy},0,{in_ms})"

    # Each recipe is lazy so only the selected one is evaluated.
    recipes = {
        "fade": lambda: static + rf"\fad({in_ms},{out_ms})",
        "pop": lambda: grow_from(80),
        "zoom": lambda: grow_from(60),
        "bounce": lambda: (
            slide_from(cx, cy - 40)
            + rf"\fscx120\fscy120\t(0,120,\fscx95\fscy95)\t(120,{in_ms},\fscx100\fscy100)"
        ),
        "slide_up": lambda: slide_from(cx, cy + 60),
        "slide_down": lambda: slide_from(cx, cy - 60),
        "slide_left": lambda: slide_from(cx - 140, cy),
        "slide_right": lambda: slide_from(cx + 140, cy),
        "rotate": lambda: static + rf"\frz-12\t(0,{in_ms},\frz0)",
        # clean blow-up: scale 80% -> 100%, no move/overshoot
        "inflate": lambda: grow_from(80),
        # blow-up with slight blur fade for smoother edges
        "inflate_soft": lambda: (
            static + r"\fscx80\fscy80\blur2\alpha&H20&"
            + rf"\t(0,{in_ms},\fscx100\fscy100\blur0\alpha&H00&)"
        ),
    }
    recipe = recipes.get(key)
    body = static if recipe is None else recipe()
    return "{" + body + "}"

# ---------- Group words into captions (by count/chars and pauses) ----------
def clean_token(s: str) -> str:
    """
    Strip leading and trailing non-word characters (whitespace and punctuation).

    Characters inside the token are kept, e.g. the apostrophe in "don't".
    None and empty input give "".
    """
    # \W already matches whitespace, so a single pass covers both the
    # whitespace trim and the punctuation trim.
    return re.sub(r"^\W+|\W+$", "", s or "")

def group_words_to_captions(words,
                            max_words=1,
                            max_chars=None,
                            max_gap_s=0.6):
    """
    Split a flat list of timed words into caption groups.

    words     : iterable of dicts with 'start', 'end' and 'text'.
    max_words : a group is closed once it already holds this many words.
    max_chars : optional cap on the space-joined length of a group.
    max_gap_s : a pause longer than this (seconds) before a word closes the group.

    Words whose text is empty after clean_token() are dropped. Returns a list
    of groups; each group is a list of {'start', 'end', 'text'} dicts with
    float times and cleaned text.
    """
    captions = []
    current = []
    prev_end = None
    for word in words:
        text = clean_token(word["text"])
        if not text:
            continue

        # Silence since the previous kept word (no pause before the first one).
        pause = (word["start"] - prev_end) if prev_end is not None else 0.0

        paused_too_long = prev_end is not None and pause > max_gap_s
        group_is_full = bool(current) and len(current) >= max_words
        split_here = paused_too_long or group_is_full
        if not split_here and max_chars is not None:
            # Length the caption would have if this word were appended.
            candidate = " ".join([kept["text"] for kept in current] + [text])
            split_here = len(candidate.strip()) > max_chars

        if split_here and current:
            captions.append(current)
            current = []

        word_end = float(word["end"])
        current.append({"start": float(word["start"]), "end": word_end, "text": text})
        prev_end = word_end

    if current:
        captions.append(current)
    return captions

def build_center_caption_events(lines,
                                center_xy=(960,540),
                                uppercase=True,
                                min_caption=0.30,
                                cut_ahead=0.00,
                                tail_hold=1.20,
                                anim="none",
                                in_ms=220,
                                out_ms=100):
    """
    Render grouped captions as ASS "Dialogue:" lines (style "Beast").

    Timing rules per caption:
    - starts at its first word and lasts at least `min_caption` seconds;
    - lingers up to `tail_hold` seconds into the silence that follows;
    - is cut at `next_start - cut_ahead` so only one caption shows at a time
      (the final caption simply gets the full `tail_hold`);
    - is never shorter than 0.05 s even if the cap collapses it.

    Returns the list of Dialogue strings, in caption order.
    """
    cx, cy = center_xy
    final_idx = len(lines) - 1
    dialogue = []
    for idx, group in enumerate(lines):
        start = float(group[0]["start"])
        spoken_end = float(group[-1]["end"])

        if idx < final_idx:
            next_start = float(lines[idx + 1][0]["start"])
            # Silence available after the last word, minus the safety cut.
            room = max(0.0, next_start - spoken_end - cut_ahead)
            end = max(spoken_end, start + min_caption)
            end = max(end, spoken_end + min(tail_hold, room))
            # Never run into the next caption.
            end = min(end, next_start - cut_ahead)
        else:
            # Nothing follows: take the whole tail, still honouring min_caption.
            end = max(spoken_end + tail_hold, start + min_caption)

        if end <= start:
            end = start + 0.05

        text = " ".join([w["text"] for w in group]).strip()
        if uppercase:
            text = text.upper()
        override = anim_tag(cx, cy, anim, in_ms, out_ms)
        dialogue.append(f"Dialogue: 0,{_fmt_time(start)},{_fmt_time(end)},Beast,,0,0,0,,{override}{text}")
    return dialogue

# ---------- 1) Transcribe (word timestamps) ----------
video_path = Path(INPUT_VIDEO).expanduser().resolve()
# Explicit check rather than `assert`: asserts vanish under `python -O`, and a
# directory at this path is not a usable input either.
if not video_path.is_file():
    raise SystemExit(f"Video not found: {video_path}")
print("[info] video:", video_path)

print("[info] loading Whisper model …")
model = load_whisper_auto(MODEL_NAME)

print("[info] transcribing (word timestamps) …")
segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

# Flatten segments into one list of {'start', 'end', 'text'} word dicts,
# skipping segments without word timings and blank tokens.
words = []
for seg in segments:
    if seg.words:
        for w in seg.words:
            tok = (w.word or "").strip()
            if tok:
                words.append({"start": float(w.start), "end": float(w.end), "text": tok})

print(f"[info] words captured: {len(words)}")

# ---------- 2) Choose your downloaded font & build ASS ------------------------
font_file, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)
ASS_HEADER = ASS_HEADER_TMPL.format(font=FONT_NAME, size=FONT_SIZE)

caption_lines = group_words_to_captions(
    words,
    max_words=MAX_WORDS_PER_CAP,
    max_chars=MAX_CHARS_PER_CAP,
    max_gap_s=MAX_GAP_SEC
)
print(f"[info] caption groups built: {len(caption_lines)} "
      f"(max_words={MAX_WORDS_PER_CAP}, max_chars={MAX_CHARS_PER_CAP}, max_gap_s={MAX_GAP_SEC})")

ass_events = build_center_caption_events(
    caption_lines,
    center_xy=(CENTER_X, CENTER_Y),
    uppercase=UPPERCASE,
    min_caption=MIN_CAPTION_SEC,
    cut_ahead=CUT_AHEAD_SEC,
    tail_hold=TAIL_HOLD_SEC,
    anim=ANIM,
    in_ms=ANIM_IN_MS,
    out_ms=ANIM_OUT_MS
)
ass_text = ASS_HEADER + "\n".join(ass_events)

# write ASS + output into Downloads
# The stem is reduced to [A-Za-z0-9_.-] so the generated file names stay simple.
safe_stem = re.sub(r'[^A-Za-z0-9_.-]+', '_', video_path.stem)
downloads = Path.home() / "Downloads"
ass_path = downloads / f"{safe_stem}_auto.ass"
ass_path.write_text(ass_text, encoding="utf-8")
print("[info] wrote ASS:", ass_path)
print("[check] FONT =", FONT_NAME, "| SIZE =", FONT_SIZE, "| ANIM =", ANIM)

# ---------- 3) Burn captions with FFmpeg (output to Downloads) ----------------
if not shutil.which("ffmpeg"):
    raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")

# fontsdir lets the ass filter find the custom font without a system install.
# NOTE(review): the paths are inserted into the filter string unescaped; a home
# directory containing ':' , ',' or quotes would presumably need FFmpeg
# filtergraph escaping — confirm before relying on unusual paths.
fontsdir_arg = f":fontsdir={CUSTOM_FONT_DIR.as_posix()}"
out_video = downloads / f"{safe_stem}_mrbeast_{ANIM}.mp4"
vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"
vf_arg = f"ass={ass_path.as_posix()}{fontsdir_arg}"

# Argument list (no shell), so spaces in the input/output paths are safe.
cmd = [
    "ffmpeg", "-y",
    "-i", str(video_path),
    "-vf", vf_arg,
    "-c:v", vcodec, "-preset", "veryfast", "-crf", "18",
    "-c:a", "copy",
    str(out_video)
]

print("[info] running FFmpeg with filter:", vf_arg)
subprocess.run(cmd, check=True)
print("[done] saved:", out_video)


[info] video: /Users/marcus/Downloads/My Video-1.mp4
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 88
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] caption groups built: 88 (max_words=1, max_chars=None, max_gap_s=1.2)
[info] wrote ASS: /Users/marcus/Downloads/My_Video-1_auto.ass
[check] FONT = Komika Axis | SIZE = 200 | ANIM = inflate
[info] running FFmpeg with filter: ass=/Users/marcus/Downloads/My_Video-1_auto.ass:fontsdir=/Users/marcus/Documents/mrbeast_caps/fonts


ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/8.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

[done] saved: /Users/marcus/Downloads/My_Video-1_mrbeast_inflate.mp4


[out#0/mp4 @ 0x12b004bd0] video:49287KiB audio:667KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.038688%
frame=  710 fps=167 q=-0.0 Lsize=   49973KiB time=00:00:28.36 bitrate=14435.1kbits/s speed=6.66x elapsed=0:00:04.25    


In [7]:
# === CONFIG ==============================================================
# User-tunable settings read by the pipeline code further down in this cell.
from pathlib import Path
INPUT_VIDEO       = Path.home() / "Downloads" / "reddit1_filmora_clipstore" / "tester88888.mp4"
MODEL_NAME        = "small.en"       # tiny/base/small/medium/large-v3; *.en faster for English
FONT_SIZE         = 130              # caption size; substituted into the ASS style's Fontsize field
CENTER_X, CENTER_Y= 960, 540         # caption anchor; center for 1920x1080 (the header's PlayRes), change for other resolutions
UPPERCASE         = True             # ALL CAPS for captions

# Timing hyperparams (seconds)
MIN_CAPTION_SEC   = 0.30             # minimum on-screen time per caption (readability)
CUT_AHEAD_SEC     = 0.00             # end each caption this long before the next starts (0 = end exactly at next start)
TAIL_HOLD_SEC     = 1.20             # extra hold after the last word, capped so it never overlaps the next caption

# Grouping hyperparams (control words/characters per caption)
MAX_WORDS_PER_CAP = 1                # e.g., 1 = one word per card; set 2, 3, ... to show more
MAX_CHARS_PER_CAP = None             # e.g., 18; or None to ignore char limit
MAX_GAP_SEC       = 1.20             # start a new caption if silence/gap exceeds this

# Font: drop your .ttf/.otf here (no system install needed).
# The directory is also handed to FFmpeg as `fontsdir`.
CUSTOM_FONT_DIR   = Path.home() / "Documents" / "mrbeast_caps" / "fonts"

# Animation hyperparameters
# ANIM choices: "none", "fade", "pop", "zoom", "bounce", "slide_up", "slide_down",
#               "slide_left", "slide_right", "rotate", "inflate", "inflate_soft"
# Unknown names fall back to a static caption.
ANIM              = "inflate"
# NOTE(review): 20000 ms is 20 s, while build_center_caption_events defaults to
# 220 ms; a short one-word caption would likely end long before the transform
# finishes — confirm this value is intentional.
ANIM_IN_MS        = 20000   # main appear time (ms) for transform/move
ANIM_OUT_MS       = 50    # fade-out tail (used by 'fade'; others ignore)
# ========================================================================

import subprocess, shutil, platform, re, sys
from datetime import timedelta
from faster_whisper import WhisperModel

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    """Load `model_name` with the first compute type the backend accepts.

    Compute types are tried in a platform-specific order (macOS starts with
    float16; other systems start with int8_float16). A ValueError raised while
    constructing the model is logged as "[skip]" and the next type is tried;
    if every attempt fails, the last ValueError is re-raised.
    """
    if _pf.system() == "Darwin":
        compute_types = ["float16", "int8", "float32"]
    else:
        compute_types = ["int8_float16", "int8", "float16", "float32"]

    failure = None
    for compute_type in compute_types:
        try:
            print(f"[info] trying compute_type={compute_type} …")
            return WhisperModel(model_name, compute_type=compute_type, device="auto")
        except ValueError as exc:
            print(f"[skip] {exc}")
            failure = exc
    raise failure

# ---------- font: use any .ttf/.otf in CUSTOM_FONT_DIR ----------
def pick_custom_font(font_dir: Path):
    """Pick a font file from `font_dir` and work out its family name.

    The directory is created if missing. Candidates are the `*.ttf` files
    followed by the `*.otf` files, each group sorted by path so the choice is
    the same on every run and filesystem.

    Returns:
        (font_file, family): the chosen Path and the family name to put in the
        ASS style. The family is read from the font's `name` table when
        fontTools is available; otherwise, or when no usable name is found,
        the file stem is used.

    Raises:
        SystemExit: when the directory holds no .ttf/.otf file (setup
        instructions are printed first).
    """
    font_dir.mkdir(parents=True, exist_ok=True)
    # glob() order is filesystem-dependent; sort to make the pick deterministic.
    candidates = sorted(font_dir.glob("*.ttf")) + sorted(font_dir.glob("*.otf"))
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download any .ttf or .otf font.")
        print(f"2) Place it here: {font_dir}")
        print("3) Re-run this cell.")
        raise SystemExit("[exit] No font found yet.")
    font_file = candidates[0]
    family = None
    try:
        from fontTools.ttLib import TTFont  # optional (pip install fonttools)
        tt = TTFont(font_file)
        try:
            names = {}
            for record in tt["name"].names:
                value = record.toUnicode()
                if value:
                    names[record.nameID] = value
        finally:
            tt.close()  # release the underlying file handle
        # Prefer name ID 1, then ID 4 (family / full name per the OpenType name table).
        family = names.get(1) or names.get(4)
    except Exception:
        # Deliberately best-effort: fontTools may be absent or the file may not
        # parse; either way fall through to the file-stem fallback below.
        family = None
    if not family:
        # Also covers fonts that parse but carry neither name record, which
        # would otherwise put the literal "None" into the ASS style.
        family = font_file.stem
    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS header (centered, white text, thick black outline) ----------
# Template for everything in the .ass file up to and including the [Events]
# Format line; Dialogue lines are appended after it.
# It is filled with str.format(font=..., size=...), so `{font}` and `{size}`
# are the only placeholders and any other literal brace added here would have
# to be doubled. The single style is named "Beast", which is the style name the
# generated Dialogue lines refer to. PlayResX/PlayResY fix the 1920x1080
# coordinate space that the caption center coordinates are expressed in.
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,12,2,5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def anim_tag(cx: int, cy: int, name: str, in_ms: int, out_ms: int) -> str:
    """
    Build the ASS override block that positions and animates one caption.

    cx, cy : anchor point for the caption (the block always sets \\an5).
    name   : effect name, case-insensitive; None/empty and unknown names
             give a static caption.
    in_ms  : duration (ms) of the entrance transform/move/fade-in.
    out_ms : fade-out duration (ms); only the "fade" effect reads it.

    Returns a single "{...}" string to be prepended to the caption text.
    """
    effect = (name or "none").lower()

    def placed(tags: str = "") -> str:
        # Static placement at the anchor, plus optional extra tags.
        return rf"{{\an5\pos({cx},{cy}){tags}}}"

    def moved(from_x, from_y, tags: str = "") -> str:
        # Travel from (from_x, from_y) to the anchor during the first in_ms.
        return rf"{{\an5\move({from_x},{from_y},{cx},{cy},0,{in_ms}){tags}}}"

    # Shared tail of every scale-in effect: animate up to 100% over in_ms.
    grow_to_full = rf"\t(0,{in_ms},\fscx100\fscy100)"

    if effect == "fade":
        return placed(rf"\fad({in_ms},{out_ms})")
    if effect in ("pop", "inflate"):
        # Both names produce the same 80% -> 100% blow-up.
        return placed(r"\fscx80\fscy80" + grow_to_full)
    if effect == "zoom":
        return placed(r"\fscx60\fscy60" + grow_to_full)
    if effect == "bounce":
        # Drop in from 40px above while overshooting 120% -> 95% -> 100%.
        return moved(cx, cy - 40,
                     r"\fscx120\fscy120\t(0,120,\fscx95\fscy95)"
                     + rf"\t(120,{in_ms},\fscx100\fscy100)")
    if effect == "slide_up":
        return moved(cx, cy + 60)
    if effect == "slide_down":
        return moved(cx, cy - 60)
    if effect == "slide_left":
        return moved(cx - 140, cy)
    if effect == "slide_right":
        return moved(cx + 140, cy)
    if effect == "rotate":
        return placed(rf"\frz-12\t(0,{in_ms},\frz0)")
    if effect == "inflate_soft":
        # Blow-up that also animates blur and alpha back to neutral.
        return placed(r"\fscx80\fscy80\blur2\alpha&H20&"
                      + rf"\t(0,{in_ms},\fscx100\fscy100\blur0\alpha&H00&)")
    # "none" and anything unrecognised: static caption.
    return placed()

# ---------- Group words into captions (by count/chars and pauses) ----------
def clean_token(s: str) -> str:
    """Return *s* without surrounding whitespace or leading/trailing non-word characters.

    None and other falsy values are treated as the empty string. Characters
    inside the token (e.g. the apostrophe in "don't") are left alone.
    """
    text = (s or "").strip()
    # First pass trims whitespace, second pass trims punctuation/symbols.
    for edge_pattern in (r"^\s+|\s+$", r"^\W+|\W+$"):
        text = re.sub(edge_pattern, "", text)
    return text

def group_words_to_captions(words,
                            max_words=1,
                            max_chars=None,
                            max_gap_s=0.6):
    """
    Split a flat list of timed words into caption groups.

    words     : iterable of dicts with 'start', 'end' and 'text'.
    max_words : a group is closed once it already holds this many words.
    max_chars : optional cap on the space-joined length of a group.
    max_gap_s : a pause longer than this (seconds) before a word closes the group.

    Words whose text is empty after clean_token() are dropped. Returns a list
    of groups; each group is a list of {'start', 'end', 'text'} dicts with
    float times and cleaned text.
    """
    captions = []
    current = []
    prev_end = None
    for word in words:
        text = clean_token(word["text"])
        if not text:
            continue

        # Silence since the previous kept word (no pause before the first one).
        pause = (word["start"] - prev_end) if prev_end is not None else 0.0

        paused_too_long = prev_end is not None and pause > max_gap_s
        group_is_full = bool(current) and len(current) >= max_words
        split_here = paused_too_long or group_is_full
        if not split_here and max_chars is not None:
            # Length the caption would have if this word were appended.
            candidate = " ".join([kept["text"] for kept in current] + [text])
            split_here = len(candidate.strip()) > max_chars

        if split_here and current:
            captions.append(current)
            current = []

        word_end = float(word["end"])
        current.append({"start": float(word["start"]), "end": word_end, "text": text})
        prev_end = word_end

    if current:
        captions.append(current)
    return captions

def build_center_caption_events(lines,
                                center_xy=(960,540),
                                uppercase=True,
                                min_caption=0.30,
                                cut_ahead=0.00,
                                tail_hold=1.20,
                                anim="none",
                                in_ms=220,
                                out_ms=100):
    """
    Render grouped captions as ASS "Dialogue:" lines (style "Beast").

    Timing rules per caption:
    - starts at its first word and lasts at least `min_caption` seconds;
    - lingers up to `tail_hold` seconds into the silence that follows;
    - is cut at `next_start - cut_ahead` so only one caption shows at a time
      (the final caption simply gets the full `tail_hold`);
    - is never shorter than 0.05 s even if the cap collapses it.

    Returns the list of Dialogue strings, in caption order.
    """
    cx, cy = center_xy
    final_idx = len(lines) - 1
    dialogue = []
    for idx, group in enumerate(lines):
        start = float(group[0]["start"])
        spoken_end = float(group[-1]["end"])

        if idx < final_idx:
            next_start = float(lines[idx + 1][0]["start"])
            # Silence available after the last word, minus the safety cut.
            room = max(0.0, next_start - spoken_end - cut_ahead)
            end = max(spoken_end, start + min_caption)
            end = max(end, spoken_end + min(tail_hold, room))
            # Never run into the next caption.
            end = min(end, next_start - cut_ahead)
        else:
            # Nothing follows: take the whole tail, still honouring min_caption.
            end = max(spoken_end + tail_hold, start + min_caption)

        if end <= start:
            end = start + 0.05

        text = " ".join([w["text"] for w in group]).strip()
        if uppercase:
            text = text.upper()
        override = anim_tag(cx, cy, anim, in_ms, out_ms)
        dialogue.append(f"Dialogue: 0,{_fmt_time(start)},{_fmt_time(end)},Beast,,0,0,0,,{override}{text}")
    return dialogue

# ---------- 1) Transcribe (word timestamps) ----------
video_path = Path(INPUT_VIDEO).expanduser().resolve()
# Explicit check rather than `assert`: asserts vanish under `python -O`, and a
# directory at this path is not a usable input either.
if not video_path.is_file():
    raise SystemExit(f"Video not found: {video_path}")
print("[info] video:", video_path)

print("[info] loading Whisper model …")
model = load_whisper_auto(MODEL_NAME)

print("[info] transcribing (word timestamps) …")
segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

# Flatten segments into one list of {'start', 'end', 'text'} word dicts,
# skipping segments without word timings and blank tokens.
words = []
for seg in segments:
    if seg.words:
        for w in seg.words:
            tok = (w.word or "").strip()
            if tok:
                words.append({"start": float(w.start), "end": float(w.end), "text": tok})

print(f"[info] words captured: {len(words)}")

# ---------- 2) Choose your downloaded font & build ASS ------------------------
font_file, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)
ASS_HEADER = ASS_HEADER_TMPL.format(font=FONT_NAME, size=FONT_SIZE)

caption_lines = group_words_to_captions(
    words,
    max_words=MAX_WORDS_PER_CAP,
    max_chars=MAX_CHARS_PER_CAP,
    max_gap_s=MAX_GAP_SEC
)
print(f"[info] caption groups built: {len(caption_lines)} "
      f"(max_words={MAX_WORDS_PER_CAP}, max_chars={MAX_CHARS_PER_CAP}, max_gap_s={MAX_GAP_SEC})")

ass_events = build_center_caption_events(
    caption_lines,
    center_xy=(CENTER_X, CENTER_Y),
    uppercase=UPPERCASE,
    min_caption=MIN_CAPTION_SEC,
    cut_ahead=CUT_AHEAD_SEC,
    tail_hold=TAIL_HOLD_SEC,
    anim=ANIM,
    in_ms=ANIM_IN_MS,
    out_ms=ANIM_OUT_MS
)
ass_text = ASS_HEADER + "\n".join(ass_events)

# write ASS + output into Downloads
# The stem is reduced to [A-Za-z0-9_.-] so the generated file names stay simple.
safe_stem = re.sub(r'[^A-Za-z0-9_.-]+', '_', video_path.stem)
downloads = Path.home() / "Downloads"
ass_path = downloads / f"{safe_stem}_auto.ass"
ass_path.write_text(ass_text, encoding="utf-8")
print("[info] wrote ASS:", ass_path)
print("[check] FONT =", FONT_NAME, "| SIZE =", FONT_SIZE, "| ANIM =", ANIM)

# ---------- 3) Burn captions with FFmpeg (output to Downloads) ----------------
if not shutil.which("ffmpeg"):
    raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")

# fontsdir lets the ass filter find the custom font without a system install.
# NOTE(review): the paths are inserted into the filter string unescaped; a home
# directory containing ':' , ',' or quotes would presumably need FFmpeg
# filtergraph escaping — confirm before relying on unusual paths.
fontsdir_arg = f":fontsdir={CUSTOM_FONT_DIR.as_posix()}"
out_video = downloads / f"{safe_stem}_mrbeast_{ANIM}.mp4"
vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"
vf_arg = f"ass={ass_path.as_posix()}{fontsdir_arg}"

# Argument list (no shell), so spaces in the input/output paths are safe.
cmd = [
    "ffmpeg", "-y",
    "-i", str(video_path),
    "-vf", vf_arg,
    "-c:v", vcodec, "-preset", "veryfast", "-crf", "18",
    "-c:a", "copy",
    str(out_video)
]

print("[info] running FFmpeg with filter:", vf_arg)
subprocess.run(cmd, check=True)
print("[done] saved:", out_video)


[info] video: /Users/marcus/Downloads/reddit1_filmora_clipstore/tester88888.mp4
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 22
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] caption groups built: 22 (max_words=1, max_chars=None, max_gap_s=1.2)
[info] wrote ASS: /Users/marcus/Downloads/tester88888_auto.ass
[check] FONT = Komika Axis | SIZE = 130 | ANIM = inflate
[info] running FFmpeg with filter: ass=/Users/marcus/Downloads/tester88888_auto.ass:fontsdir=/Users/marcus/Documents/mrbeast_caps/fonts


ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/8.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

[done] saved: /Users/marcus/Downloads/tester88888_mrbeast_inflate.mp4


In [8]:
# Shape / aspect controls (percentages)
# Read as module-level globals by build_center_caption_events in this cell and
# forwarded to anim_tag as base_sx / base_sy / start_pct.
STRETCH_X_PCT     = 100   # 100 normal, <100 squish width, >100 widen
STRETCH_Y_PCT     = 100   # 100 normal, >100 taller, <100 shorter
INFLATE_START_PCT = 80    # animation start scale relative to baseline stretch (pop/zoom/inflate/inflate_soft)


# === CONFIG ==============================================================
# User-tunable settings read by the pipeline code further down in this cell.
from pathlib import Path
INPUT_VIDEO       = Path.home() / "Downloads" / "reddit1_filmora_clipstore" / "tester88888.mp4"
MODEL_NAME        = "small.en"       # tiny/base/small/medium/large-v3; *.en faster for English
FONT_SIZE         = 130              # caption size; substituted into the ASS style's Fontsize field
CENTER_X, CENTER_Y= 960, 540         # caption anchor; center for 1920x1080 (the header's PlayRes), change for other resolutions
UPPERCASE         = True             # ALL CAPS for captions

# Timing hyperparams (seconds)
MIN_CAPTION_SEC   = 0.30             # minimum on-screen time per caption (readability)
CUT_AHEAD_SEC     = 0.00             # end each caption this long before the next starts (0 = end exactly at next start)
TAIL_HOLD_SEC     = 1.20             # extra hold after the last word, capped so it never overlaps the next caption

# Grouping hyperparams (control words/characters per caption)
MAX_WORDS_PER_CAP = 1                # e.g., 1 = one word per card; set 2, 3, ... to show more
MAX_CHARS_PER_CAP = None             # e.g., 18; or None to ignore char limit
MAX_GAP_SEC       = 1.20             # start a new caption if silence/gap exceeds this

# Font: drop your .ttf/.otf here (no system install needed).
# The directory is also handed to FFmpeg as `fontsdir`.
CUSTOM_FONT_DIR   = Path.home() / "Documents" / "mrbeast_caps" / "fonts"

# Animation hyperparameters
# ANIM choices: "none", "fade", "pop", "zoom", "bounce", "slide_up", "slide_down",
#               "slide_left", "slide_right", "rotate", "inflate", "inflate_soft"
# Unknown names fall back to a static caption.
ANIM              = "inflate"
# NOTE(review): 20000 ms is 20 s, while build_center_caption_events defaults to
# 220 ms; a short one-word caption would likely end long before the transform
# finishes — confirm this value is intentional.
ANIM_IN_MS        = 20000   # main appear time (ms) for transform/move
ANIM_OUT_MS       = 50    # fade-out tail (used by 'fade'; others ignore)
# ========================================================================

import subprocess, shutil, platform, re, sys
from datetime import timedelta
from faster_whisper import WhisperModel

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    """Load `model_name` with the first compute type the backend accepts.

    Compute types are tried in a platform-specific order (macOS starts with
    float16; other systems start with int8_float16). A ValueError raised while
    constructing the model is logged as "[skip]" and the next type is tried;
    if every attempt fails, the last ValueError is re-raised.
    """
    if _pf.system() == "Darwin":
        compute_types = ["float16", "int8", "float32"]
    else:
        compute_types = ["int8_float16", "int8", "float16", "float32"]

    failure = None
    for compute_type in compute_types:
        try:
            print(f"[info] trying compute_type={compute_type} …")
            return WhisperModel(model_name, compute_type=compute_type, device="auto")
        except ValueError as exc:
            print(f"[skip] {exc}")
            failure = exc
    raise failure

# ---------- font: use any .ttf/.otf in CUSTOM_FONT_DIR ----------
def pick_custom_font(font_dir: Path):
    """Pick a font file from `font_dir` and work out its family name.

    The directory is created if missing. Candidates are the `*.ttf` files
    followed by the `*.otf` files, each group sorted by path so the choice is
    the same on every run and filesystem.

    Returns:
        (font_file, family): the chosen Path and the family name to put in the
        ASS style. The family is read from the font's `name` table when
        fontTools is available; otherwise, or when no usable name is found,
        the file stem is used.

    Raises:
        SystemExit: when the directory holds no .ttf/.otf file (setup
        instructions are printed first).
    """
    font_dir.mkdir(parents=True, exist_ok=True)
    # glob() order is filesystem-dependent; sort to make the pick deterministic.
    candidates = sorted(font_dir.glob("*.ttf")) + sorted(font_dir.glob("*.otf"))
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download any .ttf or .otf font.")
        print(f"2) Place it here: {font_dir}")
        print("3) Re-run this cell.")
        raise SystemExit("[exit] No font found yet.")
    font_file = candidates[0]
    family = None
    try:
        from fontTools.ttLib import TTFont  # optional (pip install fonttools)
        tt = TTFont(font_file)
        try:
            names = {}
            for record in tt["name"].names:
                value = record.toUnicode()
                if value:
                    names[record.nameID] = value
        finally:
            tt.close()  # release the underlying file handle
        # Prefer name ID 1, then ID 4 (family / full name per the OpenType name table).
        family = names.get(1) or names.get(4)
    except Exception:
        # Deliberately best-effort: fontTools may be absent or the file may not
        # parse; either way fall through to the file-stem fallback below.
        family = None
    if not family:
        # Also covers fonts that parse but carry neither name record, which
        # would otherwise put the literal "None" into the ASS style.
        family = font_file.stem
    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS header (centered, white text, thick black outline) ----------
# Template for everything in the .ass file up to and including the [Events]
# Format line; Dialogue lines are appended after it.
# It is filled with str.format(font=..., size=...), so `{font}` and `{size}`
# are the only placeholders and any other literal brace added here would have
# to be doubled. The single style is named "Beast", which is the style name the
# generated Dialogue lines refer to. PlayResX/PlayResY fix the 1920x1080
# coordinate space that the caption center coordinates are expressed in.
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,12,2,5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    td = timedelta(seconds=max(0.0, t))
    cs = int(round(td.total_seconds() * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

def anim_tag(cx: int, cy: int, name: str, in_ms: int, out_ms: int,
             base_sx: int = 100, base_sy: int = 100, start_pct: int = 80) -> str:
    """
    Build the ASS override block that positions, stretches and animates a caption.

    cx, cy           : anchor point for the caption (the block always sets \\an5).
    name             : effect name, case-insensitive; None/empty and unknown
                       names give a static caption.
    in_ms            : duration (ms) of the entrance transform/move/fade-in.
    out_ms           : fade-out duration (ms); only "fade" reads it.
    base_sx, base_sy : resting \\fscx / \\fscy percentages (non-uniform stretch);
                       rounded and clamped to at least 1.
    start_pct        : where scale-in effects (pop, zoom, inflate, inflate_soft)
                       begin, as a percentage of the resting scale.

    Returns exactly one "{...}" block. Every tag is emitted inside the braces:
    tags written after the closing brace are not overrides and would show up
    as literal caption text. Each block carries a single placement tag, either
    \\pos or \\move, never both.
    """
    name = (name or "none").lower()
    bsx = max(1, int(round(base_sx)))
    bsy = max(1, int(round(base_sy)))
    start_sx = max(1, int(round(bsx * (start_pct / 100.0))))
    start_sy = max(1, int(round(bsy * (start_pct / 100.0))))

    rest_scale = fr"\fscx{bsx}\fscy{bsy}"
    start_scale = fr"\fscx{start_sx}\fscy{start_sy}"

    def block(placement: str, tags: str) -> str:
        # Single override block: alignment, one placement tag, then the rest.
        return "{" + r"\an5" + placement + tags + "}"

    def pos() -> str:
        return fr"\pos({cx},{cy})"

    def move(from_x, from_y) -> str:
        # Travel from (from_x, from_y) to the anchor during the first in_ms.
        return fr"\move({from_x},{from_y},{cx},{cy},0,{in_ms})"

    if name == "fade":
        return block(pos(), rest_scale + fr"\fad({in_ms},{out_ms})")
    if name in ("pop", "zoom", "inflate"):
        # All three grow from the start scale to the resting scale over in_ms.
        return block(pos(), start_scale + fr"\t(0,{in_ms},{rest_scale})")
    if name == "bounce":
        # overshoot up (120%), then settle to 95%, then baseline
        up_sx  = max(1, int(round(bsx * 1.20)))
        up_sy  = max(1, int(round(bsy * 1.20)))
        low_sx = max(1, int(round(bsx * 0.95)))
        low_sy = max(1, int(round(bsy * 0.95)))
        return block(move(cx, cy - 40),
                     fr"\fscx{up_sx}\fscy{up_sy}"
                     fr"\t(0,120,\fscx{low_sx}\fscy{low_sy})"
                     fr"\t(120,{in_ms},{rest_scale})")
    if name == "slide_up":
        return block(move(cx, cy + 60), rest_scale)
    if name == "slide_down":
        return block(move(cx, cy - 60), rest_scale)
    if name == "slide_left":
        return block(move(cx - 140, cy), rest_scale)
    if name == "slide_right":
        return block(move(cx + 140, cy), rest_scale)
    if name == "rotate":
        return block(pos(), rest_scale + fr"\frz-12\t(0,{in_ms},\frz0)")
    if name == "inflate_soft":
        return block(pos(), start_scale + r"\blur2\alpha&H20&"
                     + fr"\t(0,{in_ms},{rest_scale}\blur0\alpha&H00&)")
    # "none" and anything unrecognised: static caption at the resting scale.
    return block(pos(), rest_scale)


# ---------- Group words into captions (by count/chars and pauses) ----------
def clean_token(s: str) -> str:
    """Return *s* without surrounding whitespace or leading/trailing non-word characters.

    None and other falsy values are treated as the empty string. Characters
    inside the token (e.g. the apostrophe in "don't") are left alone.
    """
    text = (s or "").strip()
    # First pass trims whitespace, second pass trims punctuation/symbols.
    for edge_pattern in (r"^\s+|\s+$", r"^\W+|\W+$"):
        text = re.sub(edge_pattern, "", text)
    return text

def group_words_to_captions(words,
                            max_words=1,
                            max_chars=None,
                            max_gap_s=0.6):
    """
    Split a flat list of timed words into caption groups.

    words     : iterable of dicts with 'start', 'end' and 'text'.
    max_words : a group is closed once it already holds this many words.
    max_chars : optional cap on the space-joined length of a group.
    max_gap_s : a pause longer than this (seconds) before a word closes the group.

    Words whose text is empty after clean_token() are dropped. Returns a list
    of groups; each group is a list of {'start', 'end', 'text'} dicts with
    float times and cleaned text.
    """
    captions = []
    current = []
    prev_end = None
    for word in words:
        text = clean_token(word["text"])
        if not text:
            continue

        # Silence since the previous kept word (no pause before the first one).
        pause = (word["start"] - prev_end) if prev_end is not None else 0.0

        paused_too_long = prev_end is not None and pause > max_gap_s
        group_is_full = bool(current) and len(current) >= max_words
        split_here = paused_too_long or group_is_full
        if not split_here and max_chars is not None:
            # Length the caption would have if this word were appended.
            candidate = " ".join([kept["text"] for kept in current] + [text])
            split_here = len(candidate.strip()) > max_chars

        if split_here and current:
            captions.append(current)
            current = []

        word_end = float(word["end"])
        current.append({"start": float(word["start"]), "end": word_end, "text": text})
        prev_end = word_end

    if current:
        captions.append(current)
    return captions

def build_center_caption_events(lines,
                                center_xy=(960,540),
                                uppercase=True,
                                min_caption=0.30,
                                cut_ahead=0.00,
                                tail_hold=1.20,
                                anim="none",
                                in_ms=220,
                                out_ms=100):
    """
    Turn grouped captions into ASS "Dialogue:" lines (style "Beast").

    lines:       list of caption groups, each a non-empty list of
                 {'start','end','text'} dicts.
    center_xy:   (x, y) handed to anim_tag() for positioning.
    min_caption: minimum on-screen time (s), unless the next caption cuts it short.
    cut_ahead:   a caption ends this many seconds before the next one starts.
    tail_hold:   a caption may linger up to this many seconds after its last
                 word, but never past the next caption; the final caption
                 always gets the full hold.

    Returns the list of Dialogue strings, one per caption group.
    """
    cx, cy = center_xy
    total = len(lines)
    dialogue = []
    for idx, group in enumerate(lines):
        start = float(group[0]["start"])
        spoken_end = float(group[-1]["end"])
        floor_end = max(spoken_end, start + min_caption)

        if idx + 1 >= total:
            # Last caption: nothing follows, so take the whole tail hold.
            end = max(spoken_end + tail_hold, start + min_caption)
        else:
            next_start = float(lines[idx + 1][0]["start"])
            # Silence available after this caption, minus the safety cut.
            room = max(0.0, next_start - spoken_end - cut_ahead)
            held = spoken_end + min(tail_hold, room)
            end = min(max(floor_end, held), next_start - cut_ahead)

        # Guard against zero/negative durations (e.g. overlapping word times).
        if end <= start:
            end = start + 0.05

        text = " ".join([item["text"] for item in group]).strip()
        if uppercase:
            text = text.upper()

        ov = anim_tag(
            cx, cy, anim, in_ms, out_ms,
            base_sx=STRETCH_X_PCT,
            base_sy=STRETCH_Y_PCT,
            start_pct=INFLATE_START_PCT
        )
        dialogue.append(f"Dialogue: 0,{_fmt_time(start)},{_fmt_time(end)},Beast,,0,0,0,,{ov}{text}")
    return dialogue

# ---------- 1) Transcribe (word timestamps) ----------
# Resolve the configured input to an absolute path and stop early if it is missing.
video_path = Path(INPUT_VIDEO).expanduser().resolve()
assert video_path.exists(), f"Video not found: {video_path}"
print("[info] video:", video_path)

print("[info] loading Whisper model …")
model = load_whisper_auto(MODEL_NAME)

print("[info] transcribing (word timestamps) …")
# transcribe() returns a pair; only the segments are used, the second item is discarded.
# NOTE(review): faster-whisper presumably yields segments lazily, so the actual
# decoding would happen while the loop below iterates — confirm.
segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

# Flatten all segments into one list of {'start','end','text'} word dicts,
# skipping segments without word data and words that are blank after stripping.
words = []
for seg in segments:
    if seg.words:
        for w in seg.words:
            tok = (w.word or "").strip()
            if tok:
                words.append({"start": float(w.start), "end": float(w.end), "text": tok})

print(f"[info] words captured: {len(words)}")

# ---------- 2) Choose your downloaded font & build ASS ------------------------
# pick_custom_font returns (font file path, family name); the family name goes into the style.
font_file, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)
# Rebinds the module-level ASS_HEADER with the chosen font and size filled in.
ASS_HEADER = ASS_HEADER_TMPL.format(font=FONT_NAME, size=FONT_SIZE)

# Group the flat word list into captions according to the grouping settings.
caption_lines = group_words_to_captions(
    words,
    max_words=MAX_WORDS_PER_CAP,
    max_chars=MAX_CHARS_PER_CAP,
    max_gap_s=MAX_GAP_SEC
)
print(f"[info] caption groups built: {len(caption_lines)} "
      f"(max_words={MAX_WORDS_PER_CAP}, max_chars={MAX_CHARS_PER_CAP}, max_gap_s={MAX_GAP_SEC})")

# One "Dialogue:" line per caption group.
ass_events = build_center_caption_events(
    caption_lines,
    center_xy=(CENTER_X, CENTER_Y),
    uppercase=UPPERCASE,
    min_caption=MIN_CAPTION_SEC,
    cut_ahead=CUT_AHEAD_SEC,
    tail_hold=TAIL_HOLD_SEC,        # how long a caption may linger after its last word (s)
    anim=ANIM,
    in_ms=ANIM_IN_MS,
    out_ms=ANIM_OUT_MS
)
# Header followed by the events, one per line.
ass_text = ASS_HEADER + "\n".join(ass_events)

# write ASS + output into Downloads
# Replace every run of characters outside [A-Za-z0-9_.-] in the stem with "_"
# so the generated file names contain no spaces or special characters.
safe_stem = re.sub(r'[^A-Za-z0-9_.-]+', '_', video_path.stem)
downloads = Path.home() / "Downloads"
ass_path = downloads / f"{safe_stem}_auto.ass"
ass_path.write_text(ass_text, encoding="utf-8")
print("[info] wrote ASS:", ass_path)
print("[check] FONT =", FONT_NAME, "| SIZE =", FONT_SIZE, "| ANIM =", ANIM)

# ---------- 3) Burn captions with FFmpeg (output to Downloads) ----------------
# Abort with a readable message when no ffmpeg executable is on PATH.
if not shutil.which("ffmpeg"):
    raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")

# The `ass` filter gets the subtitle file plus a fontsdir option pointing at the
# custom font directory.
# NOTE(review): both paths are inserted unescaped; a path containing ':' or other
# filtergraph special characters would presumably break the filter — confirm.
fontsdir_arg = f":fontsdir={CUSTOM_FONT_DIR.as_posix()}"
out_video = downloads / f"{safe_stem}_mrbeast_{ANIM}.mp4"
# h264_videotoolbox on macOS ("Darwin"), libx264 everywhere else.
vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"
vf_arg = f"ass={ass_path.as_posix()}{fontsdir_arg}"

# Video is re-encoded with the captions burned in; the audio stream is copied.
# NOTE(review): -preset/-crf are x264-style options — verify h264_videotoolbox honours them.
cmd = [
    "ffmpeg", "-y",
    "-i", str(video_path),
    "-vf", vf_arg,
    "-c:v", vcodec, "-preset", "veryfast", "-crf", "18",
    "-c:a", "copy",
    str(out_video)
]

print("[info] running FFmpeg with filter:", vf_arg)
# check=True: a non-zero FFmpeg exit status raises CalledProcessError.
subprocess.run(cmd, check=True)
print("[done] saved:", out_video)


[info] video: /Users/marcus/Downloads/reddit1_filmora_clipstore/tester88888.mp4
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 22
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] caption groups built: 22 (max_words=1, max_chars=None, max_gap_s=1.2)
[info] wrote ASS: /Users/marcus/Downloads/tester88888_auto.ass
[check] FONT = Komika Axis | SIZE = 130 | ANIM = inflate
[info] running FFmpeg with filter: ass=/Users/marcus/Downloads/tester88888_auto.ass:fontsdir=/Users/marcus/Documents/mrbeast_caps/fonts


ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/8.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

[done] saved: /Users/marcus/Downloads/tester88888_mrbeast_inflate.mp4


[out#0/mp4 @ 0x155c07c20] video:12192KiB audio:165KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.058498%
frame=  210 fps=153 q=-0.0 Lsize=   12364KiB time=00:00:06.96 bitrate=14538.8kbits/s speed=5.06x elapsed=0:00:01.37    


In [14]:
# === CONFIG ==============================================================
# All user-tunable settings for this cell; the code below only reads them.
from pathlib import Path
INPUT_VIDEO       = Path.home() / "Downloads" / "reddit1_filmora_clipstore" / "tester88888.mp4"
MODEL_NAME        = "small.en"       # tiny/base/small/medium/large-v3; *.en faster for English

# Caption look
FONT_SIZE         = 180              # Fontsize written into the ASS style
UPPERCASE         = True             # upper-case every caption before writing it

# Outline controls
BORDER_PX         = 12.0             # base outline thickness (px)
SHADOW_PX         = 2.0              # shadow strength
BLUR_PX           = 0.0              # small blur reduces shimmer; try 0.3–0.8 if edges flicker
STABILIZE_OUTLINE = True             # when True, \bord is animated alongside scale-changing animations

# Timing hyperparams
MIN_CAPTION_SEC   = 0.30             # minimum on-screen time per caption (s), unless the next caption starts sooner
CUT_AHEAD_SEC     = 0.00             # end each caption this many seconds before the next one starts
TAIL_HOLD_SEC     = 1.20             # max extra time (s) a caption may linger after its last word

# Grouping hyperparams (control words/characters per caption)
MAX_WORDS_PER_CAP = 1                # words per caption
MAX_CHARS_PER_CAP = None             # optional character cap per caption; None disables it
MAX_GAP_SEC       = 1.20             # a pause longer than this (s) starts a new caption

# Animation
# ANIM choices: "none","fade","pop","zoom","bounce","slide_up","slide_down",
#               "slide_left","slide_right","rotate","inflate","inflate_soft"
ANIM              = "inflate"
ANIM_IN_MS        = 20000   # intro animation length in ms (deliberately long ramp); try 200–400 for subtle pop
ANIM_OUT_MS       = 50      # fade-out length in ms; only the "fade" animation uses it

# Fonts
CUSTOM_FONT_DIR   = Path.home() / "Documents" / "mrbeast_caps" / "fonts"

# Rendering safety
AUTO_PLAYRES      = True             # match ASS PlayRes to actual video resolution
YUV444_RENDER     = True             # render subs in 4:4:4 to stabilize edges, then downsample
# ========================================================================

import os, subprocess, shutil, platform, re, sys
from datetime import timedelta
from faster_whisper import WhisperModel

# ---------- probe video resolution ----------
def probe_resolution(video_path: Path) -> tuple[int, int]:
    """Return ``(width, height)`` of the first video stream of *video_path*.

    Runs ``ffprobe`` and extracts the first ``WIDTHxHEIGHT`` pair from its
    output.  If ffprobe is missing, exits with an error, or prints nothing
    usable, a warning is printed and the 1920x1080 default is returned, so
    this function never raises for those cases.
    """
    fallback = (1920, 1080)
    cmd = [
        "ffprobe","-v","error","-select_streams","v:0",
        "-show_entries","stream=width,height","-of","csv=s=x:p=0", str(video_path)
    ]
    try:
        out = subprocess.check_output(cmd, text=True)
    except (OSError, subprocess.SubprocessError, ValueError) as exc:
        # OSError: ffprobe not installed; SubprocessError: non-zero exit;
        # ValueError: output could not be decoded as text.
        print(f"[warn] ffprobe failed ({exc}); assuming {fallback[0]}x{fallback[1]}")
        return fallback

    # Be lenient about the output shape: take the first WxH pair instead of
    # requiring the output to be exactly "WxH" (extra lines or a trailing
    # separator would otherwise silently trigger the fallback).
    match = re.search(r"(\d+)x(\d+)", out)
    if match is None:
        print(f"[warn] could not parse ffprobe output {out!r}; "
              f"assuming {fallback[0]}x{fallback[1]}")
        return fallback

    width, height = int(match.group(1)), int(match.group(2))
    if width <= 0 or height <= 0:
        print(f"[warn] ffprobe reported {width}x{height}; "
              f"assuming {fallback[0]}x{fallback[1]}")
        return fallback
    return width, height

# Caption center in PlayRes coordinates.  Left as None here and filled in with
# the middle of the frame once the video resolution is known (see step 1 below).
CENTER_X, CENTER_Y = None, None

# ---------- load Whisper with a supported compute_type ----------
import platform as _pf
def load_whisper_auto(model_name: str):
    """Load a WhisperModel with the first compute type that is accepted.

    The candidate list depends on the OS (macOS starts with float16, other
    systems with int8_float16).  A ValueError from the constructor is treated
    as "compute type not supported": it is reported and the next candidate is
    tried.  Any other exception propagates immediately.  If every candidate
    fails, the last ValueError is re-raised.
    """
    if _pf.system() == "Darwin":
        compute_types = ["float16", "int8", "float32"]
    else:
        compute_types = ["int8_float16", "int8", "float16", "float32"]

    failure = None
    for compute_type in compute_types:
        try:
            print(f"[info] trying compute_type={compute_type} …")
            loaded = WhisperModel(model_name, compute_type=compute_type, device="auto")
        except ValueError as err:
            print(f"[skip] {err}")
            failure = err
        else:
            return loaded
    raise failure

# ---------- font: use any .ttf/.otf in CUSTOM_FONT_DIR ----------
def pick_custom_font(font_dir: Path):
    """Pick a font file from *font_dir* and determine its family name.

    The directory is created if missing.  Files ending in .ttf/.otf (any
    letter case) are considered; .ttf files are preferred over .otf and ties
    are broken alphabetically, so the same font is chosen on every run.

    The family name is read from the font's name table (ID 1, then ID 4) when
    the optional ``fontTools`` package is available and the file parses;
    otherwise the file stem is used.

    Returns ``(font_file, family)``.
    Raises SystemExit (after printing setup instructions) if no font is found.
    """
    font_dir.mkdir(parents=True, exist_ok=True)
    # Directory listing order is filesystem-dependent, so sort explicitly.
    candidates = sorted(
        (p for p in font_dir.iterdir()
         if p.is_file() and p.suffix.lower() in (".ttf", ".otf")),
        key=lambda p: (p.suffix.lower() != ".ttf", p.name.lower(), p.name),
    )
    if not candidates:
        print("\n[FONT SETUP REQUIRED]")
        print("1) Download any .ttf or .otf font.")
        print(f"2) Place it here: {font_dir}")
        print("3) Re-run.")
        raise SystemExit("[exit] No font found yet.")
    font_file = candidates[0]

    family = None
    try:
        from fontTools.ttLib import TTFont  # optional (pip install fonttools)
        tt = TTFont(font_file)
        try:
            names = {}
            for record in tt["name"].names:
                value = record.toUnicode()
                if value:
                    names[record.nameID] = value
        finally:
            tt.close()  # release the underlying file handle
        family = names.get(1) or names.get(4)
    except Exception:
        # Deliberate best effort: fontTools missing or font unreadable.
        family = None
    if not family:
        # Also covers a name table without IDs 1/4, which previously left
        # family as None and put the text "None" into the ASS style.
        family = font_file.stem

    print("[font] Using file:", font_file)
    print("[font] Font family set to:", family)
    return font_file, family

# ---------- ASS header template (filled later with resolution) ----------
# Head of the generated .ass file, filled in with str.format().  Placeholders:
#   {play_w}, {play_h} -> PlayResX / PlayResY
#   {font}, {size}     -> Fontname / Fontsize of the single "Beast" style
#   {border}, {shadow} -> Outline / Shadow of that style
# The style's Alignment field is 5, the same alignment anim_tag() requests via \an5.
# The text ends right after the [Events] Format line, so Dialogue lines are
# appended directly to it.
ASS_HEADER_TMPL = """[Script Info]
ScriptType: v4.00+
PlayResX: {play_w}
PlayResY: {play_h}
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
; White text (Primary), THICK black outline, subtle shadow for separation.
Style: Beast,{font},{size},&H00FFFFFF,&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,{border},{shadow},5,60,60,60,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

def _fmt_time(t: float) -> str:
    """Format *t* seconds as an ASS timestamp ``H:MM:SS.cc``.

    Negative inputs are clamped to zero; the value is rounded to the nearest
    centisecond.
    """
    clamped = timedelta(seconds=max(0.0, t))
    total_cs = int(round(clamped.total_seconds() * 100))
    hours = total_cs // 360000
    minutes = (total_cs // 6000) % 60
    seconds = (total_cs // 100) % 60
    centis = total_cs % 100
    return f"{hours}:{minutes:02d}:{seconds:02d}.{centis:02d}"

# compute start/end scales & animated \bord when stabilizing
def _scale_spec_for_anim(name: str):
    """Return ``(start_scale, end_scale)`` in percent for animation *name*.

    The lookup is case-insensitive and ``None``/empty counts as "none".
    Animations not listed below do not ramp the scale and map to (100, 100).
    """
    key = (name or "none").lower()
    scale_ramps = {
        "inflate": (80, 100),
        "inflate_soft": (80, 100),
        "pop": (80, 100),
        "zoom": (60, 100),
    }
    return scale_ramps.get(key, (100, 100))

def _fmt_float(x: float) -> str:
    """Render *x* with at most two decimals and no trailing zeros (for ASS tags)."""
    text = format(x, ".2f")
    # Drop zero padding first, then a decimal point left dangling by that.
    text = text.rstrip("0")
    return text.rstrip(".")

def anim_tag(cx: int, cy: int, name: str, in_ms: int, out_ms: int,
             border_px: float, stabilize_outline: bool, blur_px: float) -> str:
    """Build the ASS override block ``{...}`` that places and animates a caption.

    cx, cy:            anchor point (rounded to whole pixels); every tag uses \\an5.
    name:              animation name, case-insensitive; None or an unknown
                       name gives a static caption.
    in_ms:             duration of the intro animation in milliseconds.
    out_ms:            fade-out duration; only used by "fade".
    border_px:         outline width written as \\bord.
    stabilize_outline: for animations that ramp the font scale, start \\bord at
                       border_px * start_scale and animate it to
                       border_px * end_scale over the same in_ms window.
    blur_px:           optional \\blur amount; omitted when falsy or <= 0.

    Returns the override block including the surrounding braces.
    """
    name = (name or "none").lower()
    # snap to pixel to reduce subpixel shimmer
    cx, cy = int(round(cx)), int(round(cy))
    s0, s1 = _scale_spec_for_anim(name)

    # Outline follows the scale ramp in direct proportion when stabilizing.
    if stabilize_outline and (s0 != s1):
        bord_tag0 = rf"\bord{_fmt_float(border_px * (s0 / 100.0))}"
        bord_anim = rf"\t(0,{in_ms},\bord{_fmt_float(border_px * (s1 / 100.0))})"
    else:
        bord_tag0 = rf"\bord{_fmt_float(border_px)}"
        bord_anim = ""

    has_blur = bool(blur_px) and blur_px > 0
    blur_tag = rf"\blur{_fmt_float(blur_px)}" if has_blur else ""

    look = bord_tag0 + blur_tag
    pos = rf"\an5\pos({cx},{cy})" + look

    def move_from(x0, y0):
        # Slide from (x0, y0) to the anchor during the intro window.
        return rf"\an5\move({x0},{y0},{cx},{cy},0,{in_ms})" + look

    # Scale ramp shared by pop/zoom/inflate; the values come from
    # _scale_spec_for_anim so tags and outline stabilization cannot drift apart.
    grow = rf"\fscx{s0}\fscy{s0}\t(0,{in_ms},\fscx{s1}\fscy{s1})"

    if name == "fade":
        body = pos + rf"\fad({in_ms},{out_ms})"
    elif name in ("pop", "zoom", "inflate"):
        body = pos + grow + bord_anim
    elif name == "bounce":
        body = (move_from(cx, cy - 40)
                + rf"\fscx120\fscy120\t(0,120,\fscx95\fscy95)\t(120,{in_ms},\fscx100\fscy100)"
                + bord_anim)
    elif name == "slide_up":
        body = move_from(cx, cy + 60)
    elif name == "slide_down":
        body = move_from(cx, cy - 60)
    elif name == "slide_left":
        body = move_from(cx - 140, cy)
    elif name == "slide_right":
        body = move_from(cx + 140, cy)
    elif name == "rotate":
        body = pos + rf"\frz-12\t(0,{in_ms},\frz0)"
    elif name == "inflate_soft":
        # Starts blurred and slightly transparent, then settles on the
        # configured blur (previously it always ended at \blur0, which
        # discarded blur_px for the rest of the caption).
        end_blur = _fmt_float(blur_px) if has_blur else "0"
        body = (pos
                + rf"\fscx{s0}\fscy{s0}\alpha&H20&\blur2"
                + rf"\t(0,{in_ms},\fscx{s1}\fscy{s1}\alpha&H00&\blur{end_blur})"
                + bord_anim)
    else:
        # "none" and any unrecognised name: static caption.
        body = pos
    return "{" + body + "}"

# ---------- Group words into captions ----------
def clean_token(s: str) -> str:
    """Strip whitespace and non-word characters from both ends of *s*.

    A falsy input (``None``, ``""``) returns ``""``.  Characters in the middle
    of the token are never touched.
    """
    if not s:
        text = ""
    else:
        text = s.strip()
    without_space = re.sub(r"^\s+|\s+$", "", text)
    without_punct = re.sub(r"^\W+|\W+$", "", without_space)
    return without_punct

def group_words_to_captions(words,
                            max_words=1,
                            max_chars=None,
                            max_gap_s=0.6):
    """Group timed words into captions.

    words:     iterable of dicts with 'start', 'end' and 'text'.
    max_words: close the current caption once it already has this many words.
    max_chars: optional limit on the space-joined caption length; a lone
               token longer than the limit still forms its own caption.
    max_gap_s: a silence longer than this many seconds closes the caption.

    Each token goes through clean_token(); empty results are skipped.
    Returns a list of captions, each a list of
    {'start': float, 'end': float, 'text': str} dicts.
    """
    groups = []
    bucket = []
    previous_end = None
    for entry in words:
        token = clean_token(entry["text"])
        if not token:
            continue

        if previous_end is None:
            silence = 0.0
        else:
            silence = entry["start"] - previous_end
        joined_len = len(" ".join([kept["text"] for kept in bucket] + [token]).strip())

        gap_break = previous_end is not None and silence > max_gap_s
        count_break = bool(bucket) and len(bucket) >= max_words
        split_here = gap_break or count_break
        # Length is only checked when neither the pause nor the count forced a split.
        if (not split_here) and (max_chars is not None):
            split_here = joined_len > max_chars

        if split_here and bucket:
            groups.append(bucket)
            bucket = []

        bucket.append({"start": float(entry["start"]), "end": float(entry["end"]), "text": token})
        previous_end = float(entry["end"])

    if bucket:
        groups.append(bucket)
    return groups

def build_center_caption_events(lines,
                                play_w: int, play_h: int,
                                uppercase=True,
                                min_caption=0.30,
                                cut_ahead=0.00,
                                tail_hold=1.20,
                                anim="none",
                                in_ms=220,
                                out_ms=100,
                                border_px=12.0,
                                stabilize_outline=True,
                                blur_px=0.0):
    """Turn grouped captions into ASS "Dialogue:" lines centered in the frame.

    lines:          list of caption groups, each a non-empty list of
                    {'start','end','text'} dicts.
    play_w, play_h: script resolution; captions are anchored at its midpoint.
    min_caption:    minimum on-screen time (s), unless the next caption cuts it short.
    cut_ahead:      each caption ends this many seconds before the next starts.
    tail_hold:      a caption may linger up to this long after its last word,
                    but not past the next caption; the final caption always
                    gets the full hold.
    anim, in_ms, out_ms, border_px, stabilize_outline, blur_px:
                    passed through unchanged to anim_tag().

    Returns one Dialogue string (style "Beast") per caption group.
    """
    cx = int(round(play_w / 2))
    cy = int(round(play_h / 2))
    total = len(lines)
    dialogue = []
    for idx, group in enumerate(lines):
        start = float(group[0]["start"])
        spoken_end = float(group[-1]["end"])
        floor_end = max(spoken_end, start + min_caption)

        if idx + 1 >= total:
            # Nothing follows the last caption, so it takes the whole hold.
            end = max(spoken_end + tail_hold, start + min_caption)
        else:
            next_start = float(lines[idx + 1][0]["start"])
            room = max(0.0, next_start - spoken_end - cut_ahead)
            held = spoken_end + min(tail_hold, room)
            end = min(max(floor_end, held), next_start - cut_ahead)

        # Never emit a zero or negative duration.
        if end <= start:
            end = start + 0.05

        text = " ".join([item["text"] for item in group]).strip()
        if uppercase:
            text = text.upper()

        override = anim_tag(cx, cy, anim, in_ms, out_ms, border_px, stabilize_outline, blur_px)
        dialogue.append(f"Dialogue: 0,{_fmt_time(start)},{_fmt_time(end)},Beast,,0,0,0,,{override}{text}")
    return dialogue

# ---------- 1) Transcribe (word timestamps) ----------
video_path = Path(INPUT_VIDEO).expanduser().resolve()
# Explicit check rather than `assert`: assertions are stripped under `python -O`,
# and SystemExit matches how the other fatal setup problems in this cell are reported.
if not video_path.is_file():
    raise SystemExit(f"Video not found: {video_path}")
print("[info] video:", video_path)

# Determine ASS PlayRes + default center based on actual video dimensions
PLAY_W, PLAY_H = probe_resolution(video_path) if AUTO_PLAYRES else (1920, 1080)
# Only fill in the center when it has not been set explicitly.
if CENTER_X is None or CENTER_Y is None:
    CENTER_X, CENTER_Y = PLAY_W // 2, PLAY_H // 2
print(f"[info] PlayRes set to: {PLAY_W}x{PLAY_H} | Center=({CENTER_X},{CENTER_Y})")

print("[info] loading Whisper model …")
model = load_whisper_auto(MODEL_NAME)

print("[info] transcribing (word timestamps) …")
# Only the segments are needed; the second return value is discarded.
segments, _ = model.transcribe(str(video_path), vad_filter=True, word_timestamps=True)

# Flatten every segment's words into one list of {'start','end','text'} dicts.
words = []
for seg in segments:
    if not seg.words:
        continue  # segment carries no word-level data
    for w in seg.words:
        tok = (w.word or "").strip()
        if not tok:
            continue  # blank after stripping
        words.append({"start": float(w.start), "end": float(w.end), "text": tok})

print(f"[info] words captured: {len(words)}")

# ---------- 2) Choose font & build ASS ------------------------
# pick_custom_font returns (font file path, family name); the family name goes into the style.
font_file, FONT_NAME = pick_custom_font(CUSTOM_FONT_DIR)
# Fill the header template with the probed resolution, font and outline settings.
ASS_HEADER = ASS_HEADER_TMPL.format(
    play_w=PLAY_W, play_h=PLAY_H, font=FONT_NAME, size=FONT_SIZE,
    border=_fmt_float(BORDER_PX), shadow=_fmt_float(SHADOW_PX)
)

# Group the flat word list into captions according to the grouping settings.
caption_lines = group_words_to_captions(
    words,
    max_words=MAX_WORDS_PER_CAP,
    max_chars=MAX_CHARS_PER_CAP,
    max_gap_s=MAX_GAP_SEC
)
print(f"[info] caption groups built: {len(caption_lines)} "
      f"(max_words={MAX_WORDS_PER_CAP}, max_chars={MAX_CHARS_PER_CAP}, max_gap_s={MAX_GAP_SEC})")

# One "Dialogue:" line per caption group.  The center is derived inside the
# function from play_w/play_h; CENTER_X/CENTER_Y are not passed here.
ass_events = build_center_caption_events(
    caption_lines,
    play_w=PLAY_W, play_h=PLAY_H,
    uppercase=UPPERCASE,
    min_caption=MIN_CAPTION_SEC,
    cut_ahead=CUT_AHEAD_SEC,
    tail_hold=TAIL_HOLD_SEC,
    anim=ANIM,
    in_ms=ANIM_IN_MS,
    out_ms=ANIM_OUT_MS,
    border_px=BORDER_PX,
    stabilize_outline=STABILIZE_OUTLINE,
    blur_px=BLUR_PX
)
# Header followed by the events, one per line.
ass_text = ASS_HEADER + "\n".join(ass_events)

# write ASS + output into Downloads
# Replace every run of characters outside [A-Za-z0-9_.-] in the stem with "_"
# so the generated file names contain no spaces or special characters.
safe_stem = re.sub(r'[^A-Za-z0-9_.-]+', '_', video_path.stem)
downloads = Path.home() / "Downloads"
ass_path = downloads / f"{safe_stem}_auto.ass"
ass_path.write_text(ass_text, encoding="utf-8")
print("[info] wrote ASS:", ass_path)
print("[check] FONT =", FONT_NAME, "| SIZE =", FONT_SIZE, "| ANIM =", ANIM,
      "| STABILIZE =", STABILIZE_OUTLINE, "| BORDER =", BORDER_PX, "| BLUR =", BLUR_PX)

# ---------- 3) Burn captions with FFmpeg (output to Downloads) ----------------
if not shutil.which("ffmpeg"):
    raise SystemExit("FFmpeg not found on PATH. Install it and rerun.")


def _ffmpeg_filter_escape(value: str) -> str:
    """Escape *value* for use as one option value inside an FFmpeg ``-vf`` graph.

    FFmpeg parses a filter description in two passes, so two rounds of
    backslash escaping are applied.  Strings without special characters are
    returned unchanged.
    """
    # Level 1 (option value): backslash, single quote and the ':' option separator.
    value = re.sub(r"([\\':])", r"\\\1", value)
    # Level 2 (filtergraph description): backslash, quote and the graph syntax characters.
    return re.sub(r"([\\'\[\],;])", r"\\\1", value)


# Paths are escaped so that characters such as ':' (e.g. a Windows drive
# letter) or ',' cannot be mistaken for filter syntax.
ass_opt = _ffmpeg_filter_escape(ass_path.as_posix())
fonts_opt = _ffmpeg_filter_escape(CUSTOM_FONT_DIR.as_posix())
fontsdir_arg = f":fontsdir={fonts_opt}"

# Render in 4:4:4 for crisp subtitles, then convert to standard 4:2:0
if YUV444_RENDER:
    vf_arg = f"format=yuv444p,ass={ass_opt}{fontsdir_arg},format=yuv420p"
else:
    vf_arg = f"ass={ass_opt}{fontsdir_arg}"

out_video = downloads / f"{safe_stem}_mrbeast_{ANIM}.mp4"
# h264_videotoolbox on macOS ("Darwin"), libx264 everywhere else.
vcodec = "h264_videotoolbox" if platform.system() == "Darwin" else "libx264"

# Video is re-encoded with the captions burned in; the audio stream is copied.
# NOTE(review): -preset/-crf are x264-style options — verify h264_videotoolbox honours them.
cmd = [
    "ffmpeg", "-y",
    "-i", str(video_path),
    "-vf", vf_arg,
    "-c:v", vcodec, "-preset", "veryfast", "-crf", "18",
    "-c:a", "copy",
    str(out_video)
]

print("[info] running FFmpeg with filter:", vf_arg)
# check=True: a non-zero FFmpeg exit status raises CalledProcessError.
subprocess.run(cmd, check=True)
print("[done] saved:", out_video)


[info] video: /Users/marcus/Downloads/reddit1_filmora_clipstore/tester88888.mp4
[info] PlayRes set to: 1080x1920 | Center=(540,960)
[info] loading Whisper model …
[info] trying compute_type=float16 …
[skip] Requested float16 compute type, but the target device or backend do not support efficient float16 computation.
[info] trying compute_type=int8 …
[info] transcribing (word timestamps) …
[info] words captured: 22
[font] Using file: /Users/marcus/Documents/mrbeast_caps/fonts/KOMIKAX_.ttf
[font] Font family set to: Komika Axis
[info] caption groups built: 22 (max_words=1, max_chars=None, max_gap_s=1.2)
[info] wrote ASS: /Users/marcus/Downloads/tester88888_auto.ass
[check] FONT = Komika Axis | SIZE = 210 | ANIM = inflate | STABILIZE = True | BORDER = 12.0 | BLUR = 0.0
[info] running FFmpeg with filter: format=yuv444p,ass=/Users/marcus/Downloads/tester88888_auto.ass:fontsdir=/Users/marcus/Documents/mrbeast_caps/fonts,format=yuv420p


ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/8.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

[done] saved: /Users/marcus/Downloads/tester88888_mrbeast_inflate.mp4


[out#0/mp4 @ 0x10b204080] video:12099KiB audio:165KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.058941%
frame=  210 fps=108 q=-0.0 Lsize=   12271KiB time=00:00:06.96 bitrate=14429.6kbits/s speed=3.58x elapsed=0:00:01.94    
