
# Alethea AI — Track B Pipeline (Colab Notebook)

This notebook implements the **Track B** requirements:
1) Prompt an LLM for continuous text  
2) Call TTS per chunk  
3) Run **Wav2Lip** per chunk  
4) Stitch segments with **clean cross-fades** (FFmpeg/moviepy)  
5) **Normalise audio** levels (EBU R128 via FFmpeg `loudnorm`)

**Bonus**: CLI-like args section for segment length, simple retry logic for network calls, and a test that verifies identical FPS via `ffprobe`.


In [2]:

#@title Install dependencies & set up Wav2Lip
# If running in Colab, you can uncomment the next line
# !apt -y install ffmpeg

# Core Python libs
!pip -q install transformers==4.42.4 torch torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
!pip -q install gTTS==2.5.3 moviepy==1.0.3 numpy==1.26.4 librosa==0.10.2.post1 soundfile==0.12.1 tqdm==4.66.4 pydub==0.25.1

# Wav2Lip setup
import os, sys, shutil, subprocess, json, math, random, time, textwrap, glob, shlex, tempfile, re, pathlib
from pathlib import Path

ROOT = Path.cwd()
W2L_DIR = ROOT / "Wav2Lip"
if not W2L_DIR.exists():
    !git clone -q https://github.com/Rudrabha/Wav2Lip.git
    %cd Wav2Lip
    !pip -q install -r requirements.txt
    %cd ..

# Download pretrained model weights
W2L_CKPT = W2L_DIR / "checkpoints/wav2lip_gan.pth"
W2L_DIR.joinpath("checkpoints").mkdir(exist_ok=True)
if not W2L_CKPT.exists():
    !gdown -q --id 1l5l5qG8G4VqkI3QOcgpFv3G1qX3i6l2b -O Wav2Lip/checkpoints/wav2lip_gan.pth || echo "If gdown fails, please manually place the checkpoint."

print("Setup complete. Wav2Lip dir:", W2L_DIR)


Setup complete. Wav2Lip dir: /content/Wav2Lip


In [3]:
# Clean reinstall NumPy to a known-good build
%pip uninstall -y numpy
%pip install --no-cache-dir "numpy==1.26.4"

# Optional: pin a compatible stack (do AFTER the restart)
# %pip install --no-cache-dir "transformers==4.42.4" "tokenizers<0.20" "torch==2.3.1" "torchaudio==2.3.1" -U


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m157.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is

In [25]:
%pip install --no-cache-dir "librosa==0.10.2.post1" "numba>=0.58" "llvmlite>=0.41" "soundfile" "audioread" "pooch"


Collecting librosa==0.10.2.post1
  Downloading librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting numba>=0.58
  Downloading numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting llvmlite>=0.41
  Downloading llvmlite-0.44.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)
Downloading librosa-0.10.2.post1-py3-none-any.whl (260 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.1/260.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading llvmlite-0.44.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling coll

In [49]:

#@title Helper utilities (run shell, retry, ffprobe)
import subprocess, shlex, json, os, sys, time
from functools import wraps
from typing import List

def run_cmd(cmd: str, check=True):
    print(">>", cmd)
    proc = subprocess.run(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    print(proc.stdout)
    if check and proc.returncode != 0:
        raise RuntimeError(f"Command failed: {cmd}")
    return proc.stdout

def retry(max_attempts=3, delay=2.0):
    def deco(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            last = None
            for i in range(max_attempts):
                try:
                    return fn(*args, **kwargs)
                except Exception as e:
                    last = e
                    print(f"[retry] Attempt {i+1}/{max_attempts} failed: {e}")
                    time.sleep(delay)
            raise last
        return wrapper
    return deco

def ffprobe_fps(video_path: str) -> float:
    cmd = f'ffprobe -v error -select_streams v:0 -show_entries stream=r_frame_rate -of json "{video_path}"'
    out = run_cmd(cmd, check=False)
    try:
        info = json.loads(out)
        rate = info["streams"][0]["r_frame_rate"]
        num, den = rate.split("/")
        num, den = int(num), int(den)
        return num / den if den else float(num)
    except Exception:
        return -1.0


In [50]:

#@title Configuration (acts like CLI flags)
from dataclasses import dataclass
import os

@dataclass
class Config:
    prompt: str = "Tell me a short, uplifting story about perseverance and learning."  #@param {type:"string"}
    segments: int = 4 #@param {type:"integer"}
    segment_chars: int = 200 #@param {type:"integer"}
    voice_lang: str = "en" #@param ["en"]
    face_image_path: str = "/content/face.jpg" #@param {type:"string"}
    output_dir: str = "outputs" #@param {type:"string"}
    crossfade_s: float = 0.75 #@param {type:"number"}
    target_fps: int = 25 #@param {type:"integer"}
    audio_lufs: float = -16.0 #@param {type:"number"}

CFG = Config()
os.makedirs(CFG.output_dir, exist_ok=True)
print(CFG)


Config(prompt='Tell me a short, uplifting story about perseverance and learning.', segments=4, segment_chars=200, voice_lang='en', face_image_path='/content/face.jpg', output_dir='outputs', crossfade_s=0.75, target_fps=25, audio_lufs=-16.0)



## 1) Generate continuous text with an LLM

We use a small, local-friendly model (`distilgpt2`) via `transformers` to avoid paid APIs in Colab.


In [51]:

#@title LLM generation (local, offline)
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)

@retry(max_attempts=2, delay=1.5)
def generate_continuous_text(prompt: str, approx_chars: int) -> str:
    out = gen(prompt, max_new_tokens=max(64, approx_chars//2), do_sample=True, top_p=0.95, temperature=0.9)[0]["generated_text"]
    # Basic clean-up
    return out.strip()

total_chars = CFG.segment_chars * CFG.segments + 80
full_text = generate_continuous_text(CFG.prompt, total_chars)
print("Generated length:", len(full_text))
print(full_text[:600], "...")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated length: 1999
Tell me a short, uplifting story about perseverance and learning. I hope you would like to share it with your friends and family!

So many stories have been submitted on social media, and are very interesting to follow. Some are more about the past and present, and others are more about the future. But these stories do not begin with a real story about a kid. It is about learning to be a part of what makes a great family, and of building relationships with loved ones and friends.
As this story continues, we all will have the same questions as we do now. It will be fun to see what has happened  ...


In [52]:

#@title Split into segments
def split_text(text: str, n_segments: int, segment_chars: int) -> list:
    chunks = []
    buf = text.strip()
    while buf and len(chunks) < n_segments:
        chunk = buf[:segment_chars]
        # try not to cut in middle of sentence
        last = max(chunk.rfind("."), chunk.rfind("!"), chunk.rfind("?"))
        if last > int(segment_chars * 0.5):
            chunk = chunk[:last+1]
        chunks.append(chunk.strip())
        buf = buf[len(chunk):].lstrip()
    if len(chunks) < n_segments and buf:
        chunks.append(buf[:segment_chars].strip())
    return chunks

segments = split_text(full_text, CFG.segments, CFG.segment_chars)
for i, s in enumerate(segments):
    print(f"[Seg {i}] {len(s)} chars: {s[:120]}{'...' if len(s)>120 else ''}")


[Seg 0] 129 chars: Tell me a short, uplifting story about perseverance and learning. I hope you would like to share it with your friends an...
[Seg 1] 168 chars: So many stories have been submitted on social media, and are very interesting to follow. Some are more about the past an...
[Seg 2] 184 chars: But these stories do not begin with a real story about a kid. It is about learning to be a part of what makes a great fa...
[Seg 3] 160 chars: As this story continues, we all will have the same questions as we do now. It will be fun to see what has happened and l...



## 2) TTS per chunk (gTTS)

Uses Google's free TTS (no key). If your environment blocks it, replace with Coqui TTS or any other engine.


In [53]:
#@title Synthesize TTS with gTTS
from pathlib import Path
import os, time
from gtts import gTTS

# ensure output dir exists
outdir = Path(CFG.output_dir)
outdir.mkdir(parents=True, exist_ok=True)

def tts_with_retry(text, lang, mp3_path, attempts=3, delay=1.5):
    for i in range(attempts):
        try:
            gTTS(text=text, lang=lang).save(str(mp3_path))
            return
        except Exception as e:
            print(f"[gTTS] attempt {i+1}/{attempts} failed: {e}")
            time.sleep(delay)
    raise RuntimeError("gTTS failed after retries")

audio_paths = []
for i, text in enumerate(segments):
    tmp_mp3 = outdir / f"seg_{i:03d}.mp3"
    out_wav = outdir / f"seg_{i:03d}.wav"
    tts_with_retry(text, CFG.voice_lang, tmp_mp3)
    run_cmd(f'ffmpeg -y -i "{tmp_mp3}" -ar 16000 -ac 1 "{out_wav}"')
    audio_paths.append(str(out_wav))

print("Audio files:", audio_paths)


>> ffmpeg -y -i "outputs/seg_000.mp3" -ar 16000 -ac 1 "outputs/seg_000.wav"
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --ena


## 3) Run Wav2Lip per chunk

- Provide a single face image (`CFG.face_image_path`).  
- We create a silent looped video from the still image at `CFG.target_fps`, long enough for each chunk's audio, then run Wav2Lip.


In [58]:
IMAGE = "/content/face.jpg"
!wget -O /content/face.jpg https://thispersondoesnotexist.com/

# To use your own image:
# from google.colab import files; files.upload()
# IMAGE = "/content/your_image.jpg"
import os; assert os.path.isfile(IMAGE), "Image missing"; print("Using:", IMAGE)

--2025-08-12 18:43:31--  https://thispersondoesnotexist.com/
Resolving thispersondoesnotexist.com (thispersondoesnotexist.com)... 172.67.130.241, 104.21.3.164, 2606:4700:3032::6815:3a4, ...
Connecting to thispersondoesnotexist.com (thispersondoesnotexist.com)|172.67.130.241|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 529359 (517K) [image/jpeg]
Saving to: ‘/content/face.jpg’


2025-08-12 18:43:32 (1.09 MB/s) - ‘/content/face.jpg’ saved [529359/529359]

Using: /content/face.jpg


In [None]:
%cd ..
!git clone https://github.com/Rudrabha/Wav2Lip.git
%cd Wav2Lip
!pip install -r requirements.txt
%cd ..
!mkdir -p Wav2Lip/checkpoints
!wget -O Wav2Lip/checkpoints/wav2lip_gan.pth "https://github.com/Rudrabha/Wav2Lip/releases/download/v1.0/wav2lip_gan.pth"


/content
fatal: destination path 'Wav2Lip' already exists and is not an empty directory.
/content/Wav2Lip
Collecting librosa==0.7.0 (from -r requirements.txt (line 1))
  Using cached librosa-0.7.0.tar.gz (1.6 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.17.1 (from -r requirements.txt (line 2))
  Using cached numpy-1.17.1.zip (6.5 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
[31mERROR: Ignored the following yanked versions: 3.4.11.39, 3.4.17.61, 4.4.0.42, 4.4.0.44, 4.5.4.58, 4.5.5.62, 4.7.0.68[0m[31m
[0m[31mERROR: Ignored the following versions that require a different python version: 1.21.2 Requires-Python >=3.7,<3.11; 1.21.3 Requires-Python >=3.7,<3.11; 1.21.4 Requires-Python >=3.7,<3.11; 1.21.5 Requires-Python >=3.7,<3.11; 1.21.6 Requires-Python >=3.7,<3.11[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement opencv-python==4.1.0.25 (from versions: 3.4.0.14, 3.4.10.37, 3.4.11.41, 3.4.11.43, 3.4.11.45, 3.4.13

In [None]:
%cd ..

/content


In [59]:
from pathlib import Path
import os

# Define paths (adjust if your Wav2Lip repo is in another location)
ROOT = Path.cwd()
W2L_DIR = ROOT / "Wav2Lip"
W2L_CKPT = W2L_DIR / "checkpoints" / "wav2lip_gan.pth"
print(W2L_CKPT)

# Safety check
if not W2L_DIR.exists():
    raise FileNotFoundError(f"Wav2Lip directory not found at {W2L_DIR}. Please clone it first.")
if not W2L_CKPT.exists():
    raise FileNotFoundError(f"Checkpoint not found at {W2L_CKPT}. Please download it.")


/content/Wav2Lip/checkpoints/wav2lip_gan.pth


In [65]:
# 1) switch into the repo and ensure temp/ exists
%cd /content/Wav2Lip
!mkdir -p temp

# 2) run inference (use absolute paths for face/audio/outfile)
!python inference.py \
  --checkpoint_path "checkpoints/wav2lip_gan.pth" \
  --face "/content/face.jpg" \
  --audio "/content/outputs/seg_000.wav" \
  --outfile "/content/outputs/seg_000.mp4" \
  --static True --resize_factor 2 --pads 0 10 0 0 --nosmooth

# 3) go back (optional)
%cd /content


/content/Wav2Lip
Using cpu for inference.
Number of frames available for inference: 1
(80, 701)
Length of mel chunks: 216
  0% 0/2 [00:00<?, ?it/s]
  0% 0/1 [00:00<?, ?it/s][A
100% 1/1 [00:18<00:00, 18.38s/it]
Load checkpoint from: checkpoints/wav2lip_gan.pth
Model loaded
100% 2/2 [01:06<00:00, 33.49s/it]
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopen

In [62]:
from pathlib import Path
import subprocess, shlex

# Make run_cmd support cwd
def run_cmd(cmd: str, check=True, cwd=None):
    print(">>", cmd)
    proc = subprocess.run(
        shlex.split(cmd),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        cwd=cwd
    )
    print(proc.stdout)
    if check and proc.returncode != 0:
        raise RuntimeError(f"Command failed: {cmd}")
    return proc.stdout

ROOT = Path("/content")
W2L_DIR = ROOT / "Wav2Lip"
W2L_CKPT = W2L_DIR / "checkpoints" / "wav2lip_gan.pth"
OUTDIR = ROOT / "outputs"

# ensure the repo-local temp directory exists (this is what inference.py expects)
(W2L_DIR / "temp").mkdir(parents=True, exist_ok=True)

def run_wav2lip_img(face_img: str, audio_wav: str, out_mp4: str):
    infer_py = str((W2L_DIR / "inference.py").resolve())
    cmd = (
        f'python "{infer_py}" '
        f'--checkpoint_path "{W2L_CKPT.resolve()}" '
        f'--face "{Path(face_img).resolve()}" '
        f'--audio "{Path(audio_wav).resolve()}" '
        f'--outfile "{Path(out_mp4).resolve()}" '
        f'--static True --nosmooth '
        f'--pads 0 15 0 0 '
        f'--resize_factor 2 '
        f'--wav2lip_batch_size 2 '
        f'--face_det_batch_size 1'
    )
    # IMPORTANT: run from inside the repo so relative "temp/" works
    run_cmd(cmd, cwd=str(W2L_DIR))


In [64]:
# Re-run just the first segment to confirm
run_wav2lip("/content/outputs/seg_003.mp4",
            "/content/outputs/seg_003.wav",
            "/content/outputs/seg_003.mp4")


>> python "/content/Wav2Lip/inference.py" --checkpoint_path "/content/Wav2Lip/checkpoints/wav2lip_gan.pth" --face "/content/outputs/seg_000.mp4" --audio "/content/outputs/seg_000.wav" --outfile "/content/outputs/seg_000.mp4" --static True --nosmooth --pads 0 10 0 0 --resize_factor 1 --wav2lip_batch_size 4 --face_det_batch_size 1
Using cpu for inference.
Reading video frames...
Number of frames available for inference: 298
(80, 701)
Length of mel chunks: 216

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

100%|██████████| 1/1 [00:16<00:00, 16.77s/it][A
100%|██████████| 1/1 [00:16<00:00, 16.77s/it]
[ WARN:0@23.026] global cap.cpp:779 open VIDEOIO(CV_IMAGES): raised OpenCV exception:

OpenCV(4.12.0) /io/opencv/modules/videoio/src/cap_images.cpp:415: error: (-215:Assertion failed) !filename_pattern.empty() in function 'CvVideoWriter_Images'



  2%|▏         | 1/54 [00:18<16:27, 18.63s/it]
  4%|▎         | 2/54 [00:19<07:00,  8.09s/it]
  6%|▌         | 3

In [66]:
from pathlib import Path
import subprocess, shlex

# --- basics / paths ---
ROOT = Path("/content")
W2L_DIR = ROOT / "Wav2Lip"
W2L_CKPT = W2L_DIR / "checkpoints" / "wav2lip_gan.pth"
OUTDIR = Path(CFG.output_dir).resolve()

print("W2L_DIR:", W2L_DIR)
print("Checkpoint exists?", W2L_CKPT.exists())
print("Output dir:", OUTDIR)

# --- run shell with optional working dir ---
def run_cmd(cmd: str, check=True, cwd=None):
    print(">>", cmd)
    proc = subprocess.run(
        shlex.split(cmd),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        cwd=cwd
    )
    print(proc.stdout)
    if check and proc.returncode != 0:
        raise RuntimeError(f"Command failed: {cmd}")
    return proc.stdout

# Ensure the repo-local temp dir exists (inference.py writes there)
(W2L_DIR / "temp").mkdir(parents=True, exist_ok=True)

# Detect whether this repo expects --static as a flag or value
def _static_takes_value(infer_py: str) -> bool:
    proc = subprocess.run(shlex.split(f'python "{infer_py}" -h'),
                          stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    help_txt = proc.stdout or ""
    return "[--static STATIC]" in help_txt or "--static STATIC" in help_txt

# --- main Wav2Lip runner: use the FACE IMAGE directly ---
def run_wav2lip_img(face_img: str, audio_wav: str, out_mp4: str):
    infer_py = str((W2L_DIR / "inference.py").resolve())
    ckpt_abs = str(W2L_CKPT.resolve())
    face_abs = str(Path(face_img).resolve())
    wav_abs  = str(Path(audio_wav).resolve())
    out_abs  = str(Path(out_mp4).resolve())

    flags = [
        "--nosmooth",
        "--pads", "0", "15", "0", "0",   # a little forehead room helps
        "--resize_factor", "2",          # more stable on CPU
        "--wav2lip_batch_size", "2",
        "--face_det_batch_size", "1",
    ]
    if _static_takes_value(infer_py):
        flags = ["--static", "True"] + flags
    else:
        flags = ["--static"] + flags

    cmd = (
        f'python "{infer_py}" '
        f'--checkpoint_path "{ckpt_abs}" '
        f'--face "{face_abs}" '
        f'--audio "{wav_abs}" '
        f'--outfile "{out_abs}" '
        + " ".join(flags)
    )
    # Run from inside the repo so relative temp/ paths work
    run_cmd(cmd, cwd=str(W2L_DIR))

# --- generate each segment video ---
face_abs = Path(CFG.face_image_path).resolve()
if not face_abs.exists():
    raise FileNotFoundError(f"Face image not found: {face_abs}")

video_paths = []
for i in range(len(audio_paths)):
    wav = OUTDIR / f"seg_{i:03d}.wav"
    out_mp4 = OUTDIR / f"seg_{i:03d}.mp4"
    print(f"Rendering seg_{i:03d} ->", out_mp4)
    run_wav2lip_img(str(face_abs), str(wav), str(out_mp4))
    video_paths.append(str(out_mp4.resolve()))

print("Wav2Lip videos:", video_paths)


W2L_DIR: /content/Wav2Lip
Checkpoint exists? True
Output dir: /content/outputs
Rendering seg_000 -> /content/outputs/seg_000.mp4
>> python "/content/Wav2Lip/inference.py" --checkpoint_path "/content/Wav2Lip/checkpoints/wav2lip_gan.pth" --face "/content/face.jpg" --audio "/content/outputs/seg_000.wav" --outfile "/content/outputs/seg_000.mp4" --static True --nosmooth --pads 0 15 0 0 --resize_factor 2 --wav2lip_batch_size 2 --face_det_batch_size 1
Using cpu for inference.
Number of frames available for inference: 1
(80, 701)
Length of mel chunks: 216

  0%|          | 0/108 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

100%|██████████| 1/1 [00:18<00:00, 18.45s/it][A
100%|██████████| 1/1 [00:18<00:00, 18.45s/it]

  1%|          | 1/108 [00:20<35:42, 20.02s/it]
  2%|▏         | 2/108 [00:20<14:59,  8.49s/it]
  3%|▎         | 3/108 [00:20<08:24,  4.81s/it]
  4%|▎         | 4/108 [00:21<05:20,  3.08s/it]
  5%|▍         | 5/108 [00:21<03:38,  2.12s/it]
  6%|▌         | 6/108 [00


## 4) Stitch with clean cross-fades (video + audio)
We use **FFmpeg** `xfade` for video and `acrossfade` for audio to ensure smooth transitions.


In [67]:
from pathlib import Path
import json, subprocess, shlex

OUTDIR = Path("/content/outputs")

def has_audio(p: Path) -> bool:
    cmd = f'ffprobe -v error -select_streams a -show_entries stream=codec_type -of json "{p}"'
    out = subprocess.run(shlex.split(cmd), stdout=subprocess.PIPE, text=True).stdout
    try:
        j = json.loads(out)
        return bool(j.get("streams"))
    except:
        return False

# Build fixed_paths from your segments and ensure each has audio (remux if needed)
fixed_paths = []
i = 0
while True:
    mp4 = OUTDIR / f"seg_{i:03d}.mp4"
    wav = OUTDIR / f"seg_{i:03d}.wav"
    if not mp4.exists(): break
    if not has_audio(mp4) and wav.exists():
        mp4_aud = OUTDIR / f"seg_{i:03d}_aud.mp4"
        subprocess.run(shlex.split(
            f'ffmpeg -y -i "{mp4}" -i "{wav}" -shortest -map 0:v:0 -map 1:a:0 -c:v copy -c:a aac -b:a 192k "{mp4_aud}"'
        ), check=True)
        fixed_paths.append(str(mp4_aud))
    else:
        fixed_paths.append(str(mp4))
    i += 1

print("Segments to stitch:", fixed_paths)
assert fixed_paths, "No segment videos found. Generate seg_000.mp4, seg_001.mp4, ... first."


Segments to stitch: ['/content/outputs/seg_000.mp4', '/content/outputs/seg_001.mp4', '/content/outputs/seg_002.mp4', '/content/outputs/seg_003.mp4']


In [68]:
stitched_path = str((OUTDIR / "stitched_raw.mp4").resolve())
stitch_with_crossfades(fixed_paths, crossfade_s=CFG.crossfade_s, out_path=stitched_path, target_fps=CFG.target_fps)
print("Stitched:", stitched_path)


ffmpeg -y -i "/content/outputs/seg_000.mp4" -i "/content/outputs/seg_001.mp4" -i "/content/outputs/seg_002.mp4" -i "/content/outputs/seg_003.mp4" -filter_complex "[0:v]fps=25,format=yuv420p,setpts=PTS-STARTPTS[v0];[0:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo,asetpts=PTS-STARTPTS[a0];[1:v]fps=25,format=yuv420p,setpts=PTS-STARTPTS[v1];[1:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo,asetpts=PTS-STARTPTS[a1];[2:v]fps=25,format=yuv420p,setpts=PTS-STARTPTS[v2];[2:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo,asetpts=PTS-STARTPTS[a2];[3:v]fps=25,format=yuv420p,setpts=PTS-STARTPTS[v3];[3:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo,asetpts=PTS-STARTPTS[a3];[v0][v1]xfade=transition=fade:duration=0.75:offset=8.010[v1];[a0][a1]acrossfade=d=0.75:c1=tri:c2=tri[a1];[v1][v2]xfade=transition=fade:duration=0.75:offset=18.612[v2];[a1][a2]acrossfade=d=0.75:c1=tri:c2=tri[a2];[v2][v3]xfade=transition=fade:dura


## 5) Loudness normalisation (EBU R128)
We extract, normalise, and re-mux audio to avoid re-encoding video again.


In [69]:
tmp_audio = str((OUTDIR / "tmp_audio.wav").resolve())
norm_audio = str((OUTDIR / "tmp_audio_norm.wav").resolve())
final_path = str((OUTDIR / "final_normalised.mp4").resolve())
final_fast = str((OUTDIR / "final_faststart.mp4").resolve())

# extract
subprocess.run(shlex.split(f'ffmpeg -y -i "{stitched_path}" -vn -ac 2 -ar 44100 "{tmp_audio}"'), check=True)

# normalize
subprocess.run(shlex.split(
    f'ffmpeg -y -i "{tmp_audio}" -af "loudnorm=I={CFG.audio_lufs}:TP=-1.5:LRA=11:print_format=summary" "{norm_audio}"'
), check=True)

# remux normalized audio with copy of video
subprocess.run(shlex.split(
    f'ffmpeg -y -i "{stitched_path}" -i "{norm_audio}" -map 0:v:0 -map 1:a:0 -c:v copy -c:a aac -b:a 192k "{final_path}"'
), check=True)

# faststart for playback
subprocess.run(shlex.split(
    f'ffmpeg -y -i "{final_path}" -c copy -movflags +faststart "{final_fast}"'
), check=True)

print("Final video:", final_fast)


Final video: /content/outputs/final_faststart.mp4


In [72]:
!ffmpeg -y -i "/content/outputs/stitched_raw.mp4" \
  -vf "scale=1280:720:flags=bicubic,format=yuv420p" -r 25 \
  -c:v libx264 -profile:v high -level 4.0 -preset veryfast -crf 20 \
  -c:a aac -b:a 160k -ar 44100 -ac 2 \
  -movflags +faststart "/content/outputs/final_windows.mp4"


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [71]:
from google.colab import files
files.download("/content/outputs/final_faststart.mp4")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


## Bonus: Verify identical FPS across all Wav2Lip segments


In [70]:

#@title Verify identical FPS via ffprobe
fps_values = []
for v in video_paths:
    fps = ffprobe_fps(v)
    fps_values.append((v, fps))
fps_values


>> ffprobe -v error -select_streams v:0 -show_entries stream=r_frame_rate -of json "/content/outputs/seg_000.mp4"
{
    "programs": [

    ],
    "streams": [
        {
            "r_frame_rate": "25/1"
        }
    ]
}

>> ffprobe -v error -select_streams v:0 -show_entries stream=r_frame_rate -of json "/content/outputs/seg_001.mp4"
{
    "programs": [

    ],
    "streams": [
        {
            "r_frame_rate": "25/1"
        }
    ]
}

>> ffprobe -v error -select_streams v:0 -show_entries stream=r_frame_rate -of json "/content/outputs/seg_002.mp4"
{
    "programs": [

    ],
    "streams": [
        {
            "r_frame_rate": "25/1"
        }
    ]
}

>> ffprobe -v error -select_streams v:0 -show_entries stream=r_frame_rate -of json "/content/outputs/seg_003.mp4"
{
    "programs": [

    ],
    "streams": [
        {
            "r_frame_rate": "25/1"
        }
    ]
}



[('/content/outputs/seg_000.mp4', 25.0),
 ('/content/outputs/seg_001.mp4', 25.0),
 ('/content/outputs/seg_002.mp4', 25.0),
 ('/content/outputs/seg_003.mp4', 25.0)]


### How to use
1. Upload a **front-facing face image** and set `CFG.face_image_path` accordingly.  
2. Tweak `CFG.prompt`, `CFG.segments`, and `CFG.segment_chars`.  
3. Run the notebook cells in order.  
4. Your final video will be at `outputs/final_normalised.mp4`.

If `gTTS` fails in your environment, swap it with **Coqui TTS** or **Piper**.
