In [27]:
import tempfile
import os
import whisperx
import re
import difflib

# Install: pip install demucs
import torch
import torchaudio
from demucs.pretrained import get_model
from demucs.apply import apply_model

# Audio file to align; passed to align_lyrics_to_audio() below.
song_path = 'conviction.wav'
# Reference lyrics. Each non-empty line is treated as one phrase by
# _split_lyrics_into_phrases(), so line breaks matter.
lyrics_text = """
Conviction in a couple different colors
Shoot a cannon at yourself
Or it's a credo that you holler
"""

# Options
# When True, align_lyrics_to_audio() runs extract_vocals() first and
# transcribes the separated vocal stem instead of the full mix.
use_vocal_separation = True
# Model name handed to whisperx.load_model().
whisperx_model_size = "base.en"
# Device string handed to the WhisperX load/align calls.
device = "cpu"


def extract_vocals(audio_path):
    """Separate the vocal stem of a mix with Demucs and save it as a temp WAV.

    Before separation the waveform is resampled to ``model.samplerate`` and
    adjusted to ``model.audio_channels`` so the network sees the kind of input
    it declares. The vocal stem is looked up by name in ``model.sources``
    instead of relying on a fixed index.

    Args:
        audio_path: Path of the audio file, loaded with ``torchaudio.load``.

    Returns:
        Path of a newly created temporary ``.wav`` file containing the vocals.
        The file is created with ``delete=False``, so the caller owns it and
        should remove it when finished.

    Raises:
        ValueError: If the loaded model exposes no ``'vocals'`` source.
    """
    print("Separating vocals from music with Demucs...")

    # Load model (separation is always run on the CPU here)
    model = get_model('htdemucs')
    model.cpu()
    model.eval()

    # Load audio: wav is (channels, samples)
    wav, sr = torchaudio.load(audio_path)

    # Bring the input to the sample rate the model declares; feeding audio at
    # another rate shifts every frequency the network sees.
    if sr != model.samplerate:
        wav = torchaudio.functional.resample(wav, sr, model.samplerate)
        sr = model.samplerate

    # Match the channel count the model declares: duplicate a mono track,
    # drop surplus channels.
    channels = model.audio_channels
    if wav.shape[0] == 1 and channels > 1:
        wav = wav.repeat(channels, 1)
    elif wav.shape[0] > channels:
        wav = wav[:channels]

    # Apply separation; wav[None] adds the batch dimension, [0] removes it
    with torch.no_grad():
        sources = apply_model(model, wav[None], device='cpu')[0]

    # Pick the stem by name so a different source ordering cannot silently
    # return the wrong track.
    vocals = sources[model.sources.index('vocals')]

    # Reserve a temp file name; the handle is closed before torchaudio writes
    # to it so the path can be reopened on Windows.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        temp_vocals = tmp.name

    torchaudio.save(temp_vocals, vocals, sr)

    print(f"Vocals extracted to: {temp_vocals}")
    return temp_vocals


def _tokenize_words(text: str):
    # keep letters and apostrophes for words like it's, you're
    return re.findall(r"[A-Za-z']+", text.lower())


def _split_lyrics_into_phrases(lyrics_text: str):
    """Turn a block of lyrics into one phrase per non-empty line.

    Returns a list of dicts ``{"text": <stripped line>, "words": <tokens>}``.
    Blank lines, and lines that yield no tokens, are left out.
    """
    result = []
    for raw_line in lyrics_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        tokens = _tokenize_words(line)
        if tokens:
            result.append({"text": line, "words": tokens})
    return result


def _group_asr_into_phrases(whisperx_words, gap_threshold: float = 0.9):
    # Group contiguous ASR words separated by gaps > threshold
    phrases = []
    if not whisperx_words:
        return phrases
    current = {"words": [], "start": whisperx_words[0]['start'], "end": whisperx_words[0]['end']}
    for w in whisperx_words:
        if not current["words"]:
            current["words"].append(w)
            current["start"] = w["start"]
            current["end"] = w["end"]
            continue
        gap = w["start"] - current["words"][-1]["end"]
        if gap > gap_threshold:
            phrases.append(current)
            current = {"words": [w], "start": w["start"], "end": w["end"]}
        else:
            current["words"].append(w)
            current["end"] = w["end"]
    if current["words"]:
        phrases.append(current)
    return phrases


def _norm(s: str) -> str:
    return re.sub(r"[^a-z']+", "", s.lower())


def _sim(a: str, b: str) -> float:
    return difflib.SequenceMatcher(None, a, b).ratio()


def _segment_asr_to_lyrics(lyrics_phrases, asr_phrases):
    """
    Assign a time span to every lyric word using the ASR word timings.

    Lyric phrases and ASR phrases are paired strictly by position: the i-th
    lyric phrase is matched with the i-th ASR phrase. ASR phrases beyond the
    number of lyric phrases are not used.

    Inside a pair, ASR words are consumed left to right. Each lyric word takes
    at least one ASR word and may greedily absorb the following ones (see the
    loop below); its span is start of the first absorbed ASR word to end of
    the last one.

    Fallbacks:
      * ASR words run out inside a phrase: each remaining lyric word gets an
        estimated 0.4 s slot placed after the previous timing.
      * ASR words are left over after the last lyric word of a phrase: the
        last lyric word's end is stretched to the last ASR word's end.
      * Lyric phrases without a matching ASR phrase: consecutive 0.4 s slots
        starting at the last known end time (or 0.0 if there is none).

    Args:
        lyrics_phrases: list of dicts with a "words" list of lyric tokens.
        asr_phrases: list of dicts with "words" (dicts holding 'word',
            'start', 'end') and an "end" time.

    Returns:
        A flat list of dicts {"word": str, "start": float, "end": float},
        one per lyric word, in lyric order.
    """
    timings = []
    # Only phrases that exist on both sides can be paired by index
    pairs = min(len(lyrics_phrases), len(asr_phrases))
    for i in range(pairs):
        lp = lyrics_phrases[i]
        ap = asr_phrases[i]
        lwords = lp["words"]
        awords = ap["words"]
        j = 0  # index of the next unconsumed ASR word in this phrase
        for lw in lwords:
            lw_norm = _norm(lw)
            if j >= len(awords):
                # No ASR left; estimate 0.4s per word after last timing
                # (or after the ASR phrase end when nothing has been timed yet)
                start_est = timings[-1]["end"] if timings else (ap["end"])  # after phrase
                end_est = start_est + 0.4
                timings.append({"word": lw, "start": start_est, "end": end_est})
                continue
            # Start group with at least one ASR word
            g_start = j
            g_end = j
            current_concat = _norm(awords[g_end]['word'])
            best_sim = _sim(current_concat, lw_norm)
            best_end = g_end
            # Try to extend the group one ASR word at a time.
            # NOTE: best_sim/best_end are overwritten on every accepted
            # extension, so best_end always equals g_end and best_sim is the
            # similarity of the current group, not the maximum seen so far.
            while g_end + 1 < len(awords):
                trial = current_concat + _norm(awords[g_end + 1]['word'])
                trial_sim = _sim(trial, lw_norm)
                # Accept when similarity does not drop, or when the current
                # match is still weak (< 0.6) and the concatenation is not yet
                # much longer than the lyric word (< 1.3x its length).
                len_ratio = len(trial) / max(1, len(lw_norm))
                if trial_sim >= best_sim or (best_sim < 0.6 and len_ratio < 1.3):
                    g_end += 1
                    current_concat = trial
                    best_sim = trial_sim
                    best_end = g_end
                else:
                    break
            # Assign timing from first to last ASR in group
            start_t = float(awords[g_start]['start'])
            end_t = float(awords[best_end]['end'])
            timings.append({"word": lw, "start": start_t, "end": end_t})
            j = best_end + 1
        # If ASR words remain in this phrase and lyric words exhausted, extend last lyric word to include leftovers
        if j < len(awords) and timings:
            timings[-1]["end"] = float(awords[-1]['end'])
    # Handle extra lyric phrases (no matching ASR phrase): back-to-back 0.4s slots
    if len(lyrics_phrases) > pairs:
        t = timings[-1]['end'] if timings else 0.0
        for i in range(pairs, len(lyrics_phrases)):
            for w in lyrics_phrases[i]["words"]:
                timings.append({"word": w, "start": t, "end": t + 0.4})
                t += 0.4
    return timings


def align_lyrics_to_audio(audio_path, lyrics_text, use_vocal_separation=True):
    """
    Align YOUR lyrics to the audio timing using WhisperX by mapping entire phrases (lines)
    to contiguous ASR word chunks, then segmenting those chunks to lyric words using a
    greedy similarity-based grouping (combines multiple ASR tokens into one lyric token
    when needed; e.g., ASR "come in and shine" → lyric "conviction").

    Reads the module-level settings ``whisperx_model_size`` and ``device``.

    Args:
        audio_path: Path of the song to align.
        lyrics_text: Lyrics, one phrase per line.
        use_vocal_separation: When True, transcribe the vocal stem produced by
            ``extract_vocals`` instead of the full mix. The temporary stem file
            is deleted once the audio has been loaded into memory.

    Returns:
        A list of word timings for YOUR lyrics (dicts with word/start/end).

    Raises:
        ValueError: If ``lyrics_text`` contains no usable phrase.
    """
    # Step 1: Extract vocals if enabled
    if use_vocal_separation:
        audio_to_use = extract_vocals(audio_path)
    else:
        audio_to_use = audio_path

    # Step 2: Load audio into memory, then drop the temporary vocals file.
    # Later steps only use the in-memory array, so the file is no longer needed.
    try:
        audio = whisperx.load_audio(audio_to_use)
    finally:
        if use_vocal_separation and audio_to_use != audio_path:
            try:
                os.remove(audio_to_use)
            except OSError as err:
                # Best-effort cleanup: a leftover temp file must not abort alignment
                print(f"Warning: could not remove temporary file {audio_to_use}: {err}")

    # Step 3: Load WhisperX model
    print(f"Loading WhisperX model ({whisperx_model_size})...")
    model = whisperx.load_model(whisperx_model_size, device, compute_type="float32")

    # Step 4: Transcribe to get initial segments.
    # WhisperX's default voice activity detection still runs here.
    print("Transcribing audio...")
    result = model.transcribe(audio, batch_size=16, language="en")

    # Step 5: Load alignment model for word-level timestamps
    print("Loading alignment model...")
    model_a, metadata = whisperx.load_align_model(language_code="en", device=device)

    # Step 6: Align to get precise word timings
    print("Aligning words...")
    result_aligned = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=False
    )

    # Step 7: Extract word timings from WhisperX.
    # Words the aligner could not place come back without 'start'/'end';
    # skip them instead of failing with KeyError.
    whisperx_words = []
    for segment in result_aligned.get("segments", []):
        for word_info in segment.get("words", []):
            text = word_info.get('word')
            start = word_info.get('start')
            end = word_info.get('end')
            if text is None or start is None or end is None:
                continue
            whisperx_words.append({
                'word': text.strip().lower(),
                'start': float(start),
                'end': float(end)
            })
    print(f"Extracted {len(whisperx_words)} words from WhisperX")
    # Optional debug (first 50)
    for idx, w in enumerate(whisperx_words[:50]):
        print(f"{idx}: '{w['word']}' - {w['start']:.2f}s to {w['end']:.2f}s")

    # Step 8: Build lyric phrases (lines)
    lyric_phrases = _split_lyrics_into_phrases(lyrics_text)
    if not lyric_phrases:
        raise ValueError("No lyric phrases found. Provide non-empty lyrics_text.")

    # Step 9: Group ASR words into phrases by time gaps
    asr_phrases = _group_asr_into_phrases(whisperx_words, gap_threshold=0.6)
    if not asr_phrases and whisperx_words:
        # Defensive: treat all words as a single phrase if grouping returned nothing
        asr_phrases = [{"words": whisperx_words, "start": whisperx_words[0]['start'], "end": whisperx_words[-1]['end'] }]

    # Step 10: Segment ASR → lyric words
    word_timings = _segment_asr_to_lyrics(lyric_phrases, asr_phrases)

    print(f"\nAligned {len(word_timings)} lyric words using phrase segmentation.")
    return word_timings


# Kick off the full pipeline (optional vocal separation, ASR, lyric mapping)
word_timings = align_lyrics_to_audio(
    audio_path=song_path,
    lyrics_text=lyrics_text,
    use_vocal_separation=use_vocal_separation,
)

# Dump one line per lyric word: index, word and its time span in seconds
print("\nYour lyrics with timings:")
for i, word_data in enumerate(word_timings):
    print(f"{i}: '{word_data['word']}' - {word_data['start']:.2f}s to {word_data['end']:.2f}s")


Separating vocals from music with Demucs...




Vocals extracted to: C:\Users\marcu\AppData\Local\Temp\tmp55wag2t3.wav
Loading WhisperX model (base.en)...
2025-11-16 17:08:20 - whisperx.vads.pyannote - INFO - Performing voice activity detection using Pyannote...
2025-11-16 17:08:20 - whisperx.vads.pyannote - INFO - Performing voice activity detection using Pyannote...


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint e:\Documents\CodeStuff\BLAiRE\.venv-whisperx-py313\Lib\site-packages\whisperx\assets\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cpu. Bad things might happen unless you revert torch to 1.x.


  torchaudio.list_audio_backends()


Transcribing audio...
Loading alignment model...
Loading alignment model...
Aligning words...
Aligning words...
Extracted 22 words from WhisperX
0: 'come' - 0.03s to 0.53s
1: 'in' - 0.55s to 0.63s
2: 'and' - 0.65s to 0.71s
3: 'shine' - 0.73s to 1.48s
4: 'in' - 1.58s to 1.66s
5: 'a' - 1.74s to 1.78s
6: 'couple' - 1.82s to 2.20s
7: 'of' - 2.22s to 2.26s
8: 'different' - 2.28s to 2.46s
9: 'colors' - 2.48s to 3.20s
10: 'shoot' - 4.26s to 4.49s
11: 'a' - 4.57s to 4.58s
12: 'cannon' - 4.62s to 5.01s
13: 'at' - 5.07s to 5.21s
14: 'yourself' - 5.23s to 5.79s
15: 'or' - 8.30s to 8.44s
16: 'it's' - 8.46s to 8.60s
17: 'a' - 8.68s to 8.72s
18: 'cradle' - 8.76s to 9.24s
19: 'that' - 9.30s to 9.66s
20: 'you're' - 9.68s to 10.64s
21: 'hollow' - 10.77s to 10.88s

Aligned 18 lyric words using phrase segmentation.

Your lyrics with timings:
0: 'conviction' - 0.03s to 0.71s
1: 'in' - 0.73s to 1.48s
2: 'a' - 1.58s to 1.78s
3: 'couple' - 1.82s to 2.20s
4: 'different' - 2.22s to 2.46s
5: 'colors' - 2.48s to

In [28]:
import cv2
import numpy as np
import math
from pathlib import Path
import subprocess

# Inputs
output_dir = Path("out")
output_dir.mkdir(parents=True, exist_ok=True)
video_fps = 30
width, height = 1920, 1080

# Font configuration
# If font_path points to a valid .ttf/.otf, we will use PIL TrueType at font_size_px.
# Otherwise we fall back to OpenCV Hershey using hershey_scale.
font_path = r"KGRedHands.ttf"  # set to None or to a valid TTF path
font_size_px = 120  # applies when using TrueType font via PIL
hershey_scale = 2.5  # applies when using OpenCV Hershey fonts

font_color = (255, 255, 255)
font_thickness = 3
stroke_color = (0, 0, 0)
stroke_thickness = 6
line_type = cv2.LINE_AA

# Source audio and timings from previous cells
source_audio = song_path  # from earlier cell
word_timings_in = word_timings  # from earlier cell (list of dicts: word,start,end)

# Video temp paths
temp_video_path = output_dir / "lyrics_black_temp.mp4"
final_video_path = output_dir / "lyrics_black_with_audio.mp4"

# Prepare VideoWriter with the "mp4v" codec. Availability depends on the
# platform's OpenCV build; there is no automatic fallback to another codec.
fourcc = cv2.VideoWriter.fourcc(*"mp4v")
writer = cv2.VideoWriter(str(temp_video_path), fourcc, video_fps, (width, height))
# A writer that failed to open silently discards every frame, so fail loudly here
if not writer.isOpened():
    raise RuntimeError(
        f"Could not open video writer for {temp_video_path} "
        "(codec 'mp4v' may be unavailable in this OpenCV build)."
    )

# Decide which font pipeline to use
use_pil_font = bool(font_path) and Path(font_path).exists()
if not use_pil_font and font_path:
    print(f"Warning: font_path not found: {font_path}. Falling back to OpenCV Hershey font.")

# Loaded TrueType fonts keyed by (path, size). A value of None records a failed
# load so the font file is not re-read (and the warning not re-printed) per frame.
_truetype_font_cache = {}


def _draw_hershey_text(img, text):
    """Draw *text* centred on *img* with OpenCV's Hershey Simplex font."""
    img_h, img_w = img.shape[:2]
    font = cv2.FONT_HERSHEY_SIMPLEX
    (tw, th), _ = cv2.getTextSize(text, font, hershey_scale, font_thickness)
    x = (img_w - tw) // 2
    y = (img_h + th) // 2  # putText anchors at the text baseline
    if stroke_thickness > 0:
        cv2.putText(img, text, (x, y), font, hershey_scale, stroke_color, stroke_thickness, line_type)
    cv2.putText(img, text, (x, y), font, hershey_scale, font_color, font_thickness, line_type)


def _load_truetype_font():
    """Return the configured PIL font, loading it once; None if loading fails."""
    from PIL import ImageFont
    key = (font_path, font_size_px)
    if key not in _truetype_font_cache:
        try:
            _truetype_font_cache[key] = ImageFont.truetype(font_path, font_size_px)
        except Exception as e:
            print(f"Failed to load TrueType font '{font_path}': {e}. Falling back to Hershey.")
            _truetype_font_cache[key] = None
    return _truetype_font_cache[key]


# Helper to draw centered text with optional stroke
def draw_centered_text(img, text):
    """Draw *text* centred on *img* in place, with an outline.

    Uses the TrueType font at ``font_path`` / ``font_size_px`` through PIL when
    ``use_pil_font`` is set and the font loads; otherwise falls back to the
    OpenCV Hershey font at ``hershey_scale``. Colours and stroke width come
    from the module-level font settings.

    Args:
        img: Image array of shape (height, width, 3), modified in place.
        text: The string to render.

    Returns:
        None.
    """
    fnt = _load_truetype_font() if use_pil_font else None
    if fnt is None:
        _draw_hershey_text(img, text)
        return

    # PIL TrueType path (uses font_size_px)
    from PIL import Image, ImageDraw
    img_h, img_w = img.shape[:2]
    pil_img = Image.fromarray(img)
    draw = ImageDraw.Draw(pil_img)

    # Measure text. The bounding box of text drawn at (0, 0) usually does not
    # start at (0, 0), so subtract its left/top offset to centre the visible
    # glyphs rather than the drawing origin.
    left, top, right, bottom = draw.textbbox((0, 0), text, font=fnt)
    tw, th = right - left, bottom - top
    x = (img_w - tw) // 2 - left
    y = (img_h - th) // 2 - top

    # crude stroke by drawing offset shadows in 8 directions
    if stroke_thickness > 0:
        r = max(1, stroke_thickness // 2)
        for dx, dy in [(-r, 0), (r, 0), (0, -r), (0, r), (-r, -r), (-r, r), (r, -r), (r, r)]:
            draw.text((x + dx, y + dy), text, font=fnt, fill=stroke_color)
    draw.text((x, y), text, font=fnt, fill=font_color)

    # Copy the rendered pixels back into the caller's array
    img[:] = np.array(pil_img)

# Normalize/clean timings
def clamp(v, lo, hi):
    """Limit *v* to the range [lo, hi]; the lower bound wins if lo > hi."""
    capped = v if v < hi else hi
    return capped if capped > lo else lo

# Build a cleaned copy of the incoming timings:
#   - start/end coerced to float (defaults: start 0.0, end start + 0.25)
#   - entries with unusable or non-finite times are skipped and counted
#   - zero/negative-length words are given a 0.25 s duration
valid_timings = []
_skipped_timings = 0
for w in word_timings_in:
    try:
        s = float(w.get('start', 0.0))
        e = float(w.get('end', s + 0.25))
    except (AttributeError, TypeError, ValueError):
        # Not a dict, or start/end not convertible to float
        _skipped_timings += 1
        continue
    if not (math.isfinite(s) and math.isfinite(e)):
        # NaN/inf would break the frame-count computation later on
        _skipped_timings += 1
        continue
    if e <= s:
        e = s + 0.25
    valid_timings.append({'word': str(w.get('word', '')), 'start': s, 'end': e})

if _skipped_timings:
    print(f"Warning: skipped {_skipped_timings} word timing(s) with invalid start/end values.")

if not valid_timings:
    raise RuntimeError(
        "word_timings is empty. Please ensure you have:\n"
        "1. Run the word alignment cell that populates 'word_timings'\n"
        "2. Verified that the alignment completed successfully\n"
        f"Current word_timings length: {len(word_timings_in)}"
    )

# Use the latest end time rather than the last entry's, so the video covers
# every word even if the timings are not strictly increasing.
total_duration = max(w['end'] for w in valid_timings)

def frame_for_time(t):
    """Render the frame shown at time *t* (seconds).

    Returns a black (height, width, 3) uint8 image. If some entry of
    ``valid_timings`` satisfies start <= t < end, the first such entry's word
    is drawn centred on it.
    """
    canvas = np.zeros((height, width, 3), dtype=np.uint8)
    current_word = next(
        (entry for entry in valid_timings if entry['start'] <= t < entry['end']),
        None,
    )
    if current_word:
        draw_centered_text(canvas, current_word['word'])
    return canvas

# Render frames: one frame per 1/video_fps seconds up to the last word's end.
# The writer is released even if rendering fails, so the output file handle
# is not left open.
num_frames = int(math.ceil(total_duration * video_fps))
try:
    for i in range(num_frames):
        t = i / video_fps
        frame = frame_for_time(t)
        writer.write(frame)
finally:
    writer.release()
print(f"Wrote silent video: {temp_video_path}")

# Mux audio using ffmpeg
# - Copy video stream, re-encode audio to AAC for compatibility
# - Shorten to the shorter of video/audio so they end together
ffmpeg_cmd = [
    "ffmpeg",
    "-y",
    "-i", str(temp_video_path),
    "-i", str(source_audio),
    "-c:v", "copy",
    "-c:a", "aac",
    "-shortest",
    str(final_video_path)
]
print("Running:", " ".join(ffmpeg_cmd))
try:
    subprocess.run(ffmpeg_cmd, check=True)
    print(f"Wrote final video with audio: {final_video_path}")
except FileNotFoundError:
    print("ffmpeg not found on PATH. Please install FFmpeg or add it to PATH.")
except subprocess.CalledProcessError as err:
    # Add a readable hint, then re-raise so the failure is still visible
    print(f"ffmpeg failed with exit code {err.returncode}; the silent video is still at {temp_video_path}")
    raise

Wrote silent video: out\lyrics_black_temp.mp4
Running: ffmpeg -y -i out\lyrics_black_temp.mp4 -i conviction.wav -c:v copy -c:a aac -shortest out\lyrics_black_with_audio.mp4
Wrote final video with audio: out\lyrics_black_with_audio.mp4
Wrote final video with audio: out\lyrics_black_with_audio.mp4
