In [1]:
# Environment check: run this first in the WhisperX env
import sys
import importlib

def _ver(mod):
    try:
        m = importlib.import_module(mod)
        return getattr(m, "__version__", "(no __version__)"), None
    except Exception as e:
        return None, f"{type(e).__name__}: {e}"

# Packages this notebook relies on; each entry is kept as a 1-tuple.
mods = [
    (pkg,)
    for pkg in ("numpy", "pandas", "torch", "torchaudio", "whisperx", "demucs")
]

print("Python:", sys.version)
for entry in mods:
    name = entry[0]
    v, err = _ver(name)
    # Either the version on its own, or an ERROR marker followed by the reason.
    status = (v,) if v else ("ERROR:", err)
    print(f"{name:10s}", *status)

print(
    "\nNote: For WhisperX, use the Python 3.13 env we created: .venv-whisperx-py313.\n"
    "In VS Code, change the Notebook kernel (top-right) to that interpreter for this notebook."
)

Python: 3.13.3 (tags/v3.13.3:6280bb5, Apr  8 2025, 14:47:33) [MSC v.1943 64 bit (AMD64)]
numpy      2.2.6
pandas     2.2.3
pandas     2.2.3
torch      2.8.0+cpu
torchaudio 2.8.0+cpu
whisperx   (no __version__)
demucs     4.0.1

Note: For WhisperX, use the Python 3.13 env we created: .venv-whisperx-py313.
In VS Code, change the Notebook kernel (top-right) to that interpreter for this notebook.
torch      2.8.0+cpu
torchaudio 2.8.0+cpu
whisperx   (no __version__)
demucs     4.0.1

Note: For WhisperX, use the Python 3.13 env we created: .venv-whisperx-py313.
In VS Code, change the Notebook kernel (top-right) to that interpreter for this notebook.


In [2]:
import tempfile
import os
import whisperx
import re

# Install: pip install demucs
import torch
import torchaudio
from demucs.pretrained import get_model
from demucs.apply import apply_model

# --- Inputs ---
# Audio file to process.
song_path = 'shoobie.wav'

# Reference lyrics whose words get timed against the audio
# (the text starts and ends with a newline).
lyrics_text = (
    "\n"
    "I spend my life\n"
    "doing anything you like\n"
    "come on, and love me like you used to\n"
)

# --- Options ---
device = "cpu"                   # device handed to the WhisperX models
whisperx_model_size = "base.en"  # model name passed to whisperx.load_model
use_vocal_separation = True      # run Demucs vocal extraction before aligning


def extract_vocals(audio_path):
    """Separate the vocal stem from a song with Demucs and save it as a WAV.

    Args:
        audio_path: Path to the input audio file (anything ``torchaudio.load``
            can read).

    Returns:
        Path to a temporary ``.wav`` file containing only the vocals. The file
        is created with ``delete=False``, so it is NOT removed automatically;
        the caller owns it.

    Raises:
        ValueError: If the loaded model does not expose a ``'vocals'`` stem.
    """
    # Local import: only this function needs it, and demucs is already a
    # dependency of this notebook.
    from demucs.audio import convert_audio

    print("Separating vocals from music with Demucs...")

    # Load model
    model = get_model('htdemucs')
    model.cpu()
    model.eval()

    # Load audio and convert it to the sample rate / channel count the model
    # was built for. Without this, mono files or files at another sample rate
    # are fed to the network as-is.
    wav, sr = torchaudio.load(audio_path)
    wav = convert_audio(wav, sr, model.samplerate, model.audio_channels)

    # Apply separation (wav[None] adds the batch dimension; [0] removes it)
    with torch.no_grad():
        sources = apply_model(model, wav[None], device='cpu')[0]

    # Look the vocal stem up by name instead of assuming its position.
    vocals = sources[model.sources.index('vocals')]

    # Reserve a temp file name; closing the handle before saving avoids
    # writing to a file that is still open (an issue on Windows).
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        temp_vocals = tmp.name

    # The separated audio is at the model's sample rate after conversion.
    torchaudio.save(temp_vocals, vocals, model.samplerate)

    print(f"Vocals extracted to: {temp_vocals}")
    return temp_vocals


def _normalize_word(token):
    """Lower-case a token and strip punctuation, keeping inner apostrophes."""
    token = token.lower().replace("\u2019", "'")
    return re.sub(r"[^\w']", "", token).strip("'")


def _tokenize_lyrics(lyrics_text):
    """Split lyrics into normalized words; contractions stay a single word."""
    raw_tokens = re.findall(r"[\w']+", lyrics_text.replace("\u2019", "'"))
    normalized = (_normalize_word(tok) for tok in raw_tokens)
    return [word for word in normalized if word]


def _collect_timed_words(segments):
    """Flatten aligned segments into normalized words that carry timestamps."""
    timed_words = []
    for segment in segments:
        for word_info in segment.get("words", []):
            start = word_info.get('start')
            end = word_info.get('end')
            word = _normalize_word(word_info.get('word', ''))
            # Words the aligner could not place come back without
            # 'start'/'end'; they cannot anchor a lyric word, so skip them.
            if word and start is not None and end is not None:
                timed_words.append({'word': word, 'start': start, 'end': end})
    return timed_words


def _match_lyrics_to_timed_words(your_words, whisperx_words, fallback_duration=0.5):
    """Give every lyric word a timing by aligning it with the transcript words."""
    import difflib

    heard = [entry['word'] for entry in whisperx_words]
    # autojunk=False: never let difflib discard frequent words as "junk".
    matcher = difflib.SequenceMatcher(None, your_words, heard, autojunk=False)

    lyric_to_heard = {}
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        # 'equal'   -> identical words, pair them one-to-one.
        # 'replace' -> transcript differs from the lyrics (mis-heard word);
        #              pair positionally so the lyric still gets a real timing.
        if tag in ('equal', 'replace'):
            for offset in range(min(i2 - i1, j2 - j1)):
                lyric_to_heard[i1 + offset] = j1 + offset

    word_timings = []
    last_end = 0.0
    for idx, your_word in enumerate(your_words):
        heard_idx = lyric_to_heard.get(idx)
        if heard_idx is not None:
            start = whisperx_words[heard_idx]['start']
            end = whisperx_words[heard_idx]['end']
        else:
            # No transcript counterpart: estimate from the previous word's end
            # (0.0 when there is no previous word).
            start = last_end
            end = last_end + fallback_duration
        word_timings.append({'word': your_word, 'start': start, 'end': end})
        last_end = end
    return word_timings


def align_lyrics_to_audio(audio_path, lyrics_text, use_vocal_separation=True):
    """
    Takes YOUR lyrics and aligns them to the audio using WhisperX

    The audio is transcribed and word-aligned with WhisperX, then the lyric
    words are aligned against the transcript words. A single mis-heard or
    missing word only affects that word, not everything after it.

    Reads the module-level ``whisperx_model_size`` and ``device`` settings.

    Args:
        audio_path: Path to audio file
        lyrics_text: YOUR lyrics as a string
        use_vocal_separation: Whether to isolate the vocals with Demucs
            (via ``extract_vocals``) before transcribing

    Returns:
        List of ``{'word', 'start', 'end'}`` dicts, one per word of YOUR
        lyrics, in lyric order. Words with no transcript counterpart get an
        estimated timing: they start at the previous word's end (0.0 if there
        is none) and last 0.5 s.
    """
    # Step 1: Extract vocals if enabled
    if use_vocal_separation:
        audio_to_use = extract_vocals(audio_path)
    else:
        audio_to_use = audio_path

    # Step 2: Load WhisperX model
    print(f"Loading WhisperX model ({whisperx_model_size})...")
    model = whisperx.load_model(whisperx_model_size, device, compute_type="float32")

    # Step 3: Load audio
    audio = whisperx.load_audio(audio_to_use)

    # Step 4: Transcribe to get initial segments
    print("Transcribing audio...")
    result = model.transcribe(audio, batch_size=16, language="en")

    # Step 5: Load alignment model for word-level timestamps
    print("Loading alignment model...")
    model_a, metadata = whisperx.load_align_model(language_code="en", device=device)

    # Step 6: Align to get precise word timings
    print("Aligning words...")
    result_aligned = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=False
    )

    # Step 7: Extract word timings from WhisperX (untimed words are skipped)
    whisperx_words = _collect_timed_words(result_aligned["segments"])

    # Step 8: Parse YOUR lyrics into words
    # TODO: Use LLM to parse lyrics into usable format
    your_words = _tokenize_lyrics(lyrics_text)

    # Step 9: Match your lyrics words to WhisperX's timed words
    word_timings = _match_lyrics_to_timed_words(your_words, whisperx_words)

    print(f"Aligned {len(word_timings)} words from YOUR lyrics")
    return word_timings


# Run the whole pipeline on the configured song and lyrics
word_timings = align_lyrics_to_audio(
    song_path,
    lyrics_text,
    use_vocal_separation,
)

# Show one line per lyric word: index, word, and its start/end in seconds
print("\nYour lyrics with timings:")
for i, word_data in enumerate(word_timings):
    word, start, end = word_data['word'], word_data['start'], word_data['end']
    print(f"{i}: '{word}' - {start:.2f}s to {end:.2f}s")


Separating vocals from music with Demucs...


  import pkg_resources
  import pkg_resources


Vocals extracted to: C:\Users\marcu\AppData\Local\Temp\tmptd2te9ur.wav
Loading WhisperX model (base.en)...


  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.list_audio_backends()
  torchaudio.list_audio_backends()
  available_backends = torchaudio.list_audio_backends()
  available_backends = torchaudio.list_audio_backends()


2025-11-15 18:06:14 - whisperx.vads.pyannote - INFO - Performing voice activity detection using Pyannote...


  if ismodule(module) and hasattr(module, '__file__'):
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint e:\Documents\CodeStuff\BLAiRE\.venv-whisperx-py313\Lib\site-packages\whisperx\assets\pytorch_model.bin`
  torchaudio.list_audio_backends()
  torchaudio.list_audio_backends()


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cpu. Bad things might happen unless you revert torch to 1.x.
Transcribing audio...
Transcribing audio...
Loading alignment model...
Loading alignment model...
Aligning words...
Aligning words...
Aligned 17 words from YOUR lyrics

Your lyrics with timings:
0: 'i' - 2.71s to 3.56s
1: 'spend' - 3.58s to 3.88s
2: 'my' - 4.74s to 4.78s
3: 'life' - 4.80s to 5.04s
4: 'doing' - 5.45s to 6.01s
5: 'anything' - 6.17s to 6.83s
6: 'you' - 6.89s to 7.09s
7: 'like' - 7.13s to 7.50s
8: 'come' - 7.86s to 8.12s
9: 'on' - 8.60s to 8.98s
10: 'and' - 9.16s to 9.32s
11: 'love' - 9.49s to 9.63s
12: 'me' - 9.93s to 10.27s
13: 'like' - 10.69s to 11.17s
14: 'you' - 11.19s to 11.26s
15: 'used' - 11.39s to 11.86s
16: 'to' - 11.96s to 12.00s
Aligned 17 words from YOUR lyrics

Your lyrics with timings:
0: 'i' - 2.71s to 3.56s
1: 'spen