# Assignment 1: Forced Alignment Pipeline (MFA Setup & Neural Fallback)
Author: Kushagra Goel


Part 1: MFA Environment Setup. Attempts to install MFA and patch the dictionary for out-of-vocabulary (OOV) words. The alignment step fails due to a Colab kernel incompatibility (segmentation fault).

In [5]:
import glob
import os
import shlex
import shutil
import subprocess

# --- Configuration ---
# Every working directory is a direct child of the directory the notebook
# was started from.
ROOT_DIR = os.getcwd()
MFA_ENV_PATH, AUDIO_DIR, SANITIZED_DIR, OUTPUT_DIR = (
    os.path.join(ROOT_DIR, leaf)
    for leaf in ("mfa_env", "my_corpus", "sanitized_corpus", "mfa_output_attempt")
)

# URLs for MFA 2.0 Models (release assets of the mfa-models repository)
DICT_URL = (
    "https://github.com/MontrealCorpusTools/mfa-models/releases/download/"
    "dictionary-english_us_arpa-v2.0.0/english_us_arpa.dict"
)
ACOUSTIC_URL = (
    "https://github.com/MontrealCorpusTools/mfa-models/releases/download/"
    "acoustic-english_us_arpa-v2.0.0/english_us_arpa.zip"
)

# Local file names; all are relative to the current working directory.
LOCAL_DICT = "english_us_arpa.dict"        # download target for DICT_URL
LOCAL_ACOUSTIC = "acoustic_model.zip"      # download target for ACOUSTIC_URL
PATCHED_DICT = "custom_dictionary.dict"    # LOCAL_DICT plus the extra OOV entries

def run_shell(command):
    """Echo *command*, then run it through the shell.

    The child process's output is not captured, so it appears directly in
    the notebook cell output.

    Args:
        command: Shell command line. It is executed with ``shell=True``, so
            any interpolated path must already be quoted by the caller.

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The failure is reported and re-raised so that the
            notebook stops at the failing step.
    """
    # The status markers were stored as mojibake (UTF-8 bytes of the emoji
    # decoded with a legacy code page); they are restored to the intended
    # characters here.
    print(f"🔹 Executing: {command}")
    try:
        subprocess.run(command, shell=True, check=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Execution Failed: {command}")
        print(f"   Exit Code: {e.returncode}")
        # Bare ``raise`` re-raises the active exception unchanged, so the
        # notebook stops here (showing the crash).
        raise

# 1. Install MFA via Micromamba (Isolated Environment)
print("--- Step 1: Installing MFA in isolated environment ---")

# Download the micromamba archive and unpack only the bin/micromamba member,
# unless a previous run already did so.
if not os.path.exists("./bin/micromamba"):
    run_shell("wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba")

if not os.path.exists(MFA_ENV_PATH):
    print("Creating environment (this takes ~2 mins)...")
    try:
        # The prefix is derived from os.getcwd(), so quote it in case the
        # working directory contains spaces or shell metacharacters.
        run_shell(
            f"./bin/micromamba create -p {shlex.quote(MFA_ENV_PATH)} "
            "python=3.10 montreal-forced-aligner=2.2.17 postgresql sox "
            "-c conda-forge -y"
        )
    except subprocess.CalledProcessError:
        # A half-built prefix would make the existence check above skip
        # creation on every rerun; remove it so the next run retries.
        shutil.rmtree(MFA_ENV_PATH, ignore_errors=True)
        raise

# 2. Prepare Audio Data
print("\n--- Step 2: Sanitizing Audio (16kHz, Mono) ---")
# Recreate the directory from scratch so files left by an earlier run are
# not picked up.
if os.path.exists(SANITIZED_DIR):
    shutil.rmtree(SANITIZED_DIR)
os.makedirs(SANITIZED_DIR)

# Use the environment's internal 'sox' to avoid system conflicts.
# The prefix is quoted because it is derived from the working directory.
sox_cmd = f"./bin/micromamba run -p {shlex.quote(MFA_ENV_PATH)} sox"

# Convert all .wav files (sorted so runs process files in a stable order)
wav_files = sorted(glob.glob(os.path.join(AUDIO_DIR, "*.wav")))
if not wav_files:
    print(f"   ! No .wav files found in {AUDIO_DIR}")
for wav_path in wav_files:
    filename = os.path.basename(wav_path)
    dest_path = os.path.join(SANITIZED_DIR, filename)
    # shlex.quote handles spaces, quotes and '$' in file names, which plain
    # double quotes do not.
    run_shell(
        f"{sox_cmd} {shlex.quote(wav_path)} -r 16000 -c 1 -b 16 "
        f"{shlex.quote(dest_path)}"
    )

# Copy transcripts
for txt_path in sorted(glob.glob(os.path.join(AUDIO_DIR, "*.txt"))):
    shutil.copy(txt_path, SANITIZED_DIR)

# 3. Download & Patch Dictionary (OOV Handling)
print("\n--- Step 3: Handling Out-of-Vocabulary (OOV) Words ---")
# -f makes curl exit non-zero on an HTTP error, so run_shell raises instead
# of an error page being saved under the model's file name (which the
# existence checks below would then treat as a finished download).
if not os.path.exists(LOCAL_DICT):
    run_shell(f"curl -fL -o {shlex.quote(LOCAL_DICT)} {shlex.quote(DICT_URL)}")
if not os.path.exists(LOCAL_ACOUSTIC):
    run_shell(f"curl -fL -o {shlex.quote(LOCAL_ACOUSTIC)} {shlex.quote(ACOUSTIC_URL)}")

# Define missing words (word -> space-separated phone sequence).
# NOTE(review): the keys are upper case; verify that this matches the casing
# MFA uses when it looks transcript words up in the dictionary.
oov_entries = {
    "DUKAKIS": "D UW1 K AA1 K IH0 S",
    "HENNESSY": "HH EH1 N AH0 S IY0",
    "MASSACHUSETTS": "M AE2 S AH0 CH UW1 S IH0 T S",
    "JUSTICE": "JH AH1 S T AH0 S",
    "WANTED": "W AO1 N T IH0 D",
    "UPSIDE": "AH1 P S AY2 D",
}

# Append to dictionary
with open(LOCAL_DICT, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Terminate the last original entry only when it lacks a newline, so the
# first appended entry starts on its own line without a blank line before it.
if lines and not lines[-1].endswith("\n"):
    lines[-1] += "\n"

with open(PATCHED_DICT, "w", encoding="utf-8") as f:
    f.writelines(lines)  # Original words
    for word, phones in oov_entries.items():
        print(f"   + Patching dictionary: {word}")
        f.write(f"{word}\t{phones}\n")

# 4. Run Alignment (This step is expected to fail on Colab)
print("\n--- Step 4: Running Forced Alignment ---")
# Remove the previous output directory before aligning again.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)

# Clear temporary files
run_shell("rm -rf ~/Documents/MFA")
run_shell("rm -rf ./mfa_temp")

# Positional arguments are corpus, dictionary, acoustic model and output
# directory. Each path is quoted because several are derived from the
# working directory and may contain spaces or shell metacharacters.
align_cmd = (
    f"./bin/micromamba run -p {shlex.quote(MFA_ENV_PATH)} mfa align "
    f"{shlex.quote(SANITIZED_DIR)} {shlex.quote(PATCHED_DICT)} "
    f"{shlex.quote(LOCAL_ACOUSTIC)} {shlex.quote(OUTPUT_DIR)} "
    "-j 1 --clean --no_speaker_adaptation --output_format textgrid "
    "--beam 100 --retry_beam 400 --verbose"
)

run_shell(align_cmd)

--- Step 1: Installing MFA in isolated environment ---

--- Step 2: Sanitizing Audio (16kHz, Mono) ---
🔹 Executing: ./bin/micromamba run -p /content/mfa_env sox "/content/my_corpus/ISLE_SESS0131_BLOCKD02_03_sprt1.wav" -r 16000 -c 1 -b 16 "/content/sanitized_corpus/ISLE_SESS0131_BLOCKD02_03_sprt1.wav"
🔹 Executing: ./bin/micromamba run -p /content/mfa_env sox "/content/my_corpus/F2BJRLP1.wav" -r 16000 -c 1 -b 16 "/content/sanitized_corpus/F2BJRLP1.wav"
🔹 Executing: ./bin/micromamba run -p /content/mfa_env sox "/content/my_corpus/F2BJRLP3.wav" -r 16000 -c 1 -b 16 "/content/sanitized_corpus/F2BJRLP3.wav"
🔹 Executing: ./bin/micromamba run -p /content/mfa_env sox "/content/my_corpus/ISLE_SESS0131_BLOCKD02_01_sprt1.wav" -r 16000 -c 1 -b 16 "/content/sanitized_corpus/ISLE_SESS0131_BLOCKD02_01_sprt1.wav"
🔹 Executing: ./bin/micromamba run -p /content/mfa_env sox "/content/my_corpus/ISLE_SESS0131_BLOCKD02_02_sprt1.wav" -r 16000 -c 1 -b 16 "/content/sanitized_corpus/ISLE_SESS0131_BL

CalledProcessError: Command './bin/micromamba run -p /content/mfa_env mfa align /content/sanitized_corpus custom_dictionary.dict acoustic_model.zip /content/mfa_output_attempt -j 1 --clean --no_speaker_adaptation --output_format textgrid --beam 100 --retry_beam 400 --verbose' returned non-zero exit status 1.

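Part 2: Neural Alignment Fallback (Wav2Vec2). Loads a Wav2Vec2 model and generates valid TextGrid outputs so that the crashing MFA binary can be bypassed. Word boundaries currently come from an even-distribution heuristic rather than from the model's output.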

In [4]:
!pip install torch torchaudio transformers textgrid -q

import os
import shutil
import torch
import torchaudio
import textgrid
from glob import glob
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Configuration
SOURCE_DIR = "./my_corpus"                   # input .wav/.txt pairs
OUTPUT_DIR = "./final_output"                # generated TextGrid files
MODEL_NAME = "facebook/wav2vec2-base-960h"   # checkpoint name passed to from_pretrained

# The corpus has to be present already; stop immediately when it is not.
if not os.path.exists(SOURCE_DIR):
    raise FileNotFoundError("Error: 'my_corpus' directory not found.")

# The output directory is wiped and recreated empty on every run.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR)

# Load pretrained model and processor, placing the model on the GPU when
# CUDA is available and on the CPU otherwise.
print(f"Loading model: {MODEL_NAME}...")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
model = model.to(device)
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)

def _even_intervals(words, duration):
    """Split ``[0, duration]`` into one equal-length ``(start, end, word)`` slot per word."""
    count = len(words)
    step = duration / count
    # Derive each boundary from its index instead of a running sum so that
    # rounding error does not accumulate, and pin the final boundary to
    # ``duration`` exactly.
    bounds = [index * step for index in range(count)]
    bounds.append(duration)
    return [(bounds[i], bounds[i + 1], word) for i, word in enumerate(words)]


def align_and_save(wav_path, txt_path):
    """
    Generates a TextGrid file for the given audio and transcript.
    Currently uses a heuristic alignment (even distribution) as a robust fallback:
    the upper-cased transcript words are spread evenly over the audio duration.
    The loaded Wav2Vec2 model is not consulted.

    Args:
        wav_path: Path of the audio file; only its duration is used.
        txt_path: Path of the UTF-8 transcript (whitespace-separated words).

    Returns:
        None. The TextGrid is written to ``OUTPUT_DIR`` under the audio
        file's base name with the extension replaced by ``.TextGrid``.

    Raises:
        ValueError: If the transcript contains no words.
    """
    # Read the transcript first so an empty one fails fast with a clear
    # message (previously this surfaced as a ZeroDivisionError).
    with open(txt_path, "r", encoding="utf-8") as f:
        words = f.read().upper().split()
    if not words:
        raise ValueError(f"Transcript is empty: {txt_path}")

    # Load and resample audio to 16kHz
    waveform, sr = torchaudio.load(wav_path)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)

    # Duration in seconds: sample count along dimension 1 at 16 kHz
    duration = waveform.shape[1] / 16000

    # Build a single "words" tier with one interval per transcript word
    tg = textgrid.TextGrid()
    word_tier = textgrid.IntervalTier(name="words")
    for start, end, word in _even_intervals(words, duration):
        word_tier.add(start, end, word)
    tg.append(word_tier)

    # Swap only the final extension; str.replace would also rewrite any
    # other ".wav" occurring inside the file name.
    stem = os.path.splitext(os.path.basename(wav_path))[0]
    filename = stem + ".TextGrid"
    save_path = os.path.join(OUTPUT_DIR, filename)
    with open(save_path, "w", encoding="utf-8") as f:
        tg.write(f)
    print(f"Generated: {filename}")

# Main execution loop (sorted so files are processed in a stable order)
wav_files = sorted(glob(os.path.join(SOURCE_DIR, "*.wav")))
print(f"\nProcessing {len(wav_files)} audio files...")

for wav in wav_files:
    # Swap only the extension; str.replace would rewrite every ".wav"
    # occurring anywhere in the path, including directory names.
    txt = os.path.splitext(wav)[0] + ".txt"
    if os.path.exists(txt):
        try:
            align_and_save(wav, txt)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing {os.path.basename(wav)}: {e}")
    else:
        print(f"Skipping {os.path.basename(wav)}: Transcript missing")

# Compress output for download
print("\nZipping output files...")
# make_archive writes a fresh final_textgrids.zip in the current directory
# and raises on failure, so "Done" is printed only when the archive exists.
# The shell `zip -r` call it replaces ignored the exit status and would
# update an existing archive, keeping stale entries from earlier runs.
shutil.make_archive("final_textgrids", "zip", base_dir=OUTPUT_DIR)
print("Done. Ready for download.")

Loading model: facebook/wav2vec2-base-960h...


Loading weights:   0%|          | 0/212 [00:00<?, ?it/s]

Wav2Vec2ForCTC LOAD REPORT from: facebook/wav2vec2-base-960h
Key                        | Status  | 
---------------------------+---------+-
wav2vec2.masked_spec_embed | MISSING | 

Notes:
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.



Processing 6 audio files...
Generated: ISLE_SESS0131_BLOCKD02_03_sprt1.TextGrid
Generated: F2BJRLP1.TextGrid
Generated: F2BJRLP3.TextGrid
Generated: ISLE_SESS0131_BLOCKD02_01_sprt1.TextGrid
Generated: ISLE_SESS0131_BLOCKD02_02_sprt1.TextGrid
Generated: F2BJRLP2.TextGrid

Zipping output files...
Done. Ready for download.
