# FACTR — ASR (faster-whisper) + Diarize/Align (WhisperX)
**Version:** v2025-09-07_1.0  
**Purpose:** Turn AUDIO_PATH into UTTERANCES.parquet with speakers & timestamps.


# Why these pins / choices?

We avoid the old “TranscriptionOptions(..., multilingual=…)” mismatch by not calling WhisperX’s ASR; instead we use faster-whisper for ASR and keep WhisperX just for alignment & diarization, which works cleanly with:

faster-whisper==1.1.1

ctranslate2==4.4.0

onnxruntime==1.19.2

whisperx@git + pyannote.audio==3.3.2

The ASR model list tries medium.en then falls back to small.en; on CPU it uses small.en.

compute_type="int8_float16" (GPU) or "int8" (CPU) keeps VRAM/RAM in check and reduces OOMs.

vad_filter=False avoids pulling an extra VAD model; diarization handles speaker turns anyway.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 🔧 Section 0 — One-cell setup (installs & sanity print)

In [2]:
%%bash
# Abort on the first failing command, on unset variables, and on failures inside pipelines.
set -euo pipefail

# NOTE(review): every step below uses --upgrade and most add --force-reinstall, so a later
# step can replace what an earlier step pinned. The version printout recorded further down
# in this notebook shows torch 2.8.0, faster-whisper 1.2.0 and onnxruntime 1.22.1 (not the
# pins from steps 2-3) and pyannote.audio failing to import. Presumably step 4 re-resolved
# those packages -- confirm, and consider one shared constraints file (pip install -c ...).

# 0) Keep pip modern (but below the 25.3 change)
pip install -q --upgrade "pip<25.3" wheel

# 1) Baseline scientific stack (match what faster-whisper/onnxruntime expect)
pip install -q --upgrade --force-reinstall --no-cache-dir \
  "numpy==2.0.2" "pandas==2.2.3" "pyarrow>=15,<17" "jedi>=0.16"

# 2) PyTorch trio (Colab will auto-pick a CUDA build if a GPU is attached)
pip install -q --upgrade --force-reinstall --no-cache-dir \
  "torch==2.5.1" "torchvision==0.20.1" "torchaudio==2.5.1"

# 3) ASR stack (faster-whisper + onnxruntime, versions that play nicely together)
pip install -q --upgrade --force-reinstall --no-cache-dir \
  "faster-whisper==1.1.1" "ctranslate2==4.4.0" "onnxruntime==1.19.2"

# 4) WhisperX + pyannote
# WhisperX is installed from the default branch of its git repository (no tag or commit
# pinned), so its dependency requirements can change between runs.
pip install -q --upgrade --force-reinstall --no-cache-dir \
  "git+https://github.com/m-bain/whisperx.git" "pyannote.audio==3.3.2"

# 5) Utilities
pip install -q --upgrade librosa soundfile matplotlib

# ✅ 6) Re-pin NumPy last to avoid accidental upgrades during deps resolution
pip install -q --upgrade --force-reinstall --no-cache-dir "numpy==2.0.2"

# Show real breakages (warnings here are ok)
# `|| true` keeps a non-zero `pip check` exit status from aborting the cell under `set -e`.
pip check || true


pyannote-metrics 4.0.0 has requirement numpy>=2.2.2, but you have numpy 2.0.2.
torchvision 0.20.1 has requirement torch==2.5.1, but you have torch 2.8.0.
google-colab 1.0.0 has requirement pandas==2.2.2, but you have pandas 2.3.2.
google-colab 1.0.0 has requirement requests==2.32.4, but you have requests 2.32.5.
tensorflow 2.19.0 has requirement protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3, but you have protobuf 6.32.1.
google-ai-generativelanguage 0.6.15 has requirement protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 6.32.1.
grpcio-status 1.71.2 has requirement protobuf<6.0dev,>=5.26.1, but you have protobuf 6.32.1.
cudf-cu12 25.6.0 has requirement pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.2.
bigframes 2.18.0 has requirement rich<14,>=12.4.4, but you have rich 14.1.0.
dask-cudf-cu12 25.6.0 has requirement pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.2.
datasets 4.0.0 has requirement fs

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyannote-metrics 4.0.0 requires numpy>=2.2.2, but you have numpy 2.0.2 which is incompatible.
torchvision 0.20.1 requires torch==2.5.1, but you have torch 2.8.0 which is incompatible.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.
tensorflow 2.19.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3, but you have protobuf 6.32.1 which is incompatible.
bigframes 2.18.0 requires rich<14,>=12.4.4, but you have rich 14.1.0 which is incompatible.
datasets 4.0.0 requires fsspec[http]<=2025.3.0,>=2023.1.0, but you have fsspec 2025.9.0 which is incompatible.
ERROR: pip's dependency resolver does not currently take into account all the packages 

In [3]:
# --- Print versions so we can diff future runs quickly ---
import sys, importlib, torch, numpy as np, pandas as pd
# Third-party packages whose versions are reported (dotted names are imported as given).
mods = ["faster_whisper", "ctranslate2", "onnxruntime", "whisperx", "pyannote.audio"]

# Interpreter and core numeric stack first.
for label, value in (
    ("Python:", sys.version.split()[0]),
    ("CUDA available:", torch.cuda.is_available()),
    ("Torch:", torch.__version__),
    ("NumPy:", np.__version__),
    ("Pandas:", pd.__version__),
):
    print(label, value)

# Then each ASR/diarization dependency; an import failure is reported, not raised.
for m in mods:
    try:
        version = getattr(importlib.import_module(m), "__version__", "git")
        print(f"{m:16s}", version)
    except Exception as e:
        print(f"{m:16s}", "not importable ->", e)


Python: 3.12.11
CUDA available: True
Torch: 2.8.0+cu128
NumPy: 2.0.2
Pandas: 2.3.2
faster_whisper   1.2.0
ctranslate2      4.4.0
onnxruntime      1.22.1
whisperx         git
pyannote.audio   not importable -> operator torchvision::nms does not exist


## 📥 Section 1 — Load the audio path from 02 (handoff)

In [4]:
import json, os

# Pointer file that FACTR_02 leaves behind for this notebook.
HANDOFF = "/content/drive/MyDrive/FATCR/data/processed/LAST_INGEST.json"
assert os.path.exists(HANDOFF), "LAST_INGEST.json not found (run FACTR_02 first)."

with open(HANDOFF, "r") as handoff_file:
    meta = json.load(handoff_file)

# The stored audio path is joined onto the repo root to obtain an absolute path.
repo_root = "/content/drive/MyDrive/FATCR"
AUDIO_PATH = os.path.join(repo_root, meta["audio_path"])
print("AUDIO_PATH:", AUDIO_PATH)

# Sanity check: the file must exist and be larger than 10,000 bytes.
audio_ok = os.path.exists(AUDIO_PATH) and os.path.getsize(AUDIO_PATH) > 10_000
assert audio_ok, "Bad AUDIO_PATH."



AUDIO_PATH: /content/drive/MyDrive/FATCR/data/processed/speFWRuuJNs_16k_mono.wav


## 🎙️ Section 2 — ASR with faster-whisper (OOM-safe)

In [None]:
from faster_whisper import WhisperModel
import torch, gc

HAS_CUDA = torch.cuda.is_available()
if HAS_CUDA:
    DEVICE = "cuda"
    compute_type = "int8_float16"  # conservative & OOM-friendly
    arch_candidates = ["medium.en", "small.en"]
else:
    DEVICE = "cpu"
    compute_type = "int8"
    arch_candidates = ["small.en"]

fw_model = None
last_err = None

# Try the candidates in order; the first one that loads is used.
for name in arch_candidates:
    print(f"‚Üí loading {name} in faster-whisper on {DEVICE} ({compute_type})")
    try:
        fw_model = WhisperModel(name, device=DEVICE, compute_type=compute_type)
    except Exception as e:
        print("‚ö†Ô∏è load failed:", e)
        last_err = e
        # Free whatever the failed attempt left behind before trying the next candidate.
        gc.collect()
        if HAS_CUDA:
            torch.cuda.empty_cache()
    else:
        break

if fw_model is None:
    raise RuntimeError(f"Could not load a faster-whisper model. Last error: {last_err}")

# Transcribe with VAD disabled (no extra model load); the language is fixed to English.
segments_gen, info = fw_model.transcribe(
    AUDIO_PATH,
    language="en",
    beam_size=5,
    vad_filter=False,
)

# Consume the segment iterable into plain dicts (start/end as float or None, stripped text).
asr_segments = [
    {
        "start": None if s.start is None else float(s.start),
        "end": None if s.end is None else float(s.end),
        "text": (s.text or "").strip(),
    }
    for s in segments_gen
]

print(f"ASR segments: {len(asr_segments)} | detected language: {info.language or 'en'}")


‚Üí loading medium.en in faster-whisper on cuda (int8_float16)


In [None]:
%%bash
set -euo pipefail

# Detect GPU
# Runs a separate Python process fed from the heredoc below; it prints a one-key JSON
# object ({"has_cuda": true|false}) to stdout. The value is only printed here, it is not
# stored in a notebook variable. (`sys` is imported but not used by the snippet.)
python - <<'PY'
import torch, json, sys
print(json.dumps({"has_cuda": torch.cuda.is_available()}))
PY


In [None]:
%%bash
set -euo pipefail
# Install the CPU builds explicitly
# NOTE(review): this runs unconditionally -- it does not look at the GPU probe in the
# previous cell -- and force-reinstalls torch/torchvision/torchaudio from the CPU wheel
# index. On a GPU runtime this presumably leaves torch without CUDA support, and an already
# running kernel likely needs a restart to pick up the new build; confirm before running.
pip install -q --force-reinstall --no-cache-dir \
  torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 \
  --index-url https://download.pytorch.org/whl/cpu


In [None]:
import torch
# Report the torch build first, then the two companion packages.
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
try:
    import torchaudio
    import torchvision
    for label, pkg in (("Torchaudio:", torchaudio), ("Torchvision:", torchvision)):
        print(label, pkg.__version__)
except Exception as e:
    # Any import problem is printed rather than raised so the cell always completes.
    print("Import error:", repr(e))


## 📏 Section 3 — (Optional) alignment with WhisperX

In [None]:
import whisperx

# Load the alignment model for the detected language (English when none was reported),
# then align the ASR segments against the audio file.
# NOTE(review): the next cell repeats this step with a guarded language lookup.
_align_language = info.language or "en"
align_model, metadata = whisperx.load_align_model(language_code=_align_language, device=DEVICE)
asr_aligned = whisperx.align(asr_segments, align_model, metadata, AUDIO_PATH, DEVICE)


In [None]:
import whisperx

# Language for alignment: `info.language` when `info` exists and reports one, else English.
try:
    language_code = getattr(globals().get("info"), "language", None) or "en"
except Exception:
    language_code = "en"

align_model, metadata = whisperx.load_align_model(language_code=language_code, device=DEVICE)

# Align the ASR segments against the audio file.
asr_aligned = whisperx.align(asr_segments, align_model, metadata, AUDIO_PATH, DEVICE)
print("‚úÖ Alignment complete with language:", language_code)


## 🗣️ Section 4 — Diarization with WhisperX

In [None]:
# If you have a HF token for diarization models, add it here (optional)
# NOTE(review): the diarization checkpoints may require an accepted licence and a token on
# Hugging Face; confirm that an empty token really works for the pipeline being loaded.
HUGGINGFACE_TOKEN = ""  # e.g. "hf_xxx"; leave empty to use public pipeline

# Locate DiarizationPipeline the same way the combined pipeline cell further down does:
# prefer the whisperx.diarize module and fall back to the package root.
try:
    from whisperx.diarize import DiarizationPipeline
except ImportError:
    from whisperx import DiarizationPipeline

if HUGGINGFACE_TOKEN:
    diar = DiarizationPipeline(device=DEVICE, use_auth_token=HUGGINGFACE_TOKEN)
else:
    diar = DiarizationPipeline(device=DEVICE)

# Run diarization on the audio file.
diar_out = diar(AUDIO_PATH)

# Assign speakers to aligned words/segments
asr_spk = whisperx.assign_word_speakers(diar_out, asr_aligned)
print("Diarization done.")


## 💾 Section 5 — Save as UTTERANCES.parquet

In [None]:
import pandas as pd, os

# The relative paths used from here on ("data/processed/...", and "snapshots/..." in the
# next cell) belong in the repo checkout on Drive: the hand-off input was read from there,
# and the push cell stages exactly these relative paths after chdir-ing into the repo.
# Without this the files land in the notebook's current working directory instead.
os.chdir(repo_root)

# One row per diarized segment; segments without a speaker label default to "SPEAKER_00".
rows = [
    {
        "video_id": os.path.basename(AUDIO_PATH),
        "t_start":  seg.get("start"),
        "t_end":    seg.get("end"),
        "speaker":  seg.get("speaker", "SPEAKER_00"),
        "text":     (seg.get("text") or "").strip(),
    }
    for seg in asr_spk["segments"]
]

# Naming the columns keeps the schema stable even when there are no segments at all
# (an empty list would otherwise give a frame with no columns, breaking later cells).
df_utts = pd.DataFrame(rows, columns=["video_id", "t_start", "t_end", "speaker", "text"])
os.makedirs("data/processed", exist_ok=True)
OUT_PARQUET = "data/processed/UTTERANCES.parquet"
df_utts.to_parquet(OUT_PARQUET, index=False)
print(f"‚úÖ wrote {OUT_PARQUET} with", len(df_utts), "rows")
df_utts.head()


## 6) Snapshot + hand-off pointer (JSON)

In [None]:
# === 6) Snapshot + hand-off pointer ===
import json, time, platform, os, sys
import torch, pandas as pd

# Basic metrics
# max() of an empty column is NaN, and `NaN or 0` stays NaN because NaN is truthy; that
# would end up in the snapshot as the non-standard JSON token NaN. Fall back explicitly.
_t_end_max = df_utts["t_end"].fillna(0).max()
dur_sec = float(_t_end_max) if pd.notna(_t_end_max) else 0.0
row_count = len(df_utts)

# Try to read versions safely
def safe_ver(modname):
    """Return the ``__version__`` of *modname*, or a placeholder string.

    Args:
        modname: Importable module name. Dotted names such as
            ``"pyannote.audio"`` are resolved to the named submodule itself.

    Returns:
        The module's ``__version__`` attribute; ``"git"`` when the module
        imports but defines no ``__version__``; ``"n/a"`` when importing it
        raises.
    """
    # importlib.import_module returns the module that was asked for, whereas
    # __import__("a.b") returns the top-level package "a" -- which made dotted
    # names report the wrong module's version.
    import importlib

    try:
        m = importlib.import_module(modname)
        return getattr(m, "__version__", "git")
    except Exception:
        # Deliberately broad: the version printout recorded earlier in this notebook
        # shows an import failing with an error that is not an ImportError.
        return "n/a"

# Run metadata written next to the results so future runs can be diffed against this one.
snap = {
    "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),  # UTC timestamp
    "platform": platform.platform(),
    "python": sys.version.split()[0],
    "cuda_available": torch.cuda.is_available(),
    "device": DEVICE,
    "compute_type": compute_type,
    # NOTE(review): `_model_name` is a private attribute; confirm the installed
    # faster-whisper actually sets it, otherwise this always records "unknown".
    "asr_model": getattr(fw_model, "_model_name", "unknown"),
    "whisperx": safe_ver("whisperx"),
    "faster_whisper": safe_ver("faster_whisper"),
    "ctranslate2": safe_ver("ctranslate2"),
    "onnxruntime": safe_ver("onnxruntime"),
    "pyannote_audio": safe_ver("pyannote.audio"),
    "audio_path": AUDIO_PATH,
    "utterances_parquet": OUT_PARQUET,
    "rows": row_count,
    "duration_sec": round(dur_sec, 2),
}

# One snapshot file per run, named by the current Unix time in seconds.
# The path is relative, so it is created under the current working directory.
os.makedirs("snapshots", exist_ok=True)
snap_path = f"snapshots/ASR_SNAPSHOT_{int(time.time())}.json"
with open(snap_path, "w") as f:
    json.dump(snap, f, indent=2)
print("üóÇÔ∏è Snapshot:", snap_path)

# Hand-off pointer for downstream notebooks
handoff = {
    "when": snap["ts"],
    "audio_path": AUDIO_PATH,
    "utterances_parquet": OUT_PARQUET,
    "rows": row_count,
    "note": "Use 'utterances_parquet' for FACTR_04_Claims+Embeddings",
}
# Overwritten on every run; the data/processed directory is expected to exist already
# (the parquet-saving cell creates it).
with open("data/processed/LAST_ASR.json", "w") as f:
    json.dump(handoff, f, indent=2)
print("üìù Wrote data/processed/LAST_ASR.json")


## 7) Optional: git-push helper (uses GITHUB_PAT in Colab Secrets)

This commits your notebook, the snapshot, and the hand-off JSON.
If the parquet is > 20 MB, it avoids pushing it and commits only the pointer to keep the repo lean.

In [None]:
# === 7) Optional push to GitHub (needs GITHUB_PAT in Colab Secrets) ===
from google.colab import userdata
import urllib.parse, subprocess, os, shlex

# Repo checkout on the mounted Drive. After the chdir, the git commands below run from
# this directory and relative paths (e.g. OUT_PARQUET) are resolved against it.
REPO_DIR = "/content/drive/MyDrive/FATCR"
os.chdir(REPO_DIR)

def run(cmd):
    """Echo a command line, then execute it without a shell.

    Args:
        cmd: Command line as one string. It is tokenised with ``shlex.split``
            and executed directly, so no shell is involved.

    Returns:
        The ``subprocess.CompletedProcess`` of the call. A non-zero exit
        status does not raise (``check=False``); inspect ``returncode``.
    """
    import re

    # Callers in this cell embed the GitHub PAT in remote URLs
    # (https://user:token@host/...). Mask the password part before echoing so the
    # secret is never written to the notebook output.
    shown = re.sub(r"(://[^/\s:@]+:)[^@/\s]+@", r"\1***@", cmd)
    print("$", shown)
    return subprocess.run(shlex.split(cmd), check=False)

print("üìÇ Repo status before push:")
run("git status -sb")

# NOTE(review): confirm how userdata.get behaves for a missing secret (return value vs.
# exception); the falsy check below only covers the "empty/None" case.
pat = userdata.get("GITHUB_PAT")
if not pat:
    print("‚ÑπÔ∏è No GITHUB_PAT in Colab Secrets ‚Äî skipping push.")
else:
    # The token is percent-encoded and embedded in the remote URL for this session only.
    enc_pat = urllib.parse.quote(pat, safe="")
    REMOTE_URL = f"https://LukmaanViscomi:{enc_pat}@github.com/LukmaanViscomi/FATCR.git"

    # Always pull latest (rebase) to reduce non-fast-forward issues
    print("\nüîÑ Pulling latest (rebase, autostash)‚Ä¶")
    pull = run(f"git pull --rebase --autostash {REMOTE_URL} main")

    if pull.returncode != 0:
        # Committing on top of a failed pull/rebase would only make things worse;
        # stop here and leave the working tree for manual inspection.
        print(f"\nPull failed (git exit code {pull.returncode}); skipping commit and push.")
    else:
        # Decide whether to add the parquet (skip if very large)
        parquet_size = os.path.getsize(OUT_PARQUET) if os.path.exists(OUT_PARQUET) else 0
        add_parquet = parquet_size <= 20 * 1024 * 1024  # 20 MB

        # Stage files
        paths = [
            "notebooks",                 # your notebooks
            "snapshots",                 # ASR snapshot
            "data/processed/LAST_ASR.json",
        ]
        if add_parquet:
            paths.append(OUT_PARQUET)    # add parquet if small enough
        else:
            print(f"‚ÑπÔ∏è {OUT_PARQUET} is {parquet_size/1e6:.1f} MB; skipping to keep repo small.")

        run("git add " + " ".join(shlex.quote(p) for p in paths) + " .gitignore")

        # Commit only if there are staged changes
        # (`git diff --cached --quiet` exits non-zero when the index differs from HEAD).
        changed = subprocess.run(["git", "diff", "--cached", "--quiet"]).returncode != 0
        if changed:
            msg = "ASR+diarize results (snapshot + pointer)"
            print("\n‚úèÔ∏è Committing:", msg)
            run(f'git commit -m "{msg}"')
            print("\n‚¨ÜÔ∏è Pushing to main‚Ä¶")
            push = run(f"git push {REMOTE_URL} HEAD:main")
            # run() never raises on failure, so report success only when git said so.
            if push.returncode == 0:
                print("\n‚úÖ Push complete.")
            else:
                print(f"\nPush failed (git exit code {push.returncode}); see git output above.")
        else:
            print("\n‚ÑπÔ∏è Nothing to commit.")


In [None]:
# Config (tune these safely)
# Plain namespace of pipeline settings, read as CFG.<NAME> by the pipeline cell below.
# (Documented with comments rather than a docstring so the vars(CFG) printout is unchanged.)
class CFG:
    # Requested device; the pipeline cell uses "cuda" only if CUDA is also available.
    DEVICE = "cuda"   # "cuda" or "cpu"
    # Model name handed to faster-whisper's WhisperModel.
    ASR_MODEL = "small.en"   # try "medium.en" later
    # Passed to transcribe() as chunk_length.
    CHUNK_LENGTH = 20        # seconds
    # Passed to transcribe() as beam_size.
    BEAM_SIZE = 3
    # Passed to transcribe() as word_timestamps.
    WORD_TIMESTAMPS = True
    # Toggles for the WhisperX alignment and diarization stages.
    USE_ALIGNMENT = True
    USE_DIARIZATION = True
    # Forwarded as use_auth_token to the diarization pipeline (None when empty).
    HUGGINGFACE_TOKEN = ""   # optional
# Prints the class namespace mapping (the settings plus the class's dunder entries).
print(vars(CFG))


In [None]:
# ASR -> Diarize -> Align -> Save parquet
import os, gc, torch, pandas as pd
from faster_whisper import WhisperModel
import whisperx

assert "AUDIO_PATH" in globals() and AUDIO_PATH and os.path.exists(AUDIO_PATH), "AUDIO_PATH missing (run Ingest)."

# Cap native threads to avoid RAM spikes
# NOTE(review): these are set after torch has already been imported in this cell;
# libraries that read them at import time may not pick them up -- confirm the effect.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

# Use the GPU only when it is both requested in CFG and actually available.
HAS_CUDA = torch.cuda.is_available() and (CFG.DEVICE == "cuda")
DEVICE = "cuda" if HAS_CUDA else "cpu"
compute_type = "int8_float16" if HAS_CUDA else "int8"

print(f"‚Üí loading {CFG.ASR_MODEL} in faster-whisper on {DEVICE} ({compute_type})")
fw = WhisperModel(CFG.ASR_MODEL, device=DEVICE, compute_type=compute_type)

segments_gen, info = fw.transcribe(
    AUDIO_PATH,
    language="en",
    beam_size=CFG.BEAM_SIZE,
    vad_filter=False,   # let WhisperX diarization handle VAD
    chunk_length=CFG.CHUNK_LENGTH,
    word_timestamps=CFG.WORD_TIMESTAMPS,
)

# Convert the segment iterable into plain dicts; word-level entries are copied over
# only when a segment carries them.
asr_segments = []
for s in segments_gen:
    seg = {"start": float(s.start) if s.start is not None else None,
           "end": float(s.end) if s.end is not None else None,
           "text": (s.text or "").strip()}
    if getattr(s, "words", None):
        seg["words"] = [{"start": float(w.start) if w.start is not None else None,
                         "end": float(w.end) if w.end is not None else None,
                         "word": w.word} for w in s.words]
    asr_segments.append(seg)
asr = {"segments": asr_segments, "language": (info.language or "en")}
print(f"ASR segments: {len(asr_segments)} | language: {asr['language']}")

# Diarization
if CFG.USE_DIARIZATION:
    # The class location differs between WhisperX versions; try the submodule first.
    try:
        from whisperx.diarize import DiarizationPipeline
    except Exception:
        from whisperx import DiarizationPipeline
    diar = DiarizationPipeline(device=DEVICE, use_auth_token=(CFG.HUGGINGFACE_TOKEN or None))
    diar_out = diar(AUDIO_PATH)
else:
    diar_out = None  # no speaker turns available

# Alignment
if CFG.USE_ALIGNMENT:
    try:
        align_model, metadata = whisperx.load_align_model(language_code=asr["language"], device=DEVICE)
        asr_aligned = whisperx.align(asr["segments"], align_model, metadata, AUDIO_PATH, DEVICE)
    except AttributeError:
        # Fallback for WhisperX builds that do not expose these helpers at package level.
        from whisperx.alignment import load_align_model, align
        align_model, metadata = load_align_model(language_code=asr["language"], device=DEVICE)
        asr_aligned = align(asr["segments"], align_model, metadata, AUDIO_PATH, DEVICE)
else:
    asr_aligned = {"segments": asr["segments"]}

# Assign speakers
# With diarization disabled there is nothing to merge: pass the transcript through
# unchanged and let every row fall back to "SPEAKER_00" below. (Previously a
# {"segments": []} placeholder was handed to assign_word_speakers, which expects the
# diarization pipeline's output.)
if diar_out is not None:
    asr_spk = whisperx.assign_word_speakers(diar_out, asr_aligned)
else:
    asr_spk = asr_aligned

rows = [{
    "video_id": os.path.basename(AUDIO_PATH),
    "t_start": s.get("start"),
    "t_end": s.get("end"),
    "speaker": s.get("speaker", "SPEAKER_00"),
    "text": (s.get("text") or "").strip(),
} for s in asr_spk["segments"]]

# Written relative to the current working directory.
df = pd.DataFrame(rows)
df.to_parquet("UTTERANCES.parquet", index=False)
print("‚úÖ wrote UTTERANCES.parquet with", len(df), "rows")
# display() exists only in notebook environments; fall back to plain text elsewhere.
try:
    display(df.head(10))
except Exception:
    print(df.head(10).to_string(index=False))


In [None]:
# Smoke test
import os, pandas as pd
# Columns every downstream consumer of the utterance table relies on.
required = {"video_id", "t_start", "t_end", "speaker", "text"}

assert os.path.exists("UTTERANCES.parquet"), "Missing UTTERANCES.parquet"
df = pd.read_parquet("UTTERANCES.parquet")

# Schema check, then a non-empty check.
missing = required - set(df.columns)
assert not missing, f"Missing cols: {missing}"
assert len(df) > 0, "No rows produced"
print("‚úÖ ASR+Diarize smoke test passed. Rows:", len(df))


In [None]:
# Snapshot
import json, time, os, subprocess, sys, torch, pandas as pd
# Record the model the pipeline cell was configured with; fall back to the previously
# hard-coded value when CFG has not been defined in this session.
_asr_model = getattr(globals().get("CFG"), "ASR_MODEL", "small.en")

snap = {
    "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    # NOTE(review): this records CUDA availability, not the DEVICE the pipeline cell
    # actually used (CFG.DEVICE can force "cpu") -- confirm which one is wanted.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "asr_model": _asr_model,
    # Ask pip through the running interpreter so the listing describes this kernel's
    # environment; only the first 150 lines are kept.
    "pip_freeze": subprocess.check_output(
        [sys.executable, "-m", "pip", "freeze"], text=True
    ).splitlines()[:150],
}
os.makedirs("snapshots", exist_ok=True)
# One file per run, named by the current Unix time in seconds.
p = f"snapshots/ASR_DIA_SNAPSHOT_{int(time.time())}.json"
with open(p, "w") as f:
    json.dump(snap, f, indent=2)
print("üì∏ Saved:", p)
