# local-transcribe · Main Notebook

This notebook runs **offline** transcription using models cached in `./models/`.

It supports two modes:
- **Dual-track**: clean merge from interviewer + participant files (no diarization)
- **Combined**: single mixed audio (ASR + diarization)

Run cells top-to-bottom. If anything fails, check the **Setup & Checks** cell first.

In [None]:
# === Parameters (edit these) ===
# Every name in this cell is read by later cells, so run it before anything else
# and re-run it (plus the validation cell) after changing a value.

# Pipeline selector: "dual" activates the dual-track cell, "combined" activates
# the mixed-audio cell. The validation cell rejects any other value.
MODE = "dual"  # 'dual' or 'combined'
# Passed as asr_model= to transcribe_with_alignment(); the validation cell
# accepts only the two names listed here.
ASR_MODEL = "medium.en"  # 'medium.en' or 'large-v3-turbo'

# Absolute paths strongly recommended
# The values below are placeholders. The validation cell raises
# FileNotFoundError when a path required by the selected MODE does not exist.
INTERVIEWER_PATH = "/absolute/path/to/interviewer.m4a"  # used if MODE='dual'
PARTICIPANT_PATH = "/absolute/path/to/participant.m4a"  # used if MODE='dual'

COMBINED_PATH = "/absolute/path/to/mixed.m4a"           # used if MODE='combined'

# Where outputs go (will be created if missing)
# Handed to ensure_session_dirs() in the validation cell.
OUTPUT_DIR = "/absolute/path/to/output/session_folder"

# Export options
# When True, the pipeline cells also write subtitles.vtt next to subtitles.srt.
WRITE_VTT = False
# When True, the pipeline cells call render_black_video() to produce
# black_subtitled.mp4 from the generated SRT file.
RENDER_BLACK = False  # burn subtitles to a black video (requires ffmpeg)


In [None]:
# === Setup & Checks ===
# Locates the repository root, makes <repo>/src importable, points the model
# caches at <repo>/models, verifies that directory exists, and finally imports
# the pipeline helpers used by the cells below.
import os
import pathlib
import sys

# When the kernel was started inside a directory named "notebooks", the repo
# root is its parent; otherwise the current working directory is used as-is.
repo_root = pathlib.Path.cwd().resolve().parent if (pathlib.Path.cwd().name == "notebooks") else pathlib.Path.cwd().resolve()

# Guard against duplicates: re-running this cell must not keep growing sys.path.
src_dir = str(repo_root / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

models_dir = repo_root / "models"

# Force offline for runtime (models must already exist in ./models)
# NOTE: setdefault() keeps any value already exported in the environment, so a
# pre-existing HF_HUB_OFFLINE / cache variable wins over the defaults below.
# These must be set BEFORE the project imports further down, because the
# libraries they pull in may read the environment at import time.
os.environ.setdefault("HF_HOME", str(models_dir))
os.environ.setdefault("TRANSFORMERS_CACHE", str(models_dir))
os.environ.setdefault("PYANNOTE_CACHE", str(models_dir / "diarization"))
os.environ.setdefault("XDG_CACHE_HOME", str(models_dir / ".xdg"))
os.environ.setdefault("HF_HUB_OFFLINE", "1")

# Raise explicitly instead of using `assert`: assertions are removed when Python
# runs with -O, and a FileNotFoundError describes the problem more precisely.
# is_dir() also rejects a stray regular file that happens to be named "models".
if not models_dir.is_dir():
    raise FileNotFoundError("models/ not found. Run scripts/download_models.py first.")

print("Repo root:", repo_root)
print("Models dir:", models_dir)
print("MODE:", MODE, "ASR_MODEL:", ASR_MODEL)
print("OUTPUT_DIR:", OUTPUT_DIR)

# Project modules; resolved through the <repo>/src entry added to sys.path above.
from session import ensure_session_dirs
from audio_io import standardize_and_get_path
from asr import transcribe_with_alignment
from turns import build_turns
from merge import merge_turn_streams
from srt_vtt import write_srt, write_vtt
from txt_writer import write_timestamped_txt, write_plain_txt
from render_black import render_black_video
from diarize import diarize_mixed


In [None]:
# === Validate parameters & create session dirs ===
# All parameters are checked BEFORE ensure_session_dirs() is called, so a typo
# in MODE / ASR_MODEL or a wrong input path fails fast without touching the
# output location.
#
# Raises:
#   ValueError        - MODE or ASR_MODEL is not one of the supported values.
#   FileNotFoundError - an input required by the selected MODE is not a file.
if MODE not in ("dual", "combined"):
    raise ValueError("MODE must be 'dual' or 'combined'")
if ASR_MODEL not in ("medium.en", "large-v3-turbo"):
    raise ValueError("ASR_MODEL must be 'medium.en' or 'large-v3-turbo'")

# Collect the inputs the selected mode needs, keyed by parameter name so the
# error message can point at the exact parameter to fix.
if MODE == "dual":
    required_inputs = {
        "INTERVIEWER_PATH": INTERVIEWER_PATH,
        "PARTICIPANT_PATH": PARTICIPANT_PATH,
    }
    requirement = "Dual mode requires valid INTERVIEWER_PATH and PARTICIPANT_PATH"
else:  # MODE == "combined" (guaranteed by the check above)
    required_inputs = {"COMBINED_PATH": COMBINED_PATH}
    requirement = "Combined mode requires valid COMBINED_PATH"

# is_file() rather than exists(): a directory is not a usable audio input.
invalid_inputs = [
    f"{param_name}={param_value!r}"
    for param_name, param_value in required_inputs.items()
    if not pathlib.Path(param_value).is_file()
]
if invalid_inputs:
    raise FileNotFoundError(f"{requirement} (not a file: {', '.join(invalid_inputs)})")

# Parameters are valid: now it is safe to prepare the session directories.
out_paths = ensure_session_dirs(OUTPUT_DIR)
print("Output paths:", out_paths)


In [None]:
# === Dual-track pipeline (skip if MODE='combined') ===
# Processes the interviewer and participant recordings separately, merges the
# resulting turns, and writes per-speaker and merged artifacts under out_paths.
if MODE == "dual":
    # Step 1: standardized copies of both inputs
    # (convert to WAV 16k mono on temp paths if needed)
    std_int_path = standardize_and_get_path(INTERVIEWER_PATH)
    std_part_path = standardize_and_get_path(PARTICIPANT_PATH)

    # Step 2: word-level transcription + alignment, one call per track
    interviewer_words = transcribe_with_alignment(
        std_int_path,
        asr_model=ASR_MODEL,
        role="Interviewer",
    )
    participant_words = transcribe_with_alignment(
        std_part_path,
        asr_model=ASR_MODEL,
        role="Participant",
    )

    # Step 3: group each track's words into turns
    interviewer_turns = build_turns(interviewer_words, speaker_label="Interviewer")
    participant_turns = build_turns(participant_words, speaker_label="Participant")

    # Step 4: interleave both turn streams (merge by time with overlap handling)
    merged_turns = merge_turn_streams(interviewer_turns, participant_turns)

    # Steps 5 + 6: text artifacts, written in this exact order
    # (per-speaker files first, merged transcript last).
    text_outputs = (
        (write_timestamped_txt, interviewer_turns, "speaker_interviewer", "interviewer.timestamped.txt"),
        (write_plain_txt, interviewer_turns, "speaker_interviewer", "interviewer.txt"),
        (write_timestamped_txt, participant_turns, "speaker_participant", "participant.timestamped.txt"),
        (write_plain_txt, participant_turns, "speaker_participant", "participant.txt"),
        (write_timestamped_txt, merged_turns, "merged", "transcript.timestamped.txt"),
        (write_plain_txt, merged_turns, "merged", "transcript.txt"),
    )
    for emit, turn_list, dir_key, file_name in text_outputs:
        emit(turn_list, out_paths[dir_key] / file_name)

    # Subtitles: SRT always, VTT and the black video only when enabled.
    merged_dir = out_paths["merged"]
    srt_path = merged_dir / "subtitles.srt"
    write_srt(merged_turns, srt_path)
    if WRITE_VTT:
        write_vtt(merged_turns, merged_dir / "subtitles.vtt")
    if RENDER_BLACK:
        # The interviewer track is used as the video's audio; you can swap audio_path.
        render_black_video(
            srt_path,
            merged_dir / "black_subtitled.mp4",
            audio_path=std_int_path,
        )

    print("Dual-track processing complete.")


In [None]:
# === Combined (mixed) pipeline (skip if MODE='dual') ===
# Transcribes the single mixed recording, assigns speakers via diarization, and
# writes the merged artifacts under out_paths["merged"].
if MODE == "combined":
    # Step 1: standardized copy of the mixed input
    std_mix_path = standardize_and_get_path(COMBINED_PATH)

    # Step 2: transcription + alignment over the whole file (no fixed role)
    words = transcribe_with_alignment(
        std_mix_path,
        asr_model=ASR_MODEL,
        role=None,
    )

    # Step 3: diarize into speakers (pyannote)
    diarized_turns = diarize_mixed(std_mix_path, words)

    # Step 4: text artifacts, timestamped transcript first, plain text second
    text_outputs = (
        (write_timestamped_txt, "transcript.timestamped.txt"),
        (write_plain_txt, "transcript.txt"),
    )
    for emit, file_name in text_outputs:
        emit(diarized_turns, out_paths["merged"] / file_name)

    # Subtitles: SRT always, VTT and the black video only when enabled.
    merged_dir = out_paths["merged"]
    srt_path = merged_dir / "subtitles.srt"
    write_srt(diarized_turns, srt_path)
    if WRITE_VTT:
        write_vtt(diarized_turns, merged_dir / "subtitles.vtt")
    if RENDER_BLACK:
        render_black_video(
            srt_path,
            merged_dir / "black_subtitled.mp4",
            audio_path=std_mix_path,
        )

    print("Combined (mixed) processing complete.")


In [None]:
# === Summary ===
# Lists every file below the session root, in sorted order, as a path relative
# to that root.
from pathlib import Path

session_root = out_paths["root"]
print("Artifacts in:", session_root)

# Lazy filter: directories are skipped, files are printed as they are reached.
artifact_files = (
    entry
    for entry in sorted(Path(session_root).rglob("*"))
    if entry.is_file()
)
for artifact in artifact_files:
    print(" -", artifact.relative_to(session_root))
