In [None]:
# Install the Python packages imported by later cells (faster_whisper, pydub);
# -q keeps pip's output short.
!pip install -q faster-whisper pydub
# Install the ffmpeg system package (presumably what pydub uses to decode the
# input audio -- confirm). Only stdout is discarded, so errors are still shown.
!apt-get -y install ffmpeg > /dev/null

In [None]:
import torch
from faster_whisper import WhisperModel

# Choose where inference runs and the matching compute type for that device.
if torch.cuda.is_available():
    device, compute_type = "cuda", "int8_float16"  # good mix of speed + quality
else:
    device, compute_type = "cpu", "int8"           # CPU will be slow, but works
print("Using CUDA GPU" if device == "cuda" else "Using CPU (this will be slow)")

# Load the large-v3 Whisper model once; later cells reuse the global `model`.
model = WhisperModel("large-v3", device=device, compute_type=compute_type)

print("Model loaded.")

In [None]:
from pydub import AudioSegment
from pathlib import Path
import json

def make_chunks_with_overlap(
    audio_path: str,
    chunk_minutes: int = 5,
    overlap_seconds: int = 30,
) -> list:
    """
    Split an audio file into consecutive chunks that overlap each other.

    Args:
        audio_path: Path of the audio file; loaded with
            ``AudioSegment.from_file``.
        chunk_minutes: Maximum length of each chunk, in minutes. Must be > 0.
        overlap_seconds: How many seconds before the end of the previous chunk
            the next chunk starts. Must be >= 0 and strictly shorter than the
            chunk length, otherwise the chunk start could never advance.

    Returns:
        List of ``(chunk_audio_segment, chunk_start_time_sec)`` tuples in
        playback order. The last chunk may be shorter than ``chunk_minutes``.
        An audio of zero length yields an empty list.

    Raises:
        ValueError: If ``chunk_minutes`` is not positive, or if
            ``overlap_seconds`` is negative or not shorter than the chunk.
    """
    chunk_ms = chunk_minutes * 60 * 1000
    overlap_ms = overlap_seconds * 1000

    # Validate before decoding the audio: with these values the loop below
    # would either never terminate or silently skip audio between chunks.
    if chunk_ms <= 0:
        raise ValueError(f"chunk_minutes must be > 0, got {chunk_minutes!r}")
    if overlap_ms < 0:
        raise ValueError(f"overlap_seconds must be >= 0, got {overlap_seconds!r}")
    if overlap_ms >= chunk_ms:
        raise ValueError(
            f"overlap_seconds ({overlap_seconds!r}) must be shorter than the "
            f"chunk length ({chunk_minutes!r} min)"
        )

    audio = AudioSegment.from_file(audio_path)
    total_ms = len(audio)  # length of the loaded audio, in milliseconds

    chunks = []
    start_ms = 0

    while start_ms < total_ms:
        end_ms = min(start_ms + chunk_ms, total_ms)
        chunks.append((audio[start_ms:end_ms], start_ms / 1000.0))

        # The chunk that reaches the end of the audio is the last one.
        if end_ms >= total_ms:
            break

        # Next chunk re-reads the final `overlap_ms` of this one.
        start_ms = end_ms - overlap_ms

    return chunks

In [None]:
def transcribe_with_5min_chunks(
    audio_path: str,
    chunk_minutes: int = 5,
    overlap_seconds: int = 30,
    prompt_tail_chars: int = 300,
):
    """
    Translate a long audio file to English, one overlapping chunk at a time.

    The audio is split with ``make_chunks_with_overlap`` and every chunk is
    passed to the notebook-global ``model`` (``task="translate"``). Each
    chunk's language info and full transcript are printed as soon as the chunk
    has been processed.

    Args:
        audio_path: Path of the audio file to process.
        chunk_minutes: Chunk length in minutes (forwarded to the splitter).
        overlap_seconds: Overlap between consecutive chunks in seconds
            (forwarded to the splitter, and used to drop duplicated segments).
        prompt_tail_chars: Number of trailing characters of the previous
            chunk's text passed as ``initial_prompt`` to the next chunk.
            A value <= 0 disables the prompt carry-over.

    Returns:
        Tuple ``(full_text, all_segments, language_per_chunk)``:
          * ``full_text``: concatenated text of all kept segments.
          * ``all_segments``: one dict per kept segment with keys
            ``chunk_index``, ``start``, ``end`` (seconds from the start of the
            whole audio), ``text``, ``language``, ``language_probability``.
          * ``language_per_chunk``: one dict per chunk with keys
            ``chunk_index``, ``start_time_sec``, ``language``,
            ``language_probability``.
    """
    # Function-scope imports keep this cell runnable on its own.
    import os
    import tempfile

    chunks = make_chunks_with_overlap(
        audio_path,
        chunk_minutes=chunk_minutes,
        overlap_seconds=overlap_seconds,
    )

    print(f"Total chunks: {len(chunks)}")

    full_text_parts = []
    all_segments = []
    language_per_chunk = []

    prev_tail = ""  # text handed to the next chunk as initial_prompt

    # All chunk audio goes through one file in a private temp directory, which
    # is removed when the block exits (also on error). This avoids leaving one
    # WAV per chunk behind in /tmp and avoids clashes between concurrent runs.
    with tempfile.TemporaryDirectory(prefix="whisper_chunks_") as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "chunk.wav")

        for idx, (chunk_audio, chunk_start_sec) in enumerate(chunks):
            print(f"\n========== Chunk {idx+1}/{len(chunks)} | starts at {chunk_start_sec:.1f}s ==========")

            # Overwrites the previous chunk's file; that chunk's segments were
            # fully consumed by the end of the previous iteration.
            chunk_audio.export(tmp_path, format="wav")

            segments, info = model.transcribe(
                tmp_path,
                task="translate",                 # any language -> English
                beam_size=5,
                vad_filter=True,
                condition_on_previous_text=True,  # long-form behavior within chunk
                initial_prompt=prev_tail or None,
            )

            language_per_chunk.append(
                {
                    "chunk_index": idx,
                    "start_time_sec": float(chunk_start_sec),
                    "language": info.language,
                    "language_probability": float(info.language_probability),
                }
            )
            print(
                f"Detected language for this chunk: {info.language} "
                f"(prob={info.language_probability:.2f})"
            )

            chunk_text_for_print = []   # every segment of this chunk
            chunk_text_for_merge = []   # only segments kept in the global transcript

            for seg in segments:
                text = seg.text
                chunk_text_for_print.append(text)

                # Segments that START inside the leading overlap of a later
                # chunk are dropped, on the assumption that the previous chunk
                # already covered them.
                # NOTE(review): a segment that starts inside the overlap but
                # extends past it is dropped as a whole -- confirm acceptable.
                if idx > 0 and seg.start < overlap_seconds:
                    continue

                chunk_text_for_merge.append(text)
                all_segments.append(
                    {
                        "chunk_index": idx,
                        # seg.start/seg.end are relative to the chunk; shift
                        # them onto the timeline of the whole audio.
                        "start": float(chunk_start_sec + seg.start),
                        "end": float(chunk_start_sec + seg.end),
                        "text": text,
                        "language": info.language,
                        "language_probability": float(info.language_probability),
                    }
                )

            chunk_text_print = "".join(chunk_text_for_print)
            chunk_text_merge = "".join(chunk_text_for_merge)

            # Show the complete chunk transcript, overlap included.
            print(chunk_text_print.strip())

            full_text_parts.append(chunk_text_merge)

            # Prompt for the next chunk: prefer the kept text, fall back to the
            # printed text when everything was in the overlap, and keep the old
            # prompt when the chunk produced no text at all.
            tail_source = chunk_text_merge or chunk_text_print
            if tail_source:
                # Guard the slice: text[-0:] would be the WHOLE text, not "".
                prev_tail = tail_source[-prompt_tail_chars:] if prompt_tail_chars > 0 else ""

    full_text = "".join(full_text_parts)
    return full_text, all_segments, language_per_chunk

In [None]:

# ==== RUN IT ====
audio_file = '/content/drive/MyDrive/R20251107205247.WAV'

# Translate the recording chunk by chunk; the keyword values are the knobs to tune.
full_text, segments, language_per_chunk = transcribe_with_5min_chunks(
    audio_file,
    chunk_minutes=5,
    overlap_seconds=30,
    prompt_tail_chars=300,   # you can tweak this
)

# Persist the merged transcript as plain UTF-8 text.
with open("transcript_en_merged.txt", "w", encoding="utf-8") as out_file:
    out_file.write(full_text)

# Persist the timestamped segments and the per-chunk language metadata as
# pretty-printed JSON, keeping non-ASCII characters unescaped.
for out_name, payload in (
    ("transcript_en_segments.json", segments),
    ("language_per_chunk.json", language_per_chunk),
):
    with open(out_name, "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, ensure_ascii=False, indent=2)

print("\nSaved transcript_en_merged.txt, transcript_en_segments.json, language_per_chunk.json")
