<a href="https://colab.research.google.com/github/JulzDave/transcriber/blob/main/transcriber_snapshot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Whisper straight from GitHub. %pip (rather than !pip) installs into
# the environment of the running kernel. The recorded run of this notebook
# resolved to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e.
%pip install -q git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-i22_sayk
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-i22_sayk
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# ffmpeg/ffprobe are needed to probe and split the audio file.
# apt-get is the script-friendly front end; -y answers the confirmation prompt
# that would otherwise abort the install in a non-interactive cell.
!sudo apt-get update -qq && sudo apt-get install -y -qq ffmpeg

[33m0% [Working][0m            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:6 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
49 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the input audio file and all output
# paths used by the cells below live under "/content/drive/My Drive".
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE(review): pydub is not imported by any cell visible in this notebook -
# confirm whether this install is still needed.
%pip install -q pydub



In [None]:
# tqdm provides the progress bar used by the transcription loop.
%pip install -q tqdm



This script takes a long audio file, splits it into 5-minute chunks, transcribes each chunk with OpenAI's Whisper model, and saves each chunk's transcription as an `.srt` subtitle file. Chunks are processed sequentially: `ffmpeg` extracts the audio for one chunk, Whisper transcribes it, and the temporary audio file is deleted before the next chunk starts.

In [None]:
import gc
import math
import os
import subprocess

import whisper
from tqdm import tqdm

# Constants
# Source audiobook on the mounted Google Drive.
input_file = "/content/drive/My Drive/Audiobooks/Howard Gardner - Frames of Mind (Unabridged)/Howard Gardner - Frames of Mind.m4b"
# Folder that receives the temporary chunk WAVs and the per-chunk SRT files.
output_dir = "/content/drive/My Drive/Chunks"
# Length of each chunk in seconds. The merge cell hard-codes the same
# 5-minute value when it shifts timestamps, so keep the two in sync.
chunk_length_sec = 5 * 60  # 5 minutes
# Whisper model, loaded once and reused for every chunk.
model = whisper.load_model("base")  # Try 'tiny' if memory is still tight

# Make sure output folder exists
os.makedirs(output_dir, exist_ok=True)

# Get duration of the audio using ffprobe
def get_audio_duration(filepath):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        filepath: Path of the audio/video file to probe.

    Returns:
        The container duration as a float number of seconds.

    Raises:
        RuntimeError: If ffprobe exits with a non-zero status (for example
            when the file is missing or unreadable). The message carries
            ffprobe's own stderr output.
        ValueError: If ffprobe succeeds but does not print a number.
    """
    result = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries",
         "format=duration", "-of",
         "default=noprint_wrappers=1:nokey=1", filepath],
        stdout=subprocess.PIPE,
        # Keep stderr separate so diagnostics never get mixed into the value
        # that is parsed as the duration.
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"ffprobe failed for {filepath!r}: {result.stderr.strip()}"
        )

    raw_duration = result.stdout.strip()
    try:
        return float(raw_duration)
    except ValueError:
        raise ValueError(
            f"ffprobe returned no usable duration for {filepath!r}: {raw_duration!r}"
        ) from None

# Probe the source once; the number of chunks is derived from this duration.
total_duration = get_audio_duration(input_file)
# Ceiling division: a trailing partial chunk still gets its own slot, but a
# duration that is an exact multiple of the chunk length no longer schedules
# an extra, empty chunk. Always process at least one chunk.
num_chunks = max(1, math.ceil(total_duration / chunk_length_sec))

def format_srt(segments):
    """Render Whisper segments as the text of an SRT subtitle file.

    Args:
        segments: Iterable of dicts with "start" and "end" (seconds) and
            "text" keys, as found in ``model.transcribe(...)["segments"]``.

    Returns:
        A string with one numbered SRT block per segment (numbering starts
        at 1), or an empty string when there are no segments.
    """
    blocks = []
    for i, segment in enumerate(segments, 1):
        # SRT requires HH:MM:SS,mmm. format_timestamp's defaults produce
        # MM:SS.mmm, so request the hours field and the comma explicitly.
        start = whisper.utils.format_timestamp(
            segment["start"], always_include_hours=True, decimal_marker=","
        )
        end = whisper.utils.format_timestamp(
            segment["end"], always_include_hours=True, decimal_marker=","
        )
        # Drop surrounding whitespace so subtitle lines do not start with a
        # stray space.
        text = segment["text"].strip()
        blocks.append(f"{i}\n{start} --> {end}\n{text}\n\n")
    return "".join(blocks)

# Process each chunk separately: extract it to a temporary WAV, transcribe it,
# write its SRT, then delete the WAV before moving on.
for i in tqdm(range(num_chunks), desc="Processing chunks", unit="chunk"):
    start = i * chunk_length_sec
    chunk_file = os.path.join(output_dir, f"chunk_{i}.wav")
    srt_path = os.path.join(output_dir, f"chunk_{i}.srt")

    try:
        # Extract audio chunk using ffmpeg. Only error-level messages are
        # logged so that stderr can be shown verbatim if extraction fails.
        extraction = subprocess.run([
            "ffmpeg", "-y",
            "-hide_banner", "-loglevel", "error",
            "-ss", str(start),
            "-t", str(chunk_length_sec),
            "-i", input_file,
            "-ar", "16000",  # downsample
            "-ac", "1",      # mono
            "-f", "wav", chunk_file
        ], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE,
            text=True, errors="replace")
        if extraction.returncode != 0:
            # Fail loudly instead of handing Whisper a missing or broken file.
            raise RuntimeError(
                f"ffmpeg failed on chunk {i} (start={start}s): "
                f"{extraction.stderr.strip()}"
            )

        # Transcribe using Whisper
        result = model.transcribe(chunk_file)

        # Save SRT
        with open(srt_path, "w", encoding="utf-8") as f:
            f.write(format_srt(result["segments"]))
    finally:
        # Clean up even when extraction or transcription raised, so no
        # temporary WAV is left behind in the output folder.
        if os.path.exists(chunk_file):
            os.remove(chunk_file)
        gc.collect()

print("✅ All chunks processed successfully.")


Processing chunks: 100%|██████████| 243/243 [5:49:26<00:00, 86.28s/chunk]

✅ All chunks processed successfully.





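Merge all the per-chunk SRT files into a single SRT file, renumbering the subtitles and shifting each chunk's timestamps by that chunk's position in the recording.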

In [None]:
import os
import re
from datetime import timedelta, datetime

# Folder holding the per-chunk .srt files to merge.
chunks_dir = "/content/drive/My Drive/Chunks"
# Destination of the single merged subtitle file.
merged_srt_path = "/content/drive/My Drive/merged_output.srt"

# Helper: read an SRT file and split it into subtitle blocks
def parse_srt(path):
    """Return the blank-line-separated blocks of the SRT file at ``path``.

    The file is read as UTF-8 and stripped of leading/trailing whitespace
    before splitting, so an empty file yields ``[""]``.
    """
    with open(path, "r", encoding="utf-8") as srt_file:
        raw_text = srt_file.read()
    return re.split(r"\n\s*\n", raw_text.strip())

# Helper: Parse timestamp string to timedelta
def timestamp_to_timedelta(ts):
    """Convert a subtitle timestamp string into a ``timedelta``.

    Accepts ``[HH:]MM:SS`` followed by ``,`` or ``.`` and 1-6 fractional
    digits, so both ``HH:MM:SS,mmm`` and ``MM:SS.mmm`` parse. The hours field
    may exceed 23, and surrounding whitespace is ignored.

    Args:
        ts: Timestamp text such as ``"00:01:02,500"`` or ``"01:02.500"``.

    Returns:
        The offset from zero as a ``timedelta``.

    Raises:
        ValueError: If ``ts`` does not match the accepted layout, or its
            minutes/seconds field is outside 0-59.
    """
    match = re.fullmatch(
        r"(?:(\d+):)?(\d{1,2}):(\d{1,2})[.,](\d{1,6})", ts.strip()
    )
    if match is None:
        raise ValueError(f"Unrecognized timestamp format: {ts}")

    hours, minutes, seconds, fraction = match.groups()
    minutes = int(minutes)
    seconds = int(seconds)
    if minutes > 59 or seconds > 59:
        raise ValueError(f"Unrecognized timestamp format: {ts}")

    return timedelta(
        hours=int(hours or 0),
        minutes=minutes,
        seconds=seconds,
        # Right-pad the fraction to microseconds: "5" -> 500000, "250" -> 250000.
        microseconds=int(fraction.ljust(6, "0")),
    )

# Helper: render a timedelta as an SRT timestamp (HH:MM:SS,mmm)
def timedelta_to_timestamp(td):
    """Format ``td`` as ``HH:MM:SS,mmm``.

    Hours are zero-padded to two digits and grow wider when needed; any
    sub-millisecond remainder is truncated.
    """
    whole_seconds = int(td.total_seconds())
    hh, leftover = divmod(whole_seconds, 3600)
    mm, ss = divmod(leftover, 60)
    millis = td.microseconds // 1000
    return f"{hh:02}:{mm:02}:{ss:02},{millis:03}"

# Collect the chunk SRT files together with the chunk number in each name.
# Only names of the form chunk_<N>.srt are merged; anything else in the folder
# is ignored rather than crashing the sort.
chunk_name_pattern = re.compile(r"chunk_(\d+)\.srt")
indexed_srt_files = []
for name in os.listdir(chunks_dir):
    name_match = chunk_name_pattern.fullmatch(name)
    if name_match:
        indexed_srt_files.append((int(name_match.group(1)), name))
indexed_srt_files.sort()
srt_files = [name for _, name in indexed_srt_files]

# Merge with proper numbering and adjusted timestamps
all_blocks = []
counter = 1
# Must match the chunk length used when the audio was split.
chunk_duration = timedelta(minutes=5)

for chunk_index, filename in indexed_srt_files:
    # Offset by the chunk number taken from the file name, not by the position
    # in the listing, so a missing chunk does not shift every later subtitle.
    time_offset = chunk_duration * chunk_index
    blocks = parse_srt(os.path.join(chunks_dir, filename))

    for block in blocks:
        lines = block.strip().split("\n")
        if len(lines) >= 2:
            # Adjust timestamps
            start_ts, separator, end_ts = lines[1].partition(" --> ")
            if not separator:
                raise ValueError(
                    f"Malformed timestamp line in {filename}: {lines[1]!r}"
                )
            new_start = timedelta_to_timestamp(timestamp_to_timedelta(start_ts) + time_offset)
            new_end = timedelta_to_timestamp(timestamp_to_timedelta(end_ts) + time_offset)

            lines[0] = str(counter)  # Replace the subtitle index
            counter += 1
            lines[1] = f"{new_start} --> {new_end}"

            all_blocks.append("\n".join(lines))

# Write to final SRT (terminated by a newline when there is any content)
with open(merged_srt_path, "w", encoding="utf-8") as f:
    f.write("\n\n".join(all_blocks) + ("\n" if all_blocks else ""))

print(f"✅ Merged {len(srt_files)} SRT files into: {merged_srt_path}")


✅ Merged 243 SRT files into: /content/drive/My Drive/merged_output.srt


In [None]:
import shutil
import os

chunks_path = "/content/drive/My Drive/Chunks"

# Remove the per-chunk working folder and everything in it; report instead of
# failing when the folder is already gone.
if not os.path.exists(chunks_path):
    print(f"⚠️ Folder not found: {chunks_path}")
else:
    shutil.rmtree(chunks_path)
    print(f"✅ Deleted folder and all contents: {chunks_path}")


✅ Deleted folder and all contents: /content/drive/My Drive/Chunks
