<a href="https://colab.research.google.com/github/Jamie643/Whisperer/blob/main/notebooks/LibriSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# üîπ Cell 1: Install Dependencies


In [1]:
# Install the necessary libraries for Whisper transcription and audio processing
!pip install -q openai-whisper pydub
!pip install torchaudio jiwer
!apt-get install ffmpeg

[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m [32m798.7/803.2 kB[0m [31m27.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m803.2/803.2 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-c

# üîπ Cell 2: Imports & Setup

In [None]:
import whisper
import os
import pandas as pd
import torch  # <--- Added this to fix the NameError
import time

# --- Configuration ---
model_name = "small"
model = whisper.load_model(model_name)

# --- Path to your Video/Audio ---
audio_file_path = "/content/MostIsolatedCity_In_The_World.mp3"

if not os.path.exists(audio_file_path):
    print(f"‚ùå Error: File not found at '{audio_file_path}'. Please check the file name in the left sidebar.")
else:
    print(f"üöÄ Transcribing with '{model_name}' model... (Word-level timestamps enabled)")
    start_time = time.time()

    # --- PRO TRANSCRIPTION ---
    # fp16=True uses the GPU (T4) to make this much faster
    result = model.transcribe(
        audio_file_path,
        fp16=True if torch.cuda.is_available() else False,
        word_timestamps=True,
        initial_prompt="Arthritis, health, medical discussion, Diary of a CEO."
    )

    whisper_output = result["segments"]

    end_time = time.time()
    elapsed = end_time - start_time
    print(f"‚úÖ Transcription complete in {elapsed:.2f} seconds.")

üöÄ Transcribing with 'small' model... (Word-level timestamps enabled)


# üîπ Cell 3: Upload Audio File

In [3]:
import json
import re

def create_formatted_transcript(whisper_output):
    """
    Enhanced version: Adds word-level chunking for high-impact
    YouTube captions while keeping structured sections.
    """
    if not whisper_output:
        print("Whisper output is empty.")
        return []

    formatted_transcript = []
    current_section = {
        "heading": "Introduction",
        "parts": []
    }

    new_section_threshold = 15
    prev_end = 0

    for i, segment in enumerate(whisper_output):
        # Section break logic
        if i > 0 and segment['start'] - prev_end > new_section_threshold:
            formatted_transcript.append(current_section)
            current_section = {
                "heading": f"Section {len(formatted_transcript) + 1}",
                "parts": []
            }

        # Timestamp formatting
        start_seconds = int(segment['start'])
        timestamp_str = f"{start_seconds // 60}:{start_seconds % 60:02d}"

        # --- PRO ADDITION: Word-Level Chunking ---
        # We extract individual words to allow for 'Hormozi' style rendering
        words_data = []
        if 'words' in segment:
            for w in segment['words']:
                words_data.append({
                    "word": w['word'].strip().upper(), # Uppercase is more 'Pro'
                    "start": round(w['start'], 2),
                    "end": round(w['end'], 2)
                })

        current_section["parts"].append({
            "timestamp": timestamp_str,
            "text": segment['text'].strip(),
            "words": words_data # This allows for word-by-word highlighting
        })

        prev_end = segment['end']

    if current_section["parts"]:
        formatted_transcript.append(current_section)

    return formatted_transcript

# Process and save
formatted_data = create_formatted_transcript(whisper_output)

output_filename = 'pro_transcript.json'
with open(output_filename, 'w') as f:
    json.dump(formatted_data, f, indent=2)

print(f"‚úÖ Pro transcript with word-level timing saved to '{output_filename}'")

NameError: name 'whisper_output' is not defined

# **Cell 4**

In [None]:
from docx import Document
import json
from google.colab import files

def format_timestamp_srt(seconds):
    """Converts seconds to SRT timestamp format: HH:MM:SS,mmm"""
    td_hours = int(seconds // 3600)
    td_mins = int((seconds % 3600) // 60)
    td_secs = int(seconds % 60)
    td_millis = int((seconds % 1) * 1000)
    return f"{td_hours:02d}:{td_mins:02d}:{td_secs:02d},{td_millis:03d}"

# Load the data
output_filename = 'pro_transcript.json' # Using the improved JSON name from Cell 3
try:
    with open(output_filename, 'r') as f:
        formatted_data = json.load(f)
except FileNotFoundError:
    print("‚ùå Error: JSON file not found.")
    formatted_data = None

if formatted_data:
    # --- 1. Create the Word Document (For Reading) ---
    doc = Document()
    doc.add_heading('Video Transcript & Content Breakdown', 0)

    # --- 2. Create the SRT File (For Video Editing) ---
    srt_content = []
    counter = 1

    for section in formatted_data:
        doc.add_heading(section.get("heading", "Section"), level=1)

        for part in section.get("parts"):
            # Add to Word Doc
            doc.add_paragraph(f'[{part["timestamp"]}] {part["text"]}')

            # Add to SRT (Using segments for readability)
            # A 'Pro' move is to use the word-level data to create shorter SRT blocks
            start_srt = format_timestamp_srt(part.get("start", 0)) # Ensure Cell 3 passes 'start'
            end_srt = format_timestamp_srt(part.get("end", 0))

            srt_content.append(f"{counter}\n{start_srt} --> {end_srt}\n{part['text']}\n")
            counter += 1

    # Save files
    word_name = 'transcript_client_copy.docx'
    srt_name = 'video_captions_upload_me.srt'

    doc.save(word_name)
    with open(srt_name, 'w', encoding='utf-8') as f:
        f.writelines(srt_content)

    print(f"‚úÖ Created: {word_name} AND {srt_name}")

    # Download both
    for file in [word_name, srt_name]:
        try:
            files.download(file)
        except:
            print(f"Manual download required for {file}")