<a href="https://colab.research.google.com/github/Jamie643/Whisperer/blob/main/notebooks/LibriSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🔹 Cell 1: Install Dependencies


In [None]:
# Install the necessary libraries for Whisper transcription and audio processing
!pip install -q openai-whisper pydub
!pip install torchaudio jiwer
!apt-get install ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


# 🔹 Cell 2: Imports & Setup

In [None]:
import os

import pandas as pd
import torch
import whisper

# --- Configuration ---
# "small" is much more accurate for YouTube/Medical terms than "base"
# "medium" is better but slower. "base" is fastest.
model_name = "small"
model = whisper.load_model(model_name)

# --- Path to your Video/Audio ---
audio_file_path = "/Do Arthritis Drugs Cause Cancer Find Out Here!.mp4"

# Defined before the branch so later cells that read `whisper_output` see an
# empty transcript (instead of raising NameError) when the file is missing.
whisper_output = []

if not os.path.exists(audio_file_path):
    print(f"❌ Error: File not found at '{audio_file_path}'. Check your upload.")
else:
    print(f"🚀 Transcribing with '{model_name}' model... (Word-level timestamps enabled)")

    # --- PRO TRANSCRIPTION ---
    # word_timestamps=True allows us to create "Hormozi style" captions later.
    # initial_prompt helps the AI with punctuation and specific terminology.
    # fp16 is only requested when a CUDA device is available.
    result = model.transcribe(
        audio_file_path,
        fp16=torch.cuda.is_available(),
        word_timestamps=True,
        initial_prompt="Arthritis, health, medical discussion, Diary of a CEO."
    )

    # whisper_output now contains segments AND individual word timings
    whisper_output = result["segments"]

    print("✅ Transcription complete.")

    # Optional: preview the first detected word to verify timestamps.
    # Audio without speech yields no segments/words, so look it up defensively
    # rather than indexing [0] directly.
    first_word = next(
        (word for seg in whisper_output for word in seg.get("words", [])),
        None,
    )
    if first_word is not None:
        print(f"Sample: '{first_word['word']}' starts at {first_word['start']}s")
    else:
        print("No words were detected in the audio.")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 139M/139M [00:05<00:00, 27.7MiB/s]


Transcribing '/Do Arthritis Drugs Cause Cancer Find Out Here!.mp4' using the 'base' model...
Transcription complete. Output saved to 'whisper_output' variable.


# 🔹 Cell 3: Format Transcript (Sections + Word-Level Timing)

In [None]:
import json
import re

def create_formatted_transcript(whisper_output, new_section_threshold=15):
    """
    Group Whisper segments into headed sections with word-level timing.

    A new section starts whenever the gap between the end of the previous
    segment and the start of the current one exceeds
    ``new_section_threshold`` seconds.

    Args:
        whisper_output: List of segment dicts. Each must provide 'start',
            'end' and 'text'; 'words' (a list of dicts with 'word', 'start',
            'end') is optional.
        new_section_threshold: Gap in seconds that triggers a new section.

    Returns:
        A list of ``{"heading": str, "parts": list}`` sections. Every part
        holds "timestamp" (M:SS display string), "start" and "end" (segment
        bounds in seconds, so exporters can build real cue times), "text",
        and "words" (upper-cased words with timings rounded to 2 decimals).
        Returns [] (after printing a notice) when the input is empty.
    """
    if not whisper_output:
        print("Whisper output is empty.")
        return []

    formatted_transcript = []
    current_section = {
        "heading": "Introduction",
        "parts": []
    }
    prev_end = 0

    for i, segment in enumerate(whisper_output):
        # Section break logic: a long pause closes the current section.
        if i > 0 and segment['start'] - prev_end > new_section_threshold:
            formatted_transcript.append(current_section)
            current_section = {
                "heading": f"Section {len(formatted_transcript) + 1}",
                "parts": []
            }

        # Human-readable M:SS timestamp for the reading copy.
        start_seconds = int(segment['start'])
        timestamp_str = f"{start_seconds // 60}:{start_seconds % 60:02d}"

        # Word-level chunks for word-by-word caption rendering.
        words_data = [
            {
                "word": w['word'].strip().upper(),
                "start": round(w['start'], 2),
                "end": round(w['end'], 2),
            }
            for w in segment.get('words') or []
        ]

        current_section["parts"].append({
            "timestamp": timestamp_str,
            # Numeric bounds are needed downstream for SRT cue timing.
            "start": round(segment['start'], 3),
            "end": round(segment['end'], 3),
            "text": segment['text'].strip(),
            "words": words_data
        })

        prev_end = segment['end']

    if current_section["parts"]:
        formatted_transcript.append(current_section)

    return formatted_transcript

# Build the structured transcript from the Whisper segments and persist it
# as JSON for the export cell.
formatted_data = create_formatted_transcript(whisper_output)

output_filename = 'pro_transcript.json'
# The file is always rewritten so a stale transcript from an earlier run can
# never be picked up by the export step.
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(formatted_data, f, indent=2)

if formatted_data:
    print(f"✅ Pro transcript with word-level timing saved to '{output_filename}'")
else:
    # Do not report success when there was nothing to transcribe.
    print(f"⚠️ No transcript segments available; wrote an empty list to '{output_filename}'")

---
Formatted Transcript Output:
[
  {
    "heading": "Introduction",
    "parts": [
      {
        "timestamp": "0:00",
        "text": "What anti-romantic drugs can increase the risk of cancer?"
      },
      {
        "timestamp": "0:05",
        "text": "And this is a common question that I get from my patients."
      },
      {
        "timestamp": "0:08",
        "text": "Many of you read on the internet,"
      },
      {
        "timestamp": "0:10",
        "text": "and the first thing that you find out on Google"
      },
      {
        "timestamp": "0:13",
        "text": "is the worst side effects of these drugs"
      },
      {
        "timestamp": "0:16",
        "text": "with cancer being very often on the top of the list."
      },
      {
        "timestamp": "0:21",
        "text": "But let me tell you what..."
      }
    ]
  }
]
---
Formatted transcript saved to 'formatted_transcript.json'


# **Cell 4: Export Word Document and SRT Captions**

In [None]:
from docx import Document
import json
from google.colab import files

def format_timestamp_srt(seconds):
    """Converts seconds to SRT timestamp format: HH:MM:SS,mmm

    The value is first rounded to a whole number of milliseconds and then
    split with integer arithmetic, so binary floating-point error cannot drop
    a millisecond (e.g. 1.001 -> "00:00:01,001"). Negative inputs are clamped
    to zero because SRT has no notation for negative times.

    Args:
        seconds: Offset from the start of the media, in seconds (int or float).

    Returns:
        The offset formatted as "HH:MM:SS,mmm".
    """
    total_millis = max(0, round(seconds * 1000))
    td_hours, remainder = divmod(total_millis, 3_600_000)
    td_mins, remainder = divmod(remainder, 60_000)
    td_secs, td_millis = divmod(remainder, 1000)
    return f"{td_hours:02d}:{td_mins:02d}:{td_secs:02d},{td_millis:03d}"

# Load the structured transcript JSON produced by the formatting cell.
output_filename = 'pro_transcript.json' # Using the improved JSON name from Cell 3
try:
    with open(output_filename, 'r') as f:
        formatted_data = json.load(f)
except FileNotFoundError:
    print("❌ Error: JSON file not found.")
    formatted_data = None


def _cue_bounds(part):
    """Return (start, end) in seconds for one transcript part.

    Uses the part's own "start"/"end" when present; otherwise falls back to
    the first and last entries of its "words" list. Yields (0, 0) when no
    timing information is available at all.
    """
    words = part.get("words") or []
    start = part.get("start")
    end = part.get("end")
    if start is None:
        start = words[0]["start"] if words else 0
    if end is None:
        end = words[-1]["end"] if words else start
    return start, end


if formatted_data:
    # --- 1. Create the Word Document (For Reading) ---
    doc = Document()
    doc.add_heading('Video Transcript & Content Breakdown', 0)

    # --- 2. Create the SRT File (For Video Editing) ---
    srt_content = []
    counter = 1

    for section in formatted_data:
        doc.add_heading(section.get("heading", "Section"), level=1)

        # A section without a "parts" key is treated as empty.
        for part in section.get("parts") or []:
            # Add to Word Doc
            doc.add_paragraph(f'[{part["timestamp"]}] {part["text"]}')

            # Add to SRT (one cue per segment for readability)
            start_seconds, end_seconds = _cue_bounds(part)
            start_srt = format_timestamp_srt(start_seconds)
            end_srt = format_timestamp_srt(end_seconds)

            # SubRip cues must be separated by a blank line, hence the
            # trailing "\n\n".
            srt_content.append(f"{counter}\n{start_srt} --> {end_srt}\n{part['text']}\n\n")
            counter += 1

    # Save files
    word_name = 'transcript_client_copy.docx'
    srt_name = 'video_captions_upload_me.srt'

    doc.save(word_name)
    with open(srt_name, 'w', encoding='utf-8') as f:
        f.writelines(srt_content)

    print(f"✅ Created: {word_name} AND {srt_name}")

    # Download both; a failed browser download is reported, not fatal.
    for file in [word_name, srt_name]:
        try:
            files.download(file)
        except Exception as exc:
            print(f"Manual download required for {file} ({exc})")

Word document saved as 'formatted_transcript.docx'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

'formatted_transcript.docx' downloaded successfully.
