<a href="https://colab.research.google.com/github/Jamie643/Whisperer/blob/main/notebooks/LibriSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🔹 Cell 1: Install Dependencies


In [4]:
# Install the necessary libraries for Whisper transcription and audio processing
!pip install -q openai-whisper pydub
!pip install torchaudio jiwer
!apt-get install ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


# 🔹 Cell 2: Imports & Setup

In [7]:
import whisper
import os
import torchaudio
from pydub import AudioSegment
import pandas as pd

# --- Configuration ---
# Whisper checkpoint to use. "base" is a good balance of speed and accuracy;
# a smaller model such as "tiny" gives faster processing.
model_name = "base"
# The model is loaded right here, so on a fresh runtime this line is where the
# checkpoint download (see the progress bar in the cell output) happens.
model = whisper.load_model(model_name)

# --- Transcribe the Audio ---
# Path of the audio/video file to transcribe. Replace it with the path to your
# own file; you can upload files to Colab by clicking the folder icon on the left.
audio_file_path = "/Do Arthritis Drugs Cause Cancer Find Out Here!.mp4"

if not os.path.exists(audio_file_path):
    print(f"Error: Audio file not found at '{audio_file_path}'. Please upload your file or correct the path.")

    # Always (re)define the result. Without this, the formatting cell would
    # either fail with a NameError on a fresh kernel or silently reuse a stale
    # transcript left over from an earlier successful run.
    whisper_output = []
else:
    print(f"Transcribing '{audio_file_path}' using the '{model_name}' model...")

    # The result contains the transcription details, including segments.
    # fp16=False asks Whisper not to use half-precision inference.
    result = model.transcribe(audio_file_path, fp16=False)

    # Keep only the 'segments' list: it is the input expected by the
    # transcript formatter in the next cell.
    whisper_output = result["segments"]

    print("Transcription complete. Output saved to 'whisper_output' variable.")
    # For a quick preview of the raw output:
    # print(whisper_output)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 139M/139M [00:05<00:00, 27.7MiB/s]


Transcribing '/Do Arthritis Drugs Cause Cancer Find Out Here!.mp4' using the 'base' model...
Transcription complete. Output saved to 'whisper_output' variable.


# 🔹 Cell 3: Format Transcript & Save as JSON

In [8]:
import json
import re

def create_formatted_transcript(whisper_output, new_section_threshold=15):
    """
    Group Whisper segments into headed sections of timestamped text parts.

    A new section is started whenever the silence between the end of the
    previous segment and the start of the current one is strictly longer
    than ``new_section_threshold`` seconds. The first section is always
    headed "Introduction"; later ones are headed "Section 2", "Section 3", ...
    A pause before the very first segment never opens a new section.

    Args:
        whisper_output (list): Segment dictionaries from the Whisper model.
            Each dict must provide 'text' (str), 'start' and 'end'
            (numbers, in seconds).
        new_section_threshold (float): Minimum pause, in seconds, that must be
            exceeded to start a new section. Defaults to 15, the value that
            was previously hard-coded.

    Returns:
        list: One dict per section, shaped as
            ``{"heading": str, "parts": [{"timestamp": str, "text": str}]}``.
            Timestamps are rendered as "M:SS" from the segment start, with
            the fraction truncated; minutes are not wrapped into hours
            (e.g. 3723 s -> "62:03"). An empty or missing input prints a
            notice and returns ``[]``.
    """
    if not whisper_output:
        print("Whisper output is empty. No transcript to format.")
        return []

    formatted_transcript = []
    current_section = {
        "heading": "Introduction",  # Default heading for the opening section
        "parts": []
    }

    # End time of the previously processed segment, used to measure pauses.
    prev_end = 0

    for i, segment in enumerate(whisper_output):
        # A long enough pause closes the current section and opens a new one.
        # `i > 0` keeps leading silence from splitting off an empty section.
        if i > 0 and segment['start'] - prev_end > new_section_threshold:
            formatted_transcript.append(current_section)
            current_section = {
                # +1 because the section being opened is not appended yet.
                "heading": f"Section {len(formatted_transcript) + 1}",
                "parts": []
            }

        # Render the start time (truncated to whole seconds) as "M:SS".
        minutes, seconds = divmod(int(segment['start']), 60)

        current_section["parts"].append({
            "timestamp": f"{minutes}:{seconds:02d}",
            "text": segment['text'].strip()
        })

        prev_end = segment['end']

    # Flush the section that was still being filled when the loop ended.
    if current_section["parts"]:
        formatted_transcript.append(current_section)

    return formatted_transcript

# --- Format the Whisper segments and persist them as JSON ---
# `whisper_output` is produced by the transcription cell above.
formatted_data = create_formatted_transcript(whisper_output)

# Serialize once; the exact same text is written to disk and echoed below.
output_filename = 'formatted_transcript.json'
transcript_json = json.dumps(formatted_data, indent=2)
with open(output_filename, 'w') as f:
    f.write(transcript_json)

# Echo the transcript between separators, then confirm where it was saved.
print(
    "---",
    "Formatted Transcript Output:",
    transcript_json,
    "---",
    f"Formatted transcript saved to '{output_filename}'",
    sep="\n",
)

---
Formatted Transcript Output:
[
  {
    "heading": "Introduction",
    "parts": [
      {
        "timestamp": "0:00",
        "text": "What anti-romantic drugs can increase the risk of cancer?"
      },
      {
        "timestamp": "0:05",
        "text": "And this is a common question that I get from my patients."
      },
      {
        "timestamp": "0:08",
        "text": "Many of you read on the internet,"
      },
      {
        "timestamp": "0:10",
        "text": "and the first thing that you find out on Google"
      },
      {
        "timestamp": "0:13",
        "text": "is the worst side effects of these drugs"
      },
      {
        "timestamp": "0:16",
        "text": "with cancer being very often on the top of the list."
      },
      {
        "timestamp": "0:21",
        "text": "But let me tell you what..."
      }
    ]
  }
]
---
Formatted transcript saved to 'formatted_transcript.json'


# **Cell 4: Export Transcript to Word (.docx)**

In [9]:
# Install python-docx library if you haven't already
!pip install -q python-docx

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m253.0/253.0 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
from docx import Document
import json
from google.colab import files

# Read the structured transcript back from the JSON file written earlier.
output_filename = 'formatted_transcript.json'
try:
    with open(output_filename, 'r') as f:
        formatted_data = json.load(f)
except FileNotFoundError:
    # No JSON on disk: report it and mark the data as unavailable so the
    # export below is skipped.
    formatted_data = None
    print(f"Error: '{output_filename}' not found. Please ensure Cell 3 has been run successfully.")

if formatted_data:
    document = Document()

    # One level-1 heading per section, then one paragraph per timestamped part.
    for section in formatted_data:
        heading_text = section.get("heading", "Section")
        document.add_heading(heading_text, level=1)
        for part in section.get("parts", []):
            timestamp = part.get("timestamp", "")
            text = part.get("text", "")
            document.add_paragraph(f'{timestamp} {text}')

    # Write the .docx file.
    word_output_filename = 'formatted_transcript.docx'
    document.save(word_output_filename)

    print(f"Word document saved as '{word_output_filename}'")

    # Best-effort download via the Colab helper; on failure, point the user
    # at the file browser instead of aborting the cell.
    try:
        files.download(word_output_filename)
        print(f"'{word_output_filename}' downloaded successfully.")
    except Exception as e:
        print(f"Error downloading the file: {e}")
        print("You can manually download the file from the files sidebar on the left.")

Word document saved as 'formatted_transcript.docx'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

'formatted_transcript.docx' downloaded successfully.
