<a href="https://colab.research.google.com/github/Jamie643/Whisperer/blob/main/notebooks/LibriSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🔹 CELL 1: Setup + Imports


In [6]:
# Install the latest OpenAI Whisper library and dependencies.
# The uninstall runs first so that any already-installed distribution named
# `whisper` is removed before OpenAI's package is installed from GitHub
# (presumably to avoid a clash with an unrelated package of that name - confirm).
!pip uninstall -y whisper
!pip install git+https://github.com/openai/whisper.git
# torchaudio is used for audio loading in later cells; jiwer is only referenced
# by the optional, commented-out WER calculation.
!pip install torchaudio jiwer

# Import dependencies
import os
import numpy as np

# In Colab, this avoids protobuf compatibility issues
try:
    import tensorflow
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio
from tqdm.notebook import tqdm

# Pick the compute device: CUDA when torch can use it, otherwise the CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Running on " + DEVICE.upper())



[0mCollecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-j99zgymr
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-j99zgymr
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-

# 🔹 CELL 1.5: Install Pydub

In [5]:
!apt-get install ffmpeg


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


# Upload File

In [3]:
# Prompt for a file upload in Colab. `uploaded` holds whatever files.upload()
# returns; the later upload cell reads its keys as the uploaded file names.
from google.colab import files
uploaded = files.upload()


Saving 01_A_Face_On_A_Billboard.mp3 to 01_A_Face_On_A_Billboard.mp3


# 🔹 CELL 2: Device Check + Whisper Setup

In [4]:
# Import necessary libraries
from pydub import AudioSegment
import torchaudio
import torch
import whisper
import os
import pandas as pd
from tqdm.notebook import tqdm

# Set device for running the model: "cuda" when torch reports CUDA as available,
# otherwise "cpu". Same expression as in the setup cell, repeated so that this
# cell defines DEVICE itself.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# MP3 -> WAV conversion helper (mono, 16 kHz)
def convert_mp3_to_wav(file_path, output_path):
    """Decode an MP3, resample it to 16 kHz mono, and write it out as WAV.

    Args:
        file_path: Path of the MP3 file to read.
        output_path: Destination path for the exported WAV file.
    """
    decoded = AudioSegment.from_mp3(file_path)
    resampled = decoded.set_frame_rate(16000)
    mono = resampled.set_channels(1)
    mono.export(output_path, format="wav")
    print(f"Audio file saved to {output_path}")

# Source MP3 (as uploaded) and the WAV file that the conversion step writes
file_path = "/content/01_A_Face_On_A_Billboard.mp3"  # Update to the path you uploaded to Colab
output_path = "/content/converted_audio.wav"
convert_mp3_to_wav(file_path=file_path, output_path=output_path)

# English-only "base" Whisper checkpoint
model = whisper.load_model("base.en")

# Load a 16 kHz audio file and turn it into the log-Mel input used for decoding
def prepare_audio_for_inference(audio_path):
    """Load an audio file and compute its Whisper log-Mel spectrogram.

    Args:
        audio_path: Path to a 16 kHz audio file readable by torchaudio.

    Returns:
        The result of ``whisper.log_mel_spectrogram`` on the audio after
        ``whisper.pad_or_trim``, placed on ``DEVICE``. Because of the
        pad/trim step only a fixed-length leading window of the file is kept.

    Raises:
        AssertionError: If the file's sample rate is not 16000 Hz.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    assert sample_rate == 16000, "Sample rate must be 16000 Hz"
    # torchaudio returns (channels, samples). Averaging over the channel axis
    # leaves mono input unchanged and down-mixes multi-channel input, whereas
    # flatten() would lay the channels end to end as one longer signal.
    mono = waveform.mean(dim=0)
    audio = whisper.pad_or_trim(mono).to(DEVICE)
    mel = whisper.log_mel_spectrogram(audio)
    return mel

# Transcribe the converted file.
# The low-level model.decode() path cannot produce what this cell displays:
# DecodingOptions has no `with_timestamps` field (the flag is
# `without_timestamps`), decode() handles a single padded/trimmed window, and
# the DecodingResult it returns has no "segments" entry. model.transcribe()
# processes the whole file and returns a dict with per-segment start/end times.
result = model.transcribe(output_path, language="en")

# Show results with timestamps
for segment in result["segments"]:
    print(f"[{segment['start']} - {segment['end']}] {segment['text']}")

# Store results in a DataFrame (one row per segment)
timestamps = [(segment['start'], segment['end'], segment['text']) for segment in result["segments"]]
df = pd.DataFrame(timestamps, columns=["Start Time (s)", "End Time (s)", "Transcription"])
df.head()


ModuleNotFoundError: No module named 'pydub'

# 🔹 CELL 3: Load the Whisper Model

In [None]:
model = whisper.load_model("base.en", device=DEVICE)

# Summarise the loaded checkpoint: language coverage and total parameter count
language_scope = "multilingual" if model.is_multilingual else "English-only"
parameter_count = sum(p.numel() for p in model.parameters())
print(f"Model is {language_scope} with {parameter_count:,} parameters.")


# 🔹 CELL 4: Point to Your MP3 Folder + Load

In [None]:
# Define the folder where your MP3 files are located
audio_folder = "/path/to/your/mp3/folder"  # Change this

# os.listdir() returns entries in arbitrary order, so sort for a reproducible
# processing order. The extension is compared case-insensitively so that files
# named e.g. "TRACK.MP3" are not silently skipped.
audio_paths = sorted(
    os.path.join(audio_folder, f)
    for f in os.listdir(audio_folder)
    if f.lower().endswith(".mp3")
)

print(f"Found {len(audio_paths)} MP3 files.")


# 🔹 CELL 5: Transcribe the Audio

In [None]:
# One record per transcribed segment, across every file in audio_paths
transcriptions = []

for path in tqdm(audio_paths):
    # transcribe() automatically handles long audio
    result = model.transcribe(path, language="en")
    segments = result["segments"]
    full_text = result["text"]

    base_name = os.path.basename(path)
    transcriptions.extend(
        {
            "filename": base_name,
            "start": seg["start"],
            "end": seg["end"],
            "text": seg["text"],
        }
        for seg in segments
    )

df = pd.DataFrame(transcriptions)
df.head()



# 🔹 CELL 6: Save to CSV

In [None]:
# EnglishTextNormalizer ships with Whisper but is not imported by the setup
# cells, so bring it into scope here; without this the cell raises NameError.
from whisper.normalizers import EnglishTextNormalizer

# Apply Whisper's English text normalizer to every segment's text
normalizer = EnglishTextNormalizer()

df["cleaned_text"] = df["text"].map(normalizer)

# Save the segments (raw and cleaned text) to CSV in the working directory
df.to_csv("transcriptions.csv", index=False)

# Optional WER calc if you have references:
# import jiwer
# df["reference_clean"] = ...
# wer = jiwer.wer(list(df["reference_clean"]), list(df["cleaned_text"]))
# print(f"WER: {wer * 100:.2f} %")


# 🔹 CELL 7: Display Audio Player + Transcription

In [None]:
import IPython.display as ipd

def play_audio_segment(filename, start_sec=0, end_sec=10):
    """Show an inline audio player for one time window of a file.

    Args:
        filename: File name, joined onto the module-level ``audio_folder``.
        start_sec: Window start, in seconds.
        end_sec: Window end, in seconds.
    """
    samples, rate = torchaudio.load(os.path.join(audio_folder, filename))
    # Convert the second-based bounds into sample indices along the last axis
    first, last = int(start_sec * rate), int(end_sec * rate)
    clip = samples[:, first:last].numpy()
    display(ipd.Audio(clip, rate=rate))

# Example: play the first row of df and print its text
sample_row = df.iloc[0]
play_audio_segment(
    filename=sample_row["filename"],
    start_sec=sample_row["start"],
    end_sec=sample_row["end"],
)
print(f"Transcript: {sample_row['text']}")



# 🔹 CELL 8: Upload MP3 Files in Colab

In [None]:
from google.colab import files

# Upload through Colab and use the uploaded names as the list of files to process
uploaded = files.upload()
audio_paths = [*uploaded.keys()]
print("Files uploaded:", audio_paths, sep="\n")
