<a href="https://colab.research.google.com/github/Jamie643/Whisperer/blob/main/notebooks/LibriSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🔹 Cell 1: Install Dependencies


In [None]:
!pip uninstall -y whisper
!pip install git+https://github.com/openai/whisper.git
!pip install torchaudio jiwer pydub
!apt-get install ffmpeg

[0mCollecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-49hhru3k
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-49hhru3k
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-

# 🔹 Cell 2: Imports & Setup

In [None]:
import os
import numpy as np
import torch
import pandas as pd
import whisper
import torchaudio
from pydub import AudioSegment
from tqdm.notebook import tqdm

# Fix for protobuf issue on Colab
# The import is best-effort: if tensorflow is not installed, carry on without it.
try:
    import tensorflow
except ImportError:
    pass

# Device setup: use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Running on {DEVICE.upper()}")

Running on CUDA


# 🔹 Cell 3: Upload Audio File

In [None]:
# Ask the user to upload a file through Colab; the cell output shows the name
# it was saved under in the runtime.
from google.colab import files
uploaded = files.upload()  # return value is kept but not read again in the cells shown here

Saving 01_A_Face_On_A_Billboard.mp3 to 01_A_Face_On_A_Billboard.mp3


# 🔹 Cell 4: Convert MP3 to WAV

In [None]:
def convert_mp3_to_wav(file_path, output_path):
    """Convert an MP3 file to a 16 kHz, single-channel WAV file.

    Args:
        file_path: Path of the MP3 file to read.
        output_path: Path the WAV file is written to.

    Prints a confirmation line containing ``output_path`` once the export is done.
    """
    source = AudioSegment.from_mp3(file_path)
    # Resample to 16000 Hz first, then collapse to one channel.
    resampled = source.set_frame_rate(16000)
    mono = resampled.set_channels(1)
    mono.export(output_path, format="wav")
    print(f"Audio file saved to {output_path}")

# Input MP3 and output WAV locations; `output_path` is read again by later cells.
file_path = "/content/01_A_Face_On_A_Billboard.mp3"  # Change name if needed
output_path = "/content/converted_audio.wav"
convert_mp3_to_wav(file_path=file_path, output_path=output_path)


Audio file saved to /content/converted_audio.wav


# 🔹 Cell 5: Load Model & Prepare Audio

In [None]:
# Load the "base.en" Whisper checkpoint (the cell output shows it being downloaded).
model = whisper.load_model(name="base.en")

def prepare_audio_for_inference(audio_path):
    """Load a 16 kHz audio file and convert it to a Whisper log-Mel spectrogram.

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        The output of ``whisper.log_mel_spectrogram`` for the padded/trimmed
        audio, which has been moved to ``DEVICE`` beforehand.

    Raises:
        AssertionError: If the file's sample rate is not 16000 Hz.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    assert sample_rate == 16000, "Sample rate must be 16000 Hz"
    # torchaudio returns the waveform as (channels, frames) by default.
    # Average over the channel axis to get a mono signal: flattening would
    # place the channels one after another for multi-channel files. For a
    # single-channel file the mean is just that channel, so mono input is
    # unaffected.
    mono = waveform.mean(dim=0)
    audio = whisper.pad_or_trim(mono).to(DEVICE)
    mel = whisper.log_mel_spectrogram(audio)
    return mel

# Compute the log-Mel spectrogram of the converted WAV file.
mel = prepare_audio_for_inference(audio_path=output_path)

100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 58.6MiB/s]


# 🔹 Cell 6: Run Inference & Display Transcription

In [None]:
# Transcribe the converted WAV file; the result carries per-segment timestamps
result = model.transcribe(output_path, language="en", verbose=True)

# Walk the segments once: print each with its timestamps and collect a row for it
timestamps = []
for segment in result["segments"]:
    start, end, text = segment["start"], segment["end"], segment["text"]
    print(f"[{start} - {end}] {text}")
    timestamps.append((start, end, text))

# Tabulate the collected rows and preview the first few
df = pd.DataFrame(
    data=timestamps,
    columns=["Start Time (s)", "End Time (s)", "Transcription"],
)
df.head()

[00:00.000 --> 00:23.400]  It's a cold and cloudy winter afternoon in San Luis Obispo, and I'm retracing missing
[00:23.400 --> 00:27.280]  Cal Poly student Kristin Smart's last known steps.
[00:27.280 --> 00:39.880]  In May of 1996 from a house just off campus, to the entrance of her red brick dorm building.
[00:39.880 --> 00:45.280]  On perimeter road, seagulls are screaming overhead as I pass a huge, modern looking
[00:45.280 --> 00:50.400]  rec center with students on the second story looking down at me through Florida ceiling
[00:50.400 --> 00:54.960]  glass windows while they run on treadmills.
[00:54.960 --> 01:04.160]  Other students walk by me having loud conversations on their cell phones.
[01:04.160 --> 01:09.920]  Maybe I'm just self-conscious but it feels like some of them are looking at me suspiciously.
[01:09.920 --> 01:14.520]  I'm dressed pretty modestly and wearing a hat and a backpack, so I'm not sure what
[01:14.520 --> 01:19.520]  exactly they're picking up on, but

Unnamed: 0,Start Time (s),End Time (s),Transcription
0,0.0,23.4,It's a cold and cloudy winter afternoon in Sa...
1,23.4,27.28,Cal Poly student Kristin Smart's last known s...
2,27.28,39.88,"In May of 1996 from a house just off campus, ..."
3,39.88,45.28,"On perimeter road, seagulls are screaming ove..."
4,45.28,50.4,rec center with students on the second story ...


# 🔹 Cell 7: Save Transcriptions as DataFrame

In [None]:
# One (start, end, text) tuple per segment; `timestamps` is reused by the TXT export cell
timestamps = [
    tuple(seg[key] for key in ("start", "end", "text"))
    for seg in result["segments"]
]
df = pd.DataFrame(
    data=timestamps,
    columns=["Start Time (s)", "End Time (s)", "Transcription"],
)
df.head()

Unnamed: 0,Start Time (s),End Time (s),Transcription
0,0.0,23.4,It's a cold and cloudy winter afternoon in Sa...
1,23.4,27.28,Cal Poly student Kristin Smart's last known s...
2,27.28,39.88,"In May of 1996 from a house just off campus, ..."
3,39.88,45.28,"On perimeter road, seagulls are screaming ove..."
4,45.28,50.4,rec center with students on the second story ...


# 🔹 Cell 8: Export as TXT format

In [None]:
# Export transcription to TXT file
txt_path = "/content/transcription.txt"

# One "[start - end] text" line per segment, timestamps formatted to two decimals
with open(txt_path, "w", encoding="utf-8") as f:
    f.writelines(
        f"[{start:.2f} - {end:.2f}] {text}\n"
        for start, end, text in timestamps
    )

print(f"Transcription saved to {txt_path}")


Transcription saved to /content/transcription.txt


# 🔹 Cell 9: Download to Local Device

In [None]:
# Save the transcription text to a txt file
transcription_text = "\n".join(
    f"[{segment['start']} - {segment['end']}] {segment['text']}"
    for segment in result["segments"]
)

# Write the transcription to a .txt file.
# The encoding is set explicitly so non-ASCII characters in the transcript are
# written as UTF-8 regardless of the runtime's default encoding.
# NOTE: this is the same path the TXT export cell writes to, so that file's
# contents are replaced here.
file_name = "/content/transcription.txt"
with open(file_name, "w", encoding="utf-8") as file:
    file.write(transcription_text)

# Provide a download link for the user
from google.colab import files
files.download(file_name)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Offer the saved transcript for download once more.
from google.colab import files
files.download("/content/transcription.txt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>