# Installing Whisper

The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.

In [5]:
# installs

#! pip install git+https://github.com/openai/whisper.git
#! pip install jiwer
#! pip install --upgrade pip
#! brew install ffmpeg

# install in case tiktoken does not provide a pre-built wheel for this platform
#! pip install setuptools-rust

Collecting setuptools-rust
  Downloading setuptools_rust-1.12.0-py3-none-any.whl.metadata (9.6 kB)
Collecting semantic_version<3,>=2.8.2 (from setuptools-rust)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Downloading setuptools_rust-1.12.0-py3-none-any.whl (28 kB)
Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: semantic_version, setuptools-rust
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [setuptools-rust]
[1A[2KSuccessfully installed semantic_version-2.10.0 setuptools-rust-1.12.0


In [None]:
# base.en vs. medium.en

# turbo model
# to transcribe speech in audio files using the turbo model:
# whisper audio.flac audio.mp3 audio.wav --model turbo

# Loading the LibriSpeech dataset

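The original tutorial loads the test-clean split of the LibriSpeech corpus using torchaudio.
We won't be using that dataset here; instead, the cells below wrap our own audio files and their transcripts in a custom dataset class.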

In [2]:
import os
import numpy as np
import time
import jiwer

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm


# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [3]:
class ICUAudio(torch.utils.data.Dataset):
    """
    Map-style dataset pairing custom audio files with their reference transcripts.

    Indexing returns a ``(mel, text)`` tuple: the log-Mel spectrogram of one
    audio file (moved to ``device``) and the transcript stored at the same
    position in ``references``.
    """
    def __init__(self, audio_files, references, device=DEVICE):
        """
        Args:
            audio_files: sequence of audio file paths, one per utterance.
            references: sequence of reference transcripts, in the same order
                as ``audio_files``.
            device: device the spectrogram tensors are moved to.

        Raises:
            ValueError: if ``audio_files`` and ``references`` differ in length.
        """
        # Fail fast: a mismatch would otherwise only show up as an IndexError
        # part-way through iteration, or as silently ignored transcripts.
        if len(audio_files) != len(references):
            raise ValueError(
                f"audio_files and references must have the same length, "
                f"got {len(audio_files)} and {len(references)}"
            )
        self.audio_files = audio_files
        self.references = references
        self.device = device

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.audio_files)

    def __getitem__(self, item):
        """Load audio file ``item`` and return ``(mel, text)`` for it."""
        audio = whisper.load_audio(self.audio_files[item])
        # NOTE(review): pad_or_trim forces a fixed clip length (Whisper's
        # default window, presumably 30 s), so longer recordings are cut off
        # here -- confirm this is acceptable for the clips being used.
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(self.device)
        text = self.references[item]
        return (mel, text)

In [4]:
# Placeholder inputs: replace with the real audio clips (Step 6) and their
# reference transcripts (Step 1.5), keeping both lists in the same order.
audio_files = ["clip1.mp3", "clip2.mp3", "clip3.mp3"]
references  = ["first transcript", "second transcript", "third transcript"]

# Use ICUAudio, the dataset class defined in this notebook; the previously
# referenced CustomAudioDataset is not defined anywhere and raised NameError.
dataset = ICUAudio(audio_files, references)
loader = torch.utils.data.DataLoader(dataset, batch_size=1)

# Transcription portion: replace `audio.mp3` with the proper MP3 file

The following will take a few minutes to transcribe all utterances in the dataset.

In [5]:
def test_models(audio_files, references):
    models_to_test = ["base.en", "small.en", "medium.en", "large", "turbo"]
    results = []

    transformation = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.RemovePunctuation(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip()
    ])

    for model_name in models_to_test:
        print(f"Testing {model_name}...")
        model = whisper.load_model(model_name)

        hypotheses = []
        start_time = time.time()

        for audio_path in audio_files:
            result = model.transcribe(audio_path)
            hypotheses.append(result["text"])

        elapsed_time = time.time() - start_time

        error = jiwer.wer(references, hypotheses,
                          truth_transform=transformation,
                          hypothesis_transform=transformation)

        results.append({
            "model": model_name,
            "WER": f"{error:.2%}",
            "runtime_seconds": round(elapsed_time, 2)
        })

        print(f"  WER: {error:.2%} | Runtime: {elapsed_time:.2f}s")

    return pd.DataFrame(results)

# Benchmark every candidate model on the clips defined above.
results_df = test_models(audio_files, references)
# Bare last expression: Jupyter renders the DataFrame as a table instead of plain text.
results_df

Model is English-only and has 71,825,408 parameters.


In [None]:
# Load the checkpoint used by the decoding cells below; choose the model name
# based on the WER/runtime table produced by test_models.
model = whisper.load_model("base.en")
# Alternative: transcribe a single file end-to-end (replace audio.mp3 with a real file).
# result = model.transcribe("audio.mp3")
# print(result["text"])

In [6]:
# Decoding options shared by every model.decode call below:
# predict without timestamps for short-form transcription, with the language set to English.
options = whisper.DecodingOptions(language="en", without_timestamps=True)

In [7]:
# Decode every batch from the loader, collecting the model's text output
# alongside the reference transcripts yielded with it.
hypotheses, references = [], []

for mels, texts in tqdm(loader):
    results = model.decode(mels, options)
    hypotheses += [decoded.text for decoded in results]
    references += list(texts)

  0%|          | 0/164 [00:00<?, ?it/s]

In [8]:
# Side-by-side table of model output and reference text, one row per utterance.
data = pd.DataFrame({"hypothesis": hypotheses, "reference": references})
data

Unnamed: 0,hypothesis,reference
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...
1,"Stuffered into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND
4,Number 10. Fresh Nelly is waiting on you. Good...,NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...
...,...,...
2615,"Oh, to shoot my soul's full meaning into futur...",OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...
2616,"Then I, long tried by natural ills, received t...",THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...
2617,I love thee freely as men strive for right. I ...,I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...
2618,"I love thee with the passion put to use, in my...",I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...


# Calculating the word error rate

Now, we use our English normalizer implementation to standardize the transcription and calculate the WER.

In [9]:
# jiwer (already imported in the first import cell) provides the WER metric;
# EnglishTextNormalizer is the text normalizer shipped in whisper.normalizers.
import jiwer
from whisper.normalizers import EnglishTextNormalizer

# One shared normalizer instance, applied to both hypotheses and references below.
normalizer = EnglishTextNormalizer()

In [10]:
# Add normalised copies of both text columns so the WER compares like with like.
data["hypothesis_clean"] = list(map(normalizer, data["hypothesis"]))
data["reference_clean"] = list(map(normalizer, data["reference"]))
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...,he hoped there would be stew for dinner turnip...,he hoped there would be stew for dinner turnip...
1,"Stuffered into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM,stuffered into you his belly counseled him,stuff it into you his belly counseled him
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...,after early nightfall the yellow lamps would l...,after early nightfall the yellow lamps would l...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND,hello bertie any good in your mind,hello bertie any good in your mind
4,Number 10. Fresh Nelly is waiting on you. Good...,NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...,number 10 fresh nelly is waiting on you good n...,number 10 fresh nelly is waiting on you good n...
...,...,...,...,...
2615,"Oh, to shoot my soul's full meaning into futur...",OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...,0 to shoot my soul is full meaning into future...,0 to shoot my soul is full meaning into future...
2616,"Then I, long tried by natural ills, received t...",THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...,then i long tried by natural ills received the...,then i long tried by natural ills received the...
2617,I love thee freely as men strive for right. I ...,I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...,i love thee freely as men strive for right i l...,i love thee freely as men strive for right i l...
2618,"I love thee with the passion put to use, in my...",I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...,i love thee with the passion put to use in my ...,i love thee with the passion put to use in my ...


In [11]:
# Corpus-level word error rate over the normalised reference/hypothesis pairs.
reference_texts = data["reference_clean"].tolist()
hypothesis_texts = data["hypothesis_clean"].tolist()
wer = jiwer.wer(reference_texts, hypothesis_texts)

print(f"WER: {wer * 100:.2f} %")

WER: 4.26 %
