# Installing Whisper

The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.

# Install these dependencies from a terminal, inside a fresh Conda environment

**Use Python 3.9.19**

pip install git+https://github.com/openai/whisper.git

pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git

pip install setuptools-rust

pip install pandas

pip install torchaudio 

pip install --upgrade jupyter

pip install --upgrade --quiet jupyter_client ipywidgets

pip install notebook==6.1.5

jupyter nbextension enable --py widgetsnbextension

conda install -c conda-forge 'ffmpeg<7'

brew install wget

In [1]:
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /private/var/folders/5c/82jhhgtd42x9kx80ygmpcshm0000gn/T/pip-req-build-641ov0wf
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /private/var/folders/5c/82jhhgtd42x9kx80ygmpcshm0000gn/T/pip-req-build-641ov0wf
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting jiwer
  Using cached jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting click<9.0.0,>=8.1.3 (from jiwer)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Using cached rapidfuzz-3.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Using cached jiwer-3.0.4-py3-none-any.whl (21 kB)
Using cached click-8.1.

# Loading the LibriSpeech dataset

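The cells below first download and explore the CORAAL transcripts listed in `data/urls_coraal.txt`; the test-clean split of the LibriSpeech corpus is loaded with torchaudio further down, in the `LibriSpeech` dataset class.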

In [None]:
import os
import subprocess

# Text file listing one download URL per line, and the folder the files are saved into
url_file_path = '/Users/aliyadaire/final-project/final-project-dialect-dynamics/data/urls_coraal.txt'
destination_folder = '/Users/aliyadaire/final-project/final-project-dialect-dynamics/'

# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Read the URLs, dropping surrounding whitespace and skipping blank lines
with open(url_file_path, 'r') as file:
    urls = [line.strip() for line in file if line.strip()]

# Download each file using wget.
# The command is passed as an argument list (no shell), so a URL or folder
# containing spaces or shell metacharacters is handed to wget verbatim instead
# of being re-parsed by the shell. subprocess.run raises FileNotFoundError if
# wget itself is not installed.
failed_urls = []
for url in urls:
    completed = subprocess.run(['wget', '-P', destination_folder, url])
    if completed.returncode != 0:
        # Keep going with the remaining URLs, but remember what failed
        failed_urls.append(url)

# Surface failures instead of silently ignoring wget's exit status
if failed_urls:
    print(f'{len(failed_urls)} download(s) failed:')
    for url in failed_urls:
        print(f'  {url}')

In [None]:
import zipfile

# Archive to unpack and the folder its members are extracted into
archive_path = '/Users/aliyadaire/final-project/final-project-dialect-dynamics/data/txt.zip'
extract_root = '/Users/aliyadaire/final-project/final-project-dialect-dynamics/'

# Open the archive read-only and extract every member under extract_root
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(extract_root)

In [None]:
import os
import pandas as pd

# Folder holding the extracted transcript text files
directory = '/Users/aliyadaire/final-project/final-project-dialect-dynamics/data/txt'

# Read every file whose name starts with 'ROC'. The frames are collected and
# concatenated so the count below covers all ROC files, not only the one that
# happened to be read last.
roc_frames = [
    pd.read_table(os.path.join(directory, filename))
    for filename in sorted(os.listdir(directory))
    if filename.startswith('ROC')
]
if not roc_frames:
    # Fail with a clear message instead of a NameError further down
    raise FileNotFoundError(f'No ROC transcript files found in {directory}')
df = pd.concat(roc_frames, ignore_index=True)

# Rows whose 'Content' mentions a pause; na=False treats missing Content as
# "no match" so the boolean mask never contains NaN
pause = df[df['Content'].str.contains('pause', na=False)]
print(pause.count())


In [None]:
import os
import pandas as pd
import re

# Folder holding the extracted transcript text files
directory = '/Users/aliyadaire/final-project/final-project-dialect-dynamics/data/txt'

# Full paths of the directory entries, narrowed to those starting with the ROC prefix
candidate_paths = [os.path.join(directory, entry) for entry in os.listdir(directory)]
roc_paths = [
    path for path in candidate_paths
    if path.startswith('/Users/aliyadaire/final-project/final-project-dialect-dynamics/data/txt/ROC')
]

# One DataFrame per ROC file, then a single frame with a fresh 0..n-1 index
dataframes = [pd.read_table(path) for path in roc_paths]
combined_df = pd.concat(dataframes, ignore_index=True)

# Keep only the rows whose 'Content' does not contain 'pause'
# (missing Content counts as "does not contain")
is_pause = combined_df['Content'].str.contains('pause', na=False)
filtered_df = combined_df[~is_pause].copy()

# Function to normalize 'Content' values
def normalize_content(content):
    """Strip square brackets from a 'Content' value, then lowercase and trim it.

    Args:
        content: A value from the 'Content' column. Usually a string, but rows
            with missing Content arrive here as NaN because the pause filter
            keeps them (na=False).

    Returns:
        The normalized string. Non-string values (e.g. NaN or None) are
        returned unchanged rather than raising TypeError inside re.sub, so the
        NaN-aware pandas steps that follow can deal with them.
    """
    if not isinstance(content, str):
        return content
    # Remove '[' and ']' characters, then convert to lowercase and trim whitespace
    without_brackets = re.sub(r'[\[\]]', '', content)
    return without_brackets.lower().strip()

# Store the normalized text in its own column
filtered_df['NormalizedContent'] = filtered_df['Content'].apply(normalize_content)

# Drop rows whose normalized text contains an angle-bracket tag, i.e. a '<'
# followed later by a '>' (missing values count as "no tag")
has_angle_tag = filtered_df['NormalizedContent'].str.contains(r'<.*?>', na=False)
filtered_df = filtered_df[~has_angle_tag]

# Frequencies of the normalized values, keeping the ten most common
normalized_counts = filtered_df['NormalizedContent'].value_counts()
value_counts = normalized_counts.head(10)

# Show the result
print(value_counts)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the value counts computed above, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
value_counts.plot(kind='bar', ax=ax)
ax.set_title('Content Value Counts Excluding Pauses')
ax.set_xlabel('Content')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Pie chart of the same value counts; each slice is labelled with its
# percentage to one decimal place and the first slice starts at 140 degrees
fig, ax = plt.subplots(figsize=(10, 6))
value_counts.plot(kind='pie', autopct='%1.1f%%', startangle=140, ax=ax)
ax.set_title('Content Value Counts Excluding Pauses')
plt.show()

In [None]:
# Directory containing the text files
# NOTE(review): this is '/content/txt', whereas the earlier cells read from a
# '/Users/...' path - confirm which location is intended.
directory = '/content/txt'

# File-name prefixes to count, each starting at zero. Adding a key here is all
# that is needed to count another prefix.
state_counts = {'ATL': 0, 'DCA': 0, 'DCB': 0, 'DTA': 0, 'LES': 0, 'PRV': 0, 'VLD': 0}

# Count each directory entry under the first prefix its name starts with.
# Matching on the bare file name (instead of a hard-coded full path) keeps the
# counts correct if `directory` is changed.
for filename in os.listdir(directory):
    for state in state_counts:
        if filename.startswith(state):
            state_counts[state] += 1
            break  # a file is counted at most once

# Convert the state counts dictionary to a DataFrame
state_counts_df = pd.DataFrame(list(state_counts.items()), columns=['State', 'Count'])

# Display the DataFrame
print(state_counts_df)

In [None]:
# Bar chart with one blue bar per 'State' entry, its height being the file count
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(state_counts_df['State'], state_counts_df['Count'], color='blue')
ax.set_xlabel('State')
ax.set_ylabel('Number of Files')
ax.set_title('Number of Files per State')
plt.show()

In [2]:
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm

# Device used for the audio tensors: CUDA when available, otherwise the CPU.
# The value is a plain string ("cuda" or "cpu"). The earlier
# `torch.device('cpu')` assignment was always overwritten by this line, and the
# MPS check only existed as commented-out code, so MPS is never selected here.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
class LibriSpeech(torch.utils.data.Dataset):
    """
    Dataset wrapper over torchaudio's LIBRISPEECH that yields
    (log-Mel spectrogram, transcript) pairs.

    Every waveform is padded or trimmed to 30 seconds before the spectrogram is
    computed, so the last few seconds of a very small portion of the utterances
    are dropped.
    """

    def __init__(self, split="test-clean", device=DEVICE):
        # The corpus is downloaded (if needed) below the user's ~/.cache folder
        cache_root = os.path.expanduser("~/.cache")
        self.dataset = torchaudio.datasets.LIBRISPEECH(
            root=cache_root, url=split, download=True
        )
        # Device the padded/trimmed audio is moved to before feature extraction
        self.device = device

    def __len__(self):
        # Same number of items as the wrapped torchaudio dataset
        return len(self.dataset)

    def __getitem__(self, item):
        # Only the first three fields of the six-element record are used
        waveform, sample_rate, transcript, _, _, _ = self.dataset[item]
        assert sample_rate == 16000

        # Flatten, pad/trim, move to the target device, then compute the log-Mel features
        fixed_length = whisper.pad_or_trim(waveform.flatten())
        on_device = fixed_length.to(self.device)
        mel = whisper.log_mel_spectrogram(on_device)

        return (mel, transcript)

In [8]:
# Wrap the test-clean split
dataset = LibriSpeech("test-clean")

# Restrict the run to the first 40 samples (indices 0-39)
subset_indices = list(range(40))
subset_dataset = torch.utils.data.Subset(dataset=dataset, indices=subset_indices)

# Serve the subset in batches of 16
loader = torch.utils.data.DataLoader(dataset=subset_dataset, batch_size=16)

# Running inference on the dataset using a base Whisper model

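The following transcribes the utterances in the selected subset (the first 40 samples of test-clean) and may take a few minutes. The model loaded below is `tiny.en`; `base.en` is kept as a commented-out alternative.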

In [9]:
# Load the "tiny.en" checkpoint ("base.en" is the alternative that was tried here)
model = whisper.load_model("tiny.en")

# Summarise the model: language coverage and total number of parameters
language_kind = 'multilingual' if model.is_multilingual else 'English-only'
parameter_count = sum(np.prod(p.shape) for p in model.parameters())
print(f"Model is {language_kind} and has {parameter_count:,} parameters.")

Model is English-only and has 37,184,256 parameters.


In [10]:
# Decoding options for short-form transcription: English, no timestamps
options = whisper.DecodingOptions(
    language="en",
    without_timestamps=True,
)

In [11]:
# Model transcriptions and the matching reference texts, in loader order
hypotheses, references = [], []

for mel_batch, text_batch in tqdm(loader):
    # Decode the whole batch, then keep only the text of each result
    decoded_batch = model.decode(mel_batch, options)
    for decoded in decoded_batch:
        hypotheses.append(decoded.text)
    references.extend(text_batch)

  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# One row per utterance: the model's transcription next to the reference text
data = pd.DataFrame({"hypothesis": hypotheses, "reference": references})
data

Unnamed: 0,hypothesis,reference
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...
1,"Stuffered into you, his belly, counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM
2,"After early nightfall, the yellow lamps would ...",AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND
4,"Number 10, fresh Nelly is waiting on you. Good...",NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...
5,The music came nearer and he recalled the word...,THE MUSIC CAME NEARER AND HE RECALLED THE WORD...
6,The dull light fell more faintly upon the page...,THE DULL LIGHT FELL MORE FAINTLY UPON THE PAGE...
7,"a cold, lucid indifference rained in his soul.",A COLD LUCID INDIFFERENCE REIGNED IN HIS SOUL
8,The chaos in which his order extinguished itse...,THE CHAOS IN WHICH HIS ARDOUR EXTINGUISHED ITS...
9,"At most, by an arms given to a beggar whose bl...",AT MOST BY AN ALMS GIVEN TO A BEGGAR WHOSE BLE...


# Calculating the word error rate

Now, we use our English normalizer implementation to standardize the transcription and calculate the WER.

In [13]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

# Text normalizer from whisper.normalizers; the cells below call it on both the
# hypotheses and the references before computing the WER.
normalizer = EnglishTextNormalizer()

In [14]:
# Add a normalized copy of each text column (hypothesis first, then reference)
clean_to_raw = {"hypothesis_clean": "hypothesis", "reference_clean": "reference"}
for clean_column, raw_column in clean_to_raw.items():
    data[clean_column] = list(map(normalizer, data[raw_column]))
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...,he hoped there would be stew for dinner turnip...,he hoped there would be stew for dinner turnip...
1,"Stuffered into you, his belly, counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM,stuffered into you his belly counseled him,stuff it into you his belly counseled him
2,"After early nightfall, the yellow lamps would ...",AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...,after early nightfall the yellow lamps would l...,after early nightfall the yellow lamps would l...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND,hello bertie any good in your mind,hello bertie any good in your mind
4,"Number 10, fresh Nelly is waiting on you. Good...",NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...,number 10 fresh nelly is waiting on you good n...,number 10 fresh nelly is waiting on you good n...
5,The music came nearer and he recalled the word...,THE MUSIC CAME NEARER AND HE RECALLED THE WORD...,the music came nearer and he recalled the word...,the music came nearer and he recalled the word...
6,The dull light fell more faintly upon the page...,THE DULL LIGHT FELL MORE FAINTLY UPON THE PAGE...,the dull light fell more faintly upon the page...,the dull light fell more faintly upon the page...
7,"a cold, lucid indifference rained in his soul.",A COLD LUCID INDIFFERENCE REIGNED IN HIS SOUL,a cold lucid indifference rained in his soul,a cold lucid indifference reigned in his soul
8,The chaos in which his order extinguished itse...,THE CHAOS IN WHICH HIS ARDOUR EXTINGUISHED ITS...,the chaos in which his order extinguished itse...,the chaos in which his ardor extinguished itse...
9,"At most, by an arms given to a beggar whose bl...",AT MOST BY AN ALMS GIVEN TO A BEGGAR WHOSE BLE...,at most by an arms given to a beggar whose ble...,at most by an alms given to a beggar whose ble...


In [15]:
# Word error rate of the normalized hypotheses against the normalized references
reference_texts = data["reference_clean"].tolist()
hypothesis_texts = data["hypothesis_clean"].tolist()
wer = jiwer.wer(reference_texts, hypothesis_texts)

# Report as a percentage with two decimals
print(f"WER: {wer * 100:.2f} %")

WER: 2.28 %
