# Modeling with Whisper:

### Import libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
import plotly.express as px
import plotly.graph_objects as go
import soundfile as sf
import sklearn
from sklearn.model_selection import train_test_split
from datasets import load_dataset 
from datasets import Dataset
from datasets import DatasetDict
from transformers import Seq2SeqTrainingArguments 
from transformers import Seq2SeqTrainer
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

### Load data

In [3]:
# These are just how those files are named in the loaded data, it has nothing to do with the actual training, validation and testing we know.
train_df = pd.read_csv('cv-corpus-10.0-delta-2022-07-04/en/train.csv', on_bad_lines = 'skip', sep='\t')
val_df = pd.read_csv('cv-corpus-10.0-delta-2022-07-04/en/validated.csv', on_bad_lines = 'skip', sep='\t')
test_df = pd.read_csv('cv-corpus-10.0-delta-2022-07-04/en/test.csv', on_bad_lines = 'skip', sep='\t')

In [4]:
token = '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'
gs = load_dataset("speechcolab/gigaspeech", "xs", use_auth_token=token)

### Preprocess data

In [5]:
# dropping the columns we don't need
train_df = train_df[['path', 'sentence']]
val_df = val_df[['path', 'sentence']]
test_df = test_df[['path', 'sentence']]

# concatenating the dataframes so we can do the train_test_split
df = pd.concat([train_df, val_df, test_df], ignore_index=True)
df = df.drop_duplicates(subset=['path'])

# parsing sentences
df['sentence'] = df['sentence'].str.lower()

prefix = '/Users/alijanatiidr/Desktop/Columbia/Applied ML/Project/cv-corpus-10.0-delta-2022-07-04/en/clips/'

df['path'] = prefix + df['path']

df

Unnamed: 0,path,sentence
0,/Users/alijanatiidr/Desktop/Columbia/Applied M...,the following were the members for hastings.
1,/Users/alijanatiidr/Desktop/Columbia/Applied M...,in the afternoon the prussian troops withdrew ...
2,/Users/alijanatiidr/Desktop/Columbia/Applied M...,wang went to school in new mexico and nevada.
3,/Users/alijanatiidr/Desktop/Columbia/Applied M...,john and lydia swift.
4,/Users/alijanatiidr/Desktop/Columbia/Applied M...,she grew up in the township of pleasant point ...
...,...,...
11772,/Users/alijanatiidr/Desktop/Columbia/Applied M...,he had two daughters with her.
12285,/Users/alijanatiidr/Desktop/Columbia/Applied M...,walls and crowe later divorced.
12299,/Users/alijanatiidr/Desktop/Columbia/Applied M...,the land which is now the recreation ground wa...
12497,/Users/alijanatiidr/Desktop/Columbia/Applied M...,he later became the master in charge of cricke...


In [6]:
gs_train = pd.DataFrame(gs['train'])
gs_test = pd.DataFrame(gs['test'])
gs_val = pd.DataFrame(gs['validation'])

gs = pd.concat([gs_train, gs_test, gs_val], ignore_index=True)

gs = gs[['audio', 'text']]

In [7]:
gs = gs.rename(columns={'audio': 'audio_signal', 'text': 'sentence'})
#we are only going to take the first 2000 data points since we don't have any GPU for our training
gs = gs[:2000]

In [8]:
# Getting fine tuning dataset of the form (path of mp3, sentence)
gs = gs.rename(columns={'audio_signal': 'path', 'sentence': 'text'})
gs['path'] = gs['path'].apply(lambda x: x['path'])

gs['text'] = gs['text'].str.lower()
gs['text'] = gs['text'].str.replace('<comma>', ',')
gs['text'] = gs['text'].str.replace('<period>', '.')
gs['text'] = gs['text'].str.replace('<questionmark>', '?')
gs['text'] = gs['text'].str.replace('<exclamationmark>', '!')

In [9]:
df = df.rename(columns={'audio_signal': 'path', 'sentence': 'text'})

In [10]:
fine_tuning_df = pd.concat([gs, df], ignore_index=True)

In [11]:
# path to audio signal using librosa
def read_audio_signal(row):
    path = row['path']
    try:
        audio_signal, sample_rate = sf.read(path)
        return audio_signal
    except Exception as e:
        # Handle any exceptions that might occur during file reading
        print(f"Error reading file '{path}': {e}")
        return None

fine_tuning_df['audio'] = fine_tuning_df.apply(read_audio_signal, axis=1)

fine_tuning_df['sampling_rate'] = 16000

fine_tuning_df = fine_tuning_df[['path', 'audio', 'sampling_rate' , 'text']]

In [12]:
fine_tuning_df

Unnamed: 0,path,audio,sampling_rate,text
0,/Users/alijanatiidr/.cache/huggingface/dataset...,"[0.000518798828125, 0.0008544921875, 0.0001220...",16000,"as they're leaving , can kash pull zahra aside..."
1,/Users/alijanatiidr/.cache/huggingface/dataset...,"[0.001434326171875, 0.001373291015625, 0.00131...",16000,six tomatoes .
2,/Users/alijanatiidr/.cache/huggingface/dataset...,"[-0.000457763671875, -0.000335693359375, -0.00...",16000,and something brought back restored from the r...
3,/Users/alijanatiidr/.cache/huggingface/dataset...,"[0.000213623046875, 0.0003662109375, 0.0005493...",16000,to help screen reader users in the midst of di...
4,/Users/alijanatiidr/.cache/huggingface/dataset...,"[0.006195068359375, 0.0052490234375, 0.0039672...",16000,for alice had read several nice little stories...
...,...,...,...,...
11585,/Users/alijanatiidr/Desktop/Columbia/Applied M...,"[0.0, 1.7661124299128694e-12, 1.56589411576257...",16000,he had two daughters with her.
11586,/Users/alijanatiidr/Desktop/Columbia/Applied M...,"[0.0, -8.515981160268221e-15, -4.4422456571503...",16000,walls and crowe later divorced.
11587,/Users/alijanatiidr/Desktop/Columbia/Applied M...,"[0.0, 6.933249488104963e-16, -8.30595425879883...",16000,the land which is now the recreation ground wa...
11588,/Users/alijanatiidr/Desktop/Columbia/Applied M...,"[0.0, -2.5103812258120417e-13, -2.844536238499...",16000,he later became the master in charge of cricke...


In [13]:
train_size = 0.8
val_size = 0.1
test_size = 0.1

fine_tuning_df_train = fine_tuning_df[:int(len(fine_tuning_df)*train_size)]
fine_tuning_df_val = fine_tuning_df[int(len(fine_tuning_df)*train_size):int(len(fine_tuning_df)*(train_size+val_size))]
fine_tuning_df_test = fine_tuning_df[int(len(fine_tuning_df)*(train_size+val_size)):]

fine_tuning_df_train = fine_tuning_df_train.reset_index(drop=True)
fine_tuning_df_val = fine_tuning_df_val.reset_index(drop=True)
fine_tuning_df_test = fine_tuning_df_test.reset_index(drop=True)

In [14]:
Data = DatasetDict({'train': Dataset.from_pandas(fine_tuning_df_train), 'validation': Dataset.from_pandas(fine_tuning_df_val), 'test': Dataset.from_pandas(fine_tuning_df_test)})

In [15]:
Data = Data.remove_columns(['path'])

In [16]:
Data

DatasetDict({
    train: Dataset({
        features: ['audio', 'sampling_rate', 'text'],
        num_rows: 9272
    })
    validation: Dataset({
        features: ['audio', 'sampling_rate', 'text'],
        num_rows: 1159
    })
    test: Dataset({
        features: ['audio', 'sampling_rate', 'text'],
        num_rows: 1159
    })
})

# Performance Assessment without fine tuning

### Whisper tiny

In [17]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer


processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny.en")
tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny.en")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
# import wer metric
from datasets import load_metric
wer_metric = load_metric("wer")

test_predictions = []



with torch.no_grad():
    for example in Data['test']:
        input_features = processor(example['audio'], sampling_rate=example['sampling_rate'], return_tensors="pt")
        generated_ids = model.generate(input_features.input_features)
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
        test_predictions.append(transcription[0])

In [19]:
test_text = []
for sentence in Data['test']['text']:
    test_text.append([sentence])

In [20]:
test_text = [item for sublist in test_text for item in sublist]

In [21]:
wer = wer_metric.compute(predictions=test_predictions, references=test_text)
print("WER: {}".format(round(wer, 2)))

WER: 2.92


In [22]:
cer_metric = load_metric("cer")

cer_whisper_tiny = cer_metric.compute(predictions=test_predictions, references=test_text)

print(f"CER: {cer_whisper_tiny}")

CER: 2.2840200699407025


In [23]:
# numbers of parameters in the model
model.num_parameters()

37760256

### Whisper Large v3 

In [24]:
# load whisper large v3 model
processor_large_v3 = AutoProcessor.from_pretrained("openai/whisper-large-v3")
model_large_v3 = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3")
tokenizer_large_v3 = AutoTokenizer.from_pretrained("openai/whisper-large-v3")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
# predictions on test set
test_predictions_large_v3 = []

with torch.no_grad():
    for example in Data['test']:
        input_features = processor_large_v3(example['audio'], sampling_rate=example['sampling_rate'], return_tensors="pt")
        generated_ids = model_large_v3.generate(input_features.input_features)
        transcription = processor_large_v3.batch_decode(generated_ids, skip_special_tokens=True)
        test_predictions_large_v3.append(transcription)

In [35]:
test_predictions_large_v3_list = [item for sublist in test_predictions_large_v3 for item in sublist]

In [34]:
# wer score
wer_whisper_large_v3 = wer_metric.compute(predictions=test_predictions_large_v3_list, references=test_text)

# cer score
cer_whisper_large_v3 = cer_metric.compute(predictions=test_predictions_large_v3_list, references=test_text)

print(f"WER: {wer_whisper_large_v3}")

print(f"CER: {cer_whisper_large_v3}")

WER: 0.49178069611534575
CER: 0.2261213319142466


In [36]:
# numbers of parameters in the model
model_large_v3.num_parameters()

1543490560

### Whisper medium 

In [37]:
# load whisper medium model
processor_medium = AutoProcessor.from_pretrained("openai/whisper-medium")
model_medium = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-medium")
tokenizer_medium = AutoTokenizer.from_pretrained("openai/whisper-medium")

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
# predictions on test set
test_predictions_medium = []

with torch.no_grad():
    for example in Data['test']:
        input_features = processor_medium(example['audio'], sampling_rate=example['sampling_rate'], return_tensors="pt")
        generated_ids = model_medium.generate(input_features.input_features)
        transcription = processor_medium.batch_decode(generated_ids, skip_special_tokens=True)
        test_predictions_medium.append(transcription[0])

In [42]:

# wer score
wer_whisper_medium = wer_metric.compute(predictions=test_predictions_medium, references=test_text)

# cer score
cer_whisper_medium = cer_metric.compute(predictions=test_predictions_medium, references=test_text)

In [43]:
print(f"WER: {wer_whisper_medium}")

print(f"CER: {cer_whisper_medium}")

WER: 0.563779961428965
CER: 0.27809031473316104


In [44]:
# number of parameters in the model
model_medium.num_parameters()

763857920

### Whisper base 

In [45]:
# load whisper base model
processor_base = AutoProcessor.from_pretrained("openai/whisper-base")
model_base = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-base")
tokenizer_base = AutoTokenizer.from_pretrained("openai/whisper-base")

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [46]:
# predictions on test set
test_predictions_base = []


with torch.no_grad():
    for example in Data['test']:
        input_features = processor_base(example['audio'], sampling_rate=example['sampling_rate'], return_tensors="pt")
        generated_ids = model_base.generate(input_features.input_features)
        transcription = processor_base.batch_decode(generated_ids, skip_special_tokens=True)
        test_predictions_base.append(transcription[0])

In [47]:


# wer score
wer_whisper_base = wer_metric.compute(predictions=test_predictions_base, references=test_text)

# cer score
cer_whisper_base = cer_metric.compute(predictions=test_predictions_base, references=test_text)

In [49]:
print(f"WER: {wer_whisper_base}")

print(f"CER: {cer_whisper_base}")

WER: 2.270089080723666
CER: 1.618823171658811


In [48]:
# number of parameters in the model
model_base.num_parameters()

72593920

### Whisper small 

In [50]:
# load whisper-small.en model
processor_small = AutoProcessor.from_pretrained("openai/whisper-small.en")
model_small = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small.en")
tokenizer_small = AutoTokenizer.from_pretrained("openai/whisper-small.en")

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [51]:
# predictions on test set
test_predictions_small = []


with torch.no_grad():
    for example in Data['test']:
        input_features = processor_small(example['audio'], sampling_rate=example['sampling_rate'], return_tensors="pt")
        generated_ids = model_small.generate(input_features.input_features)
        transcription = processor_small.batch_decode(generated_ids, skip_special_tokens=True)
        test_predictions_small.append(transcription[0])

In [52]:
# wer score
wer_whisper_small = wer_metric.compute(predictions=test_predictions_small, references=test_text)

# cer score
cer_whisper_small = cer_metric.compute(predictions=test_predictions_small, references=test_text)

print(f"WER: {wer_whisper_small}")

print(f"CER: {cer_whisper_small}")

WER: 0.8979704288731748
CER: 0.5407784704272465


In [53]:
# number of parameters in the model
model_small.num_parameters()

241734144

In [60]:
# plot wer and cer scores vs number of parameters
fig = go.Figure()

fig.add_trace(go.Line(x=[model.num_parameters(), model_large_v3.num_parameters(), model_medium.num_parameters(), model_base.num_parameters(), model_small.num_parameters()], y=[wer, wer_whisper_large_v3, wer_whisper_medium, wer_whisper_base, wer_whisper_small], mode='markers', name='WER'))
fig.add_trace(go.Line(x=[model.num_parameters(), model_large_v3.num_parameters(), model_medium.num_parameters(), model_base.num_parameters(), model_small.num_parameters()], y=[cer_whisper_tiny, cer_whisper_large_v3, cer_whisper_medium, cer_whisper_base, cer_whisper_small], mode='markers', name='CER'))

fig.update_layout(title='WER and CER vs Number of Parameters', xaxis_title='Number of Parameters', yaxis_title='WER and CER')

fig.show()

### Performance assessment summary

In [None]:
df = pd.DataFrame(columns=['Model', 'WER', 'CER', 'Number of parameters'])
df = df.append({'Model': 'Whisper-Tiny', 'WER': wer, 'CER': cer_whisper_tiny, 'Number of parameters': model.num_parameters()}, ignore_index=True)
df = df.append({'Model': 'Whisper-Large-V3', 'WER': wer_whisper_large_v3, 'CER': cer_whisper_large_v3, 'Number of parameters': model_large_v3.num_parameters()}, ignore_index=True)
df = df.append({'Model': 'Whisper-Medium', 'WER': wer_whisper_medium, 'CER': cer_whisper_medium, 'Number of parameters': model_medium.num_parameters()}, ignore_index=True)
df = df.append({'Model': 'Whisper-Base', 'WER': wer_whisper_base, 'CER': cer_whisper_base, 'Number of parameters': model_base.num_parameters()}, ignore_index=True)
df = df.append({'Model': 'Whisper-Small', 'WER': wer_whisper_small, 'CER': cer_whisper_small, 'Number of parameters': model_small.num_parameters()}, ignore_index=True)

# order by number of parameters
df = df.sort_values(by=['Number of parameters'])

# reset index
df = df.reset_index(drop=True)

df

# Fine tuning whisper tiny on the training set

In [None]:
class whisper(nn.Module):
    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny.en")
        self.processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny.en")
    
    def forward(self, input_features, decoder_input_ids):
        return self.model(input_features, decoder_input_ids=decoder_input_ids)
    
    def generate(self, input_features):
        return self.model.generate(input_features)


    def process_audio(self, audio):
        return self.processor(audio, sampling_rate=16000, return_tensors="pt")
    
    def tokenize_text(self, text):
        return self.tokenizer(text, return_tensors="pt")


In [None]:
# get number of parameters
model = whisper()
print(sum(p.numel() for p in model.parameters() if p.requires_grad))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


37184256


In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
epochs = 10
batch_size = 8
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

trainloader = torch.utils.data.DataLoader(Data['train'], batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valloader = torch.utils.data.DataLoader(Data['validation'], batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

train_loss = []
val_loss = []


for epoch in range(epochs):
    print(f"Epoch {epoch+1} of {epochs}")
    model.train()
    batch_train_loss = []
    for batch in trainloader:
        input_features = batch[0]
        input_features = model.process_audio(input_features)
        labels = batch[1]
        labels = model.tokenize_text(labels)
        outputs = model(input_features, decoder_input_ids=labels)
        loss = outputs.loss
        batch_train_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    train_loss.append(sum(batch_train_loss)/len(batch_train_loss))
    print(f"Training loss: {train_loss[-1]}")
    model.eval()
    batch_val_loss = []
    for batch in valloader:
        input_features = batch[0]
        labels = batch[1]
        with torch.no_grad():
            outputs = model(input_features, decoder_input_ids=labels)
            loss = outputs.loss
            batch_val_loss.append(loss.item())
    val_loss.append(sum(batch_val_loss)/len(batch_val_loss))
    print(f"Validation loss: {val_loss[-1]}")