# Modeling with Whisper:

### Import libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
import plotly.express as px
import plotly.graph_objects as go
import soundfile as sf
import sklearn
from sklearn.model_selection import train_test_split
from datasets import load_dataset 
from datasets import Dataset
from datasets import DatasetDict
from transformers import Seq2SeqTrainingArguments 
from transformers import Seq2SeqTrainer
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

### Load data

In [4]:
# These are just how those files are named in the loaded data, it has nothing to do with the actual training, validation and testing we know.
#train_df = pd.read_csv('cv-corpus-10.0-delta-2022-07-04/en/train.csv', on_bad_lines = 'skip', sep='\t')
#val_df = pd.read_csv('cv-corpus-10.0-delta-2022-07-04/en/validated.csv', on_bad_lines = 'skip', sep='\t')
#test_df = pd.read_csv('cv-corpus-10.0-delta-2022-07-04/en/test.csv', on_bad_lines = 'skip', sep='\t')

In [5]:
token = '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'
gs = load_dataset("speechcolab/gigaspeech", "xs", use_auth_token=token)

Found cached dataset gigaspeech (C:/Users/Pc/.cache/huggingface/datasets/speechcolab___gigaspeech/xs/0.0.0/0db31224ad43470c71b459deb2f2b40956b3a4edfde5fb313aaec69ec7b50d3c)


  0%|          | 0/3 [00:00<?, ?it/s]

### Preprocess data

In [6]:
# dropping the columns we don't need
#train_df = train_df[['path', 'sentence']]
#val_df = val_df[['path', 'sentence']]
#test_df = test_df[['path', 'sentence']]

# concatenating the dataframes so we can do the train_test_split
#df = pd.concat([train_df, val_df, test_df], ignore_index=True)
#df = df.drop_duplicates(subset=['path'])

# parsing sentences
#df['sentence'] = df['sentence'].str.lower()

#prefix = '/Users/alijanatiidr/Desktop/Columbia/Applied ML/Project/cv-corpus-10.0-delta-2022-07-04/en/clips/'

#df['path'] = prefix + df['path']

#df

In [7]:
gs_train = pd.DataFrame(gs['train'])
gs_test = pd.DataFrame(gs['test'])
gs_val = pd.DataFrame(gs['validation'])

gs = pd.concat([gs_train, gs_test, gs_val], ignore_index=True)

gs = gs[['audio', 'text']]

In [8]:
gs

Unnamed: 0,audio,text
0,{'path': 'C:\Users\Pc\.cache\huggingface\datas...,AS THEY'RE LEAVING <COMMA> CAN KASH PULL ZAHRA...
1,{'path': 'C:\Users\Pc\.cache\huggingface\datas...,SIX TOMATOES <PERIOD>
2,{'path': 'C:\Users\Pc\.cache\huggingface\datas...,AND SOMETHING BROUGHT BACK RESTORED FROM THE R...
3,{'path': 'C:\Users\Pc\.cache\huggingface\datas...,TO HELP SCREEN READER USERS IN THE MIDST OF DI...
4,{'path': 'C:\Users\Pc\.cache\huggingface\datas...,FOR ALICE HAD READ SEVERAL NICE LITTLE STORIES...
...,...,...
41753,{'path': 'C:\Users\Pc\.cache\huggingface\datas...,THEIR ORIGINAL IDEA IS NOT MEAL KITS <PERIOD> ...
41754,{'path': 'C:\Users\Pc\.cache\huggingface\datas...,<SIL>
41755,{'path': 'C:\Users\Pc\.cache\huggingface\datas...,ANGUILLA <PERIOD> WHAT MAKES ANGUILLA'S BEACHE...
41756,{'path': 'C:\Users\Pc\.cache\huggingface\datas...,HAS ANYBODY <COMMA> DOES ANYBODY HAVE IDEA WHY...


In [9]:
gs = gs.rename(columns={'audio': 'audio_signal', 'text': 'sentence'})
#we are only going to take the first 10000 data points since we don't have any GPU for our training
gs = gs[:10000]

In [10]:
# Getting fine tuning dataset of the form (path of mp3, sentence)
gs = gs.rename(columns={'audio_signal': 'path', 'sentence': 'text'})
gs['path'] = gs['path'].apply(lambda x: x['path'])

gs['text'] = gs['text'].str.lower()
gs['text'] = gs['text'].str.replace('<comma>', ',')
gs['text'] = gs['text'].str.replace('<period>', '.')
gs['text'] = gs['text'].str.replace('<questionmark>', '?')
gs['text'] = gs['text'].str.replace('<exclamationmark>', '!')

In [11]:
#df = df.rename(columns={'audio_signal': 'path', 'sentence': 'text'})

In [12]:
fine_tuning_df = gs

In [13]:
# path to audio signal using librosa
def read_audio_signal(row):
    path = row['path']
    try:
        audio_signal, sample_rate = sf.read(path)
        return audio_signal
    except Exception as e:
        # Handle any exceptions that might occur during file reading
        print(f"Error reading file '{path}': {e}")
        return None

fine_tuning_df['audio'] = fine_tuning_df.apply(read_audio_signal, axis=1)

fine_tuning_df['sampling_rate'] = 16000

fine_tuning_df = fine_tuning_df[['path', 'audio', 'sampling_rate' , 'text']]

In [14]:
fine_tuning_df

Unnamed: 0,path,audio,sampling_rate,text
0,C:\Users\Pc\.cache\huggingface\datasets\downlo...,"[0.000518798828125, 0.0008544921875, 0.0001220...",16000,"as they're leaving , can kash pull zahra aside..."
1,C:\Users\Pc\.cache\huggingface\datasets\downlo...,"[0.001434326171875, 0.001373291015625, 0.00131...",16000,six tomatoes .
2,C:\Users\Pc\.cache\huggingface\datasets\downlo...,"[-0.000457763671875, -0.000335693359375, -0.00...",16000,and something brought back restored from the r...
3,C:\Users\Pc\.cache\huggingface\datasets\downlo...,"[0.000213623046875, 0.0003662109375, 0.0005493...",16000,to help screen reader users in the midst of di...
4,C:\Users\Pc\.cache\huggingface\datasets\downlo...,"[0.006195068359375, 0.0052490234375, 0.0039672...",16000,for alice had read several nice little stories...
...,...,...,...,...
9995,C:\Users\Pc\.cache\huggingface\datasets\downlo...,"[-0.069061279296875, -0.041534423828125, -0.04...",16000,writer director eugene ashe combines romantic ...
9996,C:\Users\Pc\.cache\huggingface\datasets\downlo...,"[0.00299072265625, 0.0023193359375, 0.00128173...",16000,right .
9997,C:\Users\Pc\.cache\huggingface\datasets\downlo...,"[-0.001678466796875, -0.00091552734375, -0.000...",16000,"in fact , i made up my mind to find a career t..."
9998,C:\Users\Pc\.cache\huggingface\datasets\downlo...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16000,would've done something like that .


In [15]:
train_size = 0.8
val_size = 0.1
test_size = 0.1

fine_tuning_df_train = fine_tuning_df[:int(len(fine_tuning_df)*train_size)]
fine_tuning_df_val = fine_tuning_df[int(len(fine_tuning_df)*train_size):int(len(fine_tuning_df)*(train_size+val_size))]
fine_tuning_df_test = fine_tuning_df[int(len(fine_tuning_df)*(train_size+val_size)):]

fine_tuning_df_train = fine_tuning_df_train.reset_index(drop=True)
fine_tuning_df_val = fine_tuning_df_val.reset_index(drop=True)
fine_tuning_df_test = fine_tuning_df_test.reset_index(drop=True)

In [16]:
Data = DatasetDict({'train': Dataset.from_pandas(fine_tuning_df_train), 'validation': Dataset.from_pandas(fine_tuning_df_val), 'test': Dataset.from_pandas(fine_tuning_df_test)})

In [17]:
Data = Data.remove_columns(['path'])

In [18]:
Data

DatasetDict({
    train: Dataset({
        features: ['audio', 'sampling_rate', 'text'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['audio', 'sampling_rate', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['audio', 'sampling_rate', 'text'],
        num_rows: 1000
    })
})

# Performance Assessment without fine tuning

### Whisper tiny

In [19]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer


processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny.en")
tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny.en")

In [20]:
# import wer metric
from datasets import load_metric
wer_metric = load_metric("wer")

test_predictions = []



with torch.no_grad():
    for example in Data['test']:
        input_features = processor(example['audio'], sampling_rate=example['sampling_rate'], return_tensors="pt")
        generated_ids = model.generate(input_features.input_features)
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
        test_predictions.append(transcription[0])

In [21]:
test_text = []
for sentence in Data['test']['text']:
    test_text.append([sentence])

In [22]:
test_text = [item for sublist in test_text for item in sublist]

In [23]:
wer = wer_metric.compute(predictions=test_predictions, references=test_text)
print("WER: {}".format(round(wer, 2)))

WER: 0.45


In [24]:
cer_metric = load_metric("cer")

cer_whisper_tiny = cer_metric.compute(predictions=test_predictions, references=test_text)

print(f"CER: {cer_whisper_tiny}")

CER: 0.2038523909346201


In [25]:
# numbers of parameters in the model
model.num_parameters()

37760256

### Whisper Large v3 

In [26]:
# load whisper large v3 model
#processor_large_v3 = AutoProcessor.from_pretrained("openai/whisper-large-v3")
#model_large_v3 = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3")
#tokenizer_large_v3 = AutoTokenizer.from_pretrained("openai/whisper-large-v3")

In [27]:
# predictions on test set
#test_predictions_large_v3 = []

#with torch.no_grad():
    #for example in Data['test']:
        #input_features = processor_large_v3(example['audio'], sampling_rate=example['sampling_rate'], return_tensors="pt")
        #generated_ids = model_large_v3.generate(input_features.input_features)
        #transcription = processor_large_v3.batch_decode(generated_ids, skip_special_tokens=True)
        #test_predictions_large_v3.append(transcription)

In [28]:
#test_predictions_large_v3_list = [item for sublist in test_predictions_large_v3 for item in sublist]

In [29]:
# wer score
#wer_whisper_large_v3 = wer_metric.compute(predictions=test_predictions_large_v3_list, references=test_text)

# cer score
#cer_whisper_large_v3 = cer_metric.compute(predictions=test_predictions_large_v3_list, references=test_text)

#print(f"WER: {wer_whisper_large_v3}")

#print(f"CER: {cer_whisper_large_v3}")

In [30]:
# numbers of parameters in the model
#model_large_v3.num_parameters()

### Whisper medium 

In [31]:
# load whisper medium model
#processor_medium = AutoProcessor.from_pretrained("openai/whisper-medium")
#model_medium = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-medium")
#tokenizer_medium = AutoTokenizer.from_pretrained("openai/whisper-medium")

In [32]:
# predictions on test set
#test_predictions_medium = []

#with torch.no_grad():
    #for example in Data['test']:
        #input_features = processor_medium(example['audio'], sampling_rate=example['sampling_rate'], return_tensors="pt")
        #generated_ids = model_medium.generate(input_features.input_features)
        #transcription = processor_medium.batch_decode(generated_ids, skip_special_tokens=True)
        #test_predictions_medium.append(transcription[0])

In [33]:

# wer score
#wer_whisper_medium = wer_metric.compute(predictions=test_predictions_medium, references=test_text)

# cer score
#cer_whisper_medium = cer_metric.compute(predictions=test_predictions_medium, references=test_text)

In [34]:
#print(f"WER: {wer_whisper_medium}")

#print(f"CER: {cer_whisper_medium}")

In [35]:
# number of parameters in the model
#model_medium.num_parameters()

### Whisper base 

In [36]:
# load whisper base model
#processor_base = AutoProcessor.from_pretrained("openai/whisper-base")
#model_base = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-base")
#tokenizer_base = AutoTokenizer.from_pretrained("openai/whisper-base")

In [37]:
# predictions on test set
#test_predictions_base = []


#with torch.no_grad():
    #for example in Data['test']:
        #input_features = processor_base(example['audio'], sampling_rate=example['sampling_rate'], return_tensors="pt")
        #generated_ids = model_base.generate(input_features.input_features)
        #transcription = processor_base.batch_decode(generated_ids, skip_special_tokens=True)
        #test_predictions_base.append(transcription[0])

In [38]:


# wer score
#wer_whisper_base = wer_metric.compute(predictions=test_predictions_base, references=test_text)

# cer score
#cer_whisper_base = cer_metric.compute(predictions=test_predictions_base, references=test_text)

In [39]:
#print(f"WER: {wer_whisper_base}")

#print(f"CER: {cer_whisper_base}")

In [40]:
# number of parameters in the model
#model_base.num_parameters()

### Whisper small 

In [41]:
# load whisper-small.en model
#processor_small = AutoProcessor.from_pretrained("openai/whisper-small.en")
#model_small = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small.en")
#tokenizer_small = AutoTokenizer.from_pretrained("openai/whisper-small.en")

In [42]:
# predictions on test set
#test_predictions_small = []


#with torch.no_grad():
    #for example in Data['test']:
        #input_features = processor_small(example['audio'], sampling_rate=example['sampling_rate'], return_tensors="pt")
        #generated_ids = model_small.generate(input_features.input_features)
        #transcription = processor_small.batch_decode(generated_ids, skip_special_tokens=True)
        #test_predictions_small.append(transcription[0])

In [43]:
# wer score
#wer_whisper_small = wer_metric.compute(predictions=test_predictions_small, references=test_text)

# cer score
#cer_whisper_small = cer_metric.compute(predictions=test_predictions_small, references=test_text)

#print(f"WER: {wer_whisper_small}")

#print(f"CER: {cer_whisper_small}")

In [44]:
# number of parameters in the model
#model_small.num_parameters()

In [45]:
# plot wer and cer scores vs number of parameters
#fig = go.Figure()

#fig.add_trace(go.Line(x=[model.num_parameters(), model_large_v3.num_parameters(), model_medium.num_parameters(), model_base.num_parameters(), model_small.num_parameters()], y=[wer, wer_whisper_large_v3, wer_whisper_medium, wer_whisper_base, wer_whisper_small], mode='markers', name='WER'))
#fig.add_trace(go.Line(x=[model.num_parameters(), model_large_v3.num_parameters(), model_medium.num_parameters(), model_base.num_parameters(), model_small.num_parameters()], y=[cer_whisper_tiny, cer_whisper_large_v3, cer_whisper_medium, cer_whisper_base, cer_whisper_small], mode='markers', name='CER'))

#fig.update_layout(title='WER and CER vs Number of Parameters', xaxis_title='Number of Parameters', yaxis_title='WER and CER')

#fig.show()

### Performance assessment summary

In [46]:
#df = pd.DataFrame(columns=['Model', 'WER', 'CER', 'Number of parameters'])
#df = df.append({'Model': 'Whisper-Tiny', 'WER': wer, 'CER': cer_whisper_tiny, 'Number of parameters': model.num_parameters()}, ignore_index=True)
#df = df.append({'Model': 'Whisper-Large-V3', 'WER': wer_whisper_large_v3, 'CER': cer_whisper_large_v3, 'Number of parameters': model_large_v3.num_parameters()}, ignore_index=True)
#df = df.append({'Model': 'Whisper-Medium', 'WER': wer_whisper_medium, 'CER': cer_whisper_medium, 'Number of parameters': model_medium.num_parameters()}, ignore_index=True)
#df = df.append({'Model': 'Whisper-Base', 'WER': wer_whisper_base, 'CER': cer_whisper_base, 'Number of parameters': model_base.num_parameters()}, ignore_index=True)
#df = df.append({'Model': 'Whisper-Small', 'WER': wer_whisper_small, 'CER': cer_whisper_small, 'Number of parameters': model_small.num_parameters()}, ignore_index=True)

# order by number of parameters
#df = df.sort_values(by=['Number of parameters'])

# reset index
#df = df.reset_index(drop=True)

#df

# Fine tuning whisper tiny on the training set

In [47]:
class whisper(nn.Module):
    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny.en")
        self.processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny.en")
    
    def forward(self, input_features, decoder_input_ids):
        return self.model(input_features, decoder_input_ids=decoder_input_ids)
    
    def generate(self, input_features):
        return self.model.generate(input_features)


In [48]:
# get number of parameters
model = whisper()
print(sum(p.numel() for p in model.parameters() if p.requires_grad))

37760256


In [49]:
Data['train'][0]

{'audio': [0.000518798828125,
  0.0008544921875,
  0.0001220703125,
  -0.00048828125,
  -0.000335693359375,
  -0.001495361328125,
  -0.001861572265625,
  -0.00128173828125,
  -0.00140380859375,
  -0.001007080078125,
  -0.00140380859375,
  -0.001556396484375,
  -0.000518798828125,
  -0.00042724609375,
  3.0517578125e-05,
  0.000152587890625,
  0.000457763671875,
  -0.0013427734375,
  -0.00213623046875,
  -0.000701904296875,
  -0.001190185546875,
  -0.0008544921875,
  -0.001739501953125,
  -0.001953125,
  -0.001220703125,
  -0.000335693359375,
  0.00091552734375,
  0.0028076171875,
  0.004364013671875,
  0.003631591796875,
  0.003326416015625,
  0.00311279296875,
  0.001434326171875,
  0.000244140625,
  -0.00018310546875,
  -0.00054931640625,
  -0.00189208984375,
  -0.00274658203125,
  -0.0023193359375,
  -0.00238037109375,
  -0.001922607421875,
  -0.00115966796875,
  -0.000701904296875,
  0.0008544921875,
  0.001800537109375,
  0.001495361328125,
  0.001800537109375,
  0.001678466796875

In [50]:
model.processor(Data['train'][0]['audio'], sampling_rate=16000, return_tensors="pt").input_features

tensor([[[-0.4609,  0.0631, -0.1337,  ..., -0.7091, -0.7091, -0.7091],
         [-0.2867,  0.1746,  0.1899,  ..., -0.7091, -0.7091, -0.7091],
         [ 0.0259,  0.1283,  0.2454,  ..., -0.7091, -0.7091, -0.7091],
         ...,
         [-0.6831, -0.7091, -0.7091,  ..., -0.7091, -0.7091, -0.7091],
         [-0.7091, -0.7091, -0.7091,  ..., -0.7091, -0.7091, -0.7091],
         [-0.7091, -0.7091, -0.7091,  ..., -0.7091, -0.7091, -0.7091]]])

In [51]:
model(model.processor(Data['train'][0]['audio'], sampling_rate=16000, return_tensors="pt").input_features, decoder_input_ids=model.tokenizer(Data['train'][0]['text'], return_tensors="pt").input_ids)

Seq2SeqLMOutput(loss=None, logits=tensor([[[ 4.2794,  4.5108,  4.3167,  ...,  6.7437,  6.3536,  5.6533],
         [ 8.0787,  5.5243,  5.1981,  ...,  4.8603,  3.5680,  2.7399],
         [10.7076,  9.7527,  6.1483,  ...,  7.3184,  7.8902,  6.5994],
         ...,
         [27.5923, 24.7009, 18.3103,  ..., 15.5341, 15.6958, 13.9922],
         [23.9836, 22.3882, 17.9989,  ..., 13.9224, 14.3146, 12.7540],
         [14.5958, 14.3323, 13.9694,  ...,  8.6555,  8.7807,  7.5432]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[ 1.6553e-01,  2.1587e+00, -1.1847e-01,  ..., -1.1724e+00,
           -5.9878e-01, -7.0421e-02],
          [-1.3405e+00, -1.0674e+00, -2.3007e-01,  ..., -2.0404e+00,
           -1.1308e-01, -1.1258e+00],
          [-2.8784e-01, -4.7903e-01,  3.4219e-02,  ..., -1.0254e+00,
            1.3288e+00, -1.3271e+00],
          ...,
          [-3.0651e-01, -1.2189e+00, -8.8127e-02,  ..., -3.4772e-02,
            1.5490e-01, -8.6309e-01],
          [-3.6881e-03, 

In [53]:
def data_collator(batch):
    input_features = [model.processor(item['audio'], sampling_rate=16000, return_tensors="pt").input_features.squeeze() for item in batch]
    labels = [model.tokenizer(item['text'], return_tensors="pt").input_ids.squeeze() for item in batch]

    input_features = pad_sequence(input_features, batch_first=True)
    labels = pad_sequence(labels, batch_first=True)

    return {"input_features": input_features, "labels": labels}



In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
epochs = 10
batch_size = 32
criterion = nn.CrossEntropyLoss()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [None]:
trainloader = DataLoader(Data['train'], batch_size=batch_size, shuffle=True, collate_fn=data_collator)
valloader = DataLoader(Data['validation'], batch_size=batch_size, shuffle=True, collate_fn=data_collator)

In [57]:
train_loss = []
val_loss = []
for epoch in range(epochs):
    model.train()
    train_batch_loss = []
    for batch in trainloader:
        input_features = batch['input_features'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_features, labels)
        loss = criterion(outputs.logits.permute(0, 2, 1), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_batch_loss.append(loss.item())
    train_loss.append(np.mean(train_batch_loss))
    model.eval()
    val_batch_loss = []
    for batch in valloader:
        input_features = batch['input_features'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_features, labels)
        loss = criterion(outputs.logits.permute(0, 2, 1), labels)
        val_batch_loss.append(loss.item())
    val_loss.append(np.mean(val_batch_loss))
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss[-1]}, Val Loss: {val_loss[-1]}")

Epoch: 1, Train Loss: 0.9197832295149565, Val Loss: 0.09085851034615189
Epoch: 2, Train Loss: 0.016946754531003534, Val Loss: 0.10725139733403921
