In [1]:
!pip install jiwer openai-whisper torchcodec
from transformers import WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor
from transformers import AutoFeatureExtractor, WhisperModel
from transformers import LlamaTokenizer
from datasets import load_dataset
import torch, torchaudio
from torch import nn
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from jiwer import wer as calculate_wer
import pickle
from datasets import Dataset, Audio, Value
import os, random
from typing import Optional
from whisper.normalizers import EnglishTextNormalizer
import math
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer
from pathlib import Path
import whisper
import copy, heapq
import pandas as pd



In [2]:
from google.colab import drive
from google.colab import userdata
userdata.get('HF_TOKEN')
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-small', language='en', task='transcribe')
tokenizer = WhisperTokenizer.from_pretrained('openai/whisper-small', language='en', task='transcribe')
processor = WhisperProcessor.from_pretrained('openai/whisper-small', language='en', task='transcribe')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
normalizer = EnglishTextNormalizer()


In [4]:

def train(model, train_dataset, val_dataset, tokenizer, feature_extractor, device, num_epochs=2, batch_size=8, learning_rate=1e-5):
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    best_wer = float("inf")
    best_state_dict = copy.deepcopy(model.state_dict())

    for epoch in range(num_epochs):
        model.train()
        random.shuffle(train_dataset)

        for i in range(0, len(train_dataset), batch_size):
            batch = train_dataset[i:i + batch_size]

            feats_list = [item["input_features"].squeeze(0) for item in batch]
            inputs = [{"input_features": f} for f in feats_list]
            feats_padded = feature_extractor.pad(inputs, return_tensors="pt")["input_features"]

            labels_list = [item["labels"] for item in batch]
            labels_padded = pad_sequence(labels_list,batch_first=True,padding_value=tokenizer.pad_token_id)
            labels_padded[labels_padded == tokenizer.pad_token_id] = -100

            feats_padded = feats_padded.to(device)
            labels_padded = labels_padded.to(device)

            optimizer.zero_grad()
            outputs = model(input_features=feats_padded, labels=labels_padded)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        model.eval()
        val_wer = evaluate(model, val_dataset, tokenizer, device)
        print(f"Epoch {epoch + 1}/{num_epochs}, Validation WER: {val_wer:.4f}")

        if val_wer < best_wer:
            best_wer = val_wer
            best_state_dict = copy.deepcopy(model.state_dict())
            print(f"  -> New best model (WER={best_wer:.4f}), saving in memory")

    model.load_state_dict(best_state_dict)
    print(f"Training done. Best WER: {best_wer:.4f}")
    return model


def evaluate(model, dataset, tokenizer, device, batch_size=4):
    model.eval()
    wer_scores = []

    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i + batch_size]

        input_features_list = [sample["input_features"].squeeze(0) for sample in batch]
        inputs = [{"input_features": f} for f in input_features_list]
        feats_padded = feature_extractor.pad(inputs, return_tensors="pt")["input_features"].to(device)

        with torch.no_grad():
            generated_ids = model.generate(feats_padded)

        transcriptions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        for transcription, sample in zip(transcriptions, batch):
            transcription = transcription.lower().strip()
            reference = sample["text"].lower().strip()
            wer_score = calculate_wer(reference, transcription)
            wer_scores.append(wer_score)

    return np.mean(wer_scores)



In [5]:
df_train = pd.read_csv('/content/drive/MyDrive/data/final_train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/data/final_test.csv')
df_accents = df_train['accents'].unique().tolist() + df_test['accents'].unique().tolist()
list_accents = list(set(df_accents))
del df_train
del df_test
del df_accents

for accent in list_accents:
    test_dataset = torch.load(f'/content/drive/MyDrive/data/{accent}/test.pt')
    model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-small')
    model.to(device)
    print(f'zero-shot WER for {accent} = {evaluate(model, test_dataset, tokenizer, device)}')

Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


zero-shot WER for India and South Asia (India, Pakistan, Sri Lanka) = 0.18404442779442778
zero-shot WER for Singaporean English = 0.5047703685203685
zero-shot WER for Southern African (South Africa, Zimbabwe, Namibia) = 0.1972739482739483
zero-shot WER for Australian English = 0.15097341547341547


### Selft-trained dataset

In [6]:
for accent in list_accents:
    df_pseudo_training = torch.load(f'/content/drive/MyDrive/data/{accent}/df_pseudo_training.pt')
    train_list = df_pseudo_training

    test_dataset = torch.load(f'/content/drive/MyDrive/data/{accent}/test.pt')
    test_list = test_dataset
    model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-small')
    model.to(device)
    trained_model = train(model, train_list, test_list, tokenizer, feature_extractor, device, batch_size=4)
    print(f'Final WER after fine-tuning for {accent} = {evaluate(trained_model, test_dataset, tokenizer, device)}')
    torch.save(trained_model.state_dict(), f'/content/drive/MyDrive/data/{accent}/self-trained_model.pt')

Epoch 1/2, Validation WER: 0.1790
  -> New best model (WER=0.1790), saving in memory
Epoch 2/2, Validation WER: 0.1560
  -> New best model (WER=0.1560), saving in memory
Training done. Best WER: 0.1560
Final WER after fine-tuning for India and South Asia (India, Pakistan, Sri Lanka) = 0.15600235875235877
Epoch 1/2, Validation WER: 0.3677
  -> New best model (WER=0.3677), saving in memory
Epoch 2/2, Validation WER: 0.3936
Training done. Best WER: 0.3677
Final WER after fine-tuning for Singaporean English = 0.36765431790431796
Epoch 1/2, Validation WER: 0.1733
  -> New best model (WER=0.1733), saving in memory
Epoch 2/2, Validation WER: 0.1624
  -> New best model (WER=0.1624), saving in memory
Training done. Best WER: 0.1624
Final WER after fine-tuning for Southern African (South Africa, Zimbabwe, Namibia) = 0.16242465867465866
Epoch 1/2, Validation WER: 0.1412
  -> New best model (WER=0.1412), saving in memory
Epoch 2/2, Validation WER: 0.1342
  -> New best model (WER=0.1342), saving in