In [None]:
!pip install jiwer openai-whisper torchcodec
from transformers import WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor
from transformers import AutoFeatureExtractor, WhisperModel
from transformers import LogitsProcessorList, EpsilonLogitsWarper

from transformers import LlamaTokenizer
from datasets import load_dataset
import torch, torchaudio
from torch import nn
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from jiwer import wer as calculate_wer
import pickle
from datasets import Dataset, Audio, Value
import os, random
from typing import Optional
from whisper.normalizers import EnglishTextNormalizer
import math
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer
from pathlib import Path
import whisper
import copy, heapq
import pandas as pd

In [None]:
from google.colab import drive
from google.colab import userdata
userdata.get('HF_TOKEN')
drive.mount('/content/drive')

In [17]:
feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-small', language='en', task='transcribe')
tokenizer = WhisperTokenizer.from_pretrained('openai/whisper-small', language='en', task='transcribe')
processor = WhisperProcessor.from_pretrained('openai/whisper-small', language='en', task='transcribe')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
normalizer = EnglishTextNormalizer()


In [None]:
def train(model,train_dataset,val_dataset,tokenizer,feature_extractor,device,num_epochs=4,batch_size=1,learning_rate=1e-5,gradient_accumulation_steps=16):
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    best_wer = float("inf")
    best_state_dict = copy.deepcopy(model.state_dict())
    for epoch in range(num_epochs):
        model.train()
        random.shuffle(train_dataset)

        optimizer.zero_grad()
        for step, i in enumerate(range(0, len(train_dataset), batch_size)):
            batch = train_dataset[i:i + batch_size]

            input_features_batch = []
            for item in batch:
                audio_array = item["audio"]["array"]
                sampling_rate = item["audio"]["sampling_rate"]
                input_features_batch.append(feature_extractor(audio_array, sampling_rate=sampling_rate).input_features[0])

            padded_input_features_dicts = [{"input_features": feat_tensor} for feat_tensor in input_features_batch]
            feats_padded = feature_extractor.pad(padded_input_features_dicts, return_tensors="pt")["input_features"]

            labels_batch = []
            for item in batch:
                tokenized_labels = tokenizer(item["text"], add_special_tokens=True).input_ids
                labels_batch.append(torch.tensor(tokenized_labels, dtype=torch.long))

            labels_padded = pad_sequence(
                labels_batch,
                batch_first=True,
                padding_value=tokenizer.pad_token_id
            )
            labels_padded[labels_padded == tokenizer.pad_token_id] = -100

            feats_padded = feats_padded.to(device)
            labels_padded = labels_padded.to(device)

            outputs = model(input_features=feats_padded, labels=labels_padded)
            loss = outputs.loss
            loss = loss / gradient_accumulation_steps
            loss.backward()

            if step % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()
        val_wer = evaluate(model, val_dataset, tokenizer, device)
        print(f"Epoch {epoch + 1}/{num_epochs}, Validation WER: {val_wer:.4f}")

        if val_wer < best_wer:
            best_wer = val_wer
            best_state_dict = copy.deepcopy(model.state_dict())
            print(f"  -> New best model (WER={best_wer:.4f}), saving in memory")

    model.load_state_dict(best_state_dict)
    print(f"Training done. Best WER: {best_wer:.4f}")
    return model

def evaluate(model,dataset,tokenizer,device,batch_size=16):
    model.eval()
    wer_scores = []

    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i + batch_size]

        input_features_batch = []
        for sample in batch:
            audio_array = sample["audio"]["array"]
            sampling_rate = sample["audio"]["sampling_rate"]
            input_features_batch.append(feature_extractor(audio_array, sampling_rate=sampling_rate).input_features[0])

        padded_input_features_dicts = [{"input_features": feat_tensor} for feat_tensor in input_features_batch]
        feats_padded = feature_extractor.pad(padded_input_features_dicts, return_tensors="pt")["input_features"].to(device)

        with torch.no_grad():
            generated_ids = model.generate(feats_padded)

        transcriptions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        for transcription, sample in zip(transcriptions, batch):
            transcription = transcription.lower().strip()
            reference = sample["text"].lower().strip()
            wer_score = calculate_wer(reference, transcription)
            wer_scores.append(wer_score)

    return np.mean(wer_scores)


In [None]:
df_train = pd.read_csv('/content/drive/MyDrive/data/final_train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/data/final_test.csv')
df_accents = df_train['accents'].unique().tolist() + df_test['accents'].unique().tolist()
list_accents = list(set(df_accents))
list_accents = [list_accents[1]]
del df_train
del df_test
del df_accents

for accent in list_accents:
    test_dataset = torch.load(f'/content/drive/MyDrive/data/{accent}/test.pt')
    model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-small')
    model.to(device)
    print(f'zero-shot WER for {accent} = {evaluate(model, test_dataset, tokenizer, device)}')

### Real-Labels dataset

In [33]:
training = torch.load(f'/content/drive/MyDrive/data/train.pt', weights_only=False)
train_list = training
validation = torch.load(f'/content/drive/MyDrive/data/validation.pt', weights_only=False)
val_list = validation
model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-tiny')
model.to(device)
trained_model = train(model, train_list,val_list, tokenizer, feature_extractor, device)
torch.save(trained_model.state_dict(), f'/content/drive/MyDrive/data/trained_model.pth')

Epoch 1/4, Validation WER: 0.2626
  -> New best model (WER=0.2626), saving in memory
Epoch 2/4, Validation WER: 0.2676
Epoch 3/4, Validation WER: 0.4248
Epoch 4/4, Validation WER: 0.6084
Training done. Best WER: 0.2626
