In [9]:
from transformers import WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor
from transformers import AutoFeatureExtractor, WhisperModel
from transformers import LlamaTokenizer
from datasets import load_dataset
import torch, torchaudio
from torch import nn
import numpy as np
from jiwer import wer as calculate_wer
import pickle
from datasets import Dataset, Audio, Value
import os, random
from typing import Optional
from whisper.normalizers import EnglishTextNormalizer
import math
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer
from pathlib import Path
import whisper
import copy, heapq
import pandas as pd

In [19]:
# Checkpoint and language/task hints shared by every Whisper pre-/post-processing
# component created in this cell.
WHISPER_CHECKPOINT = 'openai/whisper-small'
WHISPER_PRETRAINED_KWARGS = dict(language='en', task='transcribe')

feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT, **WHISPER_PRETRAINED_KWARGS)
tokenizer = WhisperTokenizer.from_pretrained(WHISPER_CHECKPOINT, **WHISPER_PRETRAINED_KWARGS)
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT, **WHISPER_PRETRAINED_KWARGS)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Text normalizer imported from `whisper.normalizers`.
normalizer = EnglishTextNormalizer()


In [None]:
def data_preparation(
    csv_path,
    file_list_path,
    feature_extractor,
    tokenizer=None,
    audio_dir="data/test_data/",
    target_sr=16000,
):
    """Load audio clips and their transcripts into a list of ready-to-use samples.

    Args:
        csv_path: CSV file with at least a ``path`` and a ``sentence`` column.
        file_list_path: Text file with one audio path per line; blank lines are
            ignored. Each path is looked up in the CSV's ``path`` column and
            resolved against ``audio_dir`` on disk.
        feature_extractor: Callable invoked as
            ``feature_extractor(waveform, sampling_rate=..., return_tensors="pt")``
            whose result exposes ``["input_features"]``.
        tokenizer: Optional callable used to turn the transcript into label ids
            (``["input_ids"]``). When ``None`` no ``"labels"`` key is produced.
        audio_dir: Directory the listed audio paths are relative to.
        target_sr: Sample rate (Hz) every clip is resampled to before feature
            extraction.

    Returns:
        A list of dicts with keys ``"input_features"`` and ``"text"`` (the
        lower-cased, stripped transcript), plus ``"labels"`` when a tokenizer is
        given. Files without a transcript row, or whose transcript is missing
        (NaN), are skipped with a ``[WARN]`` message.
    """
    df_text = pd.read_csv(csv_path)

    # Build the path -> sentence lookup once instead of re-filtering the whole
    # frame for every file. `keep="first"` mirrors taking the first matching row.
    transcripts = (
        df_text.drop_duplicates(subset="path", keep="first")
        .set_index("path")["sentence"]
        .to_dict()
    )

    with open(file_list_path, "r") as f:
        file_paths = [line.strip() for line in f if line.strip()]

    dataset = []

    for file_path in file_paths:
        if file_path not in transcripts:
            print(f"[WARN] No transcript found for: {file_path}")
            continue

        sentence = transcripts[file_path]
        # A missing CSV cell would otherwise become the literal reference "nan".
        if pd.isna(sentence):
            print(f"[WARN] Empty transcript for: {file_path}")
            continue
        text = str(sentence).lower().strip()

        # load audio
        audio, sr = torchaudio.load(os.path.join(audio_dir, file_path))

        # Down-mix multi-channel clips to mono; `squeeze(0)` below only removes
        # the channel axis when there is exactly one channel.
        if audio.dim() > 1 and audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)

        if sr != target_sr:
            audio = torchaudio.functional.resample(audio, sr, target_sr)

        # precompute features
        feats = feature_extractor(
            audio.squeeze(0).numpy(),
            sampling_rate=target_sr,
            return_tensors="pt"
        )["input_features"][0]

        item = {
            "input_features": feats,
            "text": text
        }

        if tokenizer is not None:
            labels = tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                truncation=True
            )["input_ids"][0]
            item["labels"] = labels

        dataset.append(item)

    return dataset

def evaluate(
    model,
    dataset,
    tokenizer,
    device,
    normalizer=None,
    generate_kwargs: Optional[dict] = None,
):
    """Transcribe every sample with ``model`` and return the mean per-sample WER.

    Args:
        model: Object exposing ``eval()`` and ``generate(input_features, ...)``.
            It is NOT moved to ``device`` here; the caller must place it there.
        dataset: Iterable of dicts with an ``"input_features"`` tensor and a
            ``"text"`` reference transcript.
        tokenizer: Object exposing ``batch_decode(ids, skip_special_tokens=True)``.
        device: Device the input features are moved to before generation.
        normalizer: Optional callable ``str -> str`` applied to both hypothesis
            and reference (after lower-casing/stripping) before scoring, e.g. to
            remove punctuation differences. ``None`` keeps the raw comparison.
        generate_kwargs: Optional extra keyword arguments forwarded to
            ``model.generate``.

    Returns:
        The arithmetic mean of the per-sample WER values (not a corpus-level
        WER). Samples whose reference is empty are skipped with a ``[WARN]``
        message; if nothing is scored, ``nan`` is returned.
    """
    model.eval()
    extra_generate_args = dict(generate_kwargs or {})
    wer_scores = []

    for sample in dataset:
        reference = sample["text"].lower().strip()
        if normalizer is not None:
            reference = normalizer(reference)

        # WER is undefined for an empty reference; skip before paying for generation.
        if not reference:
            print("[WARN] Skipping sample with empty reference transcript")
            continue

        input_features = sample["input_features"].unsqueeze(0).to(device)

        with torch.no_grad():
            generated_ids = model.generate(input_features, **extra_generate_args)

        transcription = tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0].lower().strip()
        if normalizer is not None:
            transcription = normalizer(transcription)

        wer_scores.append(calculate_wer(reference, transcription))

    # Avoid numpy's "mean of empty slice" RuntimeWarning; the value is still nan.
    if not wer_scores:
        return float("nan")
    return np.mean(wer_scores)




In [None]:
# Transcript CSV and newline-separated list of audio files to evaluate.
test_data_csv = 'data/final_test.csv'
test_audio_list = 'data/test_files.txt'

# Pre-compute input features (and label ids) for every listed clip.
test_dataset = data_preparation(test_data_csv, test_audio_list, feature_extractor, tokenizer)
hf_dataset = Dataset.from_list(test_dataset)
# Persist the prepared samples so later runs can reload them from disk.
torch.save(test_dataset, 'data/test.pt')

# Put the model on the same `device` that `evaluate` moves the inputs to,
# instead of rebinding the notebook-wide `device` to 'cpu' (which silently
# disabled the GPU for this cell and every cell after it).
model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-small').to(device)
model.eval()
print(f'zero-shot WER = {evaluate(model, test_dataset, tokenizer, device)}')

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


zero-shot = 0.26164054001554
