# Benchmarking Automatic Speech Recognition Models

## Kaggle Specific

In [None]:
!pip install --quiet --upgrade transformers datasets accelerate timm datasets[audio]

In [None]:
!pip install --quiet evaluate jiwer mistral-common bitsandbytes

In [None]:
# Set up Hugging Face authentication
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the access token from the Kaggle secret named "HF_TOKEN" so that it is
# never hard-coded in the notebook.
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

# Log this session in to the Hugging Face Hub with that token.
login(token=HF_TOKEN)

## Dependencies

In [None]:
import json
import os
import re
import time
from itertools import islice
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from evaluate import load
from torchinfo import summary
from tqdm.auto import tqdm
from transformers import WhisperForConditionalGeneration, WhisperProcessor, BitsAndBytesConfig

## Utilities

In [None]:
# Root directory for everything this notebook writes; per-model output folders
# are created underneath it.
main_path = Path("/kaggle/working")

In [None]:
def normalize_text(text):
    """Canonicalise a transcript so references and predictions compare fairly.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is dropped, each run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is trimmed.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text.lower())
    single_spaced = re.sub(r"\s+", " ", without_punctuation)
    return single_spaced.strip()


@torch.inference_mode()
def predict_whisper(model, processor, sample):
    """Transcribe a single dataset sample with a Whisper model.

    Args:
        model: Whisper generation model; must expose ``device`` and
            ``generate``.
        processor: Processor paired with ``model``; it is called to build the
            model inputs and its ``decode`` turns token ids back into text.
        sample: Mapping with ``"id"``, ``"text"`` and ``"audio"`` entries,
            where ``"audio"`` provides ``"array"`` and ``"sampling_rate"``.

    Returns:
        A dict with the sample ``"id"``, the normalised ``"reference"``
        transcript and the normalised ``"prediction"``.
    """
    audio = sample["audio"]
    inputs = processor(
        audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
    )
    # Move the features to the model's device and cast them to half precision
    # (assumes the model itself runs in float16 -- confirm if the loading
    # configuration changes).
    inputs = inputs.to(model.device, dtype=torch.float16)

    # Greedy decoding keeps the benchmark deterministic.
    outputs = model.generate(**inputs, do_sample=False)

    # Drop Whisper's control tokens (<|startoftranscript|>, <|en|>,
    # <|endoftext|>, ...). Without this, normalize_text would strip only the
    # "<|" / "|>" punctuation and leave words such as "startoftranscript" in
    # the prediction, which inflates the WER.
    transcription = processor.decode(outputs[0], skip_special_tokens=True)

    return {
        "id": sample["id"],
        "reference": normalize_text(sample["text"]),
        "prediction": normalize_text(transcription),
    }

In [None]:
def benchmark(model, processor, predict_fn, dataset, max_samples: int | None = None):
    """Run ``predict_fn`` over ``dataset`` and collect latency and WER figures.

    Args:
        model: Model forwarded unchanged to ``predict_fn``.
        processor: Processor forwarded unchanged to ``predict_fn``.
        predict_fn: Callable ``(model, processor, sample) -> dict``; the
            returned dict must contain ``"reference"`` and ``"prediction"``.
        dataset: Iterable of samples.
        max_samples: Upper bound on the number of samples to evaluate;
            ``None`` evaluates the whole dataset. Values below zero are
            treated as zero.

    Returns:
        A dict with two entries:

        * ``"benchmark_results"``: one dict per sample holding everything
          ``predict_fn`` returned plus ``"inference_time"`` (seconds spent in
          the whole ``predict_fn`` call) and ``"wer"`` (percent).
        * ``"benchmark_summary"``: ``"total_samples"``,
          ``"average_inference_time"`` and ``"average_wer"``. The averages are
          plain means of the per-sample values and are NaN when no sample was
          evaluated.
    """
    wer = load("wer")

    # islice stops *before* pulling the next item, so a streaming dataset is
    # never asked to fetch and decode a sample that would only be discarded.
    limit = None if max_samples is None else max(max_samples, 0)
    samples = dataset if limit is None else islice(dataset, limit)

    benchmark_results = []
    for sample in tqdm(samples, desc="Benchmarking", total=limit):
        start_time = time.perf_counter()
        prediction = predict_fn(model, processor, sample)
        inference_time = time.perf_counter() - start_time

        # The metric returns a fraction; scale it to a percentage.
        wer_result = 100 * wer.compute(
            references=[prediction["reference"]],
            predictions=[prediction["prediction"]],
        )

        benchmark_results.append(
            {**prediction, "inference_time": inference_time, "wer": wer_result}
        )

    if benchmark_results:
        average_inference_time = np.mean(
            [entry["inference_time"] for entry in benchmark_results]
        ).item()
        average_wer = np.mean([entry["wer"] for entry in benchmark_results]).item()
    else:
        # Same value np.mean([]) would give, without its RuntimeWarning.
        average_inference_time = average_wer = float("nan")

    benchmark_summary = {
        "total_samples": len(benchmark_results),
        "average_inference_time": average_inference_time,
        "average_wer": average_wer,
    }

    return {
        "benchmark_results": benchmark_results,
        "benchmark_summary": benchmark_summary,
    }

In [None]:
def save_results(result, folder_path: Path):
    benchmark_results = pd.DataFrame(result["benchmark_results"])
    benchmark_summary = result["benchmark_summary"]

    os.makedirs(folder_path, exist_ok=True)

    benchmark_results.to_csv(folder_path / "benchmark_results.csv", index=False)

    with open(folder_path / "benchmark_summary.json", "w") as f:
        json.dump(benchmark_summary, f, indent=4)

## Dataset

In [None]:
# LibriSpeech "clean" test split, opened in streaming mode so samples are
# fetched while iterating rather than materialised up front.
dataset = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)

## Models

### openai/whisper-large-v2

In [None]:
# Processor (feature extractor + tokenizer) matching the checkpoint below.
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")

# Load the checkpoint with bitsandbytes 8-bit quantization. BitsAndBytesConfig
# exposes a compute-dtype option only for 4-bit loading
# (`bnb_4bit_compute_dtype`); there is no 8-bit equivalent, so none is passed.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
whisper_model_8_bit = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v2",
    quantization_config=quantization_config,
    device_map="auto",
)

In [None]:
# Show the torchinfo summary of the quantized model.
summary(whisper_model_8_bit)

In [None]:
!nvidia-smi

In [None]:
# Evaluate the 8-bit model on at most 1000 samples of the dataset.
results = benchmark(
    whisper_model_8_bit, whisper_processor, predict_whisper, dataset, max_samples=1000
)
# Store the per-sample CSV and the JSON summary under a model/precision folder.
output_folder = main_path / "openai/whisper-large-v2/8-bit"
save_results(results, output_folder)