# Benchmarking Automatic Speech Recognition Models

## Kaggle Specific

In [1]:
# Install/upgrade the Hugging Face libraries used below (--quiet hides most pip output).
!pip install --quiet --upgrade transformers datasets accelerate timm datasets[audio]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m108.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m367.1/367.1 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
# Install evaluate, jiwer and mistral-common (--quiet hides most pip output).
!pip install --quiet evaluate jiwer mistral-common

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m102.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m101.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Set up Hugging Face: read the access token from Kaggle secrets and log in
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
# The token is looked up under the secret name "HF_TOKEN".
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

login(token=HF_TOKEN)

## Dependencies

In [None]:
import json
import os
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from evaluate import load
from torchinfo import summary
from tqdm.auto import tqdm
from transformers import AutoProcessor, VoxtralForConditionalGeneration

2025-07-30 15:23:46.268928: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753889026.451153      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753889026.503344      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Utilities

In [5]:
# Base directory for notebook outputs; benchmark result folders are created beneath it.
main_path = Path("/kaggle/working")

In [6]:
def normalize_text(text):
    """Return a canonical form of ``text`` for comparing transcripts.

    The string is lower-cased, every character that is neither a word
    character nor whitespace is deleted, each run of whitespace is collapsed
    to a single space, and leading/trailing whitespace is stripped.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text.lower())
    return re.sub(r"\s+", " ", without_punctuation).strip()


@torch.inference_mode()
def predict_voxtral_mini(model, processor, sample):
    """Run a Voxtral Mini transcription request for one dataset sample.

    Args:
        model: Model exposing ``device`` and ``generate``.
        processor: Processor exposing ``apply_transcription_request`` and
            ``decode``.
        sample: Mapping with the keys ``"id"``, ``"text"`` and ``"audio"``;
            ``sample["audio"]`` must provide ``"array"`` and
            ``"sampling_rate"``.

    Returns:
        dict with ``"id"``, ``"reference"`` (normalized ``sample["text"]``)
        and ``"prediction"`` (normalized decoded model output).
    """
    sample_id = sample["id"]
    reference = normalize_text(sample["text"])

    audio = sample["audio"]
    request = processor.apply_transcription_request(
        language="en",
        audio=audio["array"],
        sampling_rate=audio["sampling_rate"],
        model_id="mistralai/Voxtral-Mini-3B-2507",
        format=["flac"],
    ).to(model.device, dtype=torch.bfloat16)
    prompt_length = request.input_ids.shape[1]

    generated = model.generate(**request, do_sample=False, max_new_tokens=10000)
    # Slice off the prompt so only the newly generated token ids are decoded.
    new_token_ids = generated[0, prompt_length:]
    transcription = processor.decode(new_token_ids, skip_special_tokens=True)

    return {
        "id": sample_id,
        "reference": reference,
        "prediction": normalize_text(transcription),
    }

In [7]:
def benchmark(model, processor, predict_fn, dataset, max_samples: int | None = None):
    """Run ``predict_fn`` over ``dataset`` and collect per-sample timing and WER.

    Args:
        model: Passed through unchanged to ``predict_fn``.
        processor: Passed through unchanged to ``predict_fn``.
        predict_fn: Callable ``predict_fn(model, processor, sample)`` returning
            a dict that contains at least ``"reference"`` and ``"prediction"``.
        dataset: Iterable of samples.
        max_samples: Maximum number of samples to evaluate; ``None`` evaluates
            the whole dataset.

    Returns:
        dict with two keys:

        * ``"benchmark_results"``: one dict per evaluated sample, made of the
          ``predict_fn`` output plus ``"inference_time"`` (wall-clock seconds
          spent inside ``predict_fn``) and ``"wer"`` (the metric value
          multiplied by 100).
        * ``"benchmark_summary"``: ``"total_samples"``,
          ``"average_inference_time"`` and ``"average_wer"``. Both averages
          are unweighted means over the per-sample values, so ``average_wer``
          is not a corpus-level WER.
    """
    from itertools import islice

    benchmark_results = []
    wer = load("wer")

    # Cap the iterator itself instead of breaking out of the loop: with
    # enumerate + break, one extra sample is pulled from the dataset before
    # the limit check runs. When there is no cap, iterate the dataset directly
    # so tqdm can still infer a total from datasets that define a length.
    if max_samples is None:
        samples = dataset
    else:
        samples = islice(dataset, max(max_samples, 0))

    for sample in tqdm(samples, desc="Benchmarking", total=max_samples):
        start_time = time.perf_counter()
        prediction = predict_fn(model, processor, sample)
        end_time = time.perf_counter()

        inference_time = end_time - start_time
        wer_result = 100 * wer.compute(
            references=[prediction["reference"]],
            predictions=[prediction["prediction"]],
        )

        benchmark_results.append(
            {**prediction, "inference_time": inference_time, "wer": wer_result}
        )

    total_samples = len(benchmark_results)
    average_inference_time = np.mean(
        [result["inference_time"] for result in benchmark_results]
    ).item()
    average_wer = np.mean([result["wer"] for result in benchmark_results]).item()

    benchmark_summary = {
        "total_samples": total_samples,
        "average_inference_time": average_inference_time,
        "average_wer": average_wer,
    }

    result = {
        "benchmark_results": benchmark_results,
        "benchmark_summary": benchmark_summary,
    }

    return result

In [8]:
def save_results(result, folder_path: Path):
    """Write a benchmark result to ``folder_path``.

    ``result["benchmark_results"]`` is written as ``benchmark_results.csv``
    (without the index column) and ``result["benchmark_summary"]`` as
    ``benchmark_summary.json`` (indented by 4 spaces). The folder is created,
    including missing parents, if it does not exist yet.
    """
    per_sample_frame = pd.DataFrame(result["benchmark_results"])
    summary = result["benchmark_summary"]

    folder_path.mkdir(parents=True, exist_ok=True)

    csv_path = folder_path / "benchmark_results.csv"
    per_sample_frame.to_csv(csv_path, index=False)

    json_path = folder_path / "benchmark_summary.json"
    with open(json_path, "w") as summary_file:
        json.dump(summary, summary_file, indent=4)

## Dataset

In [9]:
# LibriSpeech "clean" test split, opened with streaming=True so samples are
# fetched as the dataset is iterated.
dataset = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True)

README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

## Models

### mistralai/Voxtral-Mini-3B-2507

In [10]:
# Load the Voxtral Mini processor and model; the model is requested in
# bfloat16 with device_map="auto" for device placement.
voxtral_mini_processor = AutoProcessor.from_pretrained("mistralai/Voxtral-Mini-3B-2507")
voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained(
    "mistralai/Voxtral-Mini-3B-2507", torch_dtype=torch.bfloat16, device_map="auto"
)

preprocessor_config.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

tekken.json:   0%|          | 0.00/14.9M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.38G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/108 [00:00<?, ?B/s]

In [11]:
# Display the layer-by-layer parameter counts of the loaded model.
summary(voxtral_mini_model)

Layer (type:depth-idx)                             Param #
VoxtralForConditionalGeneration                    --
├─VoxtralEncoder: 1-1                              --
│    └─Conv1d: 2-1                                 492,800
│    └─Conv1d: 2-2                                 4,916,480
│    └─Embedding: 2-3                              (1,920,000)
│    └─ModuleList: 2-4                             --
│    │    └─VoxtralEncoderLayer: 3-1               19,676,160
│    │    └─VoxtralEncoderLayer: 3-2               19,676,160
│    │    └─VoxtralEncoderLayer: 3-3               19,676,160
│    │    └─VoxtralEncoderLayer: 3-4               19,676,160
│    │    └─VoxtralEncoderLayer: 3-5               19,676,160
│    │    └─VoxtralEncoderLayer: 3-6               19,676,160
│    │    └─VoxtralEncoderLayer: 3-7               19,676,160
│    │    └─VoxtralEncoderLayer: 3-8               19,676,160
│    │    └─VoxtralEncoderLayer: 3-9               19,676,160
│    │    └─VoxtralEncoderLayer: 3-10 

In [12]:
# Show GPU status and memory usage now that the model has been loaded.
!nvidia-smi

Wed Jul 30 15:25:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             30W /  250W |    9195MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                     

In [13]:
# Benchmark Voxtral Mini on at most 1000 samples, then write the per-sample
# results and the summary to a model-specific folder under main_path.
results = benchmark(
    voxtral_mini_model,
    voxtral_mini_processor,
    predict_voxtral_mini,
    dataset,
    max_samples=1000,
)
output_folder = main_path / "mistralai/Voxtral-Mini-3B-2507"
save_results(results, output_folder)

Downloading builder script: 0.00B [00:00, ?B/s]

Benchmarking:   0%|          | 0/1000 [00:00<?, ?it/s]