In [7]:
import os
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
import evaluate
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from dataclasses import dataclass
import gc

def flush_gpu():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    print("✅ GPU memory flushed.")

flush_gpu()

@dataclass
class Config:
    MANIFEST_CSV = "./manifest/preprocessed-segments-index.csv"
    TEST_RATIO   = 0.1
    SEED         = 42
    BATCH_SIZE   = 4
    DEVICE       = "cuda" if torch.cuda.is_available() else "cpu"
    # list your fine-tuned model and any baselines you want to compare
    MODEL_LIST   = [
        "./whisper-ja-asmr-distil-whisper-large-v3-ja-reazonspeech-all-1-earlyst-normalize-warm-lora-baonly/final-merged",
        "./whisper-ja-asmr-tiny-1-earlyst/final",
        "./whisper-ja-asmr-small-1-earlyst/final",
        "./whisper-ja-asmr-tiny-2-earlyst-normalize-warm/final",
        "openai/whisper-tiny",
        "openai/whisper-small",
        "openai/whisper-medium",
        "openai/whisper-large-v3",
        "japanese-asr/distil-whisper-large-v3-ja-reazonspeech-all",
        # "japanese-asr/distil-whisper-large-v3-ja-reazonspeech-tiny",
        # "japanese-asr/distil-whisper-large-v3-ja-reazonspeech-medium",
        # "japanese-asr/distil-whisper-large-v3-ja-reazonspeech-small",
    ]
    # these hyperparameters will be *identical* across every model:
    GENERATION_KWARGS = {
        "num_beams":            3,
        "no_repeat_ngram_size": 2,
        "repetition_penalty":   1.5,
        "length_penalty":       1.0,
        "early_stopping":       True,
        "max_new_tokens":       50,
    }

# 1) load & split test set
df = pd.read_csv(Config.MANIFEST_CSV)
df = df[df.lang == "ja"].reset_index(drop=True)
_, test_df = np.split(
    df.sample(frac=1, random_state=Config.SEED),
    [ int(len(df)*(1 - Config.TEST_RATIO)) ]
)
print(f"▶ Test examples: {len(test_df)}")

✅ GPU memory flushed.
▶ Test examples: 1698


  return bound(*args, **kwds)


In [8]:
import os
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
import evaluate
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from dataclasses import dataclass

# 2) prepare metrics
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

# 3) batched evaluation function
import torch.nn.functional as F  # ← add this at top of your script

def evaluate_model(model_name: str):
    is_local = model_name.startswith("./")
    print(f"\n→ Evaluating: {model_name} ({'local' if is_local else 'hub'})")
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name).to(Config.DEVICE)

    # clear any forced tokens
    model.generation_config.forced_decoder_ids = None
    model.generation_config.suppress_tokens      = []

    # apply shared hyperparameters
    for k, v in Config.GENERATION_KWARGS.items():
        setattr(model.generation_config, k, v)

    preds, refs = [], []
    n = len(test_df)
    for start in tqdm(range(0, n, Config.BATCH_SIZE), desc="  batches"):
        batch = test_df.iloc[start : start + Config.BATCH_SIZE]
        audio_list, txt_list = [], []
        for npz_path in batch.npz_path:
            data = np.load(npz_path, allow_pickle=True)
            audio_list.append(data["audio"].astype(np.float32))
            txt_list.append(str(data["text"]))
        refs.extend(txt_list)

        # --- FIXED BLOCK: pad/trim to fixed mel‐feature length ---
        inputs = processor(
            audio_list,
            sampling_rate=16_000,
            return_tensors="pt",
            padding=True,
            return_attention_mask=True,
        )
        # shape: (batch, n_mels, seq_len)
        input_feats    = inputs.input_features.to(Config.DEVICE)
        # shape: (batch, seq_len)
        attention_mask = inputs.attention_mask.to(Config.DEVICE)

        # compute required length: max_source_positions * conv1.stride * conv2.stride
        c1 = model.model.encoder.conv1.stride[0]
        c2 = model.model.encoder.conv2.stride[0]
        expected_len = model.config.max_source_positions * c1 * c2

        seq_len = input_feats.shape[-1]
        if seq_len < expected_len:
            pad_amt = expected_len - seq_len
            # pad last dimension (time) by pad_amt
            input_feats    = F.pad(input_feats,    (0, pad_amt))
            attention_mask = F.pad(attention_mask, (0, pad_amt))
        elif seq_len > expected_len:
            input_feats    = input_feats[..., :expected_len]
            attention_mask = attention_mask[..., :expected_len]
        # -----------------------------------------------------------

        with torch.no_grad():
            gen_ids = model.generate(
                input_feats,
                attention_mask=attention_mask,
            )

        batch_preds = processor.batch_decode(gen_ids, skip_special_tokens=True)
        preds.extend(batch_preds)

    wer = wer_metric.compute(predictions=preds, references=refs)
    cer = cer_metric.compute(predictions=preds, references=refs)
    print(f"    WER: {wer:.3f}   CER: {cer:.3f}")
    return {"wer": wer, "cer": cer}

In [9]:
# 4) loop through all models
results = {}
for model_name in Config.MODEL_LIST:
    results[model_name] = evaluate_model(model_name)


→ Evaluating: ./whisper-ja-asmr-distil-whisper-large-v3-ja-reazonspeech-all-1-earlyst-normalize-warm-lora-baonly/final-merged (local)


  batches:   0%|          | 0/425 [00:00<?, ?it/s]

`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> to see related `.generate()` flags.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProce

    WER: 0.917   CER: 0.292

→ Evaluating: ./whisper-ja-asmr-tiny-1-earlyst/final (local)


  batches:   0%|          | 0/425 [00:00<?, ?it/s]

    WER: 1.097   CER: 0.942

→ Evaluating: ./whisper-ja-asmr-small-1-earlyst/final (local)


  batches:   0%|          | 0/425 [00:00<?, ?it/s]

    WER: 1.096   CER: 0.611

→ Evaluating: ./whisper-ja-asmr-tiny-2-earlyst-normalize-warm/final (local)


  batches:   0%|          | 0/425 [00:00<?, ?it/s]

    WER: 1.127   CER: 1.270

→ Evaluating: openai/whisper-tiny (hub)


  batches:   0%|          | 0/425 [00:00<?, ?it/s]

    WER: 4.770   CER: 1.674

→ Evaluating: openai/whisper-small (hub)


  batches:   0%|          | 0/425 [00:00<?, ?it/s]

    WER: 4.852   CER: 1.867

→ Evaluating: openai/whisper-medium (hub)


  batches:   0%|          | 0/425 [00:00<?, ?it/s]

    WER: 5.025   CER: 1.915

→ Evaluating: openai/whisper-large-v3 (hub)


  batches:   0%|          | 0/425 [00:00<?, ?it/s]

    WER: 5.342   CER: 1.988

→ Evaluating: japanese-asr/distil-whisper-large-v3-ja-reazonspeech-all (hub)


  batches:   0%|          | 0/425 [00:00<?, ?it/s]

    WER: 0.953   CER: 0.405


In [10]:
# 5) summary
df_results = pd.DataFrame([
    {"model": m, **metrics}
    for m, metrics in results.items()
])
print("\n### Comparison")
print(df_results)
df_results.to_csv("./manifest/model-comparison-evals.csv", index=False)


### Comparison
                                               model       wer       cer
0  ./whisper-ja-asmr-distil-whisper-large-v3-ja-r...  0.916911  0.291941
1             ./whisper-ja-asmr-tiny-1-earlyst/final  1.097356  0.941794
2            ./whisper-ja-asmr-small-1-earlyst/final  1.095678  0.610665
3  ./whisper-ja-asmr-tiny-2-earlyst-normalize-war...  1.126731  1.270193
4                                openai/whisper-tiny  4.770038  1.674027
5                               openai/whisper-small  4.852287  1.866936
6                              openai/whisper-medium  5.025178  1.914799
7                            openai/whisper-large-v3  5.341586  1.988444
8  japanese-asr/distil-whisper-large-v3-ja-reazon...  0.953000  0.405017
