In [1]:
import sys
from pathlib import Path

# Make the directory above the notebook importable (once) so that project
# packages living there can be imported from this notebook.
parent_dir = Path('.').absolute().parent
if sys.path.count(str(parent_dir)) == 0:
    sys.path.insert(0, str(parent_dir))

In [2]:
import os
import torch
import pandas as pd
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

from datasets import DatasetDict

# --- Reproducibility and base checkpoint -----------------------------------
SEED = 137
MODEL_NAME = "openai/whisper-small"

# --- Input data --------------------------------------------------------------
DATASET_DIR = "../datasets"
DATASET_PATH = os.path.join(DATASET_DIR, "combined__case_sensitive_part2.csv")

# --- Outputs (the preprocessed dataset cache is kept as an absolute path) ----
OUTPUT_DIR = "../outputs/part2"
PROCESSED_DATASET_DIR = os.path.abspath(
    os.path.join(OUTPUT_DIR, "processed_uzbek_asr_dataset")
)

In [5]:
# Base checkpoint and its processor, both configured for Uzbek transcription
processor = WhisperProcessor.from_pretrained(
    MODEL_NAME,
    task="transcribe",
    language="uz",
)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)

# Decoder prompt ids for Uzbek transcription, stored on the model config
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    task="transcribe",
    language="uz",
)

# The same language/task choice, stored on the generation config
model.generation_config.task = "transcribe"
model.generation_config.language = "uz"

# Fall back to the EOS id when the tokenizer has no pad token
# (avoids the attention mask warning)
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id

print(processor.feature_extractor)
model

WhisperFeatureExtractor {
  "chunk_length": 30,
  "dither": 0.0,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}



WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [6]:
import gc
import os
from datasets import load_from_disk


def prepare_dataset_for_training(batch):
    """Convert one batch of raw examples into model-ready columns.

    Reads the clips from ``batch["audio"]`` (each item is indexed with
    ``"array"`` and ``"sampling_rate"``) and the transcripts from
    ``batch["ref_normalized"]``.

    Returns a dict with:
        input_features: output of the processor's feature extractor (log-Mel).
        labels: token ids of the transcripts, not padded.
        dataset: the ``batch["dataset"]`` column, passed through as metadata
            for evaluation.
    """
    clips = batch["audio"]
    waveforms = [clip["array"] for clip in clips]

    # The sampling rate of the first clip is applied to the whole batch
    mel_features = processor.feature_extractor(
        waveforms,
        sampling_rate=clips[0]["sampling_rate"],
    ).input_features

    # Direct tokenizer call; padding is deliberately left to the data collator
    tokenized = processor.tokenizer(
        batch["ref_normalized"],
        padding=False,
        truncation=True,
    )

    return {
        "input_features": mel_features,
        "labels": tokenized.input_ids,
        "dataset": batch["dataset"],
    }


def process_in_chunks(dataset_split, split_name, output_dir, chunk_size=10000):
    """Preprocess a large split chunk by chunk and return the concatenated result.

    Every chunk of ``chunk_size`` consecutive rows is mapped through
    ``prepare_dataset_for_training`` and saved to
    ``<output_dir>/<split_name>_chunks/chunk_<i>``. A chunk whose directory
    already exists is skipped, so an interrupted run can be resumed.
    NOTE(review): only the existence of the directory is checked; a chunk
    directory left behind by a crash in the middle of saving would be reused.

    Args:
        dataset_split: Dataset to preprocess (needs ``len``, ``select``,
            ``map`` and the result needs ``save_to_disk``).
        split_name: Name of the split, used for the chunk directory name.
        output_dir: Directory under which the chunk directory is created.
        chunk_size: Number of rows per chunk; must be positive.

    Returns:
        The concatenation of all chunks, each loaded back from disk.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Imported locally: the imports visible in this notebook never bring
    # `concatenate_datasets` into scope, so the final step used to fail with
    # NameError after all chunks had already been processed.
    from datasets import concatenate_datasets, load_from_disk

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    num_samples = len(dataset_split)
    num_chunks = (num_samples + chunk_size - 1) // chunk_size  # ceiling division
    chunk_dir = os.path.join(output_dir, f"{split_name}_chunks")
    os.makedirs(chunk_dir, exist_ok=True)

    for i in range(num_chunks):
        chunk_path = os.path.join(chunk_dir, f"chunk_{i}")

        # Resume support: an existing chunk directory is treated as done
        if os.path.exists(chunk_path):
            print(f"  Chunk {i + 1}/{num_chunks} exists, skipping...")
            continue

        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, num_samples)  # exclusive upper bound
        print(f"  Processing chunk {i + 1}/{num_chunks} (samples {start_idx}-{end_idx})")

        chunk = dataset_split.select(range(start_idx, end_idx))

        processed_chunk = chunk.map(
            prepare_dataset_for_training,
            batched=True,
            batch_size=64,
            num_proc=4,
            keep_in_memory=False,
            writer_batch_size=1000,
        )

        processed_chunk.save_to_disk(chunk_path)

        # Release the chunk before the next one is built
        del chunk, processed_chunk
        gc.collect()

    # Now concatenate chunks (memory-mapped, should be safe)
    print(f"  Concatenating {num_chunks} chunks...")
    chunks = [
        load_from_disk(os.path.join(chunk_dir, f"chunk_{i}"))
        for i in range(num_chunks)
    ]

    return concatenate_datasets(chunks)


# Raw (un-preprocessed) splits are only needed when the processed cache below
# is missing. Keep a `ds_dict` that already exists in the kernel rather than
# overwriting it with an empty dict: with an unconditional reset the
# preprocessing fallback could only ever fail with a bare KeyError.
try:
    ds_dict
except NameError:
    ds_dict = {}  # Should be able to load from already processed dataset


def _load_processed_splits(split_names):
    """Load the given preprocessed splits from PROCESSED_DATASET_DIR."""
    return DatasetDict({
        name: load_from_disk(os.path.join(PROCESSED_DATASET_DIR, name))
        for name in split_names
    })


# Splits required by this notebook. The preprocessing loop below is driven by
# this list, so adding "train" here also enables the chunked training path.
splits = ["validation", "test"]

# Check if fully processed
all_exist = all(
    os.path.exists(os.path.join(PROCESSED_DATASET_DIR, split))
    for split in splits
)

if all_exist:
    print(f"--- Found existing processed dataset at {PROCESSED_DATASET_DIR} ---")
    print("Loading from disk to save time...")
    dataset = _load_processed_splits(splits)
    print("✓ Preprocessed dataset loaded from disk!")
else:
    print(f"--- Processed dataset not found or incomplete at {PROCESSED_DATASET_DIR} ---")
    print("Starting the heavy preprocessing (this will take a while)...")
    os.makedirs(PROCESSED_DATASET_DIR, exist_ok=True)

    for split_name in splits:
        split_output_path = os.path.join(PROCESSED_DATASET_DIR, split_name)

        if os.path.exists(split_output_path):
            print(f"✓ {split_name} already exists, skipping...")
            continue

        # Fail with an actionable message instead of a bare KeyError
        if split_name not in ds_dict:
            raise RuntimeError(
                f"Raw split '{split_name}' is not available in ds_dict and no "
                f"processed copy exists at {split_output_path}. Build the raw "
                "dataset (ds_dict) with this split before running this cell."
            )

        print(f"\nProcessing {split_name} split...")

        if split_name == "train":
            # Use chunked processing for large train split
            processed_split = process_in_chunks(
                ds_dict[split_name],
                split_name,
                PROCESSED_DATASET_DIR,
                chunk_size=10000  # ~10GB chunks
            )
        else:
            # Regular processing for smaller splits
            processed_split = ds_dict[split_name].map(
                prepare_dataset_for_training,
                batched=True,
                batch_size=32,
                num_proc=8,
                keep_in_memory=False,
                writer_batch_size=1000,
            )

        processed_split.save_to_disk(split_output_path)
        print(f"✓ {split_name} saved: {len(processed_split)} samples")

        del processed_split
        gc.collect()

    # Load the complete dataset back from disk
    dataset = _load_processed_splits(splits)
    print("\n✓ All splits processed and saved!")

print(f"✓ Validation: {len(dataset['validation'])} samples")
print(f"✓ Test: {len(dataset['test'])} samples")

# Show sample
print("\nSample processed data:")
sample = dataset["test"][0]
# Note: this prints the length of the outer dimension of the features
print(f"  Input features shape: {len(sample['input_features'])}")
print(f"  Labels length: {len(sample['labels'])}")
print(f"  First few label IDs: {sample['labels'][:10]}")

--- Found existing processed dataset at /root/uzbek-automatic-speech-recognition/outputs/part2/processed_uzbek_asr_dataset ---
Loading from disk to save time...


Loading dataset from disk:   0%|          | 0/17 [00:00<?, ?it/s]

✓ Preprocessed dataset loaded from disk!
✓ Validation: 5728 samples
✓ Test: 6994 samples

Sample processed data:
  Input features shape: 80
  Labels length: 29
  First few label IDs: [50258, 50337, 50359, 50363, 35, 268, 271, 12810, 6981, 1726]


In [9]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad a list of preprocessed examples into one batch of tensors.

    Each feature must provide ``"input_features"`` and ``"labels"``. Inputs are
    padded with the processor's feature extractor, labels with its tokenizer,
    and label positions that are padding are replaced by -100 so that the loss
    ignores them.

    If every label sequence in the batch starts with the start token, that
    first column is dropped. By default the start token is the tokenizer's
    ``bos_token_id`` (the original behaviour). The preprocessed labels shown
    in this notebook start with id 50258, so pass that id as
    ``decoder_start_token_id`` if the leading token should be stripped.
    NOTE(review): for Whisper tokenizers ``bos_token_id`` appears to be the
    end-of-text token rather than the start-of-transcript token, in which case
    the default check never fires — confirm against the tokenizer in use.
    """

    # Processor exposing `.feature_extractor.pad` and `.tokenizer.pad`
    processor: Any
    # Optional id of the leading label token to strip; None keeps the original
    # comparison against `processor.tokenizer.bos_token_id`.
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into a padded batch with a ``"labels"`` entry."""
        # Split inputs and labels: they need different padding methods
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad input features
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad labels
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 to ignore loss
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Token expected at the start of every label sequence
        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.bos_token_id

        # Remove the leading start token only if the whole batch begins with it
        if (labels[:, 0] == start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


# Single collator instance reused by the trainers built later in this notebook.
# It stores its own reference to the current `processor` object.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## Base Model - Whisper Small

In [8]:
from scripts.whisper_utils import evaluate_by_dataset_with_trainer

# Evaluation-only settings: predictions come from generate() with one beam
training_args = Seq2SeqTrainingArguments(
    output_dir="./base_model_eval_temp",
    fp16=True,
    per_device_eval_batch_size=192,
    dataloader_num_workers=8,
    predict_with_generate=True,
    generation_num_beams=1,
    generation_max_length=225,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=processor.feature_extractor,
    data_collator=data_collator,
)

# Report metrics for the test split, overall and per source dataset
evaluate_by_dataset_with_trainer(
    trainer, processor, dataset["test"], "test",
    should_always_normalize_apostrophes=True,
)

# Drop this cell's references to the base model before the next model is loaded
del model, trainer, processor
gc.collect()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize


DETAILED EVALUATION: TEST

OVERALL METRICS
--------------------------------------------------------------------------------
WER (normalized)             116.27%
CER (normalized)              45.34%
Sequence Similarity           58.79%
WER (raw)                    118.03%
CER (raw)                     46.87%
Seq Similarity (raw)          57.12%

METRICS BY DATASET

common_voice
--------------------------------------------------------------------------------
WER (normalized)             116.25%
CER (normalized)              41.51%
Sequence Similarity           67.83%
WER (raw)                    117.47%
CER (raw)                     42.39%
Seq Similarity (raw)          66.52%

it
--------------------------------------------------------------------------------
WER (normalized)             114.97%
CER (normalized)              60.47%
Sequence Similarity           26.50%
WER (raw)                    116.37%
CER (raw)                     61.80%
Seq Similarity (raw)          23.92%

news
---

2

In [13]:
import re
import warnings

# Suppress the specific deprecation warning
warnings.filterwarnings("ignore", message=".*return_token_timestamps.*")


def test_model(model_path: str, audio_file: str):
    """
    Test the fine-tuned model on a sample audio file

    Args:
        model_path: Path to fine-tuned model
        audio_file: Path to audio file to transcribe
    """

    def group_words_by_sentences(chunks):
        """Group word-level timestamps into sentences"""
        sentences = []
        current_sentence = {'words': [], 'start': None, 'end': None, 'text': ''}

        for chunk in chunks:
            text = chunk['text'].strip()
            start, end = chunk['timestamp']

            # Initialize start time
            if current_sentence['start'] is None:
                current_sentence['start'] = start

            current_sentence['words'].append(text)
            current_sentence['end'] = end

            # Check if sentence ends
            if text.endswith('.') or text.endswith('!') or text.endswith('?'):
                current_sentence['text'] = ' '.join(current_sentence['words'])
                sentences.append(current_sentence.copy())
                current_sentence = {'words': [], 'start': None, 'end': None, 'text': ''}

        # Add remaining words as final sentence
        if current_sentence['words']:
            current_sentence['text'] = ' '.join(current_sentence['words'])
            sentences.append(current_sentence)

        return sentences

    print("\n" + "=" * 50)
    print("TESTING MODEL")
    print("=" * 50)

    import torch
    from transformers import pipeline

    # Load model
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_path,
        device=0 if torch.cuda.is_available() else -1,
    )

    print(f"✓ Loaded model from {model_path}")
    print(f"✓ Processing: {audio_file}")

    # Transcribe
    result = pipe(
        audio_file,
        language="uz",
        task="transcribe",
        return_timestamps="word"
    )
    sentences = group_words_by_sentences(result['chunks'])

    print(f"\nTranscription: {result['text']}")
    print(f"\nSentence-based timestamps:")
    for sent in sentences:
        start_str = f"{sent['start']:.2f}s" if sent['start'] is not None else "start"
        end_str = f"{sent['end']:.2f}s" if sent['end'] is not None else "end"
        print(f"[{start_str} - {end_str}]: {re.sub(r" '", "'", sent['text'])}")

    return result

## My Final Model

In [10]:
from scripts.whisper_utils import load_model, evaluate_by_dataset_with_trainer

# Absolute location of the fine-tuned checkpoint
FINAL_MODEL_PATH = os.path.abspath(os.path.join(OUTPUT_DIR, "whisper-uzbek-final"))

final_model, final_trainer, final_processor = load_model(
    FINAL_MODEL_PATH,
    dataset,
    data_collator,
    eval_batch_size=192,
)

# Same report for both held-out splits, validation first
for eval_split in ("validation", "test"):
    evaluate_by_dataset_with_trainer(
        final_trainer,
        final_processor,
        dataset[eval_split],
        eval_split,
        should_always_normalize_apostrophes=True,
    )

# Drop this cell's references (and the loop variable) before the next model
del final_model, final_trainer, final_processor, eval_split
gc.collect()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize


DETAILED EVALUATION: VALIDATION

OVERALL METRICS
--------------------------------------------------------------------------------
WER (normalized)               7.44%
CER (normalized)               2.05%
Sequence Similarity           95.31%
WER (raw)                     11.35%
CER (raw)                      2.65%
Seq Similarity (raw)          94.30%

METRICS BY DATASET

common_voice
--------------------------------------------------------------------------------
WER (normalized)               7.39%
CER (normalized)               1.65%
Sequence Similarity           98.80%
WER (raw)                     11.37%
CER (raw)                      2.29%
Seq Similarity (raw)          98.33%

it
--------------------------------------------------------------------------------
WER (normalized)              16.52%
CER (normalized)               5.96%
Sequence Similarity           79.90%
WER (raw)                     23.84%
CER (raw)                      7.19%
Seq Similarity (raw)          75.99%

ne

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize


DETAILED EVALUATION: TEST

OVERALL METRICS
--------------------------------------------------------------------------------
WER (normalized)               8.52%
CER (normalized)               2.60%
Sequence Similarity           94.62%
WER (raw)                     12.86%
CER (raw)                      3.32%
Seq Similarity (raw)          92.92%

METRICS BY DATASET

common_voice
--------------------------------------------------------------------------------
WER (normalized)               8.16%
CER (normalized)               1.98%
Sequence Similarity           98.52%
WER (raw)                     12.46%
CER (raw)                      2.64%
Seq Similarity (raw)          98.03%

it
--------------------------------------------------------------------------------
WER (normalized)              15.18%
CER (normalized)               5.88%
Sequence Similarity           83.65%
WER (raw)                     21.72%
CER (raw)                      7.05%
Seq Similarity (raw)          77.03%

news
---

5756

In [11]:
# Transcribe one sample clip with the fine-tuned checkpoint.
# The check is skipped silently when the clip is not present under DATASET_DIR.
test_audio = os.path.join(DATASET_DIR, "devona_sample.wav")
if os.path.exists(test_audio):
    test_model(FINAL_MODEL_PATH, test_audio)


TESTING MODEL


Device set to use cuda:0


✓ Loaded model from /root/uzbek-automatic-speech-recognition/outputs/part2/whisper-uzbek-final
✓ Processing: ../datasets/devona_sample.wav


`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.



Transcription: Deonining juda ko'p g'alati savollari bor edi. Ba'zan duch kelgan odamlardan qayerga ketyapsizlar, deb so'rar. Odamlar ham bozorga, qahvaxonaga. Uyga ketayotganlardan aytishsa, boshini ikki yoqqa silkitib, yo'q, topolmadingiz, deyar pastda yurib ketardi. Deoni odamlar bilan ish yuzasidan bo'ladigan munosabatda ham haq va huquqqa qattiq rioya etar. Bir kishining ishini bajarayotganda chin dildan ishlardi.

Sentence-based timestamps:
[0.00s - 3.42s]: Deonining juda ko'p g'alati savollari bor edi.
[3.42s - 8.56s]: Ba'zan duch kelgan odamlardan qayerga ketyapsizlar, deb so'rar.
[8.56s - 11.10s]: Odamlar ham bozorga, qahvaxonaga.
[11.10s - 19.38s]: Uyga ketayotganlardan aytishsa, boshini ikki yoqqa silkitib, yo'q, topolmadingiz, deyar pastda yurib ketardi.
[20.16s - 26.80s]: Deoni odamlar bilan ish yuzasidan bo'ladigan munosabatda ham haq va huquqqa qattiq rioya etar.
[26.80s - 29.90s]: Bir kishining ishini bajarayotganda chin dildan ishlardi.


## OvozifyLabs/whisper-small-uz-v1

In [12]:
# Processor from the base checkpoint, weights from the OvozifyLabs fine-tune.
# NOTE(review): this assumes the fine-tuned checkpoint uses the same tokenizer
# and feature extractor as MODEL_NAME — confirm against the model card.
processor = WhisperProcessor.from_pretrained(
    MODEL_NAME,
    task="transcribe",
    language="uz",
)
model = WhisperForConditionalGeneration.from_pretrained("OvozifyLabs/whisper-small-uz-v1")

# Decoder prompt ids for Uzbek transcription, stored on the model config
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    task="transcribe",
    language="uz",
)

# The same language/task choice, stored on the generation config
model.generation_config.task = "transcribe"
model.generation_config.language = "uz"

# Fall back to the EOS id when the tokenizer has no pad token
# (avoids the attention mask warning)
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id

print(processor.feature_extractor)
model

WhisperFeatureExtractor {
  "chunk_length": 30,
  "dither": 0.0,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}



WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [13]:
# from scripts.whisper_utils import evaluate_by_dataset_with_trainer

# Evaluation-only settings: predictions come from generate() with one beam
training_args = Seq2SeqTrainingArguments(
    output_dir="./base_model_eval_temp",
    fp16=True,
    per_device_eval_batch_size=192,
    dataloader_num_workers=8,
    predict_with_generate=True,
    generation_num_beams=1,
    generation_max_length=225,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=processor.feature_extractor,
    data_collator=data_collator,
)

# Cleanup is intentionally left disabled in this cell:
# del model, trainer, processor
# gc.collect()

# Kept as the last expression so the returned metrics are displayed as well
evaluate_by_dataset_with_trainer(
    trainer, processor, dataset["test"], "test",
    should_always_normalize_apostrophes=True,
)

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize


DETAILED EVALUATION: TEST

OVERALL METRICS
--------------------------------------------------------------------------------
WER (normalized)               7.59%
CER (normalized)               2.50%
Sequence Similarity           94.95%
WER (raw)                     25.90%
CER (raw)                      5.09%
Seq Similarity (raw)          89.87%

METRICS BY DATASET

common_voice
--------------------------------------------------------------------------------
WER (normalized)               6.83%
CER (normalized)               1.68%
Sequence Similarity           98.76%
WER (raw)                     35.44%
CER (raw)                      6.07%
Seq Similarity (raw)          93.80%

it
--------------------------------------------------------------------------------
WER (normalized)              12.98%
CER (normalized)               5.47%
Sequence Similarity           84.96%
WER (raw)                     28.21%
CER (raw)                      8.03%
Seq Similarity (raw)          72.91%

news
---

{'wer': 0.0759,
 'cer': 0.025,
 'sequence_similarity': 0.9495,
 'wer_raw': 0.259,
 'cer_raw': 0.0509,
 'sequence_similarity_raw': 0.8987,
 'by_dataset': {'uzbek_voice': {'wer': 0.036,
   'cer': 0.0102,
   'sequence_similarity': 0.9934,
   'wer_raw': 0.3437,
   'cer_raw': 0.0514,
   'sequence_similarity_raw': 0.9459},
  'news': {'wer': 0.1821,
   'cer': 0.0711,
   'sequence_similarity': 0.8009,
   'wer_raw': 0.3652,
   'cer_raw': 0.1034,
   'sequence_similarity_raw': 0.6714},
  'common_voice': {'wer': 0.0683,
   'cer': 0.0168,
   'sequence_similarity': 0.9876,
   'wer_raw': 0.3544,
   'cer_raw': 0.0607,
   'sequence_similarity_raw': 0.938},
  'it': {'wer': 0.1298,
   'cer': 0.0547,
   'sequence_similarity': 0.8496,
   'wer_raw': 0.2821,
   'cer_raw': 0.0803,
   'sequence_similarity_raw': 0.7291}}}

In [14]:
# Transcribe the same sample clip with the OvozifyLabs checkpoint for comparison.
# The check is skipped silently when the clip is not present under DATASET_DIR.
test_audio = os.path.join(DATASET_DIR, "devona_sample.wav")
if os.path.exists(test_audio):
    test_model("OvozifyLabs/whisper-small-uz-v1", test_audio)


TESTING MODEL


Device set to use cuda:0


✓ Loaded model from OvozifyLabs/whisper-small-uz-v1
✓ Processing: ../datasets/devona_sample.wav


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.



Transcription:  devonining juda ko‘p g‘alati savollari bor edi. ba’zan duch kelgan odamlardan qayerga ketyapsizlar, deb so‘rar. odamlar ham bozorga, qahvaxonaga, uyga ketayotganlardan aytishsa, boshini ikki yoqqa silkitib, yo‘q, topolmadingiz, der, asta yurib ketardi. devona odamlar bilan ish yuzasidan bo‘ladigan munosabatda ham haq va huquqqa qattiq rioya etar. bir kishining ishini bajarayotganda chin dildan ishlardi.

Sentence-based timestamps:
[0.00s - 3.52s]: devonining juda ko‘p g‘alati savollari bor edi.
[3.52s - 8.60s]: ba’zan duch kelgan odamlardan qayerga ketyapsizlar, deb so‘rar.
[8.60s - 20.08s]: odamlar ham bozorga, qahvaxonaga, uyga ketayotganlardan aytishsa, boshini ikki yoqqa silkitib, yo‘q, topolmadingiz, der, asta yurib ketardi.
[20.08s - 26.90s]: devona odamlar bilan ish yuzasidan bo‘ladigan munosabatda ham haq va huquqqa qattiq rioya etar.
[26.90s - 29.90s]: bir kishining ishini bajarayotganda chin dildan ishlardi.


## islomov/rubaistt_v2_medium

In [3]:
df = pd.read_csv(DATASET_PATH, index_col="id", low_memory=False)

# Shuffle dataset (seeded, so the row order is reproducible)
df = df.sample(frac=1, random_state=SEED)

# Create full absolute path to audio:
# <DATASET_DIR>/<dataset>/sampled_audio/<path>
df["path"] = df.apply(
    lambda row: os.path.abspath(
        os.path.join(DATASET_DIR, row["dataset"], "sampled_audio", row["path"])
    ),
    axis=1
)

# Remove unnecessary columns; take an explicit copy so the assignment below
# writes to an independent frame
cols = ["path", "type", "dataset", "duration", "ref_normalized"]
df = df[cols].copy()

# Make sure ref_normalized is never NaN
# There was an exception thrown while processing dataset
# (`isna` alone is enough: `isnull` is an alias of it)
none_mask = df["ref_normalized"].isna()
df.loc[none_mask, "ref_normalized"] = ""

print("Dataset Statistics:")
print(f"  Total duration: {df['duration'].sum() / 3600:.2f} hours")
print(f"  Avg duration: {df['duration'].mean():.2f} seconds")
print("  By Dataset:")
print((df.groupby(["dataset", "type"])["duration"].sum() / 3600))
# Single quotes inside the f-strings: reusing the enclosing double quote is a
# SyntaxError before Python 3.12
print(f"Total training samples: {len(df[df['type'] == 'train']):,}")
print(f"Total validation samples: {len(df[df['type'] == 'validation']):,}")
print(f"Total test samples: {len(df[df['type'] == 'test']):,}")

df.to_csv(os.path.join(DATASET_DIR, "combined_dataset_part2.csv"), index_label="id")
df

Dataset Statistics:
  Total duration: 161.00 hours
  Avg duration: 7.31 seconds
  By Dataset:
dataset        type      
common_voice   test           2.751890
               validation     1.328719
feruza_speech  train          3.217020
it             test           0.737151
               train          7.940193
               validation     0.735187
news           test           1.726184
               train         51.826843
               validation     1.106672
uzbek_voice    test           7.160320
               train         75.527433
               validation     6.939970
Name: duration, dtype: float64
Total training samples: 66,545
Total validation samples: 5,728
Total test samples: 6,994


Unnamed: 0_level_0,path,type,dataset,duration,ref_normalized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
69899,/root/uzbek-automatic-speech-recognition/datas...,train,news,8.2418,Bir vaqtda sodir bo'ldi. Nega aynan shunday za...
64879,/root/uzbek-automatic-speech-recognition/datas...,train,news,5.7762,Rahmat berib o'tgan fikrlaringiz va ma'lumotla...
6928,/root/uzbek-automatic-speech-recognition/datas...,train,uzbek_voice,5.6160,"O'yinchoqni olamiz, bolani qorni bilan fitbol ..."
24018,/root/uzbek-automatic-speech-recognition/datas...,validation,uzbek_voice,6.0840,E'londa kriptovalyutadagi narx vaqtincha belgi...
36042,/root/uzbek-automatic-speech-recognition/datas...,train,uzbek_voice,6.1136,Mazkur anjumanda iqlim bo'yicha yangi xalqaro ...
...,...,...,...,...,...
20665,/root/uzbek-automatic-speech-recognition/datas...,train,uzbek_voice,7.1280,Organizmda yo'qotilgan unsurning o'rnini to'ld...
35905,/root/uzbek-automatic-speech-recognition/datas...,train,uzbek_voice,5.1304,"To'fon pasayganiga qaramay, xavf hali ham saql..."
24573,/root/uzbek-automatic-speech-recognition/datas...,validation,uzbek_voice,3.8160,Bu haqda hokimlik axborot xizmati xabar berdi.
44775,/root/uzbek-automatic-speech-recognition/datas...,train,uzbek_voice,5.0760,Shu yerda qishloq xo'jaligida amalga oshirilad...


In [4]:
from datasets import Audio, Dataset

# Build a test-only DatasetDict from the pandas frame in one pipeline:
#   1. keep only rows whose "type" is "test"
#   2. drop the bookkeeping columns "type" and "duration"
#   3. cast "path" to an Audio feature at 16 kHz
#   4. rename "path" to "audio"
ds_dict = (
    DatasetDict({"test": Dataset.from_pandas(df.loc[df["type"] == "test"])})
    .remove_columns(["type", "duration"])
    .cast_column("path", Audio(sampling_rate=16_000))
    .rename_column("path", "audio")
)

ds_dict

DatasetDict({
    test: Dataset({
        features: ['audio', 'dataset', 'ref_normalized', 'id'],
        num_rows: 6994
    })
})

In [5]:
# The processor (feature extractor + tokenizer) is taken from the base
# whisper-medium checkpoint; the weights come from the fine-tuned model.
processor = WhisperProcessor.from_pretrained("openai/whisper-medium", task="transcribe", language="uz")

# Fall back to EOS as the pad token when the tokenizer defines none
# (avoids the attention-mask warning).
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id

model = WhisperForConditionalGeneration.from_pretrained("islomov/rubaistt_v2_medium")

# Pin generation to Uzbek transcription, both through the generation config
# and through the decoder prompt ids stored on the model config.
model.generation_config.task = "transcribe"
model.generation_config.language = "uz"
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe", language="uz")

print(processor.feature_extractor)
model

WhisperFeatureExtractor {
  "chunk_length": 30,
  "dither": 0.0,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}



WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1024)
      (layers): ModuleList(
        (0-23): 24 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=Tru

In [6]:
import gc
import os
from datasets import load_from_disk, concatenate_datasets

# Where the whisper-medium preprocessed splits are cached on disk.
NEW_PROCESSED_DATASET_DIR = "../outputs/whisper_medium/processed_dataset"


def prepare_dataset_for_training(batch):
    """Turn a batch of raw examples into model-ready features and labels.

    Reads ``batch["audio"]``, ``batch["ref_normalized"]`` and
    ``batch["dataset"]`` and relies on the notebook-level ``processor``.

    Returns a dict with:
      - "input_features": the feature extractor's ``input_features`` for the batch
      - "labels": token ids of the reference transcripts (unpadded)
      - "dataset": the source-dataset column, passed through as evaluation metadata
    """
    clips = batch["audio"]
    waveforms = [clip["array"] for clip in clips]
    # The first clip's sampling rate is used for the whole batch.
    rate = clips[0]["sampling_rate"]

    # Log-Mel features, per the original author's note.
    extracted = processor.feature_extractor(waveforms, sampling_rate=rate)

    # Call the tokenizer directly; padding is deferred to the data collator.
    tokenized = processor.tokenizer(
        batch["ref_normalized"],
        padding=False,
        truncation=True,
    )

    return {
        "input_features": extracted.input_features,
        "labels": tokenized.input_ids,
        "dataset": batch["dataset"],
    }


def process_in_chunks(dataset_split, split_name, output_dir, chunk_size=10000):
    """Preprocess ``dataset_split`` chunk by chunk and return the concatenation.

    Each chunk of ``chunk_size`` rows is mapped through
    ``prepare_dataset_for_training`` and saved under
    ``<output_dir>/<split_name>_chunks/chunk_<i>``. A chunk whose directory
    already exists is skipped, so an interrupted run can be resumed. The
    original author's stated goal is to avoid an out-of-memory failure when a
    large split is finalized in one go.

    Args:
        dataset_split: Dataset to process; must support ``len``, ``select``
            and ``map``.
        split_name: Used to name the chunk directory.
        output_dir: Directory under which the chunk directory is created.
        chunk_size: Number of rows per chunk; must be positive.

    Returns:
        The result of ``concatenate_datasets`` over all chunks reloaded from disk.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    import shutil  # local import: only needed to clear stale temp directories

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    num_samples = len(dataset_split)
    num_chunks = (num_samples + chunk_size - 1) // chunk_size  # ceiling division
    chunk_dir = os.path.join(output_dir, f"{split_name}_chunks")
    os.makedirs(chunk_dir, exist_ok=True)

    for i in range(num_chunks):
        chunk_path = os.path.join(chunk_dir, f"chunk_{i}")

        if os.path.exists(chunk_path):
            print(f"  Chunk {i + 1}/{num_chunks} exists, skipping...")
            continue

        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, num_samples)
        print(f"  Processing chunk {i + 1}/{num_chunks} (samples {start_idx}-{end_idx})")

        chunk = dataset_split.select(range(start_idx, end_idx))

        processed_chunk = chunk.map(
            prepare_dataset_for_training,
            batched=True,
            batch_size=64,
            num_proc=4,
            keep_in_memory=False,
            writer_batch_size=1000,
        )

        # Save into a temporary sibling directory and rename it only after the
        # save has finished. The resume check above treats any existing
        # chunk_<i> directory as complete, so the final path must never hold a
        # half-written chunk from an interrupted run.
        tmp_path = chunk_path + ".tmp"
        shutil.rmtree(tmp_path, ignore_errors=True)  # leftover from an earlier crash
        processed_chunk.save_to_disk(tmp_path)
        os.replace(tmp_path, chunk_path)

        # Drop references before the next chunk to keep peak memory down.
        del chunk, processed_chunk
        gc.collect()

    # Reload every chunk from disk and concatenate. Per the original author's
    # note this is memory-mapped rather than loaded eagerly (not verified here).
    print(f"  Concatenating {num_chunks} chunks...")
    chunks = [
        load_from_disk(os.path.join(chunk_dir, f"chunk_{i}"))
        for i in range(num_chunks)
    ]

    final_dataset = concatenate_datasets(chunks)

    return final_dataset


# Splits this notebook needs. This single list drives the existence check,
# the preprocessing loop and the final load, so they cannot drift apart.
splits = ["test"]
all_exist = all(
    os.path.exists(os.path.join(NEW_PROCESSED_DATASET_DIR, split))
    for split in splits
)

if all_exist:
    # Fast path: every split is already cached on disk.
    print(f"--- Found existing processed dataset at {NEW_PROCESSED_DATASET_DIR} ---")
    print("Loading from disk to save time...")
    dataset = DatasetDict({
        split: load_from_disk(os.path.join(NEW_PROCESSED_DATASET_DIR, split))
        for split in splits
    })
    print("✓ Preprocessed dataset loaded from disk!")
else:
    print(f"--- Processed dataset not found or incomplete at {NEW_PROCESSED_DATASET_DIR} ---")
    print("Starting the heavy preprocessing (this will take a while)...")
    os.makedirs(NEW_PROCESSED_DATASET_DIR, exist_ok=True)

    # Iterate the same `splits` list that is checked above and loaded below
    # (previously a separate hard-coded ["test"] list).
    for split_name in splits:
        split_output_path = os.path.join(NEW_PROCESSED_DATASET_DIR, split_name)

        # Resume support: a split saved by an earlier run is not recomputed.
        if os.path.exists(split_output_path):
            print(f"✓ {split_name} already exists, skipping...")
            continue

        print(f"\nProcessing {split_name} split...")

        if split_name == "train":
            # Use chunked processing for large train split
            processed_split = process_in_chunks(
                ds_dict[split_name],
                split_name,
                NEW_PROCESSED_DATASET_DIR,
                chunk_size=10000  # ~10GB chunks
            )
        else:
            # Regular processing for smaller splits
            processed_split = ds_dict[split_name].map(
                prepare_dataset_for_training,
                batched=True,
                batch_size=32,
                num_proc=8,
                keep_in_memory=False,
                writer_batch_size=1000,
            )

        processed_split.save_to_disk(split_output_path)
        print(f"✓ {split_name} saved: {len(processed_split)} samples")

        # Drop the in-memory handle before moving on to the next split.
        del processed_split
        gc.collect()

    # Load the complete dataset back from disk.
    dataset = DatasetDict({
        split: load_from_disk(os.path.join(NEW_PROCESSED_DATASET_DIR, split))
        for split in splits
    })
    print("\n✓ All splits processed and saved!")

--- Found existing processed dataset at ../outputs/whisper_medium/processed_dataset ---
Loading from disk to save time...


Loading dataset from disk:   0%|          | 0/17 [00:00<?, ?it/s]

✓ Preprocessed dataset loaded from disk!


In [19]:
# Evaluation-only trainer settings: fp16, single-beam generation with
# generation_max_length=225, eval batch size 128.
training_args = Seq2SeqTrainingArguments(
    output_dir="./base_model_eval_temp",
    fp16=True,
    dataloader_num_workers=8,
    per_device_eval_batch_size=128,
    predict_with_generate=True,
    generation_num_beams=1,
    generation_max_length=225,
)

# `data_collator` is defined outside this cell.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=processor.feature_extractor,
    data_collator=data_collator,
)

# Overall and per-dataset metrics on the preprocessed test split.
evaluate_by_dataset_with_trainer(
    trainer, processor, dataset["test"], "test",
    should_always_normalize_apostrophes=True,
)

# Release the evaluated model before the next one is loaded.
del model
del trainer
del processor
gc.collect()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize


DETAILED EVALUATION: TEST

OVERALL METRICS
--------------------------------------------------------------------------------
WER (normalized)               7.33%
CER (normalized)               1.76%
Sequence Similarity           96.78%
WER (raw)                     37.40%
CER (raw)                      8.16%
Seq Similarity (raw)          89.26%

METRICS BY DATASET

common_voice
--------------------------------------------------------------------------------
WER (normalized)               6.33%
CER (normalized)               1.16%
Sequence Similarity           99.14%
WER (raw)                     31.73%
CER (raw)                      4.84%
Seq Similarity (raw)          96.29%

it
--------------------------------------------------------------------------------
WER (normalized)               9.33%
CER (normalized)               2.84%
Sequence Similarity           91.24%
WER (raw)                     26.07%
CER (raw)                      6.26%
Seq Similarity (raw)          76.07%

news
---

8

In [20]:
# Qualitative spot check: transcribe one sample clip with this model.
test_audio = os.path.join(DATASET_DIR, "devona_sample.wav")
if os.path.exists(test_audio):
    test_model("islomov/rubaistt_v2_medium", test_audio)
else:
    # Say so explicitly; otherwise the cell finishes with no output and the
    # reader cannot tell that the check never ran.
    print(f"Sample audio not found at {test_audio}; skipping transcription test.")


TESTING MODEL


Device set to use cuda:0


✓ Loaded model from islomov/rubaistt_v2_medium
✓ Processing: ../datasets/devona_sample.wav

Transcription: devonaning juda ko'p g'alati savollari bor edi. ba'zan duch kelgan odamlardan qayerga ketyapsizlar? deb so'rar. odamlar ham bozorga, qahvaxonaga, uyga ketayotganlardan aytishsa boshini ikki yoqqa silkitib, yo'q, topolmadingiz, der, vasta yurib ketardi.devona odamlar bilan ish yuzasidan bo'ladigan munosabatda ham haq va huquqqa qattiq rioya etar. bir kishining ishini bajarayotganda chin dildan ishlardi.

Sentence-based timestamps:
[0.00s - 3.58s]: devonaning juda ko'p g'alati savollari bor edi.
[3.58s - 7.68s]: ba'zan duch kelgan odamlardan qayerga ketyapsizlar?
[7.68s - 8.54s]: deb so'rar.
[8.54s - 20.04s]: odamlar ham bozorga, qahvaxonaga, uyga ketayotganlardan aytishsa boshini ikki yoqqa silkitib, yo'q, topolmadingiz, der, vasta yurib ketardi.
[19.52s - 26.78s]: devona odamlar bilan ish yuzasidan bo'ladigan munosabatda ham haq va huquqqa qattiq rioya etar.
[26.78s - 29.98s]: bir

## Kotib/uzbek_stt_v1 (Whisper Medium)

In [21]:
# The processor (feature extractor + tokenizer) is taken from the base
# whisper-medium checkpoint; the weights come from the fine-tuned model.
processor = WhisperProcessor.from_pretrained("openai/whisper-medium", task="transcribe", language="uz")

# Fall back to EOS as the pad token when the tokenizer defines none
# (avoids the attention-mask warning).
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id

model = WhisperForConditionalGeneration.from_pretrained("Kotib/uzbek_stt_v1")

# Pin generation to Uzbek transcription, both through the generation config
# and through the decoder prompt ids stored on the model config.
model.generation_config.task = "transcribe"
model.generation_config.language = "uz"
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe", language="uz")

# Evaluation-only trainer settings: fp16, single-beam generation with
# generation_max_length=225, eval batch size 128.
training_args = Seq2SeqTrainingArguments(
    output_dir="./base_model_eval_temp",
    fp16=True,
    dataloader_num_workers=8,
    per_device_eval_batch_size=128,
    predict_with_generate=True,
    generation_num_beams=1,
    generation_max_length=225,
)

# `data_collator` is defined outside this cell.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=processor.feature_extractor,
    data_collator=data_collator,
)

# Overall and per-dataset metrics on the preprocessed test split.
evaluate_by_dataset_with_trainer(
    trainer, processor, dataset["test"], "test",
    should_always_normalize_apostrophes=True,
)

# Release the evaluated model before the next one is loaded.
del model
del trainer
del processor
gc.collect()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize


DETAILED EVALUATION: TEST

OVERALL METRICS
--------------------------------------------------------------------------------
WER (normalized)               5.62%
CER (normalized)               1.37%
Sequence Similarity           96.94%
WER (raw)                     28.02%
CER (raw)                      6.77%
Seq Similarity (raw)          89.89%

METRICS BY DATASET

common_voice
--------------------------------------------------------------------------------
WER (normalized)               5.28%
CER (normalized)               0.99%
Sequence Similarity           99.28%
WER (raw)                     23.28%
CER (raw)                      3.66%
Seq Similarity (raw)          96.91%

it
--------------------------------------------------------------------------------
WER (normalized)               8.80%
CER (normalized)               2.65%
Sequence Similarity           91.88%
WER (raw)                     25.22%
CER (raw)                      6.04%
Seq Similarity (raw)          76.91%

news
---

11222

In [22]:
# Qualitative spot check: transcribe one sample clip with this model.
test_audio = os.path.join(DATASET_DIR, "devona_sample.wav")
if os.path.exists(test_audio):
    test_model("Kotib/uzbek_stt_v1", test_audio)
else:
    # Say so explicitly; otherwise the cell finishes with no output and the
    # reader cannot tell that the check never ran.
    print(f"Sample audio not found at {test_audio}; skipping transcription test.")


TESTING MODEL


Device set to use cuda:0


✓ Loaded model from Kotib/uzbek_stt_v1
✓ Processing: ../datasets/devona_sample.wav


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.



Transcription: devonaning juda ko'p g'alati savollari bor edi. ba'zan duch kelgan odamlardan qayerga ketyapsizlar? deb so'rar. odamlar ham bozorga, qahvaxonaga, uyga ketayotganlarni aytishsa boshini ikki yoqqa silkitib, yo'q, topa olmadingiz, der, asta yurib ketardi. devona odamlar bilan ish yuzasidan bo'ladigan munosabatda ham haq va huquqqa qattiq rioya etar. bir kishining ishini bajarayotganda chin dildan ishlardi.

Sentence-based timestamps:
[0.00s - 3.62s]: devonaning juda ko'p g'alati savollari bor edi.
[3.62s - 7.68s]: ba'zan duch kelgan odamlardan qayerga ketyapsizlar?
[7.68s - 8.56s]: deb so'rar.
[8.56s - 20.20s]: odamlar ham bozorga, qahvaxonaga, uyga ketayotganlarni aytishsa boshini ikki yoqqa silkitib, yo'q, topa olmadingiz, der, asta yurib ketardi.
[20.20s - 26.88s]: devona odamlar bilan ish yuzasidan bo'ladigan munosabatda ham haq va huquqqa qattiq rioya etar.
[26.88s - 29.90s]: bir kishining ishini bajarayotganda chin dildan ishlardi.


## aisha-org/Whisper-Uzbek

In [11]:
# from scripts.whisper_utils import evaluate_by_dataset_with_trainer

# The processor (feature extractor + tokenizer) is taken from the base
# whisper-medium checkpoint; the weights come from the fine-tuned model.
processor = WhisperProcessor.from_pretrained("openai/whisper-medium", task="transcribe", language="uz")

# Fall back to EOS as the pad token when the tokenizer defines none
# (avoids the attention-mask warning).
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id

model = WhisperForConditionalGeneration.from_pretrained("aisha-org/Whisper-Uzbek")

# Pin generation to Uzbek transcription, both through the generation config
# and through the decoder prompt ids stored on the model config.
model.generation_config.task = "transcribe"
model.generation_config.language = "uz"
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe", language="uz")

# Evaluation-only trainer settings: fp16, single-beam generation with
# generation_max_length=225, eval batch size 128.
training_args = Seq2SeqTrainingArguments(
    output_dir="./base_model_eval_temp",
    fp16=True,
    dataloader_num_workers=8,
    per_device_eval_batch_size=128,
    predict_with_generate=True,
    generation_num_beams=1,
    generation_max_length=225,
)

# `data_collator` is defined outside this cell.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=processor.feature_extractor,
    data_collator=data_collator,
)

# Overall and per-dataset metrics on the preprocessed test split.
evaluate_by_dataset_with_trainer(
    trainer, processor, dataset["test"], "test",
    should_always_normalize_apostrophes=True,
)

# Release the evaluated model before the next one is loaded.
del model
del trainer
del processor
gc.collect()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize


DETAILED EVALUATION: TEST

OVERALL METRICS
--------------------------------------------------------------------------------
WER (normalized)              17.28%
CER (normalized)               5.49%
Sequence Similarity           90.40%
WER (raw)                     29.96%
CER (raw)                      9.82%
Seq Similarity (raw)          85.89%

METRICS BY DATASET

common_voice
--------------------------------------------------------------------------------
WER (normalized)               8.76%
CER (normalized)               1.93%
Sequence Similarity           98.59%
WER (raw)                     28.40%
CER (raw)                      5.14%
Seq Similarity (raw)          95.82%

it
--------------------------------------------------------------------------------
WER (normalized)              39.98%
CER (normalized)              15.28%
Sequence Similarity           61.59%
WER (raw)                     58.48%
CER (raw)                     20.16%
Seq Similarity (raw)          48.47%

news
---

11230

In [14]:
# Qualitative spot check: transcribe one sample clip with this model.
test_audio = os.path.join(DATASET_DIR, "devona_sample.wav")
if os.path.exists(test_audio):
    test_model("aisha-org/Whisper-Uzbek", test_audio)
else:
    # Say so explicitly; otherwise the cell finishes with no output and the
    # reader cannot tell that the check never ran.
    print(f"Sample audio not found at {test_audio}; skipping transcription test.")


TESTING MODEL


Device set to use cuda:0


✓ Loaded model from aisha-org/Whisper-Uzbek
✓ Processing: ../datasets/devona_sample.wav


`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.



Transcription: Devonaning juda koʻp gʻalati savollari bor edi. Baʼzan duch kelgan odamlardan «qaerga ketayapsizlar?» deb soʻrar, odamlar ham bozorga, qahvaxonaga, uyga ketayotganlaridan aytishsa, boshini ikki yoqqa silkitib «yoʻq, topolmadingiz!», der, ostida yurib ketardi.Devona odamlar bilan ish yuzasidan bo‘ladigan munosabatda ham haq va huquqqa qattiq rioya etar, bir kishining ishini bajarayotganda chin dildan ishlardi.

Sentence-based timestamps:
[0.00s - 3.62s]: Devonaning juda koʻp gʻalati savollari bor edi.
[3.62s - 20.06s]: Baʼzan duch kelgan odamlardan «qaerga ketayapsizlar ?» deb soʻrar, odamlar ham bozorga, qahvaxonaga, uyga ketayotganlaridan aytishsa, boshini ikki yoqqa silkitib «yoʻq, topolmadingiz !», der, ostida yurib ketardi.
[19.64s - 29.96s]: Devona odamlar bilan ish yuzasidan bo‘ladigan munosabatda ham haq va huquqqa qattiq rioya etar, bir kishining ishini bajarayotganda chin dildan ishlardi.


## AbdulxoliqMirzaev/whisper-uz-medium

In [None]:
# Drop any model objects left over from the previous section before loading
# the next model. The preceding evaluation cell already deletes these names,
# so a plain `del` raises NameError on a top-to-bottom run; pop() tolerates
# names that are already gone.
for _name in ("model", "trainer", "processor"):
    globals().pop(_name, None)
del _name
gc.collect()

In [15]:
# The processor (feature extractor + tokenizer) is taken from the base
# whisper-medium checkpoint; the weights come from the fine-tuned model.
processor = WhisperProcessor.from_pretrained("openai/whisper-medium", task="transcribe", language="uz")

# Fall back to EOS as the pad token when the tokenizer defines none
# (avoids the attention-mask warning).
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id

model = WhisperForConditionalGeneration.from_pretrained("AbdulxoliqMirzaev/whisper-uz-medium")

# Pin generation to Uzbek transcription, both through the generation config
# and through the decoder prompt ids stored on the model config.
model.generation_config.task = "transcribe"
model.generation_config.language = "uz"
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe", language="uz")

# Evaluation-only trainer settings: fp16, single-beam generation with
# generation_max_length=225. Note the eval batch size here is 192.
training_args = Seq2SeqTrainingArguments(
    output_dir="./base_model_eval_temp",
    fp16=True,
    dataloader_num_workers=8,
    per_device_eval_batch_size=192,
    predict_with_generate=True,
    generation_num_beams=1,
    generation_max_length=225,
)

# `data_collator` is defined outside this cell.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=processor.feature_extractor,
    data_collator=data_collator,
)

# Overall and per-dataset metrics on the preprocessed test split.
evaluate_by_dataset_with_trainer(
    trainer, processor, dataset["test"], "test",
    should_always_normalize_apostrophes=True,
)

# Release the evaluated model once evaluation is done.
del model
del trainer
del processor
gc.collect()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenize


DETAILED EVALUATION: TEST

OVERALL METRICS
--------------------------------------------------------------------------------
WER (normalized)              23.36%
CER (normalized)               6.53%
Sequence Similarity           88.90%
WER (raw)                     30.14%
CER (raw)                      9.82%
Seq Similarity (raw)          85.38%

METRICS BY DATASET

common_voice
--------------------------------------------------------------------------------
WER (normalized)              17.03%
CER (normalized)               3.51%
Sequence Similarity           97.38%
WER (raw)                     22.72%
CER (raw)                      4.60%
Seq Similarity (raw)          96.59%

it
--------------------------------------------------------------------------------
WER (normalized)              49.46%
CER (normalized)              17.87%
Sequence Similarity           57.87%
WER (raw)                     59.63%
CER (raw)                     20.48%
Seq Similarity (raw)          46.44%

news
---

11218

In [16]:
# Qualitative spot check: transcribe one sample clip with this model.
test_audio = os.path.join(DATASET_DIR, "devona_sample.wav")
if os.path.exists(test_audio):
    test_model("AbdulxoliqMirzaev/whisper-uz-medium", test_audio)
else:
    # Say so explicitly; otherwise the cell finishes with no output and the
    # reader cannot tell that the check never ran.
    print(f"Sample audio not found at {test_audio}; skipping transcription test.")

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 310b009a-e6f8-4a67-b979-066b7a6e1f9b)')' thrown while requesting HEAD https://huggingface.co/openai/whisper-medium/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: d60d0454-8251-4114-8451-0648f96bb7fb)')' thrown while requesting HEAD https://huggingface.co/openai/whisper-medium/resolve/main/processor_config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 50045734-e638-48db-9566-a3613c326e3a)')' thrown while requesting HEAD https://huggingface.co/openai/whisper-medium/resolve/main/chat_template.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (rea


TESTING MODEL
✓ Loaded model object
✓ Processing: ../datasets/devona_sample.wav


`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.
`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTok


Transcription: Devonaning juda ko'p g'alati savollari bor edi, ba'zan duch kelgan odamlardan qayerga ketayapsizlar, deb so'rar, odamlar ham bozorga, qahvaxonaga, uyga ketayotganlarda aytishsa boshini ikki yoqqasil ketib, yo'q, topolmadingiz, der, vasta yurib ketardi.Devona odamlar bilan ish yuzasidan bo'latgan munosabatda ham haq va huquqqa qattiq rioya etar, bir kishining ishini bajarayotgandi chindildan ishlardi.

Sentence-based timestamps:
[0.00s - 19.48s]: Devonaning juda ko 'p g 'alati savollari bor edi, ba 'zan duch kelgan odamlardan qayerga ketayapsizlar, deb so 'rar, odamlar ham bozorga, qahvaxonaga, uyga ketayotganlarda aytishsa boshini ikki yoqqasil ketib, yo 'q, topolmadingiz, der, vasta yurib ketardi.
[19.52s - 29.98s]: Devona odamlar bilan ish yuzasidan bo 'latgan munosabatda ham haq va huquqqa qattiq rioya etar, bir kishining ishini bajarayotgandi chindildan ishlardi.
