In [None]:
# tutorial for finetuning whisper
# https://huggingface.co/blog/fine-tune-whisper

In [1]:
import sys

import torch

# Report whether PyTorch's MPS backend is usable from this kernel.
mps_ready = torch.backends.mps.is_available()
print("MPS:", mps_ready)

# Show which interpreter the kernel runs, to confirm the expected environment is active.
print(sys.executable)

MPS: True
/Users/zuzamakowska/Documents/Africa/.venv/bin/python


In [2]:
# huggingface-cli whoami <- to check if you're logged in to hugging face 

## Download Common Voice dataset (Swahili)

In [3]:
# Switch the `datasets` library to INFO-level logging so cache hits and loading
# steps are reported in the cell outputs.
from datasets.utils.logging import set_verbosity_info
set_verbosity_info()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# api key: echo $MDC_API_KEY

from datasets import load_dataset, Features, Value, Audio

# Every column of the TSV manifests is read as a plain string.
features = Features(
    {column: Value("string") for column in ("path", "sentence", "eng", "sw")}
)

# One tab-separated manifest per split; the paths are relative to the working directory.
tsv_by_split = {
    "train": "../../data/1_02_2026_tcc_cv/train_dataset.tsv",
    "validation": "../../data/1_02_2026_tcc_cv/valid_dataset.tsv",
    "test": "../../data/1_02_2026_tcc_cv/test_dataset.tsv",
}

# The generic "csv" loader handles TSV once the delimiter is set to a tab.
ds = load_dataset("csv", data_files=tsv_by_split, delimiter="\t", features=features)


Using custom data configuration default-cf71f9222498f735
Found cached dataset csv (/Users/zuzamakowska/.cache/huggingface/datasets/csv/default-cf71f9222498f735/0.0.0/d41f37fffd4cc4dfd07485b661c45b9863c2d0a8b0a28faa84befecfef33631a)


In [5]:
# Peek at the first training row and the column schema to confirm the TSV parsed as expected.
train_split = ds["train"]
print(train_split[0])
print(train_split.features)

{'path': 'clips/Childhood 1_tcc_dato1239_tsim1256_IGS0229_2015-12-25_MM_10_0032832_0035221.mp3', 'sentence': 'ani qay a gahed gaw dugmosht', 'eng': None, 'sw': 'zamani ??? wangu nilisafiri kwenda dug mosht'}
{'path': Value('string'), 'sentence': Value('string'), 'eng': Value('string'), 'sw': Value('string')}


In [6]:
import os

# Directory holding the TSV manifests and the clip files they reference. This is the
# same relative location the manifests are loaded from, resolved to an absolute path
# so the notebook does not depend on one user's home directory.
AUDIO_ROOT = os.path.abspath("../../data/1_02_2026_tcc_cv")


def fix_path(batch):
    """Turn the manifest-relative clip path of one example into an absolute path.

    Args:
        batch: A single example (dict) whose "path" entry is the clip location
            as written in the TSV manifest, e.g. "clips/<name>.mp3".

    Returns:
        The same dict with "path" rewritten to an absolute path under AUDIO_ROOT.
        A path that is already absolute is left untouched, so re-running the
        mapping cell does not prepend the root a second time.
    """
    clip_path = batch["path"]
    if not os.path.isabs(clip_path):
        batch["path"] = os.path.join(AUDIO_ROOT, clip_path)
    return batch

# Rewrite the "path" column of every split with fix_path, then confirm the schema
# is unchanged (the column is still a plain string at this point).
ds = ds.map(fix_path)
print(ds["train"].features)

Loading cached processed dataset at /Users/zuzamakowska/.cache/huggingface/datasets/csv/default-cf71f9222498f735/0.0.0/d41f37fffd4cc4dfd07485b661c45b9863c2d0a8b0a28faa84befecfef33631a/cache-4f686eb79ad1bd0e_*_of_00001.arrow
Loading cached processed dataset at /Users/zuzamakowska/.cache/huggingface/datasets/csv/default-cf71f9222498f735/0.0.0/d41f37fffd4cc4dfd07485b661c45b9863c2d0a8b0a28faa84befecfef33631a/cache-e184bcc6d17355aa_*_of_00001.arrow
Loading cached processed dataset at /Users/zuzamakowska/.cache/huggingface/datasets/csv/default-cf71f9222498f735/0.0.0/d41f37fffd4cc4dfd07485b661c45b9863c2d0a8b0a28faa84befecfef33631a/cache-9f1b9a52e38c8639_*_of_00001.arrow


{'path': Value('string'), 'sentence': Value('string'), 'eng': Value('string'), 'sw': Value('string')}


In [7]:
# Declare the "path" column as an Audio feature with a 16 kHz sampling rate, so
# indexing a row yields a decodable audio object instead of a string.
from datasets import Audio
ds = ds.cast_column("path", Audio(sampling_rate=16000))

In [8]:

# Verify the cast: "path" should now be listed as an Audio feature, and the first
# row should hold an audio decoder object rather than a file path string.
print(ds["train"].features)
print(ds["train"][0])


{'path': Audio(sampling_rate=16000, decode=True, num_channels=None, stream_index=None), 'sentence': Value('string'), 'eng': Value('string'), 'sw': Value('string')}
{'path': <datasets.features._torchcodec.AudioDecoder object at 0x15128ed10>, 'sentence': 'ani qay a gahed gaw dugmosht', 'eng': None, 'sw': 'zamani ??? wangu nilisafiri kwenda dug mosht'}


In [9]:
# Drop the 'eng' and 'sw' columns; only the audio and the 'sentence' transcript are used below.
ds = ds.remove_columns(['eng', 'sw'])

In [10]:
# Display the splits with their remaining columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['path', 'sentence'],
        num_rows: 11855
    })
    validation: Dataset({
        features: ['path', 'sentence'],
        num_rows: 3202
    })
    test: Dataset({
        features: ['path', 'sentence'],
        num_rows: 3114
    })
})

In [11]:
# Confirm which columns are left in the training split and what their feature types are.
print(ds['train'].column_names)
print(ds['train'].features)

['path', 'sentence']
{'path': Audio(sampling_rate=16000, decode=True, num_channels=None, stream_index=None), 'sentence': Value('string')}


In [12]:
# Show the first training example now that only 'path' and 'sentence' remain.
print(ds["train"][0])

{'path': <datasets.features._torchcodec.AudioDecoder object at 0x174fa3750>, 'sentence': 'ani qay a gahed gaw dugmosht'}


In [13]:
# Return rows in numpy format and expose the audio column under the name "audio",
# which is the key the preprocessing function reads.
ds = ds.with_format("numpy").rename_column("path", "audio")

# Decode the first training clip and print its first ten samples as a sanity check.
sample = ds["train"][0]
waveform = sample["audio"]["array"]
print(waveform[:10])

[-1.0747810e-04 -5.9840626e-05 -2.3686751e-05  2.7955630e-05
  1.6578406e-05 -6.1724837e-05 -3.4597873e-05 -5.0002422e-05
 -2.9865387e-05 -3.0896772e-05]


In [14]:
# Show the first training example after switching to numpy format and renaming the column to 'audio'.
print(ds['train'][0])

{'audio': <datasets.features._torchcodec.AudioDecoder object at 0x174fb1090>, 'sentence': np.str_('ani qay a gahed gaw dugmosht')}


## Feature Extraction

In [15]:
from transformers import WhisperTokenizer

# Load the tokenizer from the same checkpoint ('openai/whisper-tiny') that this notebook
# uses for the WhisperProcessor and for the model being fine-tuned, so every component
# comes from one checkpoint instead of mixing 'whisper-small' and 'whisper-tiny'.
# The language/task settings make the tokenizer add the matching special prefix tokens
# to every encoded transcript.
# NOTE(review): `padding` is normally a call-time argument; confirm 'longest' has any effect here.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="Swahili", task="transcribe", padding='longest')


In [16]:
from transformers import WhisperFeatureExtractor

# Load the feature extractor from the same checkpoint ('openai/whisper-tiny') that this
# notebook uses for the WhisperProcessor and for the model being fine-tuned, so the
# audio features are produced with the configuration that belongs to that model.
feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-tiny')

In [17]:
input_str = ds['train'][0]['sentence']
# Transcript of the first training example; the following cells tokenize and decode it
# to sanity-check the tokenizer.
input_str

np.str_('ani qay a gahed gaw dugmosht')

In [18]:
# Encode the sample transcript into token ids (including the special prefix/suffix tokens).
labels = tokenizer(input_str).input_ids
labels

[50258,
 50318,
 50359,
 50363,
 3782,
 9505,
 320,
 257,
 290,
 545,
 292,
 290,
 1607,
 22954,
 3415,
 357,
 50257]

In [19]:
# Decode the ids back to text while keeping special tokens, to see exactly what was added around the transcript.
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_with_special

'<|startoftranscript|><|sw|><|transcribe|><|notimestamps|>ani qay a gahed gaw dugmosht<|endoftext|>'

In [20]:
# Decode again with special tokens stripped; the result should match the original transcript.
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
decoded_str

'ani qay a gahed gaw dugmosht'

In [21]:
# Show the full tokenizer output (token ids plus attention mask) for the sample transcript.
raw_tokens = tokenizer(input_str)
raw_tokens

{'input_ids': [50258, 50318, 50359, 50363, 3782, 9505, 320, 257, 290, 545, 292, 290, 1607, 22954, 3415, 357, 50257], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
# Map each id to its token string to see how the transcript is split into sub-word pieces.
decoded_tokens = tokenizer.convert_ids_to_tokens(labels)
print(decoded_tokens)

['<|startoftranscript|>', '<|sw|>', '<|transcribe|>', '<|notimestamps|>', 'ani', 'Ġq', 'ay', 'Ġa', 'Ġg', 'ah', 'ed', 'Ġg', 'aw', 'Ġdug', 'mos', 'ht', '<|endoftext|>']


### WhisperProcessor

In [23]:
from transformers import WhisperProcessor

# The processor exposes a `feature_extractor` and a `tokenizer`; the data collator
# defined later in this notebook uses both of them to pad batches.
processor = WhisperProcessor.from_pretrained('openai/whisper-tiny', language='Swahili', task='transcribe')

In [24]:
# Re-check the first training example before preprocessing.
print(ds["train"][0])

{'audio': <datasets.features._torchcodec.AudioDecoder object at 0x34ef64790>, 'sentence': np.str_('ani qay a gahed gaw dugmosht')}


In [25]:
from datasets import Audio

# Declare the "audio" column as an Audio feature with a 16 kHz sampling rate.
# NOTE(review): this column was already cast with Audio(sampling_rate=16000) while it
# was still named "path", so this second cast looks redundant — confirm before removing.
ds = ds.cast_column('audio', Audio(sampling_rate=16000))

In [26]:
# Confirm the first training example still decodes after the cast above.
print(ds["train"][0])

{'audio': <datasets.features._torchcodec.AudioDecoder object at 0x174fb2250>, 'sentence': np.str_('ani qay a gahed gaw dugmosht')}


In [27]:
def prepare_dataset(batch):
    """Add model inputs and target token ids to a single example.

    Args:
        batch: One example with an "audio" entry (providing "array" and
            "sampling_rate") and a "sentence" transcript.

    Returns:
        The same example, extended with:
          * "input_features": first item of the feature extractor output for the clip,
          * "labels": token ids of the transcript.
    """
    clip = batch["audio"]

    # The extractor returns a batch of size one; keep only its single item.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Encode the reference transcript into the label ids used for the loss.
    encoded_text = tokenizer(batch["sentence"])
    batch["labels"] = encoded_text.input_ids

    return batch

In [28]:
# Run prepare_dataset over every split using 4 worker processes.
# NOTE(review): no `remove_columns` is passed, so the original 'audio' and 'sentence'
# columns stay in the result next to 'input_features' and 'labels'.
preprocessed_ds = ds.map(prepare_dataset, num_proc=4)

Loading cached processed dataset at /Users/zuzamakowska/.cache/huggingface/datasets/csv/default-cf71f9222498f735/0.0.0/d41f37fffd4cc4dfd07485b661c45b9863c2d0a8b0a28faa84befecfef33631a/cache-dbe3ecca031c60b5_*_of_00004.arrow
Concatenating 4 shards
Loading cached processed dataset at /Users/zuzamakowska/.cache/huggingface/datasets/csv/default-cf71f9222498f735/0.0.0/d41f37fffd4cc4dfd07485b661c45b9863c2d0a8b0a28faa84befecfef33631a/cache-4618e8c8a3a90a0f_*_of_00004.arrow
Concatenating 4 shards
Loading cached processed dataset at /Users/zuzamakowska/.cache/huggingface/datasets/csv/default-cf71f9222498f735/0.0.0/d41f37fffd4cc4dfd07485b661c45b9863c2d0a8b0a28faa84befecfef33631a/cache-8584042ffe5163b3_*_of_00004.arrow
Concatenating 4 shards


In [29]:
# Display the preprocessed splits to confirm the new columns and row counts.
preprocessed_ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 11855
    })
    validation: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 3202
    })
    test: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 3114
    })
})

In [30]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained 'openai/whisper-tiny' checkpoint; this is the model passed to the trainer below.
model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-tiny')

Loading weights: 100%|██████████| 167/167 [00:00<00:00, 1824.47it/s, Materializing param=model.encoder.layers.3.self_attn_layer_norm.weight]  


In [31]:
# Settings used when the model generates text (e.g. during evaluation with generation).
generation_settings = model.generation_config
generation_settings.language = "swahili"
generation_settings.task = "transcribe"
# Clear any forced decoder ids stored with the checkpoint; language/task above are used instead.
generation_settings.forced_decoder_ids = None

# NOTE(review): presumably disabled because gradient checkpointing is enabled in the
# training arguments — confirm.
model.config.use_cache = False

In [32]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded batch.

    Audio features and label ids are padded separately, each by the matching
    component of ``processor``. Padded label positions are set to -100, and a
    leading ``decoder_start_token_id`` is stripped when every row begins with it.
    """

    # Object exposing `feature_extractor.pad` and `tokenizer.pad`.
    processor: Any
    # Token id removed from the front of the labels when all rows start with it.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio inputs and text labels have different lengths and padding rules,
        # so each goes through its own padding routine.
        audio_items = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Pad the tokenized transcripts to the longest sequence in the batch.
        token_items = [{"input_ids": example["labels"]} for example in features]
        padded_tokens = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Positions the attention mask marks as padding become -100 so the loss ignores them.
        is_padding = padded_tokens.attention_mask.ne(1)
        labels = padded_tokens["input_ids"].masked_fill(is_padding, -100)

        # If tokenization already put the start token at the front of every row,
        # drop it here because it is appended again later anyway.
        first_column = labels[:, 0]
        if bool((first_column == self.decoder_start_token_id).all()):
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


In [33]:
# Build the collator with the notebook's processor; the start token id comes from the
# model config so the collator can strip it from labels when every row begins with it.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [34]:
import evaluate

# Load the word-error-rate and character-error-rate metrics used by compute_metrics.
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    """Compute word and character error rates (in percent) for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored positions).

    Returns:
        Dict with "wer", "cer" and "combined" (the equal-weight mean of the two).
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    pad_id = tokenizer.pad_token_id

    # -100 is a masking value, not a real token id, so it must be replaced by the pad id
    # before decoding. It is handled in the predictions as well as the labels because
    # padded prediction arrays can carry it too. `np.where` returns new arrays, so the
    # arrays owned by the caller are left unmodified.
    raw_preds = np.asarray(pred.predictions)
    raw_labels = np.asarray(pred.label_ids)
    pred_ids = np.where(raw_preds != -100, raw_preds, pad_id)
    label_ids = np.where(raw_labels != -100, raw_labels, pad_id)

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
    cer = 100 * cer_metric.compute(predictions=pred_str, references=label_str)

    return {
        "wer": wer,
        "cer": cer,
        "combined": 0.5 * wer + 0.5 * cer,
    }

In [None]:
import torch
from transformers import Seq2SeqTrainingArguments

# fp16 mixed precision is enabled only when a CUDA GPU is present. With fp16=True on
# the MPS backend this notebook runs on, the recorded training run logged
# "NaN or Inf found in input tensor." at every logging step, so non-CUDA devices
# train in full precision instead.
use_fp16 = torch.cuda.is_available()

training_args = Seq2SeqTrainingArguments(
    output_dir="../../models/whisper-tiny-asmjeeg-2026",
    # 2 examples per step x 8 accumulation steps = 16 examples per optimizer update.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5e-6,
    warmup_steps=200,
    max_steps=1000,
    gradient_checkpointing=True,
    fp16=use_fp16,
    eval_strategy="steps",
    per_device_eval_batch_size=2,
    # Evaluate by generating transcripts so WER/CER can be computed on decoded text.
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpoints and evaluations share the same 500-step cadence, which
    # load_best_model_at_end relies on to pick a saved checkpoint.
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    # Lower WER is better, so the best checkpoint is the one with the smallest value.
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    logging_strategy="steps",
)


In [36]:
from transformers import Seq2SeqTrainer

# Evaluate on the validation split during training. The training arguments select the
# best checkpoint by WER on `eval_dataset`, so passing the test split here would tune
# the model choice on the data meant for the final, unbiased evaluation.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # tokenizer=processor.feature_extractor,
)


In [37]:
# Start fine-tuning; evaluation and checkpointing follow the schedule set in training_args.
trainer.train()

  super().__init__(loader)


Step,Training Loss,Validation Loss


NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTo

KeyboardInterrupt: 