In [1]:
# tutorial for finetuning whisper
# https://huggingface.co/blog/fine-tune-whisper

In [1]:
import torch
print("MPS:", torch.backends.mps.is_available())

import sys
print(sys.executable)

MPS: True
/Users/zuzamakowska/Documents/Africa/Project/Low-resource-languages/venv/bin/python


In [3]:
# Run `huggingface-cli whoami` in a terminal to check whether you are logged in to Hugging Face.

## Load the Common Voice dataset from local TSV splits (Swahili is used as the Whisper language tag)

In [4]:
from datasets.utils.logging import set_verbosity_info
set_verbosity_info()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# api key: echo $MDC_API_KEY

from datasets import load_dataset, Features, Value, Audio

# Every column of the TSV files is read as a plain string; the same schema is
# applied to all three splits.
string_columns = ("path", "sentence", "eng", "sw")
features = Features({column: Value("string") for column in string_columns})

# One tab-separated file per split, resolved relative to the notebook's working directory.
split_files = {
    "train": "../data/6_11_2025_tcc_cv/train_dataset.tsv",
    "validation": "../data/6_11_2025_tcc_cv/valid_dataset.tsv",
    "test": "../data/6_11_2025_tcc_cv/test_dataset.tsv",
}

ds = load_dataset("csv", data_files=split_files, delimiter="\t", features=features)


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 65 examples [00:00, 5298.31 examples/s]
Generating validation split: 72 examples [00:00, 12261.06 examples/s]
Generating test split: 50 examples [00:00, 22229.72 examples/s]


In [3]:
print(ds["train"][0])
print(ds["train"].features)

{'path': 'clips/Childhood 1_tcc_dato1239_tsim1256_IGS0229_2015-12-25_MM_10_0032832_0035221.mp3', 'sentence': 'ani qay a gahed gaw dugmosht', 'eng': None, 'sw': 'zamani ??? wangu nilisafiri kwenda dug mosht'}
{'path': Value('string'), 'sentence': Value('string'), 'eng': Value('string'), 'sw': Value('string')}


In [4]:
# Root directory that the relative clip paths in the TSV files are resolved against.
# NOTE(review): this is a machine-specific absolute path; pass `base_dir` (or change
# this constant) when running the notebook anywhere else.
_DEFAULT_AUDIO_ROOT = "/Users/zuzamakowska/Documents/Africa/Project/Low-resource-languages/data/6_11_2025_tcc_cv/"


def fix_path(batch, base_dir=_DEFAULT_AUDIO_ROOT):
    """Turn the relative clip path of one example into an absolute path.

    Args:
        batch: A single example (mapping) with a string ``"path"`` entry.
        base_dir: Directory the relative path is resolved against. Defaults to
            the dataset root used by this notebook.

    Returns:
        The same ``batch`` object with ``batch["path"]`` rewritten.
    """
    import os

    # os.path.join returns an absolute second argument unchanged, so mapping this
    # function twice (e.g. re-running the cell) does not prefix the root twice.
    batch["path"] = os.path.join(base_dir, batch["path"])
    return batch

ds = ds.map(fix_path)
print(ds["train"].features)

Map: 100%|██████████| 65/65 [00:00<00:00, 6123.48 examples/s]
Map: 100%|██████████| 72/72 [00:00<00:00, 15119.91 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 11959.12 examples/s]

{'path': Value('string'), 'sentence': Value('string'), 'eng': Value('string'), 'sw': Value('string')}





In [5]:
from datasets import Audio
ds = ds.cast_column("path", Audio(sampling_rate=16000))

In [6]:

print(ds["train"].features)
print(ds["train"][0])


{'path': Audio(sampling_rate=16000, decode=True, num_channels=None, stream_index=None), 'sentence': Value('string'), 'eng': Value('string'), 'sw': Value('string')}
{'path': <datasets.features._torchcodec.AudioDecoder object at 0x1195b6710>, 'sentence': 'ani qay a gahed gaw dugmosht', 'eng': None, 'sw': 'zamani ??? wangu nilisafiri kwenda dug mosht'}


In [7]:
ds = ds.remove_columns(['eng', 'sw'])

In [8]:
ds

DatasetDict({
    train: Dataset({
        features: ['path', 'sentence'],
        num_rows: 65
    })
    validation: Dataset({
        features: ['path', 'sentence'],
        num_rows: 72
    })
    test: Dataset({
        features: ['path', 'sentence'],
        num_rows: 50
    })
})

In [9]:
print(ds['train'].column_names)
print(ds['train'].features)

['path', 'sentence']
{'path': Audio(sampling_rate=16000, decode=True, num_channels=None, stream_index=None), 'sentence': Value('string')}


In [10]:
print(ds["train"][0])

{'path': <datasets.features._torchcodec.AudioDecoder object at 0x11a97c9d0>, 'sentence': 'ani qay a gahed gaw dugmosht'}


In [11]:
ds = ds.with_format("numpy")
ds = ds.rename_column("path", "audio")
sample = ds["train"][0]
print(sample["audio"]["array"][:10])

[-1.0747810e-04 -5.9840626e-05 -2.3686751e-05  2.7955630e-05
  1.6578406e-05 -6.1724837e-05 -3.4597873e-05 -5.0002422e-05
 -2.9865387e-05 -3.0896772e-05]


In [12]:
print(ds['train'][0])

{'audio': <datasets.features._torchcodec.AudioDecoder object at 0x14f165ab0>, 'sentence': np.str_('ani qay a gahed gaw dugmosht')}


## Feature Extraction

In [13]:
from transformers import WhisperTokenizer

# Load the Whisper tokenizer with the language/task prefix tokens configured.
# No `padding` argument is passed here: padding is chosen when the tokenizer is
# called (or in `tokenizer.pad`, which the data collator uses), not at load time.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Swahili", task="transcribe")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-small')

In [15]:
input_str = ds['train'][0]['sentence']
# labels = tokenizer(input_str).input_ids
input_str

np.str_('ani qay a gahed gaw dugmosht')

In [16]:
labels = tokenizer(input_str).input_ids
labels

[50258,
 50318,
 50359,
 50363,
 3782,
 9505,
 320,
 257,
 290,
 545,
 292,
 290,
 1607,
 22954,
 3415,
 357,
 50257]

In [17]:
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_with_special

'<|startoftranscript|><|sw|><|transcribe|><|notimestamps|>ani qay a gahed gaw dugmosht<|endoftext|>'

In [18]:
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
decoded_str

'ani qay a gahed gaw dugmosht'

In [19]:
raw_tokens = tokenizer(input_str)
raw_tokens

{'input_ids': [50258, 50318, 50359, 50363, 3782, 9505, 320, 257, 290, 545, 292, 290, 1607, 22954, 3415, 357, 50257], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
decoded_tokens = tokenizer.convert_ids_to_tokens(labels)
print(decoded_tokens)

['<|startoftranscript|>', '<|sw|>', '<|transcribe|>', '<|notimestamps|>', 'ani', 'Ġq', 'ay', 'Ġa', 'Ġg', 'ah', 'ed', 'Ġg', 'aw', 'Ġdug', 'mos', 'ht', '<|endoftext|>']


### WhisperProcessor

In [21]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained('openai/whisper-small', language='Swahili', task='transcribe')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
print(ds["train"][0])

{'audio': <datasets.features._torchcodec.AudioDecoder object at 0x15a94f8e0>, 'sentence': np.str_('ani qay a gahed gaw dugmosht')}


In [23]:
from datasets import Audio

ds = ds.cast_column('audio', Audio(sampling_rate=16000))

In [24]:
print(ds["train"][0])

{'audio': <datasets.features._torchcodec.AudioDecoder object at 0x14f1ddd80>, 'sentence': np.str_('ani qay a gahed gaw dugmosht')}


In [25]:
def prepare_dataset(batch):
    """Add model inputs and target token ids to a single example.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["sentence"]``; writes ``batch["input_features"]`` and
    ``batch["labels"]``. Uses the notebook-level ``feature_extractor`` and
    ``tokenizer`` objects.

    Returns:
        The same ``batch`` object with the two new entries.
    """
    clip = batch["audio"]

    # The extractor returns a batch of features; this example is its first (only) item.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Token ids of the transcript become the training labels.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

In [26]:
preprocessed_ds = ds.map(prepare_dataset, num_proc=4)

Map (num_proc=4): 100%|██████████| 65/65 [00:00<00:00, 171.84 examples/s]
Map (num_proc=4): 100%|██████████| 72/72 [00:00<00:00, 246.37 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:00<00:00, 210.33 examples/s]


In [27]:
preprocessed_ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 65
    })
    validation: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 72
    })
    test: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 50
    })
})

In [28]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-small')

In [29]:
# Configure generation for transcription: set the language and task, and clear
# any forced decoder ids stored on the generation config.
generation_config = model.generation_config
generation_config.language = "swahili"
generation_config.task = "transcribe"
generation_config.forced_decoder_ids = None

In [30]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch of inputs and labels.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and ``tokenizer.pad``.
        decoder_start_token_id: Token id that is stripped from the front of the
            labels when every row of the batch starts with it.
    """

    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``input_features`` and ``labels`` separately and merge them.

        Args:
            features: Examples carrying ``"input_features"`` and ``"labels"``.

        Returns:
            The padded feature batch with an added ``"labels"`` tensor in which
            padded positions are set to -100.
        """
        # Audio inputs and label sequences have different lengths and use
        # different padders, so they are collected and padded independently.
        audio_inputs = []
        for example in features:
            audio_inputs.append({"input_features": example["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        token_inputs = []
        for example in features:
            token_inputs.append({"input_ids": example["labels"]})
        padded_labels = self.processor.tokenizer.pad(token_inputs, return_tensors="pt")

        # Positions the attention mask marks as padding are set to -100 so they
        # are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # When tokenization already put the start token at the front of every
        # row, drop it here because it is appended again later.
        starts_with_bos = labels[:, 0] == self.decoder_start_token_id
        if starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


In [31]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [32]:
import evaluate

metric = evaluate.load("wer")

In [33]:
def compute_metrics(pred):
    """Compute the word error rate (in percent) for one evaluation pass.

    Uses the notebook-level ``tokenizer`` to decode ids and ``metric`` to score.

    Args:
        pred: Object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 marking ignored
            positions). ``label_ids`` is expected to be a NumPy array.

    Returns:
        ``{"wer": <float>}`` where the value is 100 times the metric's score.
    """
    pred_ids = pred.predictions

    # Work on a copy: substituting pad ids directly in ``pred.label_ids`` would
    # silently change the caller's array (the -100 markers would be lost).
    label_ids = pred.label_ids.copy()

    # -100 is not a valid token id, so swap it for the pad token before decoding.
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Special tokens (including the pad token just inserted) are dropped on decode.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the fine-tuning run.
#
# NOTE(review): the train split loaded in this notebook has 65 examples. With
# per_device_train_batch_size=16 and gradient_accumulation_steps=1 that is about
# 5 optimizer steps per pass over the data (assuming a single device), so
# max_steps=5000 is roughly 1000 passes and warmup_steps=500 alone covers about
# 100 of them. The training output recorded in this notebook shows ~251 s per
# step (an ETA of ~349 hours). Confirm these step counts are intended for a
# dataset of this size.
training_args = Seq2SeqTrainingArguments(
    output_dir="../models/whisper-small-asmjeeg",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=5000,
    gradient_checkpointing=True,
    # NOTE(review): the first cell reports an MPS device; confirm fp16 mixed
    # precision is supported on that backend with the installed versions.
    fp16=True,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; confirm against the installed version.
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    # Evaluation generates text so that compute_metrics can score decoded output.
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpoints are saved and evaluation is run on the same 1000-step cadence.
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    # Best checkpoint is chosen by the "wer" value returned from compute_metrics;
    # greater_is_better=False because a lower word error rate is better.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    logging_strategy="steps",
)


In [41]:
from transformers import Seq2SeqTrainer

# Periodic evaluation, and therefore best-checkpoint selection
# (load_best_model_at_end / metric_for_best_model in training_args), runs on
# `eval_dataset`. Use the validation split for that so the test split stays
# held out for a final, one-off measurement.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)


max_steps is given, it will override any value given in num_train_epochs


In [None]:
trainer.train()

  return fn(*args, **kwargs)
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
  0%|          | 1/5000 [04:11<348:55:57, 251.28s/it]