In [1]:
# !pip3 install torch
import torch
print("MPS:", torch.backends.mps.is_available())

import sys
print(sys.executable)


MPS: True
/Users/zuzamakowska/Documents/Africa/.venv/bin/python


In [2]:
# api key: echo $MDC_API_KEY
# !pip3 install datasets

from datasets import load_dataset, Features, Value, Audio

features = Features({
    "client_id": Value("string"),
    "path": Value("string"),
    "sentence_id": Value("string"),
    "sentence": Value("string"),
    "sentence_domain": Value("string"),
    "up_votes": Value("string"),
    "down_votes": Value("string"),
    "age": Value("string"),
    "gender": Value("string"),
    "accents": Value("string"),
    "variant": Value("string"),
    "locale": Value("string"),
    "segment": Value("string"),
})

ds = load_dataset(
    "csv",
    data_files={
        "train": "../../data/cv-corpus-23.0-2025-09-05/sw/train.tsv",
        "validation": "../../data/cv-corpus-23.0-2025-09-05/sw/dev.tsv",
        "test": "../../data/cv-corpus-23.0-2025-09-05/sw/test.tsv"
    },
    delimiter="\t",
    features=features,
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def fix_path(batch):
    batch["path"] = "/Users/zuzamakowska/Documents/Africa/Project/Low-resource-languages/data/cv-corpus-23.0-2025-09-05/sw/clips/" + batch["path"]
    return batch

ds = ds.map(fix_path)
print(ds["train"].features)

{'client_id': Value('string'), 'path': Value('string'), 'sentence_id': Value('string'), 'sentence': Value('string'), 'sentence_domain': Value('string'), 'up_votes': Value('string'), 'down_votes': Value('string'), 'age': Value('string'), 'gender': Value('string'), 'accents': Value('string'), 'variant': Value('string'), 'locale': Value('string'), 'segment': Value('string')}


In [4]:
from datasets import Audio
ds = ds.cast_column("path", Audio(sampling_rate=16000))
ds = ds.remove_columns(['client_id', 'sentence_id', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'locale', 'segment'])
ds = ds.with_format("numpy")
ds = ds.rename_column("path", "audio")

In [5]:
# !pip3 install transformers

from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="Swahili", task="transcribe", padding='longest')
feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-tiny')
processor = WhisperProcessor.from_pretrained('openai/whisper-tiny', language='Swahili', task='transcribe')

In [6]:
from transformers import WhisperForConditionalGeneration
model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-tiny')
model.generation_config.language = "swahili"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None

Loading weights: 100%|██████████| 167/167 [00:00<00:00, 2002.47it/s, Materializing param=model.encoder.layers.3.self_attn_layer_norm.weight]  


In [7]:
MAX_DURATION = 30.0  
max_label_len = model.config.max_target_positions  

def prepare_dataset(batch):
    audio = batch["audio"]

    duration = len(audio["array"]) / audio["sampling_rate"]
    batch["duration"] = duration

    batch["input_features"] = feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    labels = tokenizer(batch["sentence"]).input_ids
    batch["labels"] = labels
    batch["label_length"] = len(labels)

    return batch

In [8]:
# !pip3 install torchcodec

preprocessed_ds = ds.map(
    prepare_dataset,
    num_proc=4,
    load_from_cache_file=False,
)

def filter_fn(batch):
    return (batch["duration"] <= MAX_DURATION) and (batch["label_length"] <= max_label_len)

preprocessed_ds = preprocessed_ds.filter(filter_fn)

Map (num_proc=4): 87524 examples [02:38, 301.02 examples/s]           Token indices sequence length is longer than the specified maximum sequence length for this model (1103 > 1024). Running this sequence through the model will result in indexing errors
Map (num_proc=4): 93222 examples [03:07, 248.33 examples/s]
Map (num_proc=4): 16898 examples [00:20, 310.61 examples/s]           Token indices sequence length is longer than the specified maximum sequence length for this model (82023 > 1024). Running this sequence through the model will result in indexing errors
Map (num_proc=4): 20748 examples [00:35, 375.96 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (26008 > 1024). Running this sequence through the model will result in indexing errors
Map (num_proc=4): 23384 examples [00:47, 244.81 examples/s]
Map (num_proc=4): 12919 examples [00:02, 374.79 examples/s]           Token indices sequence length is longer than the specifie

In [9]:
preprocessed_ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'variant', 'duration', 'input_features', 'labels', 'label_length'],
        num_rows: 46610
    })
    validation: Dataset({
        features: ['audio', 'sentence', 'variant', 'duration', 'input_features', 'labels', 'label_length'],
        num_rows: 11690
    })
    test: Dataset({
        features: ['audio', 'sentence', 'variant', 'duration', 'input_features', 'labels', 'label_length'],
        num_rows: 11941
    })
})

In [10]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch
    
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)


In [11]:
# !pip3 install evaluate
# !pip3 install jiwer
import evaluate

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
    cer = 100 * cer_metric.compute(predictions=pred_str, references=label_str)

    return {
        "wer": wer,
        "cer": cer,
        "combined": 0.5 * wer + 0.5 * cer,
    }

In [12]:
# !pip3 install 'accelerate>=1.1.0'

from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="../../models/whisper-tiny-sw-2026",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-6,
    warmup_steps=500,
    max_steps=2000,
    gradient_checkpointing=True,
    fp16=False,
    eval_strategy="steps",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=2500,
    eval_steps=2500,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="combined",
    greater_is_better=False,
    push_to_hub=False,
    logging_strategy="steps",
    max_grad_norm=1.0
)


In [15]:
!pip3 install tensorboardX

from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # tokenizer=processor.feature_extractor,
)

Collecting tensorboardX
  Downloading tensorboardx-2.6.4-py3-none-any.whl.metadata (6.2 kB)
Collecting protobuf>=3.20 (from tensorboardX)
  Downloading protobuf-6.33.5-cp39-abi3-macosx_10_9_universal2.whl.metadata (593 bytes)
Downloading tensorboardx-2.6.4-py3-none-any.whl (87 kB)
Downloading protobuf-6.33.5-cp39-abi3-macosx_10_9_universal2.whl (427 kB)
Installing collected packages: protobuf, tensorboardX
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [tensorboardX]
[1A[2KSuccessfully installed protobuf-6.33.5 tensorboardX-2.6.4


In [16]:
trainer.train()

  super().__init__(loader)


Step,Training Loss,Validation Loss


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  7.72it/s]


TrainOutput(global_step=2000, training_loss=5.122577548980713, metrics={'train_runtime': 3172.8051, 'train_samples_per_second': 10.086, 'train_steps_per_second': 0.63, 'total_flos': 7.8780432384e+17, 'train_loss': 5.122577548980713, 'epoch': 0.6865184930919077})

In [None]:
import numpy as np
np.percentile(preprocessed_ds["validation"]["duration"], [50, 90, 95, 99, 100])

array([ 5.508 ,  8.0316,  8.928 , 10.08  , 10.8   ])