### GPU ACCESS

In [1]:
!nvidia-smi

Wed Feb 28 03:59:53 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  | 00000000:15:00.0 Off |                    0 |
| N/A   30C    P0              41W / 300W |      0MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import torch

In [3]:
device = 'cuda' if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
torch.__version__

'2.2.1+cu121'

## Import Packages
We need to install transformers and datasets. soundfile is used to load audio files and jiwer is used to evaluate the finetuned model using word errot rate

In [1]:
!pip install datasets>=1.18.3
!pip install transformers==4.11.3
!pip install librosa
!pip install jiwer
!pip install wandb
# Restart the runtime for this change to take effect
!pip install accelerate -U

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers==4.11.3
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, sacremoses, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.1
    Uninstalling tokenizers-0.15.1:
     

To upload our training checkpoints directly to huggingface, we have to store the huggingface authentication key.

In [5]:
import transformers
transformers.__version__

'4.38.1'

In [5]:
from huggingface_hub import notebook_login

notebook_login('')

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Install the GIT LFS in order to upload the model checkpoints

In [None]:
!apt-get install git-lfs

# Prepare Data, Tokenizer, Feature Extractor

### Create Wav2Vec2CTCTokenizer

In [6]:
# Load the dataset
from datasets import load_dataset, load_metric, Audio

# You can pass the streaming option to load_dataset to stream the data from the source instead of downloading and caching it
luganda = load_dataset("mozilla-foundation/common_voice_7_0", "lg", trust_remote_code=True)

print(luganda)

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 6626
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 4276
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 3549
    })
    other: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 29407
    })
    invalidated: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 2195
    })
})


In [7]:
# Remove the unnecessary columns from the dataset
luganda = luganda.remove_columns(["client_id", "up_votes", "down_votes", "age", "gender", "accent", "locale", "segment"])

### Display some of the rows in the dataset

In [8]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML
import re

def show_random_elements(dataset, num_examples=30):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

show_random_elements(luganda["train"].remove_columns(["path", "audio"]))

Unnamed: 0,sentence
0,"""Ebiwuka ebyonoona ebimera n'endwadde bikosa amakungula g'omulimi."""
1,"""Yagamba abantu ku maka gy'ava."""
2,Nnyumirwa okuzannya omupiira.
3,"""Okuba ne ssente naye ng'emmere emu baagikugaana tekikugasa."""
4,"""Kadhi omukadde teyaliiwo ku mukolo gw'okuwaayo obukulembeze."""
5,Empaka bintu bya byamizannyo.
6,Obwavu buno omuntu teyandivudde mu nsi ye kugenda kukolera bweru.
7,"""Okukonziba mu nkula y'amatooke ereetebwa muddo mu nnimiro."""
8,"""Yabadde akaayana nti bamuwe ebisumuluzo by'emmotoka kyokka nga tamanyi na kugivuga."""
9,"""Ba nnanyini bizinesi bemulugunya olw'amagoba ga bank kwe beewolera okuba amangi."""


In [9]:
# Let's normalize the dataset to only lower case letters and ignore any special tokens because without a language model it is difficult to classify such tokens as they do not correspond to a characteristic sound.
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    return batch

luganda = luganda.map(remove_special_characters)

In [10]:
# Display samples from the normalized dataset
show_random_elements(luganda["train"].remove_columns(["path", "audio"]))

Unnamed: 0,sentence
0,abakulira wooteeri baasubiza okutegekanga endaga za katemba buli kaseera
1,yambala masiki kye kikwaniriza ku buli dduuka lw'oyingira mu
2,omuwala agenda okufumbirwa oteekwa okutuukiriza emikolo gino nga obuwangwa bwe bugamba
3,omulimisa yatugamba nti tasuubira nkuba omwezi guno
4,olukungaana lw'abaddemu abakongo bangi
5,omwoleso gwabadde gukwata ku kutegeeza abantu ku ebyafaayo bya uganda
6,pulojekiti ejja kugoberera enkola za gavumenti ng'etandikiddwawo
7,singa obutafa kwali kugulwa na ssente singa abagagga balamu
8,kizibu okukomya okufuuwa sigala olw'okumwemanyiza
9,abalwadde b'akawuka ka kolona beeyongedde mu uganda


In CTC chunks of speech are classified into letters. We need to extract all distinct letters in the dataset and builf a vocabulary.   
We need a mapping function that will concatenate all the transcriptions into a long transcription and transforms the strings into a set of characters.

In [None]:
# Let's use the batched = True so that the map function can access all the transcriptions at a go
def extract_all_chars(batch):
  all_text = " ".join(batch["sentence"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

vocabs = luganda.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=luganda.column_names["train"])

In [12]:
# Create a vocabulary of all letters in the train dataset
vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'x': 0,
 'c': 1,
 'p': 2,
 ' ': 3,
 'j': 4,
 'y': 5,
 'd': 6,
 'r': 7,
 'z': 8,
 'f': 9,
 'b': 10,
 'w': 11,
 'v': 12,
 'o': 13,
 'm': 14,
 'e': 15,
 '’': 16,
 "'": 17,
 ')': 18,
 'n': 19,
 'g': 20,
 's': 21,
 'k': 22,
 'i': 23,
 't': 24,
 'u': 25,
 'a': 26,
 'l': 27,
 '(': 28,
 'h': 29,
 '‘': 30}

We need to replace the " " in the dataset with a more visible character. We also need to add the UNKNOWN token so that to deal with characters not encountered in the training dataset.

In [13]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

We need to add the pad token that corresponds to CTC's blank token. The blank token is a core component of the CTC algorithm.

In [14]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))

33


In [15]:
vocab_dict

{'x': 0,
 'c': 1,
 'p': 2,
 'j': 4,
 'y': 5,
 'd': 6,
 'r': 7,
 'z': 8,
 'f': 9,
 'b': 10,
 'w': 11,
 'v': 12,
 'o': 13,
 'm': 14,
 'e': 15,
 '’': 16,
 "'": 17,
 ')': 18,
 'n': 19,
 'g': 20,
 's': 21,
 'k': 22,
 'i': 23,
 't': 24,
 'u': 25,
 'a': 26,
 'l': 27,
 '(': 28,
 'h': 29,
 '‘': 30,
 '|': 3,
 '[UNK]': 31,
 '[PAD]': 32}

In [16]:
# Save the vocabulary to a json file
import json
with open('./vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [17]:
# Use the json file to instantiate an object of the Wav2Vec2CTCTokenizer class
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [18]:
!git lfs install

Git LFS initialized.


In [19]:
# Push the tokenizer to the hub
repo_name = "luganda_wav2vec2_ctc_reg"
tokenizer.push_to_hub(repo_name)

CommitInfo(commit_url='https://huggingface.co/dmusingu/luganda_wav2vec2_ctc_reg/commit/a5b56f76e016ed9a6e610832d3bbb50e8cbf314e', commit_message='Upload tokenizer', commit_description='', oid='a5b56f76e016ed9a6e610832d3bbb50e8cbf314e', pr_url=None, pr_revision=None, pr_num=None)

### Create Wav2Vec Feature Extractor

In [20]:
# Create a feature extractor using Wav2Vec2FeatureExtractor. We shall pass feature size as 1 because we are dealing with raw audio files.
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

In [21]:
# Wrap the feature extractor and the tokenizer into a Wav2VecProcessor
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

### Preprocess the dataset

In [22]:
from datasets import Audio

In [23]:
luganda = luganda.cast_column("audio", Audio(sampling_rate=16000))

In [24]:
# Dislay an audio sample from the dataset
luganda['train'][10]["audio"]

{'path': '/ocean/projects/cis230036p/musinguz/datasets/downloads/extracted/11b22b6c2342f913cd6b27d3263eb9cdf01ca3c9f4638cc5e604e753632eb67b/cv-corpus-7.0-2021-07-21/lg/clips/common_voice_lg_23722908.mp3',
 'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         1.85705873e-09, -8.11200329e-10,  0.00000000e+00]),
 'sampling_rate': 16000}

In [26]:
# Listen to sample audio from the dataset
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(luganda["train"]))

print(luganda["train"][rand_int]["sentence"])
ipd.Audio(data=np.asarray(luganda["train"][rand_int]["audio"]["array"]), autoplay=True, rate=16000)

ggulawo eddirisa ly'emiryango tufune ku mpewo


In [27]:
rand_int = random.randint(0, len(luganda["train"]))

print("Target text:", luganda["train"][rand_int]["sentence"])
print("Input array shape:", np.asarray(luganda["train"][rand_int]["audio"]["array"]).shape)
print("Sampling rate:", luganda["train"][rand_int]["audio"]["sampling_rate"])

Target text: akatimba kange ak'ensiri kayulise nga nkabikka ku buliri
Input array shape: (84010,)
Sampling rate: 16000


In [28]:
# Convert the sampling frewquency to 16kHz since the model was pretrained on audio sampled at 16kHz
def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched" to ensure mapping is correct
    batch["input_values"] = processor(audio["array"], sampling_rate=16000).input_values[0]

    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

In [None]:
# Apply the map function to the dataset
luganda = luganda.map(prepare_dataset, remove_columns=luganda.column_names["train"], num_proc=4)

### Training and Evaluation

In [30]:
# Set up the trainer
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [31]:
# Initialize the data_collator
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [32]:
# Load the metric
wer_metric = load_metric("wer", trust_remote_code=True)

  wer_metric = load_metric("wer", trust_remote_code=True)


In [33]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [34]:
activation_dropout= 0.055
attention_dropout= 0.094
feat_proj_dropout = 0.04
hidden_dropout= 0.047
feat_proj_dropout= 0.04
layerdrop= 0.041
mask_time_prob = 0.4

In [49]:
# Load the pretrained Wav2Vec2 checkpoint. We use the tokenizers pad token id to degine the model's pad token id
from transformers import Wav2Vec2ForCTC

# Load the xlrs large model from the Hub
model = Wav2Vec2ForCTC.from_pretrained(
        "facebook/wav2vec2-large-xlsr-53",
        activation_dropout=activation_dropout,
        attention_dropout=attention_dropout,
        hidden_dropout=hidden_dropout,
        feat_proj_dropout=feat_proj_dropout,
        mask_time_prob=mask_time_prob,
        layerdrop=layerdrop,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),
    )

# Load the model from a local checkpoint
# model = Wav2Vec2ForCTC.from_pretrained(
#     "luganda_wav2vec2_ctc_reg/tmp-checkpoint-3000"
# )

In [50]:
# Freeze the feature extractor
model.freeze_feature_extractor()

In [None]:
import wandb
# Insert wandb key to be able to log the model to the wandb dashboard
# This is key to be able to monitor your runs on the wandb dashboard
wandb.login(key = "")

In [None]:
%env WANDB_PROJECT=LugandaASR-wav2vec
%env WANDB_LOG_MODEL="checkpoint"

In [53]:
# Create callback to obtain predictions from the model during training
from transformers.integrations import WandbCallback
import pandas as pd


def map_to_result(batch):
  with torch.no_grad():
    input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)

  return batch

def decode_predictions(tokenizer, predictions):
    labels = tokenizer.batch_decode(predictions.label_ids)
    logits = predictions.predictions.argmax(axis=-1)
    prediction_text = tokenizer.batch_decode(logits)
    return {"labels": labels, "predictions": prediction_text}


class WandbPredictionProgressCallback(WandbCallback):
    """Custom WandbCallback to log model predictions during training.

    This callback logs model predictions and labels to a wandb.Table at each
    logging step during training. It allows to visualize the
    model predictions as the training progresses.

    Attributes:
        trainer (Trainer): The Hugging Face Trainer instance.
        tokenizer (AutoTokenizer): The tokenizer associated with the model.
        sample_dataset (Dataset): A subset of the validation dataset
          for generating predictions.
        num_samples (int, optional): Number of samples to select from
          the validation dataset for generating predictions. Defaults to 100.
        freq (int, optional): Frequency of logging. Defaults to 2.
    """

    def __init__(self, trainer, tokenizer, val_dataset,
                 num_samples=10, freq=2):
        """Initializes the WandbPredictionProgressCallback instance.

        Args:
            trainer (Trainer): The Hugging Face Trainer instance.
            tokenizer (AutoTokenizer): The tokenizer associated
              with the model.
            val_dataset (Dataset): The validation dataset.
            num_samples (int, optional): Number of samples to select from
              the validation dataset for generating predictions.
              Defaults to 100.
            freq (int, optional): Frequency of logging. Defaults to 2.
        """
        super().__init__()
        self.trainer = trainer
        self.tokenizer = tokenizer
        self.sample_dataset = val_dataset.select(range(num_samples))
        self.freq = freq

    def on_evaluate(self, args, state, control, **kwargs):
        super().on_evaluate(args, state, control, **kwargs)
        # control the frequency of logging by logging the predictions
        # every `freq` epochs
        if state.epoch % self.freq == 0:
            # generate predictions
            predictions = self.trainer.predict(self.sample_dataset)
            # decode predictions and labels
            predictions = decode_predictions(self.tokenizer, predictions)
            # add predictions to a wandb.Table
            predictions_df = pd.DataFrame(predictions)
            predictions_df["epoch"] = state.epoch
            records_table = self._wandb.Table(dataframe=predictions_df)
            # log the table to wandb
            self._wandb.log({"sample_predictions": records_table})

In [57]:
# Define the training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir=repo_name,
  group_by_length=True,
  per_device_train_batch_size=32,
  evaluation_strategy="steps",
  num_train_epochs=60,
  fp16=True, # mixed precison, makes training faster, this comes at a cost of memory
  gradient_checkpointing=True, # this is a balance between discarding the forward pass activations and recomputing them during backpropagation
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=1000,
  # save_total_limit=2,
  load_best_model_at_end=True,
  metric_for_best_model="wer",
  greater_is_better=False,
  report_to="wandb",
  run_name="wav2vec-finetuning",# this is for the wandb run
  logging_steps=100,
  push_to_hub=True,
  hub_model_id=repo_name, 
  dataloader_num_workers=4,  # this makes loading the data from the dataset faster
  dataloader_pin_memory= True  # this too makes loading the data fast
)

In [58]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=luganda["other"],
    eval_dataset=luganda["validation"],
    tokenizer=processor.feature_extractor,
)

# Instantiate the WandbPredictionProgressCallback
progress_callback = WandbPredictionProgressCallback(
    trainer=trainer,
    tokenizer=tokenizer,
    val_dataset=luganda["test"],
    num_samples=10,
    freq=2,
)

# Add the callback to the trainer
trainer.add_callback(progress_callback)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Train the model
trainer.train()
wandb.finish()



Step,Training Loss,Validation Loss,Wer
500,2.5842,0.129841,0.301544
1000,0.4127,0.12,0.294058
1500,0.3852,0.117175,0.288727




In [None]:
# Push the model to hub
trainer.push_to_hub(repo_name)

### Evaluation

In [43]:
processor = Wav2Vec2Processor.from_pretrained(repo_name)
model = Wav2Vec2ForCTC.from_pretrained(repo_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# Evaluation is carried out with a batch size of 1
def map_to_result(batch):
  with torch.no_grad():
    input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)

  return batch

results = luganda["test"].map(map_to_result, remove_columns=luganda["test"].column_names)

In [None]:
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))

In [None]:
# Chech the errors made by the model
show_random_elements(results)

From the output above we can make the following observations
1. The xlrs model was pretrained on various languages which did not include Luganda but it significantly improved the word error rate. More optimization needs to be applied in order to push improve the accuracy of the model.
2. Some possible methods could be to reduce the level of regularization being applied to the model and to increase the amount of data being used to train the model
3. Need to perform error analysis to determine cases the model fails and to increase the robustness of the model to noise