## Notebook Setup

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Aug 16 12:23:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     On  | 00000000:53:00.0 Off |                    0 |
|  0%   29C    P8              21W / 300W |      4MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
%%capture
!pip install datasets
!pip install transformers
!pip install torchaudio
!pip install jiwer
!pip install accelerate -U
!pip install soundfile
!pip install librosa

In [66]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
%%capture
!apt install git-lfs

In [3]:
from datasets import load_dataset, load_metric, Audio

# Both splits come from the same Common Voice 14.0 "lg" configuration;
# only the split name differs between the two calls.
_cv_kwargs = dict(path="mozilla-foundation/common_voice_14_0", name="lg", trust_remote_code=True)
lg_cv_train = load_dataset(split="train", **_cv_kwargs)
lg_cv_test = load_dataset(split="test", **_cv_kwargs)

In [4]:
print(lg_cv_train)

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 70819
})


In [22]:
print(lg_cv_test)

Dataset({
    features: ['audio', 'sentence', 'up_votes', 'down_votes'],
    num_rows: 13431
})


In [5]:
# Keep only the audio, the transcript and the vote counts; drop all other metadata.
kept_columns = ['audio', 'sentence', 'up_votes', 'down_votes']
lg_cv_train = lg_cv_train.select_columns(kept_columns)
lg_cv_test = lg_cv_test.select_columns(kept_columns)

In [8]:
# Keep clips with more than one up-vote and no down-votes.
# `input_columns` hands only the two vote values to the predicate, so the
# audio column does not have to be materialised for every row.
lg_cv_train_new = lg_cv_train.filter(
    lambda up_votes, down_votes: down_votes < 1 and up_votes > 1,
    input_columns=['up_votes', 'down_votes'],
)

Filter:   0%|          | 0/70819 [00:00<?, ? examples/s]

In [20]:
# Keep clips with more than one up-vote and no down-votes.
# `input_columns` hands only the two vote values to the predicate, so the
# audio column does not have to be materialised for every row.
lg_cv_test_new = lg_cv_test.filter(
    lambda up_votes, down_votes: down_votes < 1 and up_votes > 1,
    input_columns=['up_votes', 'down_votes'],
)

Filter:   0%|          | 0/13431 [00:00<?, ? examples/s]

In [9]:
print(lg_cv_train_new)

Dataset({
    features: ['audio', 'sentence', 'up_votes', 'down_votes'],
    num_rows: 3767
})


In [21]:
print(lg_cv_test_new)

Dataset({
    features: ['audio', 'sentence', 'up_votes', 'down_votes'],
    num_rows: 11118
})


In [10]:
def calculate_duration(batch):
    """Attach the clip length in seconds to a single example.

    Reads ``batch['audio']['array']`` and ``batch['audio']['sampling_rate']``,
    stores ``len(array) / sampling_rate`` under ``batch['duration']`` and
    returns the same (mutated) mapping.
    """
    clip = batch['audio']
    n_samples = len(clip['array'])
    batch['duration'] = n_samples / clip['sampling_rate']
    return batch

In [13]:
lg_cv_train_new = lg_cv_train_new.map(calculate_duration, num_proc=8, writer_batch_size=128) 

Map (num_proc=8):   0%|          | 0/3767 [00:00<?, ? examples/s]

In [23]:
lg_cv_test_new = lg_cv_test_new.map(calculate_duration, num_proc=8, writer_batch_size=128)
lg_cv_test_new

Map (num_proc=8):   0%|          | 0/11118 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'sentence', 'up_votes', 'down_votes', 'duration'],
    num_rows: 11118
})

In [14]:
lg_cv_train_new

Dataset({
    features: ['audio', 'sentence', 'up_votes', 'down_votes', 'duration'],
    num_rows: 3767
})

In [15]:
train_duration = lg_cv_train_new['duration']

In [16]:
sum(train_duration)/3600

6.2423582175925905

In [25]:
test_duration = lg_cv_test_new['duration']

In [26]:
sum(test_duration)/3600

17.91129777777793

# Baseline evaluation of the models

### Wav2Vec-BERT

In [27]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()`` and indexing with a list of ints.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the indices are unique
    # without a manual retry loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

show_random_elements(lg_cv_test_new.remove_columns(["duration", "audio", "down_votes", "up_votes"]), num_examples=10)

Unnamed: 0,sentence
0,Yasalawo okumwekutulako olw'obubbi
1,Ensi yaffe ejja kukola eddagala ly'akawuka.
2,Obwakabaka bwalina ennono n'obulombolombo eby'enjawulo okwetoolola obuganda.
3,"Bwebuba bwongo, buno bulina okuba obuyiiya ."
4,Abasawo baalidde matereke ku by'omusaala ne bakama baabwe.
5,Tulina kulwaana okubeera obulungi.
6,Gwe musujja.
7,Omuti gwa ffene teguba mugumu.
8,Namulabako nga alabika yenna bambi takyeyinza.
9,Wali osiibye ku njala?


In [49]:
def extract_all_chars(batch):
    """Collect the distinct characters used across a batch of sentences.

    Joins ``batch["sentence"]`` with single spaces and returns a dict with the
    joined text under ``"all_text"`` and the list of its unique characters
    (in arbitrary order) under ``"vocab"``; each value is wrapped in a
    one-element list.
    """
    joined = " ".join(batch["sentence"])
    unique_chars = list(set(joined))
    return dict(vocab=[unique_chars], all_text=[joined])

vocab_test = lg_cv_test_new.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=lg_cv_test_new.column_names)

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

In [50]:
# Unique characters found in the test transcripts.
vocab_list = list(set(vocab_test["vocab"][0]))

# Ids follow the sorted character order so the mapping is deterministic.
vocab_dict = dict(zip(sorted(vocab_list), range(len(vocab_list))))
vocab_dict

{' ': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 'ŋ': 27,
 '’': 28}

In [52]:
# setdefault keeps this cell idempotent: re-running it must not hand out
# new (gapped) ids to tokens that are already in the vocabulary.
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
len(vocab_dict)

31

In [53]:
# Represent the space with a visible word-delimiter token "|" that keeps the
# same id. The guard makes the cell safe to re-run once " " has been removed.
if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")

In [54]:
vocab_dict

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 'ŋ': 27,
 '’': 28,
 '[UNK]': 29,
 '[PAD]': 30,
 '|': 0}

In [39]:
import re
# Raw string: sequences such as `\,` or `\?` are not valid Python string
# escapes and trigger invalid-escape warnings in a normal string literal.
# The character class matches exactly the same punctuation as before.
chars_to_remove_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\»\«\(\)\½\–\‟\…\῎]'

def remove_special_characters(batch):
    """Strip punctuation from ``batch["sentence"]`` and lowercase it.

    Every character matched by ``chars_to_remove_regex`` is deleted (not
    replaced by a space); the cleaned text is written back to
    ``batch["sentence"]`` and the same mapping is returned.
    """
    cleaned = re.sub(chars_to_remove_regex, '', batch["sentence"])
    batch["sentence"] = cleaned.lower()

    return batch

In [40]:
lg_cv_test_new = lg_cv_test_new.map(remove_special_characters)

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

In [47]:
def replace_hat_char(batch):
    """Replace every 'ñ' in ``batch["sentence"]`` with a plain 'n'.

    The sentence is updated in place and the same mapping is returned.
    """
    sentence = batch["sentence"]
    batch["sentence"] = re.sub('[ñ]', 'n', sentence)
    return batch

In [48]:
lg_cv_test_new = lg_cv_test_new.map(replace_hat_char)

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

In [55]:
import json
# Persist the character-to-id vocabulary to disk as JSON.
with open('luganda_vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [57]:
from transformers import Wav2Vec2CTCTokenizer

# Build the tokenizer directly from the vocabulary file this notebook writes.
# `from_pretrained("./")` looks for a file named `vocab.json` in the directory,
# which the notebook never creates, so it only worked with a stale file around.
tokenizer = Wav2Vec2CTCTokenizer("luganda_vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")



In [58]:
from transformers import SeamlessM4TFeatureExtractor

feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")

preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

In [59]:
from transformers import Wav2Vec2BertProcessor

processor = Wav2Vec2BertProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [60]:
lg_cv_test_new = lg_cv_test_new.cast_column("audio", Audio(sampling_rate=16_000))

In [61]:
lg_cv_test_new[0]["audio"]

{'path': None,
 'array': array([-2.70983946e-08,  3.01502041e-08, -3.28485612e-08, ...,
        -1.77729748e-06,  2.11014903e-05,  2.42224087e-05]),
 'sampling_rate': 16000}

In [62]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(lg_cv_test_new)-1)

# Fetch the row once instead of indexing the dataset repeatedly, and play it
# back at the clip's own sampling rate rather than a hard-coded value.
sample = lg_cv_test_new[rand_int]
print(sample["sentence"])
ipd.Audio(data=sample["audio"]["array"], autoplay=True, rate=sample["audio"]["sampling_rate"])

nnaasanyuka nnyo singa munanonda mu kifo ekyo


In [63]:
rand_int = random.randint(0, len(lg_cv_test_new)-1)

# Look the row up a single time; all three printed values come from it.
sample = lg_cv_test_new[rand_int]
print("Target text:", sample["sentence"])
print("Input array shape:", sample["audio"]["array"].shape)
print("Sampling rate:", sample["audio"]["sampling_rate"])

Target text: singa tawandiikira mmeeya bbaluwa tetujja kumuwa bululu
Input array shape: (155520,)
Sampling rate: 16000


In [64]:
def prepare_dataset(batch):
    """Turn one raw example into model inputs and label ids.

    Runs the global ``processor`` on the audio array to obtain
    ``input_features`` (first element of the processor output), records their
    length as ``input_length``, and encodes ``batch["sentence"]`` into
    ``labels``. The same mapping is returned with the three keys added.
    """
    clip = batch["audio"]
    features = processor(clip["array"], sampling_rate=clip["sampling_rate"]).input_features[0]
    batch["input_features"] = features
    batch["input_length"] = len(features)

    batch["labels"] = processor(text=batch["sentence"]).input_ids
    return batch

In [65]:
lg_cv_test_new = lg_cv_test_new.map(prepare_dataset, remove_columns=lg_cv_test_new.column_names)

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

In [None]:
lg_cv_test_new.push_to_hub('dmusingu/luganda-test-data', private = True)

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
import evaluate 

cer_metric = evaluate.load('cer')
wer_metric = evaluate.load('wer')

In [None]:
def compute_metrics(pred):
    """Compute word and character error rates for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (logits; the arg-max over the
            last axis is taken as the predicted token id) and ``label_ids``
            (reference ids, with -100 marking positions to ignore).

    Returns:
        dict with the keys ``"wer"`` and ``"cer"``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the ignore index for the pad id on a copy, so the caller's
    # label array is left untouched.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    # CER was computed but previously dropped from the result.
    return {"wer": wer, "cer": cer}