## Notebook Setup

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Aug 16 16:27:09 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     On  | 00000000:D1:00.0 Off |                    0 |
|  0%   30C    P8              21W / 300W |      4MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
%%capture
!pip install datasets
!pip install transformers
!pip install torchaudio
!pip install jiwer
!pip install accelerate -U
!pip install soundfile
!pip install librosa

In [3]:
from huggingface_hub import notebook_login

# Show the Hugging Face Hub login widget.
# NOTE(review): presumably required so the dataset download below is authorised — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
%%capture
# Install Git LFS with apt; %%capture hides the installer output.
!apt install git-lfs

In [68]:
from datasets import load_dataset, load_metric, Audio

# Load the "lg" configuration of Common Voice 14.0, train split first and then
# test split, using identical arguments for both.
lg_cv_train, lg_cv_test = (
    load_dataset("mozilla-foundation/common_voice_14_0", "lg", split=split_name, trust_remote_code=True)
    for split_name in ("train", "test")
)

In [69]:
print(lg_cv_train)

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 70819
})


In [70]:
print(lg_cv_test)

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 13431
})


In [71]:
# Keep only the audio, the transcript and the two vote counters on both splits.
lg_cv_train, lg_cv_test = (
    split.select_columns(['audio', 'sentence', 'up_votes', 'down_votes'])
    for split in (lg_cv_train, lg_cv_test)
)

In [8]:
# Keep only training clips with fewer than one down-vote and more than one up-vote.
lg_cv_train_new = lg_cv_train.filter(lambda row: row['down_votes'] < 1 and row['up_votes'] > 1)

Filter:   0%|          | 0/70819 [00:00<?, ? examples/s]

In [72]:
# Same vote filter as for the training split: fewer than one down-vote, more than one up-vote.
lg_cv_test_new  = lg_cv_test.filter(lambda row: row['down_votes'] < 1 and row['up_votes'] > 1)

In [32]:
# print(lg_cv_train_new)

In [21]:
print(lg_cv_test_new)

Dataset({
    features: ['audio', 'sentence', 'up_votes', 'down_votes'],
    num_rows: 11118
})


In [76]:
# normalize the data
def normalize(batch):
    """Lower-case the ``sentence`` field of one example.

    The example is modified in place and the same object is returned.
    """
    lowered = batch['sentence'].lower()
    batch['sentence'] = lowered
    return batch

lg_cv_test    = lg_cv_test.map(normalize)

Map:   0%|          | 0/13431 [00:00<?, ? examples/s]

In [10]:
def calculate_duration(batch):
    """Add a ``duration`` field: number of audio samples divided by the sampling rate.

    Reads ``batch['audio']['array']`` and ``batch['audio']['sampling_rate']``;
    the example is modified in place and returned.
    """
    clip = batch['audio']
    n_samples = len(clip['array'])
    batch['duration'] = n_samples / clip['sampling_rate']
    return batch

In [13]:
lg_cv_train_new = lg_cv_train_new.map(calculate_duration, num_proc=8, writer_batch_size=128) 

Map (num_proc=8):   0%|          | 0/3767 [00:00<?, ? examples/s]

In [23]:
lg_cv_test_new = lg_cv_test_new.map(calculate_duration, num_proc=8, writer_batch_size=128)
lg_cv_test_new

Map (num_proc=8):   0%|          | 0/11118 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'sentence', 'up_votes', 'down_votes', 'duration'],
    num_rows: 11118
})

In [14]:
lg_cv_train_new

Dataset({
    features: ['audio', 'sentence', 'up_votes', 'down_votes', 'duration'],
    num_rows: 3767
})

In [15]:
train_duration = lg_cv_train_new['duration']

In [16]:
sum(train_duration)/3600

6.2423582175925905

In [25]:
test_duration = lg_cv_test_new['duration']

In [26]:
sum(test_duration)/3600

17.91129777777793

# Baseline evaluation of the models

### Wav2Vec-BERT

In [78]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random selection of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object that supports ``len()`` and indexing with a list of row
            positions, returning something ``pd.DataFrame`` accepts.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no retry-on-duplicate loop is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

show_random_elements(lg_cv_test.remove_columns(["audio", "down_votes", "up_votes"]), num_examples=10)

Unnamed: 0,sentence
0,akakuubagano kavuddemu obwa kkondo n'obubbi.
1,kino omulamuzi akigaanye n'amulagira okujjanjabirwa e kitalya.
2,obungi bw'abantu mu uganda bwenkana ki?
3,omwami amosi yagula emmeeza n'entebe.
4,olulagala olusooka ku ndu nga teruneeyanjuluza.
5,agumizza abataka ku mbeera y'omutanda gy'alimu ennaku zino eyeeyagaza bulungi.
6,kale babalabula bwebatyo baleme okwokebwa abalongo era baba balina okuyimba ebigambo ebikusike.
7,ssuuna yasiima ekitiibwa katikkiro we ky'amuwadde okumuyita ssemunywa.
8,yayongeddeko nti aba ng'omuti ogutalina mirandira.
9,lwaki abaana abasinga basoma bakyali bato?


In [86]:
def extract_all_chars(batch):
  """Join all sentences of a batch with spaces and list the distinct characters.

  Returns a dict with single-element lists: ``vocab`` holds the list of unique
  characters (in no particular order) and ``all_text`` the joined text.
  """
  joined = " ".join(batch["sentence"])
  unique_chars = list({*joined})
  return {"vocab": [unique_chars], "all_text": [joined]}

# Collect the characters used in the test-split transcripts, replacing all
# existing columns with the extractor's output.
# NOTE(review): this cell's execution count (86) is later than the punctuation
# cleaning cells that appear further down (82-85), so the recorded vocabulary
# reflects already-cleaned text; a top-to-bottom run would see uncleaned text here.
vocab_test = lg_cv_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=lg_cv_test.column_names)

Map:   0%|          | 0/13431 [00:00<?, ? examples/s]

In [87]:
# De-duplicate the extracted characters, then number them in sorted order.
vocab_list = list(set(vocab_test["vocab"][0]))

vocab_dict = dict(zip(sorted(vocab_list), range(len(vocab_list))))
vocab_dict

{' ': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 'ŋ': 27,
 '’': 28}

In [88]:
# Give the unknown and padding tokens the next free ids. setdefault leaves
# existing entries alone, so re-running this cell does not hand out new ids
# (plain assignment would move [UNK] and [PAD] on a second run).
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
len(vocab_dict)

31

In [89]:
# Re-key the space character as "|" (the word delimiter passed to the tokenizer
# below), keeping its id. The guard makes the cell safe to re-run: once the
# space entry has been moved there is nothing left to rename.
if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")

In [90]:
vocab_dict

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 'ŋ': 27,
 '’': 28,
 '[UNK]': 29,
 '[PAD]': 30,
 '|': 0}

In [82]:
import re
# Character class of punctuation/symbols to strip from transcripts. Written as a
# raw string so the backslashes reach the regex engine unchanged instead of being
# treated as (invalid) Python string escapes.
chars_to_remove_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\»\«\(\)\½\–\‟\…\῎]'

def remove_special_characters(batch):
    """Strip the characters in ``chars_to_remove_regex`` from ``sentence`` and lower-case it.

    The example is modified in place and returned.
    """
    # remove special characters
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()

    return batch

In [83]:
lg_cv_test = lg_cv_test.map(remove_special_characters)

Map:   0%|          | 0/13431 [00:00<?, ? examples/s]

In [84]:
def replace_hat_char(batch):
    """Replace every character matched by the ``[ñ]`` pattern in ``sentence`` with ``n``.

    The example is modified in place and returned.
    """
    cleaned = re.sub('[ñ]', 'n', batch["sentence"])
    batch["sentence"] = cleaned
    return batch

In [85]:
lg_cv_test = lg_cv_test.map(replace_hat_char)

Map:   0%|          | 0/13431 [00:00<?, ? examples/s]

In [46]:
import json
# Serialise the vocabulary mapping and write it to luganda_vocab.json.
with open('luganda_vocab.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

In [47]:
# Load the vocabulary into an instance of the Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./luganda_vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")



In [66]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [67]:
# Wrap the feature extractor and tokenizer in the Wav2Vec2Processor
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [96]:
lg_cv_test = lg_cv_test.cast_column("audio", Audio(sampling_rate=16_000))

In [51]:
lg_cv_test_new[0]["audio"]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/3e120f4c8a625d3e2d17ebe719146f4771511bdb698994635ae15bce330b33c9/lg_test_0/common_voice_lg_27079826.mp3',
 'array': array([-6.91215973e-11, -4.36557457e-11, -1.01863407e-10, ...,
         2.17164434e-05,  3.42857093e-05,  3.66064560e-05]),
 'sampling_rate': 16000}

In [91]:
import IPython.display as ipd
import numpy as np
import random

# Pick one random row of lg_cv_test, print its transcript and play its audio.
rand_int = random.randrange(len(lg_cv_test))

print(lg_cv_test[rand_int]["sentence"])
ipd.Audio(data=lg_cv_test[rand_int]["audio"]["array"], autoplay=True, rate=16000)

ezinunula omuganda w’akaco kano


In [97]:
# Draw the index from the dataset that is actually indexed below. It was
# previously drawn from len(lg_cv_test_new), so only part of lg_cv_test could
# ever be sampled.
rand_int = random.randrange(len(lg_cv_test))
# Fetch the row once instead of re-reading it for every print.
sample = lg_cv_test[rand_int]

print("Target text:", sample["sentence"])
print("Input array shape:", sample["audio"]["array"].shape)
print("Sampling rate:", sample["audio"]["sampling_rate"])

Target text: omanyi kye bayita enkumbi terimba
Input array shape: (67968,)
Sampling rate: 16000


In [100]:
def prepare_dataset(batch):
    """Turn one example into model inputs and label ids using the global ``processor``.

    Adds three fields to the example and returns it:
      * ``input_features``: first entry of the processor's ``input_values`` for the audio array
      * ``input_length``: length of ``input_features``
      * ``labels``: ``input_ids`` the processor produces for the ``sentence`` text
    """
    waveform = batch["audio"]
    encoded = processor(waveform["array"], sampling_rate=waveform["sampling_rate"])
    batch["input_features"] = encoded.input_values[0]
    batch["input_length"] = len(batch["input_features"])

    encoded_text = processor(text=batch["sentence"])
    batch["labels"] = encoded_text.input_ids
    return batch

In [101]:
lg_cv_test = lg_cv_test.map(prepare_dataset, remove_columns=lg_cv_test.column_names)

Map:   0%|          | 0/13431 [00:00<?, ? examples/s]

In [56]:
# lg_cv_test_new.push_to_hub('dmusingu/luganda-test-data', private = True)

In [15]:
# lg_cv_test_new = load_dataset('dmusingu/luganda-test-data', split='train')

In [102]:
print(lg_cv_test)

Dataset({
    features: ['input_features', 'input_length', 'labels'],
    num_rows: 13431
})


In [103]:
import evaluate 

cer_metric = evaluate.load('cer')
wer_metric = evaluate.load('wer')

In [104]:
def compute_metrics(pred):
    """Compute WER and CER for a batch of predictions.

    Args:
        pred: Object exposing ``predictions`` (per-step logits) and ``label_ids``
            (reference token ids in which -100 marks positions to be replaced
            by the pad token before decoding).

    Returns:
        dict with keys ``"wer"`` and ``"cer"``, as returned by the global
        ``wer_metric`` and ``cer_metric``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Substitute the pad id for -100 on a copy, so the caller's label array is
    # left untouched (the previous in-place assignment modified pred.label_ids).
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer, "cer": cer}

In [105]:
from transformers import Wav2Vec2ForCTC

# Load the XLS-R 300M checkpoint with a CTC head sized to the tokenizer vocabulary.
# NOTE(review): the load log below reports lm_head weights as newly initialized,
# so predictions come from an untrained output layer; the section header also
# says "Wav2Vec-BERT" while this is an XLS-R checkpoint — confirm the intended model.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [106]:
import torch

In [107]:
model = model.to('cuda') 

In [111]:
# Evaluation is carried out with a batch size of 1
def map_to_result(batch):
    """Transcribe one prepared example with the global ``model`` and score it.

    Reads ``batch["input_features"]`` and ``batch["labels"]``; adds the fields
    ``pred_str`` (decoded argmax prediction), ``text`` (decoded reference),
    ``wer`` and ``cer`` (metrics for this single example), then returns the example.
    """
    model.eval()
    # Place the input on whatever device the model lives on instead of assuming
    # "cuda", so the function also works when the model is on CPU or another GPU.
    device = next(model.parameters()).device
    with torch.no_grad():
        input_values = torch.tensor(batch["input_features"], device=device).unsqueeze(0)
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)
    batch['wer']  = wer_metric.compute(predictions=[batch["pred_str"]], references=[batch["text"]])
    batch['cer']  = cer_metric.compute(predictions=[batch["pred_str"]], references=[batch["text"]])

    return batch

results = lg_cv_test.map(map_to_result, remove_columns=lg_cv_test.column_names)

Map:   0%|          | 0/13431 [00:00<?, ? examples/s]

In [25]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

# NOTE(review): this repeats the show_random_elements definition from earlier in
# the notebook; the later definition silently replaces the earlier one.
def show_random_elements(dataset, num_examples=10):
    """Display a random selection of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object that supports ``len()`` and indexing with a list of row
            positions, returning something ``pd.DataFrame`` accepts.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no retry-on-duplicate loop is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [112]:
show_random_elements(results)

Unnamed: 0,pred_str,text,wer,cer
0,catatatatatatatatatatakatatatatatatatakatatktatatatatas,akaami akatono okanyoomera mitala wa mugga,1.0,0.97619
1,cakazkakzakakatagakakakakac,tubeera tutya nekizimbe kino naye nga kiri mu mbeera embi,1.0,0.824561
2,ceatatatatatatatatatatatatatatatatatatkatatatatatatatatatatatatakatatatatatktatatatatatatatatac,n’atwalibwa muganzi we okukyakalako mu kafo akamu mu nakawa,1.0,1.322034
3,caksaktatatatatakatatatata[UNK]takataktatatatakatatakatatatatakatakatatakata,zino yaziwa akakiiko kebyokulonda ngagenda okwewandiisa,1.0,1.145455
4,ctatctktatatatktaktktkatakatataktatatktatatatatatatakatatatatatktata,yamatiza abasube nti betaaga okubaako kye bakolawo ku mbeera yonutali bwenkanya,1.0,0.772152
5,cakatatatakatajatac,ngomukazi mmanyi okubonaabona bamaama kwe bayitamu nga bazaala,1.0,0.806452
6,catatatatatatatatatatatatatatatatatatatatatkatatatatatatatatatatatktatatatatatatatatatatatatatatatatatatatatatakatatatatata,yayiga natunganga bulungi nnyo enkuufiira ezamaaso ezaayitibwanga entalabuusi,1.0,1.350649
7,cakakakakakakakakakotakatatatztzaktzkatakikatkakzatakatatakakatakac,osobola okuzuula kyonna kyoyagala,1.0,1.69697
8,cacakaoazakakzdkakakazakac,ye katonda webiseera byonna,1.0,0.851852
9,caktaktak[UNK]tazkaktazataktazstakakakakakakatakakac,akaseera kakazigizigi kaali keeraliikiriza,1.0,0.952381


In [None]:
# (Stray scratch input removed: it referenced an undefined name and would raise NameError on Run All.)