# Finetune Wav2Vec2-BERT for Luganda ASR with 🤗 Transformers


Adapted from [Fine-Tune W2V2-Bert for low-resource ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-w2v2-bert) and [Boosting Wav2Vec2 with n-grams in 🤗 Transformers](https://huggingface.co/blog/wav2vec2-with-ngram)

In [None]:
# Check for GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print("Not connected to a GPU")
else:
    print(gpu_info)

#### Install Packages
We install `datasets`, `transformers` and `accelerate` for model training, `torchaudio` to load audio files, and `jiwer` to evaluate the model using word error rate (WER) and character error rate (CER).

In [2]:
%%capture
!pip install datasets==2.17.0
!pip install transformers==4.48.3
!pip install torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!pip install jiwer # jiwer is used for evaluation using WER and CER
!pip install accelerate -U # Restart runtime after running this cell
!pip install wandb
!pip install soundfile
!pip install librosa
!pip install matplotlib
!pip install evaluate
!pip install seaborn
!pip install pyctcdecode 
!pip install kenlm

In [2]:
import numpy as np
import warnings

warnings.filterwarnings('ignore')

#### Huggingface login

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Install Git-LFS to support uploading model weights to huggingface

In [4]:
%%capture
!apt install git-lfs

### Prepare Data, Tokenizer, Feature Extractor
Wav2Vec2-BERT uses the Wav2Vec2CTCTokenizer and the  SeamlessM4TFeatureExtractor to process the inputs of the model

#### Create Wav2Vec2CTCTokenizer

In [5]:
from datasets import load_dataset, Audio, Dataset

In [None]:
# You need to log in on HuggingFace and accept the terms and conditions of the Mozilla Foundation common voice dataset
lg_cv_train = load_dataset("mozilla-foundation/common_voice_7_0", "lg", split="train+validation", trust_remote_code=True)
lg_cv_test = load_dataset("mozilla-foundation/common_voice_7_0", "lg", split="test", trust_remote_code=True)

In [None]:
# Print the dataset to view a summary of the dataset
print(lg_cv_train)
print(lg_cv_test)

In [7]:
# View a sample from the dataset
lg_cv_train[0]['audio']

In [9]:
# # Compute the duration of the audio files
import librosa

def compute_duration(batch):
    """Attach the clip length, in seconds, to a single dataset row.

    Args:
        batch: one row whose ``audio`` entry holds the decoded waveform
            (``array``) and its ``sampling_rate``.

    Returns:
        The same row with a ``duration`` field added.
    """
    audio = batch['audio']
    # Stored under `duration` (not `length`): the min/max checks, select_columns,
    # the cumulative-hours count and prepare_dataset all read that key.
    batch['duration'] = librosa.get_duration(y=audio['array'], sr=audio['sampling_rate'])
    return batch


lg_cv_train = lg_cv_train.map(compute_duration, keep_in_memory= False,num_proc=8)
lg_cv_test  = lg_cv_test.map(compute_duration, keep_in_memory= False,num_proc=8)

In [None]:
min(lg_cv_train['duration']), max(lg_cv_train['duration'])

In [None]:
min(lg_cv_test['duration']), max(lg_cv_test['duration'])

In [91]:
# Common Voice names its transcript column `sentence`, while the rest of this
# notebook reads `Transcriptions`. Rename only when that is the situation, so a
# dataset that already has `Transcriptions` is left untouched.
if 'Transcriptions' not in lg_cv_train.column_names and 'sentence' in lg_cv_train.column_names:
    lg_cv_train = lg_cv_train.rename_column('sentence', 'Transcriptions')
if 'Transcriptions' not in lg_cv_test.column_names and 'sentence' in lg_cv_test.column_names:
    lg_cv_test = lg_cv_test.rename_column('sentence', 'Transcriptions')

# Keep only the columns the following cells use
lg_cv_train = lg_cv_train.select_columns(['audio', 'Transcriptions', 'duration'])
lg_cv_test  = lg_cv_test.select_columns(['audio', 'Transcriptions', 'duration'])

In [92]:
# Determine the number of samples required to make up a certain number of hours
# The duration is calculated in seconds
cumsum= np.cumsum(lg_cv_train['duration'])
cumsum[cumsum<3_600].shape

(183,)

#### Display

In [94]:
# Show samples from the dataset
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML


def show_random_elements(dataset, num_examples= 10):
    """Render a random selection of distinct dataset rows as an HTML table.

    Args:
        dataset: object supporting ``len()`` and indexing with a list of row
            indices (the result is passed to ``pd.DataFrame``).
        num_examples: how many distinct rows to display.

    Raises:
        AssertionError: if ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no retry loop is needed to
    # keep the indices unique.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(lg_cv_train.remove_columns(['audio']))

## Preprocess Transcripts

In [96]:
# normalize the transcripts. We are not training an orthographic model
def normalize(batch):
    """Add a lower-cased copy of the transcript to the row.

    Reads ``Transcriptions`` and writes the result to ``transcription``;
    the row is modified in place and returned.
    """
    raw_text = batch['Transcriptions']
    lowered = raw_text.lower()
    batch['transcription'] = lowered
    return batch

lg_cv_train    = lg_cv_train.map(normalize)
lg_cv_test     = lg_cv_test.map(normalize)

In [97]:
# Obtain the vocabulary from the dataset
def extract_all_chars(batch):
    """Collect the distinct characters used across a batch of transcripts.

    All ``transcription`` strings are joined with single spaces; the result
    holds that joined text and the list of unique characters in it, each
    wrapped in a one-element list.
    """
    joined_text = " ".join(batch["transcription"])
    unique_chars = [*set(joined_text)]
    return {"vocab": [unique_chars], "all_text": [joined_text]}

In [None]:
luganda_train_vocab   = lg_cv_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=lg_cv_train.column_names)
luganda_test_vocab    = lg_cv_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=lg_cv_test.column_names)

In [26]:
vocab_list = list(set(luganda_train_vocab["vocab"][0]) | set(luganda_test_vocab["vocab"][0]))

In [None]:
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}
vocab_dict

In [None]:
luganda_alphabet_list = ['b', 'p', 'v', 'f', 'm', 'd', 't', 'l', 'r', 'n', 'z', 's', 'j', 'c', 'g', 'k', 'ny', 'ŋ', 'a', 'e', 'i', 'o', 'u', 'w', 'y']
luganda_alphabet = ", ".join(luganda_alphabet_list)
print(luganda_alphabet)

In [None]:
luganda_alphabet = luganda_alphabet.split(",")
luganda_alphabet = [char.lstrip() for char in luganda_alphabet]
print(luganda_alphabet)

In [55]:
# Pick out the vocabulary entries that are numeric characters
vocab_keys = vocab_dict.keys()
digits = [key for key in vocab_keys if key.isnumeric()]

In [None]:
digits

In [None]:
allowed_characters = digits + luganda_alphabet
print(allowed_characters)

In [None]:
characters_to_remove = [char for char in vocab_dict.keys() if char not in allowed_characters]
print(characters_to_remove)

In [None]:
characters_for_substitution = [char for char in characters_to_remove if char.isalnum()]
print(characters_for_substitution)

In [None]:
characters_for_replacement = [char for char in characters_to_remove if char not in characters_for_substitution]
print(characters_for_replacement)

In [None]:
import re

# Escape every character individually. Joining with "\\" only put a backslash
# *between* characters, leaving the first one unescaped.
characters_for_replacement_regex = "".join(re.escape(char) for char in characters_for_replacement)
print(characters_for_replacement_regex)

## Build Model

In [28]:
# Swap the space entry for "|" so the word delimiter is a visible character;
# "|" takes over the id the space had.
vocab_dict["|"] = vocab_dict.pop(" ")

In [29]:
# Add an "unknown" token so that the model can later deal
# with characters not encountered in Common Voice's training set
# Add a padding token that corresponds to CTC blank token
vocab_dict['[UNK]'] = len(vocab_dict)
vocab_dict['[PAD]'] = len(vocab_dict)
len(vocab_dict)

34

In [30]:
# save the vocabulary as a json file
import json
with open('w2v2_bert_luganda_vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [31]:
# Load the vocabulary into an instance of the Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./w2v2_bert_luganda_vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [None]:
# Save the tokenizer to HuggingFace
tokenizer.push_to_hub('username/repo', private =True)

#### Create SeamlessM4TFeatureExtractor

In [33]:
from transformers import SeamlessM4TFeatureExtractor

feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")

In [None]:
# Replace the username and repo with your HuggingFace username and name for your repo
repo_name = 'username/repo'

from transformers import Wav2Vec2BertProcessor

processor = Wav2Vec2BertProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.push_to_hub(repo_name, private=True)

In [36]:
# Resample all the audios to 16kHz
lg_cv_train = lg_cv_train.cast_column('audio', Audio(16000))
lg_cv_test  = lg_cv_test.cast_column('audio', Audio(16000)) 

In [None]:
# Play an audio sample from the dataset
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(lg_cv_train)-1)
print(lg_cv_train[rand_int]["transcription"])
ipd.Audio(data=np.asarray(lg_cv_train[rand_int]["audio"]["array"]), autoplay=False, rate=16000)

Load and resample the audio data simply by accessing batch["audio"]. Extract the input_features from the loaded audio. Rather than using the raw waveform, Wav2Vec2BertProcessor creates a more complex representation known as log-mel features. Finally, encode the transcriptions to label ids.

In [38]:
def prepare_dataset(batch):
    """Convert one raw row into the fields used for training.

    Reads ``audio``, ``duration`` and ``transcription`` and adds
    ``input_features`` (from the processor), ``length`` (copy of the
    duration) and ``labels`` (token ids of the transcription).
    """
    clip = batch["audio"]
    features = processor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = features.input_features[0]

    # Kept under `length` as well, alongside the model inputs
    batch["length"] = batch["duration"]

    encoded_text = processor(text=batch["transcription"])
    batch["labels"] = encoded_text.input_ids
    return batch

In [None]:
lg_cv_train = lg_cv_train.map(prepare_dataset, num_proc=1, remove_columns=lg_cv_train.column_names)
lg_cv_test  = lg_cv_test.map(prepare_dataset, num_proc=1, remove_columns=lg_cv_test.column_names)

In [40]:
# Check the total duration of the training dataset
sum(lg_cv_train['length'])/3600

1.0016150694444446

### Training

Create a special collate function that pads the input features and the labels dynamically to the longest sample in each batch. Inputs and labels are padded separately because they have different lengths and need different padding methods.

#### Set-up Trainer

In [41]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Collate rows into a padded batch for CTC training.

    Inputs and labels are padded in two separate ``processor.pad`` calls,
    and padded label positions are set to -100.
    """

    # Processor whose `pad` method is used for both inputs and labels
    processor: Wav2Vec2BertProcessor
    # Forwarded unchanged to `processor.pad` as its `padding` argument
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Inputs and labels have different lengths and need different
        # padding methods, so they are separated first.
        inputs_to_pad = []
        labels_to_pad = []
        for sample in features:
            inputs_to_pad.append({"input_features": sample["input_features"]})
            labels_to_pad.append({"input_ids": sample["labels"]})

        batch = self.processor.pad(inputs_to_pad, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=labels_to_pad, padding=self.padding, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding; mark them
        # with -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [42]:
# Define the wer and cer metrics
import evaluate

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

In [43]:
# Function used to compute the WER and CER
def compute_metrics(pred):
    """Compute word and character error rate for a set of predictions.

    Args:
        pred: object exposing ``predictions`` (logits) and ``label_ids``.

    Returns:
        Dict with the keys ``wer`` and ``cer``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # The data collator marks padded label positions with -100; map them back
    # to the pad token id before decoding. np.where returns a new array, so the
    # caller's `label_ids` is no longer modified in place.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer, "cer": cer}

In [None]:
# Load the model for finetuning
from transformers import Wav2Vec2BertForCTC

model = Wav2Vec2BertForCTC.from_pretrained(
    "facebook/w2v-bert-2.0",
    attention_dropout=0.05,
    hidden_dropout=0.05,
    feat_proj_dropout=0.05,
    mask_time_prob=0.05,
    layerdrop=0.0,
    ctc_loss_reduction="mean",
    add_adapter=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

In [45]:
model.config.ctc_zero_infinity = True # Deals with some issues with HuggingFace

#### Wandb for Logging and Monitoring

In [None]:
import wandb

wandb.login()

In [None]:
# Wandb arguments
%env WANDB_LOG_MODEL=end
%env WANDB_PROJECT=ASR Africa
%env WANDB_WATCH=all
%env WANDB_SILENT=true

In [57]:
# Create a training config
# Run metadata. The values mirror what this notebook actually uses: the Luganda
# Common Voice split loaded above, the roughly one hour of training audio
# reported after preprocessing, and the per-device batch size and epoch count
# passed to TrainingArguments.
config = {
    "batch_size" : 16,
    "dataset": "mozilla-foundation/common_voice_7_0 (lg)",
    "Epochs": 50,
    "Model": "w2v-bert-2.0",
    "hours": 1,
    "repo": "luganda-cv-w2v2-bert"
}

In [48]:
# Define the training arguments
# use the group_by_length argument to make training more efficient

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=f"./{config['repo']}",
    group_by_length=True,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2, # effective batch size per device: 16 x 2 = 32
    per_device_eval_batch_size=8,
    eval_strategy="steps", # replaces the deprecated `evaluation_strategy` argument
    save_strategy ="steps",
    num_train_epochs=50,
    bf16=True, # mixed precision training
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    learning_rate=3e-4,
    warmup_ratio=0.1,
    save_total_limit=2,
    push_to_hub=True,
    gradient_checkpointing=True,
    report_to="wandb",
    run_name=f"{config['repo']}",
    load_best_model_at_end=True,
    metric_for_best_model = "wer",  
    greater_is_better=False, # lower WER is better
    hub_private_repo = True,
    torch_compile = True,
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=2,
    hub_model_id=f"{config['repo']}",
    )

In [49]:
# Pass the model, the training arguments and the data collator to the Trainer

from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=lg_cv_train,
    eval_dataset=lg_cv_test,
    # `tokenizer=` is deprecated in the transformers release pinned by this
    # notebook; `processing_class` is its replacement.
    processing_class=processor.feature_extractor,
)

In [None]:
trainer.train()
wandb.finish()
# The destination repo is already set through `hub_model_id` in the training
# arguments. Trainer.push_to_hub's first argument is a commit message, not a
# repo id, and `config` has no 'config' key, so no argument is passed here.
trainer.push_to_hub()

## Test the model

In [None]:
# Replace `username` with your HuggingFace username
model = Wav2Vec2BertForCTC.from_pretrained(
    f"username/{config['repo']}",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer) 
)

In [56]:
model = model.to('cuda')

In [57]:
input_dict = lg_cv_test[0]

# Inference only: skip autograd bookkeeping, as map_to_result does below
with torch.no_grad():
    logits = model(torch.tensor(input_dict["input_features"]).to("cuda").unsqueeze(0)).logits

pred_ids = torch.argmax(logits, dim=-1)[0]

In [None]:
processor.decode(pred_ids)

In [None]:
processor.decode(input_dict["labels"]).lower()

In [60]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [61]:
model = model.to('cuda')

In [None]:
# Evaluation is carried out with a batch size of 1
def map_to_result(batch):
    """Run the model on one row and store the decoded prediction and reference.

    Adds ``pred_str`` (argmax decoding of the logits) and ``text`` (the
    labels decoded without grouping tokens) to the row and returns it.
    """
    features = torch.tensor(batch["input_features"], device="cuda")
    with torch.no_grad():
        # unsqueeze adds the leading batch dimension of size one
        logits = model(features.unsqueeze(0)).logits

    best_ids = torch.argmax(logits, dim=-1)
    decoded = processor.batch_decode(best_ids)
    batch["pred_str"] = decoded[0]
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)

    return batch

results = lg_cv_test.map(map_to_result, remove_columns=lg_cv_test.column_names)

In [None]:
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))

In [None]:
print("Test CER: {:.3f}".format(cer_metric.compute(predictions=results["pred_str"], references=results["text"])))

In [65]:
def calculate_wer_cer(batch):
    """Add per-row ``WER`` and ``CER`` comparing ``pred_str`` against ``text``."""
    reference = [batch['text']]
    hypothesis = [batch['pred_str']]
    for column, metric in (("WER", wer_metric), ("CER", cer_metric)):
        batch[column] = metric.compute(references=reference, predictions=hypothesis)
    return batch

In [None]:
results = results.map(calculate_wer_cer, num_proc=8)

In [None]:
show_random_elements(results)

In [68]:
df = results.to_pandas()

In [69]:
# Save the results to a csv file
import os

# to_csv does not create missing folders, so make sure the target exists
os.makedirs('predictions', exist_ok=True)
# File name now says Luganda; the earlier "Akan" label did not match this notebook
df.to_csv('predictions/w2v2_bert-Luganda-1-hours.csv')

#### Boosting the model with a language model

In [None]:
# Use the transcriptions of the training dataset to train an n-gram lm
# Install the Ubuntu library prerequisites
!sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev

In [None]:
# Download and unpack the KenLM repo
!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz

In [None]:
!mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2
!ls kenlm/build/bin

In [None]:
from datasets import load_dataset

username = "hf-test"  # change to your username

# Same dataset, config and loading options as the training cells
dataset = load_dataset("mozilla-foundation/common_voice_7_0", "lg", split="train", trust_remote_code=True)

# Common Voice stores the transcript under `sentence`; fall back to `text` for
# datasets that use that name instead.
text_column = "sentence" if "sentence" in dataset.column_names else "text"

# Lower-case the corpus so the language model sees the same casing as the
# transcripts the acoustic model was trained on. utf-8 is set explicitly
# because the Luganda alphabet includes non-ASCII characters such as "ŋ".
with open("text.txt", "w", encoding="utf-8") as file:
    file.write(" ".join(dataset[text_column]).lower())


In [None]:
!kenlm/build/bin/lmplz -o 5 <"text.txt" > "5gram.arpa"

In [71]:
# Copy the ARPA file, adding a "</s>" unigram: the first line containing "<s>"
# is duplicated with "<s>" replaced by "</s>", and the 1-gram count in the
# header is increased by one to match.
with open("5gram.arpa", "r", encoding="utf-8") as read_file, open("5gram_correct.arpa", "w", encoding="utf-8") as write_file:
    has_added_eos = False
    for line in read_file:
        if not has_added_eos and "ngram 1=" in line:
            # Rewrite only the number after "=". A plain str.replace on the
            # count could also hit the "1" in "ngram 1=".
            prefix, _, count = line.strip().rpartition("=")
            write_file.write(f"{prefix}={int(count) + 1}\n")
        elif not has_added_eos and "<s>" in line:
            write_file.write(line)
            write_file.write(line.replace("<s>", "</s>"))
            has_added_eos = True
        else:
            write_file.write(line)

In [None]:
!tail 5gram_correct.arpa

In [None]:
# Load the model 

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(f"username/{config['repo']}")

In [73]:
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}

In [74]:
len(sorted_vocab_dict)

36

In [None]:
from pyctcdecode import build_ctcdecoder

decoder = build_ctcdecoder(
    labels=list(sorted_vocab_dict.keys()),
    kenlm_model_path="5gram_correct.arpa",
)

In [76]:
from transformers import Wav2Vec2ProcessorWithLM

processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder
)

In [None]:
from huggingface_hub import Repository

repo = Repository(local_dir=f"username/{config['repo']}", clone_from=f"username/{config['repo']}")

In [None]:
# Save the n-gram lm to a different branch of the repo

repo = Repository(local_dir=f"username/{config['repo']}", revision= 'language-model')

In [79]:
processor_with_lm.save_pretrained(f"username/{config['repo']}")

In [None]:
!tree -h username/repo

In [None]:
repo.push_to_hub(commit_message="Upload lm-boosted decoder")

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2BertForCTC
from transformers import Wav2Vec2ProcessorWithLM

processor = Wav2Vec2ProcessorWithLM.from_pretrained(f"username/{config['repo']}")

In [83]:
device = "cuda"

In [84]:
model = Wav2Vec2BertForCTC.from_pretrained(f"username/{config['repo']}")

In [85]:
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import tqdm
import torch

In [86]:
asr = pipeline("automatic-speech-recognition", model=model, device=device, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, decoder = processor.decoder) #  

In [87]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run inference on the test dataset
# NOTE(review): this cell reads the raw `audio` column, and the metric cells
# below read `transcription`. Both were dropped from lg_cv_test when
# prepare_dataset was mapped with remove_columns, so the test split has to be
# reloaded, normalized and resampled before this cell is run.
predictions = [prediction['text'] for prediction in tqdm.tqdm(asr(KeyDataset(lg_cv_test, "audio"), batch_size=16),\
                                                              desc= "Running inference", total= lg_cv_test.num_rows)]

In [99]:
# Add the predictions to the test dataset
lg_cv_test = lg_cv_test.add_column("predictions", predictions)

In [None]:
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=lg_cv_test["predictions"], references=lg_cv_test["transcription"])))

In [None]:
print("Test CER: {:.3f}".format(cer_metric.compute(predictions=lg_cv_test["predictions"], references=lg_cv_test["transcription"])))

In [102]:
# Calculate the WER and CER 
def calculate_wer_cer(batch):
    """Add per-row ``wer`` and ``cer`` comparing ``predictions`` against ``transcription``."""
    truth = [batch["transcription"]]
    guess = [batch["predictions"]]
    scores = {
        'wer': wer_metric.compute(references=truth, predictions=guess),
        'cer': cer_metric.compute(references=truth, predictions=guess),
    }
    for column, value in scores.items():
        batch[column] = value
    return batch

In [None]:
lg_cv_test = lg_cv_test.map(calculate_wer_cer, num_proc=9)

In [None]:
show_random_elements(lg_cv_test.remove_columns(['audio']), 5)

In [106]:
# save the results to a csv file
import os

# to_csv does not create missing folders, so make sure the target exists
os.makedirs('predictions', exist_ok=True)
df = lg_cv_test.remove_columns(['audio']).to_pandas()
df.to_csv('predictions/results.csv', index =False)