<a href="https://colab.research.google.com/github/Moses05/lingala-english-asr/blob/main/lingala_english_asr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Table of Contents

In [None]:
%%capture
!pip install datasets

In [None]:
import os

# Clone the dataset/code repository only when the directory is not already
# present, so re-running this cell does not attempt a second clone.
if not (os.path.exists("lingala-english-asr")):
  !git clone https://github.com/Moses05/lingala-english-asr.git

Cloning into 'lingala-english-asr'...
remote: Enumerating objects: 2981, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 2981 (delta 0), reused 3 (delta 0), pack-reused 2975 (from 1)[K
Receiving objects: 100% (2981/2981), 432.52 MiB | 15.58 MiB/s, done.
Resolving deltas: 100% (11/11), done.
Updating files: 100% (2967/2967), done.


lingala-english-asr  savedModel  wandb			      wav2vec2-large-xlsr-lingala2
sample_data	     vocab.json  wav2vec2-large-xlsr-lingala


In [None]:
# Root of the Lingala data inside the cloned repository.
base_path = "lingala-english-asr/LRSC/lingala"

# Training split: directory of audio files and its transcript file.
train_audio_path = f"{base_path}/train/audio"
train_transcript_path = f"{base_path}/train/transcript.txt"

# Validation split: directory of audio files and its transcript file.
valid_audio_path = f"{base_path}/valid/audio"
valid_transcript_path = f"{base_path}/valid/transcript.txt"

In [None]:
# Directory holding the manifest files for both splits.
manifest_path = f"{base_path}/manifest"

# Character dictionary; read further down as "<symbol> <index>" pairs to build the vocabulary.
dict_txt = f"{manifest_path}/dict.ltr.txt"

# Training manifests. The .tsv (audio path + duration) and .ltr (letter-level
# transcription) files are consumed by load_manifest_data below.
# NOTE(review): the .wrd file is presumably the word-level transcription; it is
# not read anywhere in the visible cells.
train_letter = f"{manifest_path}/train.ltr"
train_tsv = f"{manifest_path}/train.tsv"
train_word = f"{manifest_path}/train.wrd"

# Validation manifests, same layout as the training ones.
valid_letter = f"{manifest_path}/valid.ltr"
valid_tsv = f"{manifest_path}/valid.tsv"
valid_word = f"{manifest_path}/valid.wrd"

In [None]:
import wave

def check_sample_rate(dir, expected_rate=16000):
  """Return the paths of the WAV files in ``dir`` whose sample rate differs from ``expected_rate``.

  Args:
    dir: Directory to scan. Only regular files directly inside it are opened;
      sub-directories are ignored.
    expected_rate: Sample rate in Hz every file is expected to have
      (defaults to 16000).

  Returns:
    A list of file paths (``f"{dir}/{name}"``), in sorted file-name order,
    for every file whose frame rate is not ``expected_rate``.

  Raises:
    wave.Error: If a regular file in ``dir`` is not a readable WAV file.
  """
  wrongFramerate = []

  # Sort so the report does not depend on the OS directory listing order.
  for wav in sorted(os.listdir(dir)):
    path = f"{dir}/{wav}"

    if os.path.isfile(path):
      with wave.open(path, "rb") as wav_file:
        if wav_file.getframerate() != expected_rate:
          # Record the path, not the reader object: the reader is closed as
          # soon as the `with` block exits and does not identify the file.
          wrongFramerate.append(path)

  return wrongFramerate


# Scan both splits for audio whose sample rate is not 16000 Hz (the rate the
# feature extractor below is configured for) and print what was found.
# An empty list means every file in that split passed the check.
train_wrongSample = check_sample_rate(train_audio_path)
valid_wrongSample = check_sample_rate(valid_audio_path)

print(f"list of train audio files not 16000hz: {train_wrongSample}")
print(f"list of valid audio files not 16000hz: {valid_wrongSample}")

list of train audio files not 16000hz: []
list of valid audio files not 16000hz: []


In [None]:
from datasets import load_dataset, Dataset

def load_manifest_data(tsv_file, ltr_file, audio_path):
    """Build a ``Dataset`` with ``path``, ``duration`` and ``text`` columns from a manifest pair.

    Args:
        tsv_file: Tab-separated manifest. The first line is a header and is
            skipped; every other line must be ``<relative path>\t<duration>``.
        ltr_file: Text file with one transcription per line.
        audio_path: Directory prepended to every relative path from ``tsv_file``.

    Returns:
        ``Dataset.from_dict`` of the three columns. ``duration`` is kept as the
        string read from the manifest.

    Raises:
        ValueError: If the number of valid manifest rows and the number of
            transcriptions differ, since the columns could not be paired.
    """
    # Transcriptions contain non-ASCII letters, so never rely on the platform's
    # default encoding.
    with open(tsv_file, "r", encoding="utf-8") as tsv_f:
        rows = tsv_f.readlines()[1:]  # Skip header

    with open(ltr_file, "r", encoding="utf-8") as ltr_f:
        transcriptions = [trans.strip() for trans in ltr_f.readlines()]

    # When both files have the same number of lines they are line-aligned, so a
    # malformed manifest row must drop its transcription too; otherwise every
    # later row would be paired with the wrong text.
    aligned = len(rows) == len(transcriptions)

    data = {
        "path": [],
        "duration": [],
        "text": [],
    }

    for row_idx, line in enumerate(rows):
        parts = line.strip().split("\t")

        # Ensure two columns path and duration
        if len(parts) != 2:
            print(f"skipping malformed line: {line}")
            continue

        path, duration = parts
        data["path"].append(os.path.join(audio_path, path))  # prepend base path
        data["duration"].append(duration)
        if aligned:
            data["text"].append(transcriptions[row_idx])

    if not aligned:
        # Line counts differ (e.g. a trailing blank manifest line): use the
        # transcriptions as they are and validate the pairing below.
        data["text"] = transcriptions

    if len(data["path"]) != len(data["text"]):
        raise ValueError(
            f"{tsv_file} has {len(data['path'])} valid rows but "
            f"{ltr_file} has {len(data['text'])} transcriptions"
        )

    return Dataset.from_dict(data)

# Load the training and validation datasets from their manifest (.tsv) and
# letter-transcription (.ltr) files; audio paths are resolved against each
# split's audio directory.
train_dataset = load_manifest_data(train_tsv, train_letter, train_audio_path)
valid_dataset = load_manifest_data(valid_tsv, valid_letter, valid_audio_path)

In [None]:
# Print each split's Dataset repr (column names and number of rows).
print(f"train dataset \n{train_dataset} \n\nvalid dataset \n{valid_dataset}")

train dataset 
Dataset({
    features: ['path', 'duration', 'text'],
    num_rows: 2557
}) 

valid dataset 
Dataset({
    features: ['path', 'duration', 'text'],
    num_rows: 383
})


In [None]:
import random
import pandas as pd
from IPython.display import display, HTML
import numpy as np

def show_random_samples(dataset, num_examples=10):
  """Display ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

  Args:
    dataset: Indexable collection of row dicts supporting ``len()`` and
      integer indexing.
    num_examples: Number of rows to show; must not exceed ``len(dataset)``.

  Raises:
    AssertionError: If more rows are requested than the dataset contains.
  """
  assert num_examples <= len(dataset), "More specified examples than dataset elements"

  # Sample WITHOUT replacement: the assertion above only makes sense if every
  # shown row is distinct, and duplicates make the preview less informative.
  picks = np.random.choice(len(dataset), size=num_examples, replace=False)

  random_samples = [dataset[int(pick)] for pick in picks]

  df = pd.DataFrame(random_samples)

  display(HTML(df.to_html()))

In [None]:
# Preview random rows (path, duration, text) of the training split.
show_random_samples(train_dataset)

Unnamed: 0,path,duration,text
0,lingala-english-asr/LRSC/lingala/train/audio/kerene_221011-105639_lin_359_elicit_136.wav,59895,b a m o n a | m ɔ t ɔ | a z a l i | l i s u s u | k o n i n g a n a | t e |
1,lingala-english-asr/LRSC/lingala/train/audio/exauce1_221010-164503_lin_359_elicit_128.wav,112893,s i k o y ɔ | b o y o k a | n d e n g e | n i n i | d j o | r i g o | a k o m a k i | f w a k u m p u t u |
2,lingala-english-asr/LRSC/lingala/train/audio/maswa_221010-160522_lin_359_elicit_54.wav,30492,l o g i q u e | e z a l i | k w a n g a | t e |
3,lingala-english-asr/LRSC/lingala/train/audio/kev_221010-150830_lin_359_elicit_17.wav,161898,t o y a k a | k i n s h a s a | m p o | a n t o i n e | m u n d a n d a | a y e m b a k a | k i n s h a s a | p o t o | m o i n d o |
4,lingala-english-asr/LRSC/lingala/train/audio/maswa_221010-160522_lin_359_elicit_68.wav,82401,e k o m i | y e | n a | e t e y e l o | a k e y i | k o m o n a | m o l a k i s i | y a | m a t h e m a t i q u e |
5,lingala-english-asr/LRSC/lingala/train/audio/exauce1_221010-164503_lin_359_elicit_23.wav,92565,a t i e | y a n g o | n a | p u s u | t i i | n a | b a r u m b u | e p a y i | b a z a l a k i | k o f a n d a |
6,lingala-english-asr/LRSC/lingala/train/audio/emma_221010-142655_lin_359_elicit_88.wav,85305,l i b o s o | b a k a n i s a k i | t e | a z a l i | k o s e k a | m a k a m b o | b a n g o | b a z a l a k i | k o l o b a |
7,lingala-english-asr/LRSC/lingala/train/audio/rebecca_221011-120830_lin_359_elicit_120.wav,48642,y ɔ | o y e b i | k o b e t a | b r i q u e |
8,lingala-english-asr/LRSC/lingala/train/audio/kev_221010-150830_lin_359_elicit_73.wav,148830,b a n d u l i s t e | b a k o b a n d a | k o s a l a | m u l o n g o | p o | b a y e m b a | n g a i |
9,lingala-english-asr/LRSC/lingala/train/audio/rebecca_221011-120830_lin_359_elicit_72.wav,71511,t e | p e t i t | n a z a l i | n a | m p o s a | y a | m ɔ t ɔ | y a | m o s a l a | t e |


In [None]:
# Preview random rows (path, duration, text) of the validation split.
show_random_samples(valid_dataset)

Unnamed: 0,path,duration,text
0,lingala-english-asr/LRSC/lingala/valid/audio/yan2_220717-135652_lin_9f7_elicit_87.wav,108900,m o k o | n a | m a t o n g e | m o s u s u | n a | n d j i l i | o y ɔ | y a | s u k a | n a | l i n g w a l a |
1,lingala-english-asr/LRSC/lingala/valid/audio/yan2_220717-135652_lin_9f7_elicit_65.wav,75504,t s h o t s h o | n d e | m ɔ t ɔ | a y a k a | n a | b a n i n g a | n a | y e |
2,lingala-english-asr/LRSC/lingala/valid/audio/urbain_220716-130438_lin_9f7_elicit_57.wav,119790,n a | e t e l e m e l o | y a | b i s i | y a | k i n g a s a n i | y a | s u k a | e z a l a k a | n a | b a t o | e b e l e |
3,lingala-english-asr/LRSC/lingala/valid/audio/vipok_220711-182935_lin_9f7_elicit_29.wav,72600,y ɔ | n a | m o b e k o | o z o l u k a | k w a n g a | p o | n a | m b o k a | n a | y ɔ |
4,lingala-english-asr/LRSC/lingala/valid/audio/yan2_220717-135652_lin_9f7_elicit_69.wav,179322,a k o t i | n a | e p e m e l o | a m o n i | m w a n a | y a | m o s a l a | a s i l a k i | k o p a n g u s a | n d a k o |
5,lingala-english-asr/LRSC/lingala/valid/audio/yan2_220717-135652_lin_9f7_elicit_77.wav,91839,p o | e y e b a n a | s u k a s u k a | b a k o l o n g o l a | y e |
6,lingala-english-asr/LRSC/lingala/valid/audio/urbain_220716-131602_lin_9f7_elicit_67.wav,46827,o k o e x p l i q u e r | n g a i | n i n i |
7,lingala-english-asr/LRSC/lingala/valid/audio/garcongina_220722-144016_lin_9f7_elicit_135.wav,49368,t s h o t s h o | k o d i o n g o | a b e t a m i |
8,lingala-english-asr/LRSC/lingala/valid/audio/vipok_220711-182935_lin_9f7_elicit_23.wav,56628,a m e l i | m a y ε l ε | p e | b w a n y a | n a | y e |
9,lingala-english-asr/LRSC/lingala/valid/audio/v_220716-105601_lin_9f7_elicit_83.wav,173514,m e r e | n a | b i s o | k u t u | a z o b e t a | l i s o l o | k e | t a n g u | s u k u m a | y a | l u m u m b a | e z o s a l a m a | p e r e | a t a m b o l a | z a m b a z a m b a |


In [None]:
# Build the character -> id vocabulary from the manifest dictionary, whose
# lines are "<symbol> <index>".
vocab_dict = dict()

# The dictionary contains non-ASCII symbols (e.g. "ɔ", "ɛ"), so read it as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open(dict_txt, "r", encoding="utf-8") as f:
  for line in f:
    fields = line.split()
    if not fields:
      continue  # tolerate blank lines (e.g. a trailing newline at end of file)

    char, index = fields
    index = int(index)

    vocab_dict[char] = index

# Special tokens get the next free ids.
# NOTE(review): this assumes the ids in the file are contiguous from 0 — confirm.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
# vocab_dict["<blank>"] = len(vocab_dict)

print(vocab_dict)
print(f"length of vocab dict: {len(vocab_dict)}")

{'’': 0, 'x': 1, 'a': 2, 'w': 3, 'à': 4, 'f': 5, '-': 6, 'q': 7, '5': 8, 'ç': 9, 'd': 10, 'î': 11, 'j': 12, 'e': 13, '0': 14, 'g': 15, 's': 16, 'o': 17, 'c': 18, "'": 19, 'h': 20, '3': 21, 't': 22, 'l': 23, 'ǎ': 24, 'r': 25, 'ε': 26, 'ê': 27, '8': 28, 'y': 29, 'n': 30, '|': 31, 'u': 32, 'ɔ': 33, 'z': 34, 'ɛ': 35, 'k': 36, 'm': 37, 'å': 38, 'v': 39, 'i': 40, 'p': 41, 'b': 42, '[UNK]': 43, '[PAD]': 44}
length of vocab dict: 45


In [None]:
# List the vocabulary one "<token> <id>" pair per line for easier reading.
for key, value in vocab_dict.items():
  print(f"{key} {value}")

’ 0
x 1
a 2
w 3
à 4
f 5
- 6
q 7
5 8
ç 9
d 10
î 11
j 12
e 13
0 14
g 15
s 16
o 17
c 18
' 19
h 20
3 21
t 22
l 23
ǎ 24
r 25
ε 26
ê 27
8 28
y 29
n 30
| 31
u 32
ɔ 33
z 34
ɛ 35
k 36
m 37
å 38
v 39
i 40
p 41
b 42
[UNK] 43
[PAD] 44


In [None]:
import json

# Persist the vocabulary as JSON; the tokenizer below is constructed from this file.
with open("vocab.json", "w") as vocab_file:
  json.dump(vocab_dict, vocab_file)

# Sanity check that the file was written.
print(os.path.exists("vocab.json"))

True


In [None]:
# Read the file back to inspect what was written. The `with` block closes the
# file on exit, so no explicit close() call is needed.
with open("vocab.json", "r") as vocab_file:
  print(vocab_file.readlines())

['{"\\u2019": 0, "x": 1, "a": 2, "w": 3, "\\u00e0": 4, "f": 5, "-": 6, "q": 7, "5": 8, "\\u00e7": 9, "d": 10, "\\u00ee": 11, "j": 12, "e": 13, "0": 14, "g": 15, "s": 16, "o": 17, "c": 18, "\'": 19, "h": 20, "3": 21, "t": 22, "l": 23, "\\u01ce": 24, "r": 25, "\\u03b5": 26, "\\u00ea": 27, "8": 28, "y": 29, "n": 30, "|": 31, "u": 32, "\\u0254": 33, "z": 34, "\\u025b": 35, "k": 36, "m": 37, "\\u00e5": 38, "v": 39, "i": 40, "p": 41, "b": 42, "[UNK]": 43, "[PAD]": 44}']


In [None]:
from transformers import Wav2Vec2CTCTokenizer

# Character-level CTC tokenizer built from vocab.json. "[UNK]" and "[PAD]" are
# the special tokens appended to the vocabulary above, and "|" is the symbol
# the transcriptions use between words.
tokenizer = Wav2Vec2CTCTokenizer("vocab.json",
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="|"
                                 )

In [None]:
from transformers import Wav2Vec2FeatureExtractor

# Feature extractor for raw 16 kHz mono waveforms: zero-pads, normalises each
# waveform and returns an attention mask so padded samples can be ignored.
# The keyword is spelled `do_normalize`; the previous `do_normalise` is not the
# parameter name, so normalisation was only on because it is the default.
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True
                                             )

In [None]:
from transformers import Wav2Vec2Processor

# Bundle the feature extractor (audio side) and tokenizer (text side) into a single processor.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
from datasets import Audio

def raw_audio(dataset):
  """Turn the ``path`` column of ``dataset`` into a 16 kHz ``audio`` column.

  If ``dataset`` has no ``path`` column it is returned untouched. Otherwise
  the column is renamed to ``audio`` and cast to ``Audio(sampling_rate=16000)``.

  Args:
    dataset: Object exposing ``column_names``, ``rename_column`` and
      ``cast_column``.

  Returns:
    The converted dataset, or the input itself when there is nothing to convert.
  """
  # Guard clause: nothing to do when the column was already converted.
  if "path" not in dataset.column_names:
    return dataset

  with_audio_column = dataset.rename_column("path", "audio")
  return with_audio_column.cast_column("audio", Audio(sampling_rate=16000))

# Replace the "path" column of both splits with a decodable 16 kHz "audio" column.
train_dataset = raw_audio(train_dataset)
valid_dataset = raw_audio(valid_dataset)

In [None]:
# Inspect one example: the "audio" entry now holds the file path, the sample array and the sampling rate.
print(train_dataset[0])

{'audio': {'path': 'lingala-english-asr/LRSC/lingala/train/audio/exauce1_221010-164503_lin_359_elicit_73.wav', 'array': array([ 0.        ,  0.        ,  0.        , ...,  0.00247192,
       -0.0140686 , -0.01870728]), 'sampling_rate': 16000}, 'duration': '71874', 'text': 'm ɔ t ɔ | a l o b i s a | y e | t e | t i i | n t a n g o | p o n d u | e k o b e l a |'}


In [None]:
import IPython.display as ipd
import random

# Pick one random training example and fetch it ONCE: every
# `train_dataset[i]` access loads and decodes the audio file again, so
# indexing four times did four times the work.
rand_int = random.randint(0, len(train_dataset)-1)
rand_sample = train_dataset[rand_int]


print(rand_sample['audio']['path'])
print(rand_sample["text"])
print(f"Shape: {rand_sample['audio']['array'].shape}")
# Play back at the rate stored with the sample rather than a hard-coded value.
ipd.Audio(data=rand_sample["audio"]["array"], autoplay=True, rate=rand_sample["audio"]["sampling_rate"])

lingala-english-asr/LRSC/lingala/train/audio/kerene_221011-105639_lin_359_elicit_103.wav
b a b o y i | k o l e k a | n z e l a | y a | g e n d a r m e r i e | m b a m b a | b a s o d a | b a s o m b a l | l i k a m b o |
Shape: (97284,)


In [None]:
def prepare_dataset(batch):
  """Convert one example into model inputs and CTC labels.

  Args:
    batch: A single example with an ``audio`` dict (``array`` and
      ``sampling_rate``) and a ``text`` transcription.

  Returns:
    The same dict with two added keys: ``input_values`` (the processed
    waveform) and ``labels`` (the token ids of ``text``).
  """
  audio = batch["audio"]

  batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]

  # Encode the transcription with the tokenizer directly. This is what the
  # deprecated `processor.as_target_processor()` context did under the hood,
  # without depending on an API scheduled for removal.
  batch["labels"] = processor.tokenizer(batch["text"]).input_ids
  return batch

In [None]:
# Pre-process both splits with 4 worker processes. The original columns are
# removed, leaving only what prepare_dataset adds ("input_values" and "labels").
train_dataset = train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names, num_proc=4)
valid_dataset = valid_dataset.map(prepare_dataset, remove_columns=valid_dataset.column_names, num_proc=4)

Map (num_proc=4):   0%|          | 0/2557 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/383 [00:00<?, ? examples/s]



In [None]:
# Summarise the first processed example instead of printing it whole: the
# waveform holds tens of thousands of floats and would flood the cell output.
print({name: {"length": len(values), "head": values[:5]} for name, values in train_dataset[0].items()})

{'input_values': [-0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156, -0.0006520673050545156

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and labels of a CTC batch.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data. Its feature extractor pads the
            ``input_values`` and its tokenizer pads the ``labels``.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to the maximum acceptable input length for the model.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch.

        Args:
            features: Examples, each with ``input_values`` and ``labels``.

        Returns:
            The padded audio batch with an added ``labels`` tensor in which
            padding positions are set to ``-100``.
        """
        # Split inputs and labels since they have different lengths and need
        # different padding methods.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad audio with the feature extractor and labels with the tokenizer,
        # each called directly. This avoids the deprecated
        # `processor.as_target_processor()` context manager.
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Replace padding with -100 so that those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch

In [None]:
# Collator that pads every batch dynamically (padding=True) using the shared processor.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
from transformers import Wav2Vec2ForCTC

# Load the multilingual XLSR-53 checkpoint with a CTC head sized to our
# vocabulary. The head (lm_head) is not part of the checkpoint and is newly
# initialised, which is why the model must be fine-tuned before use.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(vocab_dict)
)

# Keep the convolutional feature encoder frozen during fine-tuning.
# `freeze_feature_encoder` replaces the deprecated `freeze_feature_extractor`.
model.freeze_feature_encoder()

config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Training configuration. The effective batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 8.
# Evaluation and checkpointing both happen every 100 optimizer steps, and only
# the 2 most recent checkpoints are kept on disk.
training_args = TrainingArguments(
    output_dir="./wav2vec2-large-xlsr-lingala",
    group_by_length=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="steps",
    num_train_epochs=5,
    fp16=True,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=1e-4,
    warmup_steps=500,
    save_total_limit=2
)



In [None]:
%%capture
!pip install evaluate
!pip install jiwer

In [None]:
from transformers import Trainer
import numpy as np
import evaluate

# Word-error-rate metric used by compute_metrics during evaluation.
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
  """Compute the word error rate of greedy-decoded predictions.

  Args:
    pred: Evaluation output with ``predictions`` (logits whose last axis is
      the vocabulary) and ``label_ids`` (reference ids, padded with -100).
      ``label_ids`` is modified in place: every -100 becomes the pad token id.

  Returns:
    ``{"wer": <word error rate>}``.
  """
  # The -100 padding is not a valid token id, so map it back to the pad token
  # before decoding the references.
  pad_id = processor.tokenizer.pad_token_id
  pred.label_ids[pred.label_ids == -100] = pad_id

  # Greedy decoding: most likely vocabulary entry at every time step.
  predicted_ids = np.argmax(pred.predictions, axis=-1)

  hypotheses = processor.batch_decode(predicted_ids)
  # References are not CTC output, so repeated characters must not be merged.
  references = processor.batch_decode(pred.label_ids, group_tokens=False)

  return {"wer": wer_metric.compute(predictions=hypotheses, references=references)}

In [None]:
# Wire model, data, collator and metric together. The processor is passed as
# `processing_class`, which replaces the deprecated `tokenizer` argument whose
# deprecation warning otherwise floods the training output.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    processing_class=processor
)

  trainer = Trainer(


In [None]:
# Run fine-tuning; loss and WER are reported at every evaluation step.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss,Validation Loss,Wer
100,10.0972,8.878903,0.998238
200,2.6751,2.334919,1.0
300,2.0955,1.917456,1.0
400,1.8723,1.898739,1.0
500,1.6523,1.658826,1.0
600,1.665,1.662004,0.981894
700,1.6156,1.62542,0.977884
800,1.5927,1.617113,0.977824
900,1.5527,1.55962,0.980315
1000,1.4435,1.361589,0.952002


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=1600, training_loss=2.066623463332653, metrics={'train_runtime': 2422.5467, 'train_samples_per_second': 5.278, 'train_steps_per_second': 0.66, 'total_flos': 2.4006381904705275e+18, 'train_loss': 2.066623463332653, 'epoch': 5.0})

In [None]:
def compute_metrics(pred):
  """Compute the word error rate of greedy-decoded predictions.

  Args:
    pred: Evaluation output with ``predictions`` (logits whose last axis is
      the vocabulary) and ``label_ids`` (reference ids, padded with -100).
      ``label_ids`` is modified in place: every -100 becomes the pad token id.

  Returns:
    ``{"wer": <word error rate>}``.
  """
  pred_logits = pred.predictions
  # Take the argmax over the LAST (vocabulary) axis. `axis=1` reduces over the
  # time axis of a (batch, time, vocab) array and yields frame indices, not
  # token ids, which makes the decoded text meaningless and pins WER at 1.0.
  pred_ids = np.argmax(pred_logits, axis=-1)

  # -100 marks label padding and is not a valid token id; map it back to the
  # pad token before decoding.
  pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

  pred_str = processor.batch_decode(pred_ids)

  # References are not CTC output, so repeated characters must not be merged.
  label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

  wer = wer_metric.compute(predictions=pred_str, references=label_str)

  return {"wer": wer}

In [None]:
from transformers import Wav2Vec2ForCTC

# Load the pretrained checkpoint again and rebind `model`, so the next
# training run starts from the pretrained weights rather than from the model
# fine-tuned above. The output head is sized to the tokenizer's vocabulary.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    # Regularisation settings left disabled (library defaults apply):
    # attention_dropout=0.1,
    # hidden_dropout=0.1,
    # feat_proj_dropout=0.0,
    # mask_time_prob=0.05,
    # layer_drop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Keep the convolutional feature encoder frozen during fine-tuning
# (`freeze_feature_encoder` replaces the deprecated `freeze_feature_extractor`)
# and enable gradient checkpointing on the model.
model.freeze_feature_encoder()
model.gradient_checkpointing_enable()



In [None]:
# NOTE(review): this repeats the gradient_checkpointing_enable() call made at
# the end of the previous cell; the cell looks redundant and can probably be removed.
model.gradient_checkpointing_enable()

In [None]:
from transformers import Trainer

# Second training run on the freshly reloaded model. The whole processor is
# passed as `processing_class`: this replaces the deprecated `tokenizer`
# argument and matches the first Trainer above, which also passes the
# processor rather than only its feature extractor.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    processing_class=processor,
)

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112746533333646, max=1.0…



Step,Training Loss,Validation Loss,Wer
100,3.544,2.415191,1.0
200,1.8022,1.663192,1.0
300,1.7694,1.646362,1.0
400,1.6649,1.628671,1.0
500,1.6101,1.610545,1.0
600,1.2451,1.147436,1.0
700,0.721,0.789387,1.0
800,0.5607,0.660434,1.0
900,0.5235,0.566811,1.0
1000,0.5783,0.555292,1.0


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=1600, training_loss=1.3429930251836777, metrics={'train_runtime': 3139.9164, 'train_samples_per_second': 4.072, 'train_steps_per_second': 0.51, 'total_flos': 2.400653789661325e+18, 'train_loss': 1.3429930251836777, 'epoch': 5.0})

In [None]:
# Save the trainer's current model under ./savedModel.
trainer.save_model("./savedModel")

In [None]:
# Save the processor into the same directory as the model, so both can be loaded from one place.
processor.save_pretrained("./savedModel")

[]

In [None]:
# Save model and processor together under a second directory; this is the
# directory the reload cell below reads from.
trainer.save_model("./wav2vec2-large-xlsr-lingala2")
processor.save_pretrained("./wav2vec2-large-xlsr-lingala2")

[]

In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the fine-tuned model and its processor back from the directory saved
# above; this rebinds the global `model` and `processor`.
# NOTE(review): use_safetensors=True presumably requires the weights to have
# been saved in safetensors format — confirm for the installed version.
model = Wav2Vec2ForCTC.from_pretrained("./wav2vec2-large-xlsr-lingala2", use_safetensors=True)
processor = Wav2Vec2Processor.from_pretrained("./wav2vec2-large-xlsr-lingala2")

print("Model and processor loaded successfully!")

Model and processor loaded successfully!


In [None]:
# def load_vocab(vocab_path):
#     char_to_index = {}
#     index_to_char = {}

#     with open(vocab_path, "r") as f:
#         for line in f:
#             char, index = line.strip().split()
#             index = int(index)
#             char_to_index[char] = index
#             index_to_char[index] = char

#     # Add the <blank> token if not already present
#     if "<blank>" not in char_to_index:
#         char_to_index["<blank>"] = len(char_to_index)
#         index_to_char[len(index_to_char)] = "<blank>"

#     return char_to_index, index_to_char

# char_to_index, index_to_char = load_vocab(dict_txt)
# vocab_size = len(char_to_index)

# char_to_index, index_to_char = load_vocab(dict_txt)
# vocab_size = len(char_to_index)

# print(f"Loaded vocabulary with {vocab_size} characters")

In [None]:
# def text_to_ids(text, char_to_index):
#     return [char_to_index[char] for char in text if char in char_to_index]

# def add_labels(batch):
#     # Convert text into numerical labels
#     batch["labels"] = text_to_ids(batch["text"], char_to_index)
#     return batch

In [None]:
# import torchaudio

# def preprocess_audio(batch):
#   waveform, sample_rate = torchaudio.load(batch["path"])

#   if waveform.shape[0] > 1:
#     waveform = torch.mean(waveform, dim=0, keepdim=True)

#   if sample_rate != 16000:
#     resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
#     waveform = resampler(waveform)

#   waveform = waveform / torch.max(torch.abs(waveform))

#   inputs = feature_extractor(
#       waveform.squeeze().numpy(),
#       sampling_rate=16000,
#       return_tensors="pt",
#       padding=True,
#   )

#   batch["input_values"] = inputs.input_values[0].float()
#   return batch

In [None]:
# from torch.nn.utils.rnn import pad_sequence

# def collate_fn(batch):
#   input_values = [
#       torch.tensor(sample["input_values"], dtype=torch.float32)
#       if not isinstance(sample["input_values"], torch.Tensor)
#       else sample["input_values"].float()
#       for sample in batch
#   ]

#   input_values_padded = pad_sequence(input_values, batch_first=True, padding_value=0.0)

#   labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]
#   labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)

#   return {
#       "input_values": input_values_padded,
#       "labels": labels_padded,
#       "input_lengths": torch.tensor([len(x) for x in input_values]),
#       "label_lengths": torch.tensor([len(x) for x in labels])
#   }

In [None]:
# def truncate_audio(sample, max_length=80000):
#   if len(sample["input_values"]) > max_length:
#     sample["input_values"] = sample["input_values"][:max_length]
#   return sample

In [None]:
# def prepare_dataset(dataset):
#   dataset = dataset.map(preprocess_audio, remove_columns=["path", "duration"])

#   dataset = dataset.map(add_labels, remove_columns=["text"])

#   dataset.set_format(type="torch", columns=["input_values", "labels"])

#   dataset = truncate_audio(dataset)

#   return dataset

In [None]:
# train_dataset = prepare_dataset(train_dataset)
# valid_dataset = prepare_dataset(valid_dataset)

Map:   0%|          | 0/2557 [00:00<?, ? examples/s]

Map:   0%|          | 0/2557 [00:00<?, ? examples/s]

Map:   0%|          | 0/383 [00:00<?, ? examples/s]

Map:   0%|          | 0/383 [00:00<?, ? examples/s]

In [None]:
# from torch.utils.data import DataLoader

# train_loader = DataLoader(
#     train_dataset,
#     batch_size=1,
#     shuffle=True,
#     collate_fn=collate_fn,
#     num_workers=2
# )

# valid_loader = DataLoader(
#     valid_dataset,
#     batch_size=1,
#     collate_fn=collate_fn,
#     num_workers=2
# )

In [None]:
# for batch in train_loader:
#     input_values = batch["input_values"]
#     print(f"Batch Shape:", input_values.shape)
#     break

  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]
  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]


Batch Shape: torch.Size([1, 71148])


In [None]:
# Ask PyTorch to release its unused cached GPU memory before the next experiment.
torch.cuda.empty_cache()

In [None]:
!nvidia-smi

Mon Nov 18 02:56:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# import torch
# import torch.nn as nn
# from transformers import Wav2Vec2Model

# class Wav2Vec2CTC(nn.Module):
#   def __init__(self, model, vocab_size):
#     super(Wav2Vec2CTC, self).__init__()
#     self.feature_extractor = model
#     self.ctc_head = nn.Linear(self.feature_extractor.config.hidden_size, vocab_size)

#   def forward(self, input_values):
#     features = self.feature_extractor(input_values).last_hidden_state

#     logits = self.ctc_head(features)
#     return logits

# wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")


# # wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53")

# model = Wav2Vec2CTC(wav2vec2_model, vocab_size)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)



Wav2Vec2CTC(
  (feature_extractor): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (en

In [None]:
# NOTE(review): disabled cell; the "Blank index: 43" output below comes from
# an earlier run. It builds the CTC loss and the Adam optimizer that the
# (disabled) train_model call receives as `criterion` and `optimizer`.
# `char_to_index`, `nn` and `model` are not defined here -- TODO confirm they
# are defined by earlier cells before re-enabling.

# import torch.optim as optim

# blank_index = char_to_index["<blank>"]
# print(f"Blank index: {blank_index}")

# zero_infinity=True zeroes infinite losses (and their gradients), which occur
# when an input is too short to be aligned with its target.
# criterion = nn.CTCLoss(blank=blank_index, zero_infinity=True)
# optimizer = optim.Adam(model.parameters(), lr=1e-4)

Blank index: 43


In [None]:
# Release unused cached GPU memory held by PyTorch's CUDA caching allocator.
# NOTE(review): relies on `torch` having been imported by an earlier cell; the
# `import torch` lines in the neighbouring cells are commented out, so confirm
# this still runs after Restart Kernel -> Run All.
torch.cuda.empty_cache()

In [None]:
# NOTE(review): disabled training cell; the "Epoch n/5" lines in the saved
# output below come from an earlier run, where the mean loss stayed at ~2.89
# from epoch 2 onward.
#
# train_model(model, train_loader, criterion, optimizer, num_epochs=5):
#   Runs `num_epochs` passes over `train_loader`. For each batch it computes
#   the loss on the model's log-softmaxed logits, back-propagates and steps the
#   optimizer. Prints the mean batch loss per epoch; returns nothing.
#   Besides its arguments it reads the globals `device` and `torch`.

# def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
#     model.train()

#     for epoch in range(num_epochs):
#         total_loss = 0

#         for batch in train_loader:
#             inputs = batch["input_values"].to(device)
#             labels = batch["labels"].to(device)
#             # inputs = batch["input_values"]
#             # labels = batch["labels"]

#             # Forward pass through the model
#             logits = model(inputs)
#             logits = logits.log_softmax(2).permute(1, 0, 2)  # Shape: (seq_len, batch, vocab_size)

#             # Calculate input lengths based on model output
#             # NOTE(review): every sample is assigned the full output length
#             # logits.size(0); if the batch contains padded audio this
#             # overstates the true lengths -- TODO confirm against the collator.
#             input_lengths = torch.full((logits.size(1),), logits.size(0), dtype=torch.long)
#             # Counts entries != -100, so -100 is presumably the label padding
#             # value -- verify against the collate function.
#             label_lengths = torch.sum(labels != -100, dim=1)

#             # Zero gradients
#             optimizer.zero_grad()

#             # Compute the loss
#             loss = criterion(logits, labels, input_lengths, label_lengths)
#             loss.backward()
#             optimizer.step()

#             total_loss += loss.item()

#         print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")

# # Run the training
# train_model(model, train_loader, criterion, optimizer)

  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]
  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]


Epoch 1/5, Loss: 2.936726912139308


  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]
  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]


Epoch 2/5, Loss: 2.8926950453965983


  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]
  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]


Epoch 3/5, Loss: 2.8898955950477787


  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]
  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]


Epoch 4/5, Loss: 2.8900353307130984


  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]
  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]


Epoch 5/5, Loss: 2.8899185226725375


In [None]:
!pip install evaluate jiwer

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jiwer-3.0.5-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer, evaluate
Successfully installed evaluate-0.4.3 jiwer-3.0.5 rapidfuzz-3.10.1


In [None]:
# NOTE(review): disabled evaluation cell. In the saved run below every batch
# printed empty predicted AND label texts and was skipped, so no batch WER was
# ever accumulated.
# Likely cause: iterating over a tensor yields 0-dim tensors, and
# `i in index_to_char` is never true for them (tensors hash by identity), so
# both comprehensions filter out every element. Converting each row with
# `.tolist()` before the lookup should fix it -- TODO confirm when re-enabling.
#
# evaluate_model(model, valid_loader, index_to_char):
#   Runs the model over `valid_loader` without gradients, decodes the argmax
#   ids and the labels through `index_to_char`, computes WER per batch on the
#   samples whose decoded label is non-empty, and prints the mean of those
#   batch WERs (inf when no batch was scored). Returns nothing.
#   Besides its arguments it reads the globals `device` and `wer_metric`.

# from evaluate import load
# import torch

# wer_metric = load("wer")

# def evaluate_model(model, valid_loader, index_to_char):
#     model.eval()
#     total_wer = 0
#     num_batches = 0

#     with torch.no_grad():
#         for batch in valid_loader:
#             inputs = batch["input_values"].to(device)
#             labels = batch["labels"].to(device)

#             # Forward pass through model to get logits
#             logits = model(inputs).log_softmax(2)

#             # Get predicted IDs from the logits
#             predicted_ids = torch.argmax(logits, dim=-1)

#             # Decode predictions and labels
#             # NOTE(review): even with the lookup fixed, this joins the argmax
#             # of every frame; greedy CTC decoding would normally also collapse
#             # repeated ids and drop the "<blank>" id -- TODO confirm intent.
#             pred_texts = ["".join([index_to_char[i] for i in pred if i in index_to_char]) for pred in predicted_ids]
#             label_texts = ["".join([index_to_char[i] for i in label if i in index_to_char]) for label in labels]

#             # Debugging: Print out a few predictions and labels
#             print("\nPredicted Texts:", pred_texts[:3])
#             print("Label Texts:", label_texts[:3])

#             # Check for empty labels
#             non_empty_indices = [i for i, label in enumerate(label_texts) if label]
#             if not non_empty_indices:
#                 print("Skipping batch due to empty labels.")
#                 continue

#             # Filter out empty references
#             pred_texts = [pred_texts[i] for i in non_empty_indices]
#             label_texts = [label_texts[i] for i in non_empty_indices]

#             # Compute WER for the current batch
#             wer = wer_metric.compute(predictions=pred_texts, references=label_texts)
#             print(f"Batch WER: {wer:.4f}")

#             total_wer += wer
#             num_batches += 1

#     # Mean of per-batch WERs (not a corpus-level WER); inf if nothing was scored.
#     avg_wer = total_wer / num_batches if num_batches > 0 else float("inf")
#     print(f"\nValidation WER: {avg_wer:.4f}")

# # Run the evaluation
# evaluate_model(model, valid_loader, index_to_char)

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]
  labels = [torch.tensor(sample["labels"], dtype=torch.long) for sample in batch]



Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

Predicted Texts: ['']
Label Texts: ['']
Skipping batch due to empty labels.

In [None]:
# NOTE(review): disabled debugging cell. Its saved output below ends with
# "NameError: name 'index_to_char' is not defined", so the kernel it last ran
# in had no `index_to_char`; that output also reports "Not a tensor" for
# input_values. TODO: define `index_to_char` in an earlier cell or delete this
# cell together with its stale output.
#
# For the first five validation samples it prints the input shape (when the
# input is a torch.Tensor), the label length, and the labels decoded through
# `index_to_char`.

# print("Inspecting validation dataset after preprocessing:")
# for i in range(5):
#     input_values = valid_dataset[i]['input_values']
#     labels = valid_dataset[i]['labels']
#     print(f"Sample {i+1}:")
#     print(f"Input Shape: {input_values.shape if isinstance(input_values, torch.Tensor) else 'Not a tensor'}, Labels Length: {len(labels)}")
#     print(f"Decoded Labels: {''.join([index_to_char[c] for c in labels if c in index_to_char])}")
#     print("\n")

Inspecting validation dataset after preprocessing:
Sample 1:
Input Shape: Not a tensor, Labels Length: 49


NameError: name 'index_to_char' is not defined