We trained the "facebook/wav2vec2-large-xlsr-53" model on our pre-processed data. The training was done in 12 hour sessions for about 50 hours in total. For each session, the model of the the last session was loaded and trained further. We decided not to train from the last checkpoint since we found AdamW learning rates to be untenably low after each session.

Relevant models can be found at : https://huggingface.co/Sameen53
Relevant pre-processed data can be found at: https://huggingface.co/Lancelot53


Edit: September 1, 2022

First phase of training started with facebook/wav2vec2-large-xlsr-53 as base. Trained on 36919 samples from the train set (upvotes>downvotes and between 1 and 10s, can be found on PreProcessing1 notebook). Trained for 71 epochs. Final model saved as Sameen53/cv_bn_bestModel_1


Second phase of training with Sameen53/cv_bn_bestModel_1 as base. Trained on about 45k data collected from train set and validation set combined (can be found on PreProcessing2 notebook)

In [None]:
# %%capture
# !apt install git-lfs
# !pip install transformers
# !pip install jiwer

In [2]:
import torch
from transformers import Wav2Vec2Processor
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

In [3]:
# pad features to get tensors of the same size
# loss from the padded labels is ignored during training

@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [1]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("arijitx/wav2vec2-xls-r-300m-bengali")

# model = Wav2Vec2ForCTC.from_pretrained("Sameen53/cv_bn_bestModel_1")
# model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53", ctc_loss_reduction="mean", pad_token_id=processor.tokenizer.pad_token_id, vocab_size=len(processor.tokenizer), gradient_checkpointing=True)


# Load model from local
model = Wav2Vec2ForCTC.from_pretrained("YellowKing_model")
# ;_;
#model = model.to("cuda")


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [5]:
from datasets import load_metric
wer_metric = load_metric("wer")

  wer_metric = load_metric("wer")
Downloading builder script: 4.48kB [00:00, ?B/s]                       


ImportError: To be able to use wer, you need to install the following dependency: jiwer.
Please install it using 'pip install jiwer' for instance.

In [None]:
import numpy as np
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
model.freeze_feature_encoder()

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
hugging_face_token = os.getenv('hugging_face_token')
print(hugging_face_token)

ModuleNotFoundError: No module named 'dotenv'

In [None]:

from transformers import TrainingArguments

repo_name = "training_45k"

training_args = TrainingArguments(
  report_to="none",
  hub_token =  "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",
  output_dir=repo_name,
  group_by_length=True,
  per_device_train_batch_size=16,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=7,
  fp16=True,
  gradient_checkpointing=True,
  save_steps=1500,
  eval_steps=1500,
  logging_strategy="epoch",
  learning_rate=5e-7,
  weight_decay=0.0000025,
  warmup_steps=500,
  save_total_limit=3,
#   load_best_model_at_end=True,
#   metric_for_best_model="wer",  
# greater_is_better=False,
)

In [None]:
# from datasets import load_from_disk

# dataset = load_from_disk("../input/cv-bn-train")

from datasets import load_from_disk
dataset = load_from_disk("../input/cv-bn-45k/45kData")
# dataset = load_dataset("common_voice","ab")

In [None]:
# validation_dataset = load_from_disk("../input/commonvoicesbn2to9sec/validation_data")


In [None]:
# max_input_length_in_sec = 9.0
# dataset = dataset['train'].filter(lambda x: x < max_input_length_in_sec * 16000, input_columns=["input_length"])
# min_input_length_in_sec = 1.0
# dataset = dataset.filter(lambda x: x > min_input_length_in_sec * 16000, input_columns=["input_length"])

In [None]:
# dataset = dataset.train_test_split(test_size=0.2, seed = 4 )

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=processor.feature_extractor,
)

In [None]:
trainer.train()

In [None]:
!rm -r ./training_45k

In [None]:
trainer.push_to_hub()