We trained the "facebook/wav2vec2-large-xlsr-53" model on our pre-processed data. The training was done in 12-hour sessions for about 50 hours in total. For each session, the model from the previous session was loaded and trained further. We decided not to resume from the last checkpoint, since we found the AdamW learning rates to be untenably low after each session.

Relevant models can be found at: https://huggingface.co/Sameen53
Relevant pre-processed data can be found at: https://huggingface.co/Lancelot53


Edit: September 1, 2022

The first phase of training started with facebook/wav2vec2-large-xlsr-53 as the base model. It was trained on 36919 samples from the train set (upvotes > downvotes and duration between 1 and 10 s; the filtering can be found in the PreProcessing1 notebook) for 71 epochs. The final model was saved as Sameen53/cv_bn_bestModel_1.


The second phase of training used Sameen53/cv_bn_bestModel_1 as the base model. It was trained on about 45k samples collected from the train set and validation set combined (see the PreProcessing2 notebook).

In [1]:
# # %%capture
# # !apt install git-lfs
# # !pip install transformers
# %pip install jiwer
# %pip install python-dotenv
# %pip install transformers[torch]
# %pip install accelerate -U

In [147]:
import torch
from transformers import Wav2Vec2Processor
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

In [148]:
import os
from dotenv import load_dotenv

# Read variables from a .env file into the process environment, then pick up
# the Hugging Face access token from there.
load_dotenv()
hugging_face_token = os.getenv('hugging_face_token')
# Never echo the secret itself: notebook outputs are saved together with the
# notebook, so printing the token leaks it to anyone who can read the file.
# Report only whether a token was found.
print("Hugging Face token loaded:", hugging_face_token is not None)

hf_********************************** (access token redacted; revoke and rotate the original token)


In [149]:
# pad features to get tensors of the same size
# loss from the padded labels is ignored during training

@dataclass
class DataCollatorCTCWithPadding:
    """Collate CTC training examples into one padded batch.

    Audio inputs and label ids have different lengths and need different
    padding, so the two are padded separately and then merged. Padded label
    positions are replaced with -100 so that they are ignored by the loss.
    """

    # Processor used to pad both the audio inputs and the label ids.
    processor: Wav2Vec2Processor
    # Padding strategy, forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return them as a single batch.

        Args:
            features: One dict per example; each must provide
                ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch (PyTorch tensors) with an added
            ``"labels"`` entry in which padded positions are -100.
        """
        # Split inputs and labels since they have to be of different lengths
        # and need different padding methods.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        # `processor.as_target_processor()` is deprecated; passing the label
        # features through the `labels=` keyword pads them with the tokenizer
        # without the context manager.
        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Replace padding with -100 to ignore loss correctly.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [150]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Checkpoint identifiers are kept together so that switching the starting
# point between training sessions only means editing these two lines.
PROCESSOR_CHECKPOINT = "arijitx/wav2vec2-xls-r-300m-bengali"
MODEL_CHECKPOINT = "finetune_batch_2"  # loaded from a local path, not from the hub

# Processor used for padding in the data collator and for decoding in the metric.
processor = Wav2Vec2Processor.from_pretrained(PROCESSOR_CHECKPOINT)

# Alternative starting points, kept for reference:
# model = Wav2Vec2ForCTC.from_pretrained("Sameen53/cv_bn_bestModel_1")
# model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53", ctc_loss_reduction="mean", pad_token_id=processor.tokenizer.pad_token_id, vocab_size=len(processor.tokenizer), gradient_checkpointing=True)

# Load model from repo
#model = Wav2Vec2ForCTC.from_pretrained("myliew/Bengali_ASR", use_auth_token=hugging_face_token)

# Load model from local
model = Wav2Vec2ForCTC.from_pretrained(MODEL_CHECKPOINT)
# ;_;
#model = model.to("cuda")


In [151]:
# Build the batch collator; padding=True is forwarded to processor.pad for
# both the audio inputs and the labels of every batch.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [152]:
# Display the loaded model's module tree as the cell output.
model

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [153]:
from datasets import load_metric
# Word error rate metric object, used by compute_metrics.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of `evaluate.load("wer")` — confirm against the installed version
# before upgrading the environment.
wer_metric = load_metric("wer")

In [154]:
import numpy as np
def compute_metrics(pred):
    """Compute the word error rate (WER) of a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (logits; the arg-max over the
            last axis is taken as the predicted token id) and ``label_ids``
            (reference ids in which -100 marks positions to be treated as
            padding).

    Returns:
        dict: ``{"wer": value}`` where ``value`` is whatever
        ``wer_metric.compute`` returns for the decoded strings.
    """
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Substitute the pad token for the -100 markers on a fresh array instead
    # of writing into pred.label_ids, so the caller's labels stay untouched.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [155]:
# Freeze the feature encoder before training.
# NOTE(review): per the Transformers documentation this disables gradient
# computation for the convolutional feature encoder so that its parameters
# are not updated — confirm for the installed version.
model.freeze_feature_encoder()

In [156]:
from transformers import TrainingArguments

# Hyper-parameters for this fine-tuning session, grouped by concern.
#
# NOTE(review): the run recorded in this notebook reports global_step=553.
# With warmup_steps=500 almost the whole run is warm-up, and with
# eval_steps=save_steps=1500 no step-based evaluation or checkpoint would be
# reached in a run of that length — confirm that this is intended.
# NOTE(review): per the TrainingArguments documentation,
# resume_from_checkpoint is not consumed by Trainer itself; resuming requires
# trainer.train(resume_from_checkpoint=...) — confirm.
training_args = TrainingArguments(
    # Output location and hub access.
    output_dir="myliew/Bengali_ASR",
    hub_token=hugging_face_token,
    report_to="none",
    save_total_limit=3,
    # Batching and memory.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    group_by_length=True,
    gradient_checkpointing=True,
    # Optimisation schedule.
    num_train_epochs=7,
    learning_rate=5e-7,
    weight_decay=2.5e-6,
    warmup_steps=500,
    # Evaluation, checkpointing and logging cadence.
    evaluation_strategy="steps",
    eval_steps=1500,
    save_steps=1500,
    logging_strategy="epoch",
    disable_tqdm=True,
    resume_from_checkpoint=True,
    # Best-model selection, currently disabled:
    # load_best_model_at_end=True,
    # metric_for_best_model="wer",
    # greater_is_better=False,
)

In [157]:
# from datasets import load_from_disk

# dataset = load_from_disk("../input/cv-bn-train")

from datasets import load_from_disk
# Load the pre-processed dataset for this session from a local directory.
dataset = load_from_disk("train_subset_4_preprocessed_trimmed")
# dataset = load_dataset("common_voice","ab")

In [158]:
# Display the dataset's splits, columns and row counts as the cell output.
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'input_values', 'input_length', 'labels'],
        num_rows: 2515
    })
    test: Dataset({
        features: ['audio', 'transcription', 'input_values', 'input_length', 'labels'],
        num_rows: 444
    })
})

In [159]:
# validation_dataset = load_from_disk("../input/commonvoicesbn2to9sec/validation_data")


In [160]:
# max_input_length_in_sec = 9.0
# dataset = dataset['train'].filter(lambda x: x < max_input_length_in_sec * 16000, input_columns=["input_length"])
# min_input_length_in_sec = 1.0
# dataset = dataset.filter(lambda x: x > min_input_length_in_sec * 16000, input_columns=["input_length"])

In [161]:
# dataset = dataset.train_test_split(test_size=0.2, seed = 4 )

In [162]:
from transformers import Trainer

# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    # What is trained and with which settings.
    model=model,
    args=training_args,
    # Data: the 'train' split is used for training, the 'test' split for evaluation.
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=data_collator,
    # Metric callback (word error rate).
    compute_metrics=compute_metrics,
    # The feature extractor is handed over in the `tokenizer` slot.
    tokenizer=processor.feature_extractor,
)

In [163]:
# Run fine-tuning with the settings in training_args.
# NOTE(review): no resume_from_checkpoint argument is passed here, so this
# presumably starts from the weights loaded earlier rather than from a saved
# checkpoint, which matches the approach described in the introduction —
# confirm.
trainer.train()



{'loss': 0.3583, 'learning_rate': 7.899999999999999e-08, 'epoch': 1.0}
{'loss': 0.3548, 'learning_rate': 1.5799999999999999e-07, 'epoch': 2.0}
{'loss': 0.3545, 'learning_rate': 2.3699999999999996e-07, 'epoch': 3.0}
{'loss': 0.3541, 'learning_rate': 3.1599999999999997e-07, 'epoch': 4.0}
{'loss': 0.3513, 'learning_rate': 3.95e-07, 'epoch': 5.0}
{'loss': 0.3523, 'learning_rate': 4.7399999999999993e-07, 'epoch': 6.0}
{'loss': 0.3396, 'learning_rate': 0.0, 'epoch': 7.0}
{'train_runtime': 97624.6124, 'train_samples_per_second': 0.18, 'train_steps_per_second': 0.006, 'train_loss': 0.3521233018871673, 'epoch': 7.0}


TrainOutput(global_step=553, training_loss=0.3521233018871673, metrics={'train_runtime': 97624.6124, 'train_samples_per_second': 0.18, 'train_steps_per_second': 0.006, 'train_loss': 0.3521233018871673, 'epoch': 7.0})

In [142]:
#!rm -r ./myliew

In [164]:
# Upload the trained model to the Hugging Face Hub (the recorded output shows
# the push going to the myliew/Bengali_ASR repository).
trainer.push_to_hub()


c:\Users\Moses\.vscode\Bengali_ASR\myliew/Bengali_ASR is already a clone of https://huggingface.co/myliew/Bengali_ASR. Make sure you pull the latest changes with `repo.git_pull()`.
Upload file pytorch_model.bin: 1.18GB [47:26, 561kB/s]                            To https://huggingface.co/myliew/Bengali_ASR
   01f046e..349c006  main -> main

Upload file pytorch_model.bin: 100%|██████████| 1.18G/1.18G [47:27<00:00, 443kB/s]
Upload file training_args.bin: 100%|██████████| 3.93k/3.93k [47:27<00:00, 1.41B/s] 


'https://huggingface.co/myliew/Bengali_ASR/commit/349c006a199d19aade1b1a1ccba7e4a1ba53db5b'

In [165]:
from datetime import datetime

# Record when this cell ran, formatted as HH:MM:SS, and print it.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print(f"Current Time = {current_time}")

Current Time = 13:46:27


In [166]:
# Save the trained model to a local directory.
# NOTE(review): presumably this is the directory the next session loads as
# its starting point, as this session loaded "finetune_batch_2" — confirm.
trainer.save_model("finetune_batch_4")