In [1]:
! pip install -U jiwer datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0
! pip install evaluate timeout-timer

Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl.metadata (2.6 kB)
Collecting datasets==2.16.0
  Downloading datasets-2.16.0-py3-none-any.whl.metadata (20 kB)
Collecting fsspec==2023.10.0
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Collecting gcsfs==2023.10.0
  Downloading gcsfs-2023.10.0-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting pyarrow-hotfix (from datasets==2.16.0)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.16.0)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.16.0)
  Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1

In [2]:
# Copy the directory produced by an earlier run into the working directory.
# Its name matches `repo_name`, which is used as the Trainer output_dir below,
# so training can resume from the checkpoint stored there.
! cp -r /kaggle/input/fine-tune-w2v2-bert/w2v-bert-2.0-bd-regional-dialects/ /kaggle/working/

In [3]:
from datasets import Dataset, Audio

def df_to_dataset(df):
    """Convert a pandas DataFrame into a `Dataset` whose "audio" column is cast
    to an `Audio` feature with a 16 kHz sampling rate.
    """
    audio_feature = Audio(sampling_rate=16_000)
    return Dataset.from_pandas(df).cast_column("audio", audio_feature)

In [4]:
import re

# Characters replaced by a space before whitespace is collapsed:
# punctuation (including the Bangla danda), brackets / comparison signs, and
# Bangla digits. Raw string: no invalid escape sequences in the literal.
_CHARS_TO_IGNORE = re.compile(r"""[-,.:;'"!?।()\[\]<>=০১২৩৪৫৬৭৮৯]""")
_WHITESPACE_RUN = re.compile(r"\s+")


def fix_text(text: str) -> str:
    """Normalise a transcription for character-level CTC training.

    Punctuation, bracket-like symbols and Bangla numerals are each replaced by
    a space, then every run of whitespace is collapsed to a single space and
    the result is stripped.

    Args:
        text: Raw transcription. Non-string values (e.g. ``None`` or the NaN
            pandas yields for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned string; ``""`` when nothing remains or the input is not a
        string.
    """
    if not isinstance(text, str):
        # Missing transcription: return the empty string instead of raising
        # TypeError inside re.sub.
        return ""

    text = _CHARS_TO_IGNORE.sub(" ", text)
    return _WHITESPACE_RUN.sub(" ", text).strip()

In [5]:
import os
import pandas as pd

# Load the preprocessed training metadata.
df = pd.read_csv("/kaggle/input/preprocessed-dataset/processed_train.csv")

# Cleaned transcription derived from the "normalized" column.
df["sentence"] = df["normalized"].map(fix_text)

# Full path of every clip inside the 16 kHz audio folder.
audio_dir = "/kaggle/input/ben10/ben10/16_kHz_train_audio/"
df["audio"] = df["file_name"].map(lambda filename: os.path.join(audio_dir, filename))

# Keep only these five columns, in this order.
df = df[["audio", "sentence", "fold", "duration", "region"]]

In [6]:
# Drop rows whose cleaned sentence is the empty string.
df = df.loc[df["sentence"].ne("")]

In [7]:
# Character-level vocabulary: every distinct character that occurs in the
# sentences, sorted, then mapped to its position in that sorted order.
vocab_list = sorted({char for sentence in df["sentence"] for char in sentence})
vocab_dict = {char: index for index, char in enumerate(vocab_list)}
vocab_dict

{' ': 0,
 'ঁ': 1,
 'ং': 2,
 'ঃ': 3,
 'অ': 4,
 'আ': 5,
 'ই': 6,
 'ঈ': 7,
 'উ': 8,
 'ঊ': 9,
 'ঋ': 10,
 'এ': 11,
 'ঐ': 12,
 'ও': 13,
 'ঔ': 14,
 'ক': 15,
 'খ': 16,
 'গ': 17,
 'ঘ': 18,
 'ঙ': 19,
 'চ': 20,
 'ছ': 21,
 'জ': 22,
 'ঝ': 23,
 'ঞ': 24,
 'ট': 25,
 'ঠ': 26,
 'ড': 27,
 'ঢ': 28,
 'ণ': 29,
 'ত': 30,
 'থ': 31,
 'দ': 32,
 'ধ': 33,
 'ন': 34,
 'প': 35,
 'ফ': 36,
 'ব': 37,
 'ভ': 38,
 'ম': 39,
 'য': 40,
 'র': 41,
 'ল': 42,
 'শ': 43,
 'ষ': 44,
 'স': 45,
 'হ': 46,
 'া': 47,
 'ি': 48,
 'ী': 49,
 'ু': 50,
 'ূ': 51,
 'ৃ': 52,
 'ে': 53,
 'ৈ': 54,
 'ো': 55,
 'ৌ': 56,
 '্': 57,
 'ৎ': 58,
 'ড়': 59,
 'ঢ়': 60,
 'য়': 61,
 '\u200d': 62}

In [8]:
# Rename the space token to "|" (the word delimiter the tokenizer is created
# with), keeping its id. Guarded so that re-running this cell is a no-op
# instead of a KeyError on the already-removed " " key.
if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")

# Append the special tokens once; setdefault keeps their ids stable on re-runs.
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
len(vocab_dict)

65

In [9]:
# Fold 0 is the validation split; every other row goes to training.
is_valid_fold = df["fold"] == 0
valid_df = df[is_valid_fold].reset_index(drop=True)
train_df = df[~is_valid_fold].reset_index(drop=True)

train_dataset = df_to_dataset(train_df)
valid_dataset = df_to_dataset(valid_df)

In [10]:
import json
# Serialise the vocabulary to vocab.json in the current directory.
with open("vocab.json", "w") as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

In [11]:
from transformers import Wav2Vec2CTCTokenizer
from transformers import SeamlessM4TFeatureExtractor
from transformers import Wav2Vec2BertProcessor

# CTC tokenizer loaded from the current directory, with the special tokens
# named explicitly.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "./",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor taken from the facebook/w2v-bert-2.0 checkpoint.
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")

# Single object wrapping both the feature extractor and the tokenizer.
processor = Wav2Vec2BertProcessor(
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

In [12]:
# Name used for the Hub repository the processor is pushed to and as the
# Trainer output directory.
repo_name = "w2v-bert-2.0-bd-regional-dialects"

In [13]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face token from the Kaggle secret named "hf_token" so it is
# never hard-coded in the notebook, then log in with it.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("hf_token")

login(token=hf_token)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [14]:
# Upload the processor to the Hub repository named by `repo_name`.
processor.push_to_hub(repo_name)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Umong/w2v-bert-2.0-bd-regional-dialects/commit/75f5a4436966d01b4f4d3f99e7e5f7424008b391', commit_message='Upload processor', commit_description='', oid='75f5a4436966d01b4f4d3f99e7e5f7424008b391', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
def prepare_dataset(batch):
    """Add the model-ready fields to one example and return it.

    Reads ``batch["audio"]`` (its "array" and "sampling_rate" entries) and
    ``batch["sentence"]``; writes:

    * ``input_features`` - first element of the processor's input features
      for the audio,
    * ``input_length``   - ``len()`` of those features,
    * ``labels``         - the processor's ``input_ids`` for the sentence.
    """
    audio = batch["audio"]
    encoded_audio = processor(audio["array"], sampling_rate=audio["sampling_rate"])

    features = encoded_audio.input_features[0]
    batch["input_features"] = features
    batch["input_length"] = len(features)

    batch["labels"] = processor(text=batch["sentence"]).input_ids
    return batch

In [16]:
# Run prepare_dataset over both splits, dropping all of their original columns
# so only the fields prepare_dataset adds remain.
train_dataset = train_dataset.map(
    prepare_dataset,
    remove_columns=train_dataset.column_names,
)
valid_dataset = valid_dataset.map(
    prepare_dataset,
    remove_columns=valid_dataset.column_names,
)

Map:   0%|          | 0/10781 [00:00<?, ? examples/s]

Map:   0%|          | 0/2697 [00:00<?, ? examples/s]

In [17]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Collate examples into one batch, padding inputs and labels separately.

    Attributes:
        processor: Object whose ``pad`` method is used for both the input
            features and the label ids.
        padding: Padding argument forwarded unchanged to ``processor.pad``.
    """

    processor: Wav2Vec2BertProcessor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Inputs and labels have different lengths and need different padding,
        # so they are separated first and padded by two independent calls.
        audio_inputs = []
        label_inputs = []
        for example in features:
            audio_inputs.append({"input_features": example["input_features"]})
            label_inputs.append({"input_ids": example["labels"]})

        pad_kwargs = {"padding": self.padding, "return_tensors": "pt"}
        batch = self.processor.pad(audio_inputs, **pad_kwargs)
        padded_labels = self.processor.pad(labels=label_inputs, **pad_kwargs)

        # Wherever the label attention mask is not 1 the position is padding;
        # write -100 there so those positions are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [18]:
# Collator instance handed to the Trainer as `data_collator`.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [19]:
import evaluate

# Word-error-rate metric used by compute_metrics.
wer_metric = evaluate.load("wer")

2024-04-06 15:33:14.894384: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-06 15:33:14.894482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-06 15:33:15.017539: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [20]:
import numpy as np

def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits; the argmax over the
            last axis is taken as the predicted token id) and ``label_ids``
            (an array in which -100 marks positions to ignore).

    Returns:
        ``{"wer": value}`` where ``value`` is the result of
        ``wer_metric.compute`` on the decoded predictions and references.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Substitute the pad token for -100 in a NEW array. Assigning into
    # pred.label_ids would silently modify the caller's labels in place.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when decoding the reference labels
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [21]:
from transformers import Wav2Vec2BertForCTC

# Load facebook/w2v-bert-2.0 as a CTC model, overriding the config values below.
model = Wav2Vec2BertForCTC.from_pretrained(
    "facebook/w2v-bert-2.0",
    # Every dropout / masking / layerdrop probability is set to zero.
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.0,
    layerdrop=0.0,
    ctc_loss_reduction="mean",
    # The adapter weights are not in the checkpoint; the load warning reports
    # them as newly initialized.
    add_adapter=True,
    # Keep the model config in sync with the tokenizer built above.
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

Some weights of Wav2Vec2BertForCTC were not initialized from the model checkpoint at facebook/w2v-bert-2.0 and are newly initialized: ['adapter.layers.0.ffn.intermediate_dense.bias', 'adapter.layers.0.ffn.intermediate_dense.weight', 'adapter.layers.0.ffn.output_dense.bias', 'adapter.layers.0.ffn.output_dense.weight', 'adapter.layers.0.ffn_layer_norm.bias', 'adapter.layers.0.ffn_layer_norm.weight', 'adapter.layers.0.residual_conv.bias', 'adapter.layers.0.residual_conv.weight', 'adapter.layers.0.residual_layer_norm.bias', 'adapter.layers.0.residual_layer_norm.weight', 'adapter.layers.0.self_attn.linear_k.bias', 'adapter.layers.0.self_attn.linear_k.weight', 'adapter.layers.0.self_attn.linear_out.bias', 'adapter.layers.0.self_attn.linear_out.weight', 'adapter.layers.0.self_attn.linear_q.bias', 'adapter.layers.0.self_attn.linear_q.weight', 'adapter.layers.0.self_attn.linear_v.bias', 'adapter.layers.0.self_attn.linear_v.weight', 'adapter.layers.0.self_attn_conv.bias', 'adapter.layers.0.self_

In [22]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints are written; also pushed to the Hub.
    output_dir=repo_name,
    push_to_hub=True,
    report_to="none",
    # Batching: 4 examples per device x 8 accumulation steps per optimizer
    # step, with examples grouped by the "input_length" column.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    group_by_length=True,
    length_column_name="input_length",
    # Evaluate, log and save once per epoch, keeping a single checkpoint.
    # (Step-based alternative: set the three strategies to "steps" and give
    # eval_steps / logging_steps / save_steps.)
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Optimisation schedule.
    num_train_epochs=6,
    learning_rate=5e-5,
    warmup_steps=500,
    # Memory / speed trade-offs.
    gradient_checkpointing=True,
    fp16=True,
)

In [23]:
from transformers import Trainer

# Wire the model, data, collator and metric into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor (not the text tokenizer) is passed in this slot.
    tokenizer=processor.feature_extractor,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [24]:
from timeout_timer import timeout, TimeoutInterrupt

# Time limit for one call to train(), in hours; multiplied by 3600 for the
# @timeout decorator (presumably seconds - confirm against timeout_timer).
HOURS = 11

@timeout(3600*HOURS)
def train():
    """Train (resuming when possible) and push the result to the Hub.

    Looks for the most recent checkpoint in the trainer's output directory.
    If one exists, training resumes from it; otherwise training starts from
    scratch. Passing ``resume_from_checkpoint=True`` unconditionally would
    raise ``ValueError`` on a first run, when no checkpoint has been written
    or copied into the output directory yet.
    """
    # Local imports keep this cell self-contained.
    import os
    from transformers.trainer_utils import get_last_checkpoint

    output_dir = trainer.args.output_dir
    # get_last_checkpoint lists the directory, so guard against it not existing.
    last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None

    trainer.train(resume_from_checkpoint=last_checkpoint)
    trainer.push_to_hub()

In [25]:
# Run training. A TimeoutInterrupt (presumably raised by the @timeout decorator
# on train() once its limit is reached) is caught so the cell ends with a
# message instead of a traceback.
try:
    train()
except TimeoutInterrupt:
    print("Training Paused...")



Epoch,Training Loss,Validation Loss,Wer
4,1.2426,1.348575,0.672418
5,1.0975,1.35199,0.652426
6,0.9929,1.310954,0.638432


