In [1]:
%env CUDA_VISIBLE_DEVICES=2

env: CUDA_VISIBLE_DEVICES=2


In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import os
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [4]:
model_type = "mms-1b-all"
model_name = f"facebook/{model_type}"

In [5]:
from transformers import Wav2Vec2ForCTC, AutoProcessor
processor = AutoProcessor.from_pretrained(model_name, language="bn", task="transcription")
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor.tokenizer.set_target_lang("ben")
model.load_adapter("ben")

In [6]:
import pandas as pd
import os
from torch.utils.data import Dataset
import librosa
from datasets import Audio

class AudioDataset(Dataset):
    def __init__(self, labels_df: pd.DataFrame, data_path: str, processor):
        self.labels_df = labels_df
        self.data_path = data_path
        self.processor = processor
    def __len__(self):
        return len(self.labels_df)
    def __getitem__(self,idx):
        row = self.labels_df.iloc[idx]
        path = os.path.join(self.data_path, row["id"]+".mp3")
        sentence = row["sentence"]
        with open(path, "rb") as f:
            speech = f.read()
            audio = Audio(sampling_rate=processor.feature_extractor.sampling_rate).decode_example({"path": path, "bytes": speech})

        example = processor(audio["array"], sampling_rate=16_000, return_tensors="pt", text=sentence)
        # example = processor(audio=audio["array"], sampling_rate=processor.feature_extractor.sampling_rate, text=sentence)
        # example["input_length"] = len(audio["array"]) // processor.feature_extractor.sampling_rate
        # speech, sr = librosa.load(path, sr=processor.feature_extractor.sampling_rate) 
#         print(speech.shape)
        return example

In [7]:
from datasets import DatasetDict
labels_df = pd.read_csv("bengaliai-speech/train.csv")

In [8]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(labels_df, test_size=0.1, random_state=42)

In [9]:
common_voice = DatasetDict()
train_dataset= AudioDataset(train_df, data_path="bengaliai-speech/train_mp3s", processor=processor)
test_dataset = AudioDataset(val_df, data_path="bengaliai-speech/train_mp3s", processor=processor)

In [10]:
common_voice = DatasetDict({"train":train_dataset,"test": test_dataset})

In [28]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __init__(self, processor, padding):
        self.processor = processor
        self.padding = padding

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [29]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)



In [30]:
import evaluate

metric = evaluate.load("wer")

In [31]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer()


def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # compute orthographic wer
    wer_ortho = metric.compute(predictions=pred_str, references=label_str)

    # compute normalised WER
    pred_str_norm = [normalizer(pred) for pred in pred_str]
    label_str_norm = [normalizer(label) for label in label_str]
    # filtering step to only evaluate the samples that correspond to non-zero references:
    pred_str_norm = [
        pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0
    ]
    label_str_norm = [
        label_str_norm[i]
        for i in range(len(label_str_norm))
        if len(label_str_norm[i]) > 0
    ]

    wer = metric.compute(predictions=pred_str_norm, references=label_str_norm)

    return {"wer_ortho": wer_ortho, "wer": wer}


In [32]:
# from datasets import load_dataset
# stream_data = load_dataset("mozilla-foundation/common_voice_13_0", "bn", split="test", streaming=True)
# stream_data = stream_data.cast_column("audio", Audio(sampling_rate=16000))
# bn_sample = next(iter(stream_data))["audio"]["array"]


In [33]:
from functools import partial

# disable cache during training since it's incompatible with gradient checkpointing
model.config.use_cache = False

# set language and task for generation and re-enable cache
model.generate = partial(
    model.generate, language="bengali", task="transcribe", use_cache=True
)

In [34]:
import tempfile
import shutil
import json
import kaggle
from pathlib import Path
from transformers import TrainerCallback
from transformers.trainer_callback import TrainerControl, TrainerState
from transformers.training_args import TrainingArguments

class KaggleUploader(TrainerCallback):
    def __init__(self, dataset_path: str, id: str, title: str, isPrivate: bool):
        self.api = kaggle.KaggleApi()
        self.api.authenticate()
        self.dataset_path = dataset_path
        self.meta_data = dict(
            id=id,
            title=title,
            isPrivate=isPrivate,
            licenses=[dict(name="other")]
        )
        self.previous_best = None

    def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        best_model_checkpoint = str(Path(state.best_model_checkpoint).name)
        if best_model_checkpoint != self.previous_best:
            print(best_model_checkpoint)
            self.upload_dataset_to_kaggle(self.dataset_path, best_model_checkpoint)
            self.previous_best = best_model_checkpoint
        
        return super().on_evaluate(args, state, control, **kwargs)
    
    def upload_dataset_to_kaggle(self, dataset_path, checkpoint_to_save: str):
        # latest_checkpoint = find_latest_checkpoint(dataset_path)
        checkpoint = os.path.join(dataset_path, checkpoint_to_save)

        version_notes = checkpoint_to_save
        # The checkpoint has multiple files that we don't need.
        # We only need the pytorch_model.bin file, config.json and generation_config.json
        # Copy these files to a temporary folder
        with tempfile.TemporaryDirectory() as temp_dir:
            # create a directory inside named "model"
            temp_model_dir = os.path.join(temp_dir, "model")
            os.mkdir(temp_model_dir)
            # copy the files
            for file in ["pytorch_model.bin", "config.json", "generation_config.json"]:
                shutil.copy(os.path.join(checkpoint, file), temp_model_dir)

            # create dataset-metadata.json inside the temporary directory
            with open(os.path.join(temp_model_dir, "dataset-metadata.json"), "w") as f:
                json.dump(self.meta_data, f)

            self.api.dataset_create_version(temp_model_dir, version_notes=version_notes, dir_mode="zip")

In [35]:
kaggle_uploader = KaggleUploader("/workspaces/HuggingFace-Audio-Course/whisper-large-v2-bn",
                                 id="kurokabe/whisper-large-bn",
                                 title="Whisper large bengali", 
                                 isPrivate=True)

In [36]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=f"./{model_type}-bn",  # name on the HF Hub
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    # warmup_steps=50,
    max_steps=100000,  # increase to 4000 if you have your own GPU or a Colab paid plan
    gradient_checkpointing=True,
    fp16=True,
    fp16_full_eval=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=1,
    # predict_with_generate=True,
    # generation_max_length=225,
    save_steps=20,
    eval_steps=20,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="eval_wer_ortho",
    greater_is_better=False,
    push_to_hub=True,
    dataloader_num_workers=64,
)


In [37]:
from transformers import Trainer

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
    # callbacks=[kaggle_uploader]
)

/workspaces/HuggingFace-Audio-Course/./mms-1b-all-bn is already a clone of https://huggingface.co/Kurokabe/mms-1b-all-bn. Make sure you pull the latest changes with `repo.git_pull()`.


In [38]:
# model.generation_config.max_length = 470

In [39]:
# while True:
#     try:
trainer.train()
    # except RuntimeError as e:
    #     print(e)
    #     continue



ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/transformers/feature_extraction_utils.py", line 175, in convert_to_tensors
    tensor = as_tensor(value)
  File "/opt/conda/lib/python3.10/site-packages/transformers/feature_extraction_utils.py", line 148, in as_tensor
    value = np.array(value)
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 1) + inhomogeneous part.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer_utils.py", line 707, in __call__
    return self.data_collator(features)
  File "/tmp/ipykernel_2972267/1377888547.py", line 37, in __call__
    batch = self.processor.pad(
  File "/opt/conda/lib/python3.10/site-packages/transformers/models/wav2vec2/processing_wav2vec2.py", line 123, in pad
    input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/transformers/feature_extraction_sequence_utils.py", line 224, in pad
    return BatchFeature(batch_outputs, tensor_type=return_tensors)
  File "/opt/conda/lib/python3.10/site-packages/transformers/feature_extraction_utils.py", line 78, in __init__
    self.convert_to_tensors(tensor_type=tensor_type)
  File "/opt/conda/lib/python3.10/site-packages/transformers/feature_extraction_utils.py", line 181, in convert_to_tensors
    raise ValueError(
ValueError: Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.


In [40]:
kwargs = {
    "model_name": f"{model_type} bn",  # a 'pretty' name for your model
    "language": "bn",
    "finetuned_from": model_name,
    "tasks": "automatic-speech-recognition",
}
trainer.push_to_hub(**kwargs)

Upload file pytorch_model.bin:   0%|          | 1.00/5.75G [00:00<?, ?B/s]

Upload file runs/Jul26_11-23-49_772b1c8fe2c9/events.out.tfevents.1690370644.772b1c8fe2c9:   0%|          | 1.0…

To https://huggingface.co/Kurokabe/whisper-large-v2-bn
   aaba1ef..89d3a3c  main -> main

To https://huggingface.co/Kurokabe/whisper-large-v2-bn
   89d3a3c..9260f9e  main -> main



'https://huggingface.co/Kurokabe/whisper-large-v2-bn/commit/89d3a3c3dfc5d544cfb386d3a8314b6c1d2f4951'