In [1]:
# Upgrade pip itself before installing anything else.
!pip install --upgrade pip
# Version specifiers are quoted so the shell does not treat `>` as an output
# redirection (unquoted, the constraint is dropped and a stray `=...` file is written).
# NOTE(review): the original asked for evaluate>=0.30, which pip reads as minor
# version 30; 0.3.0 is assumed to be the intended floor - confirm.
!pip install --upgrade "datasets>=2.6.1" transformers accelerate soundfile librosa "evaluate>=0.3.0" jiwer tensorboard gradio
# ffmpeg from the PPA used by the original cell.
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg
# Installed after the PyPI release so the development build of transformers is the one left in place.
!pip install git+https://github.com/huggingface/transformers
# `zipfile` is part of the Python standard library, so there is nothing to pip-install for it.
!pip install hopsworks

Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.1
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl.metadata (18 kB)
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl.metadata (2.6 kB)
Collecting tensorboard
  Downloading tensorboard-2.15.1-py3-none-any.whl.metadata (1.7 kB)
Collecting gradio
  Downloading gradio-4.8.0-py3-none-any.whl.metadata (17 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.m

In [11]:
from datasets import DatasetDict
from google.colab import drive

# Attach Google Drive to the Colab runtime so the path below is reachable.
drive.mount('/content/drive')

# Drive folder that holds the preprocessed dataset.
load_path = '/content/drive/My Drive/whisperfinetune_preprocessed_data'

# Read the saved splits back from disk; later cells index this by 'train' and 'test'.
dataset = DatasetDict.load_from_disk(load_path)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad a list of speech examples into one batch of tensors.

    Instances are callable: ``collator(features)`` returns the padded batch.

    Fields:
        processor: object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad`` (both are called with ``return_tensors="pt"``).
        decoder_start_token_id: token id to strip from the front of the labels
            when every row starts with it. When left as ``None`` the tokenizer's
            ``bos_token_id`` is used, which is the original behaviour.
            NOTE(review): for Whisper the labels presumably start with the
            model's decoder start token rather than ``bos_token_id``; passing
            ``model.config.decoder_start_token_id`` here may be what is
            wanted - confirm against the tokenizer in use.
    """

    processor: Any
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into a padded batch.

        Args:
            features: one dict per example with the keys ``"input_features"``
                and ``"labels"``.

        Returns:
            The batch produced by ``feature_extractor.pad`` with an added
            ``"labels"`` entry in which padded positions are set to -100.
        """
        # Audio inputs and labels are padded separately by their own components.
        audio_inputs = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # Pad the tokenized label sequences to the longest one in the batch.
        label_inputs = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_inputs, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        start_id = self.decoder_start_token_id
        if start_id is None:
            start_id = self.processor.tokenizer.bos_token_id

        # If every row begins with the start token, cut it here because it is
        # appended again later. The width check avoids indexing column 0 of
        # zero-length label rows.
        if (
            start_id is not None
            and labels.shape[1] > 0
            and (labels[:, 0] == start_id).all().cpu().item()
        ):
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch



In [12]:
from transformers import WhisperProcessor, WhisperTokenizer

# Processor for whisper-small, configured for English transcription.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="English", task="transcribe")

# Stand-alone tokenizer loaded with the same settings; compute_metrics decodes with it.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="English", task="transcribe")

# Batch collator built on the processor above.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)



preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

In [13]:
import evaluate

# Load the "wer" metric; compute_metrics calls metric.compute() on decoded strings.
metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [4]:
def compute_metrics(pred):
    """Compute the word error rate, as a percentage, for one set of predictions.

    Relies on the module-level ``tokenizer`` (for decoding) and ``metric``
    (for the WER computation).

    Args:
        pred: object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, with -100 marking positions
            to ignore).

    Returns:
        ``{"wer": value}`` where ``value`` is 100 times the metric's result.
    """
    pred_ids = pred.predictions
    # Work on a copy so the caller's label array keeps its -100 markers.
    label_ids = pred.label_ids.copy()

    # replace -100 with the pad_token_id so the ids can be decoded
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [5]:
# Google Drive is expected to be mounted already (the dataset-loading cell
# calls drive.mount), so it is not mounted again here.

# Drive folder used as the Trainer output directory and scanned for checkpoints.
save_path = '/content/drive/My Drive/whisperfinetune_modelcheckpoints'

In [6]:
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainerCallback
import os

# Directory scanned for saved checkpoints; same folder as the training output (save_path).
checkpoint_directory = save_path

# Function to find the latest checkpoint
def get_latest_checkpoint(checkpoint_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` folder.

    Only sub-directories named ``checkpoint-<digits>`` are considered, and the
    one with the largest step number wins. Other folders (for example a
    ``logs`` directory written into the same location) are ignored, and
    modification times play no part in the choice.

    Args:
        checkpoint_dir: directory expected to contain checkpoint folders.

    Returns:
        The joined path of the latest checkpoint folder, or ``None`` when
        ``checkpoint_dir`` is not a directory or holds no checkpoint folders.
    """
    prefix = "checkpoint-"
    if not os.path.isdir(checkpoint_dir):
        return None

    best_step, best_path = -1, None
    for entry in os.listdir(checkpoint_dir):
        step_text = entry[len(prefix):]
        if not entry.startswith(prefix) or not step_text.isdecimal():
            continue
        candidate = os.path.join(checkpoint_dir, entry)
        step = int(step_text)
        if step > best_step and os.path.isdir(candidate):
            best_step, best_path = step, candidate
    return best_path

# Look for an existing checkpoint in the output folder.
latest_checkpoint = get_latest_checkpoint(checkpoint_directory)

# Choose where the weights come from: the checkpoint if there is one,
# otherwise the published whisper-small weights.
if not latest_checkpoint:
    print("No checkpoint found, starting from scratch")
    model_source = "openai/whisper-small"
else:
    print(f"Resuming from the latest checkpoint: {latest_checkpoint}")
    model_source = latest_checkpoint

model = WhisperForConditionalGeneration.from_pretrained(model_source)

Resuming from the latest checkpoint: /content/drive/My Drive/whisperfinetune_modelcheckpoints/checkpoint-4000


In [7]:
# Reset two fields on the loaded model's config: no forced decoder ids and an
# empty suppress-token list.
# NOTE(review): presumably so generation is not constrained by the checkpoint's
# defaults - confirm.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [8]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (renders a widget in the notebook).
# NOTE(review): presumably required because training is configured with push_to_hub=True - confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # --- output locations / reporting ---
    output_dir=save_path,  # the recorded Hub push went to a repo named after this folder
    logging_dir='/content/drive/My Drive/whisperfinetune_modelcheckpoints/logs',
    report_to=["tensorboard"],  # NOTE(review): flagged by the author as not working - unresolved
    logging_steps=25,
    push_to_hub=True,
    # --- schedule ---
    # Per the TrainingArguments documentation a positive max_steps takes
    # precedence over num_train_epochs, so the step budget is what applies.
    max_steps=4000,
    num_train_epochs=1,
    warmup_steps=500,
    learning_rate=1e-5,
    # --- batching / memory ---
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    per_device_eval_batch_size=8,
    gradient_checkpointing=True,
    fp16=True,
    # --- evaluation and checkpointing (both every 1000 steps) ---
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    predict_with_generate=True,
    generation_max_length=225,
    # --- best-model selection: lower WER is better ---
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

In [14]:
from transformers import Seq2SeqTrainer

# Wire the model, data splits, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    # The feature extractor (not the text tokenizer) is what gets passed here.
    tokenizer=processor.feature_extractor,
)

In [15]:
# Write the processor files into the training output directory, next to the checkpoints.
processor.save_pretrained(training_args.output_dir)

In [16]:
# Resume from the discovered checkpoint when there is one; otherwise pass None.
# NOTE(review): the recorded run resumed from checkpoint-4000 while max_steps is
# 4000, and its output shows it finishing in about 41 seconds - raise max_steps
# if further training is intended.
trainer.train(resume_from_checkpoint=latest_checkpoint or None)


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=4001, training_loss=2.753051418938642e-08, metrics={'train_runtime': 41.1581, 'train_samples_per_second': 1554.979, 'train_steps_per_second': 97.186, 'total_flos': 1.83439309713408e+19, 'train_loss': 2.753051418938642e-08, 'epoch': 41.68})

In [17]:
# Mount Google Drive again; the recorded output reports it was already mounted,
# so this repeats the mount done in the dataset-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
# Model-card metadata passed to trainer.push_to_hub(**kwargs).
kwargs = {
    "dataset_tags": "SingaporeASR",
    "dataset": "SingaporeASR",  # a 'pretty' name for the training dataset
    "dataset_args": "config: Eng, split: test",
    # The Hub expects a lowercase ISO 639 language code in model-card metadata.
    # NOTE(review): the previous value "Eng" is the likely cause of the
    # BadRequestError recorded for the push that used these kwargs - confirm.
    "language": "en",
    "model_name": "whisper-small-singapore",  # a 'pretty' name for our model
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
    "tags": "hf-asr-leaderboard",
}

In [20]:
# Push to the Hub with the model-card metadata defined in `kwargs`.
# The recorded run of this cell ended in a BadRequestError (see output below).
trainer.push_to_hub(**kwargs)


BadRequestError: ignored

In [19]:
# Save the model through the trainer, then push to the Hub without extra
# model-card metadata; the recorded output shows this push succeeding.
trainer.save_model()
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1702088165.f763a80c68af.632.0:   0%|          | 0.00/5.68k [00:00<?, ?B/s]

'https://huggingface.co/Mompansy/whisperfinetune_modelcheckpoints/tree/main/'