# Sanity Check
To prevent confusion further down in the notebook, a simple sanity check keeps track of what has been run and what has not.

First of all, make sure a GPU is available (we do not train on the CPU).

In [1]:
# Registry of completed notebook steps: a step counts as done once its key is
# present (only membership is tested, the stored value is never inspected).
sanity_prop = {}

def sanity_check(keys):
  """Raise if any key in `keys` has not been registered in `sanity_prop`.

  Keys are examined in order; the exception names the first missing one.
  """
  for required in keys:
    if required in sanity_prop:
      continue
    raise Exception(f"Requirement: '{required}' not satisfied!")

def sanity_check_not_added(keys):
  """Raise if any key in `keys` is already registered in `sanity_prop`.

  Keys are examined in order; the exception names the first one found.
  """
  for forbidden in keys:
    if forbidden not in sanity_prop:
      continue
    raise Exception(f"Requirement: '{forbidden}' already satisfied!")

# Requirement keys registered by the first cells of this notebook:
# - should-use-gpu
# - settings-set
# - libraries-are-installed
# - drive-must-be-mounted
# - logged-in-to-huggingface

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if not gpu_info.find('failed') >= 0 and "command not found" not in gpu_info:
  sanity_prop['should-use-gpu'] = True

# Settings

In [2]:
sanity_check(['should-use-gpu'])

# Settings
#
# Model: exactly one of the two checkpoint groups below is active.
#
# (a) Multilingual checkpoint -- currently disabled
# pretrained_model = "openai/whisper-small"
# pretrained_model_language = "Swedish"
# pretrained_model_language_training = "Swedish"
# pretrained_model_task = "transcribe"
# pretrained_model_sampling_rate = 16000  # Hz
#
# (b) English checkpoint -- currently active
pretrained_model = "openai/whisper-small.en"
pretrained_model_task = "transcribe"
pretrained_model_language = "Swedish"
pretrained_model_language_training = "Swedish"
pretrained_model_sampling_rate = 16000  # Hz

# Task: dataset repository that goes with the active checkpoint.
# (a) Multilingual checkpoint
# task_repo_dataset = "GroupSix/common-voice-sv"
# (b) English checkpoint
task_repo_dataset = "GroupSix/common-voice-en-sv"

# Mark the settings step as done.
sanity_prop['settings-set'] = True

# Install Libraries
Install necessary libraries

In [3]:
sanity_check(['should-use-gpu', 'settings-set'])
sanity_check_not_added(['libraries-are-installed'])  # Don't rerun

# Install updated libaries
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install accelerate==0.23.0
!pip install datasets transformers[sentencepiece]

sanity_prop['libraries-are-installed'] = True

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-oodxrs6a
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-oodxrs6a
  Resolved https://github.com/huggingface/transformers to commit 238d2e3c44366aba9dc5c770c95475765a6725cb
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.37.0.dev0-py3-none-any.whl size=8269614 sha256=7995820aee7f4a0bf2b839c810a534b3d30033e51ab94c9e07a73efe2721f54e
  Stored in directory: /tmp/pip-ephem-wheel-cache-29hh8w1n/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformers

# Mount Google Drive
Mount Google Drive to save model and checkpoints.

In [4]:
sanity_check(['should-use-gpu', 'settings-set', 'libraries-are-installed'])

import os
from google.colab import drive

# Mount Drive first; the model folder below lives underneath the mount point.
drive.mount('/content/drive')

# Model path
# - Mårten
model_base = "/content/drive/MyDrive/id2223-lab"
model_path = f"{model_base}/whisper-small-sv"

# - Maria
# model_base = "/content/drive/MyDrive/id2223-lab"
# model_path = model_base + "/whisper-small-sv"

# The requirement is registered only when the base folder can be seen,
# which is used here as the sign that the mount worked.
if os.path.exists(model_base):
  sanity_prop['drive-must-be-mounted'] = True

Mounted at /content/drive


# Hugging Face Login
Log in to Hugging Face to save dataset when it is processed.

In [13]:
# Guard: GPU, settings, libraries and Drive must all be in place before logging in.
sanity_check(['should-use-gpu', 'settings-set', 'libraries-are-installed', 'drive-must-be-mounted'])

# Log in to HF
from huggingface_hub import notebook_login
notebook_login()

# NOTE(review): the flag is set as soon as notebook_login() returns; nothing in
# this cell checks that a token was actually entered and accepted -- confirm
# the login succeeded before relying on this requirement.
sanity_prop['logged-in-to-huggingface'] = True

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Training

## Processor

In [6]:
sanity_check(['should-use-gpu', 'settings-set', 'libraries-are-installed', 'drive-must-be-mounted', 'logged-in-to-huggingface'])

from transformers import WhisperTokenizer
from transformers import WhisperProcessor

# A stand-alone feature extractor is not loaded here (kept for reference):
# from transformers import WhisperFeatureExtractor
# feature_extractor = WhisperFeatureExtractor.from_pretrained(pretrained_model)

# Tokenizer for the checkpoint, language and task chosen in the settings cell
tokenizer = WhisperTokenizer.from_pretrained(
    pretrained_model,
    language=pretrained_model_language,
    task=pretrained_model_task,
)

# Processor loaded with the same checkpoint, language and task
processor = WhisperProcessor.from_pretrained(
    pretrained_model,
    language=pretrained_model_language,
    task=pretrained_model_task,
)

sanity_prop['whisper-loaded'] = True

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

## Load Processed Data

In [7]:
sanity_check([
    'should-use-gpu', 'settings-set', 'libraries-are-installed', 'drive-must-be-mounted',
    'logged-in-to-huggingface', 'whisper-loaded',
])

from datasets import load_dataset

# Load our processed dataset; the repository name comes from the settings cell.
common_voice = load_dataset(path=task_repo_dataset)

sanity_prop['dataset-downloaded'] = True

Downloading readme:   0%|          | 0.00/455 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/58.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/65.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/71.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/73.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/65.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/81.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/71.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/66.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/65.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/61.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/71.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/63.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/71.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/81.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/73.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/79.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/71.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/12588 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5141 [00:00<?, ? examples/s]

## Set up Collator and Metrics
Use WER for evaluation

In [8]:
# Guard: every earlier step, up to and including the dataset download, must have run.
sanity_check(['should-use-gpu', 'settings-set', 'libraries-are-installed', 'drive-must-be-mounted', 'logged-in-to-huggingface',
              'whisper-loaded', 'dataset-downloaded'])

# Data collator
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate individual examples into one padded batch.

    Each example is read for its "input_features" and "labels" entries. The two
    are padded separately, through `processor.feature_extractor.pad` and
    `processor.tokenizer.pad`, because they have different lengths and need
    different padding methods.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each with a `pad` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded audio batch with a "labels" tensor added to it."""
        # Audio side: wrap each example and let the feature extractor build the tensors.
        audio_items = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: the tokenized label sequences are padded by the tokenizer.
        text_tokenizer = self.processor.tokenizer
        label_items = [{"input_ids": example["labels"]} for example in features]
        padded_labels = text_tokenizer.pad(label_items, return_tensors="pt")

        # Wherever the attention mask is not 1 the position is padding; it is
        # overwritten with -100 so that the loss ignores it.
        label_ids = padded_labels["input_ids"]
        is_padding = padded_labels.attention_mask.ne(1)
        labels = label_ids.masked_fill(is_padding, -100)

        # If every row starts with the bos token (added by an earlier tokenization
        # step), drop that column here because it is appended again later.
        starts_with_bos = (labels[:, 0] == text_tokenizer.bos_token_id).all().cpu().item()
        if starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Build the single collator instance around the processor loaded earlier
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor)

# Word error rate (WER) is the evaluation metric
import evaluate
metric = evaluate.load("wer")

# Metric callback: word error rate computed from predictions and labels
def compute_metrics(pred):
    """Return `{"wer": ...}` with the word error rate scaled by 100.

    `pred` must expose `.predictions` and `.label_ids`. Note that
    `pred.label_ids` is modified in place: every -100 entry is overwritten
    with the tokenizer's pad token id before decoding.
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # -100 entries are swapped for the pad_token_id so they can be decoded
    ignored = reference_ids == -100
    reference_ids[ignored] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    hypotheses = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    score = metric.compute(predictions=hypotheses, references=references)
    return {"wer": 100 * score}

sanity_prop['collator-and-metric'] = True

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

## Load Pre-Trained Checkpoint

In [9]:
sanity_check([
    'should-use-gpu', 'settings-set', 'libraries-are-installed', 'drive-must-be-mounted',
    'logged-in-to-huggingface', 'whisper-loaded', 'dataset-downloaded', 'collator-and-metric',
])

from transformers import WhisperForConditionalGeneration

# Start from the pre-trained weights selected in the settings cell
model = WhisperForConditionalGeneration.from_pretrained(pretrained_model)

# Override arguments
# The cache is switched off because it does not go together with gradient
# checkpointing, which is used for training (see the link); switch it back on
# for inference.
# https://discuss.huggingface.co/t/why-is-use-cache-incompatible-with-gradient-checkpointing/18811/5
model.config.use_cache = False
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

sanity_prop['pretrained-checkpoint-loaded'] = True

config.json:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

## Training Setup

In [10]:
sanity_check(['should-use-gpu', 'settings-set', 'libraries-are-installed', 'drive-must-be-mounted', 'logged-in-to-huggingface',
              'whisper-loaded', 'dataset-downloaded', 'collator-and-metric', 'pretrained-checkpoint-loaded'])

from transformers import Seq2SeqTrainingArguments

# Training hyper-parameters.
# output_dir reuses `model_path` from the Drive cell instead of repeating the
# literal path: the training cell looks for existing checkpoints under
# `model_path`, so both must always point at the same folder.
training_args = Seq2SeqTrainingArguments(
    output_dir=model_path,                                                      # Drive dir
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,                                              # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,                                                         # Maybe we should tune this then?
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant':True},                       # Added to remove error
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,                                                            # checkpoint every 1000 steps ...
    eval_steps=1000,                                                            # ... and evaluate at the same steps
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,                                                    # lower WER is better
    # push_to_hub=True,                                                         # Don't push to hub, we use drive for checkpoints
)

from transformers import Seq2SeqTrainer

# The trainer ties together model, data splits, collator and WER metric.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor
)

sanity_prop['trainer-settings-set'] = True

## Run Training

In [11]:
sanity_check(['should-use-gpu', 'settings-set', 'libraries-are-installed', 'drive-must-be-mounted', 'logged-in-to-huggingface',
              'whisper-loaded', 'dataset-downloaded', 'collator-and-metric', 'pretrained-checkpoint-loaded',
              'trainer-settings-set'])

# Look for the first checkpoint in the trainer's own output directory, so the
# existence test and `resume_from_checkpoint=True` always refer to the same
# folder (previously the test used `model_path`, a separately defined path).
first_checkpoint = os.path.join(training_args.output_dir, "checkpoint-1000")

# Resume training if at least "checkpoint-1000" is saved, otherwise start from beginning (and overwrite)
if os.path.exists(first_checkpoint):
  trainer.train(resume_from_checkpoint=True)
else:
  # Save once before training
  processor.save_pretrained(training_args.output_dir)
  trainer.train()

# Make sure to save model
trainer.save_model()

sanity_prop['training-complete'] = True

There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


Step,Training Loss,Validation Loss,Wer
4000,0.0729,0.601043,49.307443


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


## Upload Trainer, Tokenizer and Model

In [14]:
sanity_check(['should-use-gpu', 'settings-set', 'libraries-are-installed', 'drive-must-be-mounted', 'logged-in-to-huggingface',
              'whisper-loaded', 'dataset-downloaded', 'collator-and-metric', 'pretrained-checkpoint-loaded',
              'trainer-settings-set', 'training-complete'])
sanity_check(['block-here-for-now'])  # TODO: Remove this to unblock and upload the model

# Push
hub_repo_id = "GroupSix/whisper-small-sv"

# Trainer.push_to_hub() takes a commit message as its first positional
# argument, not a repository id, so the target repository is set through
# `hub_model_id` and the default commit message is used.
# NOTE(review): this assumes the trainer has not already initialised a Hub
# repository from an earlier push in this session -- confirm.
trainer.args.hub_model_id = hub_repo_id
trainer.push_to_hub()

# For the tokenizer and the model the first argument is the repository id.
tokenizer.push_to_hub(hub_repo_id)
model.push_to_hub(hub_repo_id)

Upload 9 LFS files:   0%|          | 0/9 [00:00<?, ?it/s]

events.out.tfevents.1702682314.8a69d2194fc0.597.0:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

events.out.tfevents.1702651037.3095587ab87a.4304.0:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

events.out.tfevents.1702667039.0a8a8d0b6c69.1065.0:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

events.out.tfevents.1702328072.f6c49bbfeb49.479.0:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

events.out.tfevents.1702714843.a865efd1b0eb.257.0:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

events.out.tfevents.1702736529.2f782b30333b.362.0:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

events.out.tfevents.1702743492.84acc10a2b68.571.0:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/GroupSix/whisper-small-sv/commit/ea7464bcf2c8b6cf1262a0aa2429c798525a6c15', commit_message='Upload WhisperForConditionalGeneration', commit_description='', oid='ea7464bcf2c8b6cf1262a0aa2429c798525a6c15', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
# Guard: same requirements as the upload cell, i.e. training must be complete.
sanity_check(['should-use-gpu', 'settings-set', 'libraries-are-installed', 'drive-must-be-mounted', 'logged-in-to-huggingface',
              'whisper-loaded', 'dataset-downloaded', 'collator-and-metric', 'pretrained-checkpoint-loaded',
              'trainer-settings-set', 'training-complete'])
sanity_check(['block-here-for-now'])  # TODO: Remove this to unblock this cell

# Print the module structure of the model
print(model)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f