## initial checks

Check that we are using a GPU

In [None]:
# Run nvidia-smi through the shell; IPython captures its output as a list of lines.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# Output containing "failed" is treated as "no GPU attached"; otherwise show the report.
print('Not connected to a GPU' if 'failed' in gpu_info else gpu_info)

Sun Dec 10 01:28:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    22W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

Install dependencies

In [None]:
# Register the jonathonf/ffmpeg-4 PPA, refresh the package index, then install ffmpeg.
# `-y` answers the confirmation prompts so the cell runs unattended.
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

In [None]:
# Version specifiers must be quoted: unquoted, the shell treats `>` as an output
# redirect, so the constraint is silently dropped and a stray file such as
# `=2.6.1` is written to the working directory.
!pip install "datasets>=2.6.1"
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
# Written as 0.3.0 on purpose: "0.30" would be parsed as minor version 30,
# which no `evaluate` release satisfies.
!pip install "evaluate>=0.3.0"
!pip install jiwer
!pip install gradio
!pip install hopsworks

Log in to the Hugging Face Hub with read permission

In [None]:
from huggingface_hub import notebook_login

# Renders the Hugging Face login widget in the cell output; paste an access token there.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Resuming training

In [None]:
# Version specifiers must be quoted: unquoted, the shell treats `>` as an output
# redirect, so the constraint is silently dropped and a stray file such as
# `=2.6.1` is written to the working directory.
!pip install "datasets>=2.6.1"
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
# Written as 0.3.0 on purpose: "0.30" would be parsed as minor version 30,
# which no `evaluate` release satisfies. This also replaces the separate,
# unconstrained `pip install evaluate` that used to follow.
!pip install "evaluate>=0.3.0"
!pip install jiwer
!pip install gradio
!pip install hopsworks
!pip install accelerate -U
# The session sometimes needs to be restarted after these installs when
# starting from a checkpoint.

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


Imports

In [None]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor
import evaluate
from transformers import WhisperForConditionalGeneration
from transformers import Seq2SeqTrainingArguments
from datasets import load_from_disk
from transformers import Seq2SeqTrainer

Log in to the Hugging Face Hub with read and write permission

In [None]:
from huggingface_hub import notebook_login

# Renders the Hugging Face login widget; the training arguments below use
# push_to_hub=True, so the token entered here needs write access.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# NOTE(review): repeats the login from the previous cell — presumably a retry; confirm whether it is still needed.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

mount drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the checkpoints and the preprocessed
# dataset used below are read from paths under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


Change `glob_checkpoint_url` to the most recent model checkpoint.
`glob_resume_url` won't change unless you want to switch to a different model/repo.

In [None]:
# Training output directory on Drive; the processor files are also loaded from here.
glob_resume_url = "/content/drive/MyDrive/ID2223/swedish_m_2"
# Checkpoint to resume from. Derived from glob_resume_url so the two paths cannot
# drift apart when the repo changes; only the step number needs updating.
glob_checkpoint_url = f"{glob_resume_url}/checkpoint-1500"

Re-run some of the code (since the session timed out)

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech-to-text examples into one padded batch.

    Audio features and label token ids are padded separately (they differ in
    length and use different padding methods), via ``processor.feature_extractor``
    and ``processor.tokenizer`` respectively.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each with a `.pad(...)` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from per-example dicts holding "input_features" and "labels".

        Returns the padded audio batch with an added "labels" entry in which
        padded positions are set to -100.
        """
        # Audio side: wrap each example and let the feature extractor return torch tensors.
        audio_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: the tokenizer pads the token sequences to the longest one.
        token_items = []
        for example in features:
            token_items.append({"input_ids": example["labels"]})
        padded_tokens = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Positions the attention mask marks as padding become -100 so the loss ignores them.
        is_padding = padded_tokens.attention_mask.ne(1)
        labels = padded_tokens["input_ids"].masked_fill(is_padding, -100)

        # If every sequence already starts with the BOS token (added during an
        # earlier tokenization step), drop it here because it is appended later anyway.
        starts_with_bos = labels[:, 0] == self.processor.tokenizer.bos_token_id
        if starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

Load the processor

In [None]:
# The processor files are loaded from the training output directory.
processor_url = glob_resume_url

In [None]:
# Load the audio feature extractor, the tokenizer and the combined processor
# from the same directory; tokenizer and processor are configured for Swedish transcription.
feature_extractor = WhisperFeatureExtractor.from_pretrained(processor_url)
tokenizer = WhisperTokenizer.from_pretrained(processor_url, language="Swedish", task="transcribe")
processor = WhisperProcessor.from_pretrained(processor_url, language="Swedish", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


re-run some more

In [None]:
# Collator that pads audio features and label ids using the loaded processor.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Word error rate metric; compute_metrics below reports it as a percentage.
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate (in percent) for one evaluation pass.

    Args:
        pred: object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 marking ignored
            positions).

    Returns:
        dict with a single key ``"wer"`` holding 100 * WER.

    Relies on the module-level ``tokenizer`` and ``metric`` objects.
    """
    import numpy as np

    pad_id = tokenizer.pad_token_id

    # -100 is only a loss-masking marker and cannot be decoded, so swap it for
    # the pad token. np.where builds new arrays, leaving the caller's
    # `pred.label_ids` / `pred.predictions` untouched (the previous in-place
    # assignment overwrote the -100 markers in the caller's array).
    pred_ids = np.where(pred.predictions != -100, pred.predictions, pad_id)
    label_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

update to the most recent url

In [None]:
# The model weights are loaded from the checkpoint configured above.
checkpoint_model_url = glob_checkpoint_url

load the most recent model

In [None]:
# Load the model weights from the checkpoint directory.
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_model_url)
# Clear the forced decoder ids and the suppressed-token list on the config.
# NOTE(review): presumably so generation is not constrained by the values stored
# with the checkpoint — confirm against the generation config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

Re-run some more code, load dataset

In [None]:
# Training configuration. It must match the interrupted run so that resuming
# from the checkpoint continues the same schedule.
training_args = Seq2SeqTrainingArguments(
    # max_steps below takes precedence over this epoch count (the recorded run
    # reports "Num Epochs = 2" for 2,000 steps).
    num_train_epochs=1,
    # Same directory the checkpoints and processor are read from; uses the shared
    # setting instead of a second hard-coded copy so a repo change happens in one place.
    output_dir=glob_resume_url,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=250,
    max_steps=2000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    # Evaluate with generate() so WER is computed on decoded text.
    predict_with_generate=True,
    generation_max_length=225,
    # Save and evaluate on the same step interval.
    save_steps=250,
    eval_steps=250,
    logging_steps=25,
    report_to=["tensorboard"],
    # Keep the checkpoint with the lowest WER as the final model.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

# Reload the dataset that was previously saved to Drive; its "train" and "test"
# splits are used below.
common_voice_reloaded = load_from_disk('/content/drive/MyDrive/ID2223/common_voice')

# Assemble the trainer from the pieces defined in the cells above.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice_reloaded["train"],
    eval_dataset=common_voice_reloaded["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor is passed here; the recorded log shows its
    # preprocessor_config.json being saved alongside each checkpoint.
    tokenizer=processor.feature_extractor,
)

Enable info-level logging to confirm that training resumes from the checkpoint

In [None]:
import transformers
# Raise the transformers log level to INFO so the "Continuing training from
# checkpoint" messages appear in the training output.
transformers.logging.set_verbosity_info()

Resume training

In [None]:
# The trainer state is resumed from the same checkpoint the model weights were loaded from.
checkpoint_trainer_url = glob_checkpoint_url

In [None]:
# Resume training from the checkpoint; the recorded run continued from the saved
# global step (1500) and skipped the batches already seen.
trainer.train(resume_from_checkpoint=checkpoint_trainer_url)


Loading model from /content/drive/MyDrive/ID2223/swedish_m_2/checkpoint-1500.
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].
***** Running training *****
  Num examples = 12,360
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2,000
  Number of trainable parameters = 240,582,912
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 1500
  Will skip the first 0 epochs then the first 1500 batches in the first epoch.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
1750,0.114,0.28217,67.080745
2000,0.1082,0.278629,74.343255


***** Running Evaluation *****
  Num examples = 5069
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-1750
Configuration saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-1750/config.json
Configuration saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-1750/generation_config.json
Model weights saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-1750/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-1750/preprocessor_config.json
Feature extractor saved in /content/drive/MyDrive/ID2223/swedish_m_2/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 5069
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-2000
Configuration saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-checkpoint-2000/config.json
Configuration saved in /content/drive/MyDrive/ID2223/swedish_m_2/tmp-

TrainOutput(global_step=2000, training_loss=0.029645836114883424, metrics={'train_runtime': 8444.1157, 'train_samples_per_second': 1.895, 'train_steps_per_second': 0.237, 'total_flos': 4.61736640512e+18, 'train_loss': 0.029645836114883424, 'epoch': 1.29})