## Training Pipeline

### Code only for Colab

In [None]:
# Check if connected to a GPUa
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Install dependencies
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install hopsworks

## Connect to different platforms

Find your Hugging Face authentication token [here](https://huggingface.co/settings/tokens):

In [None]:
# Log in to the Hugging Face Hub from within the notebook; the trainer
# below is configured with push_to_hub=True
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Mount Google Drive at /content/gdrive (only available on Colab)
from google.colab import drive
drive.mount('/content/gdrive')

## Load pre-processed dataset

If you run this code locally, make sure to download the dataset from Google Drive and place it at the right location to avoid repeated downloads.

In [None]:
from datasets import load_from_disk

# Local copies of the pre-processed datasets; these are the ones loaded below
common_voice_local_path = "./common_voice"
NST_local_path = "./NST"

# Locations of the same datasets on the mounted Google Drive (not read by
# this cell; kept for reference)
common_voice_remote_path = "/content/gdrive/My Drive/SML/lab2/common_voice"
NST_remote_path = "/content/gdrive/My Drive/SML/lab2/NST"

# Load both datasets from their local directories
common_voice = load_from_disk(common_voice_local_path)
NST = load_from_disk(NST_local_path)

## Define a Data Collator

The data collator for a sequence-to-sequence speech model is unique in the sense that it
treats the `input_features` and `labels` independently: the `input_features` must be
handled by the feature extractor and the `labels` by the tokenizer.

The `input_features` are already padded to 30s and converted to a log-Mel spectrogram
of fixed dimension by action of the feature extractor, so all we have to do is convert the `input_features`
to batched PyTorch tensors. We do this using the feature extractor's `.pad` method with `return_tensors="pt"`.

The `labels` on the other hand are un-padded. We first pad the sequences
to the maximum length in the batch using the tokenizer's `.pad` method. The padding tokens 
are then replaced by `-100` so that these tokens are **not** taken into account when 
computing the loss. We then cut the BOS token from the start of the label sequence as we 
append it later during training.

We can leverage the `WhisperProcessor` loaded in the cell below to perform both the
feature extractor and the tokenizer operations:

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Load the Whisper processor for the "openai/whisper-small" checkpoint,
# configured for Swedish transcription. Its feature extractor and tokenizer
# are used by the data collator and by compute_metrics.
from transformers import WhisperProcessor
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Swedish", task="transcribe")

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech examples into one batch of audio features and labels.

    The audio features and the label token ids are padded separately: the
    former through ``processor.feature_extractor.pad`` and the latter through
    ``processor.tokenizer.pad``. Padded label positions are set to -100.
    """

    # Object exposing `feature_extractor` and `tokenizer` attributes
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch for ``features``.

        Each element of ``features`` must provide ``"input_features"`` and
        ``"labels"``. The returned batch is whatever the feature extractor's
        ``pad`` produced, with a ``"labels"`` entry added.
        """
        # Audio side: wrap each example and let the feature extractor
        # return batched torch tensors.
        audio_items = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: the tokenizer pads the token id sequences of the batch.
        tokenizer = self.processor.tokenizer
        label_items = [{"input_ids": example["labels"]} for example in features]
        labels_batch = tokenizer.pad(label_items, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; mark them
        # with -100 so they are ignored by the loss.
        is_padding = labels_batch.attention_mask.ne(1)
        labels = labels_batch["input_ids"].masked_fill(is_padding, -100)

        # When every sequence starts with the BOS token (added by an earlier
        # tokenization step), drop it here because it is appended again later.
        every_row_starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all()
        if every_row_starts_with_bos.cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance handed to the trainer; it pads with the processor loaded above
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## Metrics

We'll use the word error rate (WER) metric, the 'de-facto' metric for assessing 
ASR systems. For more information, refer to the WER [docs](https://huggingface.co/metrics/wer). We'll load the WER metric from 🤗 Evaluate.

We then simply have to define a function that takes our model 
predictions and returns the WER metric. This function, called
`compute_metrics`, first replaces `-100` with the `pad_token_id`
in the `label_ids` (undoing the step we applied in the 
data collator to ignore padded tokens correctly in the loss).
It then decodes the predicted and label ids to strings. Finally,
it computes the WER between the predictions and reference labels:

In [None]:
import evaluate
# Load the word error rate (WER) metric; compute_metrics below uses it
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Return the word error rate, in percent, for a set of predictions.

    ``pred`` must expose ``predictions`` and ``label_ids``. Both are decoded
    to text with the processor's tokenizer (special tokens skipped) and then
    compared with the WER metric.

    Returns:
        A dict with the single key ``"wer"``.
    """
    tokenizer = processor.tokenizer
    predicted_ids = pred.predictions
    reference_ids = pred.label_ids

    # Undo the -100 masking applied by the data collator so the labels can
    # be decoded. NOTE: this writes into `pred.label_ids` in place.
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    # Decode ids to plain text, dropping special tokens
    predicted_text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    reference_text = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    # The metric result is scaled by 100 to express it as a percentage
    score = metric.compute(predictions=predicted_text, references=reference_text)
    return {"wer": 100 * score}

## Model

For more detail on the training arguments, refer to the Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments).

In [None]:
import os
import pickle
from transformers import WhisperForConditionalGeneration

# Keep a pickled copy of the pretrained model next to the notebook so that
# repeated runs reuse it instead of calling from_pretrained() again.
raw_model_fpath = "./whisper-small-raw.pkl"
if os.path.isfile(raw_model_fpath):
    # NOTE: unpickling executes code embedded in the file; only load a cache
    # file that was written by this notebook.
    with open(raw_model_fpath, "rb") as f:
        model = pickle.load(f)
else:
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    with open(raw_model_fpath, "wb") as f:
        pickle.dump(model, f)


# Override generation arguments: clear the list of suppressed tokens and
# the forced decoder ids stored on the model config
model.config.suppress_tokens = []
model.config.forced_decoder_ids = None

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import concatenate_datasets

def get_configuration(model, model_version="base", lr=1e-5, weight_decay=0):
    """Select model name, training data and hyperparameters for a variant.

    Args:
        model: Model whose ``config.attention_dropout`` is set for the
            variants that use attention dropout. It is modified in place
            and returned.
        model_version: One of "base", "extended-dataset",
            "attention-dropout", "lr-decay", "lr-decay-dropout" or
            "extended-dataset-regularised".
        lr: Learning rate returned unless the variant overrides it.
        weight_decay: Weight decay returned unless the variant overrides it.

    Returns:
        Tuple ``(model_name, train_data, model, lr, weight_decay)``.

    Raises:
        ValueError: If ``model_version`` is not a known variant. (Previously
            this surfaced as an UnboundLocalError on ``model_name``.)
    """
    variants = {
        "base": {
            "model_name": "STT-swedish-base-model",
        },
        # "extended-dataset" took 16h to train on an rtx2070s with 8000 steps
        # and eval every 1000 steps
        "extended-dataset": {
            "model_name": "STT-swedish-extended-dataset-model",
            "use_nst": True,
        },
        "attention-dropout": {
            "model_name": "STT-swedish-attention-dropout-model",
            "attention_dropout": 0.3,
        },
        "lr-decay": {
            "model_name": "STT-swedish-lr-decay-model",
            "lr": 5e-5,
            "weight_decay": 0.001,
        },
        "lr-decay-dropout": {
            "model_name": "STT-swedish-lr-decay-attentiondropout-model",
            "attention_dropout": 0.3,
            "lr": 5e-5,
            "weight_decay": 0.001,
        },
        # "extended-dataset-regularised" is the final model we used
        "extended-dataset-regularised": {
            "model_name": "STT-Swedish-Whisper",
            "use_nst": True,
            "attention_dropout": 0.3,
        },
    }

    # Validate before touching any data so a typo fails fast and clearly.
    if model_version not in variants:
        raise ValueError(
            f"Unknown model_version {model_version!r}; "
            f"expected one of: {', '.join(variants)}"
        )
    settings = variants[model_version]

    # Common Voice is always used; some variants additionally train on NST.
    train_data = common_voice["train"]
    if settings.get("use_nst", False):
        train_data = concatenate_datasets([common_voice["train"], NST])

    if "attention_dropout" in settings:
        # NOTE(review): this only updates the config of an already constructed
        # model; confirm that the existing attention layers pick the value up.
        model.config.attention_dropout = settings["attention_dropout"]

    lr = settings.get("lr", lr)
    weight_decay = settings.get("weight_decay", weight_decay)
    return settings["model_name"], train_data, model, lr, weight_decay


# Check get_configuration() for the different possible model variants
model_variant = "extended-dataset-regularised"
model_name, train_data, model, lr, weight_decay = get_configuration(model, model_variant)

# Set checkpoint_path to a checkpoint directory to resume training from it;
# None starts a fresh run. Example values are kept in the comments below.
#checkpoint_path = F"/content/gdrive/My Drive/SML/lab2/model/checkpoint-1500"
checkpoint_path = None # f"{model_name}/checkpoint-18000"
# Output directories: on the mounted Google Drive and next to the notebook.
# Only the local one is passed to the training arguments below.
output_dir_remote = F"/content/gdrive/My Drive/SML/lab2/{model_name}"
output_dir_local = f"./{model_name}"


# Training configuration. Checkpoints and logs are written to the local
# output directory.
training_args = Seq2SeqTrainingArguments(
    # NOTE: max_steps is positive, so it takes precedence over
    # num_train_epochs according to the TrainingArguments documentation.
    num_train_epochs=1,
    output_dir=output_dir_local,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # increase by 2x for every 2x decrease in batch size
    learning_rate=lr,
    weight_decay=weight_decay,
    warmup_steps=500,
    max_steps=18000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=2000,               # Steps until model checkpoint is created
    eval_steps=2000,               # Evaluate at the same interval as checkpoints are saved
    logging_steps=25,
    report_to=["tensorboard"],
    # Keep the checkpoint with the lowest WER as the final model
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
    # This argument expects a checkpoint path (or None), not a boolean flag.
    resume_from_checkpoint=checkpoint_path,
)

# Build the trainer: trains on the data selected by get_configuration() and
# evaluates on the Common Voice test split with the WER metric.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_data,
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor (not the text tokenizer) is passed here
    tokenizer=processor.feature_extractor,
)

# Save the processor files into the training output directory
processor.save_pretrained(training_args.output_dir)

## Train and push to 🤗

In [None]:
# Resume from `checkpoint_path` when one is set; passing None (the default
# of this argument) starts a regular training run.
trainer.train(resume_from_checkpoint=checkpoint_path)

In [None]:
# Dataset description for the model card: variants whose name contains
# "extended-dataset" were trained on NST in addition to Common Voice.
dataset_name = "Common Voice 11.0" + (" and NST" if "extended-dataset" in model_variant else "")

# Model card metadata passed to push_to_hub().
# Two further arguments are left out because they caused errors:
#   "dataset_tags": "mozilla-foundation/common_voice_11_0"
#   "dataset_args": "config: sv, split: test"
kwargs = dict(
    dataset=dataset_name,
    language="sv",
    model_name="Whisper Small - Swedish",  # a 'pretty' name for our model
    finetuned_from="openai/whisper-small",
    tasks="automatic-speech-recognition",
    tags="hf-asr-leaderboard",
)

trainer.push_to_hub(**kwargs)

If you are using Colab, keep the session busy and avoid a timeout by running this code in the browser's developer console:

```javascript
// Log a message and click the "#connect" element inside the shadow DOM of
// the Colab connect button in the top toolbar.
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}
// Repeat the click every 60000 ms (once per minute)
setInterval(ConnectButton, 60000);
```