# Master Thesis: Third Version of the STT Model, Using the New State of the Art (Whisper)

**Author**: Karin Thommen

**Date**: June 2023


---

**Content of the Notebook**: Model version based on the OpenAI Whisper ASR model (fine-tuning setup)

---

**References**:
- https://huggingface.co/blog/fine-tune-whisper
- https://github.com/vasistalodagala/whisper-finetune

## Step 1: Import and Setup

In [None]:
%%capture
!pip install datasets
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install audio-metadata
!pip install "dill<0.3.5"
!pip install git-lfs

In [None]:
import pandas as pd
import os
import transformers

from datasets.fingerprint import Hasher
import pickle
import dill

from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML
import re
import json

import IPython.display as ipd
import numpy as np
import random

import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import audio_metadata

from datasets import load_dataset, Audio, load_metric, load_from_disk, DatasetDict, list_datasets
from datasets import Dataset, Sequence

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

from transformers import WhisperTokenizer
from transformers import WhisperTokenizerFast
from transformers import WhisperProcessor
from transformers import WhisperFeatureExtractor
from huggingface_hub import notebook_login

from google.colab import drive

In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

## Step 2: Load Data

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
# `drive` is imported from google.colab, so this cell only works inside Colab.
drive.mount('/content/drive')

In [None]:
# Log in to the Hugging Face Hub through the interactive notebook widget.
# Presumably needed to read the datasets below and to push the model - TODO confirm they are private.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Print everything list_datasets() returns (a very long list of dataset names).
print(list_datasets())

['acronym_identification', 'ade_corpus_v2', 'adversarial_qa', 'aeslc', 'afrikaans_ner_corpus', 'ag_news', 'ai2_arc', 'air_dialogue', 'ajgt_twitter_ar', 'allegro_reviews', 'allocine', 'alt', 'amazon_polarity', 'amazon_reviews_multi', 'amazon_us_reviews', 'ambig_qa', 'americas_nli', 'ami', 'amttl', 'anli', 'app_reviews', 'aqua_rat', 'aquamuse', 'ar_cov19', 'ar_res_reviews', 'ar_sarcasm', 'arabic_billion_words', 'arabic_pos_dialect', 'arabic_speech_corpus', 'arcd', 'arsentd_lev', 'art', 'arxiv_dataset', 'ascent_kb', 'aslg_pc12', 'asnq', 'asset', 'assin', 'assin2', 'atomic', 'autshumato', 'facebook/babi_qa', 'banking77', 'bbaw_egyptian', 'bbc_hindi_nli', 'bc2gm_corpus', 'beans', 'best2009', 'bianet', 'bible_para', 'big_patent', 'billsum', 'bing_coronavirus_query_set', 'biomrc', 'biosses', 'blbooks', 'blbooksgenre', 'blended_skill_talk', 'blimp', 'blog_authorship_corpus', 'bn_hate_speech', 'bnl_newspapers', 'bookcorpus', 'bookcorpusopen', 'boolq', 'bprec', 'break_data', 'brwac', 'bsd_ja_en'

In [None]:
# Load all splits of the "karinthommen/sds200" dataset from the Hugging Face Hub
# (the data was uploaded there from a local machine beforehand).
dataset = load_dataset("karinthommen/sds200")
# Alternative kept for reference: load each split on its own.
# train = load_dataset("karinthommen/sds200", split="train")
# test = load_dataset("karinthommen/sds200", split="test")
# valid = load_dataset("karinthommen/sds200", split="valid")

Downloading readme:   0%|          | 0.00/620 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/karinthommen___parquet/karinthommen--sds200-a1893d366d27240a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/401M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/452M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/358M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/428M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/369M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/416M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/427M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/449M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/109M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/135271 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3638 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3636 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/karinthommen___parquet/karinthommen--sds200-a1893d366d27240a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Sanity check: display the first training example to confirm the data loaded.
dataset["train"][0]

{'audio': {'path': '09966c7743291ccf1129c8136143bf5a6132947fe352795bc6d5456a3afeb4de.mp3',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          1.58690691e-05, -6.36559753e-06, -1.80013558e-05]),
  'sampling_rate': 32000},
 'transcription': 'Dadurch wird auch der Lebensraum von vielen Tier- und Pflanzenarten zerstört.',
 'canton': None,
 'duration': 6.732}

In [None]:
# Show (rows, columns) for every split.
dataset.shape

{'train': (135271, 4), 'validation': (3638, 4), 'test': (3636, 4)}

In [None]:
# Longest training clip to keep, in the unit of the "duration" column
# (presumably seconds - TODO confirm).
MAX_TRAIN_DURATION = 6

# Keep only training examples whose duration is at most MAX_TRAIN_DURATION.
# input_columns hands the predicate just the "duration" value instead of the
# whole row, so the audio column does not have to be read for every example.
dataset["train"] = dataset["train"].filter(
    lambda duration: duration <= MAX_TRAIN_DURATION,
    input_columns=["duration"],
)

Filter:   0%|          | 0/135271 [00:00<?, ? examples/s]

In [None]:
# Show (rows, columns) per split again to see how many training rows remain.
dataset.shape

{'train': (113094, 4), 'validation': (3638, 4), 'test': (3636, 4)}

In [None]:
# Drop the "canton" and "duration" columns from every split; they are not needed at the moment.
# NOTE(review): presumably this cell fails when re-run, because the columns are already gone - confirm.
dataset = dataset.remove_columns(["canton", "duration"])

### Preprocessing


In [None]:
# Load the Whisper tokenizer for German transcription.
# task="transcribe" makes the tokenizer include the <|transcribe|> task token in
# the label prefix; without it the prefix is only
# <|startoftranscript|><|de|><|notimestamps|>.
# Features that were prepared with the old prefix have to be regenerated.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="german", task="transcribe")

Downloading (…)okenizer_config.json:   0%|          | 0.00/842 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

In [None]:
# Load the feature extractor that belongs to the "openai/whisper-small" checkpoint;
# prepare_dataset uses it to turn raw audio arrays into model input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

## Step 3: Prepare Dataset and convert it into the correct Format

In [None]:
# Re-declare the audio column with a sampling rate of 16 kHz, the rate used for the model.
# The raw example printed earlier reports 32000; after this cast the checks below report 16000.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Check if audio loading worked with a random audio and sentence
rand_int = random.randint(0, len(dataset["train"])-1)
# Fetch the row once instead of materialising the whole "transcription" column
# just to read a single value.
sample = dataset["train"][rand_int]
print(sample["transcription"])
# Play back at the rate stored with the sample rather than a hard-coded value.
ipd.Audio(data=sample["audio"]["array"], autoplay=True, rate=sample["audio"]["sampling_rate"])

Die Schweizer Touristen seien aber nicht zu Schaden gekommen.


In [None]:
# Check sentence, input array shape and sampling rate
rand_int = random.randint(0, len(dataset["train"])-1)
# Fetch the example once; indexing the dataset three times would read the same row three times.
sample = dataset["train"][rand_int]

print("Target text:", sample["transcription"])
print("Input array shape:", sample["audio"]["array"].shape)
print("Sampling rate:", sample["audio"]["sampling_rate"])

Target text: Über die Höhe kann man diskutieren.
Input array shape: (55872,)
Sampling rate: 16000


In [None]:
# Tokenizer round trip on the first training transcription: encode it to label ids,
# then decode once with and once without special tokens to show the prefix/suffix
# tokens the tokenizer adds and to confirm the plain text is recovered unchanged.
input_str = dataset["train"][0]["transcription"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")

Input:                 Karten dieser Bezirke gab es bisher aber nicht.
Decoded w/ special:    <|startoftranscript|><|de|><|notimestamps|>Karten dieser Bezirke gab es bisher aber nicht.<|endoftext|>
Decoded w/out special: Karten dieser Bezirke gab es bisher aber nicht.
Are equal:             True


In [None]:
# Load the Whisper processor (feature extractor + tokenizer in one object).
# The language is set to German so the processor's tokenizer agrees with the
# standalone tokenizer used for the labels; this processor is what gets saved
# next to the fine-tuned model.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="german", task="transcribe")

In [None]:
# Show the structure of one training example (first row of the train split).
dataset["train"][0]

{'audio': {'path': '09d45d91d4a03720071316419bbf578c677bd4f72722ed8fa14613c244430e6c.mp3',
  'array': array([ 5.55111512e-16,  5.55111512e-17,  1.11022302e-16, ...,
          4.44828174e-09,  2.86900104e-09, -4.82032192e-10]),
  'sampling_rate': 16000},
 'transcription': 'Karten dieser Bezirke gab es bisher aber nicht.',
 'sentence': 'karten dieser bezirke gab es bisher aber nicht'}

In [None]:
# Report how many CPU cores this runtime exposes.
# Note: the dataset.map call further down passes num_proc=2 rather than this value.
import multiprocessing
cores = multiprocessing.cpu_count() # number of CPU cores reported by the OS
cores

4

### Step 3a: Prepare Dataset (skip & jump to step 3b if you want to load the dataset directly from disk)

In [None]:
def prepare_dataset(batch):
    """Add model inputs and target ids to a single example.

    Reads the example's "audio" entry (array plus sampling rate) and its
    "transcription" text, then stores two new keys on the same mapping:

    - "input_features": first item of the feature extractor's output for the audio
    - "labels": token ids of the transcription produced by the tokenizer

    Relies on the notebook-level `feature_extractor` and `tokenizer` objects.
    Returns the same (modified) `batch` object.
    """
    clip = batch["audio"]
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])

    # The extractor returns a batch of features; exactly one clip was passed in.
    batch["input_features"] = extracted.input_features[0]

    # Target side: token ids for the reference text.
    encoded = tokenizer(batch["transcription"])
    batch["labels"] = encoded.input_ids
    return batch

In [None]:
# Display the metadata (features, split sizes, ...) recorded for the train split.
dataset["train"].info

DatasetInfo(description='', citation='', homepage='', license='', features={'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'transcription': Value(dtype='string', id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name=None, config_name=None, version=None, splits={'train': SplitInfo(name='train', num_bytes=3879736306, num_examples=135271, shard_lengths=[25031, 20060, 25030, 20060, 25030, 20060], dataset_name='parquet'), 'validation': SplitInfo(name='validation', num_bytes=113914361, num_examples=3638, shard_lengths=None, dataset_name='parquet'), 'test': SplitInfo(name='test', num_bytes=117785929, num_examples=3636, shard_lengths=None, dataset_name='parquet')}, download_checksums={'https://huggingface.co/datasets/karinthommen/sds200/resolve/97debe19b7c5a2a877c5a8c614e9b7047fcb1761/data/train-00000-of-00009-35225add198aac32.parquet': {'num_bytes': 401105409, 'checksum': None}, 'https://huggi

In [None]:
# Run prepare_dataset over every example of every split using two worker processes.
dataset = dataset.map(prepare_dataset, num_proc=2)
# The trainer cell reads `dataset_prep`, which is otherwise only defined by the
# "load from disk/Hub" alternative (Step 3b). Alias it here so the notebook also
# works when Step 3a is run and Step 3b is skipped.
dataset_prep = dataset

In [None]:
# Display the splits with their feature names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'input_features', 'labels'],
        num_rows: 113094
    })
    test: Dataset({
        features: ['audio', 'transcription', 'input_features', 'labels'],
        num_rows: 3636
    })
    validation: Dataset({
        features: ['audio', 'transcription', 'input_features', 'labels'],
        num_rows: 3638
    })
})

In [None]:
# Optional (disabled): upload the prepared dataset to the Hub as a private repository.
# NOTE(review): `prep_dataset` is not defined in the visible cells - the prepared data
# lives in `dataset`; adjust the name before enabling this line.
#prep_dataset.push_to_hub("karinthommen/sds200-features-no-vocab", private=True)

### Step 3b: Load Dataset (skip if new dataset was loaded in Step 3a)

In [None]:
# Load the already prepared dataset from the Hub instead of recomputing it in Step 3a.
# NOTE(review): cache_dir is the relative path "content/cache", so it resolves against the
# current working directory - confirm that "/content/cache" was not intended.
dataset_prep = load_dataset("karinthommen/sds200-features-no-vocab", cache_dir="content/cache")

## Fine-Tune & Train Model

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared speech examples into one padded batch.

    Each incoming feature dict must provide "input_features" (audio features)
    and "labels" (token ids). Audio features and labels are padded separately
    because they have different lengths and need different padding methods.

    Attributes:
        processor: Object exposing `feature_extractor.pad(...)` and
            `tokenizer.pad(...)`.
        decoder_start_token_id: Id of the token that is stripped from the front
            of the labels when every sequence in the batch starts with it.
            Optional; when omitted it is looked up from the tokenizer (see
            `_leading_token_id`).
    """

    processor: Any
    # Defaulted so existing calls that only pass `processor` keep working.
    decoder_start_token_id: Optional[int] = None

    def _leading_token_id(self) -> Optional[int]:
        """Return the id of the start token that should be cut from the labels.

        Preference order: the explicitly configured `decoder_start_token_id`,
        then the tokenizer's id for "<|startoftranscript|>" (the token the
        Whisper tokenizer actually puts first), then `bos_token_id` as the
        original fallback.
        """
        if self.decoder_start_token_id is not None:
            return self.decoder_start_token_id

        tokenizer = self.processor.tokenizer
        convert = getattr(tokenizer, "convert_tokens_to_ids", None)
        if convert is not None:
            candidate = convert("<|startoftranscript|>")
            # Unknown tokens are mapped to the unk id; ignore that case.
            if candidate is not None and candidate != getattr(tokenizer, "unk_token_id", None):
                return candidate
        return tokenizer.bos_token_id

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch with padded "input_features" and loss-masked "labels".

        Args:
            features: One dict per example with "input_features" and "labels".

        Returns:
            The padded audio batch returned by the feature extractor, with an
            additional "labels" tensor in which padding positions are -100.
        """
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If every label sequence starts with the decoder start token, cut it here:
        # it is prepended again later when the decoder inputs are built from the labels.
        # Comparing against bos_token_id alone misses this case whenever the tokenizer's
        # bos token differs from the token it places first in the sequence.
        start_id = self._leading_token_id()
        if start_id is not None and (labels[:, 0] == start_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [None]:
# Instantiate the collator with the Whisper processor; it is handed to the trainer below.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
import evaluate

# Load the "wer" metric; compute_metrics calls metric.compute(...) with decoded strings.
metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
def compute_metrics(pred):
    """Compute the word error rate (in percent) for a set of predictions.

    Args:
        pred: Object with `predictions` (predicted token ids) and `label_ids`
            (reference token ids in which ignored positions are -100).

    Returns:
        A dict ``{"wer": value}`` where value is 100 times the result of the
        notebook-level `metric`.

    Relies on the notebook-level `tokenizer` and `metric` objects.
    """
    pred_ids = pred.predictions

    # Replace -100 with the pad token id so the labels can be decoded.
    # np.where builds a new array, leaving the caller's `pred.label_ids` untouched
    # (the previous in-place assignment silently modified it).
    label_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained "openai/whisper-small" checkpoint and move it to the device
# selected at the top of the notebook ("cuda:0" when available, otherwise "cpu")
# instead of a hard-coded "cuda", which fails on machines without a GPU.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

In [None]:
# Clear the forced decoder ids and the list of suppressed tokens on the model config.
# Presumably done so generation is not constrained during fine-tuning, as in the
# referenced Hugging Face guide - TODO confirm.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration: 4000 optimisation steps with evaluation and checkpointing
# every 1000 steps; the checkpoint with the lowest "wer" is reloaded at the end.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-V2.1",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,  # needed so compute_metrics receives generated token ids
    generation_max_length=225,
    save_steps=1000,  # same interval as eval_steps
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",  # key returned by compute_metrics
    greater_is_better=False,  # lower "wer" counts as better
    push_to_hub=True,  # presumably relies on the Hub login done earlier - TODO confirm
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire model, data, collator and metric function into the trainer.
# Train/validation data come from `dataset_prep`, the prepared dataset.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_prep["train"],
    eval_dataset=dataset_prep["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor (not the text tokenizer) is passed in the tokenizer slot;
    # presumably so it is saved together with the checkpoints - TODO confirm.
    tokenizer=processor.feature_extractor,
)

In [None]:
# Save the processor files into the training output directory ("./whisper-V2.1").
processor.save_pretrained(training_args.output_dir)