## Imports

In [1]:
import pandas as pd
import os
import numpy as np
from datasets import Dataset
import pandas as pd
from datasets import Audio
import gc

## Login

In [2]:
from huggingface_hub import notebook_login

# Open the interactive Hugging Face login widget for this notebook session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load dataset

In [3]:
# Load the merged metadata CSV. Later cells read its 'region',
# 'wav_filename' and 'transcript' columns; the Flemish/Dutch separation and
# the 50% train/eval split are done in the cells that follow, not here.
df = pd.read_csv('./metadata/cgn_cd_result_merge_meta.csv')


In [4]:
# Partition the metadata on whether 'region' is populated:
# rows with a region value are treated as Flemish, rows without one as Dutch.
flemish = df.loc[df['region'].notna()]
dutch = df.loc[df['region'].isna()]


In [6]:
# Half of each group (rounded down) is used for training.
flemish_train_size = len(flemish) // 2
dutch_train_size = len(dutch) // 2

# The training rows are the leading half of each group, in file order.
flemish_train = flemish.iloc[:flemish_train_size]
dutch_train = dutch.iloc[:dutch_train_size]

# Stack the two halves: Flemish rows first, then Dutch rows.
train_data_vlnl = pd.concat([flemish_train, dutch_train])

In [7]:
# Everything after the training slice of each group is held out for evaluation.
flemish_eval = flemish.iloc[flemish_train_size:]
dutch_eval = dutch.iloc[dutch_train_size:]

# Same ordering as the training frame: Flemish rows first, then Dutch rows.
eval_data = pd.concat([flemish_eval, dutch_eval])

In [8]:
# Keep only the audio file path and its transcript; every other metadata
# column is dropped from both splits.
# NOTE: this cell overwrites its own inputs, so it cannot be re-run after the
# columns have been renamed in the next cell.
train_data_vlnl =  train_data_vlnl[["wav_filename", "transcript"]]
eval_data = eval_data[["wav_filename", "transcript"]]

In [9]:
# Rename the two remaining columns to the names used by the rest of the
# notebook: "audio" (file path) and "sentence" (transcript).
train_data_vlnl.columns = ["audio", "sentence"]
eval_data.columns = ["audio", "sentence"]

In [10]:

# Convert the pandas DataFrames into Hugging Face Datasets.
# The frames were built by filtering and concatenating, so their index is no
# longer a plain range; preserve_index=False stops that index from being
# carried into the dataset as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_data_vlnl, preserve_index=False)
test_dataset = Dataset.from_pandas(eval_data, preserve_index=False)


In [11]:
# Cast the "audio" column (file paths) to the Audio feature with a 16 kHz
# sampling rate, for both splits. Per the datasets Audio feature, the files
# are expected to be decoded and resampled when rows are accessed — confirm
# against the installed datasets version.
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))

## Prepare Feature Extractor, Tokenizer and Data


In [12]:
from transformers import WhisperFeatureExtractor

# Feature extractor of the openai/whisper-small checkpoint; prepare_dataset
# uses it to turn raw audio arrays into model input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")


In [13]:
from transformers import WhisperTokenizer

# Tokenizer of the same checkpoint, configured for Dutch transcription; used
# to encode target sentences into label ids and to decode predictions.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Dutch", task="transcribe")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Combine to create a WhisperProcessor

In [14]:
from transformers import WhisperProcessor

# Processor for the same checkpoint and language/task settings. The data
# collator and trainer access its .feature_extractor and .tokenizer members.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Dutch", task="transcribe")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Prepare data

In [15]:
def prepare_dataset(examples):
    """Turn one raw dataset row into model-ready inputs and labels.

    Args:
        examples: A single row with an "audio" entry (providing "array" and
            "sampling_rate") and a "sentence" string.

    Returns:
        The same row object, with "audio" and "sentence" removed and
        "input_features" and "labels" added.
    """
    audio = examples["audio"]

    # Pass the sampling rate carried by the decoded audio instead of a
    # hard-coded 16000: the feature extractor is then told the true rate and
    # can complain about a mismatch rather than silently computing features
    # from audio at the wrong rate.
    examples["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    del examples["audio"]

    # Encode the target transcript into label ids.
    sentences = examples["sentence"]
    examples["labels"] = tokenizer(sentences).input_ids
    del examples["sentence"]
    return examples

In [16]:
# Apply prepare_dataset to every training row in a single process.
# NOTE: prepare_dataset deletes the "audio" and "sentence" entries, so this
# cell cannot be re-run on the already-mapped dataset.
train_dataset = train_dataset.map(prepare_dataset, num_proc=1)

Map:   0%|          | 0/8616 [00:00<?, ? examples/s]

Map:   0%|          | 0/8617 [00:00<?, ? examples/s]

In [None]:
# Apply the same preprocessing to every evaluation row in a single process.
# NOTE: prepare_dataset deletes the "audio" and "sentence" entries, so this
# cell cannot be re-run on the already-mapped dataset.
test_dataset = test_dataset.map(prepare_dataset, num_proc=1)

## Load a pretrained checkpoint

In [17]:
from transformers import WhisperForConditionalGeneration

# Start from the pretrained openai/whisper-small weights.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")


In [18]:
# Pin generation to Dutch transcription through the generation config.
model.generation_config.language = "dutch"
model.generation_config.task = "transcribe"
# Clear forced_decoder_ids on the generation config; presumably so that the
# language/task fields above are what steer decoding — TODO confirm.
model.generation_config.forced_decoder_ids = None

## Define a Data Collator


In [19]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded training batch.

    Audio features and label ids are padded independently, because they have
    different lengths and go through different padding routines.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Token id that, when it leads every label row, is stripped from the labels.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from `features`.

        Args:
            features: Examples, each holding "input_features" and "labels".

        Returns:
            The padded audio batch with an added "labels" tensor in which
            padded positions are set to -100.
        """
        # Audio side: wrap each example and let the feature extractor return tensors.
        audio_items = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad the token id sequences to the longest one in the batch.
        label_items = [{"input_ids": example["labels"]} for example in features]
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; -100 excludes them from the loss.
        token_ids = padded_labels["input_ids"]
        padding_positions = padded_labels.attention_mask.ne(1)
        labels = token_ids.masked_fill(padding_positions, -100)

        # When every row already begins with the decoder start token, drop that
        # column, since the token gets added again later on.
        every_row_has_start = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if every_row_has_start:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


In [20]:
# Instantiate the collator with the shared processor and the model's decoder
# start token id, which the collator strips from the labels when it leads
# every row of a batch.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)


## Evaluation metric

In [21]:
import evaluate

# Load the "wer" metric; compute_metrics scales its result by 100.
metric = evaluate.load("wer")


In [22]:
def compute_metrics(pred):
    """Compute the word error rate, in percent, for one evaluation pass.

    Args:
        pred: Object exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids, with -100 at ignored positions).

    Returns:
        A dict with a single key, "wer", holding 100 times the metric value.
    """
    pred_ids = pred.predictions

    # Work on a copy: the original wrote the pad id straight into
    # pred.label_ids, which silently altered the caller's array.
    label_ids = np.array(pred.label_ids)

    # -100 cannot be decoded, so swap it for the pad token id, which is then
    # dropped by skip_special_tokens below.
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


## Define the training arguments

In [23]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the fine-tuning run below.
training_args = Seq2SeqTrainingArguments(
    output_dir="./JensCoet/whisper-small-nl",  # local output directory; change to a name of your choice
    # One example per forward pass, with gradients accumulated over 16 passes.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-5,
    warmup_steps=1,
    max_steps=2000,  # total number of training steps for this run
    gradient_checkpointing=True,
    fp16=True,
    # Evaluate on a step schedule (see eval_steps), generating transcripts so
    # that WER can be computed.
    evaluation_strategy="steps",
    # NOTE: with batch size 1 every evaluation pass iterates over the whole
    # eval set one example at a time; the logged eval_runtime values show
    # each pass taking hours.
    per_device_eval_batch_size=1,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=500,   # checkpoint interval; keep below max_steps
    eval_steps=500,  # evaluation interval; keep below max_steps
    logging_steps=25,
    report_to=["tensorboard"],
    # Reload the checkpoint with the lowest WER once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)


In [24]:
from transformers import Seq2SeqTrainer

# Build the trainer from the model, the preprocessed train/eval datasets,
# the padding collator and the WER metric function.
# NOTE(review): the `tokenizer` argument is given the feature extractor, not
# the text tokenizer — presumably so that it is saved alongside checkpoints;
# confirm this is intended.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


The following training run took about 1080 minutes (roughly 18 hours).

In [25]:
# Run fine-tuning. Long-running: the logged train_runtime below is about
# 64,000 seconds, much of it spent in the periodic evaluation passes.
trainer.train()

  0%|          | 0/2000 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


{'loss': 1.3742, 'learning_rate': 9.884942471235619e-06, 'epoch': 0.05}
{'loss': 0.9085, 'learning_rate': 9.759879939969985e-06, 'epoch': 0.09}
{'loss': 0.8672, 'learning_rate': 9.634817408704354e-06, 'epoch': 0.14}
{'loss': 0.8171, 'learning_rate': 9.50975487743872e-06, 'epoch': 0.19}
{'loss': 0.7356, 'learning_rate': 9.384692346173087e-06, 'epoch': 0.23}
{'loss': 0.7586, 'learning_rate': 9.259629814907455e-06, 'epoch': 0.28}
{'loss': 0.6844, 'learning_rate': 9.134567283641822e-06, 'epoch': 0.32}
{'loss': 0.6553, 'learning_rate': 9.009504752376189e-06, 'epoch': 0.37}
{'loss': 0.661, 'learning_rate': 8.884442221110557e-06, 'epoch': 0.42}
{'loss': 0.6048, 'learning_rate': 8.759379689844924e-06, 'epoch': 0.46}
{'loss': 0.5899, 'learning_rate': 8.63431715857929e-06, 'epoch': 0.51}
{'loss': 0.6135, 'learning_rate': 8.509254627313657e-06, 'epoch': 0.56}
{'loss': 0.5615, 'learning_rate': 8.384192096048025e-06, 'epoch': 0.6}
{'loss': 0.5084, 'learning_rate': 8.259129564782392e-06, 'epoch': 0.

  0%|          | 0/8617 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.9492278099060059, 'eval_wer': 117.07920639708462, 'eval_runtime': 8708.6343, 'eval_samples_per_second': 0.989, 'eval_steps_per_second': 0.989, 'epoch': 0.93}




{'loss': 0.4124, 'learning_rate': 7.383691845922962e-06, 'epoch': 0.97}
{'loss': 0.3163, 'learning_rate': 7.258629314657329e-06, 'epoch': 1.02}
{'loss': 0.231, 'learning_rate': 7.133566783391697e-06, 'epoch': 1.07}
{'loss': 0.2374, 'learning_rate': 7.0085042521260636e-06, 'epoch': 1.11}
{'loss': 0.2286, 'learning_rate': 6.883441720860431e-06, 'epoch': 1.16}
{'loss': 0.2262, 'learning_rate': 6.758379189594798e-06, 'epoch': 1.21}
{'loss': 0.2023, 'learning_rate': 6.633316658329165e-06, 'epoch': 1.25}
{'loss': 0.2094, 'learning_rate': 6.508254127063533e-06, 'epoch': 1.3}
{'loss': 0.1931, 'learning_rate': 6.383191595797899e-06, 'epoch': 1.35}
{'loss': 0.2039, 'learning_rate': 6.258129064532267e-06, 'epoch': 1.39}
{'loss': 0.1775, 'learning_rate': 6.1330665332666335e-06, 'epoch': 1.44}
{'loss': 0.1748, 'learning_rate': 6.008004002001001e-06, 'epoch': 1.49}
{'loss': 0.1834, 'learning_rate': 5.8829414707353685e-06, 'epoch': 1.53}
{'loss': 0.1741, 'learning_rate': 5.757878939469735e-06, 'epoch

  0%|          | 0/8617 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 1.028854489326477, 'eval_wer': 104.31643161853874, 'eval_runtime': 11731.5389, 'eval_samples_per_second': 0.735, 'eval_steps_per_second': 0.735, 'epoch': 1.86}




{'loss': 0.1348, 'learning_rate': 4.882441220610306e-06, 'epoch': 1.9}
{'loss': 0.1334, 'learning_rate': 4.757378689344673e-06, 'epoch': 1.95}
{'loss': 0.1095, 'learning_rate': 4.63231615807904e-06, 'epoch': 2.0}
{'loss': 0.0725, 'learning_rate': 4.507253626813407e-06, 'epoch': 2.04}
{'loss': 0.0606, 'learning_rate': 4.382191095547774e-06, 'epoch': 2.09}
{'loss': 0.0529, 'learning_rate': 4.257128564282142e-06, 'epoch': 2.14}
{'loss': 0.0561, 'learning_rate': 4.132066033016508e-06, 'epoch': 2.18}
{'loss': 0.0554, 'learning_rate': 4.007003501750876e-06, 'epoch': 2.23}
{'loss': 0.055, 'learning_rate': 3.8819409704852425e-06, 'epoch': 2.27}
{'loss': 0.0549, 'learning_rate': 3.75687843921961e-06, 'epoch': 2.32}
{'loss': 0.0513, 'learning_rate': 3.631815907953977e-06, 'epoch': 2.37}
{'loss': 0.0527, 'learning_rate': 3.506753376688344e-06, 'epoch': 2.41}
{'loss': 0.0488, 'learning_rate': 3.3816908454227117e-06, 'epoch': 2.46}
{'loss': 0.0493, 'learning_rate': 3.2566283141570788e-06, 'epoch': 

  0%|          | 0/8617 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 1.113805890083313, 'eval_wer': 103.94708493127074, 'eval_runtime': 16602.2241, 'eval_samples_per_second': 0.519, 'eval_steps_per_second': 0.519, 'epoch': 2.79}




{'loss': 0.0404, 'learning_rate': 2.381190595297649e-06, 'epoch': 2.83}
{'loss': 0.0316, 'learning_rate': 2.256128064032016e-06, 'epoch': 2.88}
{'loss': 0.0346, 'learning_rate': 2.1310655327663833e-06, 'epoch': 2.92}
{'loss': 0.0315, 'learning_rate': 2.0060030015007508e-06, 'epoch': 2.97}
{'loss': 0.0266, 'learning_rate': 1.8809404702351178e-06, 'epoch': 3.02}
{'loss': 0.0201, 'learning_rate': 1.755877938969485e-06, 'epoch': 3.06}
{'loss': 0.015, 'learning_rate': 1.6308154077038522e-06, 'epoch': 3.11}
{'loss': 0.0161, 'learning_rate': 1.5057528764382193e-06, 'epoch': 3.16}
{'loss': 0.0182, 'learning_rate': 1.3806903451725863e-06, 'epoch': 3.2}
{'loss': 0.0162, 'learning_rate': 1.2556278139069536e-06, 'epoch': 3.25}
{'loss': 0.0165, 'learning_rate': 1.1305652826413207e-06, 'epoch': 3.3}
{'loss': 0.016, 'learning_rate': 1.005502751375688e-06, 'epoch': 3.34}
{'loss': 0.0145, 'learning_rate': 8.804402201100551e-07, 'epoch': 3.39}
{'loss': 0.0148, 'learning_rate': 7.553776888444222e-07, 'ep

  0%|          | 0/8617 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 1.1677577495574951, 'eval_wer': 103.5346477971548, 'eval_runtime': 8318.4558, 'eval_samples_per_second': 1.036, 'eval_steps_per_second': 1.036, 'epoch': 3.71}


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


{'train_runtime': 64288.8047, 'train_samples_per_second': 0.498, 'train_steps_per_second': 0.031, 'train_loss': 0.23456191690266132, 'epoch': 3.71}


TrainOutput(global_step=2000, training_loss=0.23456191690266132, metrics={'train_runtime': 64288.8047, 'train_samples_per_second': 0.498, 'train_steps_per_second': 0.031, 'train_loss': 0.23456191690266132, 'epoch': 3.71})

Before fine-tuning:
WER: 171%, i.e. a word accuracy of -71%
After fine-tuning:
WER: 103%, i.e. a word accuracy of -3%