In [1]:
## Imports
import pandas as pd
import os
import numpy as np
from datasets import Dataset
from datasets import Audio
import gc
import torch
import torch.nn 
import accelerate
import librosa
import bitsandbytes
import peft

In [2]:
## Login
# Authenticate against the Hugging Face Hub from inside the notebook (shows a login widget).
# A later cell calls model.push_to_hub(...), which presumably needs these credentials.
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
## Load dataset
# Read the CGN metadata CSV into a dataframe. Later cells use its 'wav_filename',
# 'region' and 'transcript' columns; the 50/50 train/eval split is done further down.
df = pd.read_csv('./cgn_cd_result_merge_meta.csv')



In [4]:
# Rewrite the Windows-style audio paths stored in the CSV so they point at the
# local copy of the corpus. Vectorised string operations replace the per-row loop,
# which wrote through df.at[idx] with a positional counter and therefore only
# worked for a default 0..n-1 index. Missing values are left untouched.
# regex=False makes both replacements literal (the patterns contain backslashes and dots).
df['wav_filename'] = (
    df['wav_filename']
    .str.replace(
        "S:\\Downloads\\20151207_CGN_2_0_3\\CGN_2.0.3\\data/audio/wav\\",
        "/home/vicuser/whisper/whisper/cgn/",
        regex=False,
    )
    # Turn any remaining backslash separators into forward slashes.
    .str.replace('\\', '/', regex=False)
)
    


In [5]:
# Keep only the rows that carry a region label (rows whose 'region' is NaN are dropped).
# NOTE(review): presumably only the Flemish recordings have a region - confirm against the CSV.
flemish = df[df['region'].notnull()]

# The first half of the filtered rows (file order, no shuffling) is the training
# split, the remaining rows are the evaluation split.
flemish_train_size = int(0.5 * len(flemish))
flemish_train = flemish[:flemish_train_size]
flemish_eval = flemish[flemish_train_size:]

# Keep just the audio path and the transcript, renamed to the column names the
# rest of the notebook works with.
column_names = {"wav_filename": "audio", "transcript": "sentence"}
train_data_vlnl = flemish_train[["wav_filename", "transcript"]].rename(columns=column_names)
eval_data = flemish_eval[["wav_filename", "transcript"]].rename(columns=column_names)

## convert the pandas dataframes to datasets
# preserve_index=False keeps the pandas index out of the dataset, so there is no
# '__index_level_0__' column to remove afterwards (removing it by name fails
# whenever the dataframe index does not produce such a column).
train_dataset = Dataset.from_pandas(train_data_vlnl, preserve_index=False)
test_dataset = Dataset.from_pandas(eval_data, preserve_index=False)

## cast the path column to an Audio feature with a 16 kHz sampling rate
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))


In [6]:
## Prepare Feature Extractor, Tokenizer and Data
from transformers import WhisperFeatureExtractor, WhisperProcessor, WhisperTokenizer

# Feature extractor: used below to compute the log-Mel input features from raw audio.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-large-v3")

# Tokenizer set up for Dutch transcription: used below to encode the target text.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-large-v3", language="Dutch", task="transcribe"
)

## Processor loaded with the same settings; the data collator and the evaluation
## cells access its feature extractor and tokenizer.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-large-v3", language="Dutch", task="transcribe"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
## Prepare data
def prepare_dataset(examples):
    """Convert one raw example into model inputs, in place.

    Reads ``examples["audio"]`` (a mapping with ``"array"`` and
    ``"sampling_rate"``) and ``examples["sentence"]``, and uses the
    module-level ``feature_extractor`` and ``tokenizer``.

    Adds:
        ``input_features``: first element of the extractor's ``input_features``.
        ``labels``: token ids of the sentence.

    The ``audio`` and ``sentence`` entries are deleted from the very same
    mapping that was passed in, and that mapping is returned.
    NOTE(review): the in-place deletion is presumably what makes ``Dataset.map``
    drop those columns - keep it when refactoring.
    """
    audio = examples["audio"]
    # Pass the sampling rate that came with the decoded audio instead of a
    # hard-coded 16000, so a mismatch is visible to the feature extractor.
    examples["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    del examples["audio"]

    # encode target text to label ids
    examples["labels"] = tokenizer(examples["sentence"]).input_ids
    del examples["sentence"]
    return examples
# Run prepare_dataset over every example of both splits (num_proc=1 is passed explicitly).
train_dataset = train_dataset.map(prepare_dataset, num_proc=1)
test_dataset = test_dataset.map(prepare_dataset, num_proc=1)



Map:   0%|          | 0/3271 [00:00<?, ? examples/s]

Map:   0%|          | 0/3272 [00:00<?, ? examples/s]

In [8]:
## Define a Data Collator

import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into a single padded batch.

    ``processor`` must expose ``feature_extractor.pad`` and ``tokenizer.pad``
    as well as ``tokenizer.bos_token_id``.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad audio features and labels separately and return one batch.

        Each item of ``features`` needs an ``"input_features"`` and a
        ``"labels"`` entry. The returned batch is whatever the feature
        extractor's ``pad`` produced, with a ``"labels"`` tensor added.
        """
        # Audio inputs and label ids have different lengths and need different
        # padding, so they are collected into two separate lists.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; they are set to
        # -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every sequence starts with the tokenizer's bos token, strip that
        # first column because the token is added again later.
        tokenizer_bos = self.processor.tokenizer.bos_token_id
        starts_with_bos = (labels[:, 0] == tokenizer_bos).all().cpu().item()
        if starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)


In [9]:
## evaluation metric
import evaluate

# "wer" metric object; the evaluation cells call metric.compute(predictions=..., references=...).
metric = evaluate.load("wer")


In [10]:
## quantize model
from transformers import BitsAndBytesConfig, WhisperForConditionalGeneration

# Load the base checkpoint with 8-bit weights. The quantization settings go through
# `quantization_config`, because passing `load_in_8bit=` directly is deprecated.
model = WhisperForConditionalGeneration.from_pretrained(
    'openai/whisper-large-v3',
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)



The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

In [11]:
from peft import prepare_model_for_kbit_training

# Run PEFT's k-bit training preparation on the quantized model before the LoRA
# adapters are attached in a later cell (see the PEFT docs for what it changes).
model = prepare_model_for_kbit_training(model)

In [12]:
def make_inputs_require_grad(module, input, output):
    """Forward hook: flag the hooked module's output tensor as requiring grad.

    ``module`` and ``input`` are part of the hook signature but are not used.
    The tensor is modified in place and nothing is returned.
    """
    output.requires_grad_()

# Attach the hook to the encoder's first convolution so its output is flagged as
# requiring grad on every forward pass.
# NOTE(review): presumably needed so gradients can reach the adapters while the
# base weights are frozen - confirm against the PEFT int8 training example.
model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)


<torch.utils.hooks.RemovableHandle at 0x7fe75916ac80>

In [13]:
from peft import LoraConfig, LoraModel, PeftModel, get_peft_model

# LoRA settings: rank 32, alpha 64, 5% dropout, applied to the modules named
# "q_proj" and "v_proj".
config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters and report how many parameters are trainable.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 15,728,640 || all params: 1,559,219,200 || trainable%: 1.0087510466777219


In [14]:
## Define the training arguments
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./JensCoet/whisper-large-v3-nl",  # change to a repo name of your choice
    # --- optimisation ---
    num_train_epochs=6,
    learning_rate=1e-5,
    warmup_steps=20,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    fp16=True,
    # max_steps=100,  # only for testing purposes, remove this from your final run :)
    # --- evaluation and logging ---
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    generation_max_length=225,
    logging_steps=50,
    # --- needed because the model is wrapped in a PeftModel ---
    remove_unused_columns=False,  # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
    label_names=["labels"],  # same reason as above
)


In [15]:
from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

# Callback that stores the adapter weights inside every checkpoint folder and
# deletes the base-model weight file if one was written there.
class SavePeftModelCallback(TrainerCallback):
    """Save the PEFT model into each checkpoint directory on save."""

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Write the adapter next to the checkpoint and drop ``pytorch_model.bin``.

        The checkpoint folder is ``<output_dir>/<PREFIX_CHECKPOINT_DIR>-<global_step>``.
        ``kwargs["model"]`` is saved into its ``adapter_model`` sub-folder via
        ``save_pretrained``. Returns ``control`` unchanged.
        """
        checkpoint_name = f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
        checkpoint_folder = os.path.join(args.output_dir, checkpoint_name)

        kwargs["model"].save_pretrained(os.path.join(checkpoint_folder, "adapter_model"))

        # Remove the full-model weights file, if present, so only the adapter remains.
        full_weights_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        if os.path.exists(full_weights_path):
            os.remove(full_weights_path)
        return control


# Assemble the trainer from the LoRA-wrapped model, the prepared datasets and the
# padding collator. The callback is passed as a class, not an instance.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
    callbacks=[SavePeftModelCallback],
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [16]:
trainer.train()




Step,Training Loss,Validation Loss
50,2.2161,2.171844
100,1.9845,1.931059
150,1.6843,1.716064
200,1.5458,1.52145
250,1.3642,1.366033
300,1.1792,1.258479
350,1.0873,1.177055
400,1.0392,1.115163
450,0.9476,1.063822
500,0.9071,1.023322




TrainOutput(global_step=1230, training_loss=0.9633891361515696, metrics={'train_runtime': 35259.4001, 'train_samples_per_second': 0.557, 'train_steps_per_second': 0.035, 'total_flos': 6.739052418957312e+19, 'train_loss': 0.9633891361515696, 'epoch': 6.0})

In [None]:
# Upload `model` (the LoRA-wrapped model from training) to the Hub under this repo id.
# The evaluation cell below loads the adapter back from the same id.
peft_model_id = "JensCoet/whisper-large-v3-nl"
model.push_to_hub(peft_model_id)

In [10]:
## evaluation
from peft import PeftModel, PeftConfig
from transformers import BitsAndBytesConfig, WhisperForConditionalGeneration, Seq2SeqTrainer

peft_model_id = "JensCoet/whisper-large-v3-nl" # Use the same model ID as before.
peft_config = PeftConfig.from_pretrained(peft_model_id)

# Reload the base checkpoint named in the adapter config with 8-bit weights.
# `quantization_config` replaces the deprecated `load_in_8bit=` keyword.
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

# Attach the trained adapter and turn the cache back on (it was disabled for training).
model = PeftModel.from_pretrained(model, peft_model_id)
model.config.use_cache = True

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [17]:
import gc
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# Relies on `test_dataset`, `data_collator`, `processor`, `metric` and `model` from earlier cells.

eval_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)
# Decoder prompt for Dutch transcription. It describes prompt positions, not batch
# items, so it is handed to generate() unchanged for every batch (slicing it by
# batch size would drop prompt tokens for a small final batch).
forced_decoder_ids = processor.get_decoder_prompt_ids(language="Dutch", task="transcribe")
normalizer = BasicTextNormalizer()

predictions = []
references = []
normalized_predictions = []
normalized_references = []

model.eval()
for step, batch in enumerate(tqdm(eval_dataloader)):
    with torch.cuda.amp.autocast(), torch.no_grad():
        input_features = batch["input_features"].to("cuda")
        generated_tokens = (
            model.generate(
                input_features=input_features,
                forced_decoder_ids=forced_decoder_ids,
                max_new_tokens=255,
            )
            .cpu()
            .numpy()
        )
        labels = batch["labels"].cpu().numpy()
        # The collator marked padding with -100; swap it back to the pad token
        # so the labels can be decoded.
        labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
        decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
        predictions.extend(decoded_preds)
        references.extend(decoded_labels)
        normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])
        normalized_references.extend([normalizer(label).strip() for label in decoded_labels])
    # Release the per-batch tensors before the next iteration.
    del generated_tokens, labels, input_features, batch
    gc.collect()

# Word error rate in percent, on the raw and on the normalized text.
wer = 100 * metric.compute(predictions=predictions, references=references)
normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)
eval_metrics = {"eval/wer": wer, "eval/normalized_wer": normalized_wer}

print(f"wer={wer} and normalized_wer={normalized_wer}")
print(eval_metrics)


100%|███████████████████████████████████████████████████████████████████████████████| 409/409 [1:57:11<00:00, 17.19s/it]


wer=52.264307343562336 and normalized_wer=52.72575110887941
{'eval/wer': 52.264307343562336, 'eval/normalized_wer': 52.72575110887941}


In [None]:
## wer=52.264307343562336 and normalized_wer=52.72575110887941
## {'eval/wer': 52.264307343562336, 'eval/normalized_wer': 52.72575110887941}

In [None]:
import gc
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# Same evaluation as the previous evaluation cell. It uses names that exist in
# this notebook: `test_dataset` for the data, and "Dutch" / "transcribe" (the
# settings the processor was loaded with) for the decoder prompt.
eval_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)
forced_decoder_ids = processor.get_decoder_prompt_ids(language="Dutch", task="transcribe")
normalizer = BasicTextNormalizer()

predictions = []
references = []
normalized_predictions = []
normalized_references = []

model.eval()
for step, batch in enumerate(tqdm(eval_dataloader)):
    with torch.cuda.amp.autocast(), torch.no_grad():
        generated_tokens = (
            model.generate(
                input_features=batch["input_features"].to("cuda"),
                # Passed per call; the model's generation_config object is left
                # intact instead of being overwritten with this list.
                forced_decoder_ids=forced_decoder_ids,
                max_new_tokens=255,
            )
            .cpu()
            .numpy()
        )
        labels = batch["labels"].cpu().numpy()
        # -100 marks padding in the labels; restore the pad token before decoding.
        labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
        decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
        predictions.extend(decoded_preds)
        references.extend(decoded_labels)
        normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])
        normalized_references.extend([normalizer(label).strip() for label in decoded_labels])
    del generated_tokens, labels, batch
    gc.collect()

# Word error rate in percent, on the raw and on the normalized text.
wer = 100 * metric.compute(predictions=predictions, references=references)
normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)
eval_metrics = {"eval/wer": wer, "eval/normalized_wer": normalized_wer}

print(f"{wer=} and {normalized_wer=}")
print(eval_metrics)