In [1]:
from datasets import Audio, Dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

import torch
import numpy as np
import pandas as pd

# https://huggingface.co/blog/fine-tune-whisper#prepare-environment
# https://github.com/Vaibhavs10/fast-whisper-finetuning
# https://colab.research.google.com/drive/1DOkD_5OUjFa0r5Ik3SgywJLJtEo2qLxO?usp=sharing#scrollTo=62038ba3-88ed-4fce-84db-338f50dcd04f

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [3]:
from huggingface_hub import login
login("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")


In [4]:
torch.cuda.set_device("cuda:0")
torch.cuda.current_device()

0

# Loading data

In [5]:
train_df = pd.read_csv("arquivos/cv-corpus-19.0-2024-09-13/pt/train.tsv", sep='\t')

train_df["path"] = train_df["path"].apply(lambda x: f"arquivos/cv-corpus-19.0-2024-09-13/pt/clips/{x}")
train_df = train_df[["path", "sentence"]]
train_df.head()

Unnamed: 0,path,sentence
0,arquivos/cv-corpus-19.0-2024-09-13/pt/clips/co...,Comunique-se com os outros
1,arquivos/cv-corpus-19.0-2024-09-13/pt/clips/co...,Eu não faço as regras infelizmente.
2,arquivos/cv-corpus-19.0-2024-09-13/pt/clips/co...,"Sim, mas depende de você."
3,arquivos/cv-corpus-19.0-2024-09-13/pt/clips/co...,"Retifique o sal, se necessário, e sirva."
4,arquivos/cv-corpus-19.0-2024-09-13/pt/clips/co...,Está praticamente lá em cima agora.


In [6]:
eval_df = train_df.sample(frac=0.3, random_state=42)
train_df.drop(eval_df.index, inplace=True)

In [7]:
# eval_df = eval_df.iloc[:100]
# train_df = eval_df.iloc[:100]

In [8]:
eval_df.shape

(6529, 2)

In [9]:
train_df.shape

(15235, 2)

In [10]:
train = Dataset.from_pandas(train_df.copy())
train = train.cast_column("path", Audio(sampling_rate=16000))
train = train.rename_column("path", "audio")

In [11]:
eval = Dataset.from_pandas(eval_df.copy())
eval = eval.cast_column("path", Audio(sampling_rate=16000))
eval = eval.rename_column("path", "audio")

# Testing model

In [12]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Device set to use cuda:0


In [13]:
result = pipe(train[2]["audio"])
result

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


{'text': ' Está praticamente lá em cima agora.'}

In [14]:
train[2]

{'audio': {'path': 'arquivos/cv-corpus-19.0-2024-09-13/pt/clips/common_voice_pt_22021460.mp3',
  'array': array([1.01863407e-10, 0.00000000e+00, 3.63797881e-11, ...,
         2.10580311e-06, 2.08810479e-06, 1.17524883e-06], shape=(60288,)),
  'sampling_rate': 16000},
 'sentence': 'Está praticamente lá em cima agora.',
 '__index_level_0__': 4}

# Fine-tunning

#### Loading feature extractor and tokenizer

In [15]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)
tokenizer = WhisperTokenizer.from_pretrained(model_id, language="portuguese", task="transcribe")

#### Testing the tokenizer

In [16]:
labels = tokenizer(train[0]["sentence"]).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
print(labels)
print(decoded_with_special)
print(decoded_str)

[50258, 50267, 50360, 50364, 37611, 2431, 38091, 382, 319, 861, 296, 1536, 338, 590, 4082, 13, 50257]
<|startoftranscript|><|pt|><|transcribe|><|notimestamps|>Eu não faço as regras infelizmente.<|endoftext|>
Eu não faço as regras infelizmente.


#### Loading processor

In [17]:
processor = WhisperProcessor.from_pretrained(model_id, language="portuguese", task="transcribe")

In [18]:
audio = train[0]["audio"]

#### Preparing dataset

In [19]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [20]:
next(iter(train))

{'audio': {'path': 'arquivos/cv-corpus-19.0-2024-09-13/pt/clips/common_voice_pt_22021172.mp3',
  'array': array([ 7.27595761e-11,  2.01907824e-10,  3.78349796e-10, ...,
          5.72202771e-06, -1.66574173e-05, -3.81715727e-05], shape=(51456,)),
  'sampling_rate': 16000},
 'sentence': 'Eu não faço as regras infelizmente.',
 '__index_level_0__': 1}

In [None]:
train = train.map(prepare_dataset)

Map:   0%|          | 0/15235 [00:00<?, ? examples/s]

In [None]:
eval = eval.map(prepare_dataset)

Map:   0%|          | 0/6529 [00:00<?, ? examples/s]

In [None]:
# https://discuss.huggingface.co/t/trainer-runtimeerror-the-size-of-tensor-a-462-must-match-the-size-of-tensor-b-448-at-non-singleton-dimension-1/26010/6

MAX_DURATION_IN_SECONDS = 30.0
max_input_length = MAX_DURATION_IN_SECONDS * 16000
max_label_length = model.config.max_length

def filter_values(x):
    """Filter label sequences longer than max length (448)"""
    labels_length = len(x["labels"])
    return labels_length < max_label_length


In [None]:
print(train.shape)
train = train.filter(filter_values)
train.shape

(15235, 5)


Filter:   0%|          | 0/15235 [00:00<?, ? examples/s]

(15234, 5)

In [None]:
print(eval.shape)
eval = eval.filter(filter_values)
eval.shape

(6529, 5)


Filter:   0%|          | 0/6529 [00:00<?, ? examples/s]

(6528, 5)

#### Defining a data collator

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [None]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

#### Evaluation metric

In [None]:
import evaluate
metric = evaluate.load("wer")

In [None]:
# def replace_after_end_token(token_matrix, end_token_id=50257):
#     if token_matrix.ndim != 2:
#         print("does not apply")
#         return
#     rows_with_end_token, cols_of_end_token = np.where(token_matrix == end_token_id)

#     replacement_mask = np.full(token_matrix.shape, False, dtype=bool)

#     unique_rows = np.unique(rows_with_end_token)
#     for r in unique_rows:
#         first_end_token_col = cols_of_end_token[rows_with_end_token == r].min()
#         replacement_mask[r, first_end_token_col + 1:] = True

#     # Apply the replacement using the mask
#     token_matrix[replacement_mask] = end_token_id

def compute_metrics(pred):
    # try:
    #     print(pred.predictions)
    #     print(pred.label_ids)
    #     print(pred.predictions[0].shape, pred.label_ids.shape)
    # except:
    #     pass
    # pred_ids = np.argmax(pred.predictions[0], axis=-1)
    # print(pred_ids.shape)
    label_ids = pred.label_ids
    pred_ids = pred.predictions

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id
    # we do not want to group tokens when computing the metrics
    # print(pred_ids[0])
    # replace_after_end_token(pred_ids)
    # print(pred_ids[0])

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # print(pred_str[0])
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    # print(label_str[0])

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    # raise TypeError("a")
    return {"wer": wer}

#### Lora

In [None]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(model_id, device_map="cuda:0")

In [None]:
model.generation_config.language = "portuguese"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None

In [None]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [None]:
def make_inputs_require_grad(module, input, output):
    output.requires_grad_(True)

model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

<torch.utils.hooks.RemovableHandle at 0x73ec7610af10>

In [None]:
from peft import LoraConfig, LoraConfig, get_peft_model

config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 15,728,640 || all params: 1,559,219,200 || trainable%: 1.0088


In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="arquivos/whisper-l-v3-ft-backup",  # change to a repo name of your choice
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=6000,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
    remove_unused_columns=False,  # required as the PeftModel forward doesn't have the signature of the wrapped model's forward
    label_names=["labels"],  # same reason as above
)

In [None]:
model.device

device(type='cuda', index=0)

In [None]:
import os
from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train,
    eval_dataset=eval,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


KeyboardInterrupt: 

In [None]:
model.push_to_hub("jp003/whisper-large-v3-lora-cv-pt")


#### Testing

In [None]:
test_df = pd.read_csv("arquivos/cv-corpus-19.0-2024-09-13/pt/test.tsv", sep='\t')

test_df["path"] = test_df["path"].apply(lambda x: f"arquivos/cv-corpus-19.0-2024-09-13/pt/clips/{x}")
test_df = test_df[["path", "sentence"]]
test_df.head()

In [None]:
test = Dataset.from_pandas(test_df.copy())
test = test.cast_column("path", Audio(sampling_rate=16000))
test = test.rename_column("path", "audio")

In [None]:
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
)
from peft import PeftModel, PeftConfig


peft_model_id = "jp003/whisper-large-v3-lora-cv-pt" # Use the same model ID as before.
language = "portuguese"
task = "transcribe"
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path,device_map="cuda:0"
)

model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor
forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
pipe = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)

In [None]:
def transcribe(audio):
    with torch.cuda.amp.autocast():
        text = pipe(audio, generate_kwargs={"forced_decoder_ids": forced_decoder_ids})["text"]
    return text

In [None]:
transcribe(test_df["path"][0])

In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    peft_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, device_map="cuda:0"
)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
)

In [None]:
result = pipe(test[0]["audio"])
result

In [None]:
test[0]["sentence"]