<a href="https://colab.research.google.com/github/MikeCorv/WhisperFineTuning/blob/main/WHISPERV3TURBOFINETUNING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets librosa soundfile accelerate
!pip install -q bitsandbytes peft

In [None]:
pip install torchcodec

In [None]:
!pip install -U datasets[audio]

In [None]:
import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and adapter paths used later
# in this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
# Report which CUDA GPU (if any) the runtime exposes and how much memory it has.
print("3. Checking Hardware...")
if not torch.cuda.is_available():
    print("CRITICAL ERROR: No GPU detected. Go to Runtime -> Change runtime type -> T4 GPU.")
else:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"SUCCESS: GPU Detected -> {gpu_name}")
    # total_memory is divided by 1e9 to express it in (decimal) gigabytes.
    vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"VRAM available: {vram:.2f} GB")

In [None]:
# Hugging Face Hub identifier of the checkpoint that is loaded, quantized and fine-tuned below.
MODEL_ID = "openai/whisper-large-v3-turbo"

In [None]:
# 4-bit quantization so the model fits on a free-tier Colab GPU (no Colab Pro).
print(f"Defining Quantization Config for {MODEL_ID}...")
# Store the weights in 4-bit format (cuts the model size by about 4x).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # NF4 (4-bit NormalFloat) storage type; background:
    # https://www.emergentmind.com/topics/4-bit-normalfloat-nf4-quantization
    bnb_4bit_quant_type="nf4",
    # From the QLoRA paper (https://arxiv.org/pdf/2305.14314): QLoRA has one low-precision
    # storage data type (here 4-bit) and one computation data type that is usually BFloat16.
    # Whenever a quantized weight tensor is used, it is dequantized to the computation
    # type and the matrix multiplication is performed in 16-bit.
    # NOTE(review): this notebook uses float16, not BFloat16, as the computation type --
    # presumably to match fp16=True in the training arguments; confirm for the target GPU.
    bnb_4bit_compute_dtype=torch.float16,
    # Double quantization, as described in the QLoRA paper linked above.
    bnb_4bit_use_double_quant=True,
)


In [None]:
print("Downloading and Loading the Model (this handles the weights)...")
# Whisper is a built-in transformers architecture, so the checkpoint is loaded
# without trust_remote_code: no code from the Hub repository needs to be executed.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,  # 4-bit settings defined in the previous cell
    device_map="auto",               # Automatically put it on the GPU
)

In [None]:
# Prepare the quantized ("compressed") model for training before adapters are attached.
# Note: this rebinds `model`, so the cell operates on whatever `model` currently is.
model = prepare_model_for_kbit_training(model)

In [None]:
print("3. Loading the Processor (The 'Translator')...")
# The processor handles both directions of preprocessing: audio -> spectrogram features
# (feature extractor) and text -> token IDs (tokenizer). It is configured here for
# Italian transcription.
processor = AutoProcessor.from_pretrained(MODEL_ID, language="italian", task="transcribe")

In [None]:
# --- RESET MODEL STATE ---
print("Checking model state...")

# The presence of an `unload` attribute is used as the signal that the model is
# currently wrapped in LoRA adapters (e.g. after re-running the cells below).
if not hasattr(model, "unload"):
    print("ℹ️ NOTE: Model was already clean (no adapters found).")
else:
    # Strip the LoRA layers and keep the underlying base model.
    model = model.unload()
    print("✅ SUCCESS: Old adapters unloaded. Model is back to clean base state.")

In [None]:
#Adapters
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
print("Defining LoRA Config...")
# "task_type" is intentionally not passed; see https://github.com/huggingface/peft/issues/1988
lora_config = LoraConfig(
    target_modules=["q_proj", "v_proj"],  # attach adapters only to these attention projections
    r=32,                                 # rank of the adapter matrices
    lora_alpha=64,                        # scaling factor (here 2x the rank)
    lora_dropout=0.05,                    # 5% dropout on the adapter path to limit memorization
    bias="none",
)

In [None]:
print("Injecting Adapters into the Model...")
# Wrap the base model with the LoRA layers described by `lora_config`.
# `model` is rebound to the wrapped model from here on.
model = get_peft_model(model, lora_config)

In [None]:
print("\n--- PARAMETER CHECK ---")
# Sanity check: report how many parameters are trainable after adapter injection.
model.print_trainable_parameters()

In [None]:
#Reloading clean data from my Drive

from datasets import load_from_disk
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Location on the mounted Google Drive of the dataset previously saved to disk.
DATA_PATH = "/content/drive/MyDrive/fleurs_it_processed"
# The loaded object is indexed by split name ("train" / "test") below.
dataset = load_from_disk(DATA_PATH)
print(f"Dataset Loaded! Train size: {len(dataset['train'])} | Test size: {len(dataset['test'])}")



In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded training batch.

    Each feature must provide ``input_features`` (audio features) and ``labels``
    (token IDs). Audio features and labels are padded separately, padded label
    positions are replaced with -100 so they are ignored by the loss, and a
    leading decoder-start token is removed from the labels when every row
    begins with it.
    """

    # Processor exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # ID of the token the model prepends to the decoder input. When None, it is
    # resolved from the tokenizer (see `_resolve_decoder_start_token_id`).
    decoder_start_token_id: Union[int, None] = None

    def _resolve_decoder_start_token_id(self) -> Union[int, None]:
        """Return the token ID that should be stripped from the front of the labels."""
        if self.decoder_start_token_id is not None:
            return self.decoder_start_token_id

        tokenizer = self.processor.tokenizer
        # For Whisper, `bos_token_id` is NOT the token that starts a label
        # sequence; labels begin with "<|startoftranscript|>". Look that token up
        # first and only fall back to `bos_token_id` when it is unknown.
        lookup = getattr(tokenizer, "convert_tokens_to_ids", None)
        if callable(lookup):
            token_id = lookup("<|startoftranscript|>")
            unk_id = getattr(tokenizer, "unk_token_id", None)
            if isinstance(token_id, int) and token_id != unk_id:
                return token_id
        return tokenizer.bos_token_id

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch with padded ``input_features`` and loss-masked ``labels``."""
        # A. Separate Audio (Inputs) and Text (Labels): they need different padding.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # B. Pad Audio to the longest in the batch
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # C. Pad Text to the longest sentence
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # D. Mask Padding: positions where the attention mask is not 1 become -100,
        # the index that is excluded from the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # E. Remove the decoder-start token if every row begins with it; the model
        # adds it again when it builds the decoder inputs from the labels.
        start_id = self._resolve_decoder_start_token_id()
        if (
            start_id is not None
            and labels.shape[1] > 0
            and (labels[:, 0] == start_id).all().cpu().item()
        ):
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Bind the collator to the processor so it can pad audio features and token IDs per batch.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

print("SUCCESS: Data is loaded and the Collator is ready.")

In [None]:
# --- INSPECTION BLOCK ---
import numpy as np

# Look at the first raw training example. This reads the 'audio', 'sentence' and
# 'duration' columns, so it only works before the feature-extraction `map` further
# down replaces `dataset` with the processed columns.
raw_example = dataset["train"][0]
print(f"Keys available: {list(raw_example.keys())}")
print(f"Audio content:   '{raw_example['audio']}'")
print(f"Text content:   '{raw_example['sentence']}'")
print(f"Duration:   '{raw_example['duration']}'")

In [None]:
# Column names of the train split before feature extraction.
print(f"Columns currently in dataset: {dataset['train'].column_names}")

In [None]:
# Keep the audio entry of the first training example for the extraction test below.
audio_test = dataset["train"][0]["audio"]
print(f"Audio content:   '{audio_test}'")

In [None]:
# Dry run of the feature extractor on a single clip, using the clip's own sampling rate.
audio_extraction_test = processor.feature_extractor(audio_test["array"], sampling_rate=audio_test["sampling_rate"])
print(f"Audio content:   '{audio_extraction_test}'")

In [None]:
# First (and only) entry of the extracted features -- the value stored per example later on.
print(audio_extraction_test.input_features[0])

In [None]:
# Dry run of the tokenizer on the first training sentence to inspect the produced token IDs.
tokenizer_test = processor.tokenizer(dataset['train'][0]["sentence"])
print(tokenizer_test)

In [None]:
def prepare_dataset(batch):
    """Turn one raw example into model inputs.

    Reads ``batch["audio"]`` (with ``"array"`` and ``"sampling_rate"`` entries) and
    ``batch["sentence"]``, and adds two keys to the same mapping:
    ``"input_features"`` (feature-extractor output for the clip) and ``"labels"``
    (token IDs of the sentence). The updated ``batch`` is returned.
    """
    clip = batch["audio"]

    # Audio -> spectrogram features; the extractor returns a batch, keep its single entry.
    extracted = processor.feature_extractor(
        clip["array"], sampling_rate=clip["sampling_rate"]
    )
    batch["input_features"] = extracted.input_features[0]

    # Text -> token IDs.
    encoded = processor.tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

# This cell rebinds `dataset`. Without the guard, running it a second time would
# call prepare_dataset on rows that no longer have an 'audio' column and fail.
if "input_features" in dataset["train"].column_names:
    print("Dataset already contains 'input_features'; skipping feature extraction.")
else:
    dataset = dataset.map(
        prepare_dataset,
        remove_columns=dataset["train"].column_names,  # Remove 'audio', 'sentence', etc.
        num_proc=1,  # Keep it safe on RAM
        desc="Feature Extraction",
    )

print("SUCCESS: Dataset is now in 'Model Format' (input_features, labels).")

In [None]:
# Column names of the train split after feature extraction.
print(f"Columns currently in dataset: {dataset['train'].column_names}")

In [None]:
#for evaluating the model, we'll use Just In Time WER
!pip install -q evaluate jiwer

In [None]:
import evaluate
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Word-error-rate metric object used by compute_metrics below.
metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate (in percent) for one evaluation run.

    Args:
        pred: prediction object exposing ``predictions`` (generated token IDs, or a
            tuple whose first element holds them) and ``label_ids`` (reference
            token IDs, with -100 at ignored positions).

    Returns:
        ``{"wer": <float>}`` -- 100 times the value returned by the WER metric.
    """
    pred_ids = pred.predictions
    # Some model outputs arrive as a tuple; the token IDs are the first element.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    pad_id = processor.tokenizer.pad_token_id

    # Work on copies so the arrays owned by the caller are not modified in place.
    pred_ids = pred_ids.copy()
    label_ids = pred.label_ids.copy()

    # -100 is not a valid token ID. It marks ignored label positions and can also
    # appear as padding in gathered predictions, so replace it before decoding.
    pred_ids[pred_ids == -100] = pad_id
    label_ids[label_ids == -100] = pad_id

    # Decode the model's guess and the correct answer back to text
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # Calculate the error rate
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned-lora",
    # --- optimisation schedule ---
    learning_rate=1e-3,
    max_steps=500,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 examples per optimizer step (per device)
    # --- memory / precision ---
    fp16=True,
    gradient_checkpointing=True,
    # --- evaluation ---
    eval_strategy="steps",
    eval_steps=250,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    # --- checkpointing and logging ---
    save_steps=250,
    logging_steps=25,
    report_to=["tensorboard"],
    # --- column handling ---
    remove_unused_columns=False,
    label_names=["labels"],  # Explicitly tell it where the answers are
)

In [None]:
# Wire everything together: the adapter-wrapped model, the processed train/test
# splits, the padding collator and the WER metric function.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): only the feature extractor (not the full processor) is handed to
    # the trainer here -- confirm this is what should be stored with checkpoints.
    processing_class=processor.feature_extractor,
)

In [None]:
# Run fine-tuning with the arguments defined above (max_steps=500, evaluation and
# checkpoint every 250 steps).
trainer.train()

In [None]:
# Destination on the mounted Google Drive, so the result survives the Colab session.
ADAPTER_PATH = "/content/drive/MyDrive/whisper-large-v3-turbo-italian-lora"

print(f"Saving adapters to {ADAPTER_PATH}...")

# `model` is the adapter-wrapped model at this point; presumably this writes the
# adapter weights/config rather than the full base model -- verify the saved files.
model.save_pretrained(ADAPTER_PATH)
# Save the processor next to the adapters so both can be loaded from one directory.
processor.save_pretrained(ADAPTER_PATH)
