LoRA is a fine-tuning technique that decreases the compute needed by making a low-rank decomposition of the weight matrix. Let's evaluate its effectiveness on our Cantonese ASR fine-tuning problem.

# Setup
Make a fresh environment:
- `conda create -n HuggingFace python=3.12.3`
- `conda activate HuggingFace`

Install CUDA + Juypter:
- `conda install -c conda-forge cudatoolkit-dev -y`
- `conda install -n HuggingFace ipykernel --update-deps --force-reinstall`

Install FFMPEG
- `sudo apt update`
- `sudo apt install ffmpeg -y`

In [22]:
!pip install --upgrade pip
!pip install --upgrade datasets[audio] accelerate evaluate jiwer tensorboard gradio torchaudio torchcodec huggingface-hub python-dotenv peft
!pip install transformers==4.52.0 # Versions reported to work: 4.48.0, 4.50.3, 4.52.0

Collecting torchaudio
  Downloading torchaudio-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.9 kB)
Collecting huggingface-hub
  Using cached huggingface_hub-1.3.2-py3-none-any.whl.metadata (13 kB)
Downloading torchaudio-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.4 MB/s[0m  [33m0:00:00[0m [31m13.2 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: torchaudio
Successfully installed torchaudio-2.9.1


In [3]:
from dotenv import load_dotenv
import os

load_dotenv()
hf_key = os.getenv("HF_KEY")
mf_key = os.getenv('MF_KEY')

from huggingface_hub import login
login(token=hf_key)
print("Successfully logged in")

  from .autonotebook import tqdm as notebook_tqdm


Successfully logged in


In [9]:
# Download the dataset (4 mins download + 2 mins unzip) #

# Create download session
import subprocess
import json
result = subprocess.run([
    "curl", "-X", "POST", "https://datacollective.mozillafoundation.org/api/datasets/cmj8u3q2b00v9nxxborfkm824/download",
    "-H", f"Authorization: Bearer {mf_key}",
    "-H", "Content-Type: application/json" ],
                        capture_output=True, text=True
)
response_json = json.loads(result.stdout)
download_url = response_json["downloadUrl"]
print("Download URL acquired, starting download")

# Download
subprocess.run([ 
    "curl", "-L", "-o", "Common Voice Scripted Speech 24.0 - Cantonese.tar.gz", download_url 
])

# Unzip
!tar -xzf 'Common Voice Scripted Speech 24.0 - Cantonese.tar.gz'
print("Finished unzipping")

Download URL acquired, starting download


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 6123M  100 6123M    0     0  5009k      0  0:20:51  0:20:51 --:--:-- 4443k


Finished unzipping


In [3]:
# Create the non-audio dataset #

# Create dataframe 'path' and 'sentence'
from datasets import Dataset, DatasetDict, Audio
import os
import pandas as pd
train_df = pd.read_csv('cv-corpus-24.0-2025-12-05/yue/train.tsv', delimiter='\t')[['path', 'sentence']]
test_df  = pd.read_csv('cv-corpus-24.0-2025-12-05/yue/test.tsv',  delimiter='\t')[['path', 'sentence']]

# Write fulle path
base_dir = "cv-corpus-24.0-2025-12-05/yue/clips"
train_df["path"] = train_df["path"].apply(lambda p: os.path.join(base_dir, p))
test_df["path"]  = test_df["path"].apply(lambda p: os.path.join(base_dir, p))

# Turn to hugging face format
train_hf = Dataset.from_pandas(train_df, preserve_index=False)
test_hf  = Dataset.from_pandas(test_df,  preserve_index=False)
common_voice = DatasetDict({
    "train": train_hf,
    "test": test_hf
})

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Get data ready for Whisper #

# Load Whisper processors
from transformers import WhisperFeatureExtractor, WhisperTokenizer
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Cantonese", task="transcribe")

# Define generator for processing data
# input_features: path -> load audio w/ torchaudio -> resample to 16k -> feature extraction w/ Whisper -> input_features
# labels: sentence -> tokenize w/ Whisper -> labels
from tqdm import tqdm
import numpy as np
import torchaudio
from datasets import Dataset
target_sr = 16000
def row_generator(stage):
    for i in range(len(common_voice[stage])):
        row = common_voice[stage][i]
        wf, sr = torchaudio.load(row['path'])
        if sr != target_sr:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
            wf = resampler(wf)
            sr = target_sr
        wf = wf.flatten().numpy()
        features = feature_extractor(wf, sampling_rate=sr).input_features[0]
        features = features.astype("float64")
        labels = tokenizer(row["sentence"]).input_ids
        yield {
            "input_features": features,
            "labels": labels
        }
common_voice["train"] = Dataset.from_generator(lambda: row_generator("train"))
common_voice["test"]  = Dataset.from_generator(lambda: row_generator("test"))
print(common_voice)
print(common_voice['train'][0])

Generating train split: 5130 examples [01:42, 50.22 examples/s] 


DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 7420
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 5130
    })
})
{'input_features': [[-0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6238925457000732, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.6592628955841064, -0.659262

In [16]:
# Load model & get ready for training #

# Load model
from transformers import WhisperProcessor
from transformers import WhisperForConditionalGeneration
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Cantonese", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print("Device:", str(device))

# Configure for Cantonese
model.generation_config.language = "cantonese"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None
from transformers.models.whisper.tokenization_whisper import LANGUAGES, TO_LANGUAGE_CODE
language = "cantonese"
language_code = TO_LANGUAGE_CODE[language]
token = f"<|{language_code}|>"
token_id = processor.tokenizer.convert_tokens_to_ids(token)
model.generation_config.lang_to_id[token] = token_id

# Define data collator
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]
        batch["labels"] = labels
        return batch
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

# Define evaluation metric: CER for Cantonese
import evaluate
metric = evaluate.load("cer")
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = tokenizer.pad_token_id
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    cer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}

# Import loRA related stuff
from peft import LoraConfig, get_peft_model
config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
model = get_peft_model(model, config)
model.print_trainable_parameters()


# Define arguments
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-canto-lora",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=10000,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
    push_to_hub=True,
)

# Define trainer
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor.feature_extractor,
)

# Allow for training
for param in model.parameters():
    param.requires_grad = True   



'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 27526794-1d1f-4fed-bd80-df94967a0082)')' thrown while requesting HEAD https://huggingface.co/openai/whisper-small/resolve/main/preprocessor_config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: ea53aacc-46c2-4e76-af16-c4c447bfa2c2)')' thrown while requesting HEAD https://huggingface.co/openai/whisper-small/resolve/main/config.json
Retrying in 1s [Retry 1/5].


Device: cuda
trainable params: 3,538,944 || all params: 245,273,856 || trainable%: 1.4429


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [17]:
# Train #
trainer.train()

Step,Training Loss,Validation Loss,Cer
1000,0.0793,0.284264,13.328677
2000,0.0192,0.317738,12.812219
3000,0.0072,0.356729,12.747891
4000,0.0016,0.378363,12.483229
5000,0.0007,0.397273,12.459336
6000,0.0004,0.419011,12.350898
7000,0.0002,0.423546,12.505284
8000,0.0001,0.438299,12.444632
9000,0.0001,0.442476,12.543881
10000,0.0001,0.443586,12.435443


TrainOutput(global_step=10000, training_loss=0.05463887736317702, metrics={'train_runtime': 36239.2591, 'train_samples_per_second': 4.415, 'train_steps_per_second': 0.276, 'total_flos': 4.696436750450688e+19, 'train_loss': 0.05463887736317702, 'epoch': 21.551724137931036})

In [18]:
# Push to HuggingFace #
kwargs = {
    "dataset_tags": "mozilla-foundation/common_voice_24",
    "dataset": "Common Voice 24.0 - Cantonese",
    "dataset_args": "config: cantonese, split: test",
    "language": "yue",
    "model_name": "Whisper Small Canto - Chengyi Li",
    "finetuned_from": "openai/whisper-small-lora",
    "tasks": "automatic-speech-recognition",
}
trainer.push_to_hub(**kwargs)
processor.push_to_hub("chengyili2005/whisper-small-canto-lora")


Processing Files (3 / 3): 100%|██████████| 14.3MB / 14.3MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  


CommitInfo(commit_url='https://huggingface.co/chengyili2005/whisper-small-canto-lora/commit/0f9be8d0206b5bae63be717dd74c229ea79dd527', commit_message='Upload processor', commit_description='', oid='0f9be8d0206b5bae63be717dd74c229ea79dd527', pr_url=None, repo_url=RepoUrl('https://huggingface.co/chengyili2005/whisper-small-canto-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='chengyili2005/whisper-small-canto-lora'), pr_revision=None, pr_num=None)