In [None]:
# ============================================================
# üöÄ Fine-tune Whisper Small (OpenAI) ti·∫øng Vi·ªát ‚Äî An to√†n v·ªõi Unsloth
# Author: Mr.Jack (https://github.com/Mr-Jack-Tung)
# Date: 2025-11-01
# Description: H∆∞·ªõng d·∫´n fine-tune m√¥ h√¨nh Whisper Small c·ªßa OpenAI v·ªõi ti·∫øng Vi·ªát
# ============================================================

In [2]:
# !pip install -q unsloth  # ‚ö° C√†i ƒë·∫∑t unsloth ƒë·ªÉ patch nhanh
!pip install -q "pyarrow<20.0.0" transformers datasets accelerate librosa jiwer evaluate

In [3]:
!pip install -q datasets soundfile torchcodec

In [4]:
# ‚ö° Install PEFT / LoRA dependencies (CPU-friendly)
!pip install -q peft accelerate safetensors
# Note: bitsandbytes is GPU-only and is not installed in this CPU-only environment

In [None]:
# 1Ô∏è‚É£ Import unsloth tr∆∞·ªõc (r·∫•t quan tr·ªçng)
# import unsloth  # ‚ö° b·∫≠t patch nhanh cho Trainer, torch, dataset

# 2Ô∏è‚É£ Import c√°c th∆∞ vi·ªán kh√°c
import torch
from datasets import load_dataset, Audio
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    TrainingArguments,
    Trainer,
)
import evaluate

In [6]:
# CPU-only notes
# - This notebook is configured for CPU-only fine-tuning. Training will be significantly slower than on GPU.
# - Keep datasets small, use small batch sizes, and prefer fewer epochs for experiments.
# - bitsandbytes and k-bit training are GPU-only and are not used here.
# - Run cells in order: installs -> imports -> model -> dataset -> training args -> LoRA prep -> training -> save

In [7]:
# 3Ô∏è‚É£ Khai b√°o model
model_name = "openai/whisper-small"
language = "vi"
task = "transcribe"

processor = WhisperProcessor.from_pretrained(model_name, language=language, task=task)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

In [8]:
# 4Ô∏è‚É£ Load dataset (v√≠ d·ª• VIVOS)
# dataset = load_dataset("vivos")
dataset = load_dataset("quocanh34/viet_vivos")
# dataset = load_dataset("quocanh34/viet_vivos", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 9964
    })
    validation: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 685
    })
    test: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 686
    })
})


In [9]:
train_dataset = dataset["train"]
train_dataset

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 9964
})

In [10]:
print("üìÑ Example:", dataset["train"]["transcription"][0])

üìÑ Example: hi·ªán ch∆∞a c√≥ th√¥ng tin chi ti·∫øt v·ªÅ v·ª• b·∫Øn s√∫ng n√†y


In [11]:
# Ch·ªâ l·∫•y 1 sample ƒë·∫ßu ti√™n an to√†n
sample = dataset["train"].select([0]).to_dict()

print("Transcription:", sample["transcription"][0])
print("Audio file path:", sample["audio"][0]["path"])

Transcription: hi·ªán ch∆∞a c√≥ th√¥ng tin chi ti·∫øt v·ªÅ v·ª• b·∫Øn s√∫ng n√†y
Audio file path: VIVOSSPK22_080.wav


In [12]:
print(sample["audio"][0].keys())

dict_keys(['bytes', 'path'])


In [13]:
import io
import torchaudio
from IPython.display import Audio as IPyAudio

# L·∫•y bytes t·ª´ sample
audio_bytes = sample["audio"][0]["bytes"] if "bytes" in sample["audio"][0] else sample["audio"][0]["array"]

# D√πng BytesIO ƒë·ªÉ gi·∫£ l·∫≠p file WAV trong RAM
audio_buffer = io.BytesIO(audio_bytes)

# Load waveform
waveform, sr = torchaudio.load(audio_buffer)

# Ph√°t √¢m thanh
IPyAudio(waveform.numpy()[0], rate=sr)

In [14]:
# 5Ô∏è‚É£ collate_fn ‚Äî x·ª≠ l√Ω d·ªØ li·ªáu on-the-fly
def collate_fn(batch):
    input_features = [
        processor.feature_extractor(
            sample["audio"]["array"], sampling_rate=16000
        ).input_features[0]
        for sample in batch
    ]
    labels = [
        processor.tokenizer(sample["transcription"]).input_ids
        for sample in batch
    ]

    labels = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(l) for l in labels],
        batch_first=True,
        padding_value=processor.tokenizer.pad_token_id,
    )

    return {
        "input_features": torch.tensor(input_features),
        "labels": labels,
    }

# 6Ô∏è‚É£ Metric
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [None]:
# 7Ô∏è‚É£ TrainingArguments
training_args = TrainingArguments(
    output_dir="./whisper-small-vi",
    per_device_train_batch_size=1,  # small batch for CPU
    gradient_accumulation_steps=2,
    # eval_strategy="steps",
    # save_steps=100,
    # eval_steps=100,
    save_strategy="no",   # kh√¥ng cho Trainer t·ª± save gi·ªØa ch·ª´ng
    save_safetensors=False,  # t·∫Øt safe serialization
    logging_steps=1,
    num_train_epochs=1,
    learning_rate=1e-4,
    fp16=False,  # disabled on CPU
    no_cuda=True,  # force CPU
    dataloader_num_workers=0,
    # save_total_limit=1,
    report_to="none",
    remove_unused_columns=False,  # Allow custom batch keys for Whisper
 )

# 8Ô∏è‚É£ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"].select(range(10)),
    # eval_dataset=dataset["test"].select(range(10)),
    data_collator=collate_fn,
    # compute_metrics=compute_metrics,
 )

In [16]:
model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [None]:
# 8Ô∏è‚É£ Prepare LoRA (PEFT) adapter and run training
# CPU-only path: do not use bitsandbytes or k-bit preparation
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Ensure model is on CPU and uses float32
device = torch.device('cpu')
# model = model.to(device).to(torch.float32)

# Configure LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["k_proj", "v_proj", "q_proj", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    inference_mode=False,
 )

# Wrap the original model with PEFT's LoRA
peft_model = get_peft_model(model, lora_config)

# Print the number of trainable parameters (will be a small fraction of the total)
peft_model.print_trainable_parameters() 

# Attach the PEFT model to the Trainer and run training
trainer.model = peft_model
trainer.train()

In [19]:
# 9Ô∏è‚É£ L∆∞u model (g·ªëc)
trainer.save_model("./whisper-small-vi")
processor.save_pretrained("./whisper-small-vi")
# Save LoRA/PEFT adapter as well
try:
    peft_model.save_pretrained("./whisper-small-vi-lora")
    print("Saved LoRA adapter to ./whisper-small-vi-lora")
except NameError:
    print("peft_model not found ‚Äî if you ran the LoRA cell the adapter will be saved automatically.")

Saved LoRA adapter to ./whisper-small-vi-lora


In [20]:
# üîü Ki·ªÉm th·ª≠ inference
sample = dataset["test"][0]

inputs = processor(
    sample["audio"]["array"],
    sampling_rate=16000,
    return_tensors="pt"
)

# √âp ki·ªÉu v√† ƒë∆∞a l√™n ƒë√∫ng device
inputs = {k: v.to(model.device).to(model.dtype) for k, v in inputs.items()}

# G·ªçi generate k√®m attention_mask
predicted_ids = model.generate(
    input_features=inputs["input_features"],
    attention_mask=inputs.get("attention_mask", None),
    task="transcribe",
    language="vi"
)

transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

print("üó£Ô∏è Ground truth:", sample["transcription"])
print("‚ú® Whisper prediction:", transcription[0])

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


üó£Ô∏è Ground truth: n·ª≠a v√≤ng tr√°i ƒë·∫•t h∆°n b·∫£y nƒÉm
‚ú® Whisper prediction:  N·ª≠a v√≤ng tr√°i ƒë·∫•t h∆°n 7 nƒÉm.
