# Fine-tune Whisper for Georgian on Google Colab (Free GPU!)

**Instructions:**
1. Open this notebook in Google Colab
2. Go to Runtime → Change runtime type → Select GPU (T4 or better)
3. Upload your dataset or mount Google Drive
4. Run all cells

**Training time on Colab GPU:** ~8-15 hours

In [None]:
# Check GPU availability by running the `nvidia-smi` shell command
# (the leading `!` is the notebook shell escape).
!nvidia-smi

In [None]:
# Install dependencies quietly (-q). No versions are pinned, so whatever
# releases are current at install time get installed.
!pip install -q transformers datasets accelerate evaluate jiwer librosa soundfile gradio

In [None]:
# Mount Google Drive (if your dataset is there)
from google.colab import drive
drive.mount('/content/drive')  # Drive is mounted at this path

# Two ways to make the dataset available:
# Option 1: Upload the tar.gz and extract it
# Option 2: Put the cv-corpus folder in Google Drive and point DATA_DIR
#           (set in the configuration cell) to it

In [None]:
# If you uploaded the tar.gz file, extract it into the current directory
# (-x extract, -z gzip-compressed, -f archive file name):
!tar -xzf mcv-scripted-ka-v23.0.tar.gz

In [None]:
import os
import pandas as pd
import torch
from pathlib import Path
from datasets import Dataset, DatasetDict, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate

# Report the runtime environment: import status, CUDA availability, device name.
print("✓ Imports successful")
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
# Only query the GPU name when CUDA is present; otherwise report the CPU.
device_label = torch.cuda.get_device_name(0) if has_cuda else 'CPU'
print(f"Device: {device_label}")

In [None]:
# Configuration - ADJUST THESE PATHS!
# DATA_DIR is expected to contain train.tsv, test.tsv and a clips/ folder,
# which are the paths read by the loading cell.
DATA_DIR = Path("cv-corpus-23.0-2025-09-05/ka")  # Adjust if needed
CLIPS_DIR = DATA_DIR / "clips"  # audio files named in the TSV `path` column
OUTPUT_DIR = "./whisper-georgian-finetuned"  # training output and final saved model

MODEL_NAME = "openai/whisper-small"  # tiny, base, small, medium, large-v3
LANGUAGE = "ka"  # language passed to the Whisper tokenizer and processor
TASK = "transcribe"  # task passed to the Whisper tokenizer and processor

# Training hyperparameters (consumed by the training-arguments cell)
BATCH_SIZE = 16  # Colab T4 can handle 16, reduce to 8 if OOM
LEARNING_RATE = 1e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 500
SAVE_STEPS = 1000  # passed as save_steps
EVAL_STEPS = 1000  # passed as eval_steps

In [None]:
# Load dataset
# The TSVs are read as plain tab-separated text with quote handling disabled.
# With pandas' default quoting, a sentence containing a stray `"` character can
# swallow the following rows or abort parsing.
import csv

print("Loading data...")
train_df = pd.read_csv(DATA_DIR / "train.tsv", sep='\t', quoting=csv.QUOTE_NONE)
test_df = pd.read_csv(DATA_DIR / "test.tsv", sep='\t', quoting=csv.QUOTE_NONE)

# Prepare data
def prepare_df(df, clips_dir=None):
    """Build the audio/transcription table for one dataset split.

    Args:
        df: DataFrame with a ``path`` column (clip file names) and a
            ``sentence`` column (reference text). It is not modified.
        clips_dir: Directory that holds the clips. Defaults to the
            notebook-level ``CLIPS_DIR``.

    Returns:
        A new DataFrame with the columns ``audio`` (full clip path as ``str``)
        and ``transcription``, restricted to rows whose clip exists on disk
        and re-indexed from 0.
    """
    clips_root = CLIPS_DIR if clips_dir is None else Path(clips_dir)
    # assign() works on a copy, so the caller's frame does not gain an
    # 'audio' column as a side effect.
    prepared = df.assign(audio=df['path'].map(lambda name: str(clips_root / name)))
    on_disk = prepared['audio'].map(os.path.exists).astype(bool)
    prepared = prepared.loc[on_disk, ['audio', 'sentence']]
    # Fresh 0..n-1 index: a filtered index could otherwise be carried into
    # Dataset.from_pandas as an extra `__index_level_0__` column.
    return prepared.rename(columns={'sentence': 'transcription'}).reset_index(drop=True)

train_prepared = prepare_df(train_df)
test_prepared = prepare_df(test_df)

print(f"Training samples: {len(train_prepared)}")
print(f"Test samples: {len(test_prepared)}")

# Convert to Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the dataset. After rows
# are filtered, the index may otherwise be stored as an `__index_level_0__`
# column, possibly in only one split, and the later
# `remove_columns=dataset.column_names["train"]` call needs both splits to
# have the same columns.
train_dataset = Dataset.from_pandas(train_prepared, preserve_index=False)
test_dataset = Dataset.from_pandas(test_prepared, preserve_index=False)

# Cast audio column so clips are decoded and resampled to 16 kHz on access
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

print("✓ Dataset loaded")

In [None]:
# Load Whisper components
print(f"Loading {MODEL_NAME}...")
# Audio side: converts raw waveforms into model input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)
# Text side: tokenizer configured with the target language and task.
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)
# Processor exposes both a feature extractor and a tokenizer; it is used by
# the data collator, for saving, and for decoding at inference time.
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)
print("✓ Components loaded")

In [None]:
# Prepare data for training
def prepare_data_for_training(batch):
    """Add model inputs and label ids to a single dataset example.

    Reads ``batch["audio"]`` (its ``array`` and ``sampling_rate`` entries) and
    ``batch["transcription"]``, then stores the extracted features under
    ``input_features`` and the tokenized text under ``labels``. The same
    (updated) mapping is returned.
    """
    clip = batch["audio"]
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    # The extractor returns a batch of one; keep only that single entry.
    batch["input_features"] = extracted.input_features[0]

    encoded_text = tokenizer(batch["transcription"])
    batch["labels"] = encoded_text.input_ids
    return batch

print("Processing datasets...")
# Run the per-example preprocessing over every split with 2 worker processes.
# The pre-existing columns (names taken from the train split) are removed, so
# only the fields added by prepare_data_for_training remain.
dataset = dataset.map(
    prepare_data_for_training,
    remove_columns=dataset.column_names["train"],
    num_proc=2
)
print("✓ Data processed")

In [None]:
# Data collator
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed examples into one padded training batch.

    Audio features and label ids are padded separately, via the processor's
    feature extractor and tokenizer respectively. Padded label positions are
    set to -100, and a leading BOS column is dropped when every row has one.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`
    # plus `.tokenizer.bos_token_id`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded batch with a ``labels`` entry added."""
        audio_items = [{"input_features": item["input_features"]} for item in features]
        label_items = [{"input_ids": item["labels"]} for item in features]

        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        text_tokenizer = self.processor.tokenizer
        padded_labels = text_tokenizer.pad(label_items, return_tensors="pt")
        # Positions where the attention mask is not 1 are padding: mark them -100.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Strip the first column only if it is BOS in every row of the batch.
        every_row_has_bos = (labels[:, 0] == text_tokenizer.bos_token_id).all().cpu().item()
        if every_row_has_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance passed to the trainer; it pads through the shared `processor`.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
# Load model
print("Loading pre-trained model...")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
# Clear the config-level forced decoder prompt and token suppression list.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
# NOTE(review): presumably disabled because gradient checkpointing is turned
# on in the training arguments — confirm.
model.config.use_cache = False
print("✓ Model loaded")

In [None]:
# Metric: word error rate ("wer"), loaded through the `evaluate` library
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate (in percent) for one evaluation pass.

    Args:
        pred: Object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored
            positions).

    Returns:
        ``{"wer": value}`` where value is 100 times the metric result.

    Note:
        ``pred.label_ids`` is modified in place: -100 entries are replaced
        with the tokenizer's pad token id so the ids can be decoded.
    """
    generated_ids, reference_ids = pred.predictions, pred.label_ids

    ignored = reference_ids == -100
    reference_ids[ignored] = tokenizer.pad_token_id

    hypotheses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    error_rate = metric.compute(predictions=hypotheses, references=references)
    return {"wer": 100 * error_rate}

In [None]:
# Training arguments
# transformers renamed two keyword arguments across releases
# (`evaluation_strategy` -> `eval_strategy`, and the Trainer's `tokenizer` ->
# `processing_class`). The install cell does not pin a version, so use
# whichever spelling the installed release accepts.
import inspect

_args_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEPS,
    num_train_epochs=NUM_EPOCHS,
    gradient_checkpointing=True,
    fp16=True,  # Use mixed precision on GPU
    per_device_eval_batch_size=8,
    predict_with_generate=True,  # evaluate on generated text so WER can be computed
    generation_max_length=225,
    save_steps=SAVE_STEPS,
    eval_steps=EVAL_STEPS,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=False,
    **{_eval_strategy_key: "steps"},
)

# Trainer
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_processor_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_processor_key: processor.feature_extractor},
)

print("✓ Trainer initialized")

In [None]:
# Train!
# Print a banner, then run the training loop with the trainer configured above.
print("\n" + "="*50)
print("Starting training...")
print("="*50)

trainer.train()

In [None]:
# Save model
# The model (via the trainer) and the processor are both written to
# OUTPUT_DIR, so they end up in the same folder.
print("Saving model...")
trainer.save_model(OUTPUT_DIR)
processor.save_pretrained(OUTPUT_DIR)
print(f"✓ Model saved to {OUTPUT_DIR}")

# Download model to your computer (manual steps, printed for the user)
print("\nTo download the model:")
print("1. Find the folder 'whisper-georgian-finetuned' in the file browser")
print("2. Right-click and download it")
print("3. Use it locally with the transcribe.py script!")

In [None]:
# Test the model
import librosa

def transcribe_test(audio_path):
    """Transcribe one audio file with the notebook's `model` and `processor`.

    Args:
        audio_path: Path to an audio file that librosa can load.

    Returns:
        The decoded transcription string, with special tokens removed.
    """
    # Load at 16 kHz, the same rate passed to the processor below.
    audio, _ = librosa.load(audio_path, sr=16000)
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    # Follow the model's own device instead of hard-coding "cuda", so the
    # function also works on a CPU runtime.
    input_features = input_features.to(model.device)

    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

# Test with a sample
# Take the first match lazily instead of listing every clip in the folder,
# and fail with a clear message when the folder has no .mp3 files.
test_audio = next(CLIPS_DIR.glob("*.mp3"), None)
if test_audio is None:
    raise FileNotFoundError(f"No .mp3 clips found in {CLIPS_DIR}")
result = transcribe_test(test_audio)
print(f"Test transcription: {result}")