## Testing the Environment

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
import importlib

# Third-party packages whose presence is checked below.
packages = [
    "datasets",
    "transformers",
    "accelerate",
    "soundfile",
    "librosa",
    "evaluate",
    "jiwer",
    "tensorboard",
    "gradio",
]

# For each package, report whether it can be imported and, when the module
# exposes `__version__`, which version is installed.
for pkg in packages:
    try:
        module = importlib.import_module(pkg)
        version = module.__version__
    except ImportError:
        print(f"{pkg}: ❌ Not installed")
    except AttributeError:
        print(f"{pkg}: ✅ Installed (version unknown)")
    else:
        print(f"{pkg}: ✅ Installed (version {version})")

In [None]:
import torch

# Print whether PyTorch reports a usable CUDA device.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

In [None]:
from jiwer import wer

# Sanity-check the WER metric on a toy pair: the hypothesis is the reference
# with the word "a" missing.
reference = "this is a test"   # ground truth
hypothesis = "this is test"    # model output

score = wer(reference, hypothesis)
print("WER:", score)

## Transcribing the single long audio with the whisper-large-v3 model

In [None]:
import os
from datasets import Dataset, Audio as HF_Audio
from pydub import AudioSegment
from stable_whisper import load_model  # from stable-ts
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🚀 Using device: {device}")

# Source recording and the WAV copy that is actually transcribed.
input_path = r"C:\OpenAI Whisper Fine-Tune\raw_audio.mp3"
output_path = r"C:\OpenAI Whisper Fine-Tune\raw_audio.wav"

# Convert only when the WAV file is missing; an existing file is reused.
wav_exists = os.path.exists(output_path)
if not wav_exists:
    AudioSegment.from_file(input_path).export(output_path, format="wav")
    print(f"🎵 Converted MP3 to WAV → {output_path}")

# Load the stable-ts "large-v3" model on the selected device.
model = load_model("large-v3", device=device)

# Transcribe the whole recording as Azerbaijani ("az").
result = model.transcribe(output_path, language="az", regroup=True)

# Show every segment with its start/end time in seconds.
for seg in result.segments:
    print(f"[{seg.start:.2f} → {seg.end:.2f}] {seg.text}")

# Full transcript of the recording.
full_text = result.text

# Store a one-row dataset: the WAV path and its transcript.
ds = Dataset.from_dict({"audio": [output_path], "sentence": [full_text]})
ds.save_to_disk("./az_transcription_dataset_full")
print("\n✅ Saved transcription dataset to ./az_transcription_dataset_full")

In [None]:
from IPython.display import Audio
from datasets import load_from_disk, Audio as HF_Audio

# Reload the saved dataset and cast its "audio" column to a 16 kHz audio
# feature.
ds = load_from_disk("./az_transcription_dataset_full").cast_column(
    "audio", HF_Audio(sampling_rate=16000)
)

# Show the first row: transcript text plus an inline audio player.
example = ds[0]
clip = example["audio"]
print("▶️ Full Audio")
print("Transcription:", example["sentence"])
display(Audio(clip["array"], rate=clip["sampling_rate"]))
print("-" * 60)

In [None]:
from IPython.display import Audio

# Preview up to the first 5 examples (raise the limit to see more).
# The bound is clamped to the dataset size: indexing past the last row raises
# IndexError, and the full-audio dataset saved earlier in this notebook holds
# only a single row.
preview_count = min(5, len(ds))
for i in range(preview_count):
    example = ds[i]
    print(f"▶️ Chunk {i}")
    print("Transcription:", example["sentence"])
    display(Audio(example["audio"]["array"], rate=example["audio"]["sampling_rate"]))
    print("-" * 60)

## Comparison with the Whisper-Small Model

In [None]:
import os
from datasets import Dataset, Audio as HF_Audio
from pydub import AudioSegment
from stable_whisper import load_model  # from stable-ts
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🚀 Using device: {device}")

# Source recording and the WAV copy that is actually transcribed.
input_path = r"C:\OpenAI Whisper Fine-Tune\raw_audio.mp3"
output_path = r"C:\OpenAI Whisper Fine-Tune\raw_audio.wav"

# Convert only when the WAV file is missing; an existing file is reused.
wav_exists = os.path.exists(output_path)
if not wav_exists:
    AudioSegment.from_file(input_path).export(output_path, format="wav")
    print(f"🎵 Converted MP3 to WAV → {output_path}")

# Load the stable-ts "small" model on the selected device.
model = load_model("small", device=device)

# Transcribe the whole recording as Azerbaijani ("az").
result = model.transcribe(output_path, language="az", regroup=True)

# Show every segment with its start/end time in seconds.
for seg in result.segments:
    print(f"[{seg.start:.2f} → {seg.end:.2f}] {seg.text}")

# Full transcript of the recording.
full_text = result.text

# Store a one-row dataset: the WAV path and its transcript.
ds = Dataset.from_dict({"audio": [output_path], "sentence": [full_text]})
ds.save_to_disk("./az_transcription_dataset_small")
print("\n✅ Saved transcription dataset to ./az_transcription_dataset_small")

In [None]:
from IPython.display import Audio
from datasets import load_from_disk, Audio as HF_Audio

# Reload the dataset produced by the small model and cast its "audio" column
# to a 16 kHz audio feature.
ds = load_from_disk("./az_transcription_dataset_small").cast_column(
    "audio", HF_Audio(sampling_rate=16000)
)

# Show the first row: transcript text plus an inline audio player.
example = ds[0]
clip = example["audio"]
print("▶️ Full Audio")
print("Transcription:", example["sentence"])
display(Audio(clip["array"], rate=clip["sampling_rate"]))
print("-" * 60)

📌 **Conclusion**  

In this step, we transcribed the same Azerbaijani audio using both the **Whisper Large-v3** and **Whisper Small** models via `stable-ts`. The comparison revealed that the **Whisper Small model's output was significantly less accurate** than that of the **Large-v3** model. However, this gap presents a great opportunity: by using the high-quality transcriptions from the Large-v3 model as reference data, we can **fine-tune the Small model** to significantly improve its performance — especially for transcription tasks in under-resourced languages like Azerbaijani.

## Fine-Tuning Whisper-Small

### Chop long audio into segments using the timestamps

In [None]:
import os
import torch
import soundfile as sf
from pydub import AudioSegment
from datasets import Dataset, Audio
from stable_whisper import load_model  # stable-ts

# 0. Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print("🚀 Using device:", device)

# 1. MP3 → WAV (if needed)
input_path  = r"C:\OpenAI Whisper Fine-Tune\raw_audio.mp3"
output_path = r"C:\OpenAI Whisper Fine-Tune\raw_audio.wav"

if not os.path.exists(output_path):
    audio_mp3 = AudioSegment.from_file(input_path)
    audio_mp3.export(output_path, format="wav")
    print("Converted MP3 → WAV")

# 2. Load the raw wave into a NumPy array
audio_arr, sr = sf.read(output_path, dtype="float32")
# soundfile returns a (frames, channels) array for multi-channel files;
# average the channels so every clip below is a 1-D mono waveform.
if audio_arr.ndim > 1:
    audio_arr = audio_arr.mean(axis=1)
print(f"Loaded WAV, {audio_arr.shape[0]/sr:.1f}s at {sr} Hz")

# 3. Transcribe with stable-ts
model = load_model("large-v3", device=device)
result = model.transcribe(output_path, language="az", regroup=True)

# 4. Chop into segments using the time stamps
total_samples = audio_arr.shape[0]
records = []
for seg in result.segments:
    txt = seg.text.strip()
    # Convert the segment times (seconds) to sample indices, clamped to the
    # bounds of the loaded audio.
    s_idx = max(0, int(seg.start * sr))
    e_idx = min(total_samples, int(seg.end * sr))
    # Skip segments that carry no audio or no text: they would become
    # degenerate training examples.
    if e_idx <= s_idx or not txt:
        continue
    records.append({
        "audio": {"array": audio_arr[s_idx:e_idx], "sampling_rate": sr},
        "sentence": txt
    })

# Fail early with a clear message instead of an IndexError at `ds[0]` below.
if not records:
    raise ValueError("Transcription produced no usable segments.")

# 5. Build a Hugging Face Dataset of many short examples
ds = Dataset.from_list(records)
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))

# 6. Inspect & save
print(ds)           # dataset summary: one row per kept segment
print("Example:", ds[0])
ds.save_to_disk("./az_transcription_dataset_segments")
print("✅ Saved segmented dataset.")

### Fine-Tuning

In [None]:
# 1. Check for GPU
import torch
print("🚀 Using device:", "cuda" if torch.cuda.is_available() else "cpu")

# 2. Install dependencies (uncomment if running in a fresh environment)
# The version specifiers are quoted so the shell does not interpret ">" as an
# output redirection.
# !pip install -q "datasets>=2.6.1" "transformers>=4.30.0" accelerate librosa evaluate jiwer soundfile

# 3. Imports
import os
from datasets import load_from_disk, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# 4. Load your dataset
ds = load_from_disk("./az_transcription_dataset_segments")
print(ds)

# 5. Cast audio column to the 16 kHz Whisper format
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))

# 6. Load Whisper processor for "small" and set to Azerbaijani
# The checkpoint id, language and task are defined once so the three loaders
# below cannot drift apart.
WHISPER_CHECKPOINT = "openai/whisper-small"
WHISPER_LANGUAGE = "Azerbaijani"   # Whisper’s language token
WHISPER_TASK = "transcribe"

feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT)
tokenizer = WhisperTokenizer.from_pretrained(
    WHISPER_CHECKPOINT,
    language=WHISPER_LANGUAGE,
    task=WHISPER_TASK,
)
processor = WhisperProcessor.from_pretrained(
    WHISPER_CHECKPOINT,
    language=WHISPER_LANGUAGE,
    task=WHISPER_TASK,
)

# 7. Prepare the dataset: compute log-Mel inputs and tokenize transcripts
def prepare_batch(batch):
    """Turn one dataset row into model inputs.

    Reads the waveform and sampling rate from ``batch["audio"]`` and the
    transcript from ``batch["sentence"]``.

    Returns a dict with:
      * ``input_features`` - first entry of the feature extractor's
        ``input_features`` for the waveform;
      * ``labels`` - the tokenizer's ``input_ids`` for the transcript.
    """
    audio = batch["audio"]
    extracted = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"])
    encoded = tokenizer(batch["sentence"])
    return {
        "input_features": extracted.input_features[0],
        "labels": encoded.input_ids,
    }

# Apply `prepare_batch` to every row in a single process and drop the
# original columns, keeping only what `prepare_batch` returns.
ds = ds.map(prepare_batch, remove_columns=ds.column_names, num_proc=1)

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
ds = ds.train_test_split(test_size=0.1, seed=42)
train_ds = ds["train"]
eval_ds = ds["test"]

# 8. Data collator to batch and pad correctly
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate per-example audio features and label ids into one padded batch.

    Audio inputs and labels are padded separately because they have different
    lengths and need different padding methods: the processor's feature
    extractor pads the audio features, its tokenizer pads the label ids.
    """

    # Object exposing `.feature_extractor` and `.tokenizer` (the WhisperProcessor).
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch for a list of examples.

        Each example must provide ``"input_features"`` and ``"labels"``.
        Returns the padded audio batch with an added ``"labels"`` tensor in
        which padded positions are set to -100.
        """
        # Split every example into its audio part and its label part.
        audio_items = []
        token_items = []
        for item in features:
            audio_items.append({"input_features": item["input_features"]})
            token_items.append({"input_ids": item["labels"]})

        # Audio side: the feature extractor returns the inputs as torch tensors.
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad the token sequences to the longest one in the batch.
        padded = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Padded positions (attention mask != 1) become -100 so the loss
        # ignores them.
        is_padding = padded.attention_mask.ne(1)
        labels = padded["input_ids"].masked_fill(is_padding, -100)

        # If every sequence starts with the BOS token (added during the
        # earlier tokenization step), drop it here because it is appended
        # again later anyway.
        bos_id = self.processor.tokenizer.bos_token_id
        starts_with_bos = (labels[:, 0] == bos_id).all().cpu().item()
        if starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance that pads audio features and labels with the processor.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# 9. Load WER metric
wer_metric = evaluate.load("wer")


def compute_metrics(pred):
    """Compute the word error rate (in percent) for a prediction output.

    ``pred.predictions`` holds the predicted token ids and ``pred.label_ids``
    the reference ids. The -100 entries in the labels (written by the data
    collator so padded positions are excluded from the loss) are replaced
    in place by the tokenizer's pad token id so the ids can be decoded.
    Both id arrays are then decoded to strings with special tokens skipped
    and compared with the WER metric.

    Returns ``{"wer": <percentage>}``.
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # Undo the collator's -100 masking before decoding.
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    # Decode predictions first, then references.
    predictions, references = [
        tokenizer.batch_decode(ids, skip_special_tokens=True)
        for ids in (predicted_ids, reference_ids)
    ]

    score = wer_metric.compute(predictions=predictions, references=references)
    return {"wer": 100 * score}


# 10. Load the pretrained Whisper-Small model
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.generation_config.language = "az"  # Set decoding language
model.generation_config.task = "transcribe"

# Adjust generation parameters - no tokens are predetermined as decoder outputs, and
# no tokens are excluded during the generation process
model.generation_config.forced_decoder_ids = None
model.generation_config.suppress_tokens = []

# 11. Setup training arguments
# fp16 mixed precision requires a GPU. Enabling it unconditionally makes the
# argument validation fail on CPU-only machines, so it follows CUDA
# availability instead.
use_fp16 = torch.cuda.is_available()

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-az-small-finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    warmup_steps=10,
    max_steps=500,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=20,
    fp16=use_fp16,
    predict_with_generate=True,
    generation_max_length=225,
    # Keep the checkpoint with the lowest WER as the final model.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

# 12. Initialize Trainer
# NOTE(review): recent transformers releases replace the `tokenizer` argument
# with `processing_class` - confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)

# 13. Save processor (necessary for inference)
processor.save_pretrained(training_args.output_dir)

# 14. Launch training
trainer.train()

In the Hugging Face fine-tuning guide for Whisper, the `Seq2SeqTrainer` is instantiated with `tokenizer=processor.feature_extractor`, i.e. it is given the **audio** pre-processor rather than a text tokenizer. This works because the Trainer’s `tokenizer` parameter is not strictly for text tokenization: it designates the pre-processing object that the Trainer saves alongside the model at each checkpoint (and that it uses for default padding when no data collator is supplied). Passing the feature extractor therefore keeps the audio pre-processing configuration (padding, sampling rate, and log-Mel spectrogram settings) together with every checkpoint, while text tokenization for the labels remains under the separate `processor.tokenizer` used in your data collator.

https://huggingface.co/blog/fine-tune-whisper

---