## Single File Pre-Processing

In [None]:
import pickle
import numpy as np
from datasets import Dataset, Audio as HF_Audio
import os
import torch
import soundfile as sf
from pydub import AudioSegment
from stable_whisper import load_model  # stable-ts

# Root directory of the audio collection. It is only defined here; nothing
# else in this cell reads it.
root_dir = r"C:\..."

# 0. Device: use CUDA when PyTorch can see a GPU, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("🚀 Using device:", device)

# 1. Load the full recording as float32 samples together with its sample rate.
#    Neither value is used again in this cell (transcription below is given
#    the file path, not the array).
output_path = r"C:\...wav"
audio_arr, sr = sf.read(output_path, dtype="float32")

# 2. Transcribe the file with the stable-ts "large-v3" model
#    (language code "az", regroup=True).
model = load_model("large-v3", device=device)
result = model.transcribe(output_path, language="az", regroup=True)

segments = result.segments  # segment list taken from the transcription result

# Persist the segments to segments.pkl so they can be reloaded without
# re-running the transcription.
with open("segments.pkl", "wb") as f:
    pickle.dump(segments, f)

In [None]:
import pickle
import numpy as np
import soundfile as sf
from datasets import Dataset, Audio as HF_Audio

# 1. Load the recording and the segments pickled by the transcription cell.
output_path = r"C:\...wav"
audio_arr, sr = sf.read(output_path, dtype="float32")
duration = audio_arr.shape[0] / sr
# NOTE: pickle.load can execute arbitrary code. Only load a segments.pkl that
# was written by this notebook.
with open("segments.pkl", "rb") as f:
    segments = pickle.load(f)

# 2. Windowing parameters
WINDOW = 30.0   # seconds
records = []
num_win = int(np.ceil(duration / WINDOW))

for i in range(num_win):
    w_start = i * WINDOW
    w_end   = min(w_start + WINDOW, duration)
    is_last = (i == num_win - 1)

    # Slice exactly WINDOW seconds of samples (or less on the last window).
    s_idx, e_idx = int(w_start * sr), int(w_end * sr)
    clip = audio_arr[s_idx:e_idx]

    # Assign each segment to the window in which it *ends*.
    # Segment timestamps come from the transcriber, while `duration` is
    # computed here from the frame count, so the final segment's end may land
    # slightly past `duration`. The last window therefore has no upper bound;
    # otherwise that segment's text would be silently dropped.
    texts = [
        seg.text.strip()
        for seg in segments
        if seg.end > w_start and (seg.end <= w_end or is_last)
    ]
    if not texts:
        # Windows with no transcript are skipped entirely.
        continue

    records.append({
      "audio": {"array": clip, "sampling_rate": sr},
      "text":  " ".join(texts)
    })

# 3. Build the dataset, cast the audio column to 16 kHz and save it.
ds30 = Dataset.from_list(records)
ds30 = ds30.cast_column("audio", HF_Audio(sampling_rate=16_000))
print(ds30)
ds30.save_to_disk("./transcription_30s_fixed")

In [None]:
from IPython.display import Audio
from datasets import load_from_disk, Audio as HF_Audio

# Reload the saved single-file dataset and cast its audio column to
# HF_Audio(sampling_rate=16000).
ds = load_from_disk("./transcription_30s_fixed").cast_column(
    "audio", HF_Audio(sampling_rate=16000)
)

# Walk through every chunk: print its transcript and render an audio player.
for i in range(len(ds)):
    example = ds[i]
    sample_audio = example["audio"]
    print(f"▶️ Chunk {i}")
    print("Transcription:", example["text"])
    display(Audio(sample_audio["array"], rate=sample_audio["sampling_rate"]))
    print("-" * 60)

In [None]:
# Sanity check: report every dataset item whose audio is longer than 30 seconds.
for i, item in enumerate(ds):
    audio_len = len(item['audio']['array']) / item['audio']['sampling_rate']
    if audio_len > 30:
        # The dataset stores transcripts under "text". Looking up "sentence"
        # would raise KeyError the moment an over-long item is found.
        print(f"Dataset item {i}: {audio_len:.2f}s — {item['text']}")

In [None]:
# Check the Sampling Rate of Raw WAV Files
import os
import soundfile as sf

# Recursively list every file ending in ".wav" (case-insensitive) under `root`
# together with the sample rate reported by soundfile.
root = r"C:\..."
for folder, _subdirs, names in os.walk(root):
    wav_names = (name for name in names if name.lower().endswith(".wav"))
    for wav_name in wav_names:
        wav_file = os.path.join(folder, wav_name)
        print(f"{wav_file}: {sf.info(wav_file).samplerate} Hz")

## Walk through all subdirectories of root_dir

In [None]:
import os
import pickle
from pydub import AudioSegment
import soundfile as sf
import torch
from stable_whisper import load_model  # stable-ts

# 0. Settings
root_dir = r"C:\..."
device   = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {device}")

# 1. Load the model once and reuse it for every file.
model = load_model("large-v3", device=device)

# 2. Traverse root_dir
all_segments = {}   # wav_path -> list of transcription segments
seen_wavs = set()   # normalised WAV paths that were already transcribed
for dirpath, dirnames, filenames in os.walk(root_dir):
    for fn in filenames:
        # Split the ORIGINAL name so a derived WAV keeps the source's case;
        # only the extension is lower-cased for matching. Lower-casing the
        # whole name would write "foo.wav" next to an existing "Foo.wav" on
        # case-sensitive filesystems.
        base, ext = os.path.splitext(fn)
        ext = ext.lower()
        if ext not in (".wav", ".mp3"):
            continue

        src_path = os.path.join(dirpath, fn)
        wav_path = src_path

        # 2a. If it's an MP3, convert it to a WAV next to the source file
        #     (skipped when that WAV already exists).
        if ext == ".mp3":
            wav_path = os.path.join(dirpath, base + ".wav")
            if not os.path.exists(wav_path):
                print(f"Converting {src_path!r} → {wav_path!r}")
                AudioSegment.from_file(src_path).export(wav_path, format="wav")

        # 2b. A directory holding both "x.mp3" and "x.wav" resolves to the
        #     same WAV twice; transcribe it only once. normcase makes the
        #     comparison case-insensitive where the OS is.
        wav_key = os.path.normcase(wav_path)
        if wav_key in seen_wavs:
            continue
        seen_wavs.add(wav_key)

        # 2c. Read & report duration (informational only).
        info = sf.info(wav_path)
        dur = info.frames / info.samplerate
        print(f"→ Transcribing {wav_path!r} ({dur:.1f}s)")

        # 3. Transcribe
        result = model.transcribe(wav_path, language="az", regroup=True)

        # 4. Keep only the segments (the whole result could be pickled instead).
        all_segments[wav_path] = result.segments

# 5. Dump everything to a single pickle.
with open("all_segments.pkl", "wb") as f:
    pickle.dump(all_segments, f)

print(f"\n✅ Transcribed {len(all_segments)} files. Saved segments to all_segments.pkl")

In [None]:
import pickle
import numpy as np
import soundfile as sf
from datasets import Dataset, Audio as HF_Audio

# 1️⃣  Load the pre-computed segments for *all* files (wav_path -> segments).
# NOTE: pickle.load can execute arbitrary code. Only load an all_segments.pkl
# that was written by this notebook.
with open("all_segments.pkl", "rb") as f:
    all_segments = pickle.load(f)

WINDOW = 30.0  # seconds
records = []

# 2️⃣  Loop over every file
for wav_path, segments in all_segments.items():
    # Read the file's audio and derive its duration from the frame count.
    audio_arr, sr = sf.read(wav_path, dtype="float32")
    duration = audio_arr.shape[0] / sr
    num_win = int(np.ceil(duration / WINDOW))

    # 3️⃣  Slice into fixed windows & gather transcripts
    for i in range(num_win):
        w_start = i * WINDOW
        w_end   = min(w_start + WINDOW, duration)
        is_last = (i == num_win - 1)

        s_idx, e_idx = int(w_start * sr), int(w_end * sr)
        clip = audio_arr[s_idx:e_idx]

        # Assign each segment to the window in which it *ends*.
        # Segment timestamps come from the transcriber, while `duration` is
        # computed here from the frame count, so the final segment's end may
        # land slightly past `duration`. The last window therefore has no
        # upper bound; otherwise that segment's text would be silently lost.
        texts = [
            seg.text.strip()
            for seg in segments
            if seg.end > w_start and (seg.end <= w_end or is_last)
        ]
        if not texts:
            # Windows with no transcript are skipped entirely.
            continue

        records.append({
            "audio": {"array": clip, "sampling_rate": sr},
            "text":  " ".join(texts)
        })

# 4️⃣  Build one big Dataset, cast the audio column to 16 kHz and save it.
ds30 = Dataset.from_list(records)
ds30 = ds30.cast_column("audio", HF_Audio(sampling_rate=16_000))

print(ds30)
ds30.save_to_disk("./transcription_all_30s_fixed")

In [None]:
from IPython.display import Audio
from datasets import load_from_disk, Audio as HF_Audio

# Load the combined dataset and cast its audio column to 16 kHz.
ds = load_from_disk("./transcription_all_30s_fixed")
ds = ds.cast_column("audio", HF_Audio(sampling_rate=16000))

# Spot-check a single chunk. 400 is an arbitrary pick; clamp it to the last
# row so a dataset with 400 rows or fewer does not raise IndexError.
sample_idx = min(400, len(ds) - 1)
instance = ds[sample_idx]
print("Transcription:", instance["text"])
display(Audio(instance["audio"]["array"], rate=instance["audio"]["sampling_rate"]))

In [None]:
from IPython.display import Audio
from datasets import load_from_disk, Audio as HF_Audio

# Load the combined dataset and cast its audio column to 16 kHz.
ds = load_from_disk("./transcription_all_30s_fixed")
ds = ds.cast_column("audio", HF_Audio(sampling_rate=16000))

# Preview up to the first 5 examples (raise the 5 to see more). The bound is
# capped at len(ds) so a dataset with fewer than 5 rows does not raise
# IndexError.
for i in range(min(5, len(ds))):
    example = ds[i]
    print(f"▶️ Chunk {i}")
    print("Transcription:", example["text"])
    display(Audio(example["audio"]["array"], rate=example["audio"]["sampling_rate"]))
    print("-" * 60)

In [None]:
# Sanity check: report every dataset item whose audio is longer than 30 seconds.
for i, item in enumerate(ds):
    audio_len = len(item['audio']['array']) / item['audio']['sampling_rate']
    if audio_len > 30:
        # The dataset stores transcripts under "text". Looking up "sentence"
        # would raise KeyError the moment an over-long item is found.
        print(f"Dataset item {i}: {audio_len:.2f}s — {item['text']}")

## Fine-Tuning

In [None]:
# 1. Report whether PyTorch can see a CUDA GPU
import torch
print("🚀 Using device:", "cuda" if torch.cuda.is_available() else "cpu")

# 2. Install dependencies (uncomment if running in a fresh environment).
#    The version specifiers are quoted so the shell does not treat ">=" as an
#    output redirection; %pip installs into the running kernel's environment.
# %pip install -q "datasets>=2.6.1" "transformers>=4.30.0" accelerate librosa evaluate jiwer soundfile

# 3. Imports
import os
from datasets import load_from_disk, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

# 4. Load the windowed dataset written by the pre-processing section
ds = load_from_disk("./transcription_all_30s_fixed")
print(ds)

# 5. Cast the audio column to Audio(sampling_rate=16_000)
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))

# 6. Load the Whisper "small" feature extractor, tokenizer and processor.
#    The tokenizer and processor are both configured for Azerbaijani
#    transcription. `feature_extractor` and `tokenizer` are used by
#    prepare_batch / compute_metrics below; `processor` is handed to the data
#    collator and saved next to the checkpoints.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    language="Azerbaijani",   # language the tokenizer is configured for
    task="transcribe"
)
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="Azerbaijani",
    task="transcribe"
)

# 7. Prepare the dataset: compute log-Mel inputs and tokenize transcripts
def prepare_batch(batch):
    """Convert one dataset row into model-ready features and label ids.

    Args:
        batch: a row with an "audio" entry (holding "array" and
            "sampling_rate") and a "text" transcript.

    Returns:
        dict with "input_features" (first item produced by the module-level
        ``feature_extractor``) and "labels" (token ids from the module-level
        ``tokenizer``).
    """
    audio = batch["audio"]
    input_features = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    label_ids = tokenizer(batch["text"]).input_ids
    return {"input_features": input_features, "labels": label_ids}

# Apply prepare_batch to the dataset in a single process, dropping the
# original columns so only the keys returned by prepare_batch remain.
ds = ds.map(
    prepare_batch,
    remove_columns=ds.column_names,
    num_proc=1
)

# Split into train & test sets: 10% held out, fixed seed for a repeatable split.
# Note that `ds` is rebound to the split result here.
ds = ds.train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = ds["train"], ds["test"]

# 8. Data collator to batch and pad correctly
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared rows into one padded batch.

    Audio features and label ids are padded separately, each by the matching
    component of ``processor``. Padded label positions are set to -100, and a
    leading BOS column is dropped when every row starts with it.
    """

    processor: Any  # object exposing .feature_extractor and .tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        feature_extractor = self.processor.feature_extractor
        tokenizer = self.processor.tokenizer

        # Audio side: hand the input features to the feature extractor's pad().
        audio_rows = [{"input_features": row["input_features"]} for row in features]
        batch = feature_extractor.pad(audio_rows, return_tensors="pt")

        # Label side: pad the token id sequences with the tokenizer.
        label_rows = [{"input_ids": row["labels"]} for row in features]
        padded = tokenizer.pad(label_rows, return_tensors="pt")

        # Positions where the attention mask is not 1 become -100.
        is_padding = padded.attention_mask.ne(1)
        labels = padded["input_ids"].masked_fill(is_padding, -100)

        # Drop the first column only when it is BOS in every row.
        starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Collator instance handed to the trainer; it pads with the shared `processor`.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# 9. Load the WER metric used by compute_metrics
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate (as a percentage) for an evaluation pass.

    The data collator marks padded label positions with -100; those are mapped
    back to the tokenizer's pad token id so the labels can be decoded.
    Predictions and labels are then decoded to strings (special tokens
    skipped) and compared with the module-level WER metric.

    Args:
        pred: object exposing ``predictions`` and ``label_ids``.
            ``label_ids`` is modified in place.

    Returns:
        ``{"wer": <100 * metric value>}``.
    """
    predicted_ids = pred.predictions
    reference_ids = pred.label_ids

    # Undo the collator's -100 masking (in place) so decoding works.
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    hypotheses = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    score = wer_metric.compute(predictions=hypotheses, references=references)

    return {"wer": 100 * score}


# 10. Load the pretrained Whisper-Small model and pin its generation settings
#     to language "az" and task "transcribe".
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.generation_config.language = "az" # decoding language
model.generation_config.task = "transcribe"

# Clear the generation config's forced decoder ids and suppressed-token list,
# so no decoder tokens are predetermined and none are excluded during
# generation.
model.generation_config.forced_decoder_ids = None
model.generation_config.suppress_tokens    = []

# 11. Setup training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-az-small-finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    num_train_epochs=5,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    # NOTE(review): newer transformers releases name this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=25,
    # Mixed precision needs a CUDA device. A hard-coded True makes this cell
    # fail on a CPU-only machine, so enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
    # Run generate() during evaluation so WER is computed on decoded text.
    predict_with_generate=True,
    generation_max_length=225,
    # Keep the checkpoint with the lowest WER.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

# 12. Initialize the trainer with the model, the train/eval splits, the
#     padding collator and the WER callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor, # alternative: tokenizer=processor (the whole processor)
    compute_metrics=compute_metrics,
)

# 13. Save the processor into the output directory. The demo cell loads it
#     from there, separately from the model checkpoint.
processor.save_pretrained(training_args.output_dir)

# 14. Launch training
trainer.train()

## Demo

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline

def _load_az_asr(model_source, processor_source):
    """Load a Whisper model and processor and wrap them in an ASR pipeline.

    The model's generation config language is set to "az" and its config's
    forced decoder ids are set to the processor's prompt ids for Azerbaijani
    transcription.

    Returns:
        (model, processor, pipeline) tuple.
    """
    whisper = WhisperForConditionalGeneration.from_pretrained(model_source)
    whisper_processor = WhisperProcessor.from_pretrained(processor_source)

    whisper.generation_config.language = "az"
    whisper.config.forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(
        language="azerbaijani", task="transcribe"
    )

    asr = pipeline(
        "automatic-speech-recognition",
        model=whisper,
        tokenizer=whisper_processor.tokenizer,
        feature_extractor=whisper_processor.feature_extractor,
    )
    return whisper, whisper_processor, asr


# Fine-tuned model: weights come from a training checkpoint, the processor
# from the top-level output directory where it was saved.
model_path = "./whisper-az-small-finetuned/checkpoint-210"
processor_path = "./whisper-az-small-finetuned"
model, processor, pipe = _load_az_asr(model_path, processor_path)

# Original (non-fine-tuned) Whisper Small, used as the comparison baseline.
original_model, original_processor, original_pipe = _load_az_asr(
    "openai/whisper-small", "openai/whisper-small"
)

import gradio as gr

def compare_models(audio):
    """Transcribe one recording with both pipelines.

    Args:
        audio: path of the recorded clip, or None / empty when the user
            submitted without recording anything.

    Returns:
        (fine_tuned_text, original_text). When no audio was supplied, both
        entries carry a short hint instead of letting the pipelines fail on
        a missing input.
    """
    if not audio:
        hint = "No audio received. Please record a clip and try again."
        return hint, hint

    fine_tuned_result = pipe(audio)["text"]
    original_result = original_pipe(audio)["text"]
    return fine_tuned_result, original_result

# Two text boxes, in the same order as compare_models' return tuple:
# fine-tuned transcript first, original model second.
outputs = [
    gr.Textbox(label="Fine-tuned"),
    gr.Textbox(label="Original")
]



# Microphone-only audio input; type="filepath" hands compare_models a file
# path rather than raw samples.
iface = gr.Interface(
    fn=compare_models,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=outputs,
    title="Whisper Small Azerbaijani: Fine-tuned vs Original",
    description="Speak Azerbaijani and compare outputs from your fine-tuned Whisper model and the original OpenAI Whisper Small model."
)

# Start the Gradio app.
iface.launch()