In [None]:
import pandas as pd
from datasets import Dataset, Audio

import os

# Load the full metadata CSV.
df = pd.read_csv("cv-valid-train-clean.csv")

# The CSV lists .mp3 names; point every row at the .wav file of the same name.
df["filename"] = df["file_name"].str.replace(".mp3", ".wav", regex=False)

# Keep only the rows whose audio file is actually present on disk.
df = df[df["filename"].apply(os.path.exists)].reset_index(drop=True)

# Randomly sample up to 2,000 examples. DataFrame.sample raises ValueError when
# n is larger than the number of rows, which can happen after the existence
# filter above, so the request is capped at what is available.
n_samples = min(2000, len(df))
df_sampled = df.sample(n=n_samples, random_state=2).reset_index(drop=True)

# Rename to the column names the later cells read ("audio" and "sentence").
df_sampled = df_sampled.rename(columns={"filename": "audio", "text": "sentence"})

# Build the Hugging Face dataset and have the "audio" column decoded at 16 kHz.
dataset = Dataset.from_pandas(df_sampled)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))


In [None]:
import re

def normalize_text(batch):
    """Store a normalized copy of ``batch["sentence"]`` under ``batch["text"]``.

    The sentence is lower-cased, every character that is neither a word
    character nor whitespace is removed, and each run of whitespace is
    collapsed to a single space with the ends trimmed.

    The mapping is updated in place and the same object is returned.
    """
    lowered = batch["sentence"].lower()
    # Strip punctuation (anything that is not a word character or whitespace).
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    # Collapse whitespace runs and trim the ends.
    batch["text"] = re.sub(r"\s+", " ", without_punctuation).strip()
    return batch

# Apply normalize_text across the dataset; it writes its result to the "text"
# key and leaves "sentence" untouched.
# NOTE(review): preprocess_safe below tokenizes "sentence", not this "text"
# column - confirm which of the two is meant to be the training target.
dataset = dataset.map(normalize_text)


In [None]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer

# Both preprocessing pieces come from the same "openai/whisper-small"
# checkpoint: the feature extractor is used on the audio and the tokenizer
# (set up for English transcription) is used on the transcripts.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small",
)
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    task="transcribe",
    language="en",
)

def preprocess_safe(example):
    """Turn one dataset row into Whisper training inputs, tolerating failures.

    Args:
        example: A row providing a decoded ``"audio"`` entry (with ``"array"``
            and ``"sampling_rate"``) and a ``"sentence"`` transcript.

    Returns:
        ``{"input_features": ..., "labels": ...}`` built with the module-level
        ``feature_extractor`` and ``tokenizer``, or an empty dict when any step
        raises. The error is printed, not propagated.
    """
    try:
        audio = example["audio"]
        # Use the rate carried by the decoded audio instead of a hard-coded
        # 16000, so the extractor is told the truth about the waveform it gets.
        inputs = feature_extractor(
            audio["array"],
            sampling_rate=audio["sampling_rate"],
            return_tensors="pt",
        )
        labels = tokenizer(example["sentence"], return_tensors="pt").input_ids
        # Both calls return a batch of one; unwrap it.
        return {
            "input_features": inputs.input_features[0],
            "labels": labels[0],
        }
    except Exception as e:
        print(f"Skipping file due to error: {e}")
        # The empty dict only signals failure to the caller.
        # NOTE(review): an empty dict is not expected to remove the row from a
        # `Dataset.map` result by itself - whoever maps this function has to
        # deal with rows that come back empty.
        return {}




In [None]:
# 90% train / 10% test. The seed is fixed (same value as the sampling seed used
# when the 2,000 rows were drawn) so that a fresh run yields the same split.
dataset = dataset.train_test_split(test_size=0.1, seed=2)

In [None]:
def _preprocess_or_placeholder(example):
    """Run ``preprocess_safe``; swap its empty failure result for null fields.

    Every row then has the same two keys, so the mapped dataset keeps one
    consistent schema and failed rows can be removed afterwards.
    """
    processed = preprocess_safe(example)
    if processed:
        return processed
    return {"input_features": None, "labels": None}


dataset = dataset.map(
    _preprocess_or_placeholder,
    remove_columns=["file_name", "sentence", "audio"],  # explicitly remove them
    num_proc=1,
    batched=False,
    desc="Preprocessing",
)

# Actually drop the rows that failed preprocessing. Only the small "labels"
# column is passed to the predicate, so the large feature arrays are not read.
dataset = dataset.filter(
    lambda labels: labels is not None,
    input_columns=["labels"],
    desc="Dropping failed rows",
)



In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

@dataclass
class WhisperDataCollator:
    """Collate preprocessed examples into one batch for Whisper training.

    Each feature dict must provide ``"input_features"`` (all of the same shape,
    since they are stacked) and ``"labels"`` (a sequence of token ids). Labels
    are padded with ``processor.tokenizer.pad`` and padded positions are set to
    -100 so they are ignored by the loss.

    Returns a dict with ``"input_features"`` (stacked float32 tensor) and
    ``"labels"`` (padded id tensor with -100 at padded positions).
    """

    # Object exposing a `.tokenizer` with a `pad` method (e.g. a WhisperProcessor).
    processor: Any
    # Tensor format requested from `tokenizer.pad`.
    return_tensors: str = "pt"
    # Padding strategy forwarded to `tokenizer.pad`.
    padding: Union[bool, str] = True
    # Optional id of the decoder start token. When given and every label row
    # begins with it, that first column is removed, following the Hugging Face
    # Whisper fine-tuning recipe (the model is expected to prepend this token
    # itself when it derives decoder inputs from the labels). Left as None, the
    # labels are passed through unchanged.
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        # as_tensor avoids an extra copy (and a warning) when the stored
        # features are already tensors; float32 matches what torch.tensor
        # produces for lists of Python floats.
        input_features = [
            torch.as_tensor(f["input_features"], dtype=torch.float32)
            for f in features
        ]
        batch = {"input_features": torch.stack(input_features)}

        # Labels are variable-length id sequences: pad them to a rectangle and
        # ask explicitly for the mask that says which positions are padding.
        label_batch = self.processor.tokenizer.pad(
            {"input_ids": [f["labels"] for f in features]},
            padding=self.padding,
            return_tensors=self.return_tensors,
            return_attention_mask=True,
        )
        labels = label_batch["input_ids"]

        # Mask by attention mask, NOT by comparing against pad_token_id: the
        # pad token can share its id with the end-of-text token (as in the
        # Whisper tokenizers), and an id comparison would also hide the real
        # end-of-text label, so the model would never be trained to stop.
        labels[label_batch["attention_mask"] != 1] = -100

        if (
            self.decoder_start_token_id is not None
            and labels.shape[1] > 0
            and bool((labels[:, 0] == self.decoder_start_token_id).all())
        ):
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


In [None]:
from transformers import WhisperForConditionalGeneration, DataCollatorForSeq2Seq, WhisperProcessor

# Use the GPU when one is present instead of hard-coding "cuda", which raises
# on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="en", task="transcribe")

# Pin generation to English transcription so generate() calls on this model
# (including the ones made during evaluation) do not fall back to the
# checkpoint's default prompt. Clearing forced_decoder_ids follows the Hugging
# Face Whisper fine-tuning recipe, where it is described as the legacy way of
# setting language and task.
model.generation_config.language = "en"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None

data_collator = WhisperDataCollator(processor=processor)




In [None]:
import evaluate
import numpy as np
import random

wer_metric = evaluate.load("wer")


def compute_metrics(pred):
    """Decode predictions and labels and score them with word error rate.

    Args:
        pred: Object exposing ``predictions`` (token ids, or a tuple whose
            first element holds them) and ``label_ids`` (token ids with -100
            at ignored positions).

    Returns:
        ``{"wer": <value returned by wer_metric.compute>}``.

    One randomly chosen prediction/label pair is printed as a spot check.
    """
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Some models hand back a tuple; the ids are its first element.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 marks ignored positions and is not a vocabulary id, so swap it for
    # the pad token before decoding. Done for predictions as well, in case they
    # were padded with -100 when gathered.
    pad_id = tokenizer.pad_token_id
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    label_ids = np.where(label_ids != -100, label_ids, pad_id)

    # Decoding is a plain id -> text mapping; sampling options such as
    # `temperature` do not apply here and are no longer passed.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Spot-check one random pair. The index is drawn from the actual batch
    # size, not a hard-coded 1..199 range, so it cannot run past the end
    # of a smaller evaluation set (and index 0 is eligible too).
    if pred_str:
        i = random.randrange(len(pred_str))
        print(f"Prediction {i}: {pred_str[i]}")
        print(f"Label {i}: {label_str[i]}")

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}




In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned_4-18",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and log every 100 steps; checkpoints are written once per epoch.
    eval_strategy="steps",
    save_strategy="epoch",
    eval_steps=100,
    learning_rate=1e-5,
    num_train_epochs=10,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # the arguments fail to construct on a machine without one.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    report_to="none",
    # The dataset still carries extra columns and the collator picks only the
    # keys it needs, so the Trainer is told not to prune columns itself.
    remove_unused_columns=False,
)


In [None]:
import torch
# Release cached GPU memory that PyTorch is holding but not using, before the
# trainer is built and training starts in the next cell.
torch.cuda.empty_cache()  # Clears memory that is no longer needed


In [None]:
from transformers import Seq2SeqTrainer
from transformers import WhisperProcessor




# Wire together everything prepared in the earlier cells: the model, the
# training arguments, both dataset splits, the batch collator, the processor,
# and the WER metric callback.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    processing_class=processor,
    compute_metrics=compute_metrics,
    eval_dataset=dataset["test"],
    train_dataset=dataset["train"],
)

# Run fine-tuning.
trainer.train()


In [None]:
import torchaudio

def load_audio(file_path, target_sr=16000):
    """Load an audio file as a mono, 1-D NumPy array sampled at ``target_sr``.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sr: Sample rate in Hz the result should have (default 16000).

    Returns:
        A 1-D ``numpy.ndarray`` holding the (possibly downmixed and resampled)
        waveform.
    """
    # torchaudio.load returns the waveform with channels on the first axis.
    waveform, sample_rate = torchaudio.load(file_path)

    # Downmix multi-channel audio by averaging the channels. Without this a
    # stereo file would leave here as a 2-D array, which callers that expect a
    # single waveform would misread.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)
        waveform = resampler(waveform)

    # Drop only the channel axis; a bare squeeze() would also collapse a
    # one-sample clip to a 0-d array.
    return waveform.squeeze(0).numpy()


In [None]:
def transcribe(file_path):
    """Transcribe one audio file to English text with the module-level model.

    Args:
        file_path: Path of the audio file; it is loaded with ``load_audio``.

    Returns:
        The decoded transcription string for the file.
    """
    # Load the file (load_audio resamples to its 16 kHz default).
    audio = load_audio(file_path)

    # Convert the waveform to model input features.
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

    # The processor returns CPU tensors while the model may live on the GPU;
    # move (and cast) the features to wherever the model is to avoid a device
    # mismatch inside generate().
    input_features = inputs.input_features.to(device=model.device, dtype=model.dtype)

    # Language and task are passed per call, not written into
    # model.config on every invocation, which changed shared state that is
    # later saved along with the model.
    with torch.no_grad():
        generated_ids = model.generate(
            input_features,
            language="en",
            task="transcribe",
        )

    # A single clip was passed in, so take the first decoded string.
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription


In [None]:
# Try the transcribe() helper on one clip from the cv-valid-train directory
# and print what it returns.
text = transcribe("cv-valid-train/sample-000031.wav")
print("Transcription:", text)


In [None]:
# Write everything needed to reload the fine-tuned system into one directory:
# the processor (feature extractor + tokenizer) first, then the model.
for artifact in (processor, model):
    artifact.save_pretrained("./whisper-finetuned_final_4-15")
