In [None]:
import pandas as pd
from datasets import Dataset
from transformers import Wav2Vec2Processor
import os
import re


# Load the metadata CSV (adjust paths accordingly)
df = pd.read_csv("cv-valid-train-clean.csv")

# The CSV lists .mp3 names; point each row at the matching .wav file instead
df["filename"] = df["file_name"].str.replace(".mp3", ".wav", regex=False)

# Keep only the rows whose audio file is actually present on disk
df = df[df["filename"].apply(os.path.exists)].reset_index(drop=True)

# Randomly sample up to 2,000 examples. The cap at len(df) avoids the ValueError
# DataFrame.sample raises when fewer than 2,000 rows survive the filter above;
# with 2,000 or more rows the sample is the same as before (same random_state).
n_samples = min(2000, len(df))
df_sampled = df.sample(n=n_samples, random_state=2).reset_index(drop=True)


# Rename columns to match the names used by the preprocessing step
df_sampled = df_sampled.rename(columns={"filename": "audio", "text": "sentence"})

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df_sampled)

# Initialize the processor (feature extraction + tokenization). Use the same
# checkpoint as the CTC model loaded further down so the tokenizer vocabulary
# and the model's output layer refer to the same checkpoint.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")


In [None]:
# 90% train / 10% test. A fixed seed keeps the split reproducible across kernel
# restarts, matching the seeded sampling used when the dataset was built.
dataset = dataset.train_test_split(test_size=0.1, seed=2)

In [None]:
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Pick the GPU when one is available instead of hard-coding "cuda", so this cell
# does not crash on a CPU-only machine.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Processor and model are loaded from the same checkpoint.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)


In [None]:
# Token -> id mapping of the processor's tokenizer
vocab_dict = processor.tokenizer.get_vocab()

# Report how many entries the vocabulary has
print(f"Vocab size: {len(vocab_dict)}")

# List every entry, ordered by id so the output is easy to scan
for token in sorted(vocab_dict, key=vocab_dict.get):
    idx = vocab_dict[token]
    print(f"{idx}: {token}")

In [None]:
import librosa
from transformers import Wav2Vec2Processor

def preprocess_example(example):
    """Convert one dataset row into model inputs and CTC label ids.

    Args:
        example: Mapping with an ``"audio"`` entry (path of the audio file to
            load) and a ``"sentence"`` entry (the transcript string).

    Returns:
        dict with two entries:
            ``"input_values"``: 1-D sequence of feature-extracted audio samples.
            ``"labels"``: list of tokenizer ids for the normalized transcript.
    """
    # Load audio, asking librosa for a 16 kHz signal
    speech_array, _ = librosa.load(example["audio"], sr=16000)

    # Extract features. Taking element 0 of the single-item batch (instead of
    # squeezing a tensor) always yields a 1-D sequence.
    input_values = processor.feature_extractor(
        speech_array, sampling_rate=16000
    ).input_values[0]

    # Normalize the label: upper-case it and mark word boundaries with "|"
    text = example["sentence"].upper().replace(" ", "|")

    # Call the tokenizer directly. The deprecated `as_target_processor()` context
    # had no effect here because `processor.tokenizer` was already being used
    # explicitly. Keeping the ids as a plain list avoids `.squeeze()` collapsing
    # a one-token label into a 0-d scalar.
    labels = processor.tokenizer(text).input_ids

    return {
        "input_values": input_values,
        "labels": labels
    }



# Apply preprocess_example to every row, one example at a time in a single
# process, and drop the raw columns once the model inputs have been produced.
dataset = dataset.map(
    function=preprocess_example,
    batched=False,
    num_proc=1,
    remove_columns=["audio", "sentence", "file_name"],
    desc="Normalizing and Preprocessing",
)






In [None]:
# Print tokenized labels for a few examples
for i in range(3):
    sample = dataset["train"][i]
    print(f"\n--- Sample {i} ---")
    print("Token IDs:", sample["labels"])
    # group_tokens=False: these ids are ground-truth labels, not CTC output, so
    # consecutive identical characters (e.g. the "LL" in "HELLO") must not be
    # merged when turning them back into text.
    decoded_text = processor.tokenizer.decode(
        sample["labels"], skip_special_tokens=True, group_tokens=False
    )
    print("Decoded Text:", decoded_text)


In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

@dataclass
class DataCollatorCTCWithPadding:
    """Collate preprocessed examples into one padded batch for CTC training.

    Each incoming feature dict must provide ``"input_values"`` (audio features)
    and ``"labels"`` (token ids). Inputs and labels are padded separately, and
    padded label positions are set to -100 so they are ignored by the loss.
    """

    # Processor whose `pad` pads the audio inputs and whose `tokenizer.pad`
    # pads the label ids
    processor: Any
    # Padding strategy forwarded unchanged to both pad calls
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``"labels"`` entry added."""
        # Separate inputs and labels: each goes through its own pad call
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        # Pad inputs
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )

        # Pad labels with the tokenizer directly. This is the call the
        # deprecated `as_target_processor()` context used to route to, without
        # depending on that context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt"
        )

        # Replace padding with -100 to ignore in loss computation
        is_padding = labels_batch["attention_mask"].ne(1)
        batch["labels"] = labels_batch["input_ids"].masked_fill(is_padding, -100)

        return batch


In [None]:
# Collator instance handed to the trainer; padding=True is forwarded to the
# processor's pad calls for both inputs and labels.
data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding=True,
)

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./wav2vec2-finetuned_f2",  # Save the model here
    per_device_train_batch_size=8,      # Adjust based on GPU memory
    per_device_eval_batch_size=8,
    eval_strategy="steps",
    eval_steps=100,                     # Evaluate every 100 steps
    save_steps=500,                     # Save every 500 steps
    save_total_limit=2,                 # Keep only 2 recent checkpoints
    logging_dir="./logs",               # Log to this folder
    logging_steps=100,                  # Log every 100 steps
    num_train_epochs=10,                # Number of epochs
    fp16=True,                          # Enable mixed-precision training
    load_best_model_at_end=True,        # Load the best model based on eval metric
    metric_for_best_model="wer",        # Best model based on WER metric
    # WER is an error rate, so lower is better. Without this flag a non-loss
    # metric is treated as "higher is better" and the worst checkpoint would
    # be the one reloaded at the end of training.
    greater_is_better=False,
    report_to="none",                   # Disable huggingface reporting (for local training)
)


In [None]:
from transformers import Seq2SeqTrainer
import evaluate
import random

# Define the WER metric
wer_metric = evaluate.load("wer")
import numpy as np
from transformers import Wav2Vec2Processor

def compute_metrics(pred):
    """Compute word error rate (WER) for one evaluation pass.

    Args:
        pred: Evaluation output exposing ``predictions`` (per-frame scores whose
            last axis is arg-maxed into token ids) and ``label_ids`` (reference
            token ids, with -100 marking positions to ignore).

    Returns:
        dict with a single ``"wer"`` entry.
    """
    pred_ids = torch.argmax(torch.tensor(pred.predictions), dim=-1)
    # NOTE(review): transcribe() decodes without skip_special_tokens; confirm that
    # passing it here does not drop the CTC blank before repeated ids are merged
    # in the installed transformers version.
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # Work on a copy so the caller's label array is not modified in place
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    # group_tokens=False: references are plain label sequences, not CTC output,
    # so doubled letters (e.g. "LL" in "HELLO") must survive decoding. Merging
    # them would corrupt the reference text and distort the WER.
    label_str = processor.tokenizer.batch_decode(
        label_ids, skip_special_tokens=True, group_tokens=False
    )

    # Optional: print one random prediction/label pair. The index is drawn from
    # the actual number of evaluated examples rather than a hard-coded range.
    if pred_str:
        i = random.randrange(len(pred_str))
        print(f"Prediction {i}: {pred_str[i]}")

        print(f"Label {i}: {label_str[i]}")

    return {"wer": wer_metric.compute(predictions=pred_str, references=label_str)}




# Build the trainer from the pieces prepared in the earlier cells
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # reports WER at each evaluation
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# Run fine-tuning
trainer.train()


In [None]:
import torchaudio

def load_audio(file_path, target_sr=16000):
    """Load an audio file as a mono 1-D NumPy array at ``target_sr`` Hz.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sr: Desired sample rate; the waveform is resampled when the
            file's own rate differs.

    Returns:
        1-D NumPy array of samples.
    """
    waveform, sample_rate = torchaudio.load(file_path)

    # The loaded tensor carries a leading channel axis. Average multi-channel
    # audio down to one channel so the result is always a single 1-D signal;
    # a bare squeeze() would leave stereo input 2-D.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)
        waveform = resampler(waveform)

    # Drop only the channel axis (squeeze(0)) so a very short clip is not
    # collapsed to a 0-d scalar.
    if waveform.dim() > 1:
        waveform = waveform.squeeze(0)
    return waveform.numpy()

In [None]:
def transcribe(file_path):
    """Transcribe one audio file with the current ``model`` and ``processor``.

    Args:
        file_path: Path of the audio file; loaded through ``load_audio``.

    Returns:
        The decoded transcription string for the file.
    """
    audio = load_audio(file_path)

    # Build a one-item batch and move it to whatever device the model lives on
    # (instead of hard-coding "cuda"), so inputs and weights always match.
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True).to(model.device)

    # Make sure training-only behaviour such as dropout is switched off; the
    # model may still be in training mode when this runs right after training.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: most likely token per frame, then decode to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription


In [None]:
# Smoke test: transcribe a single clip with the current model and show the result
text = transcribe("cv-valid-train/sample-000031.wav")
print("Transcription:", text)


In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Write the model first, then the processor, into the same output directory
for artifact in (model, processor):
    artifact.save_pretrained("./wav2vec2-finetuned-commonvoice")

