In [2]:
import os
import re
import json
import torch
import torchaudio
import evaluate
import pandas as pd
from datasets import Dataset, Audio
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC
)
from transformers.training_args import TrainingArguments
from transformers.trainer import Trainer

# Check for CUDA availability and print device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


Using device: NVIDIA GeForce RTX 4070 Laptop GPU


# --- Configuration ---
# IMPORTANT: Update these paths for Kapampangan training

In [3]:
VALIDATED_DATA_FOLDER = 'data/Cleaned_Audio_Files'  # The folder with Kapampangan audio and transcriptions
MODEL_OUTPUT_DIR = './kapampangan_wav2vec2_model'  # Directory to save the trained model
BASE_MODEL = "facebook/wav2vec2-large-xlsr-53"  # Base model for fine-tuning


# --- 1. Load the Dataset ---

In [4]:
def load_custom_dataset(data_folder):
    """Loads the dataset from the metadata.csv file."""
    metadata_path = os.path.join(data_folder, "metadata.csv")
    if not os.path.exists(metadata_path):
        raise FileNotFoundError(
            f"metadata.csv not found in {data_folder}. "
            "Please ensure you have run the prepare_kapampangan_dataset.py script first."
        )
    dataset_df = pd.read_csv(metadata_path)
    # Convert DataFrame to Hugging Face Dataset object
    custom_dataset = Dataset.from_pandas(dataset_df)
    return custom_dataset

# --- 2. Create Vocabulary for Kapampangan ---

In [5]:
def create_vocabulary(data):
    """
    Extracts all unique characters from the Kapampangan transcription column
    and creates a vocabulary file.
    """
    # Regex to extract characters, handling potential variations
    chars_to_ignore_regex = r"[\,\?\.\!\-\;\:\"'%\[\]]"

    def extract_all_chars(batch):
        all_text = " ".join(batch["transcription"])
        # Normalize and remove special characters
        all_text = re.sub(chars_to_ignore_regex, '', all_text).lower()
        # Create a set of unique characters
        vocab = list(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    # Extract vocabulary from the dataset
    vocab_result = data.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=data.column_names
    )

    # Combine all unique characters from all batches
    vocab_list = list(set(vocab_result["vocab"][0]))
    vocab_dict = {v: k for k, v in enumerate(vocab_list)}

    # Add special tokens for CTC loss
    vocab_dict["|"] = vocab_dict.pop(" ") if " " in vocab_dict else len(vocab_dict)
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)
    
    # Save the vocabulary as a json file
    vocab_path = os.path.join(MODEL_OUTPUT_DIR, 'vocab.json')
    if not os.path.exists(MODEL_OUTPUT_DIR):
        os.makedirs(MODEL_OUTPUT_DIR)
    with open(vocab_path, 'w') as vocab_file:
        json.dump(vocab_dict, vocab_file)
    
    print(f"Vocabulary created and saved to {vocab_path}")
    print(f"Vocabulary size: {len(vocab_dict)}")
    return vocab_path

# --- 3. Preprocess the Data ---

In [6]:
def preprocess_data(dataset, processor):
    """
    Prepares the dataset for training:
    1. Loads and resamples audio.
    2. Tokenizes Kapampangan transcriptions.
    """
    import librosa
    import soundfile as sf

    total_before = len(dataset)
    
    def prepare_dataset(batch):
        try:
            audio_path = batch["file_path"]
            waveform, sr = torchaudio.load(audio_path)

            if sr != 16000:
                resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
                waveform = resampler(waveform)

            # Convert waveform tensor to numpy array and flatten
            audio_array = waveform.squeeze().numpy()

            batch["input_values"] = processor(audio_array, sampling_rate=16000).input_values[0]
            batch["input_length"] = len(batch["input_values"])

            with processor.as_target_processor():
                batch["labels"] = processor(batch["transcription"]).input_ids
            return batch

        except Exception as e:
            print("Failed to Process")
            print(f"File: {batch.get('file_path', 'Path not found')}")
            print(f"Error: {repr(e)}")  # This will give the actual exception message
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            return None

    # Process each example individually
    processed_examples = []
    for i in range(len(dataset)):
        example = dataset[i]
        processed_example = prepare_dataset(example)
        if processed_example is not None:
            processed_examples.append(processed_example)
    
    # Create a new dataset from processed examples
    from datasets import Dataset
    dataset = Dataset.from_list(processed_examples)
    total_after = len(dataset)
    print(f"Preprocessing complete: {total_after} / {total_before} samples successfully processed.")
    return dataset

# --- 4. Define Metrics and Data Collator ---

In [7]:
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and labels for CTC.
    """
    def __init__(self, processor):
        self.processor = processor
        self.padding = "longest"

    def __call__(self, features):
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # Replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

def compute_metrics(pred, processor, wer_metric):
    """Computes Word Error Rate (WER) for evaluation."""
    pred_logits = pred.predictions
    pred_ids = torch.argmax(torch.from_numpy(pred_logits), dim=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.tokenizer.batch_decode(pred_ids)
    label_str = processor.tokenizer.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# --- Main Training Execution ---

# Step 1: Load data

In [8]:
# Load custom dataset
print("--- Step 1: Loading Kapampangan Dataset ---")
raw_dataset = load_custom_dataset(VALIDATED_DATA_FOLDER)

from datasets import DatasetDict
RANDOM_SEED = 42
SPLIT_RATIO = 0.2  # 20% for evaluation

if len(raw_dataset) > 1:
    dataset_split = raw_dataset.train_test_split(
        test_size=SPLIT_RATIO,
        shuffle=True,
        seed=RANDOM_SEED
    )
    train_dataset = dataset_split['train']
    eval_dataset = dataset_split['test']
    print(f"Dataset split into {len(train_dataset)} training samples and {len(eval_dataset)} evaluation samples.")
else:
    train_dataset = raw_dataset
    eval_dataset = raw_dataset
    print("Warning: Dataset is too small for a split. Evaluating on the training set.")


--- Step 1: Loading Kapampangan Dataset ---
Dataset split into 1990 training samples and 498 evaluation samples.


# Step 2: Create Vocabulary

In [9]:
print("\n--- Step 2: Creating Kapampangan Vocabulary ---")
vocab_path = create_vocabulary(train_dataset)


--- Step 2: Creating Kapampangan Vocabulary ---


Map: 100%|██████████| 1990/1990 [00:00<00:00, 142153.16 examples/s]

Vocabulary created and saved to ./kapampangan_wav2vec2_model\vocab.json
Vocabulary size: 31





# Step 3: Setup Processor (Tokenizer + Feature Extractor)

In [10]:
print("\n--- Step 3: Setting up Processor ---")

tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file=vocab_path,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|"
)

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False
)

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Save for reuse
processor.save_pretrained(MODEL_OUTPUT_DIR)
print("Processor created and saved.")



--- Step 3: Setting up Processor ---
Processor created and saved.




# Step 4: Preprocess the dataset

In [11]:
print("\n--- Step 4: Preprocessing Data ---")
processed_train_dataset = preprocess_data(train_dataset, processor)
processed_eval_dataset = preprocess_data(eval_dataset, processor)



--- Step 4: Preprocessing Data ---




Preprocessing complete: 1990 / 1990 samples successfully processed.
Preprocessing complete: 498 / 498 samples successfully processed.


# Step 5: Setup Trainer

In [None]:
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments, TrainerCallback
import evaluate
import numpy as np
import torch
from rich.console import Console
from rich.table import Table
print("\n--- Step 5: Setting up Model and Trainer ---")

data_collator = DataCollatorCTCWithPadding(processor=processor)

# Load metrics
wer_metric = evaluate.load("wer")

model = Wav2Vec2ForCTC.from_pretrained(
    BASE_MODEL,
    ctc_loss_reduction="mean",
    pad_token_id=tokenizer.pad_token_id,
    vocab_size=len(tokenizer)
)
model.freeze_feature_encoder()

training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    gradient_accumulation_steps=4,
    group_by_length=True,
    length_column_name="input_length",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    save_strategy="steps",              
    evaluation_strategy="steps",      
    save_steps=200,                     
    eval_steps=200,
    logging_strategy="steps",
    logging_steps=500,                   
    save_total_limit=3,
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    learning_rate=5e-5,
    warmup_steps=200,
    push_to_hub=False,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    report_to="tensorboard",
    
)
# Custom accuracy function (exact string match)
def compute_metrics_wrapper(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Decode predictions
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # Decode labels
    label_ids = pred.label_ids
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # Compute WER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    # Compute exact-match accuracy
    correct = sum([p.strip() == r.strip() for p, r in zip(pred_str, label_str)])
    acc = correct / len(label_str) if len(label_str) > 0 else 0.0

    return {"wer": wer, "accuracy": acc}

console = Console()

class TableLoggerCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        # logs can be None if Trainer skips logging
        if logs is None:
            return

        # only print useful keys (ignore "epoch", "total_flos", etc.)
        keys_to_show = ["loss", "learning_rate", "eval_loss", "wer"]
        logs_to_show = {k: v for k, v in logs.items() if k in keys_to_show}

        if not logs_to_show:
            return

        table = Table(title=f"Step {state.global_step}")
        for key in logs_to_show.keys():
            table.add_column(key, justify="center")

        table.add_row(*[
            f"{v:.4f}" if isinstance(v, (float, int)) else str(v)
            for v in logs_to_show.values()
        ])

        console.print(table)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics_wrapper,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3), early_stopping_threshold=0.01,
               TableLoggerCallback()],
    
)


SyntaxError: invalid syntax. Perhaps you forgot a comma? (1167958232.py, line 103)

In [19]:
import torch

print(f"Pytorch ver: {torch.__version__}")
print(f"Cuda avail: {torch.cuda.is_available()}")
print(f"Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

Pytorch ver: 2.6.0+cu124
Cuda avail: True
Device: NVIDIA GeForce RTX 4070 Laptop GPU


# Step 6: Train the model

In [20]:
print("\n--- Step 6: Starting Kapampangan Training ---")
print("This may take some time depending on your machine and dataset size.")

trainer.train()



--- Step 6: Starting Kapampangan Training ---
This may take some time depending on your machine and dataset size.


  return fn(*args, **kwargs)


KeyboardInterrupt: 

# Step 7: Save the final model

In [None]:
print("\n--- Step 7: Saving Final Model ---")
trainer.save_model(MODEL_OUTPUT_DIR)
print(f"Training complete! Model saved in: {MODEL_OUTPUT_DIR}")


# Step 8: Evaluate on Evaluation Set

In [None]:
print("\n--- Step 8: Evaluating on Validation Set ---")

# Optionally reload model
# model = Wav2Vec2ForCTC.from_pretrained(MODEL_OUTPUT_DIR).to(device)

predictions = trainer.predict(processed_eval_dataset)
metrics = compute_metrics_wrapper(predictions)

print(f"Validation WER: {metrics['wer']:.4f}")


# Show Sample Predictions

In [None]:
from random import sample

print("\n--- Sample Predictions ---")
pred_ids = torch.argmax(torch.from_numpy(predictions.predictions), dim=-1)
decoded_preds = processor.batch_decode(pred_ids)
decoded_labels = processor.batch_decode(predictions.label_ids, group_tokens=False)

for i in sample(range(len(decoded_preds)), 5):
    print(f"[{i+1}]")
    print(f"Kapampangan (Predicted) : {decoded_preds[i]}")
    print(f"Kapampangan (Reference) : {decoded_labels[i]}")
    print("-" * 40)
