In [None]:
import json
import os
import re

import numpy as np
import pandas as pd
import torch
import torchaudio
import datasets
from datasets import Dataset, load_dataset
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer,
)
from transformers.training_args import TrainingArguments
from transformers.trainer import Trainer

# Select the GPU when PyTorch can see one, otherwise fall back to the CPU,
# and report which hardware was picked.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using device:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using device:", "CPU")



Using device: CPU


# --- Configuration ---
# IMPORTANT: Update these paths for Kapampangan training

In [None]:
# Folder with the Kapampangan audio files and their transcriptions.
VALIDATED_DATA_FOLDER = "data/Cleaned_Audio_Files"
# Directory where the trained model is saved.
MODEL_OUTPUT_DIR = "./kapampangan_wav2vec2_model"
# Pretrained checkpoint used as the starting point for fine-tuning.
BASE_MODEL = "patrickvonplaten/wav2vec2-base-10m-voxpopuli"


# --- 1. Load the Dataset ---

In [None]:
def load_custom_dataset(data_folder, required_columns=("file_path", "transcription")):
    """Load ``metadata.csv`` from ``data_folder`` into a Hugging Face ``Dataset``.

    Args:
        data_folder: Directory that contains ``metadata.csv``.
        required_columns: Column names that must be present in the CSV. The
            defaults are the two columns read by the vocabulary and
            preprocessing steps of this notebook.

    Returns:
        datasets.Dataset: The raw rows of the CSV, with no preprocessing applied.

    Raises:
        FileNotFoundError: If ``metadata.csv`` does not exist in ``data_folder``.
        ValueError: If one or more of ``required_columns`` is missing.
    """
    metadata_path = os.path.join(data_folder, "metadata.csv")
    if not os.path.exists(metadata_path):
        raise FileNotFoundError(
            f"metadata.csv not found in {data_folder}. "
        )

    dataset_df = pd.read_csv(metadata_path)

    # Fail here with a clear message instead of much later, deep inside
    # vocabulary creation or audio preprocessing.
    missing_columns = [name for name in required_columns if name not in dataset_df.columns]
    if missing_columns:
        raise ValueError(
            f"{metadata_path} is missing required column(s): {missing_columns}. "
            f"Found columns: {list(dataset_df.columns)}"
        )

    # Convert DataFrame to Hugging Face Dataset object (raw, no preprocessing here)
    return Dataset.from_pandas(dataset_df)


# --- 2. Create Vocabulary for Kapampangan ---

In [None]:
def create_vocabulary(data, output_dir=None):
    """Build a character-level CTC vocabulary and write it to ``vocab.json``.

    Every transcription is stripped of the punctuation matched by
    ``chars_to_ignore_regex`` and lower-cased; the remaining unique characters
    become the vocabulary. The space character is stored under ``"|"`` and the
    ``[UNK]`` and ``[PAD]`` tokens are appended at the end.

    Args:
        data: Dataset-like object exposing ``map`` and ``column_names`` and
            containing a ``"transcription"`` column.
        output_dir: Directory that receives ``vocab.json``. Defaults to the
            module-level ``MODEL_OUTPUT_DIR``.

    Returns:
        str: Path of the written ``vocab.json``.
    """
    # Regex to extract characters, handling potential variations
    chars_to_ignore_regex = r"[\,\?\.\!\-\;\:\"'%\[\]]"

    def extract_all_chars(batch):
        # Skip non-string entries (e.g. missing values) so the join cannot fail.
        all_text = " ".join(
            text for text in batch["transcription"] if isinstance(text, str)
        )
        # Normalize and remove special characters
        all_text = re.sub(chars_to_ignore_regex, "", all_text).lower()
        vocab = sorted(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    # Extract vocabulary from the dataset in a single batch.
    vocab_result = data.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=data.column_names
    )

    # Sort the characters so token ids are identical from run to run; plain
    # set iteration order for strings can change between interpreter sessions,
    # which would silently invalidate previously saved checkpoints.
    vocab_list = sorted(set(vocab_result["vocab"][0]))
    vocab_dict = {char: index for index, char in enumerate(vocab_list)}

    # Add special tokens for CTC loss
    vocab_dict["|"] = vocab_dict.pop(" ") if " " in vocab_dict else len(vocab_dict)
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)

    # Save the vocabulary as a json file
    target_dir = MODEL_OUTPUT_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)
    vocab_path = os.path.join(target_dir, 'vocab.json')
    with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
        json.dump(vocab_dict, vocab_file)

    print(f"Vocabulary created and saved to {vocab_path}")
    print(f"Vocabulary size: {len(vocab_dict)}")
    return vocab_path

# --- 3. Preprocess the Data ---

In [None]:
# SpecAugment-style masking transforms from torchaudio (frequency and time axes).
# NOTE(review): these operate on spectrograms, whereas the processor in
# preprocess_data is fed the raw waveform -- confirm whether they are still needed.
spec_augment_freq = torchaudio.transforms.FrequencyMasking(freq_mask_param=30)
spec_augment_time = torchaudio.transforms.TimeMasking(time_mask_param=100)

def preprocess_data(dataset, processor):
    """Convert raw metadata rows into examples ready for CTC training.

    For every row the audio at ``file_path`` is loaded, down-mixed to mono,
    resampled to 16 kHz when needed and passed through ``processor`` to obtain
    ``input_values``; the transcription is normalised and tokenised into
    ``labels``. Rows that fail for any reason are reported and skipped.

    Args:
        dataset: Sized iterable of dict-like rows with ``"file_path"`` and
            ``"transcription"`` keys.
        processor: Callable processor that returns ``input_values`` for audio
            and exposes a ``tokenizer`` returning ``input_ids`` for text.

    Returns:
        datasets.Dataset: One row per successfully processed example, holding
        the original fields plus ``input_values``, ``input_length`` and
        ``labels``.
    """
    from datasets import Dataset

    target_sampling_rate = 16000
    # Mirrors the character filter used when the vocabulary is built, so the
    # labels contain only characters the vocabulary step kept.
    chars_to_ignore_regex = r"[\,\?\.\!\-\;\:\"'%\[\]]"

    total_before = len(dataset)

    def prepare_dataset(batch):
        try:
            example = dict(batch)  # work on a copy; leave the caller's row untouched
            waveform, sr = torchaudio.load(example["file_path"])

            # Average the channel axis so multi-channel files yield one 1-D signal.
            if waveform.dim() > 1:
                waveform = waveform.mean(dim=0)

            if sr != target_sampling_rate:
                resampler = torchaudio.transforms.Resample(
                    orig_freq=sr, new_freq=target_sampling_rate
                )
                waveform = resampler(waveform)

            audio_array = waveform.numpy()

            example["input_values"] = processor(
                audio_array, sampling_rate=target_sampling_rate
            ).input_values[0]
            example["input_length"] = len(example["input_values"])

            transcript = re.sub(chars_to_ignore_regex, "", example["transcription"]).lower()
            # Call the tokenizer directly instead of the deprecated
            # `as_target_processor` context manager.
            example["labels"] = processor.tokenizer(transcript).input_ids

            # Returning the example is essential: without it every row is
            # treated as a failure and the resulting dataset is empty.
            return example

        except Exception as e:
            print("Failed to Process")
            print(f"File: {batch.get('file_path', 'Path not found')}")
            print(f"Error: {repr(e)}")
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            return None

    processed_examples = []
    for raw_example in dataset:
        processed_example = prepare_dataset(raw_example)
        if processed_example is not None:
            processed_examples.append(processed_example)

    processed_dataset = Dataset.from_list(processed_examples)
    total_after = len(processed_dataset)
    print(f"Preprocessing complete: {total_after} / {total_before} samples successfully processed.")
    return processed_dataset


# --- 4. Define Metrics and Data Collator ---

In [6]:
class DataCollatorCTCWithPadding:
    """Data collator that dynamically pads the inputs and labels for CTC.

    Audio inputs and label ids have different lengths and padding values, so
    they are padded separately: inputs with the processor's feature extractor,
    labels with its tokenizer.

    Args:
        processor: Object exposing ``feature_extractor`` and ``tokenizer``,
            each with a ``pad`` method.
        padding: Padding strategy forwarded to both ``pad`` calls.
    """

    def __init__(self, processor, padding="longest"):
        self.processor = processor
        self.padding = padding

    def __call__(self, features):
        """Collate ``features`` into one padded batch.

        Args:
            features: List of dicts with ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` tensor in which
            padded positions are set to ``-100``.
        """
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Pad labels through the tokenizer directly rather than via the
        # deprecated `as_target_processor` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch["attention_mask"].ne(1), -100
        )
        batch["labels"] = labels
        return batch

def compute_metrics(pred, processor, wer_metric):
    """Compute the Word Error Rate (WER) for a set of predictions.

    Args:
        pred: Object with ``predictions`` (logits as a NumPy array) and
            ``label_ids`` (NumPy array in which ``-100`` marks padding).
        processor: Processor whose ``tokenizer`` provides ``pad_token_id`` and
            ``batch_decode``.
        wer_metric: Metric object with a ``compute(predictions, references)``
            method.

    Returns:
        dict: ``{"wer": <value returned by wer_metric.compute>}``.
    """
    pred_logits = pred.predictions
    pred_ids = torch.argmax(torch.from_numpy(pred_logits), dim=-1)

    # Work on a copy so the caller's label array keeps its -100 padding markers.
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.tokenizer.batch_decode(pred_ids)
    # References are not CTC output, so repeated characters must not be merged.
    label_str = processor.tokenizer.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# --- Main Training Execution ---

# Step 1: Load data

In [7]:
# Read the metadata into a raw dataset, then carve out a held-out evaluation split.
print("--- Step 1: Loading Kapampangan Dataset ---")
raw_dataset = load_custom_dataset(VALIDATED_DATA_FOLDER)

from datasets import DatasetDict
RANDOM_SEED = 42
SPLIT_RATIO = 0.2  # fraction of rows reserved for evaluation

if len(raw_dataset) <= 1:
    # Nothing to split: reuse the same rows for training and evaluation.
    train_dataset = eval_dataset = raw_dataset
    print("Warning: Dataset is too small for a split. Evaluating on the training set.")
else:
    dataset_split = raw_dataset.train_test_split(
        test_size=SPLIT_RATIO, shuffle=True, seed=RANDOM_SEED
    )
    train_dataset, eval_dataset = dataset_split['train'], dataset_split['test']
    print(f"Dataset split into {len(train_dataset)} training samples and {len(eval_dataset)} evaluation samples.")


--- Step 1: Loading Kapampangan Dataset ---
Dataset split into 1990 training samples and 498 evaluation samples.


# Step 2: Create Vocabulary

In [8]:
# Build the character vocabulary from the training split only; the returned
# path points at the vocab.json file written by create_vocabulary.
print("\n--- Step 2: Creating Kapampangan Vocabulary ---")
vocab_path = create_vocabulary(train_dataset)


--- Step 2: Creating Kapampangan Vocabulary ---


Map:   0%|          | 0/1990 [00:00<?, ? examples/s]

Vocabulary created and saved to ./kapampangan_wav2vec2_model\vocab.json
Vocabulary size: 31


# Step 3: Setup Processor (Tokenizer + Feature Extractor)

In [9]:
print("\n--- Step 3: Setting up Processor ---")

# Audio side: single-channel 16 kHz input, normalised, zero-padded, no attention mask.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

# Text side: character tokenizer backed by the vocabulary file created in Step 2.
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file=vocab_path,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Bundle both halves so audio and text go through one object.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Persist the processor next to the model so it can be reloaded later.
processor.save_pretrained(MODEL_OUTPUT_DIR)
print("Processor created and saved.")



--- Step 3: Setting up Processor ---
Processor created and saved.


# Step 4: Preprocess the dataset

In [10]:
# Run the same preprocessing over both splits; rows whose audio cannot be
# processed are reported by preprocess_data and left out of the result.
print("\n--- Step 4: Preprocessing Data ---")
processed_train_dataset = preprocess_data(train_dataset, processor)
processed_eval_dataset = preprocess_data(eval_dataset, processor)



--- Step 4: Preprocessing Data ---




Failed to Process
File: data/Cleaned_Audio_Files/(016) Eya/cat02_entry014_spk016.wav
Error: LibsndfileError(2, "Error opening 'data/Cleaned_Audio_Files/(016) Eya/cat02_entry014_spk016.wav': ")
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Failed to Process
File: data/Cleaned_Audio_Files/(016) Eya/cat03_entry017_spk016.wav
Error: LibsndfileError(2, "Error opening 'data/Cleaned_Audio_Files/(016) Eya/cat03_entry017_spk016.wav': ")
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Failed to Process
File: data/Cleaned_Audio_Files/(016) Eya/cat02_entry004_spk016.wav
Error: LibsndfileError(2, "Error opening 'data/Cleaned_Audio_Files/(016) Eya/cat02_entry004_spk016.wav': ")
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Failed to Process
File: data/Cleaned_Audio_Files/(016) Eya/cat02_entry002_spk016.wav
Error: LibsndfileError(2, "Error opening 'data/Cleaned_Audio_Files/(016) Eya/cat02_entry002_spk016.wav': ")
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Failed to Process
File: data/Cleaned_Audio_Files/(016) Eya/cat02_entry012_spk016.wav

# Step 5: Setup Trainer

In [None]:
from transformers import EarlyStoppingCallback
import evaluate

print("\n--- Step 5: Setting up Model and Trainer ---")

data_collator = DataCollatorCTCWithPadding(processor=processor)

# Load metrics
wer_metric = evaluate.load("wer")
# NOTE(review): acc_metric is not referenced by any visible cell -- confirm it is still needed.
acc_metric = evaluate.load("accuracy")

# Load the pretrained encoder with a CTC head sized for the Kapampangan
# vocabulary; ignore_mismatched_sizes allows the head shape to differ from
# the checkpoint.
model = Wav2Vec2ForCTC.from_pretrained(
    BASE_MODEL,
    ctc_loss_reduction="mean",
    pad_token_id=tokenizer.pad_token_id,
    vocab_size=len(tokenizer),
    ignore_mismatched_sizes=True
)

# Keep the convolutional feature encoder fixed during fine-tuning.
model.freeze_feature_encoder()

# Mixed precision is only enabled when a CUDA device is present, so the same
# cell also runs on CPU-only machines.
use_fp16 = torch.cuda.is_available()

# Newer transformers releases call this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. Pick whichever the installed version defines.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    gradient_accumulation_steps=2,     # effective batch = 2 x per-device batch
    group_by_length=True,
    length_column_name="input_length",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    save_strategy="epoch",
    save_total_limit=3,
    fp16=use_fp16,
    gradient_checkpointing=True,       # trades compute for lower memory use
    logging_steps=25,
    learning_rate=3e-5,
    warmup_steps=500,
    push_to_hub=False,
    remove_unused_columns=False,       # keep the columns the collator reads
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,           # lower WER is better
    **{eval_strategy_arg: "epoch"},
)



# Character Error Rate, reported alongside WER by the metrics function below.
cer_metric = evaluate.load("cer")

def compute_metrics_wrapper(pred):
    """Compute WER and CER for the Trainer.

    Uses the module-level ``processor``, ``wer_metric`` and ``cer_metric``.

    Args:
        pred: Object with ``predictions`` (logits as a NumPy array) and
            ``label_ids`` (NumPy array in which ``-100`` marks padding).

    Returns:
        dict: ``{"wer": ..., "cer": ...}`` as returned by the two metrics.
    """
    pred_ids = pred.predictions.argmax(axis=-1)
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # -100 is the loss-ignore marker added by the data collator, not a token id.
    # Map it back to the pad token (on a copy) before decoding; otherwise the
    # padding leaks into the reference strings and inflates WER/CER.
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    # References are not CTC output, so repeated characters must not be merged.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer, "cer": cer}

# Assemble the Trainer from the model, collator, arguments, metrics function
# and the two preprocessed splits.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics_wrapper,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # early stopping with a patience of 3
)



--- Step 5: Setting up Model and Trainer ---




ValueError: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


# Step 6: Train the model

In [None]:
# Start fine-tuning with the Trainer configured in Step 5.
print("\n--- Step 6: Starting Kapampangan Training ---")
print("This may take some time depending on your machine and dataset size.")

trainer.train()



--- Step 6: Starting Kapampangan Training ---
This may take some time depending on your machine and dataset size.


  0%|          | 0/1452 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Step 7: Save the final model

In [None]:
# Write the trainer's model to MODEL_OUTPUT_DIR, the same directory the
# processor was saved to in Step 3.
print("\n--- Step 7: Saving Final Model ---")
trainer.save_model(MODEL_OUTPUT_DIR)
print(f"Training complete! Model saved in: {MODEL_OUTPUT_DIR}")


# Step 8: Evaluate on the Held-Out Evaluation Set

In [None]:
# Run inference over the preprocessed evaluation split and score it with the
# same metrics function the Trainer uses; only the WER entry is printed here.
print("\n--- Step 8: Evaluating on Validation Set ---")

# Optionally reload model
# model = Wav2Vec2ForCTC.from_pretrained(MODEL_OUTPUT_DIR).to(device)

# `predictions` is reused by the sample-predictions cell that follows.
predictions = trainer.predict(processed_eval_dataset)
metrics = compute_metrics_wrapper(predictions)

print(f"Validation WER: {metrics['wer']:.4f}")


# Show Sample Predictions

In [None]:
from random import sample

print("\n--- Sample Predictions ---")
pred_ids = torch.argmax(torch.from_numpy(predictions.predictions), dim=-1)
decoded_preds = processor.batch_decode(pred_ids)

# -100 marks label padding and is not a token id: map it back to the pad token
# (on a copy, leaving `predictions` untouched) so it does not show up in the
# decoded reference text.
reference_ids = predictions.label_ids.copy()
reference_ids[reference_ids == -100] = processor.tokenizer.pad_token_id
decoded_labels = processor.batch_decode(reference_ids, group_tokens=False)

# Never ask for more samples than there are predictions; random.sample raises
# ValueError otherwise.
num_to_show = min(5, len(decoded_preds))
for i in sample(range(len(decoded_preds)), num_to_show):
    print(f"[{i+1}]")
    print(f"Kapampangan (Predicted) : {decoded_preds[i]}")
    print(f"Kapampangan (Reference) : {decoded_labels[i]}")
    print("-" * 40)
