## 1. Configuration and Imports

In [1]:

import pandas as pd
from datasets import Dataset
from transformers import (
    MarianTokenizer,
    MarianMTModel,
)
from transformers.trainer_seq2seq import Seq2SeqTrainer
from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
from transformers.data.data_collator import DataCollatorForSeq2Seq
import torch
import evaluate

CSV_PATH = "data/kapampangan_english.csv"
MODEL_NAME = "Helsinki-NLP/opus-mt-en-ROMANCE"
MODEL_DIR = "./kapampangan_mt_model"

## 2. Load and Clean CSV

In [2]:
df = pd.read_csv(CSV_PATH)
df = df.rename(columns={"kapampangan": "src_text", "english": "tgt_text"})
df = df.dropna(subset=["src_text", "tgt_text"])

## 3. Convert to Hugging Face Dataset

In [3]:
dataset = Dataset.from_pandas(df[["src_text", "tgt_text"]])
dataset = dataset.train_test_split(test_size=0.2, seed=42)

## 4. Load Tokenizer and Base Model

In [4]:
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)

## 5. Tokenization Function

In [5]:
def preprocess(example):
    model_inputs = tokenizer(
        example["src_text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["tgt_text"],
            truncation=True,
            padding="max_length",
            max_length=128
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/926 [00:00<?, ? examples/s]



Map:   0%|          | 0/232 [00:00<?, ? examples/s]

## 6. Define Training Arguments

In [6]:
training_args = Seq2SeqTrainingArguments(
    output_dir=MODEL_DIR,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    weight_decay=0.01,
    predict_with_generate=True,
    save_total_limit=2,
    logging_dir="./logs",
    eval_steps=500,       # evaluate every 500 steps
    logging_steps=500,    # log every 500 steps
    save_steps=500,       # save every 500 steps
    fp16=torch.cuda.is_available(),
)


## 7. Trainer Setup

In [7]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

## 8. Train the Model

In [8]:
trainer.train()

Step,Training Loss
500,0.385
1000,0.1708
1500,0.0899
2000,0.0405
2500,0.0151
3000,0.006




TrainOutput(global_step=3480, training_loss=0.10205824399131468, metrics={'train_runtime': 563.6584, 'train_samples_per_second': 24.643, 'train_steps_per_second': 6.174, 'total_flos': 470848288849920.0, 'train_loss': 0.10205824399131468, 'epoch': 15.0})

## 9. Save Final Model

In [9]:
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
print(f"✅ Model saved to: {MODEL_DIR}")

✅ Model saved to: ./kapampangan_mt_model


## 10. Define Translation Function

In [10]:
def kapampangan_translate(text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

## 11. Evaluate BLEU Score

In [11]:
print("\n--- Evaluating BLEU Score ---")
bleu = evaluate.load("bleu")

preds = [kapampangan_translate(x) for x in df["src_text"]]
refs = [[x] for x in df["tgt_text"]]

bleu_score = bleu.compute(predictions=preds, references=refs)
print(" BLEU Score:", bleu_score)


--- Evaluating BLEU Score ---
 BLEU Score: {'bleu': 0.8473219723056034, 'precisions': [0.8985145260943868, 0.8436966971623507, 0.8304668304668305, 0.8261185006045949], 'brevity_penalty': 0.9977677115182587, 'length_ratio': 0.9977701993704092, 'translation_length': 7607, 'reference_length': 7624}


## 12. Manual Translation Test

In [12]:
print("\n--- Manual Test ---")
sample_texts = [
    "Ali ku balu",
    "Anya ka?",
    "Masanting ya ing panaun ngeni",
    "E ku makanyan",
]

for i, kap_text in enumerate(sample_texts):
    translated = kapampangan_translate(kap_text)
    print(f"[{i+1}] Kapampangan: {kap_text}")
    print(f"    \u27a4 English: {translated}")


--- Manual Test ---
[1] Kapampangan: Ali ku balu
    ➤ English: It doesn't know.
[2] Kapampangan: Anya ka?
    ➤ English: Are you any?
[3] Kapampangan: Masanting ya ing panaun ngeni
    ➤ English: It's in the waste of his waiting now.
[4] Kapampangan: E ku makanyan
    ➤ English: I'm not that before.
