# === Kapampangan-to-English NLLB-200 Training Pipeline (with <kap> tag) ===

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback
)
import torch
import evaluate
import numpy as np

  warn(





# === 1. Config ===

In [2]:
CSV_PATH = "data/kapampangan_english.csv"
MODEL_NAME = "facebook/nllb-200-distilled-600M"
MODEL_DIR = "./kapampangan_mt_nllb"

SPECIAL_SRC_TOKEN = "<kap>"
TGT_LANG = "eng_Latn"   # English (Latin script)

EXTRA_TOKENS = ["banua", "masanting", "eku", "ala", "nung", "manyaman"]       # English (Latin script)

In [4]:
import pandas as pd

df = pd.read_csv(CSV_PATH)
print(df.columns)


Index(['English', 'Kapampangan'], dtype='object')


# === 2. Load CSV ===

In [3]:
df = pd.read_csv(CSV_PATH)
df = df.rename(columns={"kapampangan": "src_text", "english": "tgt_text"})
df = df.dropna(subset=["src_text", "tgt_text"])
df.head()

KeyError: ['src_text', 'tgt_text']

# === 3. Convert to HF Dataset ===

In [None]:
dataset = Dataset.from_pandas(df[["src_text", "tgt_text"]])
dataset = dataset.train_test_split(test_size=0.2, seed=42)

import random
import nltk
from nltk.corpus import wordnet

nltk.download("wordnet")
nltk.download("omw-1.4")

def synonym_replacement_en(text, prob=0.1):
    words = text.split()
    new_words = []
    for w in words:
        if random.random() < prob:
            syns = wordnet.synsets(w)
            if syns:
                lemmas = syns[0].lemma_names()
                if lemmas:
                    w = random.choice(lemmas)
        new_words.append(w)
    return " ".join(new_words)

def noise_injection_kap(text, prob=0.02):
    chars = list(text)
    for i in range(len(chars)):
        if random.random() < prob:
            op = random.choice(["swap","delete","insert"])
            if op == "swap" and i < len(chars)-1:
                chars[i], chars[i+1] = chars[i+1], chars[i]
            elif op == "delete":
                chars[i] = ""
            elif op == "insert":
                chars[i] = chars[i] + random.choice("aeiou")
    return "".join(chars)

def word_dropout(text, prob=0.05):
    words = text.split()
    return " ".join([w for w in words if random.random() > prob])

def augment_pair(src, tgt):
    src = noise_injection_kap(src, prob=0.02)
    src = word_dropout(src, prob=0.05)
    tgt = synonym_replacement_en(tgt, prob=0.05)
    return src, tgt

def augment_dataset(dataset, multiplier=2):
    augmented = {"src_text": [], "tgt_text": []}
    for ex in dataset:
        src, tgt = ex["src_text"], ex["tgt_text"]
        # keep original
        augmented["src_text"].append(src)
        augmented["tgt_text"].append(tgt)
        # add augmented samples
        for _ in range(multiplier):
            new_src, new_tgt = augment_pair(src, tgt)
            augmented["src_text"].append(new_src)
            augmented["tgt_text"].append(new_tgt)
    return Dataset.from_dict(augmented)

# Only augment training set
dataset["train"] = augment_dataset(dataset["train"], multiplier=2)

# === 4. Load Tokenizer & Model ===

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Add special tokens
tokenizer.add_special_tokens({'additional_special_tokens': [SPECIAL_SRC_TOKEN] + EXTRA_TOKENS})
model.resize_token_embeddings(len(tokenizer))

# Force English output
model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)


# === 5. Preprocess ===

In [None]:
def preprocess(examples):
    src_texts = [f"{SPECIAL_SRC_TOKEN} {text}" for text in examples["src_text"]]

    model_inputs = tokenizer(
        src_texts,
        truncation=True,
        padding="max_length",
        max_length=128
    )

    labels = tokenizer(
        examples["tgt_text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )["input_ids"]

    labels = [[(t if t != tokenizer.pad_token_id else -100) for t in label] for label in labels]
    model_inputs["labels"] = labels

    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

# === 6. Training Args ===

In [None]:
metric_bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=-1)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = metric_bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
    return {"sacrebleu": result["score"]}

training_args = Seq2SeqTrainingArguments(
    output_dir=MODEL_DIR,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=30,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    warmup_steps=200,
    logging_steps=50,
    eval_steps=200,
    save_steps=200,
    save_total_limit=3,
    predict_with_generate=True,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),
)

# === 7. Trainer ===

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# === 8. Train ===

In [None]:
trainer.train()

# === 9. Save ===

In [None]:
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
print(f"✅ Model saved to: {MODEL_DIR}")

# === 10. Translation Function ===

In [None]:
def batch_translate(texts, batch_size=8):
    results = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    for i in range(0, len(texts), batch_size):
        src_texts = [f"{SPECIAL_SRC_TOKEN} {t}" for t in texts[i:i+batch_size]]
        inputs = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            outputs = model.generate(**inputs)
        results.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return results

# === 11. Evaluate BLEU ===

In [None]:
print("\n--- Evaluating BLEU Score ---")
bleu = evaluate.load("bleu")

test_df = dataset["test"].to_pandas()
preds = batch_translate(test_df["src_text"].tolist())
refs = [[x] for x in test_df["tgt_text"].tolist()]

bleu_score = bleu.compute(predictions=preds, references=refs)
print(" BLEU Score:", bleu_score)

# === 12. Manual Test ===

In [None]:
print("\n--- Manual Test ---")
sample_texts = [
    "Ali ku balu",
    "Anya ka?",
    "Masanting ya ing panaun ngeni",
    "E ku makanyan",
]

for i, kap_text in enumerate(sample_texts):
    translated = batch_translate([kap_text])[0]
    print(f"[{i+1}] Kapampangan: {kap_text}")
    print(f"    ➤ English: {translated}")