# === Kapampangan-to-English NLLB-200 Training Pipeline (with <kap> tag) ===

In [37]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM
)
from transformers.trainer_seq2seq import Seq2SeqTrainer
from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
from transformers.data.data_collator import DataCollatorForSeq2Seq
import torch
import evaluate

# === 1. Config ===

In [38]:
CSV_PATH = "data/kapampangan_english.csv"
MODEL_NAME = "facebook/nllb-200-distilled-600M"  # NLLB model
MODEL_DIR = "./kapampangan_mt_nllb"

SPECIAL_SRC_TOKEN = "<kap>"   
TGT_LANG = "eng_Latn"   

# === 2. Load CSV ===

In [39]:
df = pd.read_csv(CSV_PATH)
df = df.rename(columns={"kapampangan": "src_text", "english": "tgt_text"})
df = df.dropna(subset=["src_text", "tgt_text"])

# === 3. Convert to HF Dataset ===

In [40]:
dataset = Dataset.from_pandas(df[["src_text", "tgt_text"]])
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# === 4. Load Tokenizer & Model ===

In [41]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, src_lang="eng_Latn")  # temp placeholder
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Add <kap> as a special token
tokenizer.add_special_tokens({'additional_special_tokens': [SPECIAL_SRC_TOKEN]})
model.resize_token_embeddings(len(tokenizer))

M2M100ScaledWordEmbedding(256205, 1024, padding_idx=1)

# === 5. Preprocess ===

In [50]:
print(df.columns)
print(df.head())
df = df.rename(columns={"kapampangan": "src_text", "english": "tgt_text"})
df = df.dropna(subset=["src_text", "tgt_text"])
df = df[df["src_text"].str.strip() != ""]
df = df[df["tgt_text"].str.strip() != ""]



Index(['src_text', 'tgt_text'], dtype='object')
          src_text  tgt_text
0          Sakoru!     Help!
1            Tuto?   Really?
2  Kutang mu kaya.  Ask him.
3     Mako ka ken.  Go away.
4        Yaku man.  Me, too.


In [None]:
def preprocess(examples):
    src_texts = [f"{SPECIAL_SRC_TOKEN} {text}" if text is not None else f"{SPECIAL_SRC_TOKEN} " 
                 for text in examples["src_text"]]
    
    model_inputs = tokenizer(
        src_texts,
        truncation=True,
        padding="max_length",
        max_length=128
    )

    labels = tokenizer(
        text_target=[t if t is not None else "" for t in examples["tgt_text"]],
        truncation=True,
        padding="max_length",
        max_length=128
    )

    labels_input_ids = [
        [(t if t != tokenizer.pad_token_id else -100) for t in label]
        for label in labels["input_ids"]
    ]
    model_inputs["labels"] = labels_input_ids
    return model_inputs


Map:   0%|          | 0/926 [00:00<?, ? examples/s]

TypeError: 'NoneType' object is not iterable

# === 6. Training Args ===

In [56]:
training_args = Seq2SeqTrainingArguments(
    output_dir=MODEL_DIR,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    weight_decay=0.01,
    predict_with_generate=True,
    save_total_limit=2,
    logging_dir="./logs",
    eval_steps=500,       # evaluate every 500 steps
    logging_steps=500,    # log every 500 steps
    save_steps=500,       # save every 500 steps
    fp16=torch.cuda.is_available(),
)


# === 7. Trainer ===

In [58]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

NameError: name 'tokenized_dataset' is not defined

# === 8. Train ===

In [None]:
trainer.train()

# === 9. Save ===

In [None]:
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
print(f"✅ Model saved to: {MODEL_DIR}")

# === 10. Translation Function ===

In [None]:
def kapampangan_translate(text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    src_text = f"{SPECIAL_SRC_TOKEN} {text}"
    inputs = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(TGT_LANG)  # Force English output
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# === 11. Evaluate BLEU ===

In [None]:
print("\n--- Evaluating BLEU Score ---")
bleu = evaluate.load("bleu")

preds = [kapampangan_translate(x) for x in df["src_text"]]
refs = [[x] for x in df["tgt_text"]]

bleu_score = bleu.compute(predictions=preds, references=refs)
print(" BLEU Score:", bleu_score)


# === 12. Manual Test ===

In [None]:
print("\n--- Manual Test ---")
sample_texts = [
    "Ali ku balu",
    "Anya ka?",
    "Masanting ya ing panaun ngeni",
    "E ku makanyan",
]

for i, kap_text in enumerate(sample_texts):
    translated = kapampangan_translate(kap_text)
    print(f"[{i+1}] Kapampangan: {kap_text}")
    print(f"    ➤ English: {translated}")
