# === Kapampangan-to-English M2M100 Training Pipeline ===

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    M2M100ForConditionalGeneration
)
from transformers.trainer_seq2seq import Seq2SeqTrainer
from transformers.training_args_seq2seq import Seq2SeqTrainingArguments
from transformers.data.data_collator import DataCollatorForSeq2Seq
import torch
import evaluate
import numpy as np




# === 1. Config ===

In [2]:
# --- Configuration: values the rest of the notebook reads ---

# Language settings
TGT_LANG = "en"                  # target language code (English)
SPECIAL_SRC_TOKEN = "<kap>"      # marker prepended to Kapampangan source text

# Model locations
MODEL_NAME = "facebook/m2m100_418M"      # pretrained checkpoint to start from
MODEL_DIR = "./kapampangan_mt_model"     # trainer output dir / final save location

# Training data
CSV_PATH = "data/kapampangan_english.csv"  # parallel corpus read with pandas

# === 2. Load CSV ===

In [3]:
# Read the parallel corpus, rename the language columns to the generic
# src_text / tgt_text pair, and drop rows where either side is missing.
df = (
    pd.read_csv(CSV_PATH)
    .rename(columns={"kapampangan": "src_text", "english": "tgt_text"})
    .dropna(subset=["src_text", "tgt_text"])
)

# === 3. Convert to HF Dataset ===

In [4]:
# Wrap the two text columns in a Hugging Face Dataset and immediately split it
# into train / test parts (20% test, fixed seed so the split is repeatable).
dataset = Dataset.from_pandas(
    df[["src_text", "tgt_text"]]
).train_test_split(test_size=0.2, seed=42)

# === 4. Load Tokenizer & Model ===

In [5]:
# Load the pretrained tokenizer and model identified by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = M2M100ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Register <kap> as an extra special token.
# By default add_special_tokens() *replaces* the tokenizer's existing
# `additional_special_tokens` list. For M2M100 that list is where the
# language-code tokens (e.g. __en__) are registered, so replacing it strips
# their "special" status and decode(skip_special_tokens=True) no longer removes
# them from generated text. Appending instead keeps the existing entries.
tokenizer.add_special_tokens(
    {"additional_special_tokens": [SPECIAL_SRC_TOKEN]},
    replace_additional_special_tokens=False,
)

# Resize the embedding matrix so it matches the tokenizer's new length.
model.resize_token_embeddings(len(tokenizer))

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  88%|########8 | 1.71G/1.94G [00:00<?, ?B/s]

M2M100ScaledWordEmbedding(128105, 1024, padding_idx=1)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


# === 5. Preprocess ===

In [6]:
def preprocess(examples):
    """Tokenize one batch of sentence pairs for seq2seq training.

    Reads ``examples["src_text"]`` and ``examples["tgt_text"]``. Every source
    string is prefixed with ``SPECIAL_SRC_TOKEN`` and a space. Both sides are
    truncated and padded to 128 tokens. The returned encoding gets an extra
    ``"labels"`` entry holding the target ids with each pad id replaced by -100.
    """
    pad_id = tokenizer.pad_token_id
    encode_kwargs = dict(truncation=True, padding="max_length", max_length=128)

    # Source side: marker token followed by the original sentence.
    tagged_sources = []
    for sentence in examples["src_text"]:
        tagged_sources.append(f"{SPECIAL_SRC_TOKEN} {sentence}")
    batch = tokenizer(tagged_sources, **encode_kwargs)

    # Target side: only the token ids are kept.
    target_ids = tokenizer(examples["tgt_text"], **encode_kwargs)["input_ids"]

    # Swap padding positions for -100 in the label rows.
    masked_rows = []
    for row in target_ids:
        masked_rows.append([-100 if tok_id == pad_id else tok_id for tok_id in row])
    batch["labels"] = masked_rows

    return batch


# Apply the tokenization to every split, one batch of rows at a time.
tokenized_dataset = dataset.map(function=preprocess, batched=True)


Map:   0%|          | 0/926 [00:00<?, ? examples/s]

Map:   0%|          | 0/232 [00:00<?, ? examples/s]

# === 6. Metrics & Training Args ===

In [7]:
# Load the "accuracy" metric from the `evaluate` library once, at notebook scope.
metric_acc = evaluate.load("accuracy")

def compute_metrics(eval_pred, tok=None):
    """Sentence-level exact-match accuracy for generated translations.

    Predictions are decoded to text rather than arg-maxed: the training
    arguments in this notebook set ``predict_with_generate=True``, so the
    predictions handed to this function are already token ids. Taking
    ``argmax`` over the last axis of an id matrix collapses every sequence to a
    single index, and 2-D id arrays are not valid input for a classification
    accuracy metric.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be a 2-D
            array of token ids, a 3-D array of logits (reduced with argmax), or
            a tuple whose first element is one of those. ``labels`` is a 2-D
            array of token ids in which -100 marks ignored positions.
        tok: optional object exposing ``pad_token_id`` and ``batch_decode``.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        ``{"accuracy": float}`` - the fraction of examples whose decoded
        prediction equals the decoded reference after stripping surrounding
        whitespace. ``0.0`` when there are no examples.
    """
    decoder = tok if tok is not None else tokenizer

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return (logits_or_ids, extra_outputs...).
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    if predictions.ndim == 3:
        # Logits of shape (batch, seq, vocab): pick the most likely id per step.
        predictions = predictions.argmax(axis=-1)

    # -100 is not a real token id; map it to the pad id so decoding works.
    pad_id = decoder.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    pred_texts = decoder.batch_decode(predictions, skip_special_tokens=True)
    ref_texts = decoder.batch_decode(labels, skip_special_tokens=True)

    if not ref_texts:
        return {"accuracy": 0.0}

    hits = sum(p.strip() == r.strip() for p, r in zip(pred_texts, ref_texts))
    return {"accuracy": hits / len(ref_texts)}

# Hyper-parameters and bookkeeping options handed to the Seq2SeqTrainer.
# NOTE(review): eval_steps is set, but no evaluation strategy is passed in this
# call. Confirm whether periodic evaluation during training was intended; as
# far as this cell shows, nothing requests it.
training_args = Seq2SeqTrainingArguments(
    # --- where artefacts are written ---
    output_dir=MODEL_DIR,
    logging_dir="./logs",
    save_total_limit=2,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=torch.cuda.is_available(),  # half precision only when CUDA is present
    # --- step cadence: logging, saving and eval share a 500-step interval ---
    logging_steps=500,
    save_steps=500,
    eval_steps=500,
    # --- evaluation produces sequences via generation ---
    predict_with_generate=True,
)

# === 7. Trainer ===

In [None]:
# Batch collator built from the notebook's tokenizer and model.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# Wire model, arguments, tokenized splits, collator and metric function together.
# NOTE(review): the `tokenizer=` keyword may emit a deprecation warning on
# recent transformers releases - confirm against the installed version.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)


  trainer = Seq2SeqTrainer(


: 

# === 8. Train ===

In [None]:
# Run fine-tuning with the trainer configured above. As the last expression in
# the cell, the call's return value is what the notebook displays.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


# === 9. Save ===

In [None]:
# Persist the model and the tokenizer side by side in MODEL_DIR.
trainer.save_model(output_dir=MODEL_DIR)
tokenizer.save_pretrained(save_directory=MODEL_DIR)
print(f"✅ Model saved to: {MODEL_DIR}")

# === 10. Translation Function ===

In [None]:
def kapampangan_translate(text):
    """Translate one Kapampangan string to English with the notebook-level model.

    Args:
        text: the source sentence. It is prefixed with ``SPECIAL_SRC_TOKEN``
            and a space before tokenization, and truncated to 128 tokens.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # generate() does not switch the module to eval mode. After training the
    # model may still be in train mode, where dropout makes the output noisy
    # and non-deterministic, so put it in inference mode explicitly.
    model.eval()

    src_text = f"{SPECIAL_SRC_TOKEN} {text}"
    inputs = tokenizer(
        src_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.get_lang_id(TGT_LANG),  # force English output
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# === 11. Evaluate BLEU ===

In [None]:
print("\n--- Evaluating BLEU Score ---")
bleu = evaluate.load("bleu")

# Score only the held-out split. `df` also contains every training row, so
# computing BLEU over it would measure memorisation and inflate the result.
held_out = dataset["test"]
preds = [kapampangan_translate(x) for x in held_out["src_text"]]
refs = [[x] for x in held_out["tgt_text"]]  # one reference list per prediction

bleu_score = bleu.compute(predictions=preds, references=refs)
print(" BLEU Score:", bleu_score)

# === 12. Manual Test ===

In [None]:
print("\n--- Manual Test ---")

# A handful of Kapampangan phrases for a quick look at the translator's output.
sample_texts = [
    "Ali ku balu",
    "Anya ka?",
    "Masanting ya ing panaun ngeni",
    "E ku makanyan",
]

# Number the samples from 1 and show each source next to its translation.
for position, kap_text in enumerate(sample_texts, start=1):
    translated = kapampangan_translate(kap_text)
    print(
        f"[{position}] Kapampangan: {kap_text}",
        f"    ➤ English: {translated}",
        sep="\n",
    )