In [11]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from peft import get_peft_model, LoraConfig, TaskType
from torch import optim
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

# Load tokenizer correctly
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.src_lang = "tl_XX"
target_lang = "en_XX"

# Load dataset
df = pd.read_csv("test.csv", names=["src", "tgt"], header=0)
df.columns = df.columns.str.strip()  # Remove any hidden spaces
df = df.dropna(subset=["src", "tgt"])  # Remove bad rows
df["src"] = df["src"].astype(str)  # Ensure correct type
df["tgt"] = df["tgt"].astype(str)
# Custom Dataset
class TranslationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        src = self.tokenizer(row["src"], return_tensors="pt", max_length=self.max_len, padding="max_length", truncation=True)
        tgt = self.tokenizer(row["tgt"], return_tensors="pt", max_length=self.max_len, padding="max_length", truncation=True)

        return {
            "input_ids": src["input_ids"].squeeze(),
            "attention_mask": src["attention_mask"].squeeze(),
            "labels": tgt["input_ids"].squeeze()
        }

# Prepare DataLoader
dataset = TranslationDataset(df, tokenizer)
dataloader = DataLoader(dataset, batch_size=4)

# Load model and apply LoRA
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model = model.to("cuda")

# Training Setup
optimizer = optim.AdamW(model.parameters(), lr=2e-4)
scaler = GradScaler()
model.train()

# Training loop
for epoch in range(5):
    loop = tqdm(dataloader, desc=f"Epoch {epoch}")
    for batch in loop:
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["labels"].to("cuda")

        optimizer.zero_grad()
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        loop.set_postfix(loss=loss.item())

# Save model
model.save_pretrained("fine-tuned-mbart-tl2en")

# -------------------
# Inference Section
# -------------------
def translate_text(text, model, tokenizer, src_lang="tl_XX", tgt_lang="en_XX", max_len=128):
    tokenizer.src_lang = src_lang
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_len).to("cuda")
    
    generated_tokens = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
        max_length=max_len
    )
    
    return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

model.eval()

test_sentences = [
    "Thank you!  Teka, di baliktad? Di ba dapat ako lilibre mo next year?",
    "Hindi ko makita ang susi ko.",
    "Ang daming trabaho ngayon."
]

print("\n📘 Translation Results:")
for sentence in test_sentences:
    translation = translate_text(sentence, model, tokenizer)
    print(f"TL: {sentence}\nEN: {translation}\n")


  scaler = GradScaler()
  with autocast():
Epoch 0: 100%|██████████| 25/25 [00:26<00:00,  1.07s/it, loss=10.8]
Epoch 1: 100%|██████████| 25/25 [00:27<00:00,  1.12s/it, loss=10.3]
Epoch 2: 100%|██████████| 25/25 [00:28<00:00,  1.13s/it, loss=10.1]
Epoch 3: 100%|██████████| 25/25 [00:27<00:00,  1.11s/it, loss=9.99]
Epoch 4: 100%|██████████| 25/25 [00:29<00:00,  1.19s/it, loss=9.94]



📘 Translation Results:
TL: Thank you!  Teka, di baliktad? Di ba dapat ako lilibre mo next year?
EN: Thank you! Teka, isn't that right? Can't you be happy next year?

TL: Hindi ko makita ang susi ko.
EN: I don't think it's too late for him.

TL: Ang daming trabaho ngayon.
EN: Ang daming can't help it.

