___
# 1. Download Dependencies

In [None]:
# Install the extra packages this notebook pulls in via pip.
# NOTE(review): `datasets`, `transformers` and `peft` are imported further down but
# are not installed by any active line here — presumably the runtime already
# provides them; confirm on a fresh kernel. No versions are pinned.
# %%capture installation
%pip install gdown
# %pip install transformers
# %pip install peft accelerate bitsandbytes triton
%pip install evaluate sacrebleu rouge_score
# !sudo apt install tree

In [None]:
# Fetch the three dataset splits from Google Drive by file id (one id per split;
# the matching share link is kept at the end of each line).
# NOTE(review): the dataset-loading cell expects PHOENIX-2014-T.{train,dev,test}.csv
# in the working directory — presumably these downloads produce those names; confirm.
# %%capture download_files
!gdown 1fij-ftlExnP9X7l0Z0EvJHjRLzO1689c # train  https://drive.google.com/file/d/1fij-ftlExnP9X7l0Z0EvJHjRLzO1689c/view?usp=drive_link
!gdown 1D8_p329vuN99_kPak-yZ9NUNKR8CS-hJ # val    https://drive.google.com/file/d/1D8_p329vuN99_kPak-yZ9NUNKR8CS-hJ/view?usp=drive_link
!gdown 1XUNXKCX2KxroPekMEo59St7sx-STQM1g # test   https://drive.google.com/file/d/1XUNXKCX2KxroPekMEo59St7sx-STQM1g/view?usp=drive_link

In [None]:
import gc
import os

import numpy as np
import evaluate
import torch
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    get_scheduler,
)

___
# 2. Configurations

In [None]:
# --- Model checkpoint ---------------------------------------------------------
# Alternative checkpoints kept for reference; exactly one MODEL_NAME must be active.
# MODEL_NAME: str = "facebook/mbart-large-50-many-to-many-mmt"
# SRC_LANG: str = "de_DE"  # German
# TGT_LANG: str = "vi_VN"  # Vietnamese
# MODEL_NAME: str = "Shahm/bart-german"
# MODEL_NAME: str = "Helsinki-NLP/opus-mt-de-de"
# MODEL_NAME: str = "Helsinki-NLP/opus-mt-de-vi"
MODEL_NAME: str = "vinai/bartpho-word"

# --- Reproducibility ----------------------------------------------------------
# DataLoader shuffling, LoRA weight initialisation and dropout all draw from the
# global torch RNG; seeding it here makes a Restart & Run All repeatable.
SEED: int = 42
torch.manual_seed(SEED)  # also seeds the CUDA generators
np.random.seed(SEED)

# --- Dataset (German gloss -> Vietnamese) -------------------------------------
MAX_INPUT_LENGTH: int = 128  # token cap for the source gloss sequence
MAX_TARGET_LENGTH: int = 128  # token cap for the target sentence (also used at generation)
BATCH_SIZE: int = 12  # per-step batch size; evaluation loaders use twice this

# --- LoRA ---------------------------------------------------------------------
LORA_R: int = 32  # adapter rank
LORA_ALPHA: int = 64  # scaling factor, set to 2x the rank
LORA_DROPOUT: float = 0.05
LORA_TARGET_MODULES: list[str] = [
    # attention projections
    "q_proj",
    "k_proj",
    "v_proj",
    "out_proj",
    # feed-forward layers
    "fc1",
    "fc2",
]

# --- Training -----------------------------------------------------------------
OUTPUT_DIR: str = "./gloss2vn_model"  # adapter, tokenizer and history are written here
GRADIENT_ACCUMULATION_STEPS: int = 4  # micro-batches accumulated per optimizer step
EPOCHS: int = 30
LEARNING_RATE: float = 5e-4
WARMUP_RATIO: float = 0.1  # fraction of all optimizer steps spent warming up
WEIGHT_DECAY: float = 0.01

# Optimization
# USE_GRADIENT_CHECKPOINTING = True

# Prefer the GPU when one is visible; everything else falls back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"CUDA devices available: {torch.cuda.device_count()}")
device

___
# 3. Dataset Loading and Preprocessing

In [None]:
# Map each split name to its pipe-delimited CSV file in the working directory.
split_files = {
    "train": "PHOENIX-2014-T.train.csv",
    "val": "PHOENIX-2014-T.dev.csv",
    "test": "PHOENIX-2014-T.test.csv",
}

dataset = load_dataset("csv", data_files=split_files, delimiter="|")
dataset

In [None]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# tokenizer.src_lang = SRC_LANG
# tokenizer.tgt_lang = TGT_LANG

# Padding needs a pad token; reuse EOS when the checkpoint does not define one.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token


def _normalize_text(text):
    """Collapse every whitespace run to one space, then trim spaces/periods at both ends."""
    # split()/join collapses runs of any length; a single replace("  ", " ")
    # pass would leave double spaces behind wherever three or more occurred.
    return " ".join(text.split()).strip(" .")


def preprocess_function(examples):
    """Tokenize a batch of gloss/sentence pairs for seq2seq training.

    Args:
        examples: Batched mapping with the columns ``"geGloss"`` (source gloss
            strings) and ``"viSentence"`` (target sentence strings).

    Returns:
        The tokenizer output for the cleaned glosses (``input_ids`` and
        ``attention_mask``) with an extra ``"labels"`` entry holding the token
        ids of the cleaned target sentences. Nothing is padded here; padding is
        left to the data collator.
    """
    inputs = [_normalize_text(gloss) for gloss in examples["geGloss"]]
    targets = [_normalize_text(sentence) for sentence in examples["viSentence"]]

    # Source side: truncate to the configured length, no padding yet.
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding=False,  # Let data collator handle padding
        add_special_tokens=True,
        return_attention_mask=True,
    )

    # Target side: `text_target` makes the tokenizer encode these as labels.
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding=False,
        add_special_tokens=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Tokenize every split and drop the raw text columns so only model inputs remain.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing datasets",
)

# The collator pads each batch dynamically. Its `model` argument expects a model
# object (used to pre-build decoder inputs), not a checkpoint name; a string is
# silently ignored, so it is omitted here and the model derives its decoder
# inputs from `labels` in its own forward pass.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    label_pad_token_id=-100,  # ignored by the loss
    pad_to_multiple_of=8,  # Optimize for tensor cores
    return_tensors="pt",
)

# Pinned host memory only helps host->GPU copies and triggers a warning on
# CPU-only machines, so enable it only when CUDA is present.
use_pinned_memory = torch.cuda.is_available()

# Training batches are shuffled; evaluation batches keep dataset order and use
# a larger batch size since no gradients are stored.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=BATCH_SIZE,
    pin_memory=use_pinned_memory,
)
eval_dataloader = DataLoader(
    tokenized_datasets["val"],
    collate_fn=data_collator,
    batch_size=BATCH_SIZE * 2,
    pin_memory=use_pinned_memory,
)
test_dataloader = DataLoader(
    tokenized_datasets["test"],
    collate_fn=data_collator,
    batch_size=BATCH_SIZE * 2,
    pin_memory=use_pinned_memory,
)

___
# 4. Model Loader

In [None]:
# Load the pretrained seq2seq checkpoint and place it on the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, use_safetensors=True).to(device)

# Adapter settings come straight from the configuration cell.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=LORA_TARGET_MODULES,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    inference_mode=False,
)

# Wrap the base model so that only the injected adapters are trained.
model = get_peft_model(model, lora_config)

# Explicitly mark every adapter weight as trainable.
for name, param in model.named_parameters():
    if "lora_" not in name:
        continue
    param.requires_grad_(True)

model.train()
model.print_trainable_parameters()

In [None]:
# Only parameters that still require gradients are handed to the optimizer.
trainable_params = []
for candidate in model.parameters():
    if candidate.requires_grad:
        trainable_params.append(candidate)

adamw_settings = {
    "lr": LEARNING_RATE,
    "weight_decay": WEIGHT_DECAY,
    "betas": (0.9, 0.98),
    "eps": 1e-6,
}
optimizer = torch.optim.AdamW(trainable_params, **adamw_settings)

# Schedule length is counted in optimizer updates (full accumulation windows),
# not in micro-batches.
num_update_steps_per_epoch = len(train_dataloader) // GRADIENT_ACCUMULATION_STEPS
num_training_steps = num_update_steps_per_epoch * EPOCHS
num_warmup_steps = int(WARMUP_RATIO * num_training_steps)

# Cosine decay after the warmup phase.
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)
f"Training steps: {num_training_steps}, Warmup: {num_warmup_steps}"

___
# 5. Metrics Definition

In [None]:
# Load metrics
bleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")
chrf_metric = evaluate.load("chrf")


def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape references for the metrics.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(predictions, references)`` pair: predictions as a flat list of
        stripped strings, references as a list of one-element lists (one
        reference per prediction).
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return stripped_preds, wrapped_refs


def evaluate_model(model, dataloader):
    """Generate translations for every batch and score them against the references.

    Args:
        model: Seq2seq model exposing ``generate``; it is switched to eval mode
            for the duration of the call and restored to its previous mode
            afterwards.
        dataloader: Yields dict batches containing ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        Dict with ``"bleu"``, ``"rougeL"`` and ``"chrf"`` scores plus the first
        prediction/reference pair (``"sample_pred"``, ``"sample_label"``).
        When the dataloader yields nothing, all scores are ``0.0`` and the
        samples are empty strings.
    """
    # Remember the caller's mode so a model that was in eval mode (e.g. a
    # reloaded checkpoint) is not silently flipped into training mode.
    was_training = model.training
    model.eval()
    all_preds = []
    all_labels = []

    try:
        with torch.no_grad():
            for batch in tqdm(dataloader, desc="Evaluating", leave=False):
                batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}

                # Deterministic beam search with repetition controls.
                generated_tokens = model.generate(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    max_length=MAX_TARGET_LENGTH,
                    min_length=10,  # Ensure minimum translation length
                    num_beams=6,  # More beams for better search
                    length_penalty=1.2,  # Encourage longer, more complete translations
                    early_stopping=True,
                    do_sample=False,
                    repetition_penalty=1.1,  # Reduce repetition
                    no_repeat_ngram_size=3,  # Prevent 3-gram repetition
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    # forced_bos_token_id=tokenizer.lang_code_to_id[TGT_LANG], # BART mnt-mnt
                )

                # -100 marks padded label positions; swap it for the pad id so
                # the tokenizer can decode the references.
                label_ids = batch["labels"].cpu().numpy()
                label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

                decoded_preds = tokenizer.batch_decode(
                    generated_tokens, skip_special_tokens=True
                )
                decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

                post_preds, post_labels = postprocess_text(decoded_preds, decoded_labels)
                all_preds.extend(post_preds)
                all_labels.extend(post_labels)
    finally:
        model.train(was_training)

    # The metric implementations cannot score an empty prediction list.
    if not all_preds:
        return {
            "bleu": 0.0,
            "rougeL": 0.0,
            "chrf": 0.0,
            "sample_pred": "",
            "sample_label": "",
        }

    bleu = bleu_metric.compute(predictions=all_preds, references=all_labels)
    # The default ROUGE tokenizer keeps only [a-z0-9], which discards Vietnamese
    # letters carrying diacritics; tokenize on whitespace instead (lower-cased,
    # as the default tokenizer does).
    rouge = rouge_metric.compute(
        predictions=all_preds,
        references=all_labels,
        tokenizer=lambda text: text.lower().split(),
    )
    chrf = chrf_metric.compute(predictions=all_preds, references=all_labels)

    return {
        "bleu": bleu["score"],
        "rougeL": rouge["rougeL"],
        "chrf": chrf["score"],
        "sample_pred": all_preds[0],
        "sample_label": all_labels[0][0],
    }

___
# 6. Train

In [None]:
# --- SIMPLIFIED TRAINING LOOP FOR GERMAN GLOSS → VIETNAMESE ---
# Each epoch: accumulate gradients over GRADIENT_ACCUMULATION_STEPS micro-batches,
# evaluate on the validation loader, record the metrics, and keep the adapter
# with the best validation BLEU in OUTPUT_DIR.
import json
import time

best_bleu = float("-inf")
best_loss = float("inf")
os.makedirs(OUTPUT_DIR, exist_ok=True)
history_path = os.path.join(OUTPUT_DIR, "training_history.json")

# Training history
training_history = []
num_batches = len(train_dataloader)

print(f"Starting training: {EPOCHS} epochs, {len(train_dataloader)} batches")

for epoch in range(EPOCHS):
    epoch_start = time.time()
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{EPOCHS}")

    for step, batch in enumerate(progress_bar):
        batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}

        # Scale the loss so the accumulated gradient matches one large batch.
        outputs = model(**batch)
        loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
        loss.backward()

        window_complete = (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0
        last_batch = (step + 1) == num_batches
        if window_complete or last_batch:
            # Also flush on the final batch: otherwise the gradients of a
            # trailing partial window would never be applied and would leak
            # into the first update of the next epoch.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            if window_complete:
                # The schedule length counts only complete windows, so the
                # flush of a partial window must not advance it.
                lr_scheduler.step()
            optimizer.zero_grad()

        # Undo the scaling so the reported loss is per micro-batch.
        total_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS

        # Update progress bar with current average loss
        current_avg_loss = total_loss / (step + 1)
        progress_bar.set_postfix({"avg_loss": f"{current_avg_loss:.4f}"})

    # Epoch summary
    epoch_time = time.time() - epoch_start
    avg_loss = total_loss / len(train_dataloader)

    # Evaluate model
    metrics = evaluate_model(model, eval_dataloader)
    print(
        f"🎯Epoch {epoch + 1} - Metrics: BLEU={metrics['bleu']:.2f}, ROUGE-L={metrics['rougeL']:.2f}, chrF={metrics['chrf']:.2f}"
    )

    # Save training history
    training_history.append(
        {
            "epoch": epoch + 1,
            "train_loss": avg_loss,
            "bleu": metrics["bleu"],
            "rouge": metrics["rougeL"],
            "chrf": metrics["chrf"],
            "time": epoch_time,
        }
    )
    # Persist after every epoch so the file also covers epochs that did not
    # improve on the best BLEU.
    with open(history_path, "w", encoding="utf-8") as f:
        json.dump(training_history, f, indent=2)

    # Save best model
    if metrics["bleu"] > best_bleu:
        best_bleu = metrics["bleu"]

        model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)
        print(f"New best BLEU: {best_bleu:.2f} - Model saved")

    torch.cuda.empty_cache()

print(f"Training completed! Best BLEU: {best_bleu:.2f}")
print(f"Model saved to: {OUTPUT_DIR}")

In [None]:
# Load the best model from training
from peft import PeftModel

best_model_path = OUTPUT_DIR
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, use_safetensors=True)
final_model = PeftModel.from_pretrained(base_model, best_model_path).to(device)
final_tokenizer = AutoTokenizer.from_pretrained(best_model_path)

print("Evaluating best model on test set...")
test_metrics = evaluate_model(final_model, test_dataloader)
print(f"Test Results - BLEU: {test_metrics['bleu']:.3f}, ROUGE-L: {test_metrics['rougeL']:.3f}, chrF: {test_metrics['chrf']:.3f}")
print(f"Model saved to: {best_model_path}")
# !tree -h {best_model_path}
!ls

___
___
___