In [None]:
# Print the NVIDIA driver/GPU status table so the accelerator available to this
# session is visible in the notebook output before any training starts.
!nvidia-smi

# A1. Install required libraries

In [None]:
!pip install -q evaluate sacrebleu

# A2. Imports & config

In [None]:
import os
import gc
import re
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
import evaluate

# A3. Set constants (DO NOT change yet)

In [None]:
# Kaggle input locations of the pretrained checkpoint and the competition data.
MODEL_PATH = "/kaggle/input/models-for-dpc/pretrained_models/byt5-base"
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
# Directory the fine-tuned model and tokenizer are written to at the end of the notebook.
OUTPUT_DIR = "./byt5-base-saved"

# ByT5 is character-based, so tokenized sequences are much longer than with subword models.
# MAX_LENGTH caps both the prefixed source and the target; anything longer is truncated
# during tokenization. (512 is a common upper limit for sentence-level text; 400 is used here.)
MAX_LENGTH = 400 
# Task prefix prepended to every source text before tokenization.
PREFIX = "translate Akkadian to English: "

# A4. Data Loading & Cleaning

In [None]:
def load_and_align_data(filepath):
    """
    Read a document-level CSV and build sentence-level training pairs.

    The CSV must provide ``transliteration`` and ``translation`` columns. For
    every document the transliteration is split on newlines and the translation
    on whitespace that follows ``.``, ``!`` or ``?``. When both sides produce
    exactly the same number of segments (and more than one), the segments are
    paired 1:1 and pairs in which either side has 3 characters or fewer are
    dropped. Otherwise the whole document is kept as one (noisier) pair.

    Documents whose transliteration or translation is missing or blank are
    skipped: converting a missing cell with ``str()`` would otherwise inject
    the literal text ``"nan"`` into the training data.

    Args:
        filepath: Path (or any input accepted by ``pd.read_csv``) of the CSV.

    Returns:
        pd.DataFrame with the columns ``transliteration`` and ``translation``.
        Both columns are present even when no example survives.
    """
    df = pd.read_csv(filepath)
    aligned_rows = []
    skipped = 0

    print(f"Raw documents: {len(df)}")

    for raw_src, raw_tgt in zip(df["transliteration"], df["translation"]):
        # Missing cells are read as NaN; skip them instead of stringifying to "nan".
        if pd.isna(raw_src) or pd.isna(raw_tgt):
            skipped += 1
            continue

        src = str(raw_src)
        tgt = str(raw_tgt)
        if not src.strip() or not tgt.strip():
            skipped += 1
            continue

        # Simple heuristic alignment
        # Split source by newlines (traditional for cuneiform transliteration)
        src_lines = [s.strip() for s in src.split("\n") if s.strip()]

        # Split target by sentence punctuation
        tgt_sents = [t.strip() for t in re.split(r'(?<=[.!?])\s+', tgt) if t.strip()]

        # Only an exact count match (with more than one segment) is trusted as a 1:1 mapping
        if len(src_lines) == len(tgt_sents) and len(src_lines) > 1:
            for s, t in zip(src_lines, tgt_sents):
                # Filter out very short garbage fragments
                if len(s) > 3 and len(t) > 3:
                    aligned_rows.append({"transliteration": s, "translation": t})
        else:
            # Fallback: Use the whole document if we can't align perfectly
            # This ensures we don't lose data, even if it's noisy
            aligned_rows.append({"transliteration": src, "translation": tgt})

    if skipped:
        print(f"Skipped documents with missing text: {skipped}")
    print(f"Aligned training examples: {len(aligned_rows)}")
    # Explicit columns keep the schema stable even when aligned_rows is empty.
    return pd.DataFrame(aligned_rows, columns=["transliteration", "translation"])

# Build the sentence-aligned training frame from the competition CSV.
train_df = load_and_align_data(f"{DATA_DIR}/train.csv")

# Wrap the frame as a HuggingFace Dataset and immediately hold out 5% for
# validation (fixed seed), keeping as much data as possible for training.
# `dataset` ends up with a "train" and a "test" split.
dataset = Dataset.from_pandas(train_df).train_test_split(test_size=0.05, seed=42)

# A5. Tokenization

In [None]:
# Load the tokenizer stored with the checkpoint at MODEL_PATH. The same tokenizer
# encodes both source and target text in preprocess_function and is saved next to
# the fine-tuned model at the end of the notebook.
print("Loading Tokenizer from:", MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

def preprocess_function(examples):
    """
    Tokenize a batch of transliteration/translation pairs for seq2seq training.

    PREFIX is prepended to every source text. Source and target are each
    truncated to MAX_LENGTH tokens. No padding is applied here: padding every
    example to MAX_LENGTH would force each batch to full width, whereas the
    DataCollatorForSeq2Seq used by the trainer pads each batch only to its
    longest member and fills the label padding with -100 (ignored by the loss).

    Args:
        examples: Batch mapping with the keys ``transliteration`` and
            ``translation``, each a list of strings (as supplied by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the sources, extended with a ``labels`` entry
        that holds the unpadded target token ids.
    """
    inputs = [PREFIX + doc for doc in examples["transliteration"]]
    targets = examples["translation"]

    # Variable-length encodings; the collator pads per batch.
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
    )

    labels = tokenizer(
        targets,
        max_length=MAX_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits with the same settings; the raw text columns are dropped
# so only the tokenizer outputs and labels remain.
tokenized_train, tokenized_val = (
    dataset[split_name].map(
        preprocess_function,
        batched=True,
        remove_columns=dataset[split_name].column_names,
    )
    for split_name in ("train", "test")
)

# A6. Model Setup

In [None]:
# Load the pretrained seq2seq checkpoint from MODEL_PATH.
print("Loading Model from:", MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

# Batch collator: pads the examples of each batch at collation time and uses
# -100 as the label padding value so padded positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(
    label_pad_token_id=-100,
    model=model,
    tokenizer=tokenizer,
)

# A7. Training Arguments

In [None]:
# Hyperparameters for the Seq2SeqTrainer run.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    report_to="none",

    # Optimisation schedule
    num_train_epochs=9,  # raised from 5
    learning_rate=3e-4,
    weight_decay=0.01,

    # Memory budget: micro-batches of 2 accumulated over 8 steps
    # (16 examples per optimizer step per device), gradient checkpointing on,
    # fp16 off.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    fp16=False,

    # Evaluate after every epoch to track progress, but never write intermediate
    # checkpoints (saves disk space). load_best_model_at_end must stay False
    # while save_strategy is "no".
    eval_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    predict_with_generate=True,
)

# A8. Trainer

In [None]:
# Ask PyTorch's CUDA allocator to use expandable segments.
# NOTE(review): this is set after torch has been imported; confirm the CUDA
# allocator has not been initialised yet at this point, otherwise the setting
# may need to move ahead of the torch import.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Wire model, hyperparameters, tokenized splits and collator into one trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# A9. Execution

In [None]:
# Free unreferenced Python objects first, then release cached-but-unused CUDA
# memory, so training starts with as much free GPU memory as possible.
gc.collect()
torch.cuda.empty_cache()

print("Starting Training with Memory Fixes...")
# Run the fine-tuning loop configured by training_args.
trainer.train()

# A10. Save Final Model

In [None]:
# Write the final model and the tokenizer into OUTPUT_DIR. No checkpoints are
# saved during training (save_strategy="no"), so this is the only persisted copy.
print(f"Saving model to {OUTPUT_DIR}...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Notebook A Complete.")