In [None]:
# Show which GPU(s) this session can see before any work starts.
!nvidia-smi

In [None]:
!pip install -q evaluate sacrebleu

# B1. Imports & Configuration

In [None]:
import os
import re
import gc
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments
)

# --- Configuration ---
# Pretrained T5-base checkpoint (Kaggle input dataset).
MODEL_PATH = "/kaggle/input/models-for-dpc/pretrained_models/t5-base"
# Competition data directory; train.csv is read from here.
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
# Destination for the fine-tuned model and tokenizer files.
OUTPUT_DIR = "./t5-base-fine-tuned"

# Maximum number of tokens kept per source and per target sequence; anything
# longer is truncated. T5 uses subwords, so sequences are shorter than ByT5,
# and 256 is usually sufficient for sentence-level Akkadian.
MAX_LENGTH = 256

# T5 is a multi-task model, so we must provide a task prefix.
PREFIX = "translate Akkadian to English: "

# B2. Data Loading & Alignment

In [None]:
def load_and_align_data(filepath):
    """
    Build (source, target) training pairs from a CSV of documents.

    Each row is expected to carry a "transliteration" (Akkadian) and a
    "translation" (English) column. The transliteration is split on newlines
    and the translation after sentence-final punctuation (. ! ?). When both
    sides yield the same number of pieces (and more than one), the pieces are
    paired 1:1; otherwise the whole document is kept as a single example with
    newlines in the source replaced by spaces.

    Rows whose transliteration or translation is missing or blank are skipped,
    so the literal text "nan" never ends up in the training data.

    Args:
        filepath: Path to a CSV file with "transliteration" and "translation"
            columns.

    Returns:
        pandas.DataFrame with columns "src" and "tgt" (present even when no
        examples were produced).
    """
    df = pd.read_csv(filepath)
    aligned_rows = []

    print(f"Raw documents: {len(df)}")

    # pandas reads empty cells as NaN, and str(NaN) would become the text "nan".
    df = df.dropna(subset=["transliteration", "translation"])

    # Iterating the two columns directly avoids the per-row Series that
    # DataFrame.iterrows() builds.
    for raw_src, raw_tgt in zip(df["transliteration"], df["translation"]):
        src = str(raw_src).strip()
        tgt = str(raw_tgt).strip()

        # A pair with nothing on one side cannot teach the model anything.
        if not src or not tgt:
            continue

        # Split source by newlines (standard cuneiform formatting)
        src_lines = [s.strip() for s in src.split("\n") if len(s.strip()) > 1]

        # Split target by sentence punctuation (. ! ?)
        tgt_sents = [
            t.strip()
            for t in re.split(r'(?<=[.!?])\s+', tgt)
            if len(t.strip()) > 1
        ]

        # Heuristic: If line counts match, assume 1:1 alignment
        if len(src_lines) == len(tgt_sents) and len(src_lines) > 1:
            aligned_rows.extend(
                {"src": s, "tgt": t} for s, t in zip(src_lines, tgt_sents)
            )
        else:
            # Fallback: Use the whole document to avoid data loss
            aligned_rows.append({"src": src.replace("\n", " "), "tgt": tgt})

    print(f"Aligned training examples: {len(aligned_rows)}")
    # Explicit columns keep the schema stable when aligned_rows is empty.
    return pd.DataFrame(aligned_rows, columns=["src", "tgt"])

# Read the training CSV and turn it into aligned (src, tgt) examples.
df = load_and_align_data(f"{DATA_DIR}/train.csv")

# Wrap the frame in a Dataset and hold out 5% for validation; the fixed seed
# keeps the split identical between runs.
dataset = Dataset.from_pandas(df).train_test_split(seed=42, test_size=0.05)
print("Data loaded successfully.")

# B3. Tokenization

In [None]:
# Load the tokenizer from the same location as the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

def preprocess_function(examples):
    """
    Tokenize a batch of examples for seq2seq training.

    The task prefix is prepended to every source string. Sources and targets
    are truncated to MAX_LENGTH tokens but deliberately NOT padded here:
    padding every example to MAX_LENGTH at map time makes each batch as wide
    as the longest allowed sequence. Padding is left to the batch collator,
    which pads each batch only to its own longest sequence and fills padded
    label positions with the ignore index.

    Args:
        examples: Batch mapping with a "src" list (source strings) and a
            "tgt" list (target strings), as passed by Dataset.map with
            batched=True.

    Returns:
        The tokenizer output with "input_ids", "attention_mask" and "labels"
        (token ids of the targets), all unpadded.
    """
    # Add prefix to inputs
    inputs = [PREFIX + doc for doc in examples["src"]]

    # text_target tokenizes the targets in the same call and stores their ids
    # under "labels"; max_length/truncation apply to both sides.
    model_inputs = tokenizer(
        inputs,
        text_target=examples["tgt"],
        max_length=MAX_LENGTH,
        truncation=True,
    )
    return model_inputs

# Tokenize both splits with the same batched preprocessing step.
tokenized_train, tokenized_val = (
    dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ("train", "test")
)

# B4. Model Setup

In [None]:
# Load the pretrained seq2seq checkpoint that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

# Batch collator for the trainer; -100 is the value it uses when padding
# label sequences.
data_collator = DataCollatorForSeq2Seq(
    label_pad_token_id=-100,
    model=model,
    tokenizer=tokenizer,
)

# B5. Training Configuration

In [None]:
# --- B5. Training Configuration (disk-space safe) ---
# Pick the mixed-precision mode from the hardware that is actually present:
#   * bf16 on GPUs with compute capability >= 8.0 (Ampere or newer). T5
#     checkpoints are known to overflow (NaN/inf loss) under fp16; bf16 keeps
#     the fp32 exponent range and avoids that.
#   * fp16 on older CUDA GPUs (unchanged from the previous setting).
#   * full precision when no CUDA device is available, where requesting fp16
#     is expected to fail.
_has_cuda = torch.cuda.is_available()
_use_bf16 = _has_cuda and torch.cuda.get_device_capability()[0] >= 8
_use_fp16 = _has_cuda and not _use_bf16

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,

    # --- DISK SPACE FIXES ---
    save_strategy="no",           # Do NOT save checkpoints during training
    eval_strategy="epoch",        # Evaluate every epoch
    load_best_model_at_end=False, # Must be False if we aren't saving checkpoints
    # ------------------------

    learning_rate=2e-4,

    # Effective batch size per device: 4 * 4 accumulation steps = 16.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,

    num_train_epochs=7,
    weight_decay=0.01,
    predict_with_generate=True,

    bf16=_use_bf16,
    fp16=_use_fp16,
    report_to="none"
)

# B6. Execution

In [None]:
# Free memory before training. Collect garbage first so GPU tensors held by
# unreachable Python objects are released, then return the now-unused cached
# blocks to the device; the reverse order leaves that memory cached.
gc.collect()
torch.cuda.empty_cache()

# Wire the model, arguments, tokenized splits and collator into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("Starting T5-Base Training...")
trainer.train()

# B7. Save Model

In [None]:
# Write the fine-tuned model and the tokenizer into the same output directory.
print(f"Saving model to {OUTPUT_DIR}...")
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(OUTPUT_DIR)
print("Notebook B (T5) Complete.")