# C1. Imports & Configuration

In [None]:
pip install sacremoses

In [1]:
import os
import re
import gc
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments
)

# --- Configuration ---
# Pretrained opus-mt-mul-en checkpoint, read from a Kaggle input dataset.
MODEL_PATH = "/kaggle/input/models-for-dpc/pretrained_models/opus-mt-mul-en"
# Directory from which train.csv is loaded.
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
# Output directory for training and for the final saved model and tokenizer.
OUTPUT_DIR = "./marian-mt-saved"

# MarianMT is optimized for sentence-level translation.
# Token cap used for both source and target at tokenization time. 128 tokens is
# usually plenty for a single Akkadian sentence; 160 leaves some headroom.
MAX_LENGTH = 160 

# MarianMT does not strictly require a prefix, but adding one can help alignment.
# We will use an empty prefix here as the model is already aligned for mul->en.
PREFIX = ""

2025-12-20 17:21:00.834252: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766251261.017805      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766251261.070683      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766251261.512574      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766251261.512610      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766251261.512613      55 computation_placer.cc:177] computation placer alr

# C2. Data Loading & Alignment

In [2]:
def load_and_align_data(filepath):
    """
    Build sentence-level (src, tgt) training pairs from a parallel-text CSV.

    The CSV must contain a "transliteration" column (source text, one line per
    sentence) and a "translation" column (English text). For each document the
    source is split on newlines and the target on sentence-final punctuation
    (., !, ?). When both sides yield the same number of segments (and more than
    one), the segments are paired one-to-one; otherwise the whole document is
    kept as a single pair with source newlines flattened to spaces.

    Rows whose source or target is missing (NaN) or blank are skipped, so that
    the literal text "nan" never ends up in the training data.

    Args:
        filepath: Path to the CSV file.

    Returns:
        pandas.DataFrame with string columns "src" and "tgt". The columns are
        present even when no usable rows were found.
    """
    df = pd.read_csv(filepath)
    aligned_rows = []
    skipped = 0

    print(f"Raw documents: {len(df)}")

    for raw_src, raw_tgt in zip(df["transliteration"], df["translation"]):
        # Empty CSV cells are read as NaN; str(NaN) would yield the text "nan".
        if pd.isna(raw_src) or pd.isna(raw_tgt):
            skipped += 1
            continue

        src = str(raw_src).strip()
        tgt = str(raw_tgt).strip()
        if not src or not tgt:
            skipped += 1
            continue

        # Split source by newlines (fragments of a single character are dropped)
        src_lines = [s.strip() for s in src.split("\n") if len(s.strip()) > 1]

        # Split target by sentence punctuation, keeping the punctuation attached
        tgt_sents = [t.strip() for t in re.split(r'(?<=[.!?])\s+', tgt) if len(t.strip()) > 1]

        if len(src_lines) == len(tgt_sents) and len(src_lines) > 1:
            aligned_rows.extend(
                {"src": s, "tgt": t} for s, t in zip(src_lines, tgt_sents)
            )
        else:
            # Segment counts disagree (or there is only one): keep the document whole.
            aligned_rows.append({"src": src.replace("\n", " "), "tgt": tgt})

    if skipped:
        print(f"Skipped rows with missing text: {skipped}")
    print(f"Aligned training examples: {len(aligned_rows)}")
    # Explicit columns keep the schema stable even if aligned_rows is empty.
    return pd.DataFrame(aligned_rows, columns=["src", "tgt"])

# Build the sentence-pair table, then hold out 5% of it with a fixed seed.
df = load_and_align_data(f"{DATA_DIR}/train.csv")
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.05, seed=42)

Raw documents: 1561
Aligned training examples: 1561


# C3. Tokenization

In [3]:
# Load the tokenizer from the same local directory as the pretrained model (MODEL_PATH).
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

def preprocess_function(examples):
    """
    Tokenize a batch of source/target pairs for seq2seq training.

    Args:
        examples: Batch mapping with "src" and "tgt" keys, each a list of
            strings (the form `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the sources with an added "labels" entry
        holding the target token ids. Both sides are truncated and padded to
        MAX_LENGTH; pad positions in the labels are replaced by -100.
    """
    # Apply the configured task prefix (a no-op while PREFIX is empty).
    inputs = [PREFIX + text for text in examples["src"]]
    targets = examples["tgt"]

    # `text_target` tokenizes the targets in target mode in the same call; it
    # replaces the deprecated `as_target_tokenizer()` context manager.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # Replace padding token id with -100
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in label]
        for label in model_inputs["labels"]
    ]
    return model_inputs

# Tokenize the train split, then the held-out split, with the same batched function.
tokenized_train, tokenized_val = (
    dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ("train", "test")
)



Map:   0%|          | 0/1482 [00:00<?, ? examples/s]



Map:   0%|          | 0/79 [00:00<?, ? examples/s]

# C4. Model Setup

In [4]:
# Load the pretrained seq2seq model from the local checkpoint directory.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

# Batch collator for training. Label padding uses -100, the same value that
# preprocess_function substitutes for pad tokens in the labels.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model,
    label_pad_token_id=-100
)

# C5. Training Configuration

In [None]:
# --- C5. Training Configuration (disk-space safe) ---
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,

    # --- Disk-space settings ---
    save_strategy="no",           # Do NOT save checkpoints during training
    eval_strategy="epoch",        # Evaluate every epoch
    load_best_model_at_end=False, # Must be False if we aren't saving checkpoints
    # ---------------------------

    learning_rate=2e-5,

    # 8 sequences per device x 2 accumulation steps = 16 per optimizer step.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,

    logging_steps=10,             # Log the training loss every 10 steps

    num_train_epochs=13,
    weight_decay=0.01,
    predict_with_generate=True,

    # Mixed precision needs a CUDA device; requesting fp16 on a CPU-only
    # machine raises at construction time, so fall back to fp32 there.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# C6. Execution

In [None]:
# Collect unreachable Python objects first so that any GPU memory they held
# can then be released from the CUDA caching allocator.
gc.collect()
torch.cuda.empty_cache()

# `processing_class` replaces the deprecated `tokenizer` keyword of the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=data_collator,
)

print("Starting MarianMT Training...")
trainer.train()

  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Starting MarianMT Training...


Epoch,Training Loss,Validation Loss
1,No log,2.983993
2,No log,2.58856
3,No log,2.435226


# C7. Save Model

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in OUTPUT_DIR.
print(f"Saving model to {OUTPUT_DIR}...")
trainer.save_model(OUTPUT_DIR)
# Written explicitly so the tokenizer files end up next to the model weights.
tokenizer.save_pretrained(OUTPUT_DIR)
print("Notebook C (MarianMT) Complete.")