# C1. Imports & Configuration

In [1]:
!pip install -q sacremoses

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q evaluate sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import re
import gc
import pandas as pd
import torch
import evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments,
    set_seed,
)

# --- Configuration ---
MODEL_PATH = "/kaggle/input/models-for-dpc/pretrained_models/opus-mt-mul-en"
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
OUTPUT_DIR = "/kaggle/working/marian-mt-saved"

MAX_LENGTH = 160 
PREFIX = ""

set_seed(42)

2025-12-24 17:29:14.877037: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766597355.132639      17 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766597355.207678      17 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766597355.816664      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766597355.816708      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766597355.816711      17 computation_placer.cc:177] computation placer alr

# C2.Data Loading & Alignment

In [4]:
SUBSCRIPT_TRANS = str.maketrans({"₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9", "ₓ": "x"})

def normalize_subscripts(text: str) -> str:
    return text.translate(SUBSCRIPT_TRANS)

def clean_translit(text):
    """Normalize transliteration by stripping scribal marks and gaps."""
    if not isinstance(text, str):
        return ""
    text = normalize_subscripts(text)
    text = text.replace("…", " <big_gap> ")
    text = re.sub(r"\\.\\.\\.+", " <big_gap> ", text)
    text = re.sub(r"\[[^\]]*\]", " ", text)
    text = re.sub(r"<<[^>]*>>", " ", text)
    text = re.sub(r"[˹˺]", " ", text)
    text = re.sub(r"\([^)]*\)", " ", text)
    text = re.sub(r"\{([^}]*)\}", r"\1", text)
    text = re.sub(r"<([^>]*)>", r"\1", text)
    text = re.sub(r"[!?/:·]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def clean_translation(text):
    if not isinstance(text, str):
        return ""
    text = text.replace("…", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def filter_quality(df):
    df["src_len"] = df["src"].str.split().str.len()
    df["tgt_len"] = df["tgt"].str.split().str.len()
    df = df[(df["src_len"] >= 3) & (df["tgt_len"] >= 3)]
    ratio = (df["src_len"] / df["tgt_len"]).clip(upper=6)
    df = df[(ratio >= 0.2) & (ratio <= 5)]
    df = df.drop_duplicates(subset=["src", "tgt"])
    return df.drop(columns=["src_len", "tgt_len"])

def load_and_align_data(filepath):
    """
    Aligns Akkadian transliterations to English translations.
    """
    df = pd.read_csv(filepath)
    aligned_rows = []

    print(f"Raw documents: {len(df)}")

    for _, row in df.iterrows():
        src = clean_translit(row.get("transliteration", ""))
        tgt = clean_translation(row.get("translation", ""))

        src_lines = [s.strip() for s in src.split("\n") if len(s.strip()) > 1]
        tgt_sents = [t.strip() for t in re.split(r'(?<=[.!?])\s+', tgt) if len(t.strip()) > 1]

        if len(src_lines) == len(tgt_sents) and len(src_lines) > 1:
            for s, t in zip(src_lines, tgt_sents):
                aligned_rows.append({"src": s, "tgt": t})
        else:
            merged_src = src.replace("\n", " ")
            if len(merged_src) > 3 and len(tgt) > 3:
                aligned_rows.append({"src": merged_src, "tgt": tgt})

    print(f"Aligned training examples (pre-filter): {len(aligned_rows)}")
    out_df = filter_quality(pd.DataFrame(aligned_rows))
    print(f"Aligned training examples (post-filter): {len(out_df)}")
    return out_df

df = load_and_align_data(f"{DATA_DIR}/train.csv")
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.05, seed=42)

Raw documents: 1561
Aligned training examples (pre-filter): 1561
Aligned training examples (post-filter): 1529


# C3. Tokenization

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

def preprocess_function(examples):
    inputs = [ex for ex in examples["src"]]
    targets = examples["tgt"]

    model_inputs = tokenizer(
        inputs, 
        max_length=MAX_LENGTH, 
        truncation=True, 
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, 
            max_length=MAX_LENGTH, 
            truncation=True, 
            padding="max_length"
        )

    # Replace padding token id with -100
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]
    return model_inputs

# Apply processing
tokenized_train = dataset["train"].map(preprocess_function, batched=True)
tokenized_val = dataset["test"].map(preprocess_function, batched=True)

Map:   0%|          | 0/1452 [00:00<?, ? examples/s]



Map:   0%|          | 0/77 [00:00<?, ? examples/s]

# C4. Model Setup

In [6]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model,
    label_pad_token_id=-100
)

# C5. Training Configuration

In [7]:
# --- CORRECTED C5. Training Configuration (Disk Space Safe) ---
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    
    # --- DISK SPACE FIXES ---
    save_strategy="no",           # Do NOT save checkpoints during training
    eval_strategy="epoch",        # Evaluate every epoch
    load_best_model_at_end=False, # Must be False if we aren't saving checkpoints
    # ------------------------
    
    learning_rate=2e-5, 
    
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,

    logging_steps=10, #added by me
    
    num_train_epochs=15, #increased to 13 from 7
    weight_decay=0.01,
    predict_with_generate=True,
    
    fp16=True, # MarianMT is safe with fp16
    report_to="none",
    
    # Accuracy-focused tweaks
    label_smoothing_factor=0.1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.04,
    generation_max_length=180,
    generation_num_beams=5
)

# C6. Execution

In [8]:
torch.cuda.empty_cache()
gc.collect()

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("Starting MarianMT Training...")
trainer.train()

  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Starting MarianMT Training...




Epoch,Training Loss,Validation Loss
1,4.2999,4.151141
2,3.7533,3.669682
3,3.5603,3.487791
4,3.4512,3.384483
5,3.2478,3.31934
6,3.1949,3.265671
7,3.2355,3.23331
8,3.0619,3.209113
9,3.0524,3.183433
10,3.0361,3.170129


TrainOutput(global_step=1365, training_loss=3.5915619986397878, metrics={'train_runtime': 22057.9758, 'train_samples_per_second': 0.987, 'train_steps_per_second': 0.062, 'total_flos': 922882985164800.0, 'train_loss': 3.5915619986397878, 'epoch': 15.0})

In [9]:
# Validation BLEU/chrF on held-out split
metric_bleu = evaluate.load("sacrebleu")
metric_chrf = evaluate.load("chrf")

def dedup_repeats(text: str) -> str:
    toks = text.split()
    out = []
    for t in toks:
        if len(out) >= 2 and t == out[-1] == out[-2]:
            continue
        out.append(t)
    return " ".join(out)

def postprocess_text(preds):
    out = []
    for p in preds:
        p = p.strip()
        p = re.sub(r"\s+([.,!?;:])", r"\1", p)
        p = re.sub(r"([.,!?;:])([A-Za-z])", r"\1 \2", p)
        p = dedup_repeats(p)
        if p and p[0].islower():
            p = p[0].upper() + p[1:]
        if p and p[-1] not in ".!?":
            p += "."
        p = re.sub(r"([.!?]){2,}", ".", p)
        out.append(p.strip())
    return out

val_texts = dataset["test"]["src"]
val_refs = [[t] for t in dataset["test"]["tgt"]]

def generate_batch(texts):
    enc = tokenizer(texts, max_length=MAX_LENGTH, truncation=True, padding=True, return_tensors="pt").to(model.device)
    gen = model.generate(
        **enc,
        max_length=MAX_LENGTH,
        min_length=6,
        num_beams=6,
        no_repeat_ngram_size=3,
        length_penalty=1.05,
        early_stopping=True,
    )
    return tokenizer.batch_decode(gen, skip_special_tokens=True)

preds = []
for i in range(0, len(val_texts), 32):
    preds.extend(generate_batch(val_texts[i:i+32]))

preds = postprocess_text(preds)
bleu = metric_bleu.compute(predictions=preds, references=val_refs)
chrf = metric_chrf.compute(predictions=preds, references=val_refs)
print({"bleu": bleu["score"], "chrf": chrf["score"]})

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

{'bleu': 6.388023561794478, 'chrf': 26.541406083855307}


# C7. Save Model

In [10]:
print(f"Saving model to {OUTPUT_DIR}...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Notebook C (MarianMT) Complete.")

Saving model to /kaggle/working/marian-mt-saved...
Notebook C (MarianMT) Complete.
