# C1. Imports & Configuration

In [1]:
!pip install -q sacremoses

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h

In [2]:
!pip install -q evaluate sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import re
import gc
import pandas as pd
import torch
import evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments,
    set_seed,
)

# Memory safety tweaks
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
try:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = False
    torch.set_float32_matmul_precision("medium")
except Exception:
    pass

# --- Configuration ---
MODEL_PATH = "/kaggle/input/models-for-dpc/pretrained_models/opus-mt-mul-en"
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
OUTPUT_DIR = "/kaggle/working/marian-mt-saved"

MAX_LENGTH = 160 
PREFIX = ">>eng<< "  # CRITICAL: MarianMT requires target language prefix

set_seed(42)

2026-01-08 11:14:41.317325: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767870881.746312      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767870881.898369      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767870882.934256      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767870882.934298      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767870882.934301      55 computation_placer.cc:177] computation placer alr

# C2.Data Loading & Alignment

In [4]:
SUBSCRIPT_TRANS = str.maketrans({"₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9", "ₓ": "x"})


def normalize_subscripts(text: str) -> str:

    return text.translate(SUBSCRIPT_TRANS)



def replace_gaps(text):

    """Replace various gap notations with standardized tokens"""

    if pd.isna(text): 

        return text

    

    # Complex gap patterns (order matters)

    text = re.sub(r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+\s+\.{3}(?:\s+\.{3})+', '<big_gap>', text)

    text = re.sub(r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+', '<big_gap>', text)

    text = re.sub(r'\.{3}(?:\s+\.{3})+', '<big_gap>', text)



    # Simple gap patterns

    text = re.sub(r'xx', '<gap>', text)

    text = re.sub(r' x ', ' <gap> ', text)

    text = re.sub(r'……', '<big_gap>', text)

    text = re.sub(r'\.\.\.\.\.\.', '<big_gap>', text)

    text = re.sub(r'…', '<big_gap>', text)

    text = re.sub(r'\.\.\.', '<big_gap>', text)



    return text



def replace_gaps_back(text):

    """Convert standardized gap tokens back to original format"""

    if pd.isna(text):  

        return text

    

    text = re.sub(r'<gap>', 'x', text)

    text = re.sub(r'<big_gap>', '...', text)



    return text



def clean_translit(text):

    """Normalize transliteration by stripping scribal marks and gaps."""

    if not isinstance(text, str):

        return ""

    text = normalize_subscripts(text)

    # Apply gap replacement first

    text = replace_gaps(text)

    text = re.sub(r"\[[^\]]*\]", " ", text)

    text = re.sub(r"<<[^>]*>>", " ", text)

    text = re.sub(r"[˹˺]", " ", text)

    text = re.sub(r"\([^)]*\)", " ", text)

    text = re.sub(r"\{([^}]*)\}", r"\1", text)

    text = re.sub(r"<([^>]*)>", r"\1", text)

    text = re.sub(r"[!?/:·]", " ", text)

    text = re.sub(r"\s+", " ", text)

    return text.strip()



def clean_translation(text):

    if not isinstance(text, str):

        return ""

    text = text.replace("…", " ")

    text = re.sub(r"\s+", " ", text)

    return text.strip()



def filter_quality(df):

    df["src_len"] = df["src"].str.split().str.len()

    df["tgt_len"] = df["tgt"].str.split().str.len()

    df = df[(df["src_len"] >= 3) & (df["tgt_len"] >= 3)]

    ratio = (df["src_len"] / df["tgt_len"]).clip(upper=6)

    df = df[(ratio >= 0.2) & (ratio <= 5)]

    df = df.drop_duplicates(subset=["src", "tgt"])

    return df.drop(columns=["src_len", "tgt_len"])



def load_and_align_data(filepath):

    """

    Aligns Akkadian transliterations to English translations.

    """

    df = pd.read_csv(filepath)

    aligned_rows = []



    print(f"Raw documents: {len(df)}")



    for _, row in df.iterrows():

        src = clean_translit(row.get("transliteration", ""))

        tgt = clean_translation(row.get("translation", ""))



        src_lines = [s.strip() for s in src.split("\n") if len(s.strip()) > 1]

        tgt_sents = [t.strip() for t in re.split(r'(?<=[.!?])\s+', tgt) if len(t.strip()) > 1]



        if len(src_lines) == len(tgt_sents) and len(src_lines) > 1:

            for s, t in zip(src_lines, tgt_sents):

                aligned_rows.append({"src": s, "tgt": t})

        else:

            merged_src = src.replace("\n", " ")

            if len(merged_src) > 3 and len(tgt) > 3:

                aligned_rows.append({"src": merged_src, "tgt": tgt})



    print(f"Aligned training examples (pre-filter): {len(aligned_rows)}")

    out_df = filter_quality(pd.DataFrame(aligned_rows))

    print(f"Aligned training examples (post-filter): {len(out_df)}")

    return out_df

In [5]:
# Broad Search miner and main dataset assembly

from tqdm.auto import tqdm



def mine_publications_data():

    print("\n" + "="*60)

    print("MINING PUBLICATIONS FOR ADDITIONAL DATA (BROAD MODE)")

    print("="*60)



    pub_path = f"{DATA_DIR}/publications.csv"

    pub_texts_path = f"{DATA_DIR}/published_texts.csv"



    print(f"Looking for: {pub_path}")

    if not os.path.exists(pub_path):

        print(f"❌ Error: File not found at {pub_path}")

        return pd.DataFrame(columns=["src", "tgt"])



    pubs = pd.read_csv(pub_path)

    pub_texts = pd.read_csv(pub_texts_path)



    akkadian_mask = pubs['has_akkadian'].astype(str).str.lower() == 'true'
    eng_mask = pubs['page_text'].astype(str).str.contains(r'\b(the|and|that|with)\b', case=False)

    pubs = pubs[eng_mask].copy()

    print(f"Searching {len(pubs)} relevant publication pages...")



    augmented_rows = []

    candidates = pub_texts.dropna(subset=['cdli_id']).head(3000)



    for _, row in tqdm(candidates.iterrows(), total=len(candidates)):

        cdli_ids = str(row['cdli_id']).split('|')

        translit = clean_translit(str(row.get('transliteration', '')))

        if len(translit.split()) < 3:

            continue

        for pid in cdli_ids:

            pid = pid.strip()

            if len(pid) < 4:

                continue

            matches = pubs[pubs['page_text'].astype(str).str.contains(pid, regex=False)]

            if matches.empty:

                continue

            content = str(matches.iloc[0]['page_text'])

            idx = content.find(pid)

            snippet = content[idx:idx+1000] if idx != -1 else content[:1000]

            potential_trans = re.findall(r'([A-Z][a-z\s\-,;]{20,300}[\.\!\?])', snippet)

            for sent in potential_trans:

                if len(sent.split()) > 5 and "Assyrian" not in sent:

                    augmented_rows.append({"src": translit, "tgt": sent.strip()})

                    break



    if not augmented_rows:

        print("⚠️ Warning: Still found 0 pairs. Check regex or data.")

        return pd.DataFrame(columns=["src", "tgt"])



    result_df = pd.DataFrame(augmented_rows)

    result_df = result_df.drop_duplicates(subset=['src'])

    print(f"✓ SUCCESS: Mined {len(result_df)} additional training pairs!")

    return filter_quality(result_df)



# Main execution

train_df = load_and_align_data(f"{DATA_DIR}/train.csv")

mined_df = mine_publications_data()



if len(mined_df) > 0:

    print(f"Merging {len(mined_df)} mined examples...")

    train_df = pd.concat([train_df, mined_df], ignore_index=True)



dataset = Dataset.from_pandas(train_df)

dataset = dataset.train_test_split(test_size=0.05, seed=42)

Raw documents: 1561
Aligned training examples (pre-filter): 1561
Aligned training examples (post-filter): 1529

MINING PUBLICATIONS FOR ADDITIONAL DATA (BROAD MODE)
Looking for: /kaggle/input/deep-past-initiative-machine-translation/publications.csv


  eng_mask = pubs['page_text'].astype(str).str.contains(r'\b(the|and|that|with)\b', case=False)


Searching 144614 relevant publication pages...


  0%|          | 0/3000 [00:00<?, ?it/s]



In [6]:
# Quick data stats after mining and merge

sup_count_est = len(train_df) - (len(mined_df) if isinstance(mined_df, pd.DataFrame) else 0)

print("\n=== DATASET COUNTS ===")

print(f"Supervised pairs (est.): {sup_count_est}")

print(f"Mined pairs: {len(mined_df) if isinstance(mined_df, pd.DataFrame) else 0}")

print(f"Total pairs: {len(train_df)}")


=== DATASET COUNTS ===
Supervised pairs (est.): 1529
Mined pairs: 0
Total pairs: 1529


# C3. Tokenization

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

def preprocess_function(examples):
    # Add prefix for MarianMT to specify target language
    inputs = [PREFIX + ex for ex in examples["src"]]
    targets = examples["tgt"]

    model_inputs = tokenizer(
        inputs, 
        max_length=MAX_LENGTH, 
        truncation=True, 
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, 
            max_length=MAX_LENGTH, 
            truncation=True, 
            padding="max_length"
        )

    # Replace padding token id with -100
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]
    return model_inputs

# Apply processing
tokenized_train = dataset["train"].map(preprocess_function, batched=True)
tokenized_val = dataset["test"].map(preprocess_function, batched=True)

Map:   0%|          | 0/1452 [00:00<?, ? examples/s]



Map:   0%|          | 0/77 [00:00<?, ? examples/s]

# C4. Model Setup

In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model,
    label_pad_token_id=-100
)

# C5. Training Configuration

In [9]:
# --- C5. Training Configuration (Optimized for 31+ Score) ---
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    
    # --- DISK SPACE & SPEED ---
    save_strategy="no",           # No checkpoints to save disk space
    eval_strategy="no",           # Skip eval for faster training
    load_best_model_at_end=False,
    
    learning_rate=3e-5,           # Slightly higher for better convergence
    
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Effective batch = 16
    gradient_checkpointing=False,    # MarianMT is memory efficient
    
    num_train_epochs=18,            # More epochs for this fast model
    weight_decay=0.01,
    predict_with_generate=False,    # Faster training
    
    fp16=True,                      # Mixed precision
    report_to="none",
    logging_steps=50,
    
    # Quality optimizations
    label_smoothing_factor=0.1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    generation_max_length=180,
    generation_num_beams=6
)

# C6. Execution

In [10]:
torch.cuda.empty_cache()
gc.collect()

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("Starting MarianMT Training...")
trainer.train()

  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Starting MarianMT Training...


Step,Training Loss
50,10.8017
100,4.1792
150,3.6555
200,3.4421
250,3.2983
300,3.1713
350,3.1383
400,3.0235
450,2.9769
500,2.9412


TrainOutput(global_step=828, training_loss=3.5873839981889954, metrics={'train_runtime': 579.834, 'train_samples_per_second': 45.075, 'train_steps_per_second': 1.428, 'total_flos': 1107459582197760.0, 'train_loss': 3.5873839981889954, 'epoch': 18.0})

In [11]:
# POST-TRAINING VALIDATION - Load fresh data and evaluate
print("\n=== POST-TRAINING VALIDATION ===")

# Reload validation data fresh
val_df = pd.read_csv(f"{DATA_DIR}/train.csv")
val_texts = []
val_refs_list = []

for _, row in val_df.iterrows():
    src = clean_translit(row.get("transliteration", ""))
    tgt = clean_translation(row.get("translation", ""))
    if len(src) > 5 and len(tgt) > 5:
        val_texts.append(src)
        val_refs_list.append(tgt)

# Use ~200 samples for quick validation
val_texts = val_texts[:200]
val_refs_list = val_refs_list[:200]
val_refs = [[t] for t in val_refs_list]

print(f"Validating on {len(val_texts)} samples...")

metric_bleu = evaluate.load("sacrebleu")
metric_chrf = evaluate.load("chrf")

def dedup_repeats(text: str) -> str:
    toks = text.split()
    out = []
    for t in toks:
        if len(out) >= 2 and t == out[-1] == out[-2]:
            continue
        out.append(t)
    return " ".join(out)

def postprocess_text(preds):
    out = []
    for p in preds:
        p = p.strip()
        p = re.sub(r"\s+([.,!?;:])", r"\1", p)
        p = re.sub(r"([.,!?;:])([A-Za-z])", r"\1 \2", p)
        p = dedup_repeats(p)
        if p and p[0].islower():
            p = p[0].upper() + p[1:]
        if p and p[-1] not in ".!?":
            p += "."
        p = re.sub(r"([.!?]){2,}", ".", p)
        out.append(p.strip())
    return out

def generate_batch(texts):
    batch_inputs = [">>eng<< " + doc for doc in texts]
    enc = tokenizer(batch_inputs, max_length=160, truncation=True, padding=True, return_tensors="pt").to(model.device)
    gen = model.generate(
        **enc,
        max_length=180,
        min_length=6,
        num_beams=6,
        no_repeat_ngram_size=3,
        length_penalty=1.05,
        early_stopping=True,
    )
    return tokenizer.batch_decode(gen, skip_special_tokens=True)

preds = []
for i in range(0, len(val_texts), 8):
    preds.extend(generate_batch(val_texts[i:i+8]))

preds = postprocess_text(preds)
bleu = metric_bleu.compute(predictions=preds, references=val_refs)
chrf = metric_chrf.compute(predictions=preds, references=val_refs)
print(f"Validation BLEU: {bleu['score']:.2f}, chrF: {chrf['score']:.2f}")


=== POST-TRAINING VALIDATION ===
Validating on 200 samples...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Validation BLEU: 6.22, chrF: 26.33


# C7. Save Model

In [12]:
print(f"Saving model to {OUTPUT_DIR}...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Notebook C (MarianMT) Complete.")

Saving model to /kaggle/working/marian-mt-saved...
Notebook C (MarianMT) Complete.


In [13]:
# C8. Optional Self-Training Augmentation (Fast, Quality-Filtered)
ENABLE_SELF_TRAIN = False
MAX_PSEUDO = int(os.getenv("MARIAN_MAX_PSEUDO", "1500"))

if ENABLE_SELF_TRAIN:
    print("\n=== SELF-TRAINING AUGMENTATION (MarianMT) ===")
    pub_path = f"{DATA_DIR}/published_texts.csv"
    if os.path.exists(pub_path):
        pub_df = pd.read_csv(pub_path)
        translits = pub_df.get("transliteration", pd.Series([])).dropna().astype(str).tolist()
        translits = [clean_translit(t) for t in translits]
        translits = [t for t in translits if 5 <= len(t.split()) <= 140]
        translits = translits[:MAX_PSEUDO]
        print(f"Generating pseudo translations for {len(translits)} extra transliterations...")

        def generate_batch(texts):
            batch_inputs = [PREFIX + doc for doc in texts]
            enc = tokenizer(batch_inputs, max_length=MAX_LENGTH, truncation=True, padding=True, return_tensors="pt").to(model.device)
            gen = model.generate(
                **enc,
                max_length=180,
                min_length=6,
                num_beams=6,
                no_repeat_ngram_size=3,
                length_penalty=1.05,
                early_stopping=True,
            )
            return tokenizer.batch_decode(gen, skip_special_tokens=True)

        pseudo_trans = []
        for i in range(0, len(translits), 16):
            batch_preds = generate_batch(translits[i:i+16])
            pseudo_trans.extend(batch_preds)

        # Postprocess & quality filter
        def dedup_repeats(text: str) -> str:
            toks = text.split()
            out = []
            for t in toks:
                if len(out) >= 2 and t == out[-1] == out[-2]:
                    continue
                out.append(t)
            return " ".join(out)
        def postprocess_text(preds):
            out = []
            for p in preds:
                p = p.strip()
                p = re.sub(r"\s+([.,!?;:])", r"\1", p)
                p = re.sub(r"([.,!?;:])([A-Za-z])", r"\1 \2", p)
                p = dedup_repeats(p)
                if p and p[0].islower():
                    p = p[0].upper() + p[1:]
                if p and p[-1] not in ".!?":
                    p += "."
                p = re.sub(r"([.!?]){2,}", ".", p)
                out.append(p.strip())
            return out

        pseudo_trans = postprocess_text(pseudo_trans)
        aug_df = pd.DataFrame({"transliteration": translits, "translation": pseudo_trans})
        aug_df["src_len"] = aug_df["transliteration"].str.split().str.len()
        aug_df["tgt_len"] = aug_df["translation"].str.split().str.len()
        ratio = (aug_df["tgt_len"] / aug_df["src_len"]).clip(upper=6)
        aug_df = aug_df[(aug_df["tgt_len"] >= 4) & (ratio >= 0.5) & (ratio <= 6)]
        aug_df = aug_df.drop(columns=["src_len", "tgt_len"])
        print(f"Pseudo pairs retained after filtering: {len(aug_df)}")

        base_train = pd.read_csv(f"{DATA_DIR}/train.csv")
        base_train = base_train.dropna(subset=["transliteration", "translation"]).astype(str)
        base_train["transliteration"] = base_train["transliteration"].map(clean_translit)
        base_train["translation"] = base_train["translation"].map(clean_translation)
        combined = pd.concat([
            base_train[["transliteration", "translation"]],
            aug_df[["transliteration", "translation"]]
        ], axis=0).drop_duplicates().reset_index(drop=True)
        print(f"Total combined training pairs: {len(combined)}")

        def preprocess_function_aug(examples):
            inputs = [PREFIX + ex for ex in examples["transliteration"]]
            targets = examples["translation"]
            model_inputs = tokenizer(
                inputs,
                max_length=MAX_LENGTH,
                truncation=True,
                padding="max_length"
            )
            with tokenizer.as_target_tokenizer():
                labels = tokenizer(
                    targets,
                    max_length=MAX_LENGTH,
                    truncation=True,
                    padding="max_length"
                )
            model_inputs["labels"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label]
                for label in labels["input_ids"]
            ]
            return model_inputs

        ds_combined = Dataset.from_pandas(combined)
        tokenized_combined = ds_combined.map(preprocess_function_aug, batched=True)

        training_args_aug = Seq2SeqTrainingArguments(
            output_dir=OUTPUT_DIR,
            save_strategy="no",
            eval_strategy="no",
            load_best_model_at_end=False,
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            gradient_accumulation_steps=2,
            num_train_epochs=2,
            fp16=True,
            report_to="none"
        )
        trainer_aug = Seq2SeqTrainer(
            model=model,
            args=training_args_aug,
            train_dataset=tokenized_combined,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )
        print("Starting second-stage training with augmented data...")
        trainer_aug.train()
        print("Augmentation stage complete.")
        
        print(f"Saving augmented model to {OUTPUT_DIR}...")
        trainer_aug.save_model(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)
    else:
        print("published_texts.csv not found; skipping self-training.")