In [1]:
!nvidia-smi

Wed Dec 24 18:47:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P0             27W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                     

# A1. Install required libraries

In [2]:
!pip install -q evaluate sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

# A2. Imports & config

In [3]:
import os
import gc
import re
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed
)
import evaluate

set_seed(42)

2025-12-24 18:48:09.421166: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766602089.596587      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766602089.651551      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766602090.072045      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766602090.072089      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766602090.072092      24 computation_placer.cc:177] computation placer alr

# A3. Set constants (DO NOT change yet)

In [None]:
MODEL_PATH = "/kaggle/input/models-for-dpc/pretrained_models/byt5-base"
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
OUTPUT_DIR = "/kaggle/working/byt5-base-saved"

# ByT5 is character-based. 380 provides good coverage without excessive memory
MAX_LENGTH = 380
PREFIX = "translate Akkadian to English: "

# A4. Data Loading & Cleaning

In [5]:
SUBSCRIPT_TRANS = str.maketrans({"₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9", "ₓ": "x"})

def normalize_subscripts(text: str) -> str:
    return text.translate(SUBSCRIPT_TRANS)

def clean_translit(text):
    """Normalize transliteration following competition guidance."""
    if not isinstance(text, str):
        return ""
    text = normalize_subscripts(text)
    text = text.replace("…", " <big_gap> ")
    text = re.sub(r"\\.\\.\\.+", " <big_gap> ", text)
    text = re.sub(r"\[[^\]]*\]", " ", text)           # remove broken text markers
    text = re.sub(r"<<[^>]*>>", " ", text)               # errant signs
    text = re.sub(r"[˹˺]", " ", text)                    # half brackets
    text = re.sub(r"\([^)]*\)", " ", text)             # comments/erasures
    text = re.sub(r"\{([^}]*)\}", r"\1", text)         # determinatives
    text = re.sub(r"<([^>]*)>", r"\1", text)            # scribal insertions keep content
    text = re.sub(r"[!?/:·]", " ", text)                 # scribal punctuation
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def clean_translation(text):
    if not isinstance(text, str):
        return ""
    text = text.replace("…", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def filter_quality(df):
    df["src_len"] = df["transliteration"].str.split().str.len()
    df["tgt_len"] = df["translation"].str.split().str.len()
    df = df[(df["src_len"] >= 3) & (df["tgt_len"] >= 3)]
    ratio = (df["src_len"] / df["tgt_len"]).clip(upper=6)
    df = df[(ratio >= 0.2) & (ratio <= 5)]
    df = df.drop_duplicates(subset=["transliteration", "translation"])
    return df.drop(columns=["src_len", "tgt_len"])

def load_and_align_data(filepath):
    """
    Reads the csv and aligns transliterations to translations at the sentence level.
    Falls back to document-level when counts do not match, then filters noisy pairs.
    """
    df = pd.read_csv(filepath)
    aligned_rows = []

    print(f"Raw documents: {len(df)}")

    for _, row in df.iterrows():
        src = clean_translit(row.get("transliteration", ""))
        tgt = clean_translation(row.get("translation", ""))

        src_lines = [s.strip() for s in src.split("\n") if s.strip()]
        tgt_sents = [t.strip() for t in re.split(r'(?<=[.!?])\s+', tgt) if t.strip()]

        if len(src_lines) == len(tgt_sents) and len(src_lines) > 1:
            for s, t in zip(src_lines, tgt_sents):
                if len(s) > 3 and len(t) > 3:
                    aligned_rows.append({"transliteration": s, "translation": t})
        else:
            merged_src = src.replace("\n", " ")
            if len(merged_src) > 3 and len(tgt) > 3:
                aligned_rows.append({"transliteration": merged_src, "translation": tgt})

    print(f"Aligned training examples (pre-filter): {len(aligned_rows)}")
    out_df = filter_quality(pd.DataFrame(aligned_rows))
    print(f"Aligned training examples (post-filter): {len(out_df)}")
    return out_df

train_df = load_and_align_data(f"{DATA_DIR}/train.csv")

dataset = Dataset.from_pandas(train_df)
dataset = dataset.train_test_split(test_size=0.05, seed=42)

Raw documents: 1561
Aligned training examples (pre-filter): 1561
Aligned training examples (post-filter): 1529


# A5 . Tokenization

In [6]:
print("Loading Tokenizer from:", MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

def preprocess_function(examples):
    inputs = [PREFIX + doc for doc in examples["transliteration"]]
    targets = examples["translation"]

    model_inputs = tokenizer(
        inputs, 
        max_length=MAX_LENGTH, 
        truncation=True, 
        padding="max_length" # Consistent padding helps training stability
    )
    
    labels = tokenizer(
        targets, 
        max_length=MAX_LENGTH, 
        truncation=True, 
        padding="max_length"
    )

    # Replace padding token id with -100 so it's ignored by the loss function
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] 
        for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Process datasets
tokenized_train = dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_val = dataset["test"].map(preprocess_function, batched=True, remove_columns=dataset["test"].column_names)

Loading Tokenizer from: /kaggle/input/models-for-dpc/pretrained_models/byt5-base


Map:   0%|          | 0/1452 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

# A6. Model Setup

In [7]:
print("Loading Model from:", MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

# Data Collator handles dynamic padding during batching
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model,
    label_pad_token_id=-100
)

Loading Model from: /kaggle/input/models-for-dpc/pretrained_models/byt5-base


# A7. Training Arguments

In [None]:
# --- 5. Training Arguments (OPTIMIZED for Quality on GPU P100) ---
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,

    # --- VALIDATION STRATEGY ---
    save_strategy="no",
    eval_strategy="no",                   # Skip validation to save memory, validate after
    load_best_model_at_end=False,
    
    learning_rate=3e-4,

    # --- MEMORY-OPTIMIZED BUT EFFECTIVE ---
    per_device_train_batch_size=1,        # Memory-safe on P100
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,       # Effective batch = 16
    gradient_checkpointing=True,          # Memory optimization
    
    num_train_epochs=10,                  # INCREASED: More epochs for better convergence
    weight_decay=0.01,
    predict_with_generate=False,          # Memory-safe
    fp16=True,
    report_to="none",

    # Quality tweaks
    label_smoothing_factor=0.1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.06,
    generation_max_length=400,
    generation_num_beams=6
)

# A8. Trainer

In [9]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Force aggressive memory cleanup
import gc
torch.cuda.empty_cache()
gc.collect()

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Seq2SeqTrainer(


# A9. Execution

In [10]:
gc.collect()
torch.cuda.empty_cache()

print("Starting Training with Memory Fixes...")
trainer.train()

Starting Training with Memory Fixes...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
500,37284589.568


TrainOutput(global_step=728, training_loss=25607547.78021978, metrics={'train_runtime': 1656.7563, 'train_samples_per_second': 3.506, 'train_steps_per_second': 0.439, 'total_flos': 5183711087689728.0, 'train_loss': 25607547.78021978, 'epoch': 4.0})

In [11]:
# Evaluate on validation split with sacreBLEU and chrF AFTER training (memory-safe)
print("\n=== POST-TRAINING VALIDATION ===")
metric_bleu = evaluate.load("sacrebleu")
metric_chrf = evaluate.load("chrf")

def dedup_repeats(text: str) -> str:
    toks = text.split()
    out = []
    for t in toks:
        if len(out) >= 2 and t == out[-1] == out[-2]:
            continue
        out.append(t)
    return " ".join(out)

def postprocess_text(preds):
    out = []
    for p in preds:
        p = p.strip()
        p = re.sub(r"\s+([.,!?;:])", r"\1", p)
        p = re.sub(r"([.,!?;:])([A-Za-z])", r"\1 \2", p)
        p = dedup_repeats(p)
        if p and p[0].islower():
            p = p[0].upper() + p[1:]
        if p and p[-1] not in ".!?":
            p += "."
        p = re.sub(r"([.!?]){2,}", ".", p)
        out.append(p.strip())
    return out

val_texts = dataset["test"]["transliteration"]
val_refs = [[t] for t in dataset["test"]["translation"]]

def generate_batch(texts):
    batch_inputs = [PREFIX + doc for doc in texts]
    enc = tokenizer(batch_inputs, max_length=MAX_LENGTH, truncation=True, padding=True, return_tensors="pt").to(model.device)
    gen = model.generate(
        **enc,
        max_length=MAX_LENGTH,
        min_length=6,
        num_beams=4,
        no_repeat_ngram_size=3,
        length_penalty=1.05,
        early_stopping=True,
    )
    return tokenizer.batch_decode(gen, skip_special_tokens=True)

preds = []
for i in range(0, len(val_texts), 8):
    preds.extend(generate_batch(val_texts[i:i+8]))

preds = postprocess_text(preds)
bleu = metric_bleu.compute(predictions=preds, references=val_refs)
chrf = metric_chrf.compute(predictions=preds, references=val_refs)
print(f"Validation BLEU: {bleu['score']:.2f}, chrF: {chrf['score']:.2f}")


=== POST-TRAINING VALIDATION ===


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Caching is incompatible with gradient checkpointing in T5Block. Setting `past_key_values=None`.


Validation BLEU: 0.00, chrF: 0.15


# A10. Save Final Model

In [12]:
print(f"Saving model to {OUTPUT_DIR}...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Notebook A Complete.")

Saving model to /kaggle/working/byt5-base-saved...
Notebook A Complete.
