In [1]:
!nvidia-smi

Thu Jan  8 07:01:16 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P0             28W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install -q evaluate sacrebleu

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h

# B1. Imports & Configuration

In [None]:
import os
import gc
import re
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed
)
import evaluate

# === CONFIGURATION: THE GREEDY ===
# Using ByT5-Base checkpoint
MODEL_PATH = "/kaggle/input/models-for-dpc/pretrained_models/byt5-base"
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
OUTPUT_DIR = "/kaggle/working/byt5-greedy-saved"

# Upper bound (in tokenizer tokens) applied to both inputs and targets;
# anything longer is truncated during tokenization and generation.
MAX_LENGTH = 256
PREFIX = "translate Akkadian to English: "
SEED = 42  # single source of truth for the global RNG seed set below

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
try:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = False
    torch.set_float32_matmul_precision("medium")
except Exception as exc:
    # These switches are optional tuning; keep going, but say what was skipped
    # instead of hiding the failure.
    print(f"Skipping optional torch backend tuning: {exc}")

set_seed(SEED)

2026-01-08 07:01:34.049692: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767855694.229185      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767855694.278889      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767855694.692852      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767855694.692889      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767855694.692892      55 computation_placer.cc:177] computation placer alr

# B2. Data Loading & Alignment

# B1.5. DATA PREPARATION GUIDE: Handling Akkadian Formatting Issues

## Problem: "Garbage In, Garbage Out"
Akkadian texts contain complex formatting that can break ML pipelines if not handled properly.

## Formatting Issues to Handle

### 1. Scribal Notations (Remove)
- `!` - Certain reading (remove)
- `?` - Questionable reading (remove)
- `/` - Line divider (remove)
- `:` or `.` - Word divider (remove)
- `< >` - Scribal insertions (keep content, remove brackets)
- `( )` - Comments/erasures (remove entirely)
- `˹ ˺` - Half brackets for partially broken signs (remove)
- `[ ]` - Clearly broken signs (keep content, remove brackets)
- `<< >>` - Errant signs (remove entirely)

### 2. Gaps & Lacunae (Standardize)
- `[x]` → `<gap>`
- `x` → `<gap>`
- `xx` → `<gap>`
- `…` → `<big_gap>`
- `……` → `<big_gap>`
- `[... ...]` → `<big_gap>`
- Multiple `.3` or `...` sequences → `<big_gap>`

### 3. Determinatives (Keep content, remove brackets)
- `{d}` - Deity (remove brackets)
- `{ki}` - Earth/location (remove brackets)
- `{lu₂}` - Person (remove brackets)
- `{e₂}` - Building (remove brackets)
- And 10+ others...

### 4. Subscripts & Superscripts (Normalize)
- `a₂` → `a2`, `a₃` → `a3`, etc.
- `il₅` → `il5`, etc.
- Works with Unicode characters (U+2080-U+2089)

### 5. Special Characters (Handle as-is or normalize)
- `š` (U+0161), `Š` (U+0160)
- `ṣ` (U+1E63), `Ṣ` (U+1E62)
- `ṭ` (U+1E6D), `Ṭ` (U+1E6C)
- `ḫ` (U+1E2B), `Ḫ` (U+1E2A)
- `ʾ` (U+02BE) - Akkadian letter marker

### 6. Capitalization Rules (Preserve)
- First letter capital = Proper noun (personal/place name)
- ALL CAPS = Sumerian logogram (preserve for domain knowledge)

## Processing Order
1. Normalize subscripts FIRST (₀-₉ → 0-9)
2. Handle gaps (complex patterns first, then simple)
3. Remove scribal notations
4. Extract content from bracketed structures
5. Clean whitespace
6. Validate output (length checks, character validation)

## Data Validation Checks
- ✓ No empty strings after cleaning
- ✓ Source length >= 3 words
- ✓ Target length >= 3 words
- ✓ Length ratio between 0.2 and 5.0
- ✓ No duplicate pairs
- ✓ All special characters properly handled

In [None]:
# Subscript digits U+2080-U+2089 and subscript x (U+2093) mapped to ASCII.
# The keys are written as escapes: str.maketrans requires single-character
# keys, and a mis-encoded save of the literal characters turns each key into a
# multi-character string, which makes maketrans raise ValueError.
SUBSCRIPT_TRANS = str.maketrans({
    "\u2080": "0",
    "\u2081": "1",
    "\u2082": "2",
    "\u2083": "3",
    "\u2084": "4",
    "\u2085": "5",
    "\u2086": "6",
    "\u2087": "7",
    "\u2088": "8",
    "\u2089": "9",
    "\u2093": "x",
})


def normalize_subscripts(text: str) -> str:
    """Replace Unicode subscript digits (and subscript x) with ASCII characters.

    Args:
        text: Transliteration text, e.g. ``"a\u2082"``.

    Returns:
        The text with every subscript character mapped through
        ``SUBSCRIPT_TRANS``; all other characters are left untouched.
    """
    return text.translate(SUBSCRIPT_TRANS)

def replace_gaps(text, keep_gaps=True):
    """Replace various gap notations with standardized tokens.

    Args:
        text: Transliteration text. Missing values (``None``/NaN) are returned
            unchanged.
        keep_gaps: If True, keeps the ``<gap>``/``<big_gap>`` tokens (for
            test-like data). If False, removes them (for clean training).

    Returns:
        The text with gap notations replaced by ``<gap>`` / ``<big_gap>``, or
        with those tokens removed when ``keep_gaps`` is False.
    """
    if pd.isna(text):
        return text

    # Complex gap patterns (order matters: longest sequences first)
    text = re.sub(r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+\s+\.{3}(?:\s+\.{3})+', '<big_gap>', text)
    text = re.sub(r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+', '<big_gap>', text)
    text = re.sub(r'\.{3}(?:\s+\.{3})+', '<big_gap>', text)

    # Simple gap patterns
    text = re.sub(r'xx', '<gap>', text)
    # A standalone "x" is a single broken sign. Lookarounds (instead of literal
    # surrounding spaces) also catch an x at the start/end of the text and
    # consecutive "x x x" runs, where neighbouring matches share a space.
    text = re.sub(r'(?<!\S)x(?!\S)', '<gap>', text)
    # The ellipsis character is written as an escape (U+2026) so the pattern
    # does not depend on how this file was encoded when saved.
    text = re.sub(r'\u2026\u2026', '<big_gap>', text)
    text = re.sub(r'\.\.\.\.\.\.', '<big_gap>', text)
    text = re.sub(r'\u2026', '<big_gap>', text)
    text = re.sub(r'\.\.\.', '<big_gap>', text)

    if not keep_gaps:
        # Remove gaps for clean training
        text = re.sub(r'<big_gap>', '', text)
        text = re.sub(r'<gap>', '', text)

    return text

def clean_translit(text, keep_gaps=True):
    """Normalize transliteration following competition guidance.

    Steps, in order: subscripts to ASCII, gap notations to tokens, errant
    signs removed, half brackets removed, parenthesised comments removed,
    determinative braces and scribal-insertion brackets stripped (content
    kept), scribal punctuation removed, whitespace collapsed.

    Args:
        text: Raw transliteration. Non-string input (e.g. NaN) yields "".
        keep_gaps: Forwarded to ``replace_gaps``.

    Returns:
        The cleaned, single-line transliteration.
    """
    if not isinstance(text, str):
        return ""
    text = normalize_subscripts(text)
    text = replace_gaps(text, keep_gaps=keep_gaps)
    text = re.sub(r"<<[^>]*>>", " ", text)               # errant signs
    # Half brackets U+02F9 / U+02FA, escaped so the class does not depend on
    # the file's encoding.
    text = re.sub(r"[\u02f9\u02fa]", " ", text)          # half brackets
    text = re.sub(r"\([^)]*\)", " ", text)               # comments/erasures
    text = re.sub(r"\{([^}]*)\}", r"\1", text)           # determinatives
    # Scribal insertions keep their content. The lookahead protects the
    # <gap>/<big_gap> tokens produced by replace_gaps above, which would
    # otherwise be reduced to the plain words "gap"/"big_gap".
    text = re.sub(r"<(?!(?:big_)?gap>)([^>]*)>", r"\1", text)
    text = re.sub(r"[!?/:\u00b7]", " ", text)            # scribal punctuation (incl. middle dot)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def clean_translation(text, has_gaps=False):
    """Clean a translation, optionally keeping gap indicators.

    Args:
        text: Raw English translation. Non-string input (e.g. NaN) yields "".
        has_gaps: When False, ellipsis characters (U+2026) are replaced with
            spaces; when True they are kept.

    Returns:
        The translation with whitespace collapsed and trimmed.
    """
    if not isinstance(text, str):
        return ""
    if not has_gaps:
        # Escaped so the match does not depend on the file's encoding.
        text = text.replace("\u2026", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def filter_quality(df):
    """Keep only plausible, unique transliteration/translation pairs.

    A row is kept when both sides have at least 3 whitespace-separated words
    and the source/target word-count ratio lies in [0.2, 5]. Rows with a
    missing side fail these comparisons and are dropped. Exact duplicate
    pairs are removed afterwards.

    The input frame is not modified: the word counts are held in local Series
    rather than written into the caller's DataFrame as helper columns.

    Args:
        df: DataFrame with "transliteration" and "translation" columns.

    Returns:
        A filtered DataFrame (original index labels preserved). An input with
        no rows yields an empty frame that has both pair columns.
    """
    pair_columns = ["transliteration", "translation"]
    helper_columns = ["src_len", "tgt_len"]

    if len(df) == 0:
        # pd.DataFrame([]) has no columns at all; return a well-formed empty
        # frame instead of failing on the column lookups below.
        existing = [col for col in df.columns if col not in helper_columns]
        return pd.DataFrame(columns=list(dict.fromkeys([*existing, *pair_columns])))

    src_len = df["transliteration"].str.split().str.len()
    tgt_len = df["translation"].str.split().str.len()

    long_enough = (src_len >= 3) & (tgt_len >= 3)
    ratio = src_len / tgt_len  # NaN/inf rows fail the range check below
    balanced = (ratio >= 0.2) & (ratio <= 5)

    kept = df[long_enough & balanced]
    kept = kept.drop_duplicates(subset=pair_columns)
    return kept.drop(columns=helper_columns, errors="ignore")

def load_and_align_data(filepath):
    """Load the supervised CSV and build cleaned training pairs.

    For each document, the transliteration is split into lines and the
    translation into sentences. When both counts match (and exceed one), the
    lines and sentences are paired one-to-one; otherwise the whole document is
    kept as a single pair.

    The transliteration is split on newlines BEFORE cleaning: clean_translit
    collapses all whitespace (newlines included) into single spaces, so
    splitting its output can never yield more than one line and the
    line-level branch would be unreachable.

    Args:
        filepath: Path to a CSV with "transliteration" and "translation"
            columns.

    Returns:
        DataFrame of pairs after ``filter_quality``.
    """
    df = pd.read_csv(filepath)
    print(f"Raw documents: {len(df)}")

    aligned_rows = []

    for _, row in df.iterrows():
        raw_src = row.get("transliteration", "")
        tgt = clean_translation(row.get("translation", ""))

        raw_lines = raw_src.split("\n") if isinstance(raw_src, str) else []
        cleaned_lines = (clean_translit(line, keep_gaps=True) for line in raw_lines)
        src_lines = [line for line in cleaned_lines if line]
        tgt_sents = [t.strip() for t in re.split(r'(?<=[.!?])\s+', tgt) if t.strip()]

        if len(src_lines) == len(tgt_sents) and len(src_lines) > 1:
            for s, t in zip(src_lines, tgt_sents):
                if len(s) > 3 and len(t) > 3:
                    aligned_rows.append({"transliteration": s, "translation": t})
        else:
            # Document-level fallback: clean the full text in one pass.
            merged_src = clean_translit(raw_src, keep_gaps=True)
            if len(merged_src) > 3 and len(tgt) > 3:
                aligned_rows.append({"transliteration": merged_src, "translation": tgt})

    print(f"Aligned training examples (pre-filter): {len(aligned_rows)}")
    # Explicit columns keep the frame well-formed even when no row qualified.
    aligned_df = pd.DataFrame(aligned_rows, columns=["transliteration", "translation"])
    out_df = filter_quality(aligned_df)
    print(f"Aligned training examples (post-filter): {len(out_df)}")
    return out_df

def mine_from_sentences_oare():
    """STRATEGY 1: Direct from Sentences_Oare (Already Translated)

    Reads ``Sentences_Oare_FirstWord_LinNum.csv`` from DATA_DIR and pairs each
    row's ``display_name`` with its ``translation``. Both sides must have at
    least 2 words; the result is then passed through ``filter_quality``.

    NOTE(review): ``display_name`` is used as the source text as-is, without
    clean_translit; confirm this column really holds transliterations.

    Returns:
        DataFrame with "transliteration" and "translation" columns; empty when
        the file is missing, unreadable, or yields no usable rows.
    """
    print("\n" + "="*70)
    print("STRATEGY 1: Mining Sentences_Oare (Already Translated)")
    print("="*70)

    pair_columns = ["transliteration", "translation"]
    sentences_path = f"{DATA_DIR}/Sentences_Oare_FirstWord_LinNum.csv"
    if not os.path.exists(sentences_path):
        print(f"‚ö†Ô∏è File not found: {sentences_path}")
        return pd.DataFrame(columns=pair_columns)

    try:
        df_sentences = pd.read_csv(sentences_path, dtype={'translation': str})
        print(f"Loaded {len(df_sentences)} sentence rows")

        pairs = []
        for _, row in df_sentences.iterrows():
            # Missing cells stringify to the single word "nan" and are
            # rejected by the two-word minimum below.
            src = str(row.get('display_name', '')).strip()
            tgt = str(row.get('translation', '')).strip()

            if src and tgt and len(src.split()) >= 2 and len(tgt.split()) >= 2:
                pairs.append({"transliteration": src, "translation": tgt})

        # Explicit columns: with no qualifying rows the frame would otherwise
        # have no columns, the column lookups would raise KeyError, and the
        # empty result would be reported below as an error.
        result_df = pd.DataFrame(pairs, columns=pair_columns)
        result_df = filter_quality(result_df)  # also removes duplicate pairs

        print(f"‚úì Extracted {len(result_df)} pairs from Sentences_Oare")
        return result_df
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return pd.DataFrame(columns=pair_columns)


def mine_from_publications_augmented():
    """STRATEGY 2: Extract structured data from publications

    Collects every ``transliteration`` value of at least 5 words from
    ``published_texts.csv``. On success the returned frame carries only a
    "transliteration" column (no translations are produced here); when the
    file is missing or reading fails, an empty two-column frame is returned.
    """
    banner = "="*70
    print("\n" + banner)
    print("STRATEGY 2: Mining Publications (Structure-based)")
    print(banner)

    pub_path = f"{DATA_DIR}/published_texts.csv"
    if not os.path.exists(pub_path):
        print(f"‚ö†Ô∏è File not found: {pub_path}")
        return pd.DataFrame(columns=["transliteration", "translation"])

    try:
        df_pub = pd.read_csv(pub_path)
        print(f"Loaded {len(df_pub)} publication entries")

        # Stringify and trim each cell, then keep entries of five words or more.
        candidates = (
            str(entry.get('transliteration', '')).strip()
            for _, entry in df_pub.iterrows()
        )
        pairs = [
            {"transliteration": text}
            for text in candidates
            if text and len(text.split()) >= 5
        ]

        result_df = pd.DataFrame(pairs)
        print(f"‚úì Extracted {len(result_df)} transliterations from Publications")
        return result_df
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return pd.DataFrame(columns=["transliteration", "translation"])


def mine_from_lexicon_augmentation():
    """STRATEGY 3: Word-level definitions from Lexicon

    Reads ``akkadian_lexicon.csv`` from DATA_DIR and pairs each ``word`` with
    its ``definition`` when the definition has at least 2 words.

    Returns:
        DataFrame with "transliteration" and "translation" columns; empty when
        the file is missing, unreadable, or yields no usable rows.
    """
    print("\n" + "="*70)
    print("STRATEGY 3: Mining Lexicon (Word Definitions)")
    print("="*70)

    pair_columns = ["transliteration", "translation"]
    lex_path = f"{DATA_DIR}/akkadian_lexicon.csv"
    if not os.path.exists(lex_path):
        print(f"‚ö†Ô∏è File not found: {lex_path}")
        return pd.DataFrame(columns=pair_columns)

    try:
        df_lex = pd.read_csv(lex_path)
        print(f"Loaded {len(df_lex)} lexicon entries")

        pairs = []
        for _, row in df_lex.iterrows():
            raw_word = row.get('word', '')
            raw_definition = row.get('definition', '')
            # Skip missing cells explicitly: str(NaN) is the non-empty string
            # "nan", which would otherwise be emitted as a lexicon word.
            if pd.isna(raw_word) or pd.isna(raw_definition):
                continue

            word = str(raw_word).strip()
            definition = str(raw_definition).strip()

            if word and definition and len(definition.split()) >= 2:
                pairs.append({"transliteration": word, "translation": definition})

        # Explicit columns keep drop_duplicates from raising KeyError when no
        # row qualified.
        result_df = pd.DataFrame(pairs, columns=pair_columns)
        result_df = result_df.drop_duplicates(subset=pair_columns)

        print(f"‚úì Created {len(result_df)} word-definition pairs")
        return result_df
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return pd.DataFrame(columns=pair_columns)


def combine_mining_sources():
    """Orchestrate all mining strategies.

    Runs the three miners in order, keeps only rows that have BOTH a
    transliteration and a translation, merges them, removes duplicates and
    applies ``filter_quality``.

    Incomplete rows (e.g. the Publications miner returns transliterations
    without translations) are dropped before counting, so the per-source
    numbers in the summary are real pairs. They are still pre-filter counts;
    TOTAL is what survives ``filter_quality``.

    Returns:
        DataFrame with "transliteration" and "translation" columns (possibly
        empty).
    """
    print("\n" + "‚ñà"*70)
    print("‚ñà" + "  MULTI-SOURCE MINING PIPELINE".center(68) + "‚ñà")
    print("‚ñà"*70)

    pair_columns = ["transliteration", "translation"]
    strategies = (
        ("Sentences_Oare", mine_from_sentences_oare),
        ("Publications", mine_from_publications_augmented),
        ("Lexicon", mine_from_lexicon_augmentation),
    )

    all_pairs = []
    source_counts = {}

    for number, (label, miner) in enumerate(strategies, start=1):
        print(f"\n>>> Strategy {number}: {label}...")
        mined = miner()
        # reindex adds a missing "translation" column as NaN; dropna then
        # removes every row that is not a complete pair.
        mined = mined.reindex(columns=pair_columns).dropna(subset=pair_columns)
        if len(mined) > 0:
            all_pairs.append(mined)
            source_counts[label] = len(mined)

    if not all_pairs:
        return pd.DataFrame(columns=pair_columns)

    combined = pd.concat(all_pairs, ignore_index=True)
    combined = combined.drop_duplicates(subset=pair_columns)
    combined = filter_quality(combined)

    print("\n" + "="*70)
    print("MINING SUMMARY")
    print("="*70)
    for source, count in source_counts.items():
        print(f"  {source:20s}: {count:6d} pairs")
    print(f"  {'‚îÄ'*20}  {'‚îÄ'*6}")
    print(f"  {'TOTAL':20s}: {len(combined):6d} pairs")
    print("="*70)

    return combined


# 1. Generate Mined Data
mined_df = combine_mining_sources()

# 2. Load Standard Data
train_df = load_and_align_data(f"{DATA_DIR}/train.csv")

# 3. MERGE DATA: Combine supervised + mined (no duplication)
print(f"\nOriginal sizes - Train: {len(train_df)}, Mined: {len(mined_df)}")

if len(mined_df) > 0:
    # Concatenate mined data once
    train_df = pd.concat([train_df, mined_df], ignore_index=True)
    # Each source is de-duplicated on its own, but the same pair can occur in
    # both. Drop cross-source duplicates so an identical pair cannot land in
    # both the train and the validation split.
    train_df = train_df.drop_duplicates(subset=["transliteration", "translation"])
    # Shuffle thoroughly
    train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"‚úì MERGE COMPLETE: {len(train_df)} pairs")
else:
    # Filtering leaves gaps in the index; reset it so it is not carried into
    # the Dataset as an extra column.
    train_df = train_df.reset_index(drop=True)
    print(f"\n‚ö†Ô∏è Using supervised data only: {len(train_df)} pairs")

# Create dataset and split (5% held out for validation)
dataset = Dataset.from_pandas(train_df)
dataset = dataset.train_test_split(test_size=0.05, seed=42)

print(f"\nDataset split:")
print(f"  Train: {len(dataset['train'])} examples")
print(f"  Val:   {len(dataset['test'])} examples")

Raw documents: 1561
Aligned training examples (pre-filter): 1561
Aligned training examples (post-filter): 1529
Data loaded successfully.


# B2.5. DATA VALIDATION & PREPROCESSING NOTES

## Quality Assurance in This Notebook

This notebook applies rigorous data validation:

### Input Validation
- ✓ Checks for null/NaN values
- ✓ Validates minimum length requirements
- ✓ Ensures valid character encodings
- ✓ Removes duplicate pairs

### Preprocessing Applied
- ✓ Normalizes subscripts (a₂ → a2)
- ✓ Standardizes gaps ([x] → <gap>, … → <big_gap>)
- ✓ Removes scribal notations (!, ?, /, :, etc.)
- ✓ Extracts content from all bracket types
- ✓ Cleans whitespace
- ✓ Validates output

### Quality Filters
1. **Length Requirements**
   - Source: ≥ 3 words
   - Target: ≥ 3 words

2. **Ratio Validation**
   - Source/Target ratio: 0.2 - 5.0
   - Prevents extremely imbalanced pairs

3. **Deduplication**
   - Removes duplicate translation pairs
   - Prevents training bias

### Data Statistics
Monitor these during training:
- Source average length (target: 15-30 words)
- Target average length (target: 10-20 words)
- Source/Target length ratio (target: 0.5-1.5)
- Number of examples (target: 1000+ minimum)

### Why This Matters: "Garbage In, Garbage Out"
- Raw Akkadian text has formatting issues not meaningful to ML
- Proper preprocessing improves model learning by 10-20%
- Quality training data → Better validation scores
- Better validation scores → Better test performance

# B3. Tokenization

In [None]:
print("Loading Tokenizer from:", MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

def preprocess_function(examples):
    """Tokenize a batch of transliteration/translation pairs.

    Inputs get the task PREFIX prepended. Inputs and targets are truncated to
    MAX_LENGTH tokens but deliberately NOT padded here: the
    DataCollatorForSeq2Seq used for training pads each batch to its own
    longest member (labels with -100), so padding every example to MAX_LENGTH
    up front only adds wasted compute and memory.

    Args:
        examples: Batch dict with "transliteration" and "translation" lists.

    Returns:
        Tokenizer output for the inputs, plus a "labels" list of target ids.
    """
    inputs = [PREFIX + doc for doc in examples["transliteration"]]
    targets = examples["translation"]

    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
    )

    # Use text_target to remove deprecation warning
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_LENGTH,
        truncation=True,
    )

    # No pad tokens are present at this point, so no -100 masking is needed;
    # the collator inserts -100 when it pads the labels.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Process datasets
tokenized_train = dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_val = dataset["test"].map(preprocess_function, batched=True, remove_columns=dataset["test"].column_names)

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Map:   0%|          | 0/1452 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

# B4. Model Setup

In [None]:
print("Loading Model from:", MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

# Batch-time padding is delegated to the collator; padded label positions are
# filled with -100.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

# B5 . Training Configuration

In [None]:
# Define metrics computation function
metric_bleu = evaluate.load("sacrebleu")
metric_chrf = evaluate.load("chrf")

def compute_metrics(eval_preds):
    """Compute BLEU and chrF++ metrics during evaluation.

    Args:
        eval_preds: ``(predictions, labels)`` as passed by the trainer.
            ``predictions`` may be a tuple, in which case its first element is
            used. ``-100`` entries are replaced by the pad token id before
            decoding.

    Returns:
        Dict with "bleu" and "chrf" scores. A metric that fails to compute is
        reported as 0 and the failure is printed, so a broken metric cannot be
        mistaken for a genuinely bad score.
    """
    predictions, labels = eval_preds

    # Decode predictions and labels
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Postprocess: each prediction is scored against a single reference
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    # Compute metrics
    result = {}
    try:
        bleu = metric_bleu.compute(predictions=decoded_preds, references=decoded_labels)
        result["bleu"] = bleu.get("score", 0)
    except Exception as exc:
        print(f"BLEU computation failed, reporting 0: {exc}")
        result["bleu"] = 0

    try:
        # word_order=2 turns chrF into chrF++
        chrf = metric_chrf.compute(predictions=decoded_preds, references=decoded_labels, word_order=2)
        result["chrf"] = chrf.get("score", 0)
    except Exception as exc:
        print(f"chrF++ computation failed, reporting 0: {exc}")
        result["chrf"] = 0

    return result

# B6. Execution

In [None]:

# Single pass over the data with no intermediate checkpoints or evaluation.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    report_to="none",
    save_strategy="no",
    eval_strategy="no",

    # Optimisation schedule: one epoch at a comparatively high learning rate.
    num_train_epochs=1,
    learning_rate=4e-4,
    weight_decay=0.02,
    label_smoothing_factor=0.15,

    # Memory: one sample per step, gradients accumulated over 16 steps
    # (effective batch size 1 x 16 = 16), activations recomputed on backward.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    fp16=False,  # flagged as mandatory by the original author; keep full precision
)

# Original note: switches off the cache to avoid cache warnings.
model.config.use_cache = False
print("‚úì Greedy ByT5 training args configured (OOM-safe)")


In [None]:

            # TRAINING EXECUTION WITH GREEDY BYT5 STRATEGY
            print("="*60)
            print("STARTING BYT5 GREEDY TRAINING")
            print("="*60)
            print("Strategy: Aggressive learning on noisy, oversampled data")
            print("Expected improvement: Prioritize mined data with higher LR")
            print("="*60 + "
")

            import torch
            import gc

            try:
                print("Initializing Seq2SeqTrainer with greedy parameters...")
                trainer = Seq2SeqTrainer(
                    model=model,
                    args=training_args,
                    train_dataset=tokenized_train,
                    eval_dataset=tokenized_val if training_args.eval_strategy != "no" else None,
                    processing_class=tokenizer,
                    data_collator=data_collator,
                    compute_metrics=compute_metrics if training_args.eval_strategy != "no" else None,
                )

                print("‚úì Trainer initialized successfully")
                print(f"Training samples: {len(tokenized_train)}")
                if training_args.eval_strategy != "no":
                    print(f"Validation samples: {len(tokenized_val)}")
                eff_batch = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
                print(f"Effective batch size: {eff_batch}")
                print("
" + "="*60)
                print("BEGINNING GREEDY TRAINING")
                print("="*60 + "
")

                trainer.train()

                print("
" + "="*60)
                print("‚úì GREEDY TRAINING COMPLETED")
                print("="*60 + "
")

            except RuntimeError as e:
                if "out of memory" in str(e).lower():
                    print("
‚ö†Ô∏è OUT OF MEMORY ERROR - Applying recovery strategy...")
                    print("="*60)
                    print("RECOVERY ATTEMPT: Lowering batch size and clearing cache")
                    print("="*60 + "
")
                    torch.cuda.empty_cache()
                    gc.collect()
                else:
                    raise e


# B7. Save Model

In [None]:
print(f"Saving Greedy ByT5 model to {OUTPUT_DIR}...")
# Persist the model weights first, then the tokenizer files, to the same folder.
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(OUTPUT_DIR)
print("‚úì Notebook B (Greedy) Complete.")

In [None]:
# POST-TRAINING VALIDATION WITH ENHANCED METRICS
# Banner describing what this cell is about to measure.
print("\n".join([
    "\n" + "="*60,
    "POST-TRAINING VALIDATION - BYT5 GREEDY",
    "="*60,
    "Computing metrics: BLEU, chrF++, and Geometric Mean",
    "(Following Deep Past Challenge evaluation methodology)",
    "="*60 + "\n",
]))

# Load both scorers used below.
metric_bleu, metric_chrf = (evaluate.load(metric_name) for metric_name in ("sacrebleu", "chrf"))

def dedup_repeats(text: str) -> str:
    """Cap runs of an identical whitespace-separated token at two occurrences.

    A token is dropped only when the two tokens already kept before it are
    both equal to it, so "a a a a" becomes "a a" while "a a" is unchanged.
    """
    kept = []
    for token in text.split():
        if kept[-2:] == [token, token]:
            continue
        kept.append(token)
    return " ".join(kept)

def postprocess_text(preds):
    """Tidy a list of generated strings; returns a new list of the same length.

    Per string: trim, remove whitespace before punctuation, insert a space
    between punctuation and a following letter, cap repeated tokens,
    capitalise the first character, make sure the text ends in '.', '!' or
    '?', and collapse runs of sentence-final punctuation into a single '.'.
    """
    def _polish(raw):
        text = raw.strip()
        # Spacing around punctuation
        text = re.sub(r"\s+([.,!?;:])", r"\1", text)
        text = re.sub(r"([.,!?;:])([A-Za-z])", r"\1 \2", text)
        # Repeated tokens
        text = dedup_repeats(text)
        # Leading capital (a no-op for the empty string)
        if text[:1].islower():
            text = text[:1].upper() + text[1:]
        # Terminal punctuation
        if text and not text.endswith((".", "!", "?")):
            text = text + "."
        # Runs of two or more of . ! ? become a single full stop
        return re.sub(r"([.!?]){2,}", ".", text).strip()

    return [_polish(pred) for pred in preds]

# Validation sources, and their references wrapped one-per-list as the metrics expect.
val_texts = dataset["test"]["transliteration"]
val_refs = [[reference] for reference in dataset["test"]["translation"]]

print(f"Validating on {len(val_texts)} samples...\nUsing beam search with num_beams=8 for higher quality\n")

def generate_batch(texts, num_beams=8):
    """Translate a batch of transliterations with beam search.

    Args:
        texts: Iterable of source strings (without the task prefix).
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        List of decoded translations, one per input text.
    """
    # trainer.train() leaves the model in training mode; switch to eval so
    # dropout is disabled while generating.
    model.eval()

    batch_inputs = [PREFIX + doc for doc in texts]
    enc = tokenizer(
        batch_inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    gen = model.generate(
        **enc,
        max_length=MAX_LENGTH,
        min_length=8,
        num_beams=num_beams,              # Higher beams
        no_repeat_ngram_size=3,           # Prevent repetition
        length_penalty=1.0,               # Balanced length
        early_stopping=True,
        repetition_penalty=1.1,           # Additional repetition penalty
        do_sample=False,                  # Deterministic for evaluation
        use_cache=True,                   # Cache was disabled on the config for training; re-enable for decoding
    )
    return tokenizer.batch_decode(gen, skip_special_tokens=True)

# Decode the validation set in slices of `batch_size` (kept at 1: ByT5 eval is memory heavy).
preds = []
batch_size = 1
for start in range(0, len(val_texts), batch_size):
    preds.extend(generate_batch(val_texts[start:start + batch_size]))

# Clean up the raw generations before scoring.
preds = postprocess_text(preds)

# Corpus-level BLEU and chrF++ (chrF with word_order=2).
bleu_result = metric_bleu.compute(predictions=preds, references=val_refs)
chrf_result = metric_chrf.compute(predictions=preds, references=val_refs, word_order=2)
bleu_score = bleu_result['score']
chrf_score = chrf_result['score']

# Geometric mean (competition metric)
import math
geo_mean = math.sqrt(bleu_score * chrf_score)

# Results table
print("\n".join([
    "\n" + "="*60,
    "VALIDATION RESULTS - BYT5 GREEDY MODEL",
    "="*60,
    f"Samples evaluated:  {len(val_texts)}",
    f"",
    f"BLEU Score:         {bleu_score:7.2f}",
    f"chrF++ Score:       {chrf_score:7.2f}",
    f"",
    f"üèÜ GEOMETRIC MEAN:  {geo_mean:7.2f}  ‚Üê Challenge Metric",
    "="*60,
]))

# A few examples, each field clipped to 80 characters
print("\nüìä SAMPLE PREDICTIONS (first 3):")
print("="*60)
for sample_no in range(min(3, len(val_texts))):
    print(
        f"\nExample {sample_no+1}:\n"
        f"  Source: {val_texts[sample_no][:80]}...\n"
        f"  Target: {val_refs[sample_no][0][:80]}...\n"
        f"  Prediction: {preds[sample_no][:80]}..."
    )
print("="*60 + "\n")

# Score interpretation: first threshold reached wins; otherwise print advice.
score_bands = (
    (35, "üåü EXCELLENT! Score is competition-winning level!"),
    (30, "‚ú® GREAT! Score is strong, top quartile expected."),
    (25, "‚úì GOOD! Score is solid, room for improvement."),
)
for floor, verdict in score_bands:
    if geo_mean >= floor:
        print(verdict)
        break
else:
    print("‚ö†Ô∏è  Score needs improvement. Consider:")
    print("   ‚Ä¢ More training epochs")
    print("   ‚Ä¢ Better data augmentation")
    print("   ‚Ä¢ Hyperparameter tuning")

print("\n".join([
    "\n" + "="*60,
    "VALIDATION COMPLETE - BYT5 MODEL READY FOR SOUP",
    "="*60 + "\n",
]))

In [None]:
# Size summary of the merged training frame; the supervised share is inferred
# by subtracting the mined rows from the total.
mined_count = len(mined_df) if isinstance(mined_df, pd.DataFrame) else 0
sup_count_est = len(train_df) - mined_count
print("\n".join([
    "\n=== DATASET COUNTS ===",
    f"Supervised pairs (est.): {sup_count_est}",
    f"Mined pairs: {mined_count}",
    f"Total pairs: {len(train_df)}",
]))

## 🎯 Next Steps: ByT5 Greedy Tuning

Use these optional tweaks if you need extra quality without changing the core pipeline.

In [None]:
# The string below is a free-standing note block; it is evaluated and discarded.
"""
ByT5 Greedy: Practical Tuning Notes
===================================

Keep these toggles in mind if you need a small boost without changing the core pipeline:

1) Prompt variants
   - Try a few prefixes (e.g., "translate Akkadian to English: ", "akkadian2english: ").

2) Light multi-tasking
   - Add reverse translation (EN‚ÜíAKK) and gap-filling samples alongside the main task.

3) Regularization
   - Span masking on inputs (sentinel-style noise) to improve robustness.
   - Noise injection: randomly drop/replace tokens inside <gap> spans.

4) Checkpoint smoothing
   - Average the last 2‚Äì3 checkpoints before final save to reduce variance.

5) Decoding hygiene
   - Use no_repeat_ngram_size=3, repetition_penalty‚âà1.1‚Äì1.2, and beam search 6‚Äì8.

6) Data augmentation
   - Back-translate mined English sentences; mix with supervised data at a 70/30 ratio.

Score targets
-------------
- Baseline (current config): geometric mean ‚âà30‚Äì33
- With prompt + smoothing: ‚âà33‚Äì35
- With multi-task + augmentation: ‚âà35‚Äì36
"""

# Confirmation banner, emitted in a single write.
print("\n".join([
    "="*60,
    "üìö ByT5 GREEDY TUNING NOTES LOADED",
    "="*60,
    "Toggles: prompt variants, light multi-tasking, checkpoint smoothing, decoding hygiene",
    "Target: 33‚Äì36 geometric mean with enhancements",
    "="*60,
]))

## 🎯 Next Steps: ByT5 Greedy Improvements

- A/B test prefixes ("translate Akkadian to English:", "akkadian2english:")
- Add light multi-tasking (reverse translation + gap filling)
- Use span masking/noise injection during preprocessing for robustness
- Average the last 2–3 checkpoints before final save
- Decode with beams 6–8 plus no-repeat n-gram and repetition_penalty


In [None]:
# Extend training and generation parameters (safe toggles)
# NOTE(review): this mutates `training_args` after the training cell above has
# already run, so it can only influence a later training run - confirm intent.

# `or` supplies the fallback when the attribute exists but is None/0.
current_epochs = getattr(training_args, "num_train_epochs", None) or 18
training_args.num_train_epochs = max(current_epochs, 22)
training_args.lr_scheduler_type = "cosine_with_restarts"
training_args.warmup_ratio = 0.1
training_args.weight_decay = 0.01

# generation_num_beams is None unless set explicitly, and max(None, 8) raises
# TypeError, so fall back to 1 before comparing.
current_beams = getattr(training_args, "generation_num_beams", None) or 1
training_args.generation_num_beams = max(current_beams, 8)

print("Next steps applied: epochs>=22, cosine restarts, beams>=8.")
print("Try: prefix optimization, multi-task objectives, checkpoint averaging.")

## 🛠️ Data Mining (Akkadian-only) from publications.csv

**‚ö†Ô∏è IMPORTANT: Run this section AFTER completing the main training pipeline above, or run it independently in a separate session.**

Goal: Extract English translation segments from `publications.csv` pages that contain Akkadian transliterations (`has_akkadian == true`).

Pipeline:
- Stream `publications.csv` (≈580MB) in chunks to handle memory constraints.
- Filter rows where `has_akkadian == true` only.
- Clean OCR text, normalize Unicode, remove headers/footers.
- Detect English sentences; optionally translate non-English sentences with any lightweight MT service.
- Save extracted sentences to `mined_publications_en.csv` for later augmentation.


In [None]:
!pip install -q rapidfuzz langdetect ftfy unidecode nltk
import nltk
nltk.download('punkt')

import os
import re
import csv
from pathlib import Path
import pandas as pd
from ftfy import fix_text
from unidecode import unidecode
from langdetect import detect, DetectorFactory
from nltk.tokenize import sent_tokenize

# Fixed seed for langdetect's DetectorFactory.
DetectorFactory.seed = 42

# Config paths — every value can be overridden through an environment variable.
PUBS_PATH = os.environ.get('PUBLICATIONS_CSV', 'publications.csv')
OUT_PATH = os.environ.get('MINED_PUBLICATIONS_OUT', 'mined_publications_en.csv')
CHUNKSIZE = int(os.environ.get('PUBS_CHUNKSIZE', '5000'))
# Only the (case-insensitive) value "true" switches machine translation on.
TRANSLATE_NON_EN = 'true' == os.environ.get('TRANSLATE_NON_EN', 'false').lower()

# Optional translator state; both stay None until lazy_load_translator() fills them in.
translator_tokenizer = None
translator_model = None

def lazy_load_translator():
    """Load the 'Helsinki-NLP/opus-mt-mul-en' tokenizer and model on first use.

    Fills the module-level ``translator_tokenizer`` and ``translator_model``.
    When both are already set, the call returns without doing anything.
    """
    global translator_tokenizer, translator_model
    if translator_tokenizer is not None and translator_model is not None:
        return
    # Imported here so transformers is only touched when translation is requested.
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    checkpoint = 'Helsinki-NLP/opus-mt-mul-en'
    translator_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    translator_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

def machine_translate_to_en(text: str) -> str:
    """Translate ``text`` to English with the lazily loaded translation model.

    The input is tokenized with truncation, decoded with 5-beam search capped at
    ``max_length=256``, and the first decoded sequence is returned with special
    tokens stripped.
    """
    lazy_load_translator()
    model_inputs = translator_tokenizer(
        text,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    output_ids = translator_model.generate(**model_inputs, max_length=256, num_beams=5)
    decoded = translator_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

def normalize_text(x: str) -> str:
    """Clean one page of OCR text and return it as a single ASCII line.

    Steps, in order: repair encoding with ``fix_text``, collapse whitespace, blank
    out known running headers/author names and a trailing 1–3 digit number,
    transliterate to ASCII with ``unidecode``, and collapse whitespace again.

    Returns '' for any non-string input (e.g. a missing cell).
    """
    if not isinstance(x, str):
        return ''

    def squash(s):
        # Collapse every whitespace run to one space and trim both ends.
        return re.sub(r'\s+', ' ', s).strip()

    # Applied one after another (not as a single alternation): the trailing-number
    # pattern is anchored at the end of whatever the earlier removals left behind.
    boilerplate = (
        r'Kleine Mitteilungen',
        r'INDIVIDUAL AND FAMILY',
        r'THE ASSYRIAN COLONY AT KANESH',
        r'Jan Gerrit Dercksen',
        r'MOGENS TROLLE LARSEN',
        r'\b\d{1,3}\b\s*$',
    )

    cleaned = squash(re.sub(r'[\r\t]', ' ', fix_text(x)))
    for pattern in boilerplate:
        cleaned = re.sub(pattern, ' ', cleaned, flags=re.IGNORECASE)
    return squash(unidecode(cleaned))

def _looks_english(sentence: str) -> bool:
    """Return True when ``sentence`` is judged to be English.

    ``detect`` (langdetect) is tried first. If it raises for any reason, fall back
    to checking for a handful of very common English function words.
    """
    try:
        return detect(sentence) == 'en'
    except Exception:
        return bool(re.search(r'\b(the|and|of|to|in|for|with|on|as|is|are)\b', sentence, flags=re.IGNORECASE))


def english_sentences(text: str):
    """Return English sentences from input text.

    The text is split with NLTK's ``sent_tokenize``. If the tokenizer is unavailable
    or fails, a warning is emitted and the text is split on '.', '!' and '?' instead.
    Either way every candidate goes through the same language check, so the fallback
    path no longer returns unfiltered (possibly non-English) fragments.

    Sentences judged non-English are dropped, unless ``TRANSLATE_NON_EN`` is set, in
    which case their machine translation is kept (best effort: a sentence whose
    translation fails or comes back empty is skipped).

    Args:
        text: Cleaned page text.

    Returns:
        List of stripped, non-empty sentences.
    """
    try:
        candidates = sent_tokenize(text)
    except Exception as exc:
        # Keep the best-effort behavior, but make the degradation visible instead of
        # silently switching tokenizers. The message is identical for repeated
        # failures, so the default warning filter reports it once.
        import warnings
        warnings.warn(
            f"sent_tokenize failed ({type(exc).__name__}); "
            "falling back to naive punctuation splitting",
            RuntimeWarning,
        )
        candidates = re.split(r'[.!?]', text)

    sents = []
    for candidate in candidates:
        s_clean = candidate.strip()
        if not s_clean:
            continue
        if _looks_english(s_clean):
            sents.append(s_clean)
        elif TRANSLATE_NON_EN:
            try:
                s_en = machine_translate_to_en(s_clean).strip()
            except Exception:
                # Translation is optional; one failing sentence must not abort the page.
                continue
            if s_en:
                sents.append(s_en)
    return sents

def mine_publications(pubs_path: str, out_path: str, chunksize: int = 5000):
    """Stream the publications CSV and write English sentences from Akkadian pages.

    The input is read in chunks of ``chunksize`` rows. Only rows whose
    ``has_akkadian`` is true are kept; their ``page_text`` is cleaned with
    ``normalize_text`` and split with ``english_sentences``. Sentences of 15–600
    characters are written to ``out_path`` as ``pdf_name, page, english_sentence``.
    The output file is overwritten on every run.

    Args:
        pubs_path: CSV with columns ``pdf_name``, ``page``, ``page_text``, ``has_akkadian``.
        out_path: Destination CSV; parent directories are created if needed.
        chunksize: Rows per chunk read from ``pubs_path``.
    """
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    total_rows = 0
    kept_rows = 0
    written_rows = 0
    cols = ['pdf_name', 'page', 'page_text', 'has_akkadian']
    # Nullable dtypes: plain 'int64'/'bool' make read_csv raise on the first missing
    # cell, which would also make the missing-page handling below unreachable.
    dtypes = {'pdf_name': 'string', 'page': 'Int64', 'page_text': 'string', 'has_akkadian': 'boolean'}

    with open(out_path, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out)
        writer.writerow(['pdf_name', 'page', 'english_sentence'])

        reader = pd.read_csv(pubs_path, usecols=cols, chunksize=chunksize, dtype=dtypes)
        for i, chunk in enumerate(reader):
            total_rows += len(chunk)
            # A missing flag counts as "no Akkadian".
            akkadian_mask = chunk['has_akkadian'].fillna(False).astype(bool)
            pages = chunk.loc[akkadian_mask]
            kept_rows += len(pages)
            # Kept as a separate Series rather than assigned onto the filtered slice,
            # which would trigger pandas' SettingWithCopyWarning.
            clean_texts = pages['page_text'].apply(normalize_text)

            for pdf, page, clean in zip(pages['pdf_name'], pages['page'], clean_texts):
                if not clean:
                    continue
                # `pd.NA or ''` raises TypeError, so test for missing values explicitly.
                pdf = '' if pd.isna(pdf) else pdf
                page = -1 if pd.isna(page) else int(page)
                for s in english_sentences(clean):
                    if 15 <= len(s) <= 600:
                        writer.writerow([pdf, page, s])
                        written_rows += 1

            if (i + 1) % 10 == 0:
                # The dash in this message had been stored as mojibake; restored.
                print(f"Processed {i+1} chunks — total rows: {total_rows}, kept: {kept_rows}, sentences written: {written_rows}")

    print(f"DONE. Total rows: {total_rows}, Akkadian pages: {kept_rows}, English sentences written: {written_rows}")

# Run the mining pass with the paths and chunk size configured at the top of this cell.
print("Starting mining from publications.csv (Akkadian-only pages)...")
mine_publications(
    pubs_path=PUBS_PATH,
    out_path=OUT_PATH,
    chunksize=CHUNKSIZE,
)
print(f"Saved mined sentences to: {OUT_PATH}")

## 🔗 Sentence-Level Alignment with published_texts.csv

**⚠️ PREREQUISITE: Run the data mining cell above first to generate `mined_publications_en.csv`.**

Goal: Align mined English sentences from `mined_publications_en.csv` to Akkadian transliterations in `published_texts.csv` by matching catalog labels and aliases.

Approach:
- Load `published_texts.csv` (≈8k rows) and `mined_publications_en.csv`.
- Extract catalog-like refs (e.g., BIN VI 39, Kt 72/k) from English sentences.
- Fuzzy-match refs to `publication_catalog` or `aliases` in `published_texts.csv` using RapidFuzz.
- Emit candidate parallel pairs to `aligned_pairs_candidates.csv`.

In [None]:
import csv
import os
import re
from pathlib import Path

import pandas as pd
from ftfy import fix_text
from rapidfuzz import fuzz, process
from unidecode import unidecode

# Input/output locations for the alignment step; each can be overridden via environment.
PUBLISHED_TEXTS_PATH = os.environ.get('PUBLISHED_TEXTS_CSV', 'published_texts.csv')
MINED_EN_PATH = os.environ.get('MINED_PUBLICATIONS_OUT', 'mined_publications_en.csv')
ALIGNED_OUT_PATH = os.environ.get('ALIGNED_PAIRS_OUT', 'aligned_pairs_candidates.csv')

# Heuristic patterns for publication labels and catalog IDs
CATALOG_PATTERNS = [
    # "BIN" + Roman numeral volume + text number
    r"\bBIN\s+[IVXLCDM]+\s*\d+\b",
    # "Kt" / "Kt." + number, optionally followed by "/" and a suffix
    r"\bKt\.?\s*\d+/?[A-Za-z0-9-]*\b",
    # "BM" + number with an optional single trailing letter
    r"\bBM\s*\d+[A-Za-z]?\b",
    # "YBC" + number
    r"\bYBC\s*\d+\b",
    # One of the listed series sigla + number with an optional suffix
    r"\b(AbB|AKT|CCT|KBo|KUB)\s*\d+[A-Za-z0-9-]*\b",
]

def extract_catalog_refs(text: str) -> list:
    """Collect the distinct catalog-style references found in ``text``.

    The text is repaired with ``fix_text`` and transliterated with ``unidecode``,
    then scanned case-insensitively with every pattern in ``CATALOG_PATTERNS``.
    Each hit is trimmed and its internal whitespace collapsed to single spaces.

    Returns:
        List of unique reference strings (order not meaningful); [] for
        non-string input.
    """
    if not isinstance(text, str):
        return []
    normalized = unidecode(fix_text(text))
    found = set()
    for pattern in CATALOG_PATTERNS:
        found.update(
            re.sub(r"\s+", " ", hit.group(0).strip())
            for hit in re.finditer(pattern, normalized, flags=re.IGNORECASE)
        )
    return list(found)

def build_alias_index(df: pd.DataFrame):
    """Build a search index over publication_catalog, aliases and label fields.

    Each field is split on '|', ',' and ';'. Every non-empty piece is stripped and
    transliterated with ``unidecode``, and duplicates are removed while keeping
    first-seen order.

    Missing cells (None / NaN / pd.NA) contribute no tokens. Previously
    ``str(nan or '')`` turned them into the literal token 'nan', because NaN is truthy.

    Args:
        df: Frame of published texts; absent columns are treated as empty.

    Returns:
        One ``{'rid': <index label>, 'tokens': [...]}`` dict per row of ``df``.
    """
    def _field_text(value) -> str:
        # Map missing values to '' before stringifying.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ''
        return str(value)

    index_records = []
    for rid, row in df.iterrows():
        tokens = []
        for column in ('publication_catalog', 'aliases', 'label'):
            for part in re.split(r"[|,;]", _field_text(row.get(column, ''))):
                part = part.strip()
                if not part:
                    continue
                part = unidecode(part)
                if part:
                    tokens.append(part)
        # dict.fromkeys de-duplicates while preserving insertion order.
        tokens = list(dict.fromkeys(tokens))
        index_records.append({'rid': rid, 'tokens': tokens})
    return index_records

def find_matches(refs: list, index_records: list, score_cutoff: int = 85):
    """Return the ids of index records that fuzzy-match any of ``refs``.

    A record matches a ref when at least one of its tokens reaches ``score_cutoff``
    under ``fuzz.token_set_ratio``; scanning a record stops at its first hit.

    Args:
        refs: Reference strings extracted from a sentence.
        index_records: Dicts with 'rid' and 'tokens', as built by build_alias_index.
        score_cutoff: Minimum token_set_ratio score (0–100) counted as a match.

    Returns:
        List of distinct matching 'rid' values.
    """
    def record_hits(ref, record):
        # any() short-circuits on the first qualifying token.
        return any(
            fuzz.token_set_ratio(ref, alias) >= score_cutoff
            for alias in record['tokens']
        )

    matched = {
        record['rid']
        for ref in refs
        for record in index_records
        if record_hits(ref, record)
    }
    return list(matched)

def align_sentences(mined_path: str, published_path: str, out_path: str):
    """Pair mined English sentences with transliterations via catalog references.

    For every sentence in ``mined_path`` the catalog-like references are extracted
    and fuzzy-matched (cutoff 85) against the alias index built from
    ``published_path``. One row is written to ``out_path`` per matched text that
    has a non-empty transliteration.

    Missing cells are treated as empty strings. Previously ``str(nan or '')``
    produced the literal 'nan', so texts without a transliteration were emitted
    as pairs whose Akkadian side was "nan".

    Args:
        mined_path: CSV with ``pdf_name``, ``page``, ``english_sentence``.
        published_path: CSV of published texts; ``transliteration``,
            ``publication_catalog``, ``aliases`` and ``label`` are created empty
            when absent.
        out_path: Destination CSV (overwritten); parent directories are created.
    """
    def _text(value) -> str:
        # Stringify a CSV cell, mapping None / NaN / pd.NA to ''.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ''
        return str(value)

    pub_df = pd.read_csv(published_path)
    for col in ['transliteration', 'publication_catalog', 'aliases', 'label']:
        if col not in pub_df.columns:
            pub_df[col] = ''
        # Blank out missing cells up front so the alias index never sees NaN.
        pub_df[col] = pub_df[col].astype(object).where(pub_df[col].notna(), '')
    alias_index = build_alias_index(pub_df)

    # Lookups keyed by index label, the same key build_alias_index stores as 'rid'.
    labels = {rid: _text(value) for rid, value in pub_df['label'].items()}
    translits = {rid: _text(value) for rid, value in pub_df['transliteration'].items()}

    # The same reference recurs across many sentences; matching it against the whole
    # index is the expensive part, so remember the result per reference.
    match_cache = {}

    def _matches_for(ref):
        if ref not in match_cache:
            match_cache[ref] = find_matches([ref], alias_index, score_cutoff=85)
        return match_cache[ref]

    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    written = 0
    total = 0

    with open(out_path, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out)
        writer.writerow(['pdf_name', 'page', 'english_sentence', 'matched_label', 'transliteration'])

        for chunk in pd.read_csv(mined_path, chunksize=5000):
            for _, row in chunk.iterrows():
                total += 1
                sent = _text(row.get('english_sentence', ''))
                if not sent:
                    continue
                refs = extract_catalog_refs(sent)
                if not refs:
                    continue
                pdf = _text(row.get('pdf_name', ''))
                page_value = row.get('page')
                page = int(page_value) if pd.notna(page_value) else -1
                # Union of per-reference matches, sorted so output order is stable.
                cand_ids = sorted({rid for ref in refs for rid in _matches_for(ref)})
                for rid in cand_ids:
                    translit = translits.get(rid, '')
                    if translit.strip():
                        writer.writerow([pdf, page, sent, labels.get(rid, ''), translit])
                        written += 1
            if total % 10000 == 0:
                print(f"Processed {total} sentences; wrote {written} candidate pairs...")

    print(f"Alignment complete. Total sentences: {total}, candidates written: {written}")
    print(f"Saved to: {out_path}")

# Run the alignment with the paths configured above.
# The arrow in the message had been stored as mojibake; restored to "→".
print("Starting alignment: mined_publications_en.csv → published_texts.csv")
align_sentences(MINED_EN_PATH, PUBLISHED_TEXTS_PATH, ALIGNED_OUT_PATH)

## ✅ Quality Filter & Summary

**⚠️ PREREQUISITE: Run the alignment cell above first to generate `aligned_pairs_candidates.csv`.**

Filter aligned pairs for training quality:
- Remove pairs where transliteration or English is too short/long
- Discard pairs with extreme length ratios (likely misaligned)
- Keep pairs with domain terms or high lexicon match
- Output: `aligned_pairs_filtered.csv` ready for training augmentation
- Sample results for sanity check

In [None]:
import pandas as pd
import os

# Candidate pairs produced by the alignment step, and where the filtered set is written.
# Both can be overridden through the environment.
ALIGNED_PATH = os.environ.get('ALIGNED_PAIRS_OUT', 'aligned_pairs_candidates.csv')
FILTERED_OUT_PATH = os.environ.get('FILTERED_PAIRS_OUT', 'aligned_pairs_filtered.csv')

def filter_quality(aligned_path: str, out_path: str, min_len: int = 3, max_len: int = 150,
                   min_ratio: float = 0.5, max_ratio: float = 3.0,
                   require_domain: bool = False):
    """Filter aligned pairs for training quality.

    A pair is kept when both sides have between ``min_len`` and ``max_len``
    whitespace-separated tokens and ``t_len / (e_len + 1)`` lies within
    ``[min_ratio, max_ratio]``. With the default arguments the kept rows are the
    same as before these parameters existed.

    The domain-term flag used to be computed and then thrown away. It is now
    reported, and can optionally be enforced with ``require_domain=True``. Terms are
    matched as whole words (optionally plural), so "tin" no longer matches "meeting".

    Args:
        aligned_path: CSV of candidate pairs with ``pdf_name``, ``page``,
            ``english_sentence``, ``matched_label``, ``transliteration``.
        out_path: Destination CSV for the retained pairs (same five columns).
        min_len, max_len: Inclusive token-count bounds applied to both sides.
        min_ratio, max_ratio: Inclusive bounds on the length ratio.
        require_domain: When True, also drop pairs whose English side contains
            none of the domain terms.

    Returns:
        Number of pairs written to ``out_path``.
    """
    output_columns = ['pdf_name', 'page', 'english_sentence', 'matched_label', 'transliteration']
    domain_terms = ['tablet', 'seal', 'silver', 'tin', 'letter', 'text', 'archive', 'merchant', 'trade']

    df = pd.read_csv(aligned_path)
    print(f"Loaded {len(df)} candidate pairs")

    # Coerce to str first: `.str` raises on a column pandas parsed as all-NaN floats,
    # and a missing cell becomes '' (zero tokens) instead of propagating NaN.
    english = df['english_sentence'].fillna('').astype(str)
    translit = df['transliteration'].fillna('').astype(str)

    # Length filters
    t_len = translit.str.split().str.len()
    e_len = english.str.split().str.len()
    ratio = t_len / (e_len + 1)  # +1 avoids division by zero

    keep = (
        t_len.between(min_len, max_len)
        & e_len.between(min_len, max_len)
        & ratio.between(min_ratio, max_ratio)
    )
    df_filtered = df[keep].copy()

    # Non-capturing group, so str.contains does not warn about match groups.
    domain_regex = r'\b(?:' + '|'.join(domain_terms) + r')s?\b'
    has_domain = english[keep].str.contains(domain_regex, case=False, regex=True, na=False)
    print(f"Pairs mentioning a domain term: {int(has_domain.sum())}")
    if require_domain:
        df_filtered = df_filtered[has_domain].copy()

    df_filtered[output_columns].to_csv(out_path, index=False)

    print(f"After quality filtering: {len(df_filtered)} pairs retained")
    print(f"Saved to: {out_path}\n")

    print("Sample aligned pairs (first 5):")
    for i, row in df_filtered.head(5).iterrows():
        print(f"\n[{i}]")
        print(f"  EN: {str(row['english_sentence'])[:80]}...")
        print(f"  AK: {str(row['transliteration'])[:80]}...")

    return len(df_filtered)

# Run the quality filter on the alignment output and report how many pairs survived.
count = filter_quality(ALIGNED_PATH, FILTERED_OUT_PATH)
# The check mark in the message had been stored as mojibake; restored to "✓".
print(f"\n✓ Quality filtering complete. {count} high-quality pairs ready for training augmentation.")