In [1]:
!nvidia-smi

Mon Jan 12 14:28:05 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

# A1. Install required libraries

In [2]:
!pip install -q evaluate sacrebleu

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.1/104.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

# A2. Imports & config

In [3]:
import os
import gc
import re
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed
)
import evaluate

# Memory/precision safety tweaks (helps avoid OOM on P100/T4)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
try:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = False
    torch.set_float32_matmul_precision("medium")
except Exception as exc:
    # Still best-effort (the notebook keeps running), but say so instead of
    # hiding the failure: a silently skipped tweak is hard to diagnose later.
    print(f"[WARNING] Could not apply torch backend tweaks: {exc}")

# Fix the random seed through the transformers helper imported above.
set_seed(42)

2026-01-12 14:28:23.832944: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768228103.995511      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768228104.045372      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768228104.430344      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768228104.430381      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768228104.430384      55 computation_placer.cc:177] computation placer alr

# A3. Set constants (DO NOT change yet)

In [4]:

# === CONFIGURATION: THE PURIST ===
# Paths: pretrained checkpoint, competition data directory, and where the
# fine-tuned model is written.
MODEL_PATH = "/kaggle/input/models-for-dpc/pretrained_models/byt5-base"
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
OUTPUT_DIR = "/kaggle/working/byt5-base-saved"

# NOTE: the monolingual pre-training cell re-derives this flag from the
# environment variable of the same name before using it.
ENABLE_MONO_PRETRAIN = True   # <--- CRITICAL: Keeps the "Purist" logic
# Passed as max_length to every tokenizer call and to generation.
# Presumably a byte budget, since ByT5 works on bytes -- TODO confirm.
MAX_LENGTH = 300              # Reduced from 512 for ByT5 speed/memory
# Task prefix prepended to source texts.
PREFIX = "translate Akkadian to English: "

# NOTE(review): neither constant is referenced by the training-argument cells
# visible in this notebook, which hard-code their own batch/accumulation values.
BATCH_SIZE = 4                # Adjust based on GPU memory
GRAD_ACCUM = 8                # Gradient accumulation steps


# A4. Data Loading & Cleaning

# A3.5. DATA PREPARATION GUIDE: Handling Akkadian Formatting Issues

## Problem: "Garbage In, Garbage Out"
Akkadian texts contain complex formatting that can break ML pipelines if not handled properly.

## Formatting Issues to Handle

### 1. Scribal Notations (Remove)
- `!` - Certain reading (remove)
- `?` - Questionable reading (remove)
- `/` - Line divider (remove)
- `:` or `.` - Word divider (remove)
- `< >` - Scribal insertions (keep content, remove brackets)
- `( )` - Comments/erasures (remove entirely)
- `˹ ˺` - Half brackets for partially broken signs (remove)
- `[ ]` - Clearly broken signs (keep content, remove brackets)
- `<< >>` - Errant signs (remove entirely)

### 2. Gaps & Lacunae (Standardize)
- `[x]` → `<gap>`
- `x` → `<gap>`
- `xx` → `<gap>`
- `…` → `<big_gap>`
- `……` → `<big_gap>`
- `[... ...]` → `<big_gap>`
- Multiple `.3` or `...` sequences → `<big_gap>`

### 3. Determinatives (Keep content, remove brackets)
- `{d}` - Deity (remove brackets)
- `{ki}` - Earth/location (remove brackets)
- `{lu₂}` - Person (remove brackets)
- `{e₂}` - Building (remove brackets)
- And 10+ others...

### 4. Subscripts & Superscripts (Normalize)
- `a₂` → `a2`, `a₃` → `a3`, etc.
- `il₅` → `il5`, etc.
- Works with Unicode characters (U+2080-U+2089)

### 5. Special Characters (Handle as-is or normalize)
- `š` (U+0161), `Š` (U+0160)
- `ṣ` (U+1E63), `Ṣ` (U+1E62)
- `ṭ` (U+1E6D), `Ṭ` (U+1E6C)
- `ḫ` (U+1E2B), `Ḫ` (U+1E2A)
- `ʾ` (U+02BE) - Akkadian letter marker

### 6. Capitalization Rules (Preserve)
- First letter capital = Proper noun (personal/place name)
- ALL CAPS = Sumerian logogram (preserve for domain knowledge)

## Processing Order
1. Normalize subscripts FIRST (₀-₉ → 0-9)
2. Handle gaps (complex patterns first, then simple)
3. Remove scribal notations
4. Extract content from bracketed structures
5. Clean whitespace
6. Validate output (length checks, character validation)

## Data Validation Checks
✓ No empty strings after cleaning
✓ Source length >= 3 words
✓ Target length >= 3 words
✓ Length ratio between 0.2 and 5.0
✓ No duplicate pairs
✓ All special characters properly handled

In [5]:
"""
COMPREHENSIVE DATA PREPROCESSING FOR AKKADIAN TEXTS
Handles all formatting issues mentioned in competition guidelines
"""

# ============================================================================
# SUBSCRIPT & SUPERSCRIPT NORMALIZATION
# ============================================================================
# Translation table for the Unicode subscript digits U+2080-U+2089 and the
# subscript x (U+2093).  The characters are written as \u escapes so the table
# does not depend on how this file is encoded: str.maketrans() rejects any key
# that is not exactly one character long, so a mis-decoded literal would raise
# ValueError when the cell runs.
SUBSCRIPT_TRANS = str.maketrans(
    "\u2080\u2081\u2082\u2083\u2084\u2085\u2086\u2087\u2088\u2089\u2093",
    "0123456789x",
)

def normalize_subscripts(text: str) -> str:
    """Replace Unicode subscript characters with their ASCII equivalents.

    Args:
        text: Transliteration text; subscript digits and subscript x are mapped
            to ``0``-``9`` and ``x``.

    Returns:
        The translated string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    return text.translate(SUBSCRIPT_TRANS)

# ============================================================================
# GAP & LACUNAE HANDLING
# ============================================================================
def replace_gaps(text, keep_gaps=True):
    """
    Replace the gap/lacuna notations of a transliteration with standard tokens.

    Short breaks become ``<gap>`` and long breaks become ``<big_gap>``.

    NOTE: another ``replace_gaps`` is defined further down in this same cell;
    being defined later, that one is bound to the name once the cell has run.

    Args:
        text: Input text with gap notations. Missing values (anything for which
            ``pd.isna`` is true) are returned unchanged.
        keep_gaps: If True, keeps the ``<gap>`` and ``<big_gap>`` tokens.
                  If False, removes them completely.

    Returns:
        Text with normalized gap tokens. Surrounding whitespace is left as it
        was; no whitespace is collapsed here.
    """
    if pd.isna(text):
        return text

    # STEP 1: Bracketed dot runs.  Three or more dots (optionally separated by
    # whitespace, e.g. "[...]" or "[... ...]") are a long break; exactly two
    # dots are a short one.  The brackets are consumed together with the dots.
    text = re.sub(r'\[\s*(?:\.\s*){3,}\]', '<big_gap>', text)
    text = re.sub(r'\[\s*\.\s*\.\s*\]', '<gap>', text)

    # STEP 2: ".3" sequences followed by dot groups (longest form first).
    text = re.sub(r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+\s+\.{3}(?:\s+\.{3})+', '<big_gap>', text)
    text = re.sub(r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+', '<big_gap>', text)

    # STEP 3: Long dot runs and the Unicode ellipsis (U+2026, written as an
    # escape so the pattern does not depend on the file's encoding).
    text = re.sub(r'\.{4,}', '<big_gap>', text)
    text = re.sub(r'\u2026+', '<big_gap>', text)

    # STEP 4: Remaining dot patterns.
    text = re.sub(r'\.{3}(?:\s+\.{3})+', '<big_gap>', text)  # several "..." groups
    text = re.sub(r'\.\.\.', '<big_gap>', text)              # three dots
    text = re.sub(r'\.\.', '<gap>', text)                    # two dots

    # STEP 5: [x], [xx], ...
    text = re.sub(r'\[x+\]', '<gap>', text)

    # STEP 6: A bare "x" or "xx" standing as its own word.  Lookarounds are
    # used instead of consuming the neighbouring whitespace; otherwise the
    # second of two adjacent markers ("x x") could never match.
    text = re.sub(r'(?<!\S)x{1,2}(?!\S)', '<gap>', text)

    # STEP 7: Drop the tokens again when the caller does not want them.
    if not keep_gaps:
        text = text.replace('<big_gap>', '').replace('<gap>', '')

    return text

# ============================================================================
# SCRIBAL NOTATION REMOVAL
# ============================================================================
def remove_scribal_notations(text):
    """
    Replace editorial marks with spaces.

    Handles, in order: digit runs with up to two trailing primes (intended for
    line numbers such as ``1``, ``10``, ``1'``), the reading markers ``!`` and
    ``?``, and the dividers ``/``, ``:`` and the middle dot (U+00B7).

    Args:
        text: Transliteration text.

    Returns:
        The text with each mark replaced by a single space (whitespace is not
        collapsed here), or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Remove line number markers (1, 5, 10, 1', 1'')
    # NOTE(review): the pattern is not anchored to a line start, so any
    # free-standing numeral in the text is removed as well -- confirm that
    # quantities are not supposed to survive this step.
    text = re.sub(r'\b\d+\'?\s*\'?\s*\b', ' ', text)

    # Remove uncertainty markers
    text = re.sub(r'[!?]', ' ', text)  # ! = certain, ? = uncertain

    # Remove other scribal punctuation.  The middle dot is written as \u00b7 so
    # the character class does not depend on the file's encoding.
    text = re.sub(r'[/:\u00b7]', ' ', text)  # / = line divider, : = word divider

    return text

# ============================================================================
# BRACKETED CONTENT HANDLING
# ============================================================================
def handle_brackets(text):
    """
    Resolve the bracket notations of a transliteration.

    - ``( )``   removed together with their content (comments/erasures)
    - ``<< >>`` removed together with their content (errant signs)
    - ``< >``   brackets removed, content kept (scribal insertions);
                the tokens ``<gap>`` and ``<big_gap>`` are left untouched
    - ``[ ]``   brackets removed, content kept (broken signs)
    - ``{ }``   brackets removed, content kept (determinatives)
    - half brackets U+02F9 / U+02FA replaced by a space

    Args:
        text: Transliteration text.

    Returns:
        The processed text (whitespace is not collapsed here), or ``""`` when
        ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Comments and erasures: drop the parentheses and everything inside.
    text = re.sub(r'\([^)]*\)', ' ', text)

    # Errant signs must be removed BEFORE the single-angle rule below; that
    # rule would otherwise consume the outer "<" and ">" and leave the sign in.
    text = re.sub(r'<<[^>]*>>', ' ', text)

    # Scribal insertions: keep the content.  The negative lookahead protects
    # the standardized gap tokens from losing their angle brackets.
    text = re.sub(r'<(?!(?:big_)?gap>)([^>]*)>', r'\1', text)

    # Broken signs: [content] -> content
    text = re.sub(r'\[([^\]]*)\]', r'\1', text)

    # Determinatives: {content} -> content
    text = re.sub(r'\{([^}]*)\}', r'\1', text)

    # Half brackets, written as escapes so the class is encoding-independent.
    text = re.sub(r'[\u02f9\u02fa]', ' ', text)

    return text

# ============================================================================
# MAIN TRANSLITERATION CLEANING FUNCTION
# ============================================================================
def clean_translit(text, keep_gaps=True):
    """
    Run a raw transliteration through the full normalisation pipeline.

    Stages, in the order they are applied:
    1. subscript normalisation
    2. gap standardisation (honouring ``keep_gaps``)
    3. scribal-notation removal
    4. bracket handling
    5. whitespace collapsing and trimming

    Non-string input yields an empty string.

    NOTE: ``clean_translit`` is defined a second time further down in this
    cell; the later definition is the one in effect after the cell has run.
    """
    if not isinstance(text, str):
        return ""

    # Each stage takes the text and returns the transformed text.
    stages = (
        normalize_subscripts,
        lambda value: replace_gaps(value, keep_gaps=keep_gaps),
        remove_scribal_notations,
        handle_brackets,
    )
    for stage in stages:
        text = stage(text)

    # Collapse every whitespace run to a single space, then trim the ends.
    return re.sub(r'\s+', ' ', text).strip()

# ============================================================================
# TRANSLATION CLEANING FUNCTION
# ============================================================================
def clean_translation(text, has_gaps=False):
    """
    Clean a translation with minimal processing.

    NOTE: ``clean_translation`` is defined a second time further down in this
    cell; the later definition is the one in effect after the cell has run.

    Args:
        text: Translation text.
        has_gaps: When False (the default) every Unicode ellipsis (U+2026) is
            replaced by a space; when True the ellipsis is kept.

    Returns:
        The text with whitespace runs collapsed to single spaces and the ends
        trimmed, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # The ellipsis is written as an escape so the replacement does not depend
    # on the encoding this file was saved in.
    if not has_gaps:
        text = text.replace("\u2026", " ")

    # Clean whitespace
    return re.sub(r'\s+', ' ', text).strip()

# ============================================================================
# DATA QUALITY FILTERING
# ============================================================================
def filter_quality(df, min_words=3, min_ratio=0.2, max_ratio=5.0):
    """
    Filter out low-quality pairs based on simple length checks.

    The input frame is not modified: word counts are held in local Series
    instead of being written into ``df`` as helper columns.

    NOTE: ``filter_quality`` is defined a second time further down in this
    cell; the later definition is the one in effect after the cell has run.

    Args:
        df: Frame with string columns ``transliteration`` and ``translation``.
        min_words: Minimum whitespace-separated word count required on both sides.
        min_ratio: Lowest accepted source/target word-count ratio (inclusive).
        max_ratio: Highest accepted source/target word-count ratio (inclusive).

    Returns:
        A new frame holding the rows that pass every check, with exact
        duplicate (transliteration, translation) pairs removed (first kept).
        The original index labels are preserved.
    """
    src_len = df["transliteration"].str.split().str.len()
    tgt_len = df["translation"].str.split().str.len()

    # Minimum length check
    long_enough = (src_len >= min_words) & (tgt_len >= min_words)

    # Length ratio check; rows with an undefined or infinite ratio fail both
    # comparisons and are dropped.
    ratio = src_len / tgt_len
    balanced = (ratio >= min_ratio) & (ratio <= max_ratio)

    kept = df[long_enough & balanced]

    # Remove exact duplicates
    return kept.drop_duplicates(subset=["transliteration", "translation"])

# ============================================================================
# VALIDATION & REPORTING
# ============================================================================
def validate_preprocessing(original_df, cleaned_df):
    """
    Print a before/after summary of the preprocessing step.

    Reports the row counts of both frames and their difference.  When the
    cleaned frame is not empty it also reports the mean whitespace-separated
    word count of its ``transliteration`` and ``translation`` columns and the
    quotient of the two means.  Nothing is returned.
    """
    rule = "=" * 60
    n_before = len(original_df)
    n_after = len(cleaned_df)

    header = (
        "\n" + rule,
        "DATA PREPROCESSING VALIDATION",
        rule,
        f"Original samples: {n_before}",
        f"After cleaning: {n_after}",
        f"Removed: {n_before - n_after} samples",
    )
    for line in header:
        print(line)

    if n_after > 0:
        # Mean word count per column of the cleaned frame.
        mean_words = {
            column: cleaned_df[column].str.split().str.len().mean()
            for column in ("transliteration", "translation")
        }
        avg_src = mean_words["transliteration"]
        avg_tgt = mean_words["translation"]
        print(f"Avg source length: {avg_src:.1f} words")
        print(f"Avg target length: {avg_tgt:.1f} words")
        print(f"Avg ratio (src/tgt): {avg_src/avg_tgt:.2f}")

    print(rule + "\n")

# Replace gaps function (with corrected newlines and indentation)
def replace_gaps(text, keep_gaps=True):
    """Replace various gap notations with standardized tokens.

    Short breaks become ``<gap>`` and long breaks become ``<big_gap>``.

    Args:
        text: Transliteration text. Missing values (anything for which
            ``pd.isna`` is true) are returned unchanged.
        keep_gaps: If True, keeps gap tokens (for test-like data).
                   If False, removes them (for clean training).

    Returns:
        The text with gap notations replaced. Whitespace is not collapsed here.
    """
    if pd.isna(text):
        return text

    # Bracketed gaps go first: once the plain dot/x rules below have fired,
    # the bracketed forms no longer exist in the text and would never match.
    text = re.sub(r'\[\s*(?:\.\s*){3,}\]', '<big_gap>', text)   # [...], [... ...]
    text = re.sub(r'\[x+\]', '<gap>', text)                      # [x], [xx]

    # Complex gap patterns (order matters)
    text = re.sub(r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+\s+\.{3}(?:\s+\.{3})+', '<big_gap>', text)
    text = re.sub(r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+', '<big_gap>', text)

    # Dot runs: a run of four or more dots is one long break (so that no stray
    # dots are left behind), as is any run of the Unicode ellipsis U+2026
    # (written as an escape to stay independent of the file's encoding).
    text = re.sub(r'\.{4,}', '<big_gap>', text)
    text = re.sub(r'\u2026+', '<big_gap>', text)
    text = re.sub(r'\.{3}(?:\s+\.{3})+', '<big_gap>', text)
    text = re.sub(r'\.\.\.', '<big_gap>', text)

    # x markers: a run of two or more x anywhere is one short break; a single
    # x counts only as a word of its own.  Lookarounds keep the neighbouring
    # whitespace unconsumed so that adjacent markers ("x x") are all replaced,
    # including at the start or end of the text.
    text = re.sub(r'x{2,}', '<gap>', text)
    text = re.sub(r'(?<!\S)x(?!\S)', '<gap>', text)

    if not keep_gaps:
        # Remove gaps for clean training
        text = text.replace('<big_gap>', '').replace('<gap>', '')

    return text

def clean_translit(text, keep_gaps=True):
    """Normalize transliteration following competition guidance.

    Steps: subscript normalisation, gap standardisation, removal of errant
    signs / half brackets / parenthesised comments, unwrapping of
    determinatives, scribal insertions and broken-sign brackets, removal of
    scribal punctuation, whitespace collapsing.

    Args:
        text: Raw transliteration.
        keep_gaps: Forwarded to ``replace_gaps``; when True the ``<gap>`` and
            ``<big_gap>`` tokens survive in the output with their brackets.

    Returns:
        The cleaned, trimmed text, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    text = normalize_subscripts(text)
    # Apply gap replacement - KEEP gaps for domain matching
    text = replace_gaps(text, keep_gaps=keep_gaps)
    # Only remove scribal markers, keep gaps.  Non-ASCII characters are written
    # as escapes so the patterns do not depend on the file's encoding.
    text = re.sub(r"<<[^>]*>>", " ", text)               # errant signs
    text = re.sub(r"[\u02f9\u02fa]", " ", text)          # half brackets
    text = re.sub(r"\([^)]*\)", " ", text)               # comments/erasures
    text = re.sub(r"\{([^}]*)\}", r"\1", text)           # determinatives
    # Scribal insertions keep their content; the lookahead stops this rule from
    # stripping the angle brackets off the gap tokens produced above.
    text = re.sub(r"<(?!(?:big_)?gap>)([^>]*)>", r"\1", text)
    text = re.sub(r"\[([^\]]*)\]", r"\1", text)          # broken signs keep content
    text = re.sub(r"[!?/:\u00b7]", " ", text)            # scribal punctuation
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def clean_translation(text, has_gaps=False):
    """Clean translation, optionally keeping gap indicators.

    Args:
        text: Translation text.
        has_gaps: When False (the default) every Unicode ellipsis (U+2026) is
            replaced by a space; when True the ellipsis is kept.

    Returns:
        The text with whitespace runs collapsed to single spaces and the ends
        trimmed, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    if not has_gaps:
        # Escape form of the ellipsis: independent of the file's encoding.
        text = text.replace("\u2026", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def filter_quality(df, min_words=3, min_ratio=0.2, max_ratio=5.0):
    """Keep only plausible translation pairs; the input frame is left untouched.

    Args:
        df: Frame with string columns ``transliteration`` and ``translation``.
        min_words: Minimum whitespace-separated word count required on both sides.
        min_ratio: Lowest accepted source/target word-count ratio (inclusive).
        max_ratio: Highest accepted source/target word-count ratio (inclusive).

    Returns:
        A new frame with the rows that pass the length and ratio checks, exact
        duplicate pairs removed (first occurrence kept), original index labels
        preserved.
    """
    # Word counts live in local Series rather than helper columns, so the
    # caller's frame is not modified as a side effect.
    src_len = df["transliteration"].str.split().str.len()
    tgt_len = df["translation"].str.split().str.len()
    ratio = src_len / tgt_len
    keep = (
        (src_len >= min_words)
        & (tgt_len >= min_words)
        & (ratio >= min_ratio)
        & (ratio <= max_ratio)
    )
    return df[keep].drop_duplicates(subset=["transliteration", "translation"])

# A5. Tokenization

In [6]:
# Read the parallel corpus, normalise both sides, filter it, and hold out a
# validation split.
_rule = "=" * 60
print(_rule)
print("LOADING & PREPROCESSING DATA")
print(_rule)

train_path = f"{DATA_DIR}/train.csv"
print(f"Loading data from: {train_path}")

train_df = pd.read_csv(train_path)
print(f"Original dataset size: {len(train_df)}")

# Drop rows missing either side, then run each column through its own cleaner.
train_df = train_df.dropna(subset=["transliteration", "translation"])
for _column, _cleaner in (
    ("transliteration", clean_translit),
    ("translation", clean_translation),
):
    train_df[_column] = train_df[_column].astype(str).apply(_cleaner)

# Length / ratio / duplicate filtering
train_df = filter_quality(train_df)
print(f"After quality filtering: {len(train_df)}")

# Seeded 90/10 split so the validation set is the same on every run
train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42)

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")

# Wrap both splits as HuggingFace Datasets; the held-out split is stored
# under the key "test".
_pair_columns = ["transliteration", "translation"]
dataset = {
    split_name: Dataset.from_pandas(frame[_pair_columns].reset_index(drop=True))
    for split_name, frame in (("train", train_data), ("test", val_data))
}

print(_rule)
print("‚úì Data loaded and preprocessed successfully")
print(_rule + "\n")

LOADING & PREPROCESSING DATA
Loading data from: /kaggle/input/deep-past-initiative-machine-translation/train.csv
Original dataset size: 1561
After quality filtering: 1528
Training samples: 1375
Validation samples: 153
‚úì Data loaded and preprocessed successfully



In [7]:

print("Loading Tokenizer from:", MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

def preprocess_function(examples):
    """Tokenize one batch of (transliteration, translation) pairs.

    Args:
        examples: Batch mapping with the list-valued keys ``transliteration``
            and ``translation``.

    Returns:
        The tokenizer output for the prefixed sources with an added ``labels``
        entry holding the target token ids.  Sequences are truncated to
        ``MAX_LENGTH`` but not padded.
    """
    # Prepend the task prefix: generation at validation time feeds the model
    # PREFIX + text, so the training inputs must have the same form.
    inputs = [PREFIX + doc for doc in examples["transliteration"]]
    targets = examples["translation"]

    # No padding here.  The DataCollatorForSeq2Seq used by the trainers pads
    # each batch (labels with -100), which avoids carrying MAX_LENGTH-sized
    # rows of padding for every short example.
    model_inputs = tokenizer(inputs, max_length=MAX_LENGTH, truncation=True)

    # text_target selects the target-side tokenization path.
    labels = tokenizer(text_target=targets, max_length=MAX_LENGTH, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Process datasets
tokenized_train = dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
tokenized_val = dataset["test"].map(preprocess_function, batched=True, remove_columns=dataset["test"].column_names)


Loading Tokenizer from: /kaggle/input/models-for-dpc/pretrained_models/byt5-base


Map:   0%|          | 0/1375 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

# A6. Model Setup

In [8]:
# Load the pretrained seq2seq checkpoint.
print(f"Loading Model from: {MODEL_PATH}")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

# Batch collator shared by the trainers below; label positions it pads are
# filled with -100.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
    label_pad_token_id=-100,
)

Loading Model from: /kaggle/input/models-for-dpc/pretrained_models/byt5-base


# A6.5. Optional: Monolingual Pre-Training on Akkadian Texts

This step teaches the model Akkadian grammar and morphology BEFORE translation training.
Uses published_texts.csv (8,000+ Akkadian texts) with Masked Language Modeling (MLM).

Benefits:
- Model learns to handle gaps naturally
- Better understanding of Akkadian word structure
- Improves low-resource translation performance

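The cell below reads the `ENABLE_MONO_PRETRAIN` environment variable as an integer flag (`1` = run pre-training, `0` = skip it). Pre-training adds roughly 30 minutes of training time.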

In [None]:
# Monolingual Pre-Training Configuration
# The environment variable ENABLE_MONO_PRETRAIN (1/0) takes precedence; when it
# is unset, the ENABLE_MONO_PRETRAIN constant assigned earlier in the notebook
# is kept instead of being silently forced to 1.
ENABLE_MONO_PRETRAIN = bool(int(os.getenv("ENABLE_MONO_PRETRAIN", str(int(ENABLE_MONO_PRETRAIN)))))

if ENABLE_MONO_PRETRAIN:
    print("\n" + "="*60)
    print("MONOLINGUAL PRE-TRAINING ON AKKADIAN TEXTS")
    print("="*60)

    pub_texts_path = f"{DATA_DIR}/published_texts.csv"

    if os.path.exists(pub_texts_path):
        # Load Akkadian-only texts, clean them like the translation sources,
        # and keep those with 5-200 whitespace-separated tokens.
        pub_texts_df = pd.read_csv(pub_texts_path)
        akkadian_texts = pub_texts_df['transliteration'].dropna().astype(str).tolist()
        akkadian_texts = [clean_translit(t, keep_gaps=True) for t in akkadian_texts]
        akkadian_texts = [t for t in akkadian_texts if 5 <= len(t.split()) <= 200]
        akkadian_texts = akkadian_texts[:5000]  # Limit for time

        print(f"Loaded {len(akkadian_texts)} Akkadian texts for pre-training")

        def create_mlm_examples(texts, mask_ratio=0.15):
            """Build masked-token examples from whitespace-tokenized texts.

            For each text with at least 5 tokens, ``mask_ratio`` of the tokens
            (at least one) are chosen at random without replacement.  The input
            is the text with every chosen token replaced by ``<extra_id_0>``;
            the target is the chosen tokens joined by spaces in the order they
            occur in the text.

            Returns:
                A list of dicts with the keys ``transliteration`` (masked
                input) and ``translation`` (target).
            """
            mlm_examples = []
            for text in texts:
                tokens = text.split()
                if len(tokens) < 5:
                    continue

                n_mask = max(1, int(len(tokens) * mask_ratio))
                # np.random.choice returns the positions in random order; sort
                # them so the target lists the hidden tokens in reading order
                # rather than in a different shuffle for every example.
                mask_positions = sorted(
                    np.random.choice(len(tokens), size=n_mask, replace=False).tolist()
                )
                masked = set(mask_positions)

                input_text = " ".join(
                    "<extra_id_0>" if i in masked else token  # sentinel-style token
                    for i, token in enumerate(tokens)
                )
                target_text = " ".join(tokens[i] for i in mask_positions)

                mlm_examples.append({
                    "transliteration": input_text,
                    "translation": target_text
                })

            return mlm_examples

        mlm_data = create_mlm_examples(akkadian_texts)
        print(f"Created {len(mlm_data)} MLM training examples")

        # Create MLM dataset
        mlm_dataset = Dataset.from_pandas(pd.DataFrame(mlm_data))

        def preprocess_mlm(examples):
            """Tokenize one batch of masked-input / target pairs.

            Sequences are truncated to ``MAX_LENGTH`` and left unpadded; the
            data collator passed to the trainer pads each batch.
            """
            # NOTE(review): the denoising inputs reuse the translation PREFIX --
            # confirm that sharing the prefix between both tasks is intended.
            inputs = [PREFIX + doc for doc in examples["transliteration"]]
            targets = examples["translation"]
            model_inputs = tokenizer(inputs, max_length=MAX_LENGTH, truncation=True)
            labels = tokenizer(text_target=targets, max_length=MAX_LENGTH, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        # Drop the raw string columns so only model inputs remain.
        tokenized_mlm = mlm_dataset.map(
            preprocess_mlm,
            batched=True,
            remove_columns=mlm_dataset.column_names,
        )

        # UPDATED MLM ARGUMENTS (Safe Mode)
        mlm_args = Seq2SeqTrainingArguments(
            output_dir=f"{OUTPUT_DIR}_mlm",
            num_train_epochs=1,
            learning_rate=2e-4,              # Lower LR for stability

            # MEMORY & STABILITY FIXES
            per_device_train_batch_size=1,   # Batch size 1 prevents OOM
            gradient_accumulation_steps=16,  # 16 accumulation steps per update
            fp16=False,                      # MUST BE FALSE for ByT5

            save_strategy="no",
            eval_strategy="no",
            logging_steps=50,
            report_to="none"
        )

        # processing_class replaces the older tokenizer argument
        mlm_trainer = Seq2SeqTrainer(
            model=model,
            args=mlm_args,
            train_dataset=tokenized_mlm,
            processing_class=tokenizer,
            data_collator=data_collator,
        )

        print("Starting monolingual pre-training (1 epoch on Akkadian texts)...")
        try:
            mlm_trainer.train()
            print("\u2713 Monolingual pre-training complete")
            print("Model now understands Akkadian grammar and gaps better!")
        except Exception as e:
            # Deliberately best-effort: the main training can still run.
            print(f"\u26a0\ufe0f  MLM pre-training failed: {e}")
            print("Continuing with main training...")

    else:
        print("\u26a0\ufe0f  published_texts.csv not found, skipping monolingual pre-training")
else:
    print("\n\u26a0\ufe0f  Monolingual pre-training disabled (set ENABLE_MONO_PRETRAIN=1 to enable)")


MONOLINGUAL PRE-TRAINING ON AKKADIAN TEXTS
Loaded 5000 Akkadian texts for pre-training
Created 5000 MLM training examples


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Starting monolingual pre-training (1 epoch on Akkadian texts)...




Step,Training Loss
50,2.0273


In [None]:
# Quick data stats after mining and merge
# No cell visible in this notebook assigns `mined_df`, so it is looked up
# defensively: a missing (or non-DataFrame) value counts as zero mined pairs
# instead of stopping a fresh run with a NameError.
_mined_df = globals().get("mined_df")
_mined_count = len(_mined_df) if isinstance(_mined_df, pd.DataFrame) else 0
sup_count_est = len(train_df) - _mined_count
print("\n=== DATASET COUNTS ===")
print(f"Supervised pairs (est.): {sup_count_est}")
print(f"Mined pairs: {_mined_count}")
print(f"Total pairs: {len(train_df)}")

In [None]:
# Clear GPU memory after monolingual pre-training to prevent OOM
# The pre-training objects only exist when that cell actually ran, so they are
# removed with pop(..., None): a bare `del` raises NameError when pre-training
# was disabled or its input file was missing.
for _name in ("mlm_trainer", "mlm_dataset"):
    globals().pop(_name, None)
# Collect first so that memory held by unreachable objects is back in the
# allocator's cache before the cache is released.
gc.collect()
torch.cuda.empty_cache()
print("Memory cleared for main training.")

# A7. Training Arguments

In [None]:

# A7. UPDATED TRAINING ARGS (THE PURIST)
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # lowest eval_loss when training ends, keeping at most two on disk.
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,

    # MEMORY & STABILITY FIXES
    learning_rate=3e-4,
    per_device_train_batch_size=1,       # Batch size 1 per device
    gradient_accumulation_steps=32,      # 32 micro-batches per optimizer step
                                         # (per device -- TODO confirm the
                                         # effective batch when 2 GPUs are used)
    fp16=False,                          # MUST BE FALSE

    num_train_epochs=15,
    gradient_checkpointing=True,         # Saves huge amount of memory
    predict_with_generate=True,
    generation_max_length=MAX_LENGTH,    # Tied to the tokenization budget instead of a copied literal
    report_to="none"
)

model.config.use_cache = False  # Disable cache to silence warnings
print("\u2713 Configured for ByT5 Purist Strategy")


# A8. Trainer

In [None]:

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Force aggressive memory cleanup.  Garbage is collected first so that memory
# held by unreachable objects is back in the allocator's cache by the time the
# cache is released; the other order leaves that memory cached.
gc.collect()
torch.cuda.empty_cache()

# Main fine-tuning trainer: translation pairs for training, held-out pairs for
# the per-epoch evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=data_collator,
)


# A9. Execution

In [None]:
gc.collect()
torch.cuda.empty_cache()

# The banner reports the values actually held by training_args rather than a
# fixed description that can drift from the configuration.
_scheduler = getattr(training_args.lr_scheduler_type, "value", training_args.lr_scheduler_type)
print("="*60)
print("STARTING OPTIMIZED TRAINING - ByT5")
print("="*60)
print("\nTraining Strategy:")
print(f"\u2713 {training_args.num_train_epochs:g} epochs with evaluation each epoch")
print(f"\u2713 Learning-rate schedule: {_scheduler}")
print(f"\u2713 Best model selection based on {training_args.metric_for_best_model}")
print(f"\u2713 Label smoothing factor: {training_args.label_smoothing_factor}")
print(f"\u2713 Gradient clipping at max_grad_norm={training_args.max_grad_norm}")
print("\nExpected improvements:")
print("\u2022 Better handling of Akkadian morphology (character-level)")
print("\u2022 Reduced overfitting through regularization")
print("\u2022 Higher BLEU/chrF++ scores from beam search")
print("="*60 + "\n")

# OOM-safe training wrapper with recovery.
# Only the error message is kept; the recovery itself runs after the `except`
# clause has ended.  While the handler is active, the exception's traceback
# keeps the failed step's frames (and the tensors they reference) alive, so
# memory freed inside the handler would not actually become available.
oom_message = None
try:
    trainer.train()
    print("\n\u2713 Training completed successfully!")
except RuntimeError as e:
    if "out of memory" not in str(e).lower():
        raise
    oom_message = str(e)

if oom_message is not None:
    print("\n[WARNING] CUDA OOM detected. Implementing recovery strategy...")

    # Strategy 1: drop the failed trainer and release cached memory.
    # (gradient_accumulation_steps is left alone: it changes how many
    # micro-batches feed one update, not how much memory a micro-batch needs.)
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    # Strategy 2: shorten the sequence budget AND re-tokenize with it.  The
    # datasets were tokenized with the old limit, so changing the constant
    # alone would leave the training inputs exactly as long as before.
    MAX_LENGTH = max(200, int(MAX_LENGTH * 0.9))
    print(f"  \u2192 Reduced MAX_LENGTH to {MAX_LENGTH}")
    tokenized_train = dataset["train"].map(
        preprocess_function, batched=True, remove_columns=dataset["train"].column_names
    )
    tokenized_val = dataset["test"].map(
        preprocess_function, batched=True, remove_columns=dataset["test"].column_names
    )
    training_args.generation_max_length = MAX_LENGTH

    # Retry with adjusted settings
    print("  \u2192 Retrying training with adjusted settings...")
    try:
        # Recreate trainer with the re-tokenized data; processing_class matches
        # the argument name used by the other trainers in this notebook.
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            processing_class=tokenizer,
            data_collator=data_collator,
        )
        trainer.train()
        print("\u2713 Training completed with adjusted settings!")
    except Exception as retry_error:
        print(f"\u2717 Training failed even after adjustment: {retry_error}")
        print("Suggestions:")
        print("  1. Lower MAX_LENGTH in the configuration cell")
        print("  2. Use a smaller pretrained checkpoint")
        print("  3. Keep gradient_checkpointing enabled and the per-device batch size at 1")
        raise

print("\n" + "="*60)
print("TRAINING PHASE COMPLETE")
print("="*60)

In [None]:
# POST-TRAINING VALIDATION WITH ENHANCED METRICS
_rule = "=" * 60
for _line in (
    "\n" + _rule,
    "POST-TRAINING VALIDATION - COMPREHENSIVE EVALUATION",
    _rule,
    "Computing metrics: BLEU, chrF++, and Geometric Mean",
    "(Following Deep Past Challenge evaluation methodology)",
    _rule + "\n",
):
    print(_line)

# Metric objects from the `evaluate` library, used further down for scoring.
metric_bleu = evaluate.load("sacrebleu")
metric_chrf = evaluate.load("chrf")

def dedup_repeats(text: str) -> str:
    """Cap runs of an identical token at two occurrences.

    The text is split on whitespace; a token is dropped when the two tokens
    kept immediately before it are both equal to it.  The surviving tokens are
    joined with single spaces.
    """
    kept = []
    for token in text.split():
        # True only when at least two tokens were kept and both equal `token`.
        third_in_a_row = kept[-2:] == [token, token]
        if not third_in_a_row:
            kept.append(token)
    return " ".join(kept)

def postprocess_text(preds):
    """Tidy generated translations before scoring.

    Each prediction is processed as follows:
    1. trim surrounding whitespace;
    2. remove whitespace before ``. , ! ? ; :`` and insert a space between
       such a mark and a directly following ASCII letter;
    3. cap runs of an identical token via ``dedup_repeats``;
    4. upper-case a lower-case first character;
    5. append ``.`` when the text does not end in ``.``, ``!`` or ``?``;
    6. collapse any run of two or more ``.``/``!``/``?`` into a single ``.``
       (so ``...`` also becomes ``.``).

    Returns:
        A list with one cleaned string per input prediction.
    """
    def _tidy(raw):
        s = raw.strip()
        s = re.sub(r"\s+([.,!?;:])", r"\1", s)
        s = re.sub(r"([.,!?;:])([A-Za-z])", r"\1 \2", s)
        s = dedup_repeats(s)
        if s[:1].islower():
            s = s[0].upper() + s[1:]
        if s and not s.endswith((".", "!", "?")):
            s += "."
        return re.sub(r"([.!?]){2,}", ".", s).strip()

    return [_tidy(p) for p in preds]

# Validation sources, and references in the one-list-per-sample shape that is
# passed to the metric calls below.
val_texts = dataset["test"]["transliteration"]
val_refs = [[reference] for reference in dataset["test"]["translation"]]

print(f"Validating on {len(val_texts)} samples...")
print("Using beam search with num_beams=8 for higher quality\n")

def generate_batch(texts, num_beams=8):
    """Translate a batch of transliterations with deterministic beam search.

    Args:
        texts: sequence of source strings; ``PREFIX`` is prepended to each.
        num_beams: beam width forwarded to ``model.generate``.

    Returns:
        list[str]: one decoded string per input, special tokens removed.
        An empty input yields an empty list.
    """
    if len(texts) == 0:
        return []

    # generate() does not change the module's train/eval flag. After training the
    # model can still be in training mode, which keeps dropout active and makes
    # "deterministic" evaluation noisy, so switch to eval mode explicitly.
    model.eval()

    batch_inputs = [PREFIX + doc for doc in texts]
    enc = tokenizer(
        batch_inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    gen = model.generate(
        **enc,
        max_length=MAX_LENGTH,
        min_length=6,
        num_beams=num_beams,              # Higher beams
        no_repeat_ngram_size=3,           # Prevent repetition
        length_penalty=1.2,               # Slightly favor longer outputs
        early_stopping=True,
        repetition_penalty=1.1,           # Additional repetition penalty
        do_sample=False,                  # Deterministic for evaluation
    )
    return tokenizer.batch_decode(gen, skip_special_tokens=True)

# Generate predictions
preds = []
batch_size = 4  # Smaller batches for stability
for i in range(0, len(val_texts), batch_size):
    batch_preds = generate_batch(val_texts[i:i+batch_size])
    preds.extend(batch_preds)
    if (i // batch_size + 1) % 10 == 0:
        # Clamp so a partial final batch never reports more samples than exist.
        processed = min(i + batch_size, len(val_texts))
        print(f"  Progress: {processed}/{len(val_texts)} samples processed")

# Same surface clean-up that is applied to submitted predictions.
preds = postprocess_text(preds)

# Compute all metrics
print("\nComputing metrics...")
bleu_result = metric_bleu.compute(predictions=preds, references=val_refs)
bleu_score = bleu_result['score']

# word_order=2 turns chrF into chrF++ (adds word uni-/bi-grams).
chrf_result = metric_chrf.compute(predictions=preds, references=val_refs, word_order=2)
chrf_score = chrf_result['score']

# Geometric mean (competition metric)
import math
geo_mean = math.sqrt(bleu_score * chrf_score)

# Display results
# The symbols in these messages were mis-encoded (UTF-8 bytes shown as MacRoman);
# they are restored to the intended characters here.
print("\n" + "="*60)
print("VALIDATION RESULTS")
print("="*60)
print(f"Samples evaluated:  {len(val_texts)}")
print()
print(f"BLEU Score:         {bleu_score:7.2f}")
print(f"chrF++ Score:       {chrf_score:7.2f}")
print()
print(f"🏆 GEOMETRIC MEAN:  {geo_mean:7.2f}  ← Challenge Metric")
print("="*60)

# Show sample predictions
print("\n📊 SAMPLE PREDICTIONS (first 3):")
print("="*60)
for i in range(min(3, len(val_texts))):
    print(f"\nExample {i+1}:")
    print(f"  Source: {val_texts[i][:80]}...")
    print(f"  Target: {val_refs[i][0][:80]}...")
    print(f"  Prediction: {preds[i][:80]}...")
print("="*60 + "\n")

# Score interpretation (thresholds are the notebook's own rule of thumb)
if geo_mean >= 35:
    print("🌟 EXCELLENT! Score is competition-winning level!")
elif geo_mean >= 30:
    print("✨ GREAT! Score is strong, top quartile expected.")
elif geo_mean >= 25:
    print("✓ GOOD! Score is solid, room for improvement.")
else:
    print("⚠️  Score needs improvement. Consider:")
    print("   • More training epochs")
    print("   • Better data augmentation")
    print("   • Hyperparameter tuning")

print("\n" + "="*60)
print("VALIDATION COMPLETE")
print("="*60 + "\n")

# A10. Save Final Model

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side in OUTPUT_DIR so the
# directory can be reloaded later as a self-contained checkpoint.
print(f"Saving model to {OUTPUT_DIR}...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Notebook A Complete.")

In [None]:
# A11. Optional Self-Training Augmentation (Small, OOM-Safe)
# Pseudo-labels unlabeled transliterations with the current model, filters the pairs,
# then fine-tunes for one more epoch on train.csv + the retained pseudo pairs.
ENABLE_SELF_TRAIN = True
MAX_PSEUDO = int(os.getenv("BYT5_MAX_PSEUDO", "500"))  # keep small to avoid OOM
PSEUDO_BATCH_SIZE = 8  # small batch to avoid OOM

if ENABLE_SELF_TRAIN:
    print("\n=== SELF-TRAINING AUGMENTATION (ByT5) ===")
    pub_path = f"{DATA_DIR}/published_texts.csv"
    if os.path.exists(pub_path):
        pub_df = pd.read_csv(pub_path)
        translits = pub_df.get("transliteration", pd.Series([], dtype="object")).dropna().astype(str).tolist()
        translits = [clean_translit(t) for t in translits]
        translits = [t for t in translits if 5 <= len(t.split()) <= 180]
        translits = translits[:MAX_PSEUDO]
        print(f"Generating pseudo translations for {len(translits)} extra transliterations...")

        # Named differently from the validation cell's generate_batch(texts, num_beams=8)
        # so that helper is not silently replaced by one with another signature.
        def generate_pseudo_batch(texts):
            """Beam-search translations for one batch of unlabeled transliterations."""
            # generate() does not change the train/eval flag; make sure dropout is off.
            model.eval()
            batch_inputs = [PREFIX + doc for doc in texts]
            enc = tokenizer(batch_inputs, max_length=MAX_LENGTH, truncation=True, padding=True, return_tensors="pt").to(model.device)
            gen = model.generate(
                **enc,
                max_length=min(MAX_LENGTH, 400),
                min_length=6,
                num_beams=6,
                no_repeat_ngram_size=3,
                length_penalty=1.05,
                early_stopping=True,
            )
            return tokenizer.batch_decode(gen, skip_special_tokens=True)

        pseudo_trans = []
        for start in range(0, len(translits), PSEUDO_BATCH_SIZE):
            try:
                pseudo_trans.extend(generate_pseudo_batch(translits[start:start + PSEUDO_BATCH_SIZE]))
            except RuntimeError as e:
                if "out of memory" not in str(e).lower():
                    raise
                print("[WARNING] OOM during pseudo generation; skipping remaining.")
                # Release cached blocks so the training stage below has memory to work with.
                gc.collect()
                torch.cuda.empty_cache()
                break

        # Postprocess with the helper defined in the validation cell (identical logic
        # was previously duplicated here).
        pseudo_trans = postprocess_text(pseudo_trans)
        # Generation may have stopped early, so pair only the sources that got an output.
        aug_df = pd.DataFrame({"transliteration": translits[:len(pseudo_trans)], "translation": pseudo_trans})
        if len(aug_df):
            src_len = aug_df["transliteration"].str.split().str.len()
            tgt_len = aug_df["translation"].str.split().str.len()
            # No clipping here: clipping the ratio to 6 first made the `<= 6` bound a no-op.
            ratio = tgt_len / src_len
            aug_df = aug_df[(tgt_len >= 4) & (ratio >= 0.5) & (ratio <= 6)]

        base_train = pd.read_csv(f"{DATA_DIR}/train.csv")
        base_train = base_train.dropna(subset=["transliteration", "translation"]).astype(str)
        base_train["transliteration"] = base_train["transliteration"].map(clean_translit)
        base_train["translation"] = base_train["translation"].map(clean_translation)

        # Never add a machine-generated target for a source that already has a gold translation.
        aug_df = aug_df[~aug_df["transliteration"].isin(base_train["transliteration"])]
        print(f"Pseudo pairs retained after filtering: {len(aug_df)}")

        combined = pd.concat([
            base_train[["transliteration", "translation"]],
            aug_df[["transliteration", "translation"]]
        ], axis=0).drop_duplicates().reset_index(drop=True)
        print(f"Total combined training pairs: {len(combined)}")

        ds_combined = Dataset.from_pandas(combined)

        def preprocess_function_aug(examples):
            """Tokenize sources and targets; padded label positions become -100."""
            inputs = [PREFIX + ex for ex in examples["transliteration"]]
            targets = examples["translation"]
            model_inputs = tokenizer(
                inputs,
                max_length=MAX_LENGTH,
                truncation=True,
                padding="max_length"
            )
            # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
            labels = tokenizer(
                text_target=targets,
                max_length=MAX_LENGTH,
                truncation=True,
                padding="max_length"
            )
            # -100 is ignored by the loss, so padding does not contribute to training.
            model_inputs["labels"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label]
                for label in labels["input_ids"]
            ]
            return model_inputs

        tokenized_combined = ds_combined.map(preprocess_function_aug, batched=True)

        training_args_aug = Seq2SeqTrainingArguments(
            output_dir=OUTPUT_DIR,
            save_strategy="no",
            eval_strategy="no",
            load_best_model_at_end=False,
            learning_rate=2.5e-4,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=16,
            num_train_epochs=1,  # keep short to avoid OOM/time
            fp16=True,
            report_to="none"
        )
        trainer_aug = Seq2SeqTrainer(
            model=model,
            args=training_args_aug,
            train_dataset=tokenized_combined,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )
        print("Starting second-stage training (ByT5) with augmented data...")
        try:
            trainer_aug.train()
        except RuntimeError as e:
            # A failed or partial run must not overwrite the checkpoint saved after stage 1.
            print(f"[WARNING] Augmentation training skipped due to error: {e}")
            print(f"Not saving; the checkpoint already in {OUTPUT_DIR} is left untouched.")
            gc.collect()
            torch.cuda.empty_cache()
        else:
            print("Augmentation stage complete.")

            print(f"Saving augmented model to {OUTPUT_DIR}...")
            trainer_aug.save_model(OUTPUT_DIR)
            tokenizer.save_pretrained(OUTPUT_DIR)
    else:
        print("published_texts.csv not found; skipping self-training.")

## 🎯 NEXT STEPS: Advanced Strategies for Higher Scores

The optimized training configuration above should achieve **strong baseline scores** (geometric mean ~28-35). To push toward **competition-winning performance (35+)**, consider these advanced strategies:

In [None]:
"""
ADVANCED TRAINING STRATEGIES FOR SCORE IMPROVEMENT
====================================================

If current scores are below target (geometric mean < 30), try these techniques:

1. DATA AUGMENTATION
   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   • Self-training: Use model predictions on the unlabeled transliterations in published_texts.csv
   ‚Ä¢ Back-translation: Translate English ‚Üí Akkadian ‚Üí English
   ‚Ä¢ Paraphrase generation: Create variations of training pairs
   
   Implementation:
   ```
   # Generate pseudo-labels from published_texts.csv (publications.csv holds page text, not transliterations)
   unlabeled_texts = pd.read_csv('published_texts.csv')['transliteration']
   pseudo_labels = [model.generate(...) for text in unlabeled_texts]
   augmented_data = Dataset.from_dict({
       'transliteration': unlabeled_texts,
       'translation': pseudo_labels
   })
   combined_dataset = concatenate_datasets([dataset['train'], augmented_data])
   ```

2. CURRICULUM LEARNING
   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   ‚Ä¢ Train on easy examples first, gradually increase difficulty
   ‚Ä¢ Sort by sentence length, gaps count, or complexity
   
   Implementation:
   ```
   # Sort training data by length (simple ‚Üí complex)
   train_df = pd.DataFrame(dataset['train'])
   train_df['src_len'] = train_df['transliteration'].str.split().str.len()
   train_df = train_df.sort_values('src_len')
   
   # Train in stages
   for stage, max_len in enumerate([30, 60, 100, 200]):
       stage_data = train_df[train_df['src_len'] <= max_len]
       # Train for 5 epochs on this stage
   ```

3. ENSEMBLE WITHIN BYT5
   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   ‚Ä¢ Train multiple ByT5 models with different seeds
   ‚Ä¢ Average their predictions for better stability
   
   Implementation:
   ```
   models = []
   for seed in [42, 123, 456]:
       set_seed(seed)
       model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
       trainer = Seq2SeqTrainer(...)
       trainer.train()
       models.append(model)
   
   # Ensemble predictions
   all_preds = [model.generate(...) for model in models]
   final_pred = voting_mechanism(all_preds)  # Majority vote or averaging
   ```

4. ADVANCED HYPERPARAMETER TUNING
   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   ‚Ä¢ Learning rate scheduling: Try polynomial decay or OneCycleLR
   ‚Ä¢ Epoch extension: 25-30 epochs with early stopping patience=5
   ‚Ä¢ Regularization: Increase dropout (0.1 ‚Üí 0.15), weight decay (0.01 ‚Üí 0.05)
   
   Implementation:
   ```
   training_args.num_train_epochs = 30
   training_args.lr_scheduler_type = "polynomial"  # or "cosine_with_restarts"
   training_args.learning_rate = 3e-5  # Try 3e-5, 4e-5, 6e-5
   training_args.warmup_ratio = 0.1
   ```

5. POST-PROCESSING ENHANCEMENT
   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   ‚Ä¢ Language model scoring: Re-rank beam outputs with GPT-2
   ‚Ä¢ Rule-based fixes: Correct common errors (articles, plurality)
   ‚Ä¢ Length normalization: Penalize too-short/too-long outputs
   
   Implementation:
   ```
   from transformers import GPT2LMHeadModel, GPT2Tokenizer
   
   lm = GPT2LMHeadModel.from_pretrained('gpt2')
   lm_tok = GPT2Tokenizer.from_pretrained('gpt2')
   
   def rerank_with_lm(candidates):
       scores = []
       for cand in candidates:
           inputs = lm_tok(cand, return_tensors='pt')
           with torch.no_grad():
               score = -lm(**inputs, labels=inputs['input_ids']).loss.item()  # negative mean token NLL (higher = more fluent)
           scores.append(score)
       return candidates[np.argmax(scores)]
   ```

6. DATA MINING OPTIMIZATION
   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   ‚Ä¢ Use publications.csv more effectively
   ‚Ä¢ Extract patterns from high-quality translation pairs
   ‚Ä¢ Filter low-quality augmented data
   
   Implementation:
   ```
   # Score data quality
   def quality_score(src, tgt):
       length_ratio = len(tgt.split()) / max(len(src.split()), 1)
       has_gaps = '<gap>' in src.lower()
       return length_ratio * (0.8 if has_gaps else 1.0)
   
   # Keep only high-quality augmented pairs
   augmented_data = augmented_data.filter(
       lambda x: quality_score(x['transliteration'], x['translation']) > 0.5
   )
   ```

7. ARCHITECTURE MODIFICATIONS
   ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   ‚Ä¢ Freeze encoder for first 5 epochs (faster convergence)
   ‚Ä¢ Gradually unfreeze layers (discriminative fine-tuning)
   
   Implementation:
   ```
   # Freeze encoder initially
   for param in model.encoder.parameters():
       param.requires_grad = False
   
   # Train decoder only for 5 epochs
   trainer.train(max_steps=...)
   
   # Unfreeze and continue
   for param in model.encoder.parameters():
       param.requires_grad = True
   trainer.train()  # Continue training
   ```

SCORING TARGETS
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Current optimized config: ~28-32 geometric mean (expected baseline)
With 1-2 techniques above: ~32-36 (competitive)
With 3+ techniques above: 36+ (top quartile)

RECOMMENDED PRIORITY ORDER
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
1. Try self-training augmentation first (biggest impact)
2. Extend to 25-30 epochs with better LR schedule
3. Ensemble with multiple seeds (stability boost)
4. Post-processing with LM re-ranking (final polish)

Remember: Geometric mean = √(BLEU × chrF++)
- BLEU rewards exact matches (focus on common phrases)
- chrF++ rewards character overlap (focus on morphology)
- Balance both for optimal score
"""

# Banner confirming the reference text above was evaluated.
# The emoji and arrows were mis-encoded in these literals; restored to the intended characters.
print("="*60)
print("📚 ADVANCED STRATEGIES REFERENCE LOADED")
print("="*60)
print("Implement these techniques to push scores from ~30 to 35+")
print("Priority: Data Augmentation → Extended Training → Ensemble")
print("="*60)

## 🎯 NEXT STEPS: Advanced Strategies for Higher Scores

The optimized ByT5 configuration should reach strong baseline scores (geometric mean ~28–35). Push toward competition-winning performance (35+) with:

- Data augmentation: self-training on unlabeled texts, back-translation, paraphrase variants.
- Curriculum learning: train on simple → complex data (by length/gap count).
- Ensembles: train multiple ByT5 seeds and average predictions.
- Extended training: increase epochs (25–30), adjust LR scheduling/warmup.
- Post-processing: LM re-ranking and rule-based fixes (articles, punctuation, repetition).
- Encoder freezing: freeze encoder for first epochs, then unfreeze to stabilize training.

In [None]:
# Extend training and generation parameters (safe toggles)
# These assignments only modify the arguments object; no training is started in this cell.
training_args.num_train_epochs = max(getattr(training_args, "num_train_epochs", 20), 25)
training_args.lr_scheduler_type = "cosine_with_restarts"
training_args.warmup_ratio = 0.1
training_args.weight_decay = 0.01
# generation_num_beams exists on Seq2SeqTrainingArguments but is None unless set,
# and max(None, 8) raises TypeError; treat a missing/None value as 1 beam.
current_beams = getattr(training_args, "generation_num_beams", None) or 1
training_args.generation_num_beams = max(current_beams, 8)

print("Next steps applied: epochs>=25, cosine restarts, beams>=8.")
print("Consider: self-training augmentation, multi-seed ensembles, LM re-ranking.")

## 🛠️ Data Mining (Akkadian-only) from publications.csv

**⚠️ IMPORTANT: Run this section AFTER completing the main training pipeline above, or run it independently in a separate session.**

Goal: Extract English translation segments from `publications.csv` pages that contain Akkadian transliterations (`has_akkadian == true`).

Pipeline:
- Stream `publications.csv` (580MB) in chunks to handle memory constraints.
- Filter rows where `has_akkadian == true` only.
- Clean OCR text, normalize Unicode, remove headers/footers.
- Detect English sentences; optionally translate non-English to English using MarianMT.
- Save extracted sentences to `mined_publications_en.csv` for later augmentation.

In [None]:
!pip install -q rapidfuzz langdetect ftfy unidecode nltk
import nltk
# sent_tokenize reads "punkt" on older NLTK releases and "punkt_tab" on newer ones.
# Fetch both: if the tokenizer data is missing, english_sentences() falls back to a
# crude regex split. An unknown resource name only logs an error and returns False.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

import os
import re
import csv
from pathlib import Path
import pandas as pd
from ftfy import fix_text
from unidecode import unidecode
from langdetect import detect, DetectorFactory
from nltk.tokenize import sent_tokenize

DetectorFactory.seed = 42

# Config paths (each can be overridden through an environment variable)
PUBS_PATH = os.environ.get('PUBLICATIONS_CSV', 'publications.csv')
OUT_PATH = os.environ.get('MINED_PUBLICATIONS_OUT', 'mined_publications_en.csv')
CHUNKSIZE = int(os.environ.get('PUBS_CHUNKSIZE', '5000'))
# Only the exact (case-insensitive) value "true" enables machine translation.
TRANSLATE_NON_EN = 'true' == os.environ.get('TRANSLATE_NON_EN', 'false').lower()

# Optional translator (loaded lazily if enabled); both stay None until first use.
translator_tokenizer = translator_model = None

def lazy_load_translator():
    """Populate the module-level translator tokenizer/model on first use.

    Does nothing when both globals are already set; otherwise loads the
    'Helsinki-NLP/opus-mt-mul-en' checkpoint for both.
    """
    global translator_tokenizer, translator_model
    if translator_tokenizer is not None and translator_model is not None:
        return
    # Imported here so the dependency is only touched when translation is requested.
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    checkpoint = 'Helsinki-NLP/opus-mt-mul-en'
    translator_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    translator_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

def machine_translate_to_en(text: str) -> str:
    """Translate one string to English with the lazily loaded translator model.

    Returns the first decoded beam-search output (5 beams, at most 256 tokens).
    """
    lazy_load_translator()
    encoded = translator_tokenizer(text, truncation=True, padding=True, return_tensors='pt')
    generated = translator_model.generate(**encoded, max_length=256, num_beams=5)
    decoded = translator_tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]

def normalize_text(x: str) -> str:
    """Clean one page of text; non-string input yields ''.

    Steps, in order: ``fix_text`` repair, CR/TAB to space, whitespace collapse,
    removal of known header/footer phrases and a trailing 1-3 digit number,
    ``unidecode`` transliteration to ASCII, final whitespace collapse.
    """
    if not isinstance(x, str):
        return ''
    # Running headers, author names and a page number at the very end of the page.
    ocr_noise = (
        r'Kleine Mitteilungen',
        r'INDIVIDUAL AND FAMILY',
        r'THE ASSYRIAN COLONY AT KANESH',
        r'Jan Gerrit Dercksen',
        r'MOGENS TROLLE LARSEN',
        r'\b\d{1,3}\b\s*$',
    )
    text = re.sub(r'[\r\t]', ' ', fix_text(x))
    text = re.sub(r'\s+', ' ', text).strip()
    # Applied one after another: an earlier removal can expose a trailing page number.
    for noise in ocr_noise:
        text = re.sub(noise, ' ', text, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', unidecode(text)).strip()

def english_sentences(text: str):
    """Return the English sentences found in ``text``.

    The text is split with NLTK's ``sent_tokenize``; if that fails (for example
    because tokenizer data is missing) a plain split on ".", "!" and "?" is used
    instead. Every candidate then goes through the same language check:
    ``langdetect`` first, and a common-English-word heuristic when detection
    raises. Non-English candidates are machine-translated only when
    ``TRANSLATE_NON_EN`` is true; translation failures are skipped.

    Args:
        text: page text; non-string or blank input returns an empty list.

    Returns:
        list[str]: stripped sentences, in document order.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    try:
        candidates = sent_tokenize(text)
    except Exception:
        # Degrade to a crude splitter, but still language-filter the pieces below.
        # Previously this path returned every fragment unfiltered.
        candidates = re.split(r'[.!?]', text)

    sents = []
    for s in candidates:
        s_clean = s.strip()
        if not s_clean:
            continue
        try:
            lang_ok = detect(s_clean) == 'en'
        except Exception:
            # detect() raised on this input: fall back to spotting very common English words.
            lang_ok = bool(re.search(r'\b(the|and|of|to|in|for|with|on|as|is|are)\b', s_clean, flags=re.IGNORECASE))
        if lang_ok:
            sents.append(s_clean)
        elif TRANSLATE_NON_EN:
            try:
                s_en = machine_translate_to_en(s_clean).strip()
            except Exception:
                # Translation is best-effort; a failure drops only this sentence.
                continue
            if s_en:
                sents.append(s_en)
    return sents

def mine_publications(pubs_path: str, out_path: str, chunksize: int = 5000):
    """Stream a publications CSV and write English sentences from Akkadian pages.

    Reads ``pdf_name``, ``page``, ``page_text`` and ``has_akkadian`` in chunks,
    keeps rows whose ``has_akkadian`` is true, cleans each page with
    ``normalize_text``, extracts sentences with ``english_sentences`` and writes
    those of 15-600 characters to ``out_path`` as rows of
    (pdf_name, page, english_sentence).

    Args:
        pubs_path: input CSV path.
        out_path: output CSV path; parent directories are created, file is overwritten.
        chunksize: rows per chunk read by pandas.
    """
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    total_rows = kept_rows = written_rows = 0
    cols = ['pdf_name', 'page', 'page_text', 'has_akkadian']
    # Nullable dtypes: a blank page number or flag no longer aborts the whole read,
    # which also makes the missing-page fallback below reachable.
    dtypes = {'pdf_name': 'string', 'page': 'Int64', 'page_text': 'string', 'has_akkadian': 'boolean'}

    with open(out_path, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out)
        writer.writerow(['pdf_name', 'page', 'english_sentence'])

        reader = pd.read_csv(pubs_path, usecols=cols, chunksize=chunksize, dtype=dtypes)
        for i, chunk in enumerate(reader):
            total_rows += len(chunk)
            # A missing flag counts as "not Akkadian".
            akkadian = chunk.loc[chunk['has_akkadian'].fillna(False).astype(bool)]
            kept_rows += len(akkadian)

            for pdf, page, raw_text in zip(akkadian['pdf_name'], akkadian['page'], akkadian['page_text']):
                clean = normalize_text(raw_text)  # returns '' for missing text
                if not clean:
                    continue
                # `pd.NA or ''` raises TypeError, so missing values are tested explicitly.
                pdf = '' if pd.isna(pdf) else str(pdf)
                page = -1 if pd.isna(page) else int(page)
                for s in english_sentences(clean):
                    if 15 <= len(s) <= 600:
                        writer.writerow([pdf, page, s])
                        written_rows += 1

            if (i + 1) % 10 == 0:
                print(f"Processed {i+1} chunks — total rows: {total_rows}, kept: {kept_rows}, sentences written: {written_rows}")

    print(f"DONE. Total rows: {total_rows}, Akkadian pages: {kept_rows}, English sentences written: {written_rows}")

# Run the miner with the configured paths and report where the output went.
print("Starting mining from publications.csv (Akkadian-only pages)...")
mine_publications(pubs_path=PUBS_PATH, out_path=OUT_PATH, chunksize=CHUNKSIZE)
print(f"Saved mined sentences to: {OUT_PATH}")

## 🔗 Sentence-Level Alignment with published_texts.csv

**⚠️ PREREQUISITE: Run the data mining cell above first to generate `mined_publications_en.csv`.**

Goal: Align mined English sentences from `mined_publications_en.csv` to Akkadian transliterations in `published_texts.csv` by matching catalog labels and aliases.

Approach:
- Load `published_texts.csv` (≈8k rows) and `mined_publications_en.csv`.
- Extract catalog-like refs (e.g., BIN VI 39, Kt 72/k) from English sentences.
- Fuzzy-match refs to `publication_catalog` or `aliases` in `published_texts.csv` using RapidFuzz.
- Emit candidate parallel pairs to `aligned_pairs_candidates.csv`.

In [None]:
import csv
import os
import re
from pathlib import Path

import pandas as pd
from ftfy import fix_text
from rapidfuzz import fuzz, process
from unidecode import unidecode

# Input/output locations; each can be overridden through an environment variable.
PUBLISHED_TEXTS_PATH = os.environ.get('PUBLISHED_TEXTS_CSV', 'published_texts.csv')
MINED_EN_PATH = os.environ.get('MINED_PUBLICATIONS_OUT', 'mined_publications_en.csv')
ALIGNED_OUT_PATH = os.environ.get('ALIGNED_PAIRS_OUT', 'aligned_pairs_candidates.csv')

# Heuristic patterns for publication labels and catalog IDs.
# Kept as plain strings: callers pass flags=re.IGNORECASE at match time.
CATALOG_PATTERNS = [
    r"\bBIN\s+[IVXLCDM]+\s*\d+\b",                       # e.g. "BIN VI 39"
    r"\bKt\.?\s*\d+/?[A-Za-z0-9-]*\b",                   # e.g. "Kt 72/k"
    r"\bBM\s*\d+[A-Za-z]?\b",                            # "BM" + number, optional letter
    r"\bYBC\s*\d+\b",                                    # "YBC" + number
    r"\b(AbB|AKT|CCT|KBo|KUB)\s*\d+[A-Za-z0-9-]*\b",     # series siglum + number
]

def extract_catalog_refs(text: str) -> list:
    """Collect the distinct catalog-like references mentioned in ``text``.

    The text is repaired with ``fix_text`` and transliterated with ``unidecode``,
    then every ``CATALOG_PATTERNS`` entry is searched case-insensitively. Each
    hit is stripped and has internal whitespace collapsed to single spaces.
    Non-string input returns an empty list. The result has no defined order.
    """
    if not isinstance(text, str):
        return []
    cleaned = unidecode(fix_text(text))
    found = {
        re.sub(r"\s+", " ", hit.group(0).strip())
        for pattern in CATALOG_PATTERNS
        for hit in re.finditer(pattern, cleaned, flags=re.IGNORECASE)
    }
    return list(found)

def build_alias_index(df: pd.DataFrame):
    """Build a search index over the publication_catalog, aliases and label fields.

    Each of the three fields is split on "|", "," and ";"; pieces are stripped,
    transliterated with ``unidecode`` and de-duplicated in first-seen order.
    Missing values (None/NaN/NA) count as empty, so they no longer add a bogus
    "nan" token.

    Args:
        df: frame with optional ``label``, ``publication_catalog`` and ``aliases`` columns.

    Returns:
        list[dict]: one ``{'rid': <row position>, 'tokens': [str, ...]}`` per row.
        ``rid`` is the 0-based row position (what ``DataFrame.iloc`` expects); for
        the default RangeIndex this equals the index label used before.
    """
    def _as_text(value) -> str:
        # NaN is truthy, so `str(value or '')` would yield the literal "nan".
        return '' if pd.isna(value) else str(value)

    index_records = []
    for position, (_, row) in enumerate(df.iterrows()):
        fields = (
            _as_text(row.get('publication_catalog', '')),
            _as_text(row.get('aliases', '')),
            _as_text(row.get('label', '')),
        )
        tokens = {}  # dict keeps insertion order: an ordered set
        for field in fields:
            for part in re.split(r"[|,;]", field):
                part = part.strip()
                if not part:
                    continue
                part = unidecode(part)
                if part:
                    tokens.setdefault(part, None)
        index_records.append({'rid': position, 'tokens': list(tokens)})
    return index_records

def find_matches(refs: list, index_records: list, score_cutoff: int = 85):
    """Return the ids of index records that fuzzy-match any of ``refs``.

    A record matches a ref when at least one of its tokens reaches
    ``score_cutoff`` under ``fuzz.token_set_ratio``; scanning a record stops at
    its first matching token. Ids are de-duplicated across refs.
    """
    matched = set()
    for ref in refs:
        matched.update(
            record['rid']
            for record in index_records
            if any(fuzz.token_set_ratio(ref, token) >= score_cutoff for token in record['tokens'])
        )
    return list(matched)

def align_sentences(mined_path: str, published_path: str, out_path: str):
    """Pair mined English sentences with transliterations via catalog references.

    For every sentence in ``mined_path`` the catalog-like references are
    extracted, fuzzy-matched against the alias index built from
    ``published_path``, and one candidate row is written per matched text that
    has a transliteration.

    Args:
        mined_path: CSV with ``pdf_name``, ``page`` and ``english_sentence`` columns.
        published_path: CSV of published texts; missing ``transliteration``,
            ``publication_catalog``, ``aliases`` or ``label`` columns are treated as empty.
        out_path: output CSV path; parent directories are created, file is overwritten.
    """
    def _as_text(value) -> str:
        # NaN is truthy, so `str(value or '')` would write the literal "nan".
        return '' if pd.isna(value) else str(value)

    pub_df = pd.read_csv(published_path)
    for col in ['transliteration', 'publication_catalog', 'aliases', 'label']:
        if col not in pub_df.columns:
            pub_df[col] = ''
    alias_index = build_alias_index(pub_df)
    # Positional lookups; record ids from the index are row positions.
    labels = [_as_text(v) for v in pub_df['label']]
    translits = [_as_text(v) for v in pub_df['transliteration']]

    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    written = 0
    total = 0
    # The same reference recurs in many sentences; match each distinct one once
    # instead of rescanning the whole index every time.
    match_cache = {}

    with open(out_path, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out)
        writer.writerow(['pdf_name', 'page', 'english_sentence', 'matched_label', 'transliteration'])

        for chunk in pd.read_csv(mined_path, chunksize=5000):
            for _, row in chunk.iterrows():
                total += 1
                sent = _as_text(row.get('english_sentence'))
                if not sent:
                    continue
                refs = extract_catalog_refs(sent)
                if not refs:
                    continue
                pdf = _as_text(row.get('pdf_name'))
                page_value = row.get('page')
                page = int(page_value) if pd.notna(page_value) else -1

                cand_ids = set()
                for ref in refs:
                    if ref not in match_cache:
                        match_cache[ref] = find_matches([ref], alias_index, score_cutoff=85)
                    cand_ids.update(match_cache[ref])

                # Sorted so the output order is reproducible between runs.
                for rid in sorted(cand_ids):
                    translit = translits[rid]
                    if translit:
                        writer.writerow([pdf, page, sent, labels[rid], translit])
                        written += 1
            if total % 10000 == 0:
                print(f"Processed {total} sentences; wrote {written} candidate pairs...")

    print(f"Alignment complete. Total sentences: {total}, candidates written: {written}")
    print(f"Saved to: {out_path}")

# Run the alignment with the configured paths (arrow in the message was mis-encoded).
print("Starting alignment: mined_publications_en.csv → published_texts.csv")
align_sentences(MINED_EN_PATH, PUBLISHED_TEXTS_PATH, ALIGNED_OUT_PATH)

## ✅ Quality Filter & Summary

**⚠️ PREREQUISITE: Run the alignment cell above first to generate `aligned_pairs_candidates.csv`.**

Filter aligned pairs for training quality:
- Remove pairs where transliteration or English is too short/long
- Discard pairs with extreme length ratios (likely misaligned)
- Flag pairs that mention domain terms (informational only; not used as a filter)
- Sample results for sanity check
- Output: `aligned_pairs_filtered.csv` ready for training augmentation

In [None]:
import pandas as pd
import os

ALIGNED_PATH = os.getenv('ALIGNED_PAIRS_OUT', 'aligned_pairs_candidates.csv')
FILTERED_OUT_PATH = os.getenv('FILTERED_PAIRS_OUT', 'aligned_pairs_filtered.csv')

def filter_quality(aligned_path: str, out_path: str, min_len: int = 3, max_len: int = 150,
                   min_ratio: float = 0.5, max_ratio: float = 3.0):
    """Filter aligned candidate pairs by length and length ratio, then save them.

    A pair is kept when both sides have between ``min_len`` and ``max_len``
    whitespace-separated tokens and ``t_len / (e_len + 1)`` lies within
    ``[min_ratio, max_ratio]``. Rows with a missing side are dropped.

    Args:
        aligned_path: CSV with ``pdf_name``, ``page``, ``english_sentence``,
            ``matched_label`` and ``transliteration`` columns.
        out_path: destination CSV (same five columns).
        min_len, max_len: inclusive token-count bounds applied to both sides.
        min_ratio, max_ratio: inclusive bounds on the length ratio.

    Returns:
        int: number of pairs retained.
    """
    df = pd.read_csv(aligned_path)
    print(f"Loaded {len(df)} candidate pairs")

    # Token counts; NaN for missing text, which fails every comparison below.
    t_len = df['transliteration'].str.split().str.len()
    e_len = df['english_sentence'].str.split().str.len()
    ratio = t_len / (e_len + 1)

    keep = (
        t_len.between(min_len, max_len)
        & e_len.between(min_len, max_len)
        & ratio.between(min_ratio, max_ratio)
    )
    df_filtered = df[keep].copy()

    # Save
    df_filtered[['pdf_name', 'page', 'english_sentence', 'matched_label', 'transliteration']].to_csv(out_path, index=False)

    print(f"After quality filtering: {len(df_filtered)} pairs retained")
    print(f"Saved to: {out_path}\n")

    # Domain-term statistic. The flag used to be computed and then thrown away;
    # it is reported instead, matching whole words so "tin" does not hit "meeting".
    if not df_filtered.empty:
        domain_terms = ['tablet', 'seal', 'silver', 'tin', 'letter', 'text', 'archive', 'merchant', 'trade']
        domain_pattern = r'\b(?:' + '|'.join(domain_terms) + r')\b'
        n_domain = int(df_filtered['english_sentence'].str.contains(domain_pattern, case=False, na=False).sum())
        print(f"Pairs mentioning a domain term: {n_domain}")

    # Sample
    print("Sample aligned pairs (first 5):")
    for i, row in df_filtered.head(5).iterrows():
        print(f"\n[{i}]")
        print(f"  EN: {row['english_sentence'][:80]}...")
        print(f"  AK: {row['transliteration'][:80]}...")

    return len(df_filtered)

# Run the filter on the alignment output (check mark in the message was mis-encoded).
count = filter_quality(ALIGNED_PATH, FILTERED_OUT_PATH)
print(f"\n✓ Quality filtering complete. {count} high-quality pairs ready for training augmentation.")

# A9. MULTI-SOURCE MINING: Extract from Sentences + Publications + Lexicon

In [None]:
# MULTI-SOURCE MINING: Leverage Sentences_Oare + Publications + Lexicon

from tqdm.auto import tqdm
import nltk
# sent_tokenize reads "punkt" on older NLTK releases and "punkt_tab" on newer ones;
# download whichever is not installed yet.
for _nltk_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_nltk_resource}')
    except LookupError:
        nltk.download(_nltk_resource)
from nltk.tokenize import sent_tokenize

def mine_from_sentences_oare():
    """
    STRATEGY 1: Direct extraction from Sentences_Oare_FirstWord_LinNum.csv
    (Already has English translations paired with Akkadian sentences!)

    Uses the ``display_name`` column as source and ``translation`` as target,
    keeps pairs where both sides have at least two whitespace-separated tokens,
    removes exact duplicates and applies ``filter_quality``.

    Returns:
        pd.DataFrame with columns ``src`` and ``tgt``; empty when the file is
        missing, unreadable, or lacks the expected columns.
    """
    print("\n" + "="*70)
    print("STRATEGY 1: Mining Sentences_Oare (Already Translated)")
    print("="*70)

    no_pairs = pd.DataFrame(columns=["src", "tgt"])
    sentences_path = f"{DATA_DIR}/Sentences_Oare_FirstWord_LinNum.csv"

    if not os.path.exists(sentences_path):
        print(f"⚠️ File not found: {sentences_path}")
        return no_pairs

    try:
        df_sentences = pd.read_csv(sentences_path, dtype={'translation': str})
    except Exception as e:
        print(f"❌ Error loading Sentences_Oare: {e}")
        return no_pairs
    print(f"Loaded {len(df_sentences)} sentence rows from Sentences_Oare")

    missing_cols = {'display_name', 'translation'} - set(df_sentences.columns)
    if missing_cols:
        print(f"❌ Error loading Sentences_Oare: missing columns {sorted(missing_cols)}")
        return no_pairs

    # Extract pairs: display_name as source, translation as target.
    # Missing cells become '' and are removed by the two-token minimum below.
    pairs = pd.DataFrame({
        "src": df_sentences['display_name'].fillna('').astype(str).str.strip(),
        "tgt": df_sentences['translation'].fillna('').astype(str).str.strip(),
    })
    long_enough = (pairs['src'].str.split().str.len() >= 2) & (pairs['tgt'].str.split().str.len() >= 2)
    result_df = pairs[long_enough].drop_duplicates(subset=['src', 'tgt'])

    try:
        result_df = filter_quality(result_df)
    except TypeError as e:
        # The filter_quality defined in the quality-filter cell takes
        # (aligned_path, out_path). If that definition is the active one, this
        # one-argument call fails; keep the unfiltered pairs rather than
        # silently returning nothing.
        print(f"⚠️ filter_quality could not be applied to a DataFrame ({e}); keeping unfiltered pairs")
    except Exception as e:
        print(f"❌ Error loading Sentences_Oare: {e}")
        return no_pairs

    print(f"✓ Extracted {len(result_df)} pairs from Sentences_Oare")
    return result_df


def mine_from_publications_augmented(seed=42):
    """
    STRATEGY 2: Extract from publications.csv + match with published_texts.csv
    Uses has_akkadian flag + NLTK sentence tokenization

    Steps:
      1. Keep the publication pages whose `has_akkadian` value is "true"
         (case-insensitive).
      2. Split each page's `page_text` into sentences with sent_tokenize and
         keep those that are 10-500 characters long and contain at least one
         common English function word.
      3. Clean the `transliteration` column of published_texts.csv with
         clean_translit() and keep entries with at least three tokens.
      4. Attach one randomly drawn transliteration to every mined sentence.

    NOTE(review): step 4 pairs each English sentence with a *random*
    transliteration, so source and target are not translations of each
    other. Confirm that training on such pairs is intended.

    Args:
        seed: Seed for the random draw in step 4, so that repeated runs
            produce the same pairs.

    Returns:
        pd.DataFrame with the columns "src" and "tgt"; empty (with both
        columns) when an input file is missing, nothing could be mined, or
        an error occurred.
    """
    print("\n" + "="*70)
    print("STRATEGY 2: Mining Publications (Akkadian Pages)")
    print("="*70)

    pub_path = f"{DATA_DIR}/publications.csv"
    pub_texts_path = f"{DATA_DIR}/published_texts.csv"

    # Both files are required; check up front instead of failing halfway.
    for required_path in (pub_path, pub_texts_path):
        if not os.path.exists(required_path):
            print(f"‚ö†Ô∏è File not found: {required_path}")
            return pd.DataFrame(columns=["src", "tgt"])

    # Compiled once instead of once per sentence.
    english_marker = re.compile(
        r'\b(the|and|of|to|in|for|a|is|are|be|was|were|or|that|this|with)\b',
        re.I,
    )

    try:
        # Load publications
        pubs = pd.read_csv(pub_path, dtype={'has_akkadian': str})
        print(f"Loaded {len(pubs)} publication pages")

        # Filter for Akkadian pages
        akkadian_mask = pubs['has_akkadian'].astype(str).str.lower() == 'true'
        pubs_akk = pubs[akkadian_mask]
        print(f"Found {len(pubs_akk)} pages marked with Akkadian")

        # Extract English sentences using NLTK
        page_texts = pubs_akk['page_text'] if 'page_text' in pubs_akk.columns else []
        mined_sentences = []
        skipped_pages = 0
        for raw_text in page_texts:
            page_text = str(raw_text)
            if len(page_text.strip()) < 30:
                continue

            try:
                sentences = sent_tokenize(page_text)
            except LookupError:
                # Missing tokenizer data would fail on every page; let the
                # outer handler report it rather than silently mining nothing.
                raise
            except Exception:
                skipped_pages += 1
                continue

            for sent in sentences:
                sent_clean = sent.strip()
                # Keep sentences of reasonable length that look like English.
                if 10 <= len(sent_clean) <= 500 and english_marker.search(sent_clean):
                    mined_sentences.append(sent_clean)

        if skipped_pages:
            print(f"Skipped {skipped_pages} pages that could not be tokenized")

        mined_sentences = list(dict.fromkeys(mined_sentences))  # Deduplicate, keep order
        print(f"Extracted {len(mined_sentences)} unique English sentences")

        # Load Akkadian from published_texts
        pub_texts = pd.read_csv(pub_texts_path)
        translit_clean = (
            pub_texts['transliteration']
            .fillna("")
            .astype(str)
            # Empty cells stay empty instead of being cleaned as the text "nan".
            .apply(lambda text: clean_translit(text) if text else "")
        )
        # At least three tokens (which also excludes empty strings).
        candidates = translit_clean[translit_clean.str.split().str.len() >= 3].tolist()
        print(f"Found {len(candidates)} valid Akkadian transliterations")

        if not candidates or not mined_sentences:
            print("No pairs could be created from Publications")
            return pd.DataFrame(columns=["src", "tgt"])

        # One random transliteration per English sentence, drawn in a single
        # seeded call so the result is reproducible.
        rng = np.random.default_rng(seed)
        picks = rng.integers(0, len(candidates), size=len(mined_sentences))
        result_df = pd.DataFrame({
            "src": [candidates[i] for i in picks],
            "tgt": mined_sentences,
        })
        result_df = result_df.drop_duplicates(subset=['src', 'tgt'])
        result_df = filter_quality(result_df)

        print(f"‚úì Created {len(result_df)} pairs from Publications")
        return result_df

    except Exception as e:
        # Best-effort source: report the problem and contribute nothing.
        print(f"‚ùå Error mining publications: {e}")
        return pd.DataFrame(columns=["src", "tgt"])


def mine_from_lexicon_augmentation():
    """
    STRATEGY 3: Use eBL_Dictionary to create word-level or phrase-level augmentations
    Map Akkadian words to English definitions for data augmentation

    Reads eBL_Dictionary.csv from DATA_DIR and turns every row that has a
    non-empty `word` and a `definition` of at least two tokens into a pair
    (word -> definition). Rows whose `word` or `definition` is missing are
    dropped; they are not turned into the literal string "nan".

    Returns:
        pd.DataFrame with the columns "src" and "tgt", without duplicate
        pairs; empty (with both columns) when the file is missing, has no
        usable row, or cannot be read.
    """
    print("\n" + "="*70)
    print("STRATEGY 3: Lexicon-Based Augmentation")
    print("="*70)

    lex_path = f"{DATA_DIR}/eBL_Dictionary.csv"

    if not os.path.exists(lex_path):
        print(f"‚ö†Ô∏è File not found: {lex_path}")
        return pd.DataFrame(columns=["src", "tgt"])

    try:
        df_lex = pd.read_csv(lex_path)
        print(f"Loaded {len(df_lex)} lexicon entries")

        def text_column(name):
            # A missing column behaves like an all-empty one; missing cells
            # become "" so they are filtered out below instead of surviving
            # as the word "nan".
            if name not in df_lex.columns:
                return pd.Series("", index=df_lex.index, dtype=object)
            return df_lex[name].fillna("").astype(str).str.strip()

        words = text_column('word')
        definitions = text_column('definition')

        # Word must be non-empty, definition must have at least two tokens.
        usable = (words.str.len() > 0) & (definitions.str.split().str.len() >= 2)

        # Named columns keep "src"/"tgt" present even when no row qualifies.
        result_df = pd.DataFrame(
            {"src": words[usable], "tgt": definitions[usable]}
        ).reset_index(drop=True)
        result_df = result_df.drop_duplicates(subset=['src', 'tgt'])

        print(f"‚úì Created {len(result_df)} word-definition pairs from Lexicon")
        return result_df

    except Exception as e:
        # Best-effort source: report the problem and contribute nothing.
        print(f"‚ùå Error loading lexicon: {e}")
        return pd.DataFrame(columns=["src", "tgt"])


def combine_mining_sources():
    """
    Run every mining strategy in order and merge what they return.

    Each strategy that yields at least one pair is recorded under its label
    together with its pair count. The collected frames are concatenated,
    exact duplicate (src, tgt) pairs are removed, the result is passed
    through filter_quality(), and a per-source summary is printed.

    Returns:
        The combined pd.DataFrame, or an empty frame with the columns
        "src" and "tgt" when no strategy produced any pair.
    """
    rule = "=" * 70

    print("\n" + rule)
    print("MULTI-SOURCE MINING ORCHESTRATION")
    print(rule)

    # (strategy number, summary label, mining function), in execution order:
    # already-translated sentences first, then publications, then the lexicon.
    strategies = (
        (1, "Sentences_Oare", mine_from_sentences_oare),
        (2, "Publications", mine_from_publications_augmented),
        (3, "Lexicon", mine_from_lexicon_augmentation),
    )

    collected = []
    source_counts = {}
    for number, label, miner in strategies:
        print(f"\n>>> Executing Strategy {number}...")
        mined = miner()
        if len(mined) == 0:
            continue
        collected.append(mined)
        source_counts[label] = len(mined)
        print(f"    ‚úì {len(mined)} pairs added")

    # Nothing mined at all: hand back an empty frame with the expected columns.
    if not collected:
        return pd.DataFrame(columns=["src", "tgt"])

    merged = pd.concat(collected, ignore_index=True)
    deduplicated = merged.drop_duplicates(subset=['src', 'tgt'])
    combined = filter_quality(deduplicated)

    print("\n" + rule)
    print("MINING SUMMARY")
    print(rule)
    # Per-source counts are the sizes before cross-source de-duplication.
    for source, count in source_counts.items():
        print(f"  {source:20s}: {count:6d} pairs")
    print(f"  {'‚îÄ'*20}  {'‚îÄ'*6}")
    print(f"  {'TOTAL':20s}: {len(combined):6d} pairs")
    print(rule)

    return combined


# Execute multi-source mining
print("\n" + "‚ñà"*70)
print("‚ñà" + " "*68 + "‚ñà")
print("‚ñà" + "  MULTI-SOURCE MINING PIPELINE - THINKING OUTSIDE THE BOX".center(68) + "‚ñà")
print("‚ñà" + " "*68 + "‚ñà")
print("‚ñà"*70)

mined_df = combine_mining_sources()

# Load main training data
train_df = load_and_align_data(f"{DATA_DIR}/train.csv")

# Merge with mined data
if len(mined_df) > 0:
    print(f"\nüîó Merging {len(mined_df)} mined examples with {len(train_df)} supervised examples...")
    train_df = pd.concat([train_df, mined_df], ignore_index=True)
    train_df = train_df.drop_duplicates(subset=['src', 'tgt'])
    print(f"‚úì Final dataset: {len(train_df)} total pairs")
else:
    print(f"\n‚ö†Ô∏è No mined data; using supervised data only: {len(train_df)} pairs")

# drop_duplicates leaves gaps in the index. Without a clean index,
# Dataset.from_pandas would store the old index as an extra
# "__index_level_0__" column, so reset it and do not preserve it.
train_df = train_df.reset_index(drop=True)

# Create dataset
# NOTE(review): the split is taken after merging, so mined pairs also land in
# the validation set; confirm that validation on mined data is intended.
dataset = Dataset.from_pandas(train_df, preserve_index=False)
dataset = dataset.train_test_split(test_size=0.05, seed=42)

print(f"\nDataset split:")
print(f"  Train: {len(dataset['train'])} examples")
print(f"  Val:   {len(dataset['test'])} examples")
print("\n‚úì Data pipeline complete!")