In [None]:
# DEEP PAST CHALLENGE - ENSEMBLE SUBMISSION

# Install sacremoses for MarianMT (required in offline mode)
# Best-effort bootstrap: when the package is missing, try a wheel from an
# attached dataset folder; every failure path only prints a warning so the
# rest of the notebook can still run.
try:
    import sacremoses
    print("✓ sacremoses already installed")
except ImportError:
    import glob
    import importlib
    import os
    import subprocess
    import sys

    wheel_dir = "/kaggle/input/sacremoses-wheel"
    wheel_glob = "sacremoses-*.whl"
    if os.path.exists(wheel_dir):
        print("Installing sacremoses from local wheel...")
        # glob returns matches in arbitrary order; sort so the chosen wheel
        # is the same on every run.
        wheels = sorted(glob.glob(f"{wheel_dir}/{wheel_glob}"))
        if wheels:
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", wheels[0], "--no-deps", "-q"])
                # The package appeared on disk while this interpreter was
                # running; refresh the import finders before importing it.
                importlib.invalidate_caches()
                import sacremoses
                print("✓ sacremoses installed from wheel")
            except (subprocess.CalledProcessError, ImportError) as install_error:
                # Keep the step best-effort, like the other warning branches.
                print(f"⚠️ sacremoses installation failed: {install_error}")
        else:
            print("⚠️ No sacremoses wheel found in dataset folder")
    else:
        print("⚠️ sacremoses not found - MarianMT may fail")
        print("To fix: Upload sacremoses wheel as dataset and run:")
        print("!pip install /kaggle/input/sacremoses-wheel/sacremoses-*.whl --no-deps")

import os
import re
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
import gc

# -----------------------------------------------------------------------------
# CONFIG
# -----------------------------------------------------------------------------
# Checkpoint directories; each one can be overridden with an environment variable.
MODEL1_PATH = os.environ.get("MODEL1_PATH", "/kaggle/input/notebook-a-byt5/byt5-base-saved")
MODEL2_PATH = os.environ.get("MODEL2_PATH", "/kaggle/input/notebook-b-t5/t5-base-fine-tuned")
MODEL3_PATH = os.environ.get("MODEL3_PATH", "/kaggle/input/notebook-c-marian-mt/marian-mt-saved")

TEST_DATA_PATH = "/kaggle/input/deep-past-initiative-machine-translation/test.csv"
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
BATCH_SIZE = 8
MAX_LENGTH = 512

# -----------------------------------------------------------------------------
# ENSEMBLE STRATEGY
# -----------------------------------------------------------------------------
# Read from the environment (default "auto"). In the ensemble step of this
# notebook, "averaging" uses the highest-weight model's output and every other
# value takes the voting branch.
ENSEMBLE_MODE = os.environ.get("ENSEMBLE_MODE", "auto")  # "auto", "voting", or "averaging"

# -----------------------------------------------------------------------------
# MODEL CONFIGURATIONS & WEIGHTS
# -----------------------------------------------------------------------------
# Adjust these weights based on validation scores from find-optimal-weights.ipynb
# Or use equal weights as baseline: {"weight": 0.333}


def _model_entry(path, prefix, weight, max_length=512, num_beams=4):
    """Assemble one MODEL_CONFIGS entry with the keys the rest of the notebook reads."""
    return {
        "path": path,
        "prefix": prefix,
        "max_length": max_length,
        "weight": weight,
        "num_beams": num_beams,
    }


MODEL_CONFIGS = {
    # Weights are starting points; adjust based on validation.
    "byt5": _model_entry(MODEL1_PATH, "translate Akkadian to English: ", 0.35),
    "t5": _model_entry(MODEL2_PATH, "translate Akkadian to English: ", 0.40),
    "marian": _model_entry(MODEL3_PATH, ">>eng<< ", 0.25),
}

# Summary: one line per model with a marker showing whether its path exists.
print(f"Device: {DEVICE}")
print(f"Ensemble Mode: {ENSEMBLE_MODE}")
print(f"\nModel Weights:")
for model_key, model_cfg in MODEL_CONFIGS.items():
    marker = "✓" if os.path.exists(model_cfg["path"]) else "✗"
    print(f"  {model_key:10s} {marker} weight={model_cfg['weight']:.2f}")

# -----------------------------------------------------------------------------
# GAP REPLACEMENT FUNCTIONS
# -----------------------------------------------------------------------------
def replace_gaps(text):
    """Rewrite gap notations in a transliteration as ``<gap>`` / ``<big_gap>`` tokens.

    Values that ``pd.isna`` reports as missing are returned unchanged.
    Substitutions run one after another in the listed order, so the longer
    multi-part patterns are tried before the shorter ones they contain.
    """
    if pd.isna(text):
        return text

    substitutions = (
        # Runs of whitespace-separated markers collapse to a single big gap.
        (r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+\s+\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'\.{3}(?:\s+\.{3})+', '<big_gap>'),
        # Single markers: "xx" and a free-standing "x" are small gaps.
        (r'xx', '<gap>'),
        (r' x ', ' <gap> '),
        # Ellipses (Unicode or ASCII, doubled before single) are big gaps.
        (r'……', '<big_gap>'),
        (r'\.\.\.\.\.\.', '<big_gap>'),
        (r'…', '<big_gap>'),
        (r'\.\.\.', '<big_gap>'),
    )
    for pattern, token in substitutions:
        text = re.sub(pattern, token, text)

    return text

# -----------------------------------------------------------------------------
# LOAD TEST DATA
# -----------------------------------------------------------------------------
print("\n" + "="*60)
print("LOADING TEST DATA")
print("="*60)

test_df = pd.read_csv(TEST_DATA_PATH)
test_df['transliteration'] = test_df['transliteration'].apply(replace_gaps)
# replace_gaps passes missing values through unchanged; turn them into empty
# strings so astype(str) cannot feed the literal text "nan" to the models.
test_inputs = test_df['transliteration'].fillna("").astype(str).tolist()
source_lengths = [len(t.split()) for t in test_inputs]
# Guard the average so an empty test file reports 0.0 instead of raising
# ZeroDivisionError.
average_source_length = sum(source_lengths) / len(source_lengths) if source_lengths else 0.0

print(f"✓ Loaded {len(test_df)} test samples")
print(f"✓ Average source length: {average_source_length:.1f} words")

# -----------------------------------------------------------------------------
# INFERENCE DATASET
# -----------------------------------------------------------------------------
class InferenceDataset(Dataset):
    """Map-style dataset that tokenizes one prefixed text per item.

    Each text is converted with ``str`` and prepended with ``prefix`` at
    construction time; tokenization happens lazily in ``__getitem__``.
    """

    def __init__(self, texts, tokenizer, max_length, prefix=""):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = list(map(lambda raw: prefix + str(raw), texts))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Every item is padded/truncated to exactly max_length tokens.
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading dimension of size 1 for the
        # single text; squeeze it away.
        return {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }

=== Deep Past Neural Ensemble Inference ===


2025-12-25 12:14:51.095337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766664891.298313      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766664891.357816      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766664891.851037      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766664891.851082      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766664891.851085      55 computation_placer.cc:177] computation placer alr

Inference byt5:   0%|          | 0/1 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Inference t5:   0%|          | 0/1 [00:00<?, ?it/s]



Inference marian:   0%|          | 0/1 [00:00<?, ?it/s]


Preview:
   id                                        translation
0   0  Kà-ar-ma ú big_gap da-tim aí-ip-ri-ni Akkadian...
1   1  -ni i-na né-mì-lim da-aùr ú-lá e-WA ia-ra-tí-a...
2   2  -it a-aí-im au-um-au ia-tí aé-bi„-lá-nim Trans...
3   3  É-bi„-lá KÙ. AN Translate Akkadian to English:...


In [None]:
# Override per-model generation configs to match training lengths and improve quality
for model_key, length_cap in (("byt5", 256), ("t5", 128), ("marian", 160)):
    MODEL_CONFIGS[model_key]["max_length"] = length_cap
# Every configured model uses the same beam width.
for model_cfg in MODEL_CONFIGS.values():
    model_cfg["num_beams"] = 6
print("Adjusted MODEL_CONFIGS: max_length per model and num_beams=6")

# EVALUATION METRIC: Geometric Mean of BLEU and chrF++

## Deep Past Challenge Scoring

The competition uses the **Geometric Mean** of BLEU and chrF++ scores:

### Formula
```
Score = √(BLEU × chrF++)
```

### Why Geometric Mean?
- **BLEU**: Measures n-gram precision (word choice accuracy)
- **chrF++**: Character n-gram F-score extended with word unigrams and bigrams (rewards partial word matches, so it is sensitive to morphology)
- **Geometric Mean**: Balances both without favoring one

### Score Range
- 0-100 (higher is better)
- Typical strong submission: 25-35
- Excellent submission: 35+

### What This Means
- Both BLEU and chrF++ matter equally
- Can't just optimize for one metric
- Morphologically complex Akkadian benefits from chrF++ focus
- Use this metric for validation in training notebooks

In [None]:
# ENSEMBLE GENERATION & SUBMISSION

print("\n" + "="*60)
print("LOADING MODELS FOR ENSEMBLE")
print("="*60)


def _load_model_pair(model_path):
    """Load the tokenizer and the eval-mode seq2seq model stored at ``model_path``.

    Returns a ``(tokenizer, model)`` tuple with the model moved to ``DEVICE``.
    Any loading error propagates to the caller.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(DEVICE)
    model.eval()
    return tokenizer, model


models = {}
tokenizers = {}

for name, config in MODEL_CONFIGS.items():
    model_path = config["path"]
    if not os.path.exists(model_path):
        print(f"⚠️  WARNING: {name} model not found at {model_path}")
        print(f"   This model will be skipped")
        continue

    try:
        print(f"Loading {name} from {model_path}...")
        # Register both objects in one step so a failed model load cannot
        # leave an orphaned tokenizer behind.
        tokenizers[name], models[name] = _load_model_pair(model_path)
    except Exception as e:
        # Deliberately broad: a broken checkpoint must not abort the other models.
        print(f"✗ Failed to load {name}: {e}")
        continue
    print(f"✓ {name} loaded successfully")

if not models:
    print("❌ ERROR: No models loaded! Cannot proceed with ensemble.")
    print("Please ensure model paths are correct in MODEL_CONFIGS.")
    import sys
    sys.exit(1)

print(f"\n✓ Successfully loaded {len(models)} models for ensemble")

# Normalize weights to sum to 1.0 for loaded models
total_weight = sum(MODEL_CONFIGS[name]["weight"] for name in models)
if total_weight <= 0:
    # Avoid dividing by zero when every loaded model was given weight 0.
    print("⚠️  WARNING: loaded model weights sum to zero; using equal weights")
    for name in models:
        MODEL_CONFIGS[name]["weight"] = 1.0 / len(models)
else:
    for name in models:
        MODEL_CONFIGS[name]["weight"] /= total_weight
for name in models:
    print(f"  {name}: weight = {MODEL_CONFIGS[name]['weight']:.3f}")

# ============================================================================
# ENSEMBLE GENERATION
# ============================================================================

def generate_translations(texts, model_name, config):
    """Translate ``texts`` with the model registered under ``model_name``.

    Args:
        texts: Sequence of source strings (each item is passed through ``str``).
        model_name: Key into the module-level ``tokenizers`` and ``models`` dicts.
        config: Mapping providing ``prefix``, ``max_length`` and ``num_beams``.

    Returns:
        A list with one decoded string per input text, in input order.
    """
    tokenizer = tokenizers[model_name]
    model = models[model_name]

    decoded = []
    batch_starts = range(0, len(texts), BATCH_SIZE)

    for start in tqdm(batch_starts, desc=f"Generating with {model_name}"):
        chunk = texts[start:start + BATCH_SIZE]
        prompts = [config["prefix"] + str(item) for item in chunk]

        # padding=True pads only up to the longest prompt in this batch.
        encoded = tokenizer(
            prompts,
            max_length=config["max_length"],
            truncation=True,
            padding=True,
            return_tensors="pt",
        ).to(DEVICE)

        # The same max_length bounds both the tokenized input and the output.
        generation_kwargs = dict(
            max_length=config["max_length"],
            num_beams=config["num_beams"],
            early_stopping=True,
            no_repeat_ngram_size=3,
            length_penalty=1.0,
            temperature=1.0,
        )
        with torch.no_grad():
            generated = model.generate(**encoded, **generation_kwargs)

        decoded += tokenizer.batch_decode(generated, skip_special_tokens=True)

    return decoded

print("\n" + "="*60)
print("GENERATING TRANSLATIONS WITH EACH MODEL")
print("="*60)

# Maps each loaded model's name to its list of raw predictions for test_inputs.
all_predictions = {}

for model_name in models:
    print(f"\n{model_name.upper()}:")
    all_predictions[model_name] = generate_translations(
        test_inputs, model_name, MODEL_CONFIGS[model_name]
    )
    print(f"  ✓ Generated {len(all_predictions[model_name])} predictions")

# ============================================================================
# ENSEMBLE VOTING / BLENDING
# ============================================================================

def normalize_text(text):
    """Return ``text`` lower-cased with every whitespace run collapsed to one space."""
    tokens = text.lower().split()
    return " ".join(tokens)

def postprocess_prediction(text):
    """Tidy a raw prediction into a sentence-like string.

    The value is converted with ``str``, trimmed, and its whitespace runs are
    collapsed. A lower-case first character is upper-cased, and a period is
    appended unless the text already ends in ``.``, ``!`` or ``?``. An empty
    result is returned as-is.
    """
    cleaned = re.sub(r'\s+', ' ', str(text).strip())
    if not cleaned:
        return cleaned

    head = cleaned[0]
    if head.islower():
        cleaned = head.upper() + cleaned[1:]

    if cleaned[-1] not in '.!?':
        cleaned = cleaned + '.'
    return cleaned

print("\n" + "="*60)
print("ENSEMBLE: VOTING STRATEGY")
print("="*60)


def _token_overlap(candidate, other):
    """Return the Dice overlap (0.0-1.0) of the lower-cased word sets of two strings.

    An empty string overlaps with nothing (including itself), so empty
    predictions never gain support.
    """
    candidate_tokens = set(candidate.lower().split())
    other_tokens = set(other.lower().split())
    if not candidate_tokens or not other_tokens:
        return 0.0
    shared = len(candidate_tokens & other_tokens)
    return 2.0 * shared / (len(candidate_tokens) + len(other_tokens))


def _select_consensus(model_preds, weights):
    """Return the model output that the weighted pool of outputs agrees with most.

    Each candidate is scored by the weight-scaled overlap with every model's
    prediction (its own included, which contributes its own weight). Ties are
    broken in favour of the higher-weight model. The result is always one of
    the models' complete predictions, so word order is preserved.
    """
    def support(name):
        agreement = sum(
            weights[other] * _token_overlap(model_preds[name], pred)
            for other, pred in model_preds.items()
        )
        return (agreement, weights[name])

    return model_preds[max(model_preds, key=support)]


ensemble_weights = {name: MODEL_CONFIGS[name]["weight"] for name in all_predictions}

# Text cannot be averaged, so "averaging" mode uses the highest-weight model's
# prediction; the winner does not depend on the sample, so pick it once.
use_best_single = ENSEMBLE_MODE == "averaging"
best_model = max(ensemble_weights, key=ensemble_weights.get) if use_best_single else None

final_predictions = []

for idx, input_text in enumerate(test_inputs):
    # Get the cleaned-up prediction of every model for this sample
    model_preds = {
        model_name: postprocess_prediction(preds[idx])
        for model_name, preds in all_predictions.items()
    }

    if use_best_single:
        final_pred = model_preds[best_model]
    else:
        # Voting: previously the top-weighted words were joined as an
        # unordered bag, which is not a sentence. Select the full prediction
        # with the strongest weighted support instead.
        final_pred = _select_consensus(model_preds, ensemble_weights)

    final_predictions.append(final_pred)

    if idx < 3:
        print(f"\nExample {idx}:")
        print(f"  Input: {input_text[:50]}...")
        for model_name, pred in model_preds.items():
            print(f"  {model_name:8s}: {pred[:60]}...")
        print(f"  Ensemble: {final_pred[:60]}...")

# ============================================================================
# CREATE SUBMISSION
# ============================================================================

print("\n" + "="*60)
print("CREATING SUBMISSION FILE")
print("="*60)

# NOTE(review): the submission keeps every column of test_df (including the
# gap-normalized transliteration) next to the new "translation" column —
# confirm the competition accepts extra columns.
submission_df = test_df.copy()
submission_df["translation"] = final_predictions

submission_df.to_csv("submission.csv", index=False)
print(f"✓ Submission saved to submission.csv")
print(f"  Total predictions: {len(submission_df)}")
print(f"  Columns: {submission_df.columns.tolist()}")

# Show sample
print("\nSample predictions:")
print(submission_df.head())

# Clean up memory
print("\nCleaning up GPU memory...")
# Deleting a loop variable leaves the dict entries (and so the models) alive;
# empty the containers to actually drop the references.
models.clear()
tokenizers.clear()
# Collect first so the freed tensors can be released from the CUDA cache.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("✓ Complete!")

# SUBMISSION NOTES: Evaluation Metrics & Data Handling

## Evaluation Metrics (Deep Past Challenge)

Your submission is scored using the **Geometric Mean** of two metrics:

### BLEU Score
- Measures n-gram precision (matches between prediction and reference)
- Range: 0-100
- Formula: Precision of unigrams, bigrams, trigrams, 4-grams with brevity penalty
- Focuses on: Exact word choices and common phrases

### chrF++ Score  
- Measures character-level F-score
- Range: 0-100
- Considers: Character n-grams plus word unigrams and bigrams
- Focuses on: Morphological correctness (important for Akkadian!)

### Geometric Mean = √(BLEU × chrF++)
- Balances both metrics equally
- Typical score: 20-40
- Excellent score: 35+

## Data Preprocessing Checklist

All training notebooks handle these formatting issues:

✓ **Subscripts/Superscripts**: a₂ → a2, il₅ → il5
✓ **Gaps & Lacunae**: [x] → <gap>, … → <big_gap>
✓ **Scribal Notations**: Remove !, ?, /, :, etc.
✓ **Bracket Content**: Keep content, remove brackets
✓ **Determinatives**: {d} → d (extract content)
✓ **Special Characters**: Preserve š, Š, ṣ, Ṣ, ṭ, Ṭ, ḫ, Ḫ, ʾ
✓ **Capitalization**: Preserve (proper nouns & logograms)
✓ **Quality Filtering**: Length validation, no duplicates

## How Ensemble Works

This notebook combines 3 models:
1. **ByT5** - Character-level (good for morphology)
2. **T5** - Token-level (good for semantics)
3. **MarianMT** - Translation-specific (good for fluency)

Each model votes with weighted predictions → final ensemble translation