In [None]:
# DEEP PAST CHALLENGE - ENSEMBLE SUBMISSION

# Install sacremoses for MarianMT (required in offline mode).
# This step is best-effort: every failure path prints a warning and lets the
# notebook continue, so models that do not need sacremoses can still run.
try:
    import sacremoses
    print("✓ sacremoses already installed")
except ImportError:
    import glob
    import importlib
    import os
    import subprocess
    import sys

    wheel_dir = "/kaggle/input/sacremoses-wheel"
    wheel_glob = "sacremoses-*.whl"
    if os.path.exists(wheel_dir):
        print("Installing sacremoses from local wheel...")
        # glob returns matches in arbitrary order; sort so the same wheel is
        # picked on every run when the dataset contains more than one.
        wheels = sorted(glob.glob(f"{wheel_dir}/{wheel_glob}"))
        if wheels:
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", wheels[0], "--no-deps", "-q"])
                # The package appeared on disk after this interpreter started;
                # refresh the import system's caches so the import below can
                # see it.
                importlib.invalidate_caches()
                import sacremoses
                print("✓ sacremoses installed from wheel")
            except (subprocess.CalledProcessError, ImportError) as install_error:
                # Warn instead of aborting the whole notebook, matching the
                # other failure branches below.
                print(f"⚠️ sacremoses install from wheel failed: {install_error}")
        else:
            print("⚠️ No sacremoses wheel found in dataset folder")
    else:
        print("⚠️ sacremoses not found - MarianMT may fail")
        print("To fix: Upload sacremoses wheel as dataset and run:")
        print("!pip install /kaggle/input/sacremoses-wheel/sacremoses-*.whl --no-deps")

import os
import re
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
import gc

# -----------------------------------------------------------------------------
# CONFIG
# -----------------------------------------------------------------------------
# Model directories. Each default can be overridden through the environment
# variable of the same name.
MODEL1_PATH = os.environ.get("MODEL1_PATH", "/kaggle/input/notebook-a-byt5/byt5-base-saved")
MODEL2_PATH = os.environ.get("MODEL2_PATH", "/kaggle/input/notebook-b-t5/t5-base-fine-tuned")
MODEL3_PATH = os.environ.get("MODEL3_PATH", "/kaggle/input/notebook-c-marian-mt/marian-mt-saved")

TEST_DATA_PATH = "/kaggle/input/deep-past-initiative-machine-translation/test.csv"

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

BATCH_SIZE = 8
MAX_LENGTH = 512

# -----------------------------------------------------------------------------
# ENSEMBLE STRATEGY: AUTO-DETECT
# -----------------------------------------------------------------------------
# Intended strategies (the selection logic itself lives outside this cell):
#   1. Weight averaging if all models have the same architecture
#   2. Voting ensemble if the models have different architectures
#   3. Best single model as fallback
#
# Accepted values: "auto", "voting", or "averaging". Overridable through the
# ENSEMBLE_MODE environment variable.
ENSEMBLE_MODE = os.environ.get("ENSEMBLE_MODE", "auto")

# -----------------------------------------------------------------------------
# MODEL CONFIGURATIONS & WEIGHTS
# -----------------------------------------------------------------------------
# Tune the weights from validation scores (see find-optimal-weights.ipynb),
# or start from an equal-weight baseline such as {"weight": 0.333}.
# Every entry carries the same keys, in the same order:
# path, prefix, max_length, weight, num_beams.
MODEL_CONFIGS = dict(
    byt5=dict(
        path=MODEL1_PATH,
        prefix="translate Akkadian to English: ",
        max_length=512,
        weight=0.35,  # adjust based on validation
        num_beams=4,
    ),
    t5=dict(
        path=MODEL2_PATH,
        prefix="translate Akkadian to English: ",
        max_length=512,
        weight=0.40,  # author's note: usually the best performer
        num_beams=4,
    ),
    marian=dict(
        path=MODEL3_PATH,
        prefix=">>eng<< ",
        max_length=512,
        weight=0.25,  # adjust based on validation
        num_beams=4,
    ),
)

# Startup summary: device, ensemble mode, and one line per model showing
# whether its directory exists on disk and which weight it carries.
print("Device: {}".format(DEVICE))
print("Ensemble Mode: {}".format(ENSEMBLE_MODE))
print("\nModel Weights:")
for name, config in MODEL_CONFIGS.items():
    # Indexing with the bool picks "✗" for a missing path and "✓" otherwise.
    exists = ("✗", "✓")[os.path.exists(config["path"])]
    print("  {:10s} {} weight={:.2f}".format(name, exists, config["weight"]))

# -----------------------------------------------------------------------------
# GAP REPLACEMENT FUNCTIONS
# -----------------------------------------------------------------------------
def replace_gaps(text):
    """Normalize gap notations in a transliteration to ``<gap>`` / ``<big_gap>``.

    Args:
        text: The string to normalize. A missing value (anything for which
            ``pd.isna`` is true, such as ``None`` or NaN) is returned unchanged.

    Returns:
        The text after every substitution rule has been applied in sequence,
        or the original missing value.
    """
    if pd.isna(text):
        return text

    # Rules run strictly top to bottom, and each one sees the output of the
    # previous rule. Multi-part gap patterns come first so the shorter
    # patterns further down cannot split them into several tokens.
    # NOTE(review): `\.3` in the first two patterns matches the literal text
    # ".3"; it may have been intended as `\.{3}` - confirm against the
    # preprocessing used at training time before changing it.
    substitutions = (
        (r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+\s+\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'\.{3}(?:\s+\.{3})+', '<big_gap>'),
        # Single-token notations.
        (r'xx', '<gap>'),
        # Only a lone "x" with a space on both sides; matches cannot overlap,
        # so in "x x" only the first x is replaced.
        (r' x ', ' <gap> '),
        (r'……', '<big_gap>'),
        (r'\.\.\.\.\.\.', '<big_gap>'),
        (r'…', '<big_gap>'),
        (r'\.\.\.', '<big_gap>'),
    )

    normalized = text
    for pattern, token in substitutions:
        normalized = re.sub(pattern, token, normalized)
    return normalized

# -----------------------------------------------------------------------------
# LOAD TEST DATA
# -----------------------------------------------------------------------------
print("\n" + "="*60)
print("LOADING TEST DATA")
print("="*60)

# Read the test set and normalize gap notation in the source column.
test_df = pd.read_csv(TEST_DATA_PATH)
test_df['transliteration'] = test_df['transliteration'].apply(replace_gaps)

# replace_gaps passes missing values through unchanged, and a bare
# astype(str) would turn them into the literal text "nan". Use an empty
# string for model input instead; the DataFrame column itself is left as is.
test_inputs = test_df['transliteration'].fillna("").astype(str).tolist()
source_lengths = [len(t.split()) for t in test_inputs]

# Guard the average so an empty test file reports 0.0 instead of raising
# ZeroDivisionError.
avg_source_length = sum(source_lengths) / len(source_lengths) if source_lengths else 0.0

print(f"✓ Loaded {len(test_df)} test samples")
print(f"✓ Average source length: {avg_source_length:.1f} words")

# -----------------------------------------------------------------------------
# INFERENCE DATASET
# -----------------------------------------------------------------------------
class InferenceDataset(Dataset):
    """Map-style dataset that tokenizes one prefixed source text per lookup.

    Args:
        texts: Iterable of source items. Each one is converted with ``str``
            and has ``prefix`` prepended once, at construction time.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
            Its result is indexed by ``"input_ids"`` and ``"attention_mask"``.
        max_length: Forwarded to the tokenizer as ``max_length``.
        prefix: String placed in front of every text (default: empty).
    """

    # Tokenizer output entries exposed per sample, in this order.
    _FIELDS = ("input_ids", "attention_mask")

    def __init__(self, texts, tokenizer, max_length, prefix=""):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Prefix eagerly so __getitem__ only has to tokenize.
        self.texts = [prefix + as_text for as_text in map(str, texts)]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors."""
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )
        # squeeze(0) drops the leading dimension of each returned tensor
        # (presumably the size-1 batch axis added by return_tensors="pt").
        return {field: encoded[field].squeeze(0) for field in self._FIELDS}

=== Deep Past Neural Ensemble Inference ===


2025-12-25 12:14:51.095337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766664891.298313      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766664891.357816      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766664891.851037      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766664891.851082      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766664891.851085      55 computation_placer.cc:177] computation placer alr

Inference byt5:   0%|          | 0/1 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Inference t5:   0%|          | 0/1 [00:00<?, ?it/s]



Inference marian:   0%|          | 0/1 [00:00<?, ?it/s]


Preview:
   id                                        translation
0   0  Kà-ar-ma ú big_gap da-tim aí-ip-ri-ni Akkadian...
1   1  -ni i-na né-mì-lim da-aùr ú-lá e-WA ia-ra-tí-a...
2   2  -it a-aí-im au-um-au ia-tí aé-bi„-lá-nim Trans...
3   3  É-bi„-lá KÙ. AN Translate Akkadian to English:...


In [None]:
# Per-model overrides applied on top of the defaults in MODEL_CONFIGS.
# Author's stated intent: max_length values match the lengths used during
# training (not verifiable from this cell), and the wider beam improves quality.
MODEL_CONFIGS['byt5'].update(max_length=256)
MODEL_CONFIGS['t5'].update(max_length=128)
MODEL_CONFIGS['marian'].update(max_length=160)
# Use the same beam width for every configured model.
for k in MODEL_CONFIGS:
    MODEL_CONFIGS[k].update(num_beams=6)
print("Adjusted MODEL_CONFIGS: max_length per model and num_beams=6")

In [None]:
# SUBMISSION GUIDE

## Quick Start

1. **Train all 3 models** using the training notebooks
2. **Save each as a Kaggle dataset**
3. **Add datasets as inputs** to this notebook
4. **Adjust weights** in MODEL_CONFIGS (use find-optimal-weights.ipynb)
5. **Run this notebook** → generates submission.csv

## Ensemble Modes

Set `ENSEMBLE_MODE` in the first cell:
- **"auto"** (default) - Tries weight averaging, falls back to voting
- **"voting"** - Always use voting ensemble (slower but works with any models)
- **"averaging"** - Only use weight averaging (faster but requires compatible models)

## How to Find Optimal Weights

Run `find-optimal-weights.ipynb` to automatically:
1. Evaluate each model on validation set
2. Calculate BLEU scores
3. Grid search for best weight combination
4. Output optimal weights

Then update MODEL_CONFIGS with the results.

## Default Weights

Current configuration (adjust based on your validation):
- **ByT5**: 0.35 (byte-level, good for morphology)
- **T5**: 0.40 (usually best overall)
- **MarianMT**: 0.25 (translation-focused)

## Model Paths

Update these if your datasets have different names:
- MODEL1_PATH: `/kaggle/input/notebook-a-byt5/byt5-base-saved`
- MODEL2_PATH: `/kaggle/input/notebook-b-t5/t5-base-fine-tuned`
- MODEL3_PATH: `/kaggle/input/notebook-c-marian-mt/marian-mt-saved`

## Tips for Better Performance

1. **Use validation scores** to set weights (don't guess!)
2. **Increase num_beams** to 6 or 8 for better quality (slower)
3. **Adjust max_length** per model based on typical output length
4. **Add repetition_penalty=1.2** if outputs are too repetitive
5. **Post-process** predictions for capitalization and punctuation

## Expected Behavior

- If all models have **same architecture** → Uses weight averaging (faster)
- If models have **different architectures** → Uses voting ensemble (more robust)
- If only **1 model** available → Uses single model
- Automatically handles missing models gracefully

## Troubleshooting

**Out of Memory?**
- Reduce BATCH_SIZE from 8 to 4 or 2
- Use ENSEMBLE_MODE="averaging" (loads only 1 model at a time)

**Low scores?**
- Run find-optimal-weights.ipynb to get better weights
- Check that gap replacement is working (should see `<gap>` and `<big_gap>`)
- Verify all models trained properly

**Models won't merge?**
- Normal! ByT5, T5, and MarianMT have different architectures
- Notebook will automatically use voting ensemble instead
- This is actually better for diverse models

## Architecture Compatibility

| Models | Compatible? | Strategy |
|--------|-------------|----------|
| ByT5 + T5 | Maybe ✓ | Can try averaging |
| ByT5 + MarianMT | No ✗ | Use voting |
| T5 + MarianMT | No ✗ | Use voting |
| All 3 | No ✗ | Use voting |

Voting ensemble is **more powerful** for different architectures anyway!
