In [None]:
# DEEP PAST CHALLENGE - ENSEMBLE SUBMISSION
import os
import re
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
import gc

# -----------------------------------------------------------------------------
# CONFIG
# -----------------------------------------------------------------------------
# Checkpoint directories. Each one can be overridden with the environment
# variable of the same name; otherwise the Kaggle input path is used.
MODEL1_PATH = os.getenv("MODEL1_PATH", "/kaggle/input/notebook-a-byt5/byt5-base-saved")
MODEL2_PATH = os.getenv("MODEL2_PATH", "/kaggle/input/notebook-b-t5/t5-base-fine-tuned")
MODEL3_PATH = os.getenv("MODEL3_PATH", "/kaggle/input/notebook-c-marian-mt/marian-mt-saved")

TEST_DATA_PATH = "/kaggle/input/deep-past-initiative-machine-translation/test.csv"
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
BATCH_SIZE = 8
MAX_LENGTH = 512

# -----------------------------------------------------------------------------
# ENSEMBLE STRATEGY
# -----------------------------------------------------------------------------
# "voting"    -> always run every model and pick one prediction per sample
# "averaging" -> try to merge checkpoints by weight averaging first
# any other value (default "auto") -> same flow as "averaging"; only the
#                                     progress messages differ
ENSEMBLE_MODE = os.getenv("ENSEMBLE_MODE", "auto")

# -----------------------------------------------------------------------------
# MODEL CONFIGURATIONS & WEIGHTS
# -----------------------------------------------------------------------------
# "weight" is the relative importance of each model. Tune it from validation
# scores (see find-optimal-weights.ipynb) or start from equal weights (0.333).
# Insertion order matters: it is the order in which models are loaded/merged.
MODEL_CONFIGS = {
    "byt5": dict(
        path=MODEL1_PATH,
        prefix="translate Akkadian to English: ",
        max_length=512,
        weight=0.35,
        num_beams=4,
    ),
    "t5": dict(
        path=MODEL2_PATH,
        prefix="translate Akkadian to English: ",
        max_length=512,
        weight=0.40,
        num_beams=4,
    ),
    "marian": dict(
        path=MODEL3_PATH,
        prefix=">>eng<< ",
        max_length=512,
        weight=0.25,
        num_beams=4,
    ),
}

# Report the run configuration and whether each checkpoint directory exists.
print(f"Device: {DEVICE}")
print(f"Ensemble Mode: {ENSEMBLE_MODE}")
print("\nModel Weights:")
for name, config in MODEL_CONFIGS.items():
    if os.path.exists(config["path"]):
        exists = "‚úì"
    else:
        exists = "‚úó"
    print(f"  {name:10s} {exists} weight={config['weight']:.2f}")

# -----------------------------------------------------------------------------
# GAP REPLACEMENT FUNCTIONS
# -----------------------------------------------------------------------------
# Ordered (pattern, replacement) rules applied by replace_gaps().
# Order matters: multi-part gap notations must be collapsed before the
# single "..." / ellipsis rules get a chance to split them up.
_GAP_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Complex gap patterns (kept verbatim).
        # NOTE(review): `\.3` matches the literal text ".3"; it looks like a
        # damaged `\.{3}`. If so, the third rule already covers both cases.
        (r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+\s+\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'\.{3}(?:\s+\.{3})+', '<big_gap>'),
        # Simple gap patterns
        (r'xx', '<gap>'),
        (r' x ', ' <gap> '),
        # Mis-encoded ellipsis sequence, kept for backward compatibility.
        (r'‚Ä¶‚Ä¶', '<big_gap>'),
        # Real Unicode ellipsis (U+2026), written as an escape so the rule
        # survives any re-encoding of this source file.
        ('\u2026\u2026', '<big_gap>'),
        (r'\.\.\.\.\.\.', '<big_gap>'),
        (r'‚Ä¶', '<big_gap>'),
        ('\u2026', '<big_gap>'),
        (r'\.\.\.', '<big_gap>'),
    )
)


def replace_gaps(text):
    """Replace the various gap notations in *text* with standardized tokens.

    Short gaps ("xx", or a lone " x " between spaces) become ``<gap>``; runs
    of dots and ellipsis characters become ``<big_gap>``.

    Args:
        text: A transliteration string, or a missing value (None / NaN).

    Returns:
        The normalized string. Missing values are returned unchanged.
    """
    if pd.isna(text):
        return text

    for pattern, replacement in _GAP_RULES:
        text = pattern.sub(replacement, text)

    return text

# -----------------------------------------------------------------------------
# LOAD TEST DATA
# -----------------------------------------------------------------------------
print("\n" + "="*60)
print("LOADING TEST DATA")
print("="*60)

# Read the test set and normalize gap notations in the source text.
test_df = pd.read_csv(TEST_DATA_PATH)
test_df['transliteration'] = test_df['transliteration'].apply(replace_gaps)

# replace_gaps() passes missing values through; map them to an empty string so
# the models are not fed the literal text "nan" produced by astype(str).
test_inputs = test_df['transliteration'].fillna("").astype(str).tolist()

# Whitespace-separated word count per source text (used for length scoring).
source_lengths = [len(t.split()) for t in test_inputs]

# Guard the average against an empty test set (would divide by zero).
avg_source_length = sum(source_lengths) / len(source_lengths) if source_lengths else 0.0

print(f"‚úì Loaded {len(test_df)} test samples")
print(f"‚úì Average source length: {avg_source_length:.1f} words")

# -----------------------------------------------------------------------------
# INFERENCE DATASET
# -----------------------------------------------------------------------------
class InferenceDataset(Dataset):
    """Dataset that tokenizes prefixed source texts on access.

    Every text is converted with ``str()`` and prepended with *prefix* when the
    dataset is built. Indexing tokenizes one text, padded/truncated to
    *max_length*, and returns its tensors without the leading batch axis.
    """

    # Tensor fields copied from the tokenizer output into each item.
    _FIELDS = ("input_ids", "attention_mask")

    def __init__(self, texts, tokenizer, max_length, prefix=""):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = [prefix + str(raw) for raw in texts]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; squeeze(0) drops that axis.
        return {field: encoded[field].squeeze(0) for field in self._FIELDS}

# -----------------------------------------------------------------------------
# STRATEGY 1: TRY WEIGHT AVERAGING
# -----------------------------------------------------------------------------
def _state_dicts_match(reference, candidate):
    """Return True when two state dicts share the same keys and tensor shapes."""
    if reference.keys() != candidate.keys():
        return False
    return all(reference[key].shape == candidate[key].shape for key in reference)


def _merge_into(target_name, names, models, state_dicts, configs):
    """Weight-average the checkpoints in *names* and load them into one model.

    The configured weights of the selected models are normalized to sum to 1.
    The averaged parameters are loaded into ``models[target_name]``, whose
    tokenizer is loaded as well. ``models`` and ``state_dicts`` are emptied so
    that the remaining checkpoints can be garbage-collected.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    w_total = sum(configs[name]["weight"] for name in names)
    shares = {name: configs[name]["weight"] / w_total for name in names}

    merged_sd = {
        key: sum(shares[name] * state_dicts[name][key] for name in names)
        for key in state_dicts[target_name]
    }

    model = models[target_name]
    model.load_state_dict(merged_sd)
    tokenizer = AutoTokenizer.from_pretrained(configs[target_name]["path"])

    # Drop every reference to the other checkpoints. Deleting a loop variable
    # would not release anything because the dicts still hold the objects.
    models.clear()
    state_dicts.clear()
    del merged_sd
    gc.collect()
    torch.cuda.empty_cache()

    return model, tokenizer


def try_weight_averaging():
    """Try to merge the available models by averaging their weights.

    Models whose path exists are loaded. If all of them have identical
    parameter names and shapes they are all merged into the first model;
    otherwise, if the first two are compatible with each other, those two are
    merged into the second model.

    Returns:
        ``(model, tokenizer)`` for the merged model, or ``(None, None)`` when
        fewer than two models are available, no compatible set is found, or
        any step fails.
    """
    print("\n" + "="*60)
    print("ATTEMPTING WEIGHT AVERAGING")
    print("="*60)

    available_models = {k: v for k, v in MODEL_CONFIGS.items() if os.path.exists(v["path"])}

    if len(available_models) < 2:
        print("‚ö†Ô∏è  Need at least 2 models for weight averaging")
        return None, None

    # Deliberately broad: averaging is best-effort and the caller falls back
    # to the voting ensemble whenever (None, None) is returned.
    try:
        # Load all models
        models = {}
        state_dicts = {}

        for name, config in available_models.items():
            print(f"Loading {name}...")
            models[name] = AutoModelForSeq2SeqLM.from_pretrained(config["path"])
            state_dicts[name] = models[name].state_dict()

        names = list(state_dicts)
        reference = state_dicts[names[0]]

        # Compatibility requires matching parameter names AND shapes; equal
        # key sets alone do not guarantee that tensors can be averaged.
        if all(_state_dicts_match(reference, state_dicts[name]) for name in names[1:]):
            print("‚úì All models compatible - merging all")
            merged = _merge_into(names[0], names, models, state_dicts, available_models)
            print(f"‚úì Created {len(available_models)}-model ensemble")
            return merged

        print("‚ö†Ô∏è  Models have incompatible architectures")

        # Pairwise fallback: only the first two available models are tried.
        if not _state_dicts_match(reference, state_dicts[names[1]]):
            return None, None

        print(f"‚úì Can merge {names[0]} and {names[1]}")
        merged = _merge_into(names[1], names[:2], models, state_dicts, available_models)
        print(f"‚úì Created 2-model ensemble ({names[0]} + {names[1]})")
        return merged

    except Exception as e:
        print(f"‚úó Weight averaging failed: {e}")
        return None, None

# -----------------------------------------------------------------------------
# STRATEGY 2: VOTING ENSEMBLE
# -----------------------------------------------------------------------------
def _score_prediction(pred, weight, source_length):
    """Score one candidate translation; return None for an empty candidate.

    The score is ``weight * (word_count + length_penalty)``. The penalty is 0
    while the target/source word ratio lies strictly between 0.5 and 3, and
    ``-10 * abs(ratio - 1.5)`` otherwise.

    NOTE(review): because the penalty is multiplied by the weight too, a
    negative total is made *more* negative for higher-weighted models.
    Confirm this is intended before changing the formula.
    """
    if not pred:
        return None
    pred_len = len(pred.split())
    ratio = pred_len / max(1, source_length)
    length_score = 0 if 0.5 < ratio < 3 else -10 * abs(ratio - 1.5)
    return weight * (pred_len + length_score)


def voting_ensemble():
    """Generate predictions with every available model and pick one per sample.

    Each model in MODEL_CONFIGS whose path exists is loaded in turn, run over
    ``test_inputs`` with beam search, and released before the next one is
    loaded. A model that fails to load or generate is reported and skipped.
    For every sample the candidate with the highest heuristic score wins;
    "broken text" is used when no model produced a non-empty candidate.

    Returns:
        List of translations, one per entry of ``test_inputs``.

    Raises:
        RuntimeError: If no model produced predictions.
    """
    print("\n" + "="*60)
    print("USING VOTING ENSEMBLE")
    print("="*60)

    all_predictions = {}

    for model_name, config in MODEL_CONFIGS.items():
        if not os.path.exists(config["path"]):
            print(f"‚ö†Ô∏è  Skipping {model_name}: Model not found")
            continue

        print(f"\n[{model_name.upper()}]")
        print(f"Loading from {config['path']}...")

        # Pre-bind so the cleanup in `finally` is valid even if loading fails.
        tokenizer = model = dataset = loader = None
        try:
            tokenizer = AutoTokenizer.from_pretrained(config["path"])
            model = AutoModelForSeq2SeqLM.from_pretrained(config["path"])
            model = model.to(DEVICE)
            model.eval()

            dataset = InferenceDataset(test_inputs, tokenizer, config["max_length"], config["prefix"])
            loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

            predictions = []
            with torch.no_grad():
                for batch in tqdm(loader, desc=f"{model_name}"):
                    outputs = model.generate(
                        input_ids=batch["input_ids"].to(DEVICE),
                        attention_mask=batch["attention_mask"].to(DEVICE),
                        max_length=config["max_length"],
                        num_beams=config["num_beams"],
                        early_stopping=True,
                    )
                    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
                    predictions.extend([d.strip() for d in decoded])

            # Only a model that finished the whole test set takes part in voting.
            all_predictions[model_name] = {
                "preds": predictions,
                "weight": config["weight"]
            }

            print(f"‚úì Generated {len(predictions)} predictions")

        except Exception as e:
            # Best effort: report the failing model and continue with the rest.
            print(f"‚úó Error with {model_name}: {e}")

        finally:
            # Release the model on success and on failure, so a broken model
            # does not keep memory occupied while the next one is loaded.
            del model, tokenizer, dataset, loader
            gc.collect()
            torch.cuda.empty_cache()

    if not all_predictions:
        raise RuntimeError("No models loaded successfully!")

    # Combine predictions via weighted voting
    print(f"\nCombining predictions from {len(all_predictions)} models...")
    final_predictions = []

    for i, source_length in enumerate(source_lengths):
        best_pred = "broken text"
        best_score = -999999

        for data in all_predictions.values():
            pred = data["preds"][i] if i < len(data["preds"]) else ""

            # Prefer a reasonable length (0.5x to 3x the source word count).
            score = _score_prediction(pred, data["weight"], source_length)
            if score is not None and score > best_score:
                best_score = score
                best_pred = pred

        final_predictions.append(best_pred)

    return final_predictions

# -----------------------------------------------------------------------------
# MAIN ENSEMBLE LOGIC
# -----------------------------------------------------------------------------
def _generate_with_merged_model(model, tokenizer):
    """Translate ``test_inputs`` with a single (weight-averaged) model.

    The model is moved to DEVICE, switched to eval mode and run with beam
    search (4 beams) over the whole test set in batches of BATCH_SIZE.

    NOTE(review): the input prefix is hard-coded to the T5-style prompt. If a
    merge ever produces a model that expects another prefix (the Marian config
    uses ">>eng<< "), this needs to follow the merged model. Confirm.

    Returns:
        List of stripped, decoded translations in input order.
    """
    model = model.to(DEVICE).eval()
    dataset = InferenceDataset(test_inputs, tokenizer, MAX_LENGTH, "translate Akkadian to English: ")
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

    predictions = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Inference"):
            outputs = model.generate(
                input_ids=batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE),
                max_length=MAX_LENGTH,
                num_beams=4,
                early_stopping=True,
            )
            decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            predictions.extend([d.strip() for d in decoded])

    return predictions


print("\n" + "="*60)
print("STARTING ENSEMBLE")
print("="*60)

final_predictions = None

# Any value other than "voting"/"averaging" runs the auto flow; say so when
# the value is not one of the documented modes instead of staying silent.
if ENSEMBLE_MODE not in ("auto", "voting", "averaging"):
    print(f"Unknown ENSEMBLE_MODE {ENSEMBLE_MODE!r}; using auto mode")

if ENSEMBLE_MODE == "voting":
    # Force voting ensemble
    final_predictions = voting_ensemble()

else:
    # "averaging" and auto mode share one flow: try weight averaging first and
    # fall back to voting. Only the progress messages differ between them.
    merged_model, tokenizer = try_weight_averaging()

    if merged_model is not None:
        if ENSEMBLE_MODE != "averaging":
            print("\n‚úì Using weight-averaged model")
        final_predictions = _generate_with_merged_model(merged_model, tokenizer)
    elif ENSEMBLE_MODE == "averaging":
        print("Falling back to voting ensemble...")
        final_predictions = voting_ensemble()
    else:
        print("\n‚úì Using voting ensemble (models incompatible for averaging)")
        final_predictions = voting_ensemble()

# -----------------------------------------------------------------------------
# CREATE SUBMISSION
# -----------------------------------------------------------------------------
print("\n" + "="*60)
print("CREATING SUBMISSION")
print("="*60)


def _or_placeholder(text):
    """Return *text* unchanged, or "broken text" when it is empty/falsy."""
    if text and len(text) > 0:
        return text
    return "broken text"


# One row per test id, paired with its final translation.
submission = pd.DataFrame({"id": test_df["id"], "translation": final_predictions})

# Every row gets a non-empty translation before the file is written.
submission["translation"] = submission["translation"].apply(_or_placeholder)
submission.to_csv("submission.csv", index=False)

print("‚úÖ Submission saved to submission.csv")

# Summary figures: row count, mean words per translation, placeholder rows.
words_per_row = submission["translation"].str.split().str.len()
placeholder_rows = submission["translation"] == "broken text"
print("\nStatistics:")
print(f"  Total: {len(submission)}")
print(f"  Avg length: {words_per_row.mean():.1f} words")
print(f"  Empty/broken: {placeholder_rows.sum()}")

print("\nüìã Sample predictions:")
print(submission.head(10).to_string(index=False))


=== Deep Past Neural Ensemble Inference ===


2025-12-25 12:14:51.095337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766664891.298313      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766664891.357816      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766664891.851037      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766664891.851082      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766664891.851085      55 computation_placer.cc:177] computation placer alr

Inference byt5:   0%|          | 0/1 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Inference t5:   0%|          | 0/1 [00:00<?, ?it/s]



Inference marian:   0%|          | 0/1 [00:00<?, ?it/s]


Preview:
   id                                        translation
0   0  K√†-ar-ma √∫ big_gap da-tim a√≠-ip-ri-ni Akkadian...
1   1  -ni i-na n√©-m√¨-lim da-a√πr √∫-l√° e-WA ia-ra-t√≠-a...
2   2  -it a-a√≠-im au-um-au ia-t√≠ a√©-bi‚Äû-l√°-nim Trans...
3   3  √â-bi‚Äû-l√° K√ô. AN Translate Akkadian to English:...


In [None]:
# SUBMISSION GUIDE

## Quick Start

1. **Train all 3 models** using the training notebooks
2. **Save each as a Kaggle dataset**
3. **Add datasets as inputs** to this notebook
4. **Adjust weights** in MODEL_CONFIGS (use find-optimal-weights.ipynb)
5. **Run this notebook** → generates submission.csv

## Ensemble Modes

Set `ENSEMBLE_MODE` in the first cell:
- **"auto"** (default) - Tries weight averaging, falls back to voting
- **"voting"** - Always use voting ensemble (slower but works with any models)
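- **"averaging"** - Prefer weight averaging (faster inference with a single merged model, but requires compatible models); falls back to the voting ensemble if the models cannot be merged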

## How to Find Optimal Weights

Run `find-optimal-weights.ipynb` to automatically:
1. Evaluate each model on validation set
2. Calculate BLEU scores
3. Grid search for best weight combination
4. Output optimal weights

Then update MODEL_CONFIGS with the results.

## Default Weights

Current configuration (adjust based on your validation):
- **ByT5**: 0.35 (character-level, good for morphology)
- **T5**: 0.40 (usually best overall)
- **MarianMT**: 0.25 (translation-focused)

## Model Paths

Update these if your datasets have different names:
- MODEL1_PATH: `/kaggle/input/notebook-a-byt5/byt5-base-saved`
- MODEL2_PATH: `/kaggle/input/notebook-b-t5/t5-base-fine-tuned`
- MODEL3_PATH: `/kaggle/input/notebook-c-marian-mt/marian-mt-saved`

## Tips for Better Performance

1. **Use validation scores** to set weights (don't guess!)
2. **Increase num_beams** to 6 or 8 for better quality (slower)
3. **Adjust max_length** per model based on typical output length
4. **Add repetition_penalty=1.2** if outputs are too repetitive
5. **Post-process** predictions for capitalization and punctuation

## Expected Behavior

- If all models have **same architecture** → Uses weight averaging (faster)
- If models have **different architectures** → Uses voting ensemble (more robust)
- If only **1 model** available → Uses single model
- Automatically handles missing models gracefully

## Troubleshooting

**Out of Memory?**
- Reduce BATCH_SIZE from 8 to 4 or 2
- Use ENSEMBLE_MODE="voting" (loads only 1 model at a time; weight averaging loads every checkpoint at once before merging)

**Low scores?**
- Run find-optimal-weights.ipynb to get better weights
- Check that gap replacement is working (should see `<gap>` and `<big_gap>`)
- Verify all models trained properly

**Models won't merge?**
- Normal! ByT5, T5, and MarianMT have different architectures
- Notebook will automatically use voting ensemble instead
- This is actually better for diverse models

## Architecture Compatibility

| Models | Compatible? | Strategy |
|--------|-------------|----------|
| ByT5 + T5 | Maybe ✓ | Can try averaging |
| ByT5 + MarianMT | No ✗ | Use voting |
| T5 + MarianMT | No ✗ | Use voting |
| All 3 | No ✗ | Use voting |

Voting ensemble is **more powerful** for different architectures anyway!
