In [None]:
# Install required packages
!pip install -q evaluate sacrebleu

# Find Optimal Ensemble Weights

This notebook helps you determine the best weights for your 3-model ensemble by evaluating on validation data.

## What This Notebook Does

1. **Loads validation data** (the last 5% of the training set)
2. **Generates predictions** from all 3 models
3. **Calculates BLEU scores** for each model individually
4. **Performs grid search** over weight combinations
5. **Outputs optimal weights** to use in submission notebook

## How to Use

1. **Train all 3 models** first
2. **Add them as inputs** to this notebook
3. **Run all cells**
4. **Copy the optimal weights** to `final-submission-notebook.ipynb`

## Expected Output

```text
ByT5:     BLEU = 25.5    weight = 0.35
T5:       BLEU = 26.8    weight = 0.42  ← Best
MarianMT: BLEU = 22.7    weight = 0.23

Ensemble: BLEU = 28.1   (+1.3 improvement)
```

## Time Required

- Small validation set (100 samples): ~5 minutes
- Full validation (1000+ samples): ~30-60 minutes
- Worth it for optimal performance!


In [None]:
import os
import re
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
import evaluate

# BLEU metric from the `evaluate` library; used by every scoring step below.
bleu_metric = evaluate.load("bleu")

# -----------------------------------------------------------------------------
# CONFIG
# -----------------------------------------------------------------------------
# Saved model directories (Kaggle inputs): ByT5, T5 and MarianMT respectively.
MODEL1_PATH = "/kaggle/input/notebook-a-byt5/byt5-base-saved"
MODEL2_PATH = "/kaggle/input/notebook-b-t5/t5-base-fine-tuned"
MODEL3_PATH = "/kaggle/input/notebook-c-marian-mt/marian-mt-saved"

# Directory that contains train.csv.
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

BATCH_SIZE = 8    # sequences per generation batch
MAX_LENGTH = 512  # tokenizer truncation length and generation max_length

print(f"Device: {DEVICE}")

In [None]:
# -----------------------------------------------------------------------------
# LOAD VALIDATION DATA
# -----------------------------------------------------------------------------
# Create the validation split from the training data.
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")

# Take the last 5% of the rows as validation.
val_size = int(len(train_df) * 0.05)
val_df = (
    train_df
    .tail(val_size)
    # Drop incomplete pairs before the string conversion below: `.astype(str)`
    # turns a missing value into the literal text "nan", which would then be
    # translated and scored as if it were real data.
    .dropna(subset=['transliteration', 'translation'])
    .reset_index(drop=True)
)

print(f"Validation samples: {len(val_df)}")

# Prepare validation data as plain Python string lists (source / reference).
val_sources = val_df['transliteration'].astype(str).tolist()
val_targets = val_df['translation'].astype(str).tolist()

In [None]:
# -----------------------------------------------------------------------------
# GAP REPLACEMENT
# -----------------------------------------------------------------------------
def replace_gaps(text):
    """Rewrite gap markers in ``text`` as ``<gap>`` / ``<big_gap>`` tokens.

    Missing values (anything ``pd.isna`` reports as NA) are returned
    unchanged. Otherwise the substitutions below are applied one after the
    other, each on the result of the previous one, and the final string is
    returned.
    """
    if pd.isna(text):
        return text

    # Order matters: multi-part patterns run before the single-marker ones so
    # that a whole run collapses into one token.
    # NOTE(review): the first two patterns match a literal ".3"; this may have
    # been meant as `\.{3}` - confirm against the preprocessing used when the
    # models were trained before changing it.
    substitutions = (
        (r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+\s+\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'xx', '<gap>'),
        (r' x ', ' <gap> '),
        (r'……', '<big_gap>'),
        (r'\.\.\.\.\.\.', '<big_gap>'),
        (r'…', '<big_gap>'),
        (r'\.\.\.', '<big_gap>'),
    )
    for pattern, token in substitutions:
        text = re.sub(pattern, token, text)
    return text

# Normalize gap markers in every validation source, keeping the original order.
val_sources = list(map(replace_gaps, val_sources))

In [None]:
# -----------------------------------------------------------------------------
# GENERATE PREDICTIONS FROM EACH MODEL
# -----------------------------------------------------------------------------
class InferenceDataset(Dataset):
    """Map-style dataset that tokenizes one prefixed source text per item.

    Args:
        texts: iterable of inputs; each element is converted with ``str()``.
        tokenizer: callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
        max_length: length passed to the tokenizer for padding/truncation.
        prefix: string prepended to every text.
    """

    def __init__(self, texts, tokenizer, max_length, prefix=""):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Prefix is applied once up front so __getitem__ only has to tokenize.
        self.texts = [prefix + str(text) for text in texts]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading dimension of size one for the
        # single text; squeeze it so batching happens in the DataLoader.
        return {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }

def generate_predictions(model_path, sources, prefix="translate Akkadian to English: "):
    """Translate ``sources`` with the seq2seq model stored at ``model_path``.

    The tokenizer and model are loaded from ``model_path``, the model is moved
    to ``DEVICE`` and put in eval mode, and the sources (each prepended with
    ``prefix``) are decoded in batches of ``BATCH_SIZE`` with beam search
    (4 beams, early stopping, ``max_length=MAX_LENGTH``).

    Returns:
        list[str]: one whitespace-stripped prediction per source, in the same
        order as ``sources`` (the loader does not shuffle).
    """
    print(f"\nGenerating from {model_path}...")

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    model = model.to(DEVICE)
    model.eval()

    loader = DataLoader(
        InferenceDataset(sources, tokenizer, MAX_LENGTH, prefix),
        batch_size=BATCH_SIZE,
        shuffle=False,
    )

    predictions = []
    with torch.no_grad():
        for batch in tqdm(loader):
            generated = model.generate(
                input_ids=batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE),
                max_length=MAX_LENGTH,
                num_beams=4,
                early_stopping=True,
            )
            for text in tokenizer.batch_decode(generated, skip_special_tokens=True):
                predictions.append(text.strip())

    # Drop the local references and release cached GPU memory before the next
    # model is loaded.
    del model, tokenizer
    torch.cuda.empty_cache()

    return predictions

# Run the three models one after another over the same validation sources.
print("="*60, "GENERATING PREDICTIONS FROM ALL MODELS", "="*60, sep="\n")

preds_byt5 = generate_predictions(
    MODEL1_PATH, val_sources, prefix="translate Akkadian to English: "
)
preds_t5 = generate_predictions(
    MODEL2_PATH, val_sources, prefix="translate Akkadian to English: "
)
preds_marian = generate_predictions(
    MODEL3_PATH, val_sources, prefix=">>eng<< "
)

In [None]:
# -----------------------------------------------------------------------------
# EVALUATE INDIVIDUAL MODELS
# -----------------------------------------------------------------------------
def calculate_bleu(predictions, references):
    """Return the corpus BLEU of ``predictions`` against ``references``, times 100.

    Each reference string is wrapped in its own single-element list before
    being handed to ``bleu_metric.compute``, i.e. one reference per prediction.
    """
    wrapped_references = [[reference] for reference in references]
    metrics = bleu_metric.compute(
        predictions=predictions,
        references=wrapped_references,
    )
    # The metric's 'bleu' value is scaled by 100 to report a percentage.
    return metrics['bleu'] * 100

print("\n" + "="*60)
print("INDIVIDUAL MODEL SCORES")
print("="*60)

# Score each model's predictions against the same validation references.
score_byt5 = calculate_bleu(preds_byt5, val_targets)
score_t5 = calculate_bleu(preds_t5, val_targets)
score_marian = calculate_bleu(preds_marian, val_targets)

print(f"ByT5:     BLEU = {score_byt5:.2f}")
print(f"T5:       BLEU = {score_t5:.2f}")
print(f"MarianMT: BLEU = {score_marian:.2f}")

# Calculate proportional weights: each model's share of the summed BLEU.
total_score = score_byt5 + score_t5 + score_marian
if total_score > 0:
    w1_prop = score_byt5 / total_score
    w2_prop = score_t5 / total_score
    w3_prop = score_marian / total_score
else:
    # Every model scored 0 BLEU, so the shares are undefined (0 / 0). Fall back
    # to uniform weights instead of stopping the notebook with a
    # ZeroDivisionError.
    w1_prop = w2_prop = w3_prop = 1.0 / 3.0

print(f"\nProportional weights:")
print(f"  ByT5:     {w1_prop:.4f}")
print(f"  T5:       {w2_prop:.4f}")
print(f"  MarianMT: {w3_prop:.4f}")

In [None]:
# -----------------------------------------------------------------------------
# GRID SEARCH FOR OPTIMAL WEIGHTS
# -----------------------------------------------------------------------------
def ensemble_with_weights(preds1, preds2, preds3, w1, w2, w3, references):
    """Build a per-sample ensemble from three prediction lists and score it.

    For every index in ``references`` the three candidates are ranked by
    ``weight * number_of_whitespace_separated_tokens`` and the top one is
    kept; on a tie the earlier model (preds1, then preds2, then preds3) wins.

    NOTE(review): the selection key is weight times prediction length, not the
    weight alone, so longer outputs are favoured - confirm this is intended.

    Returns:
        tuple: ``(bleu, ensemble_preds)`` where ``bleu`` comes from
        ``calculate_bleu(ensemble_preds, references)``.
    """
    def selection_score(weighted_candidate):
        weight, text = weighted_candidate
        return weight * len(text.split())

    ensemble_preds = []
    for idx in range(len(references)):
        candidates = (
            (w1, preds1[idx]),
            (w2, preds2[idx]),
            (w3, preds3[idx]),
        )
        _, chosen = max(candidates, key=selection_score)
        ensemble_preds.append(chosen)

    return calculate_bleu(ensemble_preds, references), ensemble_preds

print("\n" + "="*60)
print("GRID SEARCH FOR OPTIMAL WEIGHTS")
print("="*60)

# Start below any attainable BLEU so the first evaluated combination is always
# recorded; with a start of 0 and a strict '>' an all-zero run would leave
# best_weights as None and break the report below.
best_score = float("-inf")
best_weights = None
best_predictions = None

# Search over weight combinations: w1 and w2 take 0.2, 0.3, 0.4, 0.5.
# The grid is built from integers so each value is the closest float to the
# intended decimal (np.arange with a float step accumulates error, e.g.
# 0.30000000000000004).
search_range = [tenths / 10 for tenths in range(2, 6)]

results = []

for w1 in search_range:
    for w2 in search_range:
        # Round away subtraction error: 1.0 - 0.5 - 0.4 evaluates to
        # 0.09999999999999998, which would otherwise fail the 0.1 lower bound
        # and silently drop a valid combination (same at the 0.6 upper bound).
        w3 = round(1.0 - w1 - w2, 10)

        # Skip invalid combinations
        if w3 < 0.1 or w3 > 0.6:
            continue

        score, preds = ensemble_with_weights(
            preds_byt5, preds_t5, preds_marian,
            w1, w2, w3,
            val_targets
        )

        results.append({
            'w1': w1,
            'w2': w2,
            'w3': w3,
            'bleu': score
        })

        if score > best_score:
            best_score = score
            best_weights = (w1, w2, w3)
            best_predictions = preds

# Display results
results_df = pd.DataFrame(results).sort_values('bleu', ascending=False)

print("\nTop 10 weight combinations:")
print(results_df.head(10).to_string(index=False))

print(f"\n{'='*60}")
print(f"BEST WEIGHTS FOUND")
print(f"{'='*60}")
print(f"ByT5:     {best_weights[0]:.4f}")
print(f"T5:       {best_weights[1]:.4f}")
print(f"MarianMT: {best_weights[2]:.4f}")
print(f"BLEU:     {best_score:.2f}")

# The '+' format flag prints an explicit sign, so a regression shows up as
# "-0.50" rather than the misleading "+-0.50".
improvement = best_score - max(score_byt5, score_t5, score_marian)
print(f"\nImprovement over best single model: {improvement:+.2f}")

In [None]:
# -----------------------------------------------------------------------------
# SAVE OPTIMAL WEIGHTS
# -----------------------------------------------------------------------------
import json

# Everything needed to reproduce the chosen ensemble: the selected weights,
# the BLEU they reached on validation, and each model's stand-alone BLEU.
optimal_config = dict(
    weights=dict(
        byt5=best_weights[0],
        t5=best_weights[1],
        marian=best_weights[2],
    ),
    validation_bleu=best_score,
    individual_scores=dict(
        byt5=score_byt5,
        t5=score_t5,
        marian=score_marian,
    ),
)

print("\n" + "="*60, "RECOMMENDED CONFIGURATION", "="*60, sep="\n")
# Ready-to-paste snippet; only the three weight values are filled in here.
print(f"""
Copy these weights to your ensemble notebook:

MODEL_CONFIGS = {{
    "byt5": {{
        "path": MODEL1_PATH,
        "prefix": "translate Akkadian to English: ",
        "max_length": 512,
        "weight": {best_weights[0]:.4f},
        "num_beams": 4
    }},
    "t5": {{
        "path": MODEL2_PATH,
        "prefix": "translate Akkadian to English: ",
        "max_length": 512,
        "weight": {best_weights[1]:.4f},
        "num_beams": 4
    }},
    "marian": {{
        "path": MODEL3_PATH,
        "prefix": ">>eng<< ",
        "max_length": 512,
        "weight": {best_weights[2]:.4f},
        "num_beams": 4
    }}
}}
""")

# Persist the configuration as JSON in the current working directory.
with open('optimal_weights.json', 'w') as f:
    json.dump(optimal_config, f, indent=2)

print("✅ Optimal weights saved to optimal_weights.json")

## How to Use These Results

1. **Copy the optimal weights** to your ensemble notebook
2. **Update MODEL_CONFIGS** with the recommended values
3. **Run the ensemble** on test data
4. **Submit** to competition

## Tips for Further Improvement

- Try different **num_beams** values (4, 6, 8)
- Adjust **max_length** per model based on output patterns
- Add **repetition_penalty** if outputs are repetitive
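- Implement **more sophisticated voting** (e.g., consensus selection: for each sample, pick the candidate with the highest average BLEU against the other models' outputs)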
- Use **temperature** sampling for diversity

## Expected Performance

With optimal weights, expect:
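- **Validation BLEU**: the `BLEU` value printed under "BEST WEIGHTS FOUND" (also saved as `validation_bleu` in `optimal_weights.json`)
- **Improvement over single best**: the "Improvement over best single model" value printed by the grid-search cell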
- **Test performance**: Should generalize well if validation set is representative