In [1]:
# Install required packages and fix pyarrow incompatibility
!pip install -q --upgrade pyarrow
!pip install -q evaluate sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

# Find Optimal Ensemble Weights

This notebook evaluates three ByT5 checkpoints and suggests weights for a lightweight ensemble/model soup.

## What This Notebook Does

1. **Loads validation data** (the last 5% of the rows in `train.csv`)
2. **Generates predictions** from all 3 ByT5 variants
3. **Calculates BLEU scores** for each checkpoint
4. **Performs grid search** over weight combinations
5. **Outputs optimal weights** to use in the submission notebook

## How to Use

1. **Train all 3 ByT5 models** first
2. **Add them as inputs** to this notebook
3. **Run all cells**
4. **Copy the optimal weights** to `final-submission-notebook.ipynb`

## Expected Output

```text
ByT5-Purist:      BLEU = 25.5    weight = 0.35
ByT5-Greedy:      BLEU = 26.0    weight = 0.35
ByT5-Specialist:  BLEU = 24.5    weight = 0.30

Ensemble: BLEU ≈ 28.2   (+2.2 over the best single checkpoint)
```

## Time Required

- Small validation set (100 samples): ~5 minutes
- Full validation (1000+ samples): ~30-60 minutes
- Worth it for optimal performance!


In [None]:
import json
import os
import re

import evaluate
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load BLEU metric
# (looked up by name through the `evaluate` library; calculate_bleu below
# calls its .compute method)
bleu_metric = evaluate.load("bleu")

# -----------------------------------------------------------------------------
# CONFIG (ByT5-only ensemble)
# -----------------------------------------------------------------------------
# Saved checkpoint directories of the three ByT5 variants being compared.
MODEL_PURIST = "/kaggle/input/notebook-a-byt5/byt5-base-saved"
MODEL_GREEDY = "/kaggle/input/notebook-b-byt5-augmented/byt5-greedy-saved"
MODEL_SPECIALIST = "/kaggle/input/notebook-c-byt5-dropout/byt5-specialist-saved"

# Directory that train.csv is read from.
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Number of examples per DataLoader batch during generation.
BATCH_SIZE = 2
# Used twice: as the tokenizer padding/truncation length for the inputs and as
# the max_length passed to model.generate for the outputs.
MAX_LENGTH = 512
# Task prefix prepended to every source text before it is tokenized.
PREFIX = "translate Akkadian to English: "

print(f"Device: {DEVICE}")

2026-01-08 08:11:44.789339: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767859904.995575      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767859905.055391      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767859905.521448      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767859905.521488      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767859905.521491      24 computation_placer.cc:177] computation placer alr

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Device: cuda


In [3]:
# -----------------------------------------------------------------------------
# LOAD VALIDATION DATA
# -----------------------------------------------------------------------------
# Create validation split from training data
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")

# Take last 5% as validation (never fewer than one row, so the split cannot
# come out empty on a very small training file).
# NOTE(review): these rows come from train.csv itself; if the checkpoints were
# trained on the whole file the BLEU scores will be optimistic — confirm that
# the training notebooks held the same rows out.
val_size = max(1, int(len(train_df) * 0.05))
val_df = (
    train_df.tail(val_size)
    # Drop incomplete pairs first: astype(str) below would otherwise turn a
    # missing cell into the literal text "nan" and score it as a real sentence.
    .dropna(subset=['transliteration', 'translation'])
    .reset_index(drop=True)
)

print(f"Validation samples: {len(val_df)}")

# Prepare validation data as plain Python lists of strings
val_sources = val_df['transliteration'].astype(str).tolist()
val_targets = val_df['translation'].astype(str).tolist()

Validation samples: 78


In [4]:
# -----------------------------------------------------------------------------
# GAP REPLACEMENT
# -----------------------------------------------------------------------------
def replace_gaps(text):
    """Rewrite gap/damage markers in ``text`` as ``<gap>`` / ``<big_gap>`` tokens.

    A value that ``pd.isna`` reports as missing is handed back untouched.
    Otherwise the substitutions listed below are applied one after another,
    each to the result of the previous one, so their order matters.
    """
    if pd.isna(text):
        return text

    # (pattern, replacement) pairs, applied strictly from top to bottom.
    substitutions = (
        (r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+\s+\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'\.3(?:\s+\.3)+\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'\.{3}(?:\s+\.{3})+', '<big_gap>'),
        (r'xx', '<gap>'),
        (r' x ', ' <gap> '),
        (r'……', '<big_gap>'),
        (r'\.\.\.\.\.\.', '<big_gap>'),
        (r'…', '<big_gap>'),
        (r'\.\.\.', '<big_gap>'),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Normalise the gap markers of every validation source, keeping the list order.
val_sources = list(map(replace_gaps, val_sources))

In [None]:
# -----------------------------------------------------------------------------
# GENERATE PREDICTIONS FROM EACH MODEL
# -----------------------------------------------------------------------------
class InferenceDataset(Dataset):
    """Dataset that tokenizes one prefixed source text per lookup.

    Args:
        texts: iterable of source items; each is converted with ``str`` and
            has ``prefix`` put in front of it when the dataset is built.
        tokenizer: callable used to encode a single text in ``__getitem__``.
        max_length: length every example is padded / truncated to.
        prefix: string prepended to every text (empty by default).
    """

    def __init__(self, texts, tokenizer, max_length, prefix=""):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Prefixing happens once, up front; tokenization is deferred to lookup.
        self.texts = [prefix + as_text for as_text in map(str, texts)]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer is asked for "pt" tensors for a single text; squeeze(0)
        # removes the leading dimension so a DataLoader can stack the examples.
        return {
            field: encoding[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }

def generate_predictions(model_path, sources, prefix=PREFIX):
    """Translate every item of ``sources`` with the checkpoint at ``model_path``.

    The tokenizer and seq2seq model are loaded from ``model_path``, the model
    is moved to ``DEVICE`` and put in eval mode, and the sources are decoded in
    order, ``BATCH_SIZE`` at a time, with 4-beam search under ``torch.no_grad``.

    Args:
        model_path: directory (or identifier) handed to ``from_pretrained``.
        sources: source texts to translate.
        prefix: string prepended to each source before tokenization.

    Returns:
        List of decoded, whitespace-stripped predictions, one per source.
    """
    print(f"\nGenerating from {model_path}...")

    tok = AutoTokenizer.from_pretrained(model_path)
    net = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(DEVICE)
    net.eval()

    # shuffle=False keeps predictions aligned with the order of `sources`.
    batches = DataLoader(
        InferenceDataset(sources, tok, MAX_LENGTH, prefix),
        batch_size=BATCH_SIZE,
        shuffle=False,
    )

    # Decoding settings shared by every batch.
    generation_kwargs = dict(max_length=MAX_LENGTH, num_beams=4, early_stopping=True)

    translations = []
    with torch.no_grad():
        for batch in tqdm(batches):
            generated = net.generate(
                input_ids=batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE),
                **generation_kwargs,
            )
            for text in tok.batch_decode(generated, skip_special_tokens=True):
                translations.append(text.strip())

    # Drop the local references to the model and tokenizer, then ask torch to
    # release its cached CUDA memory before the next checkpoint is loaded.
    del net, tok
    torch.cuda.empty_cache()

    return translations

# Generate from each model
print("=" * 60, "GENERATING PREDICTIONS FROM ALL BYT5 CHECKPOINTS", "=" * 60, sep="\n")

# One assignment per checkpoint, run in sequence, so predictions that are
# already finished stay available if a later checkpoint fails.
preds_purist = generate_predictions(model_path=MODEL_PURIST, sources=val_sources, prefix=PREFIX)
preds_greedy = generate_predictions(model_path=MODEL_GREEDY, sources=val_sources, prefix=PREFIX)
preds_specialist = generate_predictions(model_path=MODEL_SPECIALIST, sources=val_sources, prefix=PREFIX)

GENERATING PREDICTIONS FROM ALL MODELS

Generating from /kaggle/input/notebook-a-byt5/byt5-base-saved...


  0%|          | 0/10 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers



Generating from /kaggle/input/notebook-b-t5/t5-base-fine-tuned...


  0%|          | 0/10 [00:00<?, ?it/s]


Generating from /kaggle/input/notebook-c-marian-mt/marian-mt-saved...




  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# -----------------------------------------------------------------------------
# EVALUATE INDIVIDUAL MODELS
# -----------------------------------------------------------------------------
def calculate_bleu(predictions, references):
    """Score ``predictions`` against ``references`` with the shared BLEU metric.

    Each reference string is wrapped in its own one-element list before being
    handed to ``bleu_metric.compute``.

    Returns:
        The metric's ``'bleu'`` value multiplied by 100 (a percentage).
    """
    single_reference_lists = [[reference] for reference in references]
    metric_output = bleu_metric.compute(
        predictions=predictions,
        references=single_reference_lists,
    )
    bleu_fraction = metric_output['bleu']
    return bleu_fraction * 100

print("\n" + "="*60)
print("INDIVIDUAL MODEL SCORES")
print("="*60)

# Score each checkpoint's predictions against the same validation references.
score_purist = calculate_bleu(preds_purist, val_targets)
score_greedy = calculate_bleu(preds_greedy, val_targets)
score_specialist = calculate_bleu(preds_specialist, val_targets)

print(f"ByT5-Purist:      BLEU = {score_purist:.2f}")
print(f"ByT5-Greedy:      BLEU = {score_greedy:.2f}")
print(f"ByT5-Specialist:  BLEU = {score_specialist:.2f}")

# Calculate proportional weights: each checkpoint's share of the summed BLEU.
total_score = score_purist + score_greedy + score_specialist
if total_score > 0:
    w_purist = score_purist / total_score
    w_greedy = score_greedy / total_score
    w_specialist = score_specialist / total_score
else:
    # Every checkpoint scored 0 BLEU, so there is nothing to apportion; fall
    # back to equal weights instead of raising ZeroDivisionError.
    w_purist = w_greedy = w_specialist = 1.0 / 3.0

print("\nProportional weights:")
print(f"  ByT5-Purist:      {w_purist:.4f}")
print(f"  ByT5-Greedy:      {w_greedy:.4f}")
print(f"  ByT5-Specialist:  {w_specialist:.4f}")


INDIVIDUAL MODEL SCORES
ByT5:     BLEU = 0.00
T5:       BLEU = 11.45
MarianMT: BLEU = 14.38

Proportional weights:
  ByT5:     0.0000
  T5:       0.4433
  MarianMT: 0.5567


In [None]:
# -----------------------------------------------------------------------------
# GRID SEARCH FOR OPTIMAL WEIGHTS
# -----------------------------------------------------------------------------
def ensemble_with_weights(preds1, preds2, preds3, w1, w2, w3, references):
    """Pick one candidate per sentence from three prediction lists and score it.

    For each index covered by ``references`` the three candidates are ranked by
    ``weight * number_of_whitespace_separated_tokens``; the highest-ranked one
    is kept (on a tie the earlier list wins, because ``max`` returns the first
    maximum).

    Returns:
        ``(score, ensemble_preds)``: the BLEU of the selected predictions as
        computed by ``calculate_bleu``, and the selected predictions themselves.
    """
    def weighted_length(candidate):
        weight, text = candidate
        return weight * len(text.split())

    chosen = [
        max(
            ((w1, preds1[i]), (w2, preds2[i]), (w3, preds3[i])),
            key=weighted_length,
        )[1]
        for i in range(len(references))
    ]

    return calculate_bleu(chosen, references), chosen

print("\n" + "="*60)
print("GRID SEARCH FOR OPTIMAL WEIGHTS")
print("="*60)

# Start below any attainable BLEU so the first evaluated combination is always
# recorded, even when every ensemble scores exactly 0.
best_score = float("-inf")
best_weights = None
best_predictions = None

# Search over weight combinations: w1 and w2 each take 0.2, 0.3, 0.4, 0.5.
# np.arange builds these by repeated float addition (e.g. 0.30000000000000004),
# so round to one decimal to get the intended grid values.
search_range = np.round(np.arange(0.2, 0.6, 0.1), 1)

results = []

for w1 in map(float, search_range):
    for w2 in map(float, search_range):
        # Round the remainder too: unrounded, 1.0 - 0.5 - 0.4 evaluates to
        # 0.09999999999999998 and 1.0 - 0.2 - 0.2 to 0.6000000000000001, which
        # made the bounds check below drop valid boundary combinations.
        w3 = round(1.0 - w1 - w2, 1)

        # Skip invalid combinations (third weight outside [0.1, 0.6])
        if w3 < 0.1 or w3 > 0.6:
            continue

        score, preds = ensemble_with_weights(
            preds_purist, preds_greedy, preds_specialist,
            w1, w2, w3,
            val_targets
        )

        results.append({
            'w_purist': w1,
            'w_greedy': w2,
            'w_specialist': w3,
            'bleu': score
        })

        if score > best_score:
            best_score = score
            best_weights = (w1, w2, w3)
            best_predictions = preds

if best_weights is None:
    # Fail with a clear message here rather than with a TypeError on
    # best_weights[0] in the summary below.
    raise RuntimeError(
        "Grid search did not record any weight combination; "
        "check search_range, the w3 bounds and the BLEU scores."
    )

# Display results
results_df = pd.DataFrame(results).sort_values('bleu', ascending=False)

print("\nTop 10 weight combinations:")
print(results_df.head(10).to_string(index=False))

print(f"\n{'='*60}")
print("BEST WEIGHTS FOUND")
print(f"{'='*60}")
print(f"ByT5-Purist:      {best_weights[0]:.4f}")
print(f"ByT5-Greedy:      {best_weights[1]:.4f}")
print(f"ByT5-Specialist:  {best_weights[2]:.4f}")
print(f"BLEU:             {best_score:.2f}")

# The "+" format flag prints an explicit sign for both gains and losses, so a
# regression reads "-2.36" instead of the malformed "+-2.36".
best_single = max(score_purist, score_greedy, score_specialist)
print(f"\nImprovement over best single model: {best_score - best_single:+.2f}")


GRID SEARCH FOR OPTIMAL WEIGHTS

Top 10 weight combinations:
 w1  w2  w3      bleu
0.3 0.2 0.5 12.016591
0.2 0.3 0.5 11.633018
0.4 0.2 0.4 10.533071
0.3 0.5 0.2 10.450838
0.3 0.3 0.4 10.041289
0.2 0.4 0.4  9.536001
0.2 0.5 0.3  8.606597
0.4 0.3 0.3  8.567405
0.5 0.2 0.3  8.515164
0.4 0.4 0.2  8.511833

BEST WEIGHTS FOUND
ByT5:     0.3000
T5:       0.2000
MarianMT: 0.5000
BLEU:     12.02

Improvement over best single model: +-2.36


In [None]:
# -----------------------------------------------------------------------------
# SAVE OPTIMAL WEIGHTS
# -----------------------------------------------------------------------------
# Everything worth persisting from this run. Numeric values are cast to the
# built-in float so the JSON dump does not depend on numpy scalar types.
optimal_config = {
    'weights': {
        'byt5_purist': float(best_weights[0]),
        'byt5_greedy': float(best_weights[1]),
        'byt5_specialist': float(best_weights[2])
    },
    'validation_bleu': float(best_score),
    'individual_scores': {
        'byt5_purist': float(score_purist),
        'byt5_greedy': float(score_greedy),
        'byt5_specialist': float(score_specialist)
    },
    'paths': {
        'byt5_purist': MODEL_PURIST,
        'byt5_greedy': MODEL_GREEDY,
        'byt5_specialist': MODEL_SPECIALIST
    }
}

print("\n" + "="*60)
print("RECOMMENDED CONFIGURATION")
print("="*60)
# Copy-paste template for the submission notebook. Doubled braces are literal
# braces in the f-string; max_length is taken from MAX_LENGTH so the template
# always matches the setting the weights were tuned with.
print(f"""
Copy these weights to your ensemble notebook:

MODEL_CONFIGS = {{
    "byt5_purist": {{
        "path": MODEL_PURIST,
        "prefix": "{PREFIX}",
        "max_length": {MAX_LENGTH},
        "weight": {best_weights[0]:.4f},
        "num_beams": 4
    }},
    "byt5_greedy": {{
        "path": MODEL_GREEDY,
        "prefix": "{PREFIX}",
        "max_length": {MAX_LENGTH},
        "weight": {best_weights[1]:.4f},
        "num_beams": 4
    }},
    "byt5_specialist": {{
        "path": MODEL_SPECIALIST,
        "prefix": "{PREFIX}",
        "max_length": {MAX_LENGTH},
        "weight": {best_weights[2]:.4f},
        "num_beams": 4
    }}
}}
""")

# Save to file (json is imported with the other modules in the import cell)
with open('optimal_weights.json', 'w', encoding='utf-8') as f:
    json.dump(optimal_config, f, indent=2)

print("✅ Optimal weights saved to optimal_weights.json")


RECOMMENDED CONFIGURATION

Copy these weights to your ensemble notebook:

MODEL_CONFIGS = {
    "byt5": {
        "path": MODEL1_PATH,
        "prefix": "translate Akkadian to English: ",
        "max_length": 512,
        "weight": 0.3000,
        "num_beams": 4
    },
    "t5": {
        "path": MODEL2_PATH,
        "prefix": "translate Akkadian to English: ",
        "max_length": 512,
        "weight": 0.2000,
        "num_beams": 4
    },
    "marian": {
        "path": MODEL3_PATH,
        "prefix": ">>eng<< ",
        "max_length": 512,
        "weight": 0.5000,
        "num_beams": 4
    }
}

✅ Optimal weights saved to optimal_weights.json


## How to Use These Results

1. **Copy the optimal weights** to your submission notebook
2. **Update MODEL_CONFIGS** with the recommended values
3. **Run the ensemble** on test data
4. **Submit** to competition

## Tips for Further Improvement

- Try different **num_beams** values (4, 6, 8)
- Adjust **max_length** per checkpoint based on output patterns
- Add **repetition_penalty** if outputs are repetitive
- Implement **sophisticated voting** (e.g., BLEU-based selection)
- Use **temperature** sampling for diversity

## Expected Performance

With optimal weights, expect:
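- **Validation BLEU**: the `BLEU` value printed under "BEST WEIGHTS FOUND" above (also stored as `validation_bleu` in `optimal_weights.json`)
- **Improvement over single best**: the value printed on the "Improvement over best single model" line; a negative value means the ensemble scored below the best individual checkpoint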
- **Test performance**: Should generalize well if validation set is representative
