# In [None]:
import sys

import os

import torch

import pandas as pd

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM



# ==========================================
# 🥣 MODEL SOUP CONFIGURATION
# ==========================================
# Saved-model directories produced by the three training notebooks.
# Attach each notebook's output dataset to this notebook ("Add Data" button)
# so that these relative paths resolve.
MODEL_DIRS = [
    "../input/notebook-a-output/byt5-base-saved",  # Notebook A
    "../input/notebook-b-output/byt5-greedy-saved",  # Notebook B
    "../input/notebook-c-output/byt5-specialist-saved",  # Notebook C
]

# Length cap shared by tokenization and generation; keep it equal to the
# length used during training.
MAX_LENGTH = 300

# Task prefix prepended to every source text before tokenization.
PREFIX = "translate Akkadian to English: "



# ==========================================
# 1. COOK THE SOUP (WEIGHT AVERAGING)
# ==========================================
# Uniform weight averaging: every floating-point tensor of the final model is
# the element-wise mean of the matching tensors of all checkpoints listed in
# MODEL_DIRS. All checkpoints must therefore share one architecture.
print(f"ðŸ¥£ Starting Model Soup with {len(MODEL_DIRS)} ingredients...")

# Load the first checkpoint; it doubles as the container that receives the
# averaged weights at the end.
print(f"Loading Base: {MODEL_DIRS[0]}")
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_DIRS[0], device_map="cpu", low_cpu_mem_usage=True
)
base_state = base_model.state_dict()

# Accumulate into private float32 copies rather than into the tensors that
# state_dict() returns. Those tensors share storage with the live parameters,
# and tied parameters (T5-family models expose the shared embedding under
# several keys) share storage with each other, so an in-place `+=` per key
# would add every participant to the same storage once per alias and skew the
# average. float32 accumulation also avoids precision loss for checkpoints
# stored in half precision.
running_sums = {
    key: tensor.detach().to(torch.float32, copy=True)
    for key, tensor in base_state.items()
    if tensor.is_floating_point()
}

# Add the remaining checkpoints: W_total = W_a + W_b + ...
for model_path in MODEL_DIRS[1:]:
    print(f"Merging Ingredients from: {model_path}")
    participant_model = AutoModelForSeq2SeqLM.from_pretrained(
        model_path, device_map="cpu", low_cpu_mem_usage=True
    )
    participant_state = participant_model.state_dict()

    # Fail early with a readable message if the architectures differ.
    missing_keys = running_sums.keys() - participant_state.keys()
    if missing_keys:
        raise KeyError(
            f"{model_path} lacks {len(missing_keys)} tensor(s) present in "
            f"{MODEL_DIRS[0]}, e.g. {sorted(missing_keys)[:3]}"
        )

    for key, total in running_sums.items():
        addend = participant_state[key]
        # Guard against silent broadcasting between differently-sized tensors.
        if addend.shape != total.shape:
            raise ValueError(
                f"Shape mismatch for '{key}': {tuple(addend.shape)} in "
                f"{model_path} vs {tuple(total.shape)} in {MODEL_DIRS[0]}"
            )
        total.add_(addend.to(torch.float32))

# Divide by the number of checkpoints to get the arithmetic mean, then cast
# each tensor back to the dtype the base model stores it in.
print("Mixing (Averaging weights)...")
num_ingredients = len(MODEL_DIRS)
soup_state_dict = {}
for key, tensor in base_state.items():
    if key in running_sums:
        soup_state_dict[key] = (running_sums[key] / num_ingredients).to(tensor.dtype)
    else:
        # Non-floating tensors (if the model has any) cannot be meaningfully
        # averaged; keep the base model's values.
        soup_state_dict[key] = tensor

# Load the averaged weights back into the base model
base_model.load_state_dict(soup_state_dict)
print("âœ… Model Soup Ready! This model contains the knowledge of all three.")

# Move to GPU when one is available, and switch to inference mode.
if torch.cuda.is_available():
    base_model = base_model.cuda()
base_model.eval()

# Load the tokenizer from the first checkpoint (the original author notes that
# all ByT5 tokenizers are identical).
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIRS[0])



# ==========================================

# 2. GENERATE PREDICTIONS

# ==========================================

def predict(text):
    """Translate a single source text with the averaged model.

    A missing value (anything ``pd.isna`` reports as NA) yields an empty
    string. Otherwise the text is prefixed with ``PREFIX``, tokenized with
    truncation at ``MAX_LENGTH``, passed to ``base_model.generate`` with
    5-beam search, and the first returned sequence is decoded with special
    tokens stripped.
    """
    if pd.isna(text):
        return ""

    encoded = tokenizer(
        PREFIX + text,
        max_length=MAX_LENGTH,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    # Inputs must live on the same device as the model.
    encoded = encoded.to(base_model.device)

    generation_options = dict(
        max_length=MAX_LENGTH,
        num_beams=5,
        early_stopping=True,
    )
    with torch.no_grad():
        output_ids = base_model.generate(**encoded, **generation_options)

    best_sequence = output_ids[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)



# Load the competition test set.
test_path = "/kaggle/input/deep-past-initiative-machine-translation/test.csv"
test_df = pd.read_csv(test_path)
print(f"Generating predictions for {len(test_df)} samples...")

# Run inference row by row, collecting ids and translations in parallel lists.
submission_ids, predictions = [], []
for idx, row in test_df.iterrows():
    translated = predict(row['transliteration'])
    submission_ids.append(row['id'])
    predictions.append(translated)
    # Progress message every 50 rows (idx is the DataFrame index label).
    if not idx % 50:
        print(f"Processed {idx} samples...")

# ==========================================
# 3. SAVE SUBMISSION
# ==========================================
# Two-column CSV (id, translation) written to the working directory.
submission = pd.DataFrame(dict(id=submission_ids, translation=predictions))
submission.to_csv("submission.csv", index=False)
print("ðŸŽ‰ Submission saved successfully! Ready to submit.")
