In [1]:
# DEEP PAST CHALLENGE - SINGLE HYBRID PIPELINE (Neural Ensemble Only)
import os
import gc
import re
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm

torch.manual_seed(42)
np.random.seed(42)

# -----------------------------------------------------------------------------
# CONFIG
# -----------------------------------------------------------------------------
def resolve_path(env_name, working_default, input_fallback):
    path = os.getenv(env_name, working_default)
    if not os.path.exists(path) and os.path.exists(input_fallback):
        path = input_fallback
    return path

MODEL_PATHS = {
    "byt5": resolve_path("BYT5_PATH", "/kaggle/working/byt5-base-saved", "/kaggle/input/notebook-a-byt5/byt5-base-saved"),
    "t5": resolve_path("T5_PATH", "/kaggle/working/t5-base-fine-tuned", "/kaggle/input/notebook-b-t5/t5-base-fine-tuned"),
    "marian": resolve_path("MARIAN_PATH", "/kaggle/working/marian-mt-saved", "/kaggle/input/notebook-c-marian-mt/marian-mt-saved"),
}

DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16

# -----------------------------------------------------------------------------
# CLEANING HELPERS (match training notebooks)
# -----------------------------------------------------------------------------
SUBSCRIPT_TRANS = str.maketrans({"₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9", "ₓ": "x"})

def normalize_subscripts(text: str) -> str:
    return text.translate(SUBSCRIPT_TRANS)

def clean_translit(text):
    if not isinstance(text, str):
        return ""
    text = normalize_subscripts(text)
    text = text.replace("…", " <big_gap> ")
    text = re.sub(r"\\.\\.\\.+", " <big_gap> ", text)
    text = re.sub(r"\[[^\]]*\]", " ", text)
    text = re.sub(r"<<[^>]*>>", " ", text)
    text = re.sub(r"[˹˺]", " ", text)
    text = re.sub(r"\([^)]*\)", " ", text)
    text = re.sub(r"\{([^}]*)\}", r"\1", text)
    text = re.sub(r"<([^>]*)>", r"\1", text)
    text = re.sub(r"[!?/:·]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def clean_translation(text):
    if not isinstance(text, str):
        return ""
    text = text.replace("…", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def dedup_repeats(text: str) -> str:
    tokens = text.split()
    out = []
    for tok in tokens:
        if len(out) >= 2 and tok == out[-1] == out[-2]:
            continue
        out.append(tok)
    return " ".join(out)

def postprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.strip()
    text = re.sub(r"\s+([.,!?;:])", r"\1", text)
    text = re.sub(r"([.,!?;:])([A-Za-z])", r"\1 \2", text)
    text = dedup_repeats(text)
    if text and text[0].islower():
        text = text[0].upper() + text[1:]
    if text and text[-1] not in ".!?":
        text += "."
    text = re.sub(r"([.!?]){2,}", ".", text)
    return text.strip()

def sanitize_outputs(primary, backups):
    cleaned = []
    for i, text in enumerate(primary):
        cand = postprocess_text(text)
        if len(cand) < 5 or cand.lower() in {"unknown", ""}:
            fallbacks = [postprocess_text(b[i]) for b in backups if i < len(b) and len(postprocess_text(b[i])) >= 5]
            if fallbacks:
                cand = max(fallbacks, key=len)
            else:
                cand = "Unknown"
        cleaned.append(cand)
    return cleaned

# -----------------------------------------------------------------------------
# DATASET + GENERATION
# -----------------------------------------------------------------------------
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len, prefix=""):
        self.texts = [prefix + str(t) for t in texts]
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(self.texts[idx], truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

def generate_predictions(model_name, model_path, inputs):
    if not os.path.exists(model_path):
        print(f"[WARNING] Missing model for {model_name}: {model_path}")
        return [""] * len(inputs)

    if "byt5" in model_name:
        max_len = 420; prefix = "translate Akkadian to English: "; beams = 8
    elif "t5" in model_name:
        max_len = 280; prefix = "translate Akkadian to English: "; beams = 8
    else:
        max_len = 180; prefix = ""; beams = 6

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(DEVICE).eval()
    except Exception as e:
        print(f"[ERROR] Failed to load {model_name}: {e}")
        return [""] * len(inputs)

    dataset = TextDataset(inputs, tokenizer, max_len, prefix)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    preds = []
    with torch.no_grad():
        for batch in tqdm(loader, desc=f"Inference {model_name}"):
            gen_ids = model.generate(
                input_ids=batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE),
                max_length=max_len,
                min_length=6,
                num_beams=beams,
                no_repeat_ngram_size=3,
                repetition_penalty=1.08,
                length_penalty=1.05,
                early_stopping=True,
            )
            decoded = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
            preds.extend([postprocess_text(d) for d in decoded])

    del model, tokenizer, dataset, loader
    torch.cuda.empty_cache(); gc.collect()
    return preds

# -----------------------------------------------------------------------------
# ENSEMBLE
# -----------------------------------------------------------------------------
def score_candidate(text: str, src_len: int, base_weight: float) -> float:
    tok_len = max(1, len(text.split()))
    ratio = tok_len / max(1, src_len)
    length_penalty = -abs(ratio - 1.3) * 0.8  # prefer around 1.3x source tokens
    short_penalty = -3.0 if tok_len < 4 else 0.0
    garbage_penalty = -4.0 if text.lower() in {"unknown", ""} else 0.0
    return base_weight + length_penalty + short_penalty + garbage_penalty

def neural_vote(preds_dict, src_lens):
    weights = {"byt5": 4.0, "t5": 1.6, "marian": 1.2}
    final = []
    n = max(len(v) for v in preds_dict.values()) if preds_dict else 0
    for i in range(n):
        best_text, best_score = "", -1e9
        for name, preds in preds_dict.items():
            if i >= len(preds):
                continue
            score = score_candidate(preds[i], src_lens[i], weights.get(name, 1.0))
            if score > best_score:
                best_score, best_text = score, preds[i]
        final.append(best_text)
    return final

# -----------------------------------------------------------------------------
# MAIN PIPELINE (Neural-only)
# -----------------------------------------------------------------------------
def main():
    print("=== Deep Past Neural Ensemble Inference ===")
    test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

    test_inputs_raw = test_df["transliteration"].fillna("").astype(str).tolist()
    test_inputs = [clean_translit(t) for t in test_inputs_raw]
    src_lens = [len(t.split()) for t in test_inputs]

    neural_preds = {
        "byt5": generate_predictions("byt5", MODEL_PATHS["byt5"], test_inputs),
        "t5": generate_predictions("t5", MODEL_PATHS["t5"], test_inputs),
        "marian": generate_predictions("marian", MODEL_PATHS["marian"], test_inputs),
    }
    ensemble_preds = neural_vote(neural_preds, src_lens)

    final_outputs = sanitize_outputs(ensemble_preds, list(neural_preds.values()))
    submission = pd.DataFrame({"id": test_df["id"], "translation": final_outputs})
    submission["translation"] = submission["translation"].apply(lambda x: x if len(str(x)) > 1 else "Unknown")
    submission.to_csv("submission.csv", index=False)

    print("\nPreview:")
    print(submission.head())
    return submission

if __name__ == "__main__":
    main()

=== Deep Past Neural Ensemble Inference ===


2025-12-25 12:14:51.095337: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766664891.298313      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766664891.357816      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766664891.851037      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766664891.851082      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766664891.851085      55 computation_placer.cc:177] computation placer alr

Inference byt5:   0%|          | 0/1 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Inference t5:   0%|          | 0/1 [00:00<?, ?it/s]



Inference marian:   0%|          | 0/1 [00:00<?, ?it/s]


Preview:
   id                                        translation
0   0  Kà-ar-ma ú big_gap da-tim aí-ip-ri-ni Akkadian...
1   1  -ni i-na né-mì-lim da-aùr ú-lá e-WA ia-ra-tí-a...
2   2  -it a-aí-im au-um-au ia-tí aé-bi„-lá-nim Trans...
3   3  É-bi„-lá KÙ. AN Translate Akkadian to English:...
