In [None]:
# DEEP PAST CHALLENGE - SINGLE HYBRID PIPELINE (Retrieval + Neural Ensemble)
import os
import gc
import re
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

# Seed the torch and NumPy global RNGs so repeated runs start from the same state.
torch.manual_seed(42)
np.random.seed(42)

# -----------------------------------------------------------------------------
# CONFIG
# -----------------------------------------------------------------------------
# Directory of each fine-tuned seq2seq checkpoint, keyed by the model name that
# generate_predictions() and neural_vote() use. Every path can be overridden
# through the environment variable named in the matching os.getenv() call.
MODEL_PATHS = {
    "byt5": os.getenv("BYT5_PATH", "/kaggle/input/notebook-a-byt5/byt5-base-saved"),
    "t5": os.getenv("T5_PATH", "/kaggle/input/notebook-b-t5/t5-base-fine-tuned"),
    "marian": os.getenv("MARIAN_PATH", "/kaggle/input/notebook-c-marian-mt/marian-mt-saved"),
}

# Folder that main() reads train.csv and test.csv from.
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Number of inputs per DataLoader batch during generation.
BATCH_SIZE = 16
# Minimum cosine similarity between the concatenated test text and a single
# training row for main() to take the "global parent" slicing branch.
GLOBAL_MATCH_THRESHOLD = 0.80
# Minimum per-row cosine similarity for main() to prefer a retrieved training
# translation over the neural ensemble output.
LOCAL_MATCH_THRESHOLD = 0.75

# -----------------------------------------------------------------------------
# CLEANING HELPERS (match training notebooks)
# -----------------------------------------------------------------------------
def clean_translit(text):
    """Normalise a transliteration string before retrieval / model input.

    Non-string input (e.g. NaN) yields an empty string. Otherwise the text is
    passed through a fixed sequence of rewrites and returned with whitespace
    collapsed and the ends trimmed.

    The substitutions run strictly in the order listed; note that the
    angle-bracket rule comes after the gap markers are inserted, so an inserted
    ``<big_gap>`` ends up as the bare token ``big_gap``.
    """
    if not isinstance(text, str):
        return ""

    # The single-character ellipsis is handled with a plain replace first.
    cleaned = text.replace("…", " <big_gap> ")

    substitutions = (
        (r"\.\.\.+", " <big_gap> "),   # runs of three or more dots
        (r"\[[^\]]*\]", " "),          # drop [...] spans entirely
        (r"<([^>]*)>", r"\1"),         # keep the content of <...>, drop the brackets
        (r"[!?/]", " "),               # these marks become spaces
        (r"[:]", " "),                 # colons become spaces
        (r"\s+", " "),                 # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def clean_translation(text):
    """Tidy a translation string: drop ellipsis characters and squeeze whitespace.

    Returns an empty string for any non-string input (e.g. NaN).
    """
    if not isinstance(text, str):
        return ""
    without_ellipsis = text.replace("…", " ")
    return re.sub(r"\s+", " ", without_ellipsis).strip()

def postprocess_text(text):
    """Polish a decoded model output into sentence form.

    Steps: trim the ends, remove whitespace that precedes punctuation,
    upper-case a lower-case first character, and append a period when the text
    does not already end in ``.``, ``!`` or ``?``. Non-string input and text
    that is empty after trimming both come back as an empty string.
    """
    if not isinstance(text, str):
        return ""

    result = re.sub(r"\s+([.,!?;:])", r"\1", text.strip())
    if not result:
        # Nothing to capitalise or terminate.
        return result

    first = result[0]
    if first.islower():
        result = first.upper() + result[1:]

    if not result.endswith((".", "!", "?")):
        result = result + "."
    return result

# -----------------------------------------------------------------------------
# RETRIEVAL ENGINE
# -----------------------------------------------------------------------------
class RetrievalEngine:
    """Nearest-neighbour lookup of training translations by transliteration.

    Training transliterations are cleaned with ``clean_translit`` and indexed
    with a character n-gram TF-IDF vectorizer (``char_wb``, n-grams of 2-6).
    Queries are compared against that index with cosine similarity.

    Attributes:
        df: Copy of the training frame with an added ``transliteration_clean`` column.
        vectorizer: The fitted ``TfidfVectorizer``.
        train_vectors: TF-IDF matrix of the cleaned training transliterations.
    """

    def __init__(self, train_df):
        """Clean and index ``train_df`` (needs a ``transliteration`` column)."""
        frame = train_df.copy()  # never mutate the caller's frame
        frame["transliteration_clean"] = frame["transliteration"].fillna("").map(clean_translit)
        self.df = frame

        self.vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 6), min_df=1, sublinear_tf=True)
        self.train_vectors = self.vectorizer.fit_transform(frame["transliteration_clean"])

        print(f"[INFO] Indexed {self.train_vectors.shape[0]} training rows for retrieval.")

    def find_global_parent(self, test_clean_texts):
        """Match all test texts, joined with spaces, against single training rows.

        Returns:
            ``(translation, similarity)`` of the most similar training row.
        """
        joined_query = " ".join(test_clean_texts)
        query_vec = self.vectorizer.transform([joined_query])
        scores = cosine_similarity(query_vec, self.train_vectors).flatten()
        top = int(np.argmax(scores))
        top_row = self.df.iloc[top]
        return top_row["translation"], float(scores[top])

    def find_local_matches(self, test_clean_texts):
        """Find the most similar training row for each test text separately.

        Returns:
            One ``(translation, similarity)`` tuple per input text, in input order.
        """
        query_vecs = self.vectorizer.transform(test_clean_texts)
        score_matrix = cosine_similarity(query_vecs, self.train_vectors)
        return [
            (self.df.iloc[int(top)]["translation"], float(row_scores[int(top)]))
            for row_scores, top in zip(score_matrix, score_matrix.argmax(axis=1))
        ]

# -----------------------------------------------------------------------------
# DATASET + GENERATION
# -----------------------------------------------------------------------------
class TextDataset(Dataset):
    """Dataset that tokenizes one prefixed text per item, on demand.

    Args:
        texts: Iterable of inputs; each is converted with ``str()``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")``.
        max_len: Length every item is truncated / padded to.
        prefix: String prepended to every text (e.g. a task prompt).
    """

    def __init__(self, texts, tokenizer, max_len, prefix=""):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Prefixing happens once here so __getitem__ only has to tokenize.
        self.texts = [prefix + str(item) for item in texts]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids`` and ``attention_mask`` for item ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch axis of size one;
        # squeeze(0) removes it so the DataLoader can stack items itself.
        return {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}

def generate_predictions(model_name, model_path, inputs):
    """Translate ``inputs`` with one saved seq2seq checkpoint.

    Args:
        model_name: Label used for logging and to pick per-family settings
            (maximum length, input prefix, beam count).
        model_path: Directory of the saved tokenizer and model.
        inputs: Cleaned transliteration strings.

    Returns:
        One post-processed prediction per input. If ``model_path`` does not
        exist or loading fails, a list of empty strings of the same length is
        returned so the caller can still ensemble the remaining models.
    """
    if not os.path.exists(model_path):
        print(f"[WARNING] Missing model for {model_name}: {model_path}")
        return [""] * len(inputs)

    t5_prompt = "translate Akkadian to English: "
    # Order matters: "byt5" contains "t5", so it has to be tested first.
    family_settings = (
        ("byt5", (420, t5_prompt, 6)),
        ("t5", (280, t5_prompt, 6)),
    )
    max_len, prefix, beams = next(
        (settings for key, settings in family_settings if key in model_name),
        (180, "", 5),  # any other model name gets no prefix
    )

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(DEVICE).eval()
    except Exception as e:
        print(f"[ERROR] Failed to load {model_name}: {e}")
        return [""] * len(inputs)

    dataset = TextDataset(inputs, tokenizer, max_len, prefix)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    generation_kwargs = {
        "max_length": max_len,
        "num_beams": beams,
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.15,
        "length_penalty": 1.0,
        "early_stopping": True,
    }

    preds = []
    with torch.no_grad():
        for batch in tqdm(loader, desc=f"Inference {model_name}"):
            gen_ids = model.generate(
                input_ids=batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE),
                **generation_kwargs,
            )
            for text in tokenizer.batch_decode(gen_ids, skip_special_tokens=True):
                preds.append(postprocess_text(text))

    # Drop the references to this model before the next one is loaded.
    del model, tokenizer, dataset, loader
    torch.cuda.empty_cache()
    gc.collect()
    return preds

# -----------------------------------------------------------------------------
# ENSEMBLE
# -----------------------------------------------------------------------------
def neural_vote(preds_dict):
    """Pick, for each row, the prediction of the highest-scoring model.

    A model's score is its fixed weight (1.0 for names not in the table),
    reduced by 5.0 when its prediction for that row is shorter than five
    characters. Ties go to the model that appears first in ``preds_dict``.

    Args:
        preds_dict: Mapping of model name to its list of predictions. Lists may
            differ in length; a model is skipped for rows it does not cover.

    Returns:
        A list as long as the longest prediction list (empty for an empty dict).
    """
    model_weights = {"byt5": 5.0, "t5": 1.5, "marian": 1.0}

    if not preds_dict:
        return []
    row_count = max(len(model_preds) for model_preds in preds_dict.values())

    def candidate_score(candidate):
        name, text = candidate
        score = model_weights.get(name, 1.0)
        if len(text) < 5:
            # Very short outputs are treated as failed generations.
            score -= 5.0
        return score

    chosen = []
    for row in range(row_count):
        candidates = [
            (name, model_preds[row])
            for name, model_preds in preds_dict.items()
            if row < len(model_preds)
        ]
        # max() returns the first maximal element, i.e. first-listed model wins ties.
        _, winner = max(candidates, key=candidate_score)
        chosen.append(winner)
    return chosen

# -----------------------------------------------------------------------------
# SLICING FOR GLOBAL PARENT MATCH
# -----------------------------------------------------------------------------
def extract_translation_segment(translation, line_start, line_end, total_lines):
    """Cut out the part of ``translation`` that proportionally covers a line range.

    The line range ``line_start..line_end`` (1-based, inclusive) out of
    ``total_lines`` is mapped onto character offsets of ``translation``. Both
    offsets are then widened to the nearest period within 150 characters so the
    segment tends to start and end on a sentence boundary.

    Args:
        translation: Full translation text of the matched document.
        line_start: First line of the requested range.
        line_end: Last line of the requested range.
        total_lines: Number of lines the full translation is assumed to span.

    Returns:
        The stripped segment. If ``translation`` is not a string, ``""`` is
        returned; if ``total_lines`` is not positive, the translation is
        returned unchanged.
    """
    if not isinstance(translation, str) or total_lines <= 0:
        return translation if isinstance(translation, str) else ""

    length = len(translation)
    start_ratio = max(0, (line_start - 1) / total_lines)
    end_ratio = min(1, line_end / total_lines)
    raw_start = int(length * start_ratio)
    raw_end = int(length * end_ratio)
    start_char, end_char = raw_start, raw_end

    if start_char > 0:
        last_period = translation.rfind('.', max(0, start_char - 150), start_char)
        if last_period > 0:
            # Start right after the period. The previous "+ 2" assumed exactly
            # one character (a space) followed it and dropped the first letter
            # of the segment when the text continued immediately. Closing
            # quotes/brackets attached to the period belong to the previous
            # sentence and are skipped; whitespace is removed by strip() below.
            start_char = last_period + 1
            while start_char < length and translation[start_char] in "\"'”’)]»":
                start_char += 1

    if end_char < length:
        next_period = translation.find('.', end_char, min(length, end_char + 150))
        if next_period > 0:
            end_char = next_period + 1  # keep the period itself

    if start_char >= end_char:
        # Snapping collapsed the range; fall back to the plain proportional cut.
        start_char, end_char = raw_start, raw_end

    return translation[start_char:end_char].strip()

# -----------------------------------------------------------------------------
# MAIN PIPELINE
# -----------------------------------------------------------------------------
def main():
    """Run the hybrid retrieval + neural pipeline and write ``submission.csv``.

    Flow:
      1. Read ``train.csv`` / ``test.csv`` from ``DATA_DIR`` and clean the test
         transliterations.
      2. Compare the concatenated test text against every training row. If the
         best similarity reaches ``GLOBAL_MATCH_THRESHOLD``, each test row is
         answered with a proportional slice of that row's translation.
      3. Otherwise run the three neural models, ensemble them, and for each row
         prefer a retrieved training translation when its similarity reaches
         ``LOCAL_MATCH_THRESHOLD``.
      4. Replace outputs of length <= 1 with ``"Unknown"`` and save the CSV.

    Returns:
        ``(submission, sources)``: the submission DataFrame and a per-row list
        describing where each output came from.
    """
    print("=== Deep Past Hybrid Inference ===")
    train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
    test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

    test_inputs_raw = test_df["transliteration"].fillna("").astype(str).tolist()
    test_inputs = [clean_translit(t) for t in test_inputs_raw]

    retriever = RetrievalEngine(train_df)

    # Global parent search
    parent_translation, global_score = retriever.find_global_parent(test_inputs)
    print(f"[INFO] Global similarity: {global_score:.2%}")

    if global_score >= GLOBAL_MATCH_THRESHOLD:
        print("[INFO] Strong global match -> using proportional slicing.")
        # Without a line_end column, assume roughly ten lines per test row.
        total_lines = test_df["line_end"].max() if "line_end" in test_df.columns else len(test_df) * 10
        final_outputs = []
        for _, row in test_df.iterrows():
            # Missing or NaN line numbers default to line 1 / a single-line range.
            l_start = int(row.get("line_start", 1)) if not pd.isna(row.get("line_start", 1)) else 1
            l_end = int(row.get("line_end", l_start)) if not pd.isna(row.get("line_end", l_start)) else l_start
            final_outputs.append(extract_translation_segment(parent_translation, l_start, l_end, total_lines))
        sources = [f"Global Retrieval ({global_score:.2f})"] * len(final_outputs)
    else:
        print("[INFO] No strong global match -> hybrid local + neural.")
        neural_preds = {
            "byt5": generate_predictions("byt5", MODEL_PATHS["byt5"], test_inputs),
            "t5": generate_predictions("t5", MODEL_PATHS["t5"], test_inputs),
            "marian": generate_predictions("marian", MODEL_PATHS["marian"], test_inputs),
        }
        ensemble_preds = neural_vote(neural_preds)
        local_matches = retriever.find_local_matches(test_inputs)

        final_outputs, sources = [], []
        for i, (retr_text, score) in enumerate(local_matches):
            # Clean before deciding: a missing training translation arrives as
            # NaN, which is truthy, so testing the raw value would accept it and
            # then emit an empty string instead of the neural prediction.
            # clean_translation() maps NaN and whitespace-only text to "".
            retrieved = clean_translation(retr_text)
            if retrieved and score >= LOCAL_MATCH_THRESHOLD:
                final_outputs.append(retrieved)
                sources.append(f"Local Retrieval ({score:.2f})")
            else:
                final_outputs.append(ensemble_preds[i])
                sources.append("Neural Ensemble")

    submission = pd.DataFrame({"id": test_df["id"], "translation": final_outputs})
    # Never submit an empty or single-character translation.
    submission["translation"] = submission["translation"].apply(lambda x: x if len(str(x)) > 1 else "Unknown")
    submission.to_csv("submission.csv", index=False)

    print("\nPreview:")
    print(submission.head())
    return submission, sources

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

  text = re.sub(r'\s+([.,!?;:])', r'\1', text)


