In [1]:
'''
# ==================================================================================
#   DEEP PAST CHALLENGE - FINAL ROBUST ENSEMBLE
#   ------------------------------------------------------------------------------
#   Logic: ByT5 (Best Morphology) + T5 (Best Grammar) + Marian (Fluency)
#   Metric: GeoMean(BLEU, chrF++) optimized via weighted voting & keyword validation.
# ==================================================================================

import os
import gc
import sys
import re
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
import difflib

# ==================================================================================
# 1. CONFIGURATION (!!! UPDATE THESE PATHS !!!)
# ==================================================================================
# Check the "Data" tab in Kaggle to find the exact paths to your saved models.
# They will look something like "/kaggle/input/your-notebook-name/byt5-base-saved"

MODEL_PATHS = {
    "byt5":   "/kaggle/input/notebook-a-byt5/byt5-base-saved",       # Update this
    "t5":     "/kaggle/input/notebook-b-t5/t5-base-fine-tuned",       # Update this
    "marian": "/kaggle/input/notebook-c-marian-mt/marian-mt-saved"   # Update this
}

DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16

# ==================================================================================
# 2. ADVANCED CLEANING & UTILS
# ==================================================================================

def clean_prediction(text):
    """Tidy a raw model output string.

    Non-string input yields "". Otherwise the text is stripped, whitespace
    in front of . , ! ? ; : is removed, the first character is upper-cased,
    and a "." is appended unless the text already ends in ".", "!" or "?".
    """
    if not isinstance(text, str):
        return ""

    # Fix punctuation spacing (e.g., "city ." -> "city.")
    cleaned = re.sub(r'\s+([.,!?;:])', r'\1', text.strip())
    if not cleaned:
        return cleaned

    # Capitalize the first letter, then make sure the sentence is terminated
    cleaned = cleaned[0].upper() + cleaned[1:]
    if not cleaned.endswith((".", "!", "?")):
        cleaned += "."
    return cleaned

def is_garbage(text, source_text=""):
    """Heuristically flag a prediction as a failed or hallucinated output.

    A prediction is rejected when it is shorter than 3 characters, when it is
    longer than 20 characters yet made of fewer than 3 distinct
    whitespace-separated tokens (repetition loop), or when its
    case-insensitive difflib similarity ratio to source_text exceeds 0.8
    (the model copied the input).
    """
    if len(text) < 3:
        return True

    distinct_tokens = set(text.split())
    if len(text) > 20 and len(distinct_tokens) < 3:
        return True

    matcher = difflib.SequenceMatcher(None, text.lower(), source_text.lower())
    return matcher.ratio() > 0.8

# ==================================================================================
# 3. EFFICIENT INFERENCE ENGINE
# ==================================================================================

class TextDataset(Dataset):
    """Dataset that tokenizes one prefixed text per item, padded to max_len."""

    def __init__(self, texts, tokenizer, max_len, prefix=""):
        # Each entry is stringified and prefixed once, at construction time.
        prefixed = []
        for raw in texts:
            prefixed.append(prefix + str(raw))
        self.texts = prefixed
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # squeeze(0) removes dimension 0 when it has size 1 (presumably the
        # batch axis the tokenizer adds for a single text - TODO confirm).
        item = {}
        for key in ("input_ids", "attention_mask"):
            item[key] = encoded[key].squeeze(0)
        return item

def generate_predictions(model_name, model_path, inputs):
    """Load one seq2seq checkpoint, translate all inputs, then release it.

    Args:
        model_name: Model key. A name containing "byt5" uses max length 512,
            one containing "t5" uses 256 (both with the task prefix
            "translate Akkadian to English: "); any other name is treated as
            Marian with max length 128 and no prefix.
        model_path: Directory handed to from_pretrained.
        inputs: Sequence of source texts.

    Returns:
        One cleaned prediction per input, or a list of empty strings of the
        same length when the path is missing or the model fails to load.
    """
    print(f"\n[INFO] Processing with {model_name.upper()}...")

    if not os.path.exists(model_path):
        print(f"[WARNING] Path not found: {model_path}. Skipping.")
        return [""] * len(inputs)

    # 1. Config based on model type. "byt5" has to be tested first because
    #    "t5" is a substring of it.
    if "byt5" in model_name:
        max_len = 512
        prefix = "translate Akkadian to English: "
    elif "t5" in model_name:
        max_len = 256
        prefix = "translate Akkadian to English: "
    else:  # Marian
        max_len = 128
        prefix = ""

    # 2. Load (best effort: one broken checkpoint must not abort the ensemble)
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(DEVICE).eval()
    except Exception as e:
        print(f"[ERROR] Failed to load {model_name}: {e}")
        return [""] * len(inputs)

    # 3. Predict
    dataset = TextDataset(inputs, tokenizer, max_len, prefix)
    # num_workers=0 keeps tokenization in the main process; the hybrid
    # revision of this pipeline notes that this prevents the "semaphore"
    # error seen on Kaggle with worker subprocesses.
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    preds = []
    with torch.no_grad():
        for batch in tqdm(loader, desc=f"Inference {model_name}"):
            input_ids = batch["input_ids"].to(DEVICE)
            mask = batch["attention_mask"].to(DEVICE)

            # Beam search with penalties for repetition (Crucial for score)
            gen_ids = model.generate(
                input_ids=input_ids,
                attention_mask=mask,
                max_length=max_len,
                num_beams=5,               # Higher beam = better quality
                no_repeat_ngram_size=3,    # Prevent "and the and the"
                repetition_penalty=1.2,    # Penalty for loops
                early_stopping=True
            )
            decoded = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
            preds.extend([clean_prediction(d) for d in decoded])

    # 4. Cleanup (CRITICAL for 16GB GPU). Collect garbage first so that any
    #    tensors it frees are returned to the allocator before the CUDA cache
    #    is emptied.
    del model, tokenizer, dataset, loader
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return preds

# ==================================================================================
# 4. DOMAIN-AWARE ENSEMBLE STRATEGY
# ==================================================================================

# Dictionary of high-value Akkadian keywords
KEYWORDS = {
    "qi-bi-ma": ["speak", "tell", "say"],
    "um-ma": ["thus"],
    "a-na": ["to", "for"],
    "ku-babbar": ["silver", "money"],
    "ku-gi": ["gold"],
    "a-lim": ["city"],
    "e-gal": ["palace"],
    "dam-qar": ["merchant", "agent"],
    "tup-pi": ["tablet", "letter", "document", "record"],
    "be-li": ["lord", "master", "boss"],
    "a-hi": ["brother", "partner"],
    "i-na": ["in", "from", "on"],
    "su-be2-el": ["send"],
    "u2-bi-il": ["brought", "carried"],
    "li-bi-shi": ["should be", "let it be"],
    "mi3-ma": ["anything", "something", "property"]
}

def ensemble_vote(inputs, predictions_dict):
    """Pick one translation per input by scoring each model's candidate.

    Score of a candidate:
      1. Base weight from MODEL_WEIGHTS (byt5 3.0, t5 2.0, marian 1.0;
         1.0 for any other model name).
      2. -10 when is_garbage() flags the text.
      3. +2 for every KEYWORDS entry whose Akkadian key occurs in the
         lower-cased source and one of whose English glosses occurs in the
         candidate as a whole word or phrase.
      4. -1 when the candidate is under 10 characters while the source is
         over 20 characters.

    Args:
        inputs: Source transliterations.
        predictions_dict: Maps a model name to its list of predictions,
            aligned with inputs. Empty predictions are ignored.

    Returns:
        A list with one translation per input. "Broken text." is used only
        when no model produced any text for a row.
    """
    final_translations = []

    # Trust weights based on your training logs (ByT5 was superior)
    MODEL_WEIGHTS = {"byt5": 3.0, "t5": 2.0, "marian": 1.0}

    print("\n[INFO] Running Weighted Ensemble Voting...")

    for i, raw_src in enumerate(tqdm(inputs)):
        src = raw_src.lower()
        candidates = {
            name: preds[i]
            for name, preds in predictions_dict.items()
            if preds[i]  # Only consider if prediction exists
        }

        if not candidates:
            final_translations.append("Broken text.")
            continue

        best_score = float("-inf")
        best_text = ""

        for model_name, text in candidates.items():
            # A. Base Score (Model Confidence)
            score = MODEL_WEIGHTS.get(model_name, 1.0)

            # B. Filter Garbage
            if is_garbage(text, src):
                score -= 10  # Heavily penalize garbage

            # C. Keyword Bonus. Glosses are matched against space-padded
            #    word tokens so that short glosses such as "in", "on" or "to"
            #    no longer match inside unrelated words ("king", "money").
            padded = " " + " ".join(re.findall(r"[a-z0-9']+", text.lower())) + " "
            for akk, eng_list in KEYWORDS.items():
                if akk in src and any(f" {eng} " in padded for eng in eng_list):
                    score += 2.0

            # D. Length Penalty (Too short is usually bad, unless input is short)
            if len(text) < 10 and len(src) > 20:
                score -= 1.0

            if score > best_score:
                best_score = score
                best_text = text

        # Fallback if every score is negative (all garbage): prefer ByT5,
        # then T5. When neither exists, best_text already holds the
        # highest-scoring remaining candidate, which beats a placeholder.
        if best_score < 0:
            for preferred in ("byt5", "t5"):
                if preferred in candidates:
                    best_text = candidates[preferred]
                    break

        final_translations.append(best_text)

    return final_translations

# ==================================================================================
# 5. EXECUTION PIPELINE
# ==================================================================================

def main():
    """Run the ensemble pipeline end to end and write submission.csv.

    Reads test.csv from DATA_DIR, runs the three models one after another,
    votes on the results and saves the id/translation table.
    """
    print("=== STARTING INFERENCE PIPELINE ===")

    # 1. Load Test Data
    test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
    inputs = test_df["transliteration"].fillna("").astype(str).tolist()
    ids = test_df["id"].tolist()

    print(f"Loaded {len(inputs)} test sentences.")

    # 2. Run Inference, one model at a time to save memory
    #    (ByT5 first, then T5, then Marian)
    all_preds = {}
    for name in ("byt5", "t5", "marian"):
        all_preds[name] = generate_predictions(name, MODEL_PATHS[name], inputs)

    # 3. Ensemble
    final_preds = ensemble_vote(inputs, all_preds)

    # 4. Save Submission
    sub = pd.DataFrame({"id": ids, "translation": final_preds})

    def _or_placeholder(value):
        # Anything whose string form is at most one character is replaced.
        if len(str(value)) > 1:
            return value
        return "Broken text."

    sub["translation"] = sub["translation"].apply(_or_placeholder)

    sub.to_csv("submission.csv", index=False)
    print("\n[SUCCESS] submission.csv generated successfully.")
    print(sub.head())

if __name__ == "__main__":
    main()

''' 

  text = re.sub(r'\s+([.,!?;:])', r'\1', text)




In [2]:
'''
# ==================================================================================
#   DEEP PAST CHALLENGE - HYBRID SOLUTION (Retrieval + Neural Ensemble)
#   ------------------------------------------------------------------------------
#   Strategy:
#   1. CHECK: Is this input at least 75% similar (RETRIEVAL_THRESHOLD) to a training sentence?
#      -> YES: Use the existing human translation (Perfect accuracy)
#      -> NO:  Use the Neural Ensemble (ByT5 + T5 + Marian)
# ==================================================================================

import os
import gc
import re
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import difflib

# ==================================================================================
# 1. CONFIGURATION
# ==================================================================================
MODEL_PATHS = {
    # UPDATE THESE EXACT PATHS based on your Kaggle Input names
    "byt5":   "/kaggle/input/notebook-a-byt5/byt5-base-saved",
    "t5":     "/kaggle/input/notebook-b-t5/t5-base-fine-tuned", 
    "marian": "/kaggle/input/notebook-c-marian-mt/marian-mt-saved"
}

# Where the competition data lives (for Retrieval Database)
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16

# SIMILARITY THRESHOLD
# If an input is > 75% similar to a training sentence, we trust the training data
# over the AI.
RETRIEVAL_THRESHOLD = 0.75 

# ==================================================================================
# 2. RETRIEVAL ENGINE (The "User 2" Logic)
# ==================================================================================
class RetrievalEngine:
    """Nearest-neighbour lookup of training translations via character TF-IDF."""

    def __init__(self, train_csv_path):
        """Read the training CSV and index its 'transliteration' column."""
        print("[INFO] Building Retrieval Database...")
        self.df = pd.read_csv(train_csv_path)

        # Replace missing cells before stringifying: astype(str) alone would
        # index NaN as the literal text "nan" and hand back a float NaN as a
        # translation.
        self.df['transliteration'] = self.df['transliteration'].fillna("").astype(str)
        self.df['translation'] = self.df['translation'].fillna("").astype(str)

        # 1. Vectorize the Training Data (Character n-grams capture spelling)
        self.vectorizer = TfidfVectorizer(
            analyzer='char_wb',
            ngram_range=(2, 6),
            min_df=1
        )
        self.train_vectors = self.vectorizer.fit_transform(self.df['transliteration'])
        print(f"[INFO] Indexed {self.train_vectors.shape[0]} training documents.")

    def find_match(self, input_text):
        """Find the training row most similar to input_text.

        Returns:
            (best_translation, similarity_score), or (None, 0.0) when the
            best cosine similarity is not positive.
        """
        # Vectorize input
        input_vec = self.vectorizer.transform([input_text])

        # Calculate similarity against ALL training data
        similarities = cosine_similarity(input_vec, self.train_vectors).flatten()

        # Get best match
        best_idx = np.argmax(similarities)
        best_score = similarities[best_idx]

        if best_score > 0:
            return self.df.iloc[best_idx]['translation'], best_score
        return None, 0.0

# ==================================================================================
# 3. NEURAL ENGINE (The "User 1" Logic - Your Models)
# ==================================================================================

class TextDataset(Dataset):
    """Dataset yielding tokenized, max_len-padded tensors for prefixed texts."""

    def __init__(self, texts, tokenizer, max_len, prefix=""):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Stringify and prefix every entry up front.
        self.texts = list(map(lambda raw: prefix + str(raw), texts))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_kwargs = {
            "truncation": True,
            "padding": "max_length",
            "max_length": self.max_len,
            "return_tensors": "pt",
        }
        enc = self.tokenizer(self.texts[idx], **tokenizer_kwargs)
        # squeeze(0) removes dimension 0 when it has size 1 (presumably the
        # batch axis the tokenizer adds for a single text - TODO confirm).
        input_ids = enc["input_ids"].squeeze(0)
        attention_mask = enc["attention_mask"].squeeze(0)
        return {"input_ids": input_ids, "attention_mask": attention_mask}

def clean_prediction(text):
    """Strip the text, remove whitespace before punctuation, capitalize it.

    Non-string input yields "". Only a lower-case first character is changed.
    """
    if not isinstance(text, str):
        return ""

    # Fix spacing around punctuation
    tidy = re.sub(r'\s+([.,!?;:])', r'\1', text.strip())

    head, tail = tidy[:1], tidy[1:]
    if head.islower():
        return head.upper() + tail
    return tidy

def run_neural_inference(model_name, model_path, inputs):
    """Translate every input with one checkpoint, then release the model.

    Returns one cleaned prediction per input, or a list of empty strings of
    the same length when model_path does not exist or loading fails.
    """
    print(f"\n[INFO] Neural Inference: {model_name.upper()}...")

    if not os.path.exists(model_path):
        print(f"[WARNING] Path not found: {model_path}")
        return [""] * len(inputs)

    # Config: "byt5" is tested before "t5" because the latter is a substring
    # of the former; anything else gets the shortest limit and no prefix.
    t5_prefix = "translate Akkadian to English: "
    if "byt5" in model_name:
        max_len, prefix = 400, t5_prefix
    elif "t5" in model_name:
        max_len, prefix = 256, t5_prefix
    else:
        max_len, prefix = 160, ""

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(DEVICE).eval()
    except Exception as e:
        print(f"[ERROR] Could not load {model_name}: {e}")
        return [""] * len(inputs)

    dataset = TextDataset(inputs, tokenizer, max_len, prefix)

    # num_workers=0 prevents the "semaphore" error in Kaggle
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    preds = []
    with torch.no_grad():
        for batch in tqdm(loader, desc=model_name):
            ids = batch["input_ids"].to(DEVICE)
            mask = batch["attention_mask"].to(DEVICE)
            gen_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=max_len,
                num_beams=4,
                early_stopping=True,
            )
            for sentence in tokenizer.batch_decode(gen_ids, skip_special_tokens=True):
                preds.append(clean_prediction(sentence))

    # Aggressive cleanup to avoid OOM before the next model is loaded
    del model, tokenizer, dataset, loader
    torch.cuda.empty_cache()
    gc.collect()

    return preds

# ==================================================================================
# 4. ENSEMBLE VOTING
# ==================================================================================
def neural_vote(inputs, preds_dict):
    """Choose, per input, the highest-scoring non-empty model prediction.

    Each candidate starts from its model weight (byt5 5.0, t5 1.5,
    marian 1.0; 1.0 for other names) and loses 2.0 when it is shorter than
    10 characters. Empty or whitespace-only predictions - what a model that
    failed to load contributes - are skipped, so they can no longer win over
    a real translation from a lower-weighted model. With these weights any
    non-empty ByT5 output still beats the other models.

    Args:
        inputs: Source texts; only their count is used.
        preds_dict: Maps model name to its list of predictions. Lists shorter
            than inputs simply contribute no candidate for the missing rows.

    Returns:
        One string per input; "" when no model produced text for that row.
    """
    # Weights as given by the original author's validation-loss notes:
    # ByT5 (Loss 0.47) >> T5 (Loss 1.88) >> Marian (Loss 2.10)
    WEIGHTS = {"byt5": 5.0, "t5": 1.5, "marian": 1.0}

    final_neural_preds = []
    for i in range(len(inputs)):
        best_text = ""
        # -inf rather than -1 so that a lone short Marian candidate
        # (1.0 - 2.0 = -1.0) is still preferred over returning nothing.
        best_score = float("-inf")

        for model, preds in preds_dict.items():
            if i >= len(preds):
                continue
            text = preds[i]
            if not isinstance(text, str) or not text.strip():
                continue

            score = WEIGHTS.get(model, 1.0)

            # Penalize very short answers (hallucinations)
            if len(text) < 10:
                score -= 2.0

            if score > best_score:
                best_score = score
                best_text = text

        final_neural_preds.append(best_text)

    return final_neural_preds

# ==================================================================================
# 5. MAIN PIPELINE (THE LOGIC GATE)
# ==================================================================================
def main():
    """Run the hybrid pipeline: retrieval when similar enough, else neural.

    Writes submission.csv and prints, for every row, which source supplied
    the answer.
    """
    # A. Load Test Data
    test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
    inputs = test_df["transliteration"].fillna("").astype(str).tolist()
    ids = test_df["id"].tolist()
    print(f"Loaded {len(inputs)} test inputs.")

    # B. Initialize Retrieval Engine
    retriever = RetrievalEngine(f"{DATA_DIR}/train.csv")

    # C. Run Neural Models (all of them first, the decision comes later)
    neural_preds = {}
    for name in ("byt5", "t5", "marian"):
        neural_preds[name] = run_neural_inference(name, MODEL_PATHS[name], inputs)

    # D. Voting
    ensemble_neural = neural_vote(inputs, neural_preds)

    # E. FINAL HYBRID DECISION
    final_outputs = []
    sources = []  # Track where the answer came from

    print("\n[INFO] Making Hybrid Decisions...")
    for i, inp in enumerate(inputs):
        retrieval_text, score = retriever.find_match(inp)

        if score < RETRIEVAL_THRESHOLD:
            # Low similarity? Trust the AI.
            final_outputs.append(ensemble_neural[i])
            sources.append("Neural Ensemble")
            continue

        # High similarity? Trust the database.
        final_outputs.append(retrieval_text)
        sources.append(f"Retrieval ({score:.2f})")

    # F. Save
    submission = pd.DataFrame({"id": ids, "translation": final_outputs})

    def _or_placeholder(value):
        # Fallback for (near-)empty strings
        if len(str(value)) > 2:
            return value
        return "Broken Text"

    submission["translation"] = submission["translation"].apply(_or_placeholder)
    submission.to_csv("submission.csv", index=False)

    # Print Diagnostics
    banner = "=" * 40
    print("\n" + banner)
    print("DECISION SUMMARY")
    print(banner)
    for row_id, source, inp, out in zip(ids, sources, inputs, final_outputs):
        print(f"ID {row_id} | Source: {source}")
        print(f"Input: {inp[:50]}...")
        print(f"Output: {out[:100]}...")
        print("-" * 20)

if __name__ == "__main__":
    main()

''' 

  text = re.sub(r'\s+([.,!?;:])', r'\1', text)




In [3]:
'''
# ==================================================================================
#    DEEP PAST CHALLENGE - PURE RETRIEVAL SOLUTION (Segment Slicing Strategy)
#    ------------------------------------------------------------------------------
#    Strategy based on the specific dataset insight:
#    1. All test rows are actually parts of ONE single ancient text.
#    2. We concatenate all test inputs to find that "Parent Text" in the training set.
#    3. We don't translate sentence-by-sentence; we take the "Parent Translation"
#       and slice it proportionally based on line numbers.
# ==================================================================================

import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings

warnings.filterwarnings('ignore')

# ==================================================================================
# 1. CONFIGURATION & DATA LOADING
# ==================================================================================
DATA_DIR = '/kaggle/input/deep-past-initiative-machine-translation'
TRAIN_PATH = f'{DATA_DIR}/train.csv'
TEST_PATH = f'{DATA_DIR}/test.csv'

def load_data():
    """Read the train and test CSVs.

    Returns (train_df, test_df), or (None, None) after printing the error
    when anything inside the loading step raises.
    """
    print(f"[INFO] Loading data from {DATA_DIR}...")
    try:
        train_df, test_df = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)
        n_train, n_test = len(train_df), len(test_df)
        print(f"Train samples: {n_train}")
        print(f"Test samples: {n_test}")
    except Exception as e:
        print(f"[ERROR] Could not load data: {e}")
        return None, None
    return train_df, test_df

# ==================================================================================
# 2. RETRIEVAL LOGIC (Find the "Parent" Text)
# ==================================================================================
def find_best_parent_text(train_df, test_df):
    """Match the concatenated test transliterations against every train row.

    All test segments are joined with spaces into one text, which is compared
    (lower-cased, character n-gram TF-IDF, cosine similarity) with each
    training transliteration.

    Returns:
        (translation of the most similar training row, its similarity score)
    """
    print("[INFO] Building TF-IDF Vectorizer (Char n-grams 2-6)...")

    # 1. One combined query text instead of many short segments
    test_segments = test_df['transliteration'].fillna("").astype(str).tolist()
    full_test_text = ' '.join(test_segments)

    # 2. Vectorizer settings
    tfidf_options = dict(
        analyzer='char_wb',      # Character n-grams with word boundaries
        ngram_range=(2, 6),      # 2 to 6 character sequences
        max_features=25000,
        sublinear_tf=True,       # Log scaling of term frequencies
    )
    vectorizer = TfidfVectorizer(**tfidf_options)

    # 3. Fit on train, transform the single query
    print("[INFO] Vectorizing Training Data...")
    train_texts = train_df['transliteration'].fillna("").astype(str).str.lower()
    train_vectors = vectorizer.fit_transform(train_texts)
    test_vector = vectorizer.transform([full_test_text.lower()])

    # 4. Best match
    print("[INFO] Calculating Similarity...")
    similarities = cosine_similarity(test_vector, train_vectors)[0]
    best_idx = np.argmax(similarities)
    best_score = similarities[best_idx]

    best_row = train_df.iloc[best_idx]
    best_transliteration = best_row['transliteration']  # looked up but not used further
    best_translation = best_row['translation']

    print(f"\n[RESULT] Best Match Found!")
    print(f"Similarity Score: {best_score:.4f} ({best_score*100:.1f}%)")
    # Note: this is the positional row index, not a value from an id column.
    print(f"Matched Train ID: {best_idx}")

    return best_translation, best_score

# ==================================================================================
# 3. SEGMENTATION LOGIC (The "Slicer")
# ==================================================================================
def extract_translation_segment(translation, line_start, line_end, total_lines):
    """Cut the part of a parent translation that corresponds to a line range.

    The character span is proportional to (line_start - 1) / total_lines and
    line_end / total_lines, then widened to sentence boundaries: the start
    moves back to just after the nearest period within 150 characters, the
    end moves forward to the nearest period (or, failing that, space) within
    150 characters.

    Args:
        translation: Full parent translation. Anything that is not a string
            (None, NaN, ...) yields "".
        line_start: First line of the segment (1-based).
        line_end: Last line of the segment.
        total_lines: Number of lines in the parent text; when it is not
            positive the whole translation is returned.

    Returns:
        The stripped segment text.
    """
    if not isinstance(translation, str):
        # A missing cell read from CSV is a float NaN, which is truthy and
        # must not be passed through as a "translation".
        return ""
    if total_lines <= 0:
        return translation

    length = len(translation)

    # 1. Calculate proportional positions
    start_ratio = max(0, (line_start - 1) / total_lines)
    end_ratio = min(1, line_end / total_lines)
    orig_start = int(length * start_ratio)
    orig_end = int(length * end_ratio)
    start_char, end_char = orig_start, orig_end

    # 2. Boundary refinement (snap to a period to avoid cutting sentences)

    # Adjust start: look backwards up to 150 chars for a period
    if start_char > 0:
        last_period = translation.rfind('.', max(0, start_char - 150), start_char)
        if last_period != -1:
            # Step past the period only; a following space is removed by the
            # final strip(), and a non-space character is no longer dropped.
            start_char = last_period + 1

    # Adjust end: look forwards up to 150 chars for a period
    if end_char < length:
        search_end = min(length, end_char + 150)
        next_period = translation.find('.', end_char, search_end)
        if next_period != -1:
            end_char = next_period + 1  # Include the period
        else:
            # Fallback: try to end at a space if no period found
            space_pos = translation.find(' ', end_char, search_end)
            if space_pos != -1:
                end_char = space_pos

    # 3. Safety check: if snapping inverted the span, revert to the strict
    #    proportional cut
    if start_char >= end_char:
        start_char, end_char = orig_start, orig_end

    return translation[start_char:end_char].strip()

# ==================================================================================
# 4. MAIN PIPELINE
# ==================================================================================
def main():
    """Retrieval-only pipeline: find the parent text, slice it per test row.

    Writes submission.csv with one sliced segment per test row.
    """
    # A. Load
    train_df, test_df = load_data()
    if train_df is None:
        return

    # B. Find the Parent Text (Retrieval)
    best_translation, similarity = find_best_parent_text(train_df, test_df)

    # C. Generate Segmented Translations
    print(f"\n[INFO] Slicing translation based on line numbers...")

    # Total lines: the largest line_end value, or 10 lines per row when the
    # column is missing
    has_line_end = 'line_end' in test_df.columns
    total_lines = test_df['line_end'].max() if has_line_end else len(test_df) * 10

    predictions = []
    for idx, row in test_df.iterrows():
        # Fall back to 10 lines per row when the metadata is absent
        if 'line_start' in row:
            l_start = row['line_start']
        else:
            l_start = idx * 10
        if 'line_end' in row:
            l_end = row['line_end']
        else:
            l_end = (idx + 1) * 10

        predictions.append(
            extract_translation_segment(best_translation, l_start, l_end, total_lines)
        )

    # D. Save Submission
    submission = pd.DataFrame({'id': test_df['id'], 'translation': predictions})

    def _or_placeholder(value):
        # Fallback for empty strings
        if len(str(value)) > 1:
            return value
        return "Translation unavailable"

    submission['translation'] = submission['translation'].apply(_or_placeholder)
    submission.to_csv('submission.csv', index=False)

    rule = "=" * 50
    print("\n" + rule)
    print("SUCCESS: Retrieval-only submission generated.")
    print(rule)
    print(submission.head())

if __name__ == "__main__":
    main()
'''



In [4]:
# ==================================================================================
#    DEEP PAST CHALLENGE - ULTIMATE HYBRID SOLUTION
#    ------------------------------------------------------------------------------
#    Logic Hierarchy:
#    1. GLOBAL CHECK: Does the combined test set match a single "Parent" in Train?
#       -> YES (>80% match): Use Segment Slicing (Best for contiguous texts).
#       -> NO: Fall back to Sentence-by-Sentence logic.
#
#    2. SENTENCE CHECK: For each individual row...
#       -> MATCH FOUND (>75%): Use Translation Memory (Train Database).
#       -> NO MATCH: Use Neural Ensemble (ByT5 + T5 + Marian).
# ==================================================================================

import os
import gc
import re
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

# ==================================================================================
# 1. CONFIGURATION
# ==================================================================================
MODEL_PATHS = {
    "byt5":   "/kaggle/input/notebook-a-byt5/byt5-base-saved",
    "t5":     "/kaggle/input/notebook-b-t5/t5-base-fine-tuned", 
    "marian": "/kaggle/input/notebook-c-marian-mt/marian-mt-saved"
}

DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"
TRAIN_PATH = f"{DATA_DIR}/train.csv"
TEST_PATH = f"{DATA_DIR}/test.csv"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16

# Thresholds
GLOBAL_MATCH_THRESHOLD = 0.80  # Trust "Parent Text" strategy if similarity > 80%
LOCAL_MATCH_THRESHOLD = 0.75   # Trust individual row match if similarity > 75%

# ==================================================================================
# 2. HELPER FUNCTIONS (Text Cleaning & Slicing)
# ==================================================================================
def clean_prediction(text):
    """Strip the text, remove whitespace before punctuation, capitalize it.

    Non-string input yields "". Only a lower-case first character is changed.
    """
    if not isinstance(text, str):
        return ""

    # Fix punctuation spacing
    tidy = re.sub(r'\s+([.,!?;:])', r'\1', text.strip())

    first, rest = tidy[:1], tidy[1:]
    if first.islower():
        return first.upper() + rest
    return tidy

def extract_translation_segment(translation, line_start, line_end, total_lines):
    """Slice a parent translation based on proportional line numbers.

    The character span is proportional to (line_start - 1) / total_lines and
    line_end / total_lines, then widened to sentence boundaries: the start
    moves back to just after the nearest period within 150 characters, the
    end moves forward to the nearest period (or, failing that, space) within
    150 characters.

    Args:
        translation: Full parent translation. Anything that is not a string
            (None, NaN, ...) yields "".
        line_start: First line of the segment (1-based).
        line_end: Last line of the segment.
        total_lines: Number of lines in the parent text; when it is not
            positive the whole translation is returned.

    Returns:
        The stripped segment text.
    """
    if not isinstance(translation, str):
        # A missing cell read from CSV is a float NaN, which is truthy and
        # must not be passed through as a "translation".
        return ""
    if total_lines <= 0:
        return translation

    length = len(translation)
    start_ratio = max(0, (line_start - 1) / total_lines)
    end_ratio = min(1, line_end / total_lines)

    proportional_start = int(length * start_ratio)
    proportional_end = int(length * end_ratio)
    start_char, end_char = proportional_start, proportional_end

    # Snap the start back to just after the previous period.
    if start_char > 0:
        last_period = translation.rfind('.', max(0, start_char - 150), start_char)
        if last_period != -1:
            # Step past the period only; a following space is removed by the
            # final strip(), and a non-space character is no longer dropped.
            start_char = last_period + 1

    # Snap the end forward to the next period, else to the next space.
    if end_char < length:
        search_end = min(length, end_char + 150)
        next_period = translation.find('.', end_char, search_end)
        if next_period != -1:
            end_char = next_period + 1
        else:
            space_pos = translation.find(' ', end_char, search_end)
            if space_pos != -1:
                end_char = space_pos

    # Fallback if snapping inverted the span: strict proportional cut.
    if start_char >= end_char:
        start_char, end_char = proportional_start, proportional_end

    return translation[start_char:end_char].strip()

# ==================================================================================
# 3. RETRIEVAL ENGINE (Handles both Global and Local search)
# ==================================================================================
class RetrievalEngine:
    """TF-IDF lookup from test transliterations to training translations.

    Training transliterations are indexed with character n-grams
    (``char_wb``, lengths 2-6, sublinear TF). Queries are compared to the
    index by cosine similarity and the translation of the best-scoring
    training row is returned together with its score.
    """

    def __init__(self, train_df):
        """Index ``train_df``.

        Args:
            train_df: DataFrame with ``transliteration`` and ``translation``
                columns. It is copied, so the caller's frame is not modified.
        """
        print("[INFO] Initializing Retrieval Engine...")
        # Work on a private copy: the column normalisation below must not
        # leak into the DataFrame owned by the caller.
        self.df = train_df.copy()
        self.df['transliteration'] = self.df['transliteration'].fillna("").astype(str)
        self.df['translation'] = self.df['translation'].fillna("").astype(str)

        # TF-IDF Vectorizer
        self.vectorizer = TfidfVectorizer(
            analyzer='char_wb', ngram_range=(2, 6), min_df=1, sublinear_tf=True
        )
        self.train_vectors = self.vectorizer.fit_transform(self.df['transliteration'])
        print(f"[INFO] Indexed {self.train_vectors.shape[0]} training documents.")

    def find_global_parent(self, test_df):
        """Match the concatenation of ALL test inputs against the index.

        Args:
            test_df: DataFrame with a ``transliteration`` column.

        Returns:
            ``(translation, score)`` of the single best-matching training row.
        """
        full_test_text = ' '.join(test_df['transliteration'].fillna("").astype(str).tolist())
        test_vec = self.vectorizer.transform([full_test_text])

        similarities = cosine_similarity(test_vec, self.train_vectors).flatten()
        best_idx = np.argmax(similarities)
        best_score = similarities[best_idx]

        return self.df.iloc[best_idx]['translation'], best_score

    def find_local_matches(self, test_inputs):
        """Find the best training match for each test input independently.

        Args:
            test_inputs: Iterable of transliteration strings.

        Returns:
            List of ``(best_translation, best_score)``, one per input, in order.
        """
        test_vectors = self.vectorizer.transform(test_inputs)
        # Hoisted out of the loop: positional access to the translation column.
        translations = self.df['translation'].to_numpy()
        results = []

        # One row at a time keeps the similarity buffer at (1, n_train).
        for i in range(test_vectors.shape[0]):
            scores = cosine_similarity(test_vectors[i], self.train_vectors).flatten()
            best_idx = np.argmax(scores)
            results.append((translations[best_idx], scores[best_idx]))

        return results

# ==================================================================================
# 4. NEURAL ENGINE (Model Inference)
# ==================================================================================
class TextDataset(Dataset):
    """Dataset that tokenizes prefixed source texts on access.

    Each item is stringified and prepended with ``prefix`` at construction
    time. Indexing tokenizes one text, truncated and padded to ``max_len``,
    and returns its ``input_ids`` and ``attention_mask`` with the leading
    batch dimension squeezed away.
    """

    def __init__(self, texts, tokenizer, max_len, prefix=""):
        self.tokenizer = tokenizer
        self.max_len = max_len
        prepared = []
        for raw in texts:
            prepared.append(prefix + str(raw))
        self.texts = prepared

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; drop that leading axis.
        return {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }

def run_neural_inference(model_name, model_path, inputs):
    """Translate ``inputs`` with one saved seq2seq model.

    Args:
        model_name: Label used to pick settings. Names containing "byt5" use
            max length 400, other names containing "t5" use 256, both with
            the "translate Akkadian to English: " prefix; anything else uses
            160 and no prefix.
        model_path: Directory of the saved model and tokenizer.
        inputs: Source strings to translate.

    Returns:
        One cleaned prediction per input. If the path does not exist or the
        model fails to load, a list of empty strings of the same length.
    """
    if not os.path.exists(model_path):
        print(f"[WARNING] Model path not found: {model_path}")
        return [""] * len(inputs)

    print(f"[INFO] Running Neural Inference: {model_name}...")
    # "byt5" must be tested first because it also contains "t5".
    if "byt5" in model_name:
        max_len, prefix = 400, "translate Akkadian to English: "
    elif "t5" in model_name:
        max_len, prefix = 256, "translate Akkadian to English: "
    else:
        max_len, prefix = 160, ""

    # Deliberately best-effort: a model that cannot be loaded contributes
    # empty predictions instead of aborting the whole run.
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(DEVICE).eval()
    except Exception as e:
        print(f"[ERROR] Failed to load {model_name}: {e}")
        return [""] * len(inputs)

    dataset = TextDataset(inputs, tokenizer, max_len, prefix)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    preds = []

    with torch.no_grad():
        for batch in tqdm(loader, desc=model_name):
            # Every example is padded to max_len, so pass the attention mask
            # explicitly; otherwise generate() has to guess which positions
            # are padding.
            gen = model.generate(
                batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE),
                max_length=max_len,
                num_beams=4,
                early_stopping=True,
            )
            decoded = tokenizer.batch_decode(gen, skip_special_tokens=True)
            preds.extend([clean_prediction(d) for d in decoded])

    # Drop references and release cached GPU memory before the next model loads.
    del model, tokenizer, loader
    gc.collect()
    torch.cuda.empty_cache()
    return preds

def neural_vote(inputs, preds_dict):
    """Pick one model's prediction per row by fixed-weight scoring.

    Each model starts from its weight (byt5 5.0, t5 1.5, marian 1.0, any
    other name 1.0) and loses 5.0 when its text for the row is shorter than
    five characters. The highest score wins; on a tie the model listed first
    in ``preds_dict`` is kept. A row for which no model has a prediction
    yields ``""``.

    Args:
        inputs: Source rows; only the length is used.
        preds_dict: Mapping of model name to its list of predictions.

    Returns:
        List with the selected prediction for each row.
    """
    model_weights = {"byt5": 5.0, "t5": 1.5, "marian": 1.0}

    def _score(entry):
        name, text = entry
        base = model_weights.get(name, 1.0)
        # Very short outputs are treated as failed generations.
        return base - 5.0 if len(text) < 5 else base

    chosen = []
    for row in range(len(inputs)):
        pool = {
            name: outputs[row]
            for name, outputs in preds_dict.items()
            if row < len(outputs)
        }
        if not pool:
            chosen.append("")
            continue
        # max() keeps the first of several equal scores.
        winner, _ = max(pool.items(), key=_score)
        chosen.append(pool[winner])
    return chosen

# ==================================================================================
# 5. MAIN PIPELINE
# ==================================================================================
def main():
    """Build ``submission.csv`` from retrieval where possible, else neural models.

    Steps:
      1. Load train/test CSVs and index the training set.
      2. Match the concatenated test set against training. If the score
         reaches ``GLOBAL_MATCH_THRESHOLD``, each test row gets a
         proportional slice of the matched parent translation.
      3. Otherwise run the three neural models, vote between them, and per
         row prefer a retrieved translation whose score reaches
         ``LOCAL_MATCH_THRESHOLD``.
      4. Write the submission (outputs of length <= 1 become "Unknown") and
         print the decision for the first five rows.
    """
    # A. Load Data
    train_df = pd.read_csv(TRAIN_PATH)
    test_df = pd.read_csv(TEST_PATH)
    inputs = test_df["transliteration"].fillna("").astype(str).tolist()
    ids = test_df["id"].tolist()

    # B. Initialize Retrieval
    retriever = RetrievalEngine(train_df)

    # --- STRATEGY 1: GLOBAL PARENT SEARCH ---
    print("\n[STEP 1] Checking Global Parent Match...")
    parent_translation, global_score = retriever.find_global_parent(test_df)

    if global_score >= GLOBAL_MATCH_THRESHOLD:
        print(f"✅ STRONG GLOBAL MATCH FOUND ({global_score:.2%}). Using Segment Slicing Strategy.")

        # Without line columns, assume ten lines per test row.
        if 'line_end' in test_df.columns:
            total_lines = test_df['line_end'].max()
        else:
            total_lines = len(test_df) * 10

        final_outputs = [
            extract_translation_segment(
                parent_translation,
                row.get('line_start', idx*10),
                row.get('line_end', (idx+1)*10),
                total_lines,
            )
            for idx, row in test_df.iterrows()
        ]
        decision_log = ["Global Retrieval (Slicing)"] * len(inputs)

    else:
        # --- STRATEGY 2 & 3: HYBRID (LOCAL RETRIEVAL + NEURAL) ---
        print(f"❌ No strong global match ({global_score:.2%}). Switching to Hybrid Mode.")

        # 1. Run Neural Models (Pre-compute), one after another in this order
        neural_preds = {
            name: run_neural_inference(name, MODEL_PATHS[name], inputs)
            for name in ("byt5", "t5", "marian")
        }
        ensemble_preds = neural_vote(inputs, neural_preds)

        # 2. Run Local Retrieval
        local_matches = retriever.find_local_matches(inputs)

        final_outputs = []
        decision_log = []

        print("[INFO] Making Final Decisions (Row-by-Row)...")
        for row_no, (retrieval_text, score) in enumerate(local_matches):
            # LOGIC GATE: trust retrieval only for a non-empty, confident hit
            if retrieval_text and score >= LOCAL_MATCH_THRESHOLD:
                chosen_text = retrieval_text
                origin = f"Local Retrieval ({score:.2f})"
            else:
                chosen_text = ensemble_preds[row_no]
                origin = "Neural Ensemble"
            final_outputs.append(chosen_text)
            decision_log.append(origin)

    # Save
    def _or_unknown(value):
        # Empty or single-character outputs are replaced by a placeholder.
        return value if len(str(value)) > 1 else "Unknown"

    submission = pd.DataFrame({"id": ids, "translation": final_outputs})
    submission["translation"] = submission["translation"].apply(_or_unknown)
    submission.to_csv("submission.csv", index=False)

    banner = "="*40
    print("\n" + banner)
    print("DECISION SUMMARY")
    print(banner)
    for sample_id, origin, output in zip(ids[:5], decision_log[:5], final_outputs[:5]):
        print(f"ID {sample_id} | Source: {origin}")
        print(f"Output: {output[:80]}...")
    print(banner)

# Entry point: run the pipeline only when this code is executed as the main
# program, not when it is imported.
if __name__ == "__main__":
    main()

[INFO] Initializing Retrieval Engine...
[INFO] Indexed 1561 training documents.

[STEP 1] Checking Global Parent Match...
✅ STRONG GLOBAL MATCH FOUND (85.83%). Using Segment Slicing Strategy.

DECISION SUMMARY
ID 0 | Source: Global Retrieval (Slicing)
Output: Thus  Kanesh, say to the -payers, our messenger, every single colony, and the tr...
ID 1 | Source: Global Retrieval (Slicing)
Output: In the letter of the City (it is written): From this day on, whoever buys meteor...
ID 2 | Source: Global Retrieval (Slicing)
Output: As soon as you have heard our letter, who(ever) over there has either sold it to...
ID 3 | Source: Global Retrieval (Slicing)
Output: Send a copy of (this) letter of ours to every single colony and to all the tradi...
