In [1]:
# =============================================================================
# STEP 0: INSTALL DEPENDENCIES
# =============================================================================
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install -q --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install -q sentence-transformers faiss-cpu

print("✓ Dependencies installed. Please RESTART RUNTIME now.")
print("After restart, run the cells below.")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.4/284.4 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m13.2 MB/s[0m eta [36m0:0

In [2]:
# =============================================================================
# STEP 1: IMPORTS
# =============================================================================
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from typing import List, Tuple
import pickle
from unsloth import FastLanguageModel
import torch

print("✓ Imports successful")


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
✓ Imports successful


In [3]:
# =============================================================================
# STEP 2: DEFINE RETRIEVER CLASS
# =============================================================================
class TranslationRetriever:
    def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
        """Initialize retriever with embedding model"""
        self.embedding_model = SentenceTransformer(model_name)
        self.index = None
        self.spanish_sentences = []
        self.english_sentences = []

    def build_knowledge_base(self, spanish_file, english_file):
        """Build FAISS index from parallel files"""
        print("Loading sentences...")
        with open(spanish_file, 'r', encoding='utf-8') as f:
            self.spanish_sentences = [line.strip() for line in f if line.strip()]

        with open(english_file, 'r', encoding='utf-8') as f:
            self.english_sentences = [line.strip() for line in f if line.strip()]

        assert len(self.spanish_sentences) == len(self.english_sentences), \
            f"Mismatch: {len(self.spanish_sentences)} Spanish vs {len(self.english_sentences)} English"

        print(f"✓ Loaded {len(self.spanish_sentences)} sentence pairs")
        print(f"\nFirst example:")
        print(f"ES: {self.spanish_sentences[0][:100]}...")
        print(f"EN: {self.english_sentences[0][:100]}...")

        # Generate embeddings
        print("\n⏳ Generating embeddings (this takes ~5-10 minutes for 50k sentences)...")
        embeddings = self.embedding_model.encode(
            self.spanish_sentences,
            show_progress_bar=True,
            batch_size=32,
            convert_to_numpy=True
        )

        # Normalize for cosine similarity
        faiss.normalize_L2(embeddings)

        # Build FAISS index
        print("Building FAISS index...")
        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)
        self.index.add(embeddings)

        print(f"✓ Knowledge base built with {self.index.ntotal} vectors")

    def retrieve(self, query: str, k: int = 1) -> List[Tuple[str, str, float]]:
        """Retrieve top-k most similar translation pairs"""
        if self.index is None:
            raise ValueError("Knowledge base not built. Call build_knowledge_base() first")

        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_embedding)

        distances, indices = self.index.search(query_embedding, k)

        results = []
        for dist, idx in zip(distances[0], indices[0]):
            results.append((
                self.spanish_sentences[idx],
                self.english_sentences[idx],
                float(dist)
            ))

        return results

    def save(self, save_path):
        """Save the retriever to disk"""
        faiss.write_index(self.index, f"{save_path}_index.faiss")
        with open(f"{save_path}_data.pkl", 'wb') as f:
            pickle.dump({
                'spanish': self.spanish_sentences,
                'english': self.english_sentences
            }, f)
        print(f"✓ Saved to {save_path}")

    def load(self, load_path):
        """Load the retriever from disk"""
        self.index = faiss.read_index(f"{load_path}_index.faiss")
        with open(f"{load_path}_data.pkl", 'rb') as f:
            data = pickle.load(f)
            self.spanish_sentences = data['spanish']
            self.english_sentences = data['english']
        print(f"✓ Loaded from {load_path}")

print("✓ TranslationRetriever class defined")

✓ TranslationRetriever class defined


In [4]:
# =============================================================================
# STEP 3: BUILD KNOWLEDGE BASE (Run once, then use load() instead)
# =============================================================================
print("="*70)
print("BUILDING KNOWLEDGE BASE")
print("="*70)

retriever = TranslationRetriever()

# Build from your training data
retriever.build_knowledge_base(
    spanish_file='/content/drive/MyDrive/MT_data/all-filtered.es.fuzzy.smalltrain',
    english_file='/content/drive/MyDrive/MT_data/all-filtered.en.fuzzy.smalltrain'
)

# Save for future use
retriever.save('translation_kb')

print("\n✓ Knowledge base ready!")

BUILDING KNOWLEDGE BASE


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading sentences...
✓ Loaded 50000 sentence pairs

First example:
ES: E12 Sin embargo se identificó que los adolescentes perciben una imagen corporal distorsionada y sent...
EN: E12 It was identified, however, that adolescents perceive a distorted body image and feelings of inf...

⏳ Generating embeddings (this takes ~5-10 minutes for 50k sentences)...


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

Building FAISS index...
✓ Knowledge base built with 50000 vectors
✓ Saved to translation_kb

✓ Knowledge base ready!


In [5]:
# =============================================================================
# STEP 4: TEST RETRIEVAL (Verify knowledge base works)
# =============================================================================
print("="*70)
print("TESTING RETRIEVAL")
print("="*70)

# Test query
test_query = "El gato está durmiendo en el sofá"

print(f"\nQuery: {test_query}\n")

# Test top 1 (for ICL)
print("--- Top 1 (ICL) ---")
top1 = retriever.retrieve(test_query, k=1)
for es, en, score in top1:
    print(f"Score: {score:.4f}")
    print(f"ES: {es}")
    print(f"EN: {en}\n")

# Test top 5 (for RAG)
print("--- Top 5 (RAG) ---")
top5 = retriever.retrieve(test_query, k=5)
for i, (es, en, score) in enumerate(top5, 1):
    print(f"{i}. Score: {score:.4f}")
    print(f"   ES: {es[:80]}...")
    print(f"   EN: {en[:80]}...\n")

TESTING RETRIEVAL

Query: El gato está durmiendo en el sofá

--- Top 1 (ICL) ---
Score: 0.4841
ES: LEUCOFELIGEN FeLV/RCP liofilizado y suspensión para suspensión inyectable para gatos
EN: LEUCOFELIGEN FeLV/RCP lyophilisate and suspension for suspension for injection for cats

--- Top 5 (RAG) ---
1. Score: 0.4841
   ES: LEUCOFELIGEN FeLV/RCP liofilizado y suspensión para suspensión inyectable para g...
   EN: LEUCOFELIGEN FeLV/RCP lyophilisate and suspension for suspension for injection f...

2. Score: 0.4516
   ES: Peso del gato (kg)...
   EN: Body Weight of Cat (kg)...

3. Score: 0.4516
   ES: Peso del gato (kg)...
   EN: Weight of cat [kg]...

4. Score: 0.4481
   ES: En un modelo de inflamación en gatos, la inyección de robenacoxib tuvo efectos a...
   EN: In an inflammation model in cats, robenacoxib injection had analgesic, anti-infl...

5. Score: 0.4213
   ES: • en los gatos con anemia, el índice de mortalidad oscila alrededor del 60%, a l...
   EN: • in anaemic cats, mortality ra

In [6]:
# =============================================================================
# STEP 5: LOAD LLAMA MODEL
# =============================================================================
print("="*70)
print("LOADING LLAMA 3.1 MODEL")
print("="*70)

max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Enable fast inference
FastLanguageModel.for_inference(model)

print("✓ Llama 3.1 model loaded and ready")

LOADING LLAMA 3.1 MODEL
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

✓ Llama 3.1 model loaded and ready


In [7]:
def build_rag_prompt(query: str, retriever: TranslationRetriever) -> str:
    """Build RAG prompt with top 5 examples"""
    examples = retriever.retrieve(query, k=5)

    prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a professional Spanish to English translator. Use the reference translations below to guide your translation.

IMPORTANT: Output ONLY the English translation. Do not include phrases like "Here is the translation:" or "The translation is:". Just provide the direct translation.<|eot_id|><|start_header_id|>user<|end_header_id|>

Reference translations:

"""

    for i, (es, en, score) in enumerate(examples, 1):
        prompt += f"{i}. Spanish: {es}\n   English: {en}\n\n"

    prompt += f"""Now translate (output ONLY the translation):
Spanish: {query}
English:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

    return prompt


def build_icl_prompt(query: str, retriever: TranslationRetriever) -> str:
    """Build ICL prompt with top 1 example"""
    examples = retriever.retrieve(query, k=1)
    es, en, score = examples[0]

    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a professional Spanish to English translator.

IMPORTANT: Output ONLY the English translation. Do not add any preamble.<|eot_id|><|start_header_id|>user<|end_header_id|>

Translate the following Spanish text to English.

Example:
Spanish: {es}
English: {en}

Now translate (output ONLY the translation):
Spanish: {query}
English:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

    return prompt
print("✓ Prompt building functions defined")

✓ Prompt building functions defined


In [8]:
# =============================================================================
# GPU BATCH PROCESSING (CLEAN VERSION, NO CLEANUP)
# =============================================================================

def translate_all_icl_batched(test_sources, retriever, model, tokenizer, batch_size=8, max_new_tokens=256):
    """
    Translate using GPU batching (much faster than sequential)
    """
    print(f"Translating {len(test_sources)} sentences with GPU batching (batch_size={batch_size})...")

    all_translations = []

    for i in range(0, len(test_sources), batch_size):
        batch_queries = test_sources[i:i+batch_size]

        # Build prompts for entire batch
        batch_prompts = [build_icl_prompt(q, retriever) for q in batch_queries]

        # Tokenize batch
        inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to("cuda")

        # Generate for entire batch at once
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.3,
                top_p=0.9,
                use_cache=True,
                pad_token_id=tokenizer.pad_token_id
            )

        # Decode batch
        batch_results = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Extract translations
        for result in batch_results:
            translation = result.split("assistant")[-1].strip()
            all_translations.append(translation)

        print(f"Batch {i//batch_size + 1}/{(len(test_sources)-1)//batch_size + 1} done ✓")

    return all_translations


def translate_all_rag_batched(test_sources, retriever, model, tokenizer, batch_size=4, max_new_tokens=256):
    """
    Translate using GPU batching for RAG
    """
    print(f"Translating {len(test_sources)} sentences with GPU batching (batch_size={batch_size})...")

    all_translations = []

    for i in range(0, len(test_sources), batch_size):
        batch_queries = test_sources[i:i+batch_size]

        # Build prompts for entire batch
        batch_prompts = [build_rag_prompt(q, retriever) for q in batch_queries]

        # Tokenize batch
        inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to("cuda")

        # Generate for entire batch at once
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.3,
                top_p=0.9,
                use_cache=True,
                pad_token_id=tokenizer.pad_token_id
            )

        # Decode batch
        batch_results = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Extract translations
        for result in batch_results:
            translation = result.split("assistant")[-1].strip()
            all_translations.append(translation)

        print(f"Batch {i//batch_size + 1}/{(len(test_sources)-1)//batch_size + 1} done ✓")

    return all_translations

In [8]:
# # =============================================================================
# # STEP 7: DEFINE TRANSLATION FUNCTIONS
# # =============================================================================

# def translate_icl(query: str, retriever: TranslationRetriever, max_new_tokens: int = 256):
#     """Translate using ICL (top 1 example)"""
#     prompt = build_icl_prompt(query, retriever)

#     inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=max_new_tokens,
#         temperature=0.3,
#         top_p=0.9,
#         use_cache=True
#     )

#     result = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     translation = result.split("assistant")[-1].strip()

#     return translation


# def translate_rag(query: str, retriever: TranslationRetriever, max_new_tokens: int = 256):
#     """Translate using RAG (top 5 examples)"""
#     prompt = build_rag_prompt(query, retriever)

#     inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=max_new_tokens,
#         temperature=0.3,
#         top_p=0.9,
#         use_cache=True
#     )

#     result = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     translation = result.split("assistant")[-1].strip()

#     return translation

# print("✓ Translation functions defined")

✓ Translation functions defined


In [9]:
# =============================================================================
# STEP 1: IMPORTS (UPDATED)
# =============================================================================
!pip install -q sacrebleu unbabel-comet nltk
!python -m nltk.downloader wordnet omw-1.4

print("✓ Evaluation libraries installed")

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from typing import List, Tuple
import pickle
from unsloth import FastLanguageModel
import torch
import sacrebleu
import nltk
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
import json
import pandas as pd

print("✓ Imports successful")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.0/91.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.4/101.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [10]:
# =============================================================================
# FIX: DOWNLOAD MISSING NLTK DATA
# =============================================================================
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

print("✓ NLTK data downloaded")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


✓ NLTK data downloaded


[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
# =============================================================================
# STEP 6: LOAD COMET MODEL
# =============================================================================
print("="*70)
print("LOADING COMET EVALUATION MODEL")
print("="*70)

# Updated import method for COMET
from comet import download_model, load_from_checkpoint

# Download and load COMET model
comet_model_path = download_model("Unbabel/wmt22-comet-da")
comet_model = load_from_checkpoint(comet_model_path)

print("✓ COMET model loaded")

LOADING COMET EVALUATION MODEL


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

LICENSE: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

checkpoints/model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.6.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

✓ COMET model loaded


/usr/local/lib/python3.12/dist-packages/pytorch_lightning/core/saving.py:197: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [12]:
# =============================================================================
# STEP 7: DEFINE EVALUATION FUNCTIONS
# =============================================================================

def compute_bleu(predictions, references):
    """Compute BLEU score (0-100, higher is better)"""
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    return bleu.score

def compute_chrf(predictions, references):
    """Compute chrF score (0-100, higher is better)"""
    chrf = sacrebleu.corpus_chrf(predictions, [references])
    return chrf.score

def compute_meteor(predictions, references):
    """Compute METEOR score (0-1, higher is better)"""
    scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = word_tokenize(pred.lower())
        ref_tokens = word_tokenize(ref.lower())
        score = meteor_score([ref_tokens], pred_tokens)
        scores.append(score)

    return sum(scores) / len(scores) if scores else 0.0

def compute_comet(predictions, references, sources):
    """Compute COMET score (0-1+, higher is better)"""
    data = []
    for src, pred, ref in zip(sources, predictions, references):
        data.append({
            "src": src,
            "mt": pred,
            "ref": ref
        })

    model_output = comet_model.predict(data, batch_size=8, gpus=1)

    return model_output.system_score

def evaluate_all_metrics(predictions, references, sources, approach_name=""):
    """Compute all metrics at once"""
    print(f"\n{'='*70}")
    print(f"COMPUTING METRICS: {approach_name}")
    print(f"{'='*70}")

    bleu = compute_bleu(predictions, references)
    chrf = compute_chrf(predictions, references)
    meteor = compute_meteor(predictions, references)
    comet = compute_comet(predictions, references, sources)

    print(f"\n RESULTS (n={len(predictions)} sentences):")
    print(f"{'='*70}")
    print(f"BLEU:   {bleu:.2f}  (higher is better, 0-100)")
    print(f"chrF:   {chrf:.2f}  (higher is better, 0-100)")
    print(f"METEOR: {meteor:.4f}  (higher is better, 0-1)")
    print(f"COMET:  {comet:.4f}  (higher is better, 0-1+)")
    print(f"{'='*70}\n")

    return {
        'bleu': bleu,
        'chrf': chrf,
        'meteor': meteor,
        'comet': comet
    }

print("✓ Evaluation functions defined")

✓ Evaluation functions defined


In [13]:
# =============================================================================
# STEP 8: LOAD TEST DATA
# =============================================================================
# IMPORTANT: Replace these paths with your actual test files
# These should be DIFFERENT from your training files (all-filtered_*_fuzzy.smalltrain)

# Option 1: If you have test.fuzzy files, upload them and use:
# test_sources_file = '/mnt/user-data/uploads/test.fuzzy.es'
# test_references_file = '/mnt/user-data/uploads/test.fuzzy.en'

# Option 2: Use a small sample for quick testing
# test_sources = [
# "Período de validez después de abierto el envase: 10 horas.",
# "Niños y adolescentes No se recomienda el uso de Telmisartán Teva en niños y adolescentes hasta 18 años.",
# "Promover la mejora, de conformidad con las normas internacionales, de las condiciones de internamiento y el trato a las personas privadas de libertad.",
# "De los 1,165 pacientes tratados con Picato en ensayos clínicos sobre queratosis actínica llevados a cabo con ingenol mebutato gel, 656 pacientes (56%) tenían 65 años o más, mientras que 241 pacientes (21%) tenían 75 años o más.",
# "Ante la poca eficacia de este mecanismo en los trabajos de los Miembros y de la OMC, las ONG hoy defienden que tales materiales estén mejor organizados en la página electrónica e, incluso, que la Secretaría adopte una posición más activa, indicando algunos temas para la presentación del material en base a plazos y patrones más predefinidos.",
# "275 • Cuando el área de piel limpiada previamente se haya secado, pellízquela y sujétela firmemente con una mano.",
# "Una dosis comúnmente utilizada para inducir la superovulación consiste en la administración de 150- 225 UI (0,24-0,36 ml) de GONAL-f por día, comenzando el día 2 ó 3 del ciclo de tratamiento.",
# "150 UI/ kg 3x/ semana ó 450 UI/ kg una vez a la semana.",
# "Sin embargo, en la muestra, el SAPS II y el LODS mostraron buena capacidad discriminatoria para ocurrencia de muerte en la UTI AUC=0,85 y 0,83.",
# "El factor 'Aumento del costo de material lleva al uso de materiales de baja calidad lo que resulta prejudicial a los trabajadores' ocupó la segunda posición dentro del grupo implementación del proyecto con un IIR = 0.631 y se posicionó 43 dentro de todos los grupos de factores de cal¡dad.",
# "Se aplican por primera vez los métodos de la epidemiología moderna en nuestras publicaciones."
# ]

# test_references = [
# "Shelf life after first opening the container: 10 hours.",
# "Children and adolescents The use of Telmisartan Teva in children and adolescents up to the age of 18 years is not recommended.",
# "Support the improvement, in line with international standards, of the detention conditions and treatment of persons deprived of liberty.",
# "Of the 1,165 patients treated with Picato in the actinic keratosis clinical studies conducted with ingenol mebutate gel, 656 patients (56%) were 65 years and older, while 241 patients (21%) were 75 years and older.",
# "Aware of what little influence these position papers have on the WTO and its Members, NGOs are now calling for these papers to be better organized on the website and, moreover, for the Secretariat to take a more active stance, proposing topics on which to present papers, with more pre-defined timescales and standards.",
# "DO NOT touch this area again before giving the injection.  When the cleaned area of skin has dried, pinch and hold it firmly with one hand.",
# "A commonly used dose for superovulation involves the administration of 150-225 IU (0.24-0.36 ml) of GONAL-f daily, commencing on days 2 or 3 of the treatment cycle.",
# "140 150 IU/ kg 3x/ week or 450 IU/ kg once weekly.",
# "In He sample, however, SAPS II and LODS showed good discriminatory capacity for the occurrence of death at the ICU AUC=0.85 and 0.83.",
# "Increase of material price leads to use of low quality material thus harms workers factor was ranked in the second position among project implementation group with RII = 0.631 and was ranked in the 43rd position among all groups factors of qulity.",
# "Modern epidemiological methods were applied in our publications for the first time."
# ]

# If you have actual test files, uncomment and use this:
# =============================================================================
# LOAD FIRST 5K LINES FOR TEST SET ONLY
# =============================================================================

def load_first_n_lines(filepath, n=5000):
    """Load first n lines from a file"""
    sentences = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= n:
                break
            line = line.strip()
            if line:  # Only add non-empty lines
                sentences.append(line)
    return sentences

# Load test data (first 5k lines)
test_sources = load_first_n_lines(
    '/content/drive/MyDrive/MT_data/all-filtered.es.real.test',
    n=5000
)

# Load corresponding English references (upload the English file first)
test_references = load_first_n_lines(
    '/content/drive/MyDrive/MT_data/all-filtered.en.real.test',  # Update path when uploaded
    n=5000
)

print(f"✓ Loaded {len(test_sources)} test sentences")
print(f"✓ Loaded {len(test_references)} reference translations")

# Verify they match
assert len(test_sources) == len(test_references), "Mismatch in test data!"

print(f"\nFirst test example:")
print(f"ES: {test_sources[0]}")
print(f"EN: {test_references[0]}")

print(f"✓ Loaded {len(test_sources)} test sentences with references")

✓ Loaded 5000 test sentences
✓ Loaded 5000 reference translations

First test example:
ES: Período de validez después de abierto el envase: 10 horas.
EN: Shelf life after first opening the container: 10 hours.
✓ Loaded 5000 test sentences with references


In [41]:
# # =============================================================================
# # STEP 9: RUN ICL TRANSLATION & EVALUATION
# # =============================================================================
# print("="*70)
# print("ICL TRANSLATION (Top 1 Example)")
# print("="*70)

# icl_predictions = []

# for i, spanish_sent in enumerate(test_sources):
#     print(f"Translating {i+1}/{len(test_sources)}...", end=' ')
#     translation = translate_icl(spanish_sent, retriever)
#     icl_predictions.append(translation)
#     print("✓")

#     # Show translations
#     if i in range (0, len(test_sources)):
#         print(f"\nExample:")
#         print(f"  Source:     {spanish_sent}")
#         print(f"  ICL Output: {translation}")
#         print(f"  Reference:  {test_references[i]}")

# # Evaluate ICL
# icl_scores = evaluate_all_metrics(icl_predictions, test_references, test_sources, "ICL (Top 1)")

In [14]:
# =============================================================================
# USAGE: ICL TRANSLATION WITH BATCHING
# =============================================================================
print("="*70)
print("ICL TRANSLATION (Top 1 Example) - GPU BATCHED")
print("="*70)

import time
start_time = time.time()

icl_predictions = translate_all_icl_batched(
    test_sources,
    retriever,
    model,
    tokenizer,
    batch_size=12
)

elapsed = time.time() - start_time
print(f"\n✓ Translated {len(icl_predictions)} sentences in {elapsed:.1f}s ({len(icl_predictions)/elapsed:.1f} sent/s)")

# Evaluate
icl_scores = evaluate_all_metrics(icl_predictions, test_references, test_sources, "ICL (Top 1)")

ICL TRANSLATION (Top 1 Example) - GPU BATCHED
Translating 5000 sentences with GPU batching (batch_size=8)...
Batch 1/625 done ✓
Batch 2/625 done ✓
Batch 3/625 done ✓
Batch 4/625 done ✓
Batch 5/625 done ✓
Batch 6/625 done ✓
Batch 7/625 done ✓
Batch 8/625 done ✓
Batch 9/625 done ✓
Batch 10/625 done ✓
Batch 11/625 done ✓
Batch 12/625 done ✓
Batch 13/625 done ✓
Batch 14/625 done ✓
Batch 15/625 done ✓
Batch 16/625 done ✓
Batch 17/625 done ✓
Batch 18/625 done ✓
Batch 19/625 done ✓
Batch 20/625 done ✓
Batch 21/625 done ✓
Batch 22/625 done ✓
Batch 23/625 done ✓
Batch 24/625 done ✓
Batch 25/625 done ✓
Batch 26/625 done ✓
Batch 27/625 done ✓
Batch 28/625 done ✓
Batch 29/625 done ✓
Batch 30/625 done ✓
Batch 31/625 done ✓
Batch 32/625 done ✓
Batch 33/625 done ✓
Batch 34/625 done ✓
Batch 35/625 done ✓
Batch 36/625 done ✓
Batch 37/625 done ✓
Batch 38/625 done ✓
Batch 39/625 done ✓
Batch 40/625 done ✓
Batch 41/625 done ✓
Batch 42/625 done ✓
Batch 43/625 done ✓
Batch 44/625 done ✓
Batch 45/625 done ✓


INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 625/625 [00:42<00:00, 14.87it/s]



 RESULTS (n=5000 sentences):
BLEU:   47.45  (higher is better, 0-100)
chrF:   71.44  (higher is better, 0-100)
METEOR: 0.7289  (higher is better, 0-1)
COMET:  0.8724  (higher is better, 0-1+)



In [17]:
icl_predictions

['Shelf life after first opening the vial: 10 hours.',
 'Children and adolescents Telmisartan Teva is not recommended for use in children and adolescents until 18 years.',
 'Promote, in accordance with international standards, the improvement of detention conditions and the treatment of persons deprived of liberty.',
 'Out of 1,165 patients treated with Picato in clinical trials on actinic keratosis conducted with ingenol mebutate gel, 656 patients (56%) were aged 65 or over, while 241 patients (21%) were aged 75 or over.',
 'In light of the limited effectiveness of this mechanism in the work of Members and the WTO, NGOs today argue that such materials should be better organized on the electronic page and, even, that the Secretariat adopt a more active position, indicating some topics for the presentation of the material based on more predefined deadlines and patterns.',
 'When the previously cleaned skin area has dried, pinch it and hold it firmly with one hand.',
 'A commonly used do

In [22]:
# =============================================================================
# USAGE: RAG TRANSLATION WITH BATCHING
# =============================================================================
print("="*70)
print("RAG TRANSLATION (Top 5 Examples) - GPU BATCHED")
print("="*70)

start_time = time.time()

rag_predictions = translate_all_rag_batched(
    test_sources,
    retriever,
    model,
    tokenizer,
    batch_size=25
)

elapsed = time.time() - start_time
print(f"\n✓ Translated {len(rag_predictions)} sentences in {elapsed:.1f}s ({len(rag_predictions)/elapsed:.1f} sent/s)")

# Evaluate
rag_scores = evaluate_all_metrics(rag_predictions, test_references, test_sources, "RAG (Top 5)")

RAG TRANSLATION (Top 5 Examples) - GPU BATCHED
Translating 5000 sentences with GPU batching (batch_size=25)...
Batch 1/200 done ✓
Batch 2/200 done ✓
Batch 3/200 done ✓
Batch 4/200 done ✓
Batch 5/200 done ✓
Batch 6/200 done ✓
Batch 7/200 done ✓
Batch 8/200 done ✓
Batch 9/200 done ✓
Batch 10/200 done ✓
Batch 11/200 done ✓
Batch 12/200 done ✓
Batch 13/200 done ✓
Batch 14/200 done ✓
Batch 15/200 done ✓
Batch 16/200 done ✓
Batch 17/200 done ✓
Batch 18/200 done ✓
Batch 19/200 done ✓
Batch 20/200 done ✓
Batch 21/200 done ✓
Batch 22/200 done ✓
Batch 23/200 done ✓
Batch 24/200 done ✓
Batch 25/200 done ✓
Batch 26/200 done ✓
Batch 27/200 done ✓
Batch 28/200 done ✓
Batch 29/200 done ✓
Batch 30/200 done ✓
Batch 31/200 done ✓
Batch 32/200 done ✓
Batch 33/200 done ✓
Batch 34/200 done ✓
Batch 35/200 done ✓
Batch 36/200 done ✓
Batch 37/200 done ✓
Batch 38/200 done ✓
Batch 39/200 done ✓
Batch 40/200 done ✓
Batch 41/200 done ✓
Batch 42/200 done ✓
Batch 43/200 done ✓
Batch 44/200 done ✓
Batch 45/200 done 

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 625/625 [00:42<00:00, 14.87it/s]



 RESULTS (n=5000 sentences):
BLEU:   48.18  (higher is better, 0-100)
chrF:   72.04  (higher is better, 0-100)
METEOR: 0.7358  (higher is better, 0-1)
COMET:  0.8749  (higher is better, 0-1+)



In [23]:
rag_predictions

['Shelf life after first opening the vial: 10 hours.',
 'Children and adolescents The use of Telmisartan Teva is not recommended in children and adolescents up to 18 years.',
 '·Promote, in accordance with international standards, the improvement of conditions of detention and treatment of persons deprived of liberty.',
 'Out of 1,165 patients treated with Picato in clinical trials on actinic keratosis conducted with ingenol mebutate gel, 656 patients (56%) were aged 65 or over, while 241 patients (21%) were aged 75 or over.',
 'In light of the limited effectiveness of this mechanism in the work of Members and the WTO, NGOs today argue that such materials should be better organized on the website and, even, that the Secretariat should take a more active position, indicating some topics for the presentation of the material based on more defined deadlines and patterns.',
 '275 • When the previously cleaned skin area has dried, pinch it and hold it firmly with one hand.',
 'A commonly use

In [45]:
# # =============================================================================
# # STEP 10: RUN RAG TRANSLATION & EVALUATION
# # =============================================================================
# print("="*70)
# print("RAG TRANSLATION (Top 5 Examples)")
# print("="*70)

# rag_predictions = []

# for i, spanish_sent in enumerate(test_sources):
#     print(f"Translating {i+1}/{len(test_sources)}...", end=' ')
#     translation = translate_rag(spanish_sent, retriever)
#     rag_predictions.append(translation)
#     print("✓")

#     # Show first example
#     if i in range (0, len(test_sources)):
#         print(f"\nExample:")
#         print(f"  Source:     {spanish_sent}")
#         print(f"  RAG Output: {translation}")
#         print(f"  Reference:  {test_references[i]}")

# # Evaluate RAG
# rag_scores = evaluate_all_metrics(rag_predictions, test_references, test_sources, "RAG (Top 5)")

In [27]:
# =============================================================================
# STEP 11: COMPARE RESULTS
# =============================================================================
print("="*70)
print("FINAL COMPARISON: ICL vs RAG")
print("="*70)

comparison = pd.DataFrame({
    'Metric': ['BLEU', 'chrF', 'METEOR', 'COMET'],
    'ICL (Top 1)': [
        f"{icl_scores['bleu']:.2f}",
        f"{icl_scores['chrf']:.2f}",
        f"{icl_scores['meteor']:.4f}",
        f"{icl_scores['comet']:.4f}"
    ],
    'RAG (Top 5)': [
        f"{rag_scores['bleu']:.2f}",
        f"{rag_scores['chrf']:.2f}",
        f"{rag_scores['meteor']:.4f}",
        f"{rag_scores['comet']:.4f}"
    ],
    'Winner': [
        'ICL' if icl_scores['bleu'] > rag_scores['bleu'] else ' RAG',
        'ICL' if icl_scores['chrf'] > rag_scores['chrf'] else ' RAG',
        'ICL' if icl_scores['meteor'] > rag_scores['meteor'] else ' RAG',
        'ICL' if icl_scores['comet'] > rag_scores['comet'] else ' RAG'
    ]
})

print("\n")
print(comparison.to_string(index=False))
print("\n")

# Show sentence-level comparison for first 3
print("="*70)
print("SENTENCE-LEVEL COMPARISON")
print("="*70)

for i in range(0, len(test_sources)):
    print(f"\n{'─'*70}")
    print(f"Sentence {i+1}")
    print(f"{'─'*70}")
    print(f"Source:    {test_sources[i]}")
    print(f"Reference: {test_references[i]}")
    print(f"ICL:       {icl_predictions[i]}")
    print(f"RAG:       {rag_predictions[i]}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sentence 4377
──────────────────────────────────────────────────────────────────────
Source:    Trastornos del sistema inmunológico Poco frecuentes - Urticaria, rash Muy raros – Reacciones anafilácticas Los síntomas de hipersensibilidad generalizada pueden incluir rash cutáneo generalizado, escozor, sudor, trastornos gastrointestinales, edema angioneurótico, dificultad en la respiración, palpitaciones, hipotensión y mareo/ pérdida de consciencia.
Reference: 33 Immune system disorders Uncommon - Urticaria, rash Very rare - Anaphylactic reactions Symptoms of generalised hypersensitivity may include generalised skin rash, itching, sweating, gastrointestinal upset, angioneurotic oedema, difficulties in breathing, palpitation, reduction in blood pressure and fainting/ loss of consciousness.
ICL:       Immunological disorders Rare - Urticaria, rash Very rare – Anaphylactic reactions The symptoms of generalised hypersensitivity 

In [26]:
# # =============================================================================
# # SIMPLE INDEX SIZE TEST
# # =============================================================================

def test_index_size(retriever):
    """Quick check that knowledge base is built correctly"""

    n_vectors = retriever.index.ntotal
    n_spanish = len(retriever.spanish_sentences)
    n_english = len(retriever.english_sentences)

    print("="*70)
    print("KNOWLEDGE BASE SIZE CHECK")
    print("="*70)
    print(f"\nVectors in FAISS index: {n_vectors:,}")
    print(f"Spanish sentences:      {n_spanish:,}")
    print(f"English sentences:      {n_english:,}")

    if n_vectors == n_spanish == n_english:
        print(f"\n✅ PASS: Knowledge base has {n_vectors:,} entries")
        return True
    else:
        print(f"\n❌ FAIL: Counts don't match!")
        return False

# Run the test
test_index_size(retriever)


KNOWLEDGE BASE SIZE CHECK

Vectors in FAISS index: 50,000
Spanish sentences:      50,000
English sentences:      50,000

✅ PASS: Knowledge base has 50,000 entries


True