In [2]:
# Use the %pip magic so the package is installed into the environment of the running kernel.
%pip install evaluate --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:


import csv
import os
import warnings
from collections import defaultdict
from typing import List, Tuple, Dict

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    pipeline as hf_pipeline,
    MarianMTModel,
    MarianTokenizer
)

# Device selector shared by the loaders below: 0 when CUDA is available, otherwise -1 (CPU).
DEVICE = -1 if not torch.cuda.is_available() else 0


def create_context_embeddings(dataset_name: str = "lucadiliello/newsqa",
                              split: str = "validation",
                              model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                              out_csv: str = "context_embeddings.csv",
                              max_examples: int = 2000,
                              deduplicate: bool = False):
    """
    Embed dataset contexts with a SentenceTransformer and cache them in a CSV file.

    The CSV has a header row followed by one row per kept context:
      id, context, embedding
    ``id`` is the example's position in the split, ``context`` is the text with
    newlines replaced by spaces, and ``embedding`` is the vector written as
    space-separated floats (the format ``load_embeddings_from_csv`` parses).

    Args:
        dataset_name: dataset identifier passed to ``load_dataset``.
        split: dataset split to read.
        model_name: SentenceTransformer model used for the embeddings.
        out_csv: path of the CSV file to (over)write.
        max_examples: number of leading examples to scan; examples with a blank
            context inside that window are skipped, so fewer rows may be kept.
        deduplicate: when True, a context text that already appeared earlier in
            the scan is skipped, so each distinct context is embedded once and
            keeps the id of its first occurrence. Defaults to False, which keeps
            one row per scanned example.

    Returns:
        ``(out_csv, ids, texts, vectors)`` where ``texts`` holds the original
        context strings (newlines intact) and ``vectors`` is whatever
        ``embedder.encode(..., convert_to_numpy=True)`` returned for them.
    """
    ds = load_dataset(dataset_name, split=split)
    print(f"Loaded {len(ds)} examples from {dataset_name}/{split}")
    texts = []
    ids = []
    seen = set()  # only consulted when deduplicate=True
    for i, ex in enumerate(ds):
        if i >= max_examples:
            break
        txt = ex.get("context") or ""
        if not txt.strip():
            continue
        if deduplicate:
            if txt in seen:
                continue
            seen.add(txt)
        ids.append(str(i))
        texts.append(txt)

    embedder = SentenceTransformer(model_name, device='cuda' if DEVICE == 0 else 'cpu')
    print(f"Embedding {len(texts)} contexts with {model_name} ...")
    vectors = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    with open(out_csv, "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "context", "embedding"])
        for id_, ctxt, vec in zip(ids, texts, vectors):
            vec_str = " ".join(map(str, vec.tolist()))
            # Newlines are flattened so every record stays on one physical line.
            writer.writerow([id_, ctxt.replace("\n", " "), vec_str])
    print(f"Wrote embeddings to {out_csv}")
    return out_csv, ids, texts, vectors

def load_embeddings_from_csv(csv_path: str) -> Tuple[List[str], List[str], np.ndarray]:
    """
    Read back a CSV with the columns ``id``, ``context`` and ``embedding``.

    ``embedding`` must hold whitespace-separated floats, which is how
    ``create_context_embeddings`` writes it.

    Returns:
        ``(ids, contexts, vectors)``: two parallel lists of strings and a
        float64 array with one row per CSV record.

    Raises:
        ValueError: if the file has no data rows, an embedding contains a token
            that is not a number, or the rows have differing vector lengths.
        KeyError: if one of the expected columns is missing from the header.
    """
    ids, contexts, vecs = [], [], []
    with open(csv_path, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Parse token by token so a corrupt value fails loudly instead of
            # silently yielding a truncated vector.
            try:
                vec = np.array(
                    [float(tok) for tok in (row["embedding"] or "").split()],
                    dtype=np.float64,
                )
            except ValueError as exc:
                raise ValueError(
                    f"{csv_path}: malformed embedding near line {reader.line_num}"
                ) from exc
            ids.append(row["id"])
            contexts.append(row["context"])
            vecs.append(vec)
    if not vecs:
        raise ValueError(f"{csv_path}: no embedding rows found")
    return ids, contexts, np.vstack(vecs)


def semantic_search(query: str, contexts: List[str], vectors: np.ndarray, embedder: SentenceTransformer, top_k: int = 3):
    """
    Rank the stored contexts by cosine similarity to ``query``.

    Returns a list of ``(index, context_text, similarity)`` tuples ordered from
    most to least similar and cut off after ``top_k`` entries; ``index`` is the
    position of the context in ``contexts`` / ``vectors``.
    """
    query_matrix = embedder.encode([query], convert_to_numpy=True)
    # Only one query was encoded, so the first row holds all of its scores.
    scores = cosine_similarity(query_matrix, vectors)[0]
    best_first = np.argsort(scores)[::-1]
    hits = []
    for pos in best_first[:top_k]:
        pos = int(pos)
        hits.append((pos, contexts[pos], float(scores[pos])))
    return hits


def load_qa_model(model_name: str = "deepset/roberta-base-squad2"):
    """
    Build a "question-answering" pipeline for ``model_name``.

    The fast tokenizer and the question-answering model are both loaded from
    ``model_name``, and the pipeline is placed on the module-level ``DEVICE``.
    """
    return hf_pipeline(
        "question-answering",
        tokenizer=AutoTokenizer.from_pretrained(model_name, use_fast=True),
        model=AutoModelForQuestionAnswering.from_pretrained(model_name),
        device=DEVICE,
    )


def load_translation_model(model_name: str = "Helsinki-NLP/opus-mt-en-fr"):
    """
    Load a Marian tokenizer/model pair for ``model_name``.

    The model is moved to CUDA when ``DEVICE`` is 0 and to the CPU otherwise.
    Returns ``(tokenizer, model)``.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    target = "cuda" if DEVICE == 0 else "cpu"
    model = MarianMTModel.from_pretrained(model_name).to(torch.device(target))
    return tokenizer, model

def translate_en_to_fr(texts: List[str], tokenizer: MarianTokenizer, model: MarianMTModel, max_length: int = 256) -> List[str]:
    """
    Translate a batch of strings with the given Marian tokenizer and model.

    Args:
        texts: strings to translate; an empty list yields an empty list without
            touching the tokenizer or the model.
        tokenizer: tokenizer used both to encode the inputs and decode the output.
        model: model whose ``generate`` produces the translations.
        max_length: cap applied both when truncating the inputs and when
            generating the outputs.

    Returns:
        One decoded string per input, in the same order.
    """
    if not texts:
        # Nothing to translate; skip the tokenizer call on an empty batch.
        return []
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # Move every encoded tensor to the device the model lives on.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    translated = model.generate(**inputs, max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)


class EnglishToFrenchQA:
    """
    Retrieve-then-read question answering with the answer translated to French.

    An English question is matched against pre-embedded contexts, an extractive
    QA pipeline is run on each retrieved context, and the highest-scoring
    answer is translated with the translation model.
    """

    def __init__(self,
                 embedding_csv: str = None,
                 embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
                 qa_model_name: str = "deepset/roberta-base-squad2",
                 mt_model_name: str = "Helsinki-NLP/opus-mt-en-fr"):
        """
        Load the cached context embeddings and the three models.

        Args:
            embedding_csv: path to a CSV readable by ``load_embeddings_from_csv``.
            embedding_model_name: SentenceTransformer used to embed questions.
            qa_model_name: model passed to ``load_qa_model``.
            mt_model_name: model passed to ``load_translation_model``.

        Raises:
            ValueError: if ``embedding_csv`` is not given or does not exist.
        """
        if not embedding_csv or not os.path.exists(embedding_csv):
            raise ValueError("Please provide an embeddings CSV (create with create_context_embeddings).")

        self.ids, self.contexts, self.vectors = load_embeddings_from_csv(embedding_csv)
        self.embedder = SentenceTransformer(embedding_model_name, device='cuda' if DEVICE == 0 else 'cpu')
        self.qa = load_qa_model(qa_model_name)
        self.mt_tok, self.mt_model = load_translation_model(mt_model_name)

    def answer(self, english_question: str, top_k_contexts: int = 3) -> Dict:
        """
        Answer ``english_question`` from the best of the retrieved contexts.

        Args:
            english_question: the question, in English.
            top_k_contexts: how many contexts to retrieve and run the QA model on.

        Returns:
            The winning candidate as a dict with the keys ``context_id``,
            ``context``, ``retrieval_score``, ``answer_en``, ``qa_score`` and
            ``answer_fr`` (empty when no English answer was found).

        Raises:
            ValueError: if retrieval returned no contexts at all.
        """
        top_ctxs = semantic_search(english_question, self.contexts, self.vectors, self.embedder, top_k=top_k_contexts)
        candidates = []
        for idx, ctxt, score in top_ctxs:
            try:
                # `top_k` is the current spelling of the pipeline argument;
                # `topk` is the deprecated one.
                out = self.qa(question=english_question, context=ctxt, top_k=1)
                if isinstance(out, list):
                    out = out[0] if out else {}
                answer_text = out.get("answer", "")
                cand_score = float(out.get("score", 0.0))
            except Exception as exc:
                # Stay best-effort (one bad context must not abort the query),
                # but make the failure visible instead of swallowing it.
                warnings.warn(
                    f"QA model failed on context {idx}: {exc!r}",
                    RuntimeWarning,
                    stacklevel=2,
                )
                answer_text = ""
                cand_score = 0.0
            candidates.append({
                "context_id": idx,
                "context": ctxt,
                "retrieval_score": score,
                "answer_en": answer_text,
                "qa_score": cand_score
            })

        if not candidates:
            raise ValueError("No contexts were retrieved; check top_k_contexts and the embeddings index.")

        # Highest QA confidence wins; ties go to the longer answer.
        best = max(candidates, key=lambda x: (x["qa_score"], len(x["answer_en"])))
        if not best["answer_en"].strip():
            answer_fr = ""
        else:
            answer_fr = translate_en_to_fr([best["answer_en"]], self.mt_tok, self.mt_model)[0]
        best["answer_fr"] = answer_fr
        return best


def _reference_answer_texts(ans_obj) -> List[str]:
    """Normalize a dataset ``answers`` field to a non-empty list of reference strings."""

    def _as_list(value):
        # A lone string is one answer; anything else is treated as a sequence of answers.
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return [str(v) for v in value]

    if isinstance(ans_obj, dict):
        texts = _as_list(ans_obj.get("text"))
    elif isinstance(ans_obj, str):
        texts = [ans_obj]
    elif isinstance(ans_obj, (list, tuple)):
        texts = []
        for item in ans_obj:
            if isinstance(item, dict):
                texts.extend(_as_list(item.get("text")))
            elif isinstance(item, str):
                # Plain list-of-strings answers are references in their own right.
                texts.append(item)
    else:
        texts = []
    # Fall back to a single empty reference when no answer text was found.
    return texts or [""]


def evaluate_on_dataset(qa_system: EnglishToFrenchQA, dataset_name="lucadiliello/newsqa", split="validation", n_examples=200):
    """
    Score the English answers of ``qa_system`` with the "squad" metric.

    Only each example's question is given to ``qa_system.answer``; the system
    answers from the contexts it retrieves itself, not from the example's own
    context.

    Args:
        qa_system: object whose ``answer(question, top_k_contexts=...)`` returns
            a dict containing ``answer_en``.
        dataset_name: dataset identifier passed to ``load_dataset``.
        split: dataset split to evaluate on.
        n_examples: number of leading examples to evaluate.

    Returns:
        Whatever ``metric.compute`` returns for the collected predictions and
        references.
    """
    ds = load_dataset(dataset_name, split=split)
    metric = evaluate.load("squad")
    preds, refs = [], []
    for i, ex in enumerate(ds):
        if i >= n_examples:
            break
        question = (ex.get("question") or "").strip()
        res = qa_system.answer(question, top_k_contexts=3)
        example_id = str(i)
        preds.append({"id": example_id, "prediction_text": res["answer_en"]})
        texts = _reference_answer_texts(ex.get("answers"))
        # Every reference gets a placeholder start offset of 0.
        refs.append({"id": example_id, "answers": {"text": texts, "answer_start": [0] * len(texts)}})

    # preds and refs are appended in lockstep, so their ids already line up.
    return metric.compute(predictions=preds, references=refs)

# Demo driver: build (or reuse) the embeddings cache, then answer two sample questions.
if __name__ == "__main__":
    # NOTE(review): these four placeholders are never read in this cell —
    # confirm no other cell relies on them before removing.
    csv_path, ids, contexts, vectors = None, None, None, None
    emb_csv = "context_embeddings.csv"
    # The cache is rebuilt only when the file is missing, so an existing CSV is
    # reused as-is even if it was created with different settings.
    if not os.path.exists(emb_csv):
        create_context_embeddings(split="validation", max_examples=1000, out_csv=emb_csv)
    qa_system = EnglishToFrenchQA(embedding_csv=emb_csv)
    examples = [
        "What did the mayor say about the new bridge?",
        "Why did the CEO resign?"
    ]
    for q in examples:
        # Retrieve four candidate contexts per question and print the best answer.
        resp = qa_system.answer(q, top_k_contexts=4)
        print("Q:", q)
        print("Retrieved context id:", resp["context_id"])
        print("Answer (EN):", resp["answer_en"])
        print("Answer (FR):", resp["answer_fr"])
        print("---")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/681 [00:00<?, ?B/s]

data/train-00000-of-00001-ec54fbe500fc3b(…):   0%|          | 0.00/29.7M [00:00<?, ?B/s]

data/validation-00000-of-00001-3cf888b12(…):   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74160 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4212 [00:00<?, ? examples/s]

Loaded 4212 examples from lucadiliello/newsqa/validation


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding 1000 contexts with sentence-transformers/all-MiniLM-L6-v2 ...


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Wrote embeddings to context_embeddings.csv


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Device set to use cuda:0


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

Q: What did the mayor say about the new bridge?
Retrieved context id: 952
Answer (EN): does not want taxpayers to pay a penny
Answer (FR): ne veut pas que les contribuables paient un centime
---




Q: Why did the CEO resign?
Retrieved context id: 252
Answer (EN): Company engineers were not able to reproduce the throttle-control problems
Answer (FR): Les ingénieurs de l'entreprise n'ont pas pu reproduire les problèmes de contrôle des gaz
---
