In [None]:
# # Évaluation de modèles d'embedding sur le corpus Quora
# 
# Ce notebook compare trois modèles de sentence embedding pour la tâche de retrieval de documents.

# %%
import json
import os
import numpy as np
from tqdm import tqdm
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import pytrec_eval

torch.backends.cuda.enable_flash_sdp(False)  # Turn off the flash scaled-dot-product attention backend (advanced GPU optimisation)
torch._dynamo.config.suppress_errors = True  # Ignore compilation errors reported by TorchDynamo


# ## 1. Chargement des données

# File locations inside the dataset directory
DATA_DIR = "quora"
CORPUS_PATH = os.path.join(DATA_DIR, "corpus.jsonl")
QUERIES_PATH = os.path.join(DATA_DIR, "queries.jsonl")
QRELS_TEST_PATH = os.path.join(DATA_DIR, "qrels", "test.tsv")
QRELS_DEV_PATH = os.path.join(DATA_DIR, "qrels", "dev.tsv")

# Load the corpus: one JSON object per line, indexed by its "_id".
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of crashing json.loads.
corpus = {}
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        doc = json.loads(line)
        corpus[doc["_id"]] = {
            "text": doc.get("text", ""),
            "title": doc.get("title", ""),
        }

# Load the queries: one JSON object per line, mapping "_id" to its "text".
queries = {}
with open(QUERIES_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        query = json.loads(line)
        queries[query["_id"]] = query["text"]


# Chargement des qrels (ground truth)
def load_qrels(path):
    """Read a tab-separated relevance file into a nested dict.

    The first line of the file is treated as a header and discarded; the
    three columns are interpreted as query id, corpus id and relevance score.

    Args:
        path: location of the TSV file.

    Returns:
        ``{query_id: {corpus_id: score}}`` with string ids and ``int`` scores.
    """
    # Ids are read as text rather than parsed as numbers and stringified
    # afterwards: numeric parsing drops leading zeros ("007" -> "7") and turns
    # a column containing a missing value into floats ("12" -> "12.0"), which
    # would no longer match the string ids found in the JSONL files.
    df = pd.read_csv(
        path,
        sep="\t",
        header=0,
        names=["query-id", "corpus-id", "score"],
        dtype={"query-id": str, "corpus-id": str},
    )

    qrels = {}
    # Iterating the columns directly avoids building one Series per row.
    for qid, docid, score in zip(df["query-id"], df["corpus-id"], df["score"]):
        qrels.setdefault(str(qid), {})[str(docid)] = int(score)
    return qrels

# Relevance judgements (ground truth) for the test and dev splits.
# NOTE(review): qrels_dev is not referenced anywhere else in the visible code.
qrels_test = load_qrels(QRELS_TEST_PATH)
qrels_dev = load_qrels(QRELS_DEV_PATH)


# ## 2. Définition des modèles d'embedding


# Display name -> checkpoint identifier. process_model hands the identifier to
# EmbeddingModel, which passes it to AutoTokenizer/AutoModel.from_pretrained.
MODELS = {
    "gte-modernbert": "Alibaba-NLP/gte-modernbert-base",
    "distilbert-cos": "sentence-transformers/multi-qa-distilbert-cos-v1",
    "deberta-st": "embedding-data/deberta-sentence-transformer"
}


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class EmbeddingModel:
    """Encode texts into fixed-size vectors with a pretrained transformer.

    The tokenizer and model are loaded with ``from_pretrained``; the model is
    moved to the module-level ``device`` and switched to eval mode. Sentence
    vectors are obtained by mean pooling over the token embeddings.

    NOTE(review): mean pooling is applied to every checkpoint. Some published
    models document a different pooling strategy (e.g. the [CLS] token) —
    confirm against each model card before comparing scores.
    """

    def __init__(self, model_name):
        """Load the tokenizer and encoder identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device)
        self.model.eval()

    def embed(self, texts, batch_size=32):
        """Return one embedding row per input text.

        Args:
            texts: sequence of strings to encode.
            batch_size: number of texts sent through the model at once.

        Returns:
            A 2-D NumPy array with ``len(texts)`` rows. For empty input an
            empty ``(0, hidden_size)`` array is returned, where
            ``hidden_size`` is read from the model config (0 if absent).
        """
        if len(texts) == 0:
            # np.concatenate raises on an empty list, so answer directly.
            hidden_size = getattr(self.model.config, "hidden_size", 0)
            return np.empty((0, hidden_size), dtype=np.float32)

        embeddings = []
        for start in tqdm(range(0, len(texts), batch_size)):
            batch = texts[start:start + batch_size]
            inputs = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            ).to(device)

            # Inference only: no gradients are needed.
            with torch.no_grad():
                outputs = self.model(**inputs)
                pooled = self.mean_pooling(outputs, inputs["attention_mask"])
            embeddings.append(pooled.cpu().numpy())

        return np.concatenate(embeddings)

    def mean_pooling(self, model_output, attention_mask):
        """Average ``last_hidden_state`` over the positions kept by the mask.

        Positions whose mask value is 0 (padding) contribute nothing to the
        sum; the divisor is clamped so an all-zero mask does not divide by 0.
        """
        token_embeddings = model_output.last_hidden_state
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts


# ## 3. Embedding du corpus et des queries


def process_model(model_name, model_path):
    """Encode the whole corpus and every query with one model.

    Args:
        model_name: label stored in the result and printed in the log.
        model_path: identifier handed to ``EmbeddingModel``.

    Returns:
        A dict holding the label, the corpus/query embedding matrices and the
        id lists aligned row-by-row with those matrices.
    """
    print(f"\nProcessing model: {model_name}")

    encoder = EmbeddingModel(model_path)

    # Corpus first: only the "text" field of each document is encoded.
    doc_vectors = encoder.embed([entry["text"] for entry in corpus.values()])
    doc_ids = list(corpus)

    # Then the queries, in the iteration order of the `queries` dict.
    question_vectors = encoder.embed(list(queries.values()))
    question_ids = list(queries)

    output = {"model_name": model_name}
    output["corpus_embeddings"] = doc_vectors
    output["corpus_ids"] = doc_ids
    output["query_embeddings"] = question_vectors
    output["query_ids"] = question_ids
    return output


# Embeddings are deliberately NOT computed here. Section 6 below resets
# `results` and fills it again (from the on-disk cache or by calling
# `process_model`), so an eager loop at this point would encode the whole
# corpus once per model only for the output to be discarded.
results = {}


# ## 4. Recherche et évaluation (version modifiée)


def evaluate_model(results, qrels, top_k=100, batch_size=50):
    """Rank the corpus for every judged query and score it with pytrec_eval.

    Args:
        results: dict with the keys "model_name", "query_ids",
            "query_embeddings", "corpus_ids" and "corpus_embeddings"
            (the shape returned by ``process_model`` / ``load_embeddings``).
        qrels: ``{query_id: {doc_id: relevance}}``; queries absent from this
            mapping are not ranked.
        top_k: number of documents kept per query. Clamped to the corpus
            size so a small corpus does not make the selection fail.
        batch_size: number of queries scored per similarity computation.

    Returns:
        ``{metric: mean value over the evaluated queries}`` for "ndcg" and
        "map", or ``None`` when pytrec_eval raises.

    Raises:
        ValueError: if ``top_k`` is smaller than 1, or if no query could be
            ranked.
    """
    # Local import: reuse scikit-learn's own row normalisation (the same step
    # cosine_similarity performs internally) so the corpus can be normalised
    # once instead of once per batch.
    from sklearn.preprocessing import normalize

    model_name = results["model_name"]
    print(f"\nEvaluating model: {model_name}")

    if top_k < 1:
        # With top_k == 0 the slice [-0:] would silently keep every document.
        raise ValueError(f"top_k doit être >= 1 (reçu: {top_k})")

    query_ids = results["query_ids"]
    query_embeddings = np.asarray(results["query_embeddings"])
    corpus_embeddings = np.asarray(results["corpus_embeddings"])
    corpus_ids = results["corpus_ids"]

    # Sanity check of the matrix dimensions
    print(f"Shape query_embeddings: {query_embeddings.shape}")
    print(f"Shape corpus_embeddings: {corpus_embeddings.shape}")

    # np.argpartition raises when asked for more elements than exist.
    k = min(top_k, corpus_embeddings.shape[0])

    # Only queries with relevance judgements are scored; computing
    # similarities for the others would be wasted work.
    judged = [pos for pos, qid in enumerate(query_ids) if qid in qrels]

    run = {}
    if k > 0 and judged:
        # Loop-invariant: unit-normalise the corpus once.
        corpus_unit = normalize(corpus_embeddings)

        for i in tqdm(range(0, len(judged), batch_size)):
            positions = judged[i:i+batch_size]

            # Best effort, as before: a failing batch is reported and skipped.
            try:
                # Dot product of unit vectors == cosine similarity.
                similarities = normalize(query_embeddings[positions]) @ corpus_unit.T

                for row, pos in enumerate(positions):
                    sim_scores = similarities[row]
                    # Select the k best in O(n), then order just those k.
                    top_indices = np.argpartition(sim_scores, -k)[-k:]
                    top_indices = top_indices[np.argsort(sim_scores[top_indices])[::-1]]

                    run[query_ids[pos]] = {
                        corpus_ids[idx]: float(sim_scores[idx])
                        for idx in top_indices
                    }

            except Exception as e:
                print(f"Erreur sur le batch {i}:{i+batch_size} - {str(e)}")
                continue

    if not run:
        raise ValueError("Aucune query valide trouvée pour l'évaluation")

    try:
        # Restricted to the two main measures.
        evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg", "map"})
        evaluation = evaluator.evaluate(run)

        per_query = list(evaluation.values())
        # Metric names are taken from the first evaluated query.
        return {
            metric: np.mean([scores[metric] for scores in per_query])
            for metric in per_query[0]
        }

    except Exception as e:
        print(f"Erreur dans pytrec_eval: {str(e)}")
        print(f"Exemple de run: {list(run.items())[0]}")
        print(f"Exemple de qrels: {list(qrels.items())[0]}")
        return None


# ## 5. Sauvegarde et chargement des embeddings


def save_embeddings(results, model_name):
    """Write a model's embeddings and id lists to ``embeddings/<model_name>.npz``.

    The ``embeddings`` directory is created when missing. Four arrays are
    stored under the same names they have in ``results``.
    """
    stored_keys = (
        "corpus_embeddings",
        "query_embeddings",
        "corpus_ids",
        "query_ids",
    )
    path = f"embeddings/{model_name}.npz"
    os.makedirs("embeddings", exist_ok=True)
    payload = {key: results[key] for key in stored_keys}
    np.savez(path, **payload)
    print(f"Embeddings sauvegardés pour {model_name} dans {path}")

def load_embeddings(model_name):
    """Load cached embeddings from ``embeddings/<model_name>.npz``.

    Args:
        model_name: label used to build the file name and stored in the
            returned dict.

    Returns:
        A dict with "model_name", the two embedding matrices and the two id
        lists (converted back to Python lists), or ``None`` when the cache
        file does not exist.
    """
    path = f"embeddings/{model_name}.npz"
    if not os.path.exists(path):
        return None

    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once the arrays have been read into memory.
    # NOTE(review): allow_pickle=True lets a tampered cache file execute code
    # on load. It is kept for compatibility with existing caches — consider
    # dropping it if the id arrays are always stored as plain string arrays.
    with np.load(path, allow_pickle=True) as data:
        return {
            "model_name": model_name,
            "corpus_embeddings": data["corpus_embeddings"],
            "query_embeddings": data["query_embeddings"],
            "corpus_ids": data["corpus_ids"].tolist(),
            "query_ids": data["query_ids"].tolist(),
        }

# ## 6. Calcul ou chargement des embeddings

# Fill `results` for every model, preferring embeddings cached on disk and
# encoding (then caching) only when no cache file is found.
results = {}
for name, path in MODELS.items():
    saved_results = load_embeddings(name)

    if saved_results is None:
        # Cache miss: encode now and persist for the next run.
        results[name] = process_model(name, path)
        save_embeddings(results[name], name)
        continue

    # Cache hit
    print(f"Embeddings chargés depuis le cache pour {name}")
    results[name] = saved_results


# ## 7. Affichage des résultats (version simplifiée)

# Evaluate every model on the test split. Models whose evaluation failed
# (evaluate_model returned None) are left out of the table.
all_metrics = {}
for model_name in MODELS:
    metrics = evaluate_model(results[model_name], qrels_test)
    if metrics is not None:
        all_metrics[model_name] = metrics

# Build a dataframe (one row per model) for a readable summary
if all_metrics:
    metrics_df = pd.DataFrame.from_dict(all_metrics, orient="index")
    # Labels are attached by metric name, not by position: with positional
    # assignment a table containing only "map" would be labelled "nDCG@100".
    display_names = {"ndcg": "nDCG@100", "map": "MAP@100"}
    available_metrics = [m for m in display_names if m in metrics_df]
    metrics_df = metrics_df[available_metrics].rename(columns=display_names)

    print("\nRésultats d'évaluation sur le jeu de test:")
    print(metrics_df)

    # Persist the table next to the notebook
    metrics_df.to_csv("quora_embedding_evaluation_results.csv")
    print("Résultats sauvegardés dans quora_embedding_evaluation_results.csv")
else:
    print("Aucune métrique disponible pour l'affichage")


Processing model: gte-modernbert


2025-04-11 13:38:17.748797: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744371497.763667 2317603 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744371497.768223 2317603 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744371497.779890 2317603 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744371497.779900 2317603 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744371497.779902 2317603 computation_placer.cc:177] computation placer alr


Processing model: distilbert-cos


100%|██████████| 16342/16342 [06:34<00:00, 41.45it/s]
100%|██████████| 469/469 [00:08<00:00, 55.33it/s]



Processing model: deberta-st


100%|██████████| 16342/16342 [09:12<00:00, 29.60it/s]
100%|██████████| 469/469 [00:12<00:00, 37.07it/s]



Processing model: gte-modernbert


100%|██████████| 16342/16342 [16:21<00:00, 16.65it/s]
100%|██████████| 469/469 [00:21<00:00, 21.77it/s]


Embeddings sauvegardés pour gte-modernbert dans embeddings/gte-modernbert.npz

Processing model: distilbert-cos


100%|██████████| 16342/16342 [06:34<00:00, 41.47it/s]
100%|██████████| 469/469 [00:08<00:00, 55.77it/s]


Embeddings sauvegardés pour distilbert-cos dans embeddings/distilbert-cos.npz

Processing model: deberta-st


100%|██████████| 16342/16342 [09:11<00:00, 29.63it/s]
100%|██████████| 469/469 [00:12<00:00, 36.95it/s]


Embeddings sauvegardés pour deberta-st dans embeddings/deberta-st.npz

Evaluating model: gte-modernbert
Shape query_embeddings: (15000, 768)
Shape corpus_embeddings: (522931, 768)


100%|██████████| 300/300 [07:47<00:00,  1.56s/it]



Evaluating model: distilbert-cos
Shape query_embeddings: (15000, 768)
Shape corpus_embeddings: (522931, 768)


 58%|█████▊    | 175/300 [04:28<03:17,  1.58s/it]