In [1]:
import random
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from datasets import load_dataset



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load a model
from dotenv import load_dotenv
import os
load_dotenv()
hf_token = os.environ["HF_TOKEN"]
model = SentenceTransformer("dangvantuan/vietnamese-document-embedding", trust_remote_code=True, token=hf_token)

In [None]:
# Load the Touche-2020 IR dataset (https://huggingface.co/datasets/BeIR/webis-touche2020, https://huggingface.co/datasets/BeIR/webis-touche2020-qrels)
corpus = load_dataset("thecuong/medical-data-bk", "corpus", split="train", token=hf_token)
queries = load_dataset("thecuong/medical-data-bk", "queries", split="train", token=hf_token)
relevant_docs_data = load_dataset("thecuong/medical-data-bk", "relevant_docs", split="train", token=hf_token)

In [4]:
# For this dataset, we want to concatenate the title and texts for the corpus
# corpus = corpus.map(lambda x: {'text': x['title'] + " " + x['text']}, remove_columns=['title'])

In [5]:
# Shrink the corpus size heavily to only the relevant documents + 5,000 random documents
required_corpus_ids = set(map(str, relevant_docs_data["article_id"]))
required_corpus_ids |= set(random.sample(corpus["article_id"], k=1_000))
corpus = corpus.filter(lambda x: x["article_id"] in required_corpus_ids)

In [6]:
# Convert the datasets to dictionaries
corpus = dict(zip(corpus["article_id"], corpus["article"]))  # Our corpus (cid => document)
queries = dict(zip(queries["question_id"], queries["question"]))  # Our queries (qid => question)
relevant_docs = {}  # Query ID to relevant documents (qid => set([relevant_cids])
for qid, corpus_ids in zip(relevant_docs_data["question_id"], relevant_docs_data["article_id"]):
    qid = str(qid)
    corpus_ids = str(corpus_ids)
    if qid not in relevant_docs:
        relevant_docs[qid] = set()
    relevant_docs[qid].add(corpus_ids)

In [7]:
# Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR metrics.
ir_evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name="dangvantuan/vietnamese-document-embedding",
)

In [8]:
corpus_ids = list(corpus.keys())
corpus_texts = list(corpus.values())

# Encode trước
corpus_embeddings = model.encode(
    corpus_texts,
    batch_size=2,  # giảm batch size
    convert_to_tensor=True,
    device="cuda"  # hoặc "cpu"
)

In [9]:
# Đánh giá với embedding đã có
results = ir_evaluator(model, corpus_embeddings=corpus_embeddings)

In [10]:
# results = ir_evaluator(model)


In [11]:
print(ir_evaluator.primary_metric)
# => "BeIR-touche2020-test_cosine_map@100"
print(results[ir_evaluator.primary_metric])
# => 0.29335196224364596

dangvantuan/vietnamese-document-embedding_cosine_ndcg@10
0.5979423799936066


In [12]:
from pprint import pprint
pprint(results)

{'dangvantuan/vietnamese-document-embedding_cosine_accuracy@1': 0.4357841634382229,
 'dangvantuan/vietnamese-document-embedding_cosine_accuracy@10': 0.7613088436132557,
 'dangvantuan/vietnamese-document-embedding_cosine_accuracy@3': 0.6282226371972462,
 'dangvantuan/vietnamese-document-embedding_cosine_accuracy@5': 0.6946960617631484,
 'dangvantuan/vietnamese-document-embedding_cosine_map@100': 0.5513950852657435,
 'dangvantuan/vietnamese-document-embedding_cosine_mrr@10': 0.545638631299083,
 'dangvantuan/vietnamese-document-embedding_cosine_ndcg@10': 0.5979423799936066,
 'dangvantuan/vietnamese-document-embedding_cosine_precision@1': 0.4357841634382229,
 'dangvantuan/vietnamese-document-embedding_cosine_precision@10': 0.07613088436132556,
 'dangvantuan/vietnamese-document-embedding_cosine_precision@3': 0.20940754573241543,
 'dangvantuan/vietnamese-document-embedding_cosine_precision@5': 0.1389392123526297,
 'dangvantuan/vietnamese-document-embedding_cosine_recall@1': 0.435784163438222