In [1]:
import random
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from datasets import load_dataset



  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the embedding model that will be evaluated.
from dotenv import load_dotenv
import os
# Pull variables from a .env file into the process environment so the
# Hugging Face token does not have to be hard-coded in the notebook.
load_dotenv()
# Indexing os.environ raises KeyError if HF_TOKEN is not defined.
hf_token = os.environ["HF_TOKEN"]

# hf_token is also passed to the load_dataset calls in the dataset-loading cell.
model = SentenceTransformer("bkai-foundation-models/vietnamese-bi-encoder", token=hf_token)

In [None]:
# Load the Zalo AI legal text retrieval dataset (https://huggingface.co/datasets/GreenNode/zalo-ai-legal-text-retrieval-vn).
# Three configs are read: "corpus" (documents), "queries" (questions) and
# "default", whose "query-id" / "corpus-id" columns are used below as relevance pairs.
corpus = load_dataset("GreenNode/zalo-ai-legal-text-retrieval-vn", "corpus", split="corpus", token=hf_token)
queries = load_dataset("GreenNode/zalo-ai-legal-text-retrieval-vn", "queries", split="queries", token=hf_token)
relevant_docs_data = load_dataset("GreenNode/zalo-ai-legal-text-retrieval-vn", "default", split="train", token=hf_token)

In [None]:
# For this dataset, prepend each document's title to its text so both end up
# in the single "text" field.
# The guard keeps the cell safe to re-run: once "title" has been merged and
# removed, mapping a second time would fail on x["title"].
if "title" in corpus.column_names:
    corpus = corpus.map(
        lambda x: {"text": x["title"] + " " + x["text"]},
        remove_columns=["title"],
    )

In [None]:
# Shrink the corpus heavily: keep every document referenced by the relevance
# pairs plus up to 5,000 randomly sampled additional documents.
required_corpus_ids = set(map(str, relevant_docs_data["corpus-id"]))
# Materialise the id column once so it can be both measured and sampled.
all_corpus_ids = list(corpus["_id"])
# A dedicated, seeded generator makes the sampled subset (and therefore the
# reported metrics) reproducible across kernel restarts. Capping k avoids the
# ValueError random.sample raises when asked for more items than are available.
required_corpus_ids |= set(
    random.Random(42).sample(all_corpus_ids, k=min(5_000, len(all_corpus_ids)))
)
corpus = corpus.filter(lambda x: x["_id"] in required_corpus_ids)

Filter: 100%|██████████| 61425/61425 [00:00<00:00, 118351.12 examples/s]


In [None]:
# Turn the loaded datasets into the plain dictionaries handed to the evaluator.
# NOTE: `corpus` and `queries` are rebound from datasets to dicts here.
corpus = dict(zip(corpus["_id"], corpus["text"]))  # cid => document text
queries = dict(zip(queries["_id"], queries["text"]))  # qid => question text

# qid => set of relevant cids; both ids are converted to strings before use.
relevant_docs = {}
for raw_qid, raw_cid in zip(relevant_docs_data["query-id"], relevant_docs_data["corpus-id"]):
    relevant_docs.setdefault(str(raw_qid), set()).add(str(raw_cid))

In [None]:
# Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR metrics.
# `name` becomes the prefix of every key in the returned metrics dict
# (for example "<name>_cosine_ndcg@10").
ir_evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name="bkai-foundation-models/vietnamese-bi-encoder",
)

In [None]:
# Parallel lists: dicts preserve insertion order, so corpus_ids[i] is the id
# of corpus_texts[i].
corpus_ids = list(corpus.keys())
corpus_texts = list(corpus.values())

# Encode the corpus up front so the embeddings can be handed to the evaluator.
# No explicit `device` is passed: encode() then runs on whatever device the
# model was loaded on, so the cell also works on machines without CUDA.
corpus_embeddings = model.encode(
    corpus_texts,
    batch_size=64,  # reduced batch size
    convert_to_tensor=True,
)

In [None]:
# Evaluate using the corpus embeddings that were already computed
results = ir_evaluator(model, corpus_embeddings=corpus_embeddings)

In [None]:
# Alternative, kept for reference: call the evaluator without precomputed corpus embeddings.
# results = ir_evaluator(model)


In [None]:
# The primary metric key is prefixed with the evaluator's `name`.
print(ir_evaluator.primary_metric)
# => "bkai-foundation-models/vietnamese-bi-encoder_cosine_ndcg@10"
print(results[ir_evaluator.primary_metric])
# => 0.7939582040887408

bkai-foundation-models/vietnamese-bi-encoder_cosine_ndcg@10
0.7939582040887408


In [None]:
# Pretty-print the full metrics dictionary returned by the evaluator.
from pprint import pprint
pprint(results)

{'bkai-foundation-models/vietnamese-bi-encoder_cosine_accuracy@1': 0.6928453947368421,
 'bkai-foundation-models/vietnamese-bi-encoder_cosine_accuracy@10': 0.8992598684210527,
 'bkai-foundation-models/vietnamese-bi-encoder_cosine_accuracy@3': 0.8207236842105263,
 'bkai-foundation-models/vietnamese-bi-encoder_cosine_accuracy@5': 0.8618421052631579,
 'bkai-foundation-models/vietnamese-bi-encoder_cosine_map@100': 0.7632307081557627,
 'bkai-foundation-models/vietnamese-bi-encoder_cosine_mrr@10': 0.765118199143692,
 'bkai-foundation-models/vietnamese-bi-encoder_cosine_ndcg@10': 0.7939582040887408,
 'bkai-foundation-models/vietnamese-bi-encoder_cosine_precision@1': 0.6928453947368421,
 'bkai-foundation-models/vietnamese-bi-encoder_cosine_precision@10': 0.09152960526315788,
 'bkai-foundation-models/vietnamese-bi-encoder_cosine_precision@3': 0.27631578947368424,
 'bkai-foundation-models/vietnamese-bi-encoder_cosine_precision@5': 0.17483552631578944,
 'bkai-foundation-models/vietnamese-bi-encode