In [1]:
import random
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from datasets import load_dataset



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load a model
from dotenv import load_dotenv
import os
load_dotenv()
hf_token = os.environ["HF_TOKEN"]
model = SentenceTransformer("VoVanPhuc/sup-SimCSE-VietNamese-phobert-base", token=hf_token)

No sentence-transformers model found with name VoVanPhuc/sup-SimCSE-VietNamese-phobert-base. Creating a new one with mean pooling.


In [3]:
# Load the Touche-2020 IR dataset (https://huggingface.co/datasets/BeIR/webis-touche2020, https://huggingface.co/datasets/BeIR/webis-touche2020-qrels)
corpus = load_dataset("GreenNode/zalo-ai-legal-text-retrieval-vn", "corpus", split="corpus", token=hf_token)
queries = load_dataset("GreenNode/zalo-ai-legal-text-retrieval-vn", "queries", split="queries", token=hf_token)
relevant_docs_data = load_dataset("GreenNode/zalo-ai-legal-text-retrieval-vn", "default", split="train", token=hf_token)

In [4]:
# For this dataset, we want to concatenate the title and texts for the corpus
corpus = corpus.map(lambda x: {'text': x['title'] + " " + x['text']}, remove_columns=['title'])

In [5]:
# Shrink the corpus size heavily to only the relevant documents + 5,000 random documents
required_corpus_ids = set(map(str, relevant_docs_data["corpus-id"]))
required_corpus_ids |= set(random.sample(corpus["_id"], k=5_000))
corpus = corpus.filter(lambda x: x["_id"] in required_corpus_ids)

Filter: 100%|██████████| 61425/61425 [00:00<00:00, 112906.59 examples/s]


In [6]:
# Convert the datasets to dictionaries
corpus = dict(zip(corpus["_id"], corpus["text"]))  # Our corpus (cid => document)
queries = dict(zip(queries["_id"], queries["text"]))  # Our queries (qid => question)
relevant_docs = {}  # Query ID to relevant documents (qid => set([relevant_cids])
for qid, corpus_ids in zip(relevant_docs_data["query-id"], relevant_docs_data["corpus-id"]):
    qid = str(qid)
    corpus_ids = str(corpus_ids)
    if qid not in relevant_docs:
        relevant_docs[qid] = set()
    relevant_docs[qid].add(corpus_ids)

In [7]:
# Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR metrics.
ir_evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name="VoVanPhuc/sup-SimCSE-VietNamese-phobert-base",
)

In [8]:
corpus_ids = list(corpus.keys())
corpus_texts = list(corpus.values())
from pyvi.ViTokenizer import ViTokenizer

corpus_texts = [ViTokenizer.tokenize(text) for text in corpus_texts]  # Tokenize corpus texts

# Encode trước
corpus_embeddings = model.encode(
    corpus_texts,
    batch_size=64,  # giảm batch size
    convert_to_tensor=True,
    device="cuda"  # hoặc "cpu"
)

In [9]:
# Đánh giá với embedding đã có
results = ir_evaluator(model, corpus_embeddings=corpus_embeddings)

In [10]:
print(ir_evaluator.primary_metric)
# => "BeIR-touche2020-test_cosine_map@100"
print(results[ir_evaluator.primary_metric])
# => 0.29335196224364596

VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_ndcg@10
0.3743944340775536


In [11]:
from pprint import pprint
pprint(results)

{'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_accuracy@1': 0.23560855263157895,
 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_accuracy@10': 0.5444078947368421,
 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_accuracy@3': 0.3762335526315789,
 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_accuracy@5': 0.44819078947368424,
 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_map@100': 0.3326984432520092,
 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_mrr@10': 0.32557011017126175,
 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_ndcg@10': 0.3743944340775536,
 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_precision@1': 0.23560855263157895,
 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_precision@10': 0.05489309210526316,
 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_precision@3': 0.12554824561403508,
 'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base_cosine_precision@5': 0.09004934210526316,
 'VoVanPhuc/sup-SimCSE-VietNamese-phober