In [1]:
import random
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from datasets import load_dataset



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load a model
from dotenv import load_dotenv
import os
load_dotenv()
hf_token = os.environ["HF_TOKEN"]
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Load the Touche-2020 IR dataset (https://huggingface.co/datasets/BeIR/webis-touche2020, https://huggingface.co/datasets/BeIR/webis-touche2020-qrels)
corpus = load_dataset("BeIR/webis-touche2020", "corpus", split="corpus", token=hf_token)
queries = load_dataset("BeIR/webis-touche2020", "queries", split="queries", token=hf_token)
relevant_docs_data = load_dataset("BeIR/webis-touche2020-qrels", split="test", token=hf_token)

Generating queries split: 100%|██████████| 49/49 [00:00<00:00, 11348.48 examples/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating test split: 100%|██████████| 2214/2214 [00:00<00:00, 81065.27 examples/s]


In [8]:
# For this dataset, we want to concatenate the title and texts for the corpus
corpus = corpus.map(lambda x: {'text': x['title'] + " " + x['text']}, remove_columns=['title'])

Map: 100%|██████████| 382545/382545 [00:11<00:00, 34360.90 examples/s]


In [None]:
# Shrink the corpus size heavily to only the relevant documents + 30,000 random documents
required_corpus_ids = set(map(str, relevant_docs_data["corpus-id"]))
required_corpus_ids |= set(random.sample(corpus["_id"], k=30_000))
corpus = corpus.filter(lambda x: x["_id"] in required_corpus_ids)

NameError: name 'relevant_docs_data' is not defined

In [10]:
# Convert the datasets to dictionaries
corpus = dict(zip(corpus["_id"], corpus["text"]))  # Our corpus (cid => document)
queries = dict(zip(queries["_id"], queries["text"]))  # Our queries (qid => question)
relevant_docs = {}  # Query ID to relevant documents (qid => set([relevant_cids])
for qid, corpus_ids in zip(relevant_docs_data["query-id"], relevant_docs_data["corpus-id"]):
    qid = str(qid)
    corpus_ids = str(corpus_ids)
    if qid not in relevant_docs:
        relevant_docs[qid] = set()
    relevant_docs[qid].add(corpus_ids)

In [11]:
# Given queries, a corpus and a mapping with relevant documents, the InformationRetrievalEvaluator computes different IR metrics.
ir_evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name="BeIR-touche2020-subset-test",
)

In [12]:
results = ir_evaluator(model)
'''
Information Retrieval Evaluation of the model on the BeIR-touche2020-test dataset:
Queries: 49
Corpus: 31923

Score-Function: cosine
Accuracy@1: 77.55%
Accuracy@3: 93.88%
Accuracy@5: 97.96%
Accuracy@10: 100.00%
Precision@1: 77.55%
Precision@3: 72.11%
Precision@5: 71.43%
Precision@10: 62.65%
Recall@1: 1.72%
Recall@3: 4.78%
Recall@5: 7.90%
Recall@10: 13.86%
MRR@10: 0.8580
NDCG@10: 0.6606
MAP@100: 0.2934
'''

'\nInformation Retrieval Evaluation of the model on the BeIR-touche2020-test dataset:\nQueries: 49\nCorpus: 31923\n\nScore-Function: cosine\nAccuracy@1: 77.55%\nAccuracy@3: 93.88%\nAccuracy@5: 97.96%\nAccuracy@10: 100.00%\nPrecision@1: 77.55%\nPrecision@3: 72.11%\nPrecision@5: 71.43%\nPrecision@10: 62.65%\nRecall@1: 1.72%\nRecall@3: 4.78%\nRecall@5: 7.90%\nRecall@10: 13.86%\nMRR@10: 0.8580\nNDCG@10: 0.6606\nMAP@100: 0.2934\n'

In [17]:
print(ir_evaluator.primary_metric)
# => "BeIR-touche2020-test_cosine_map@100"
print(results[ir_evaluator.primary_metric])
# => 0.29335196224364596

BeIR-touche2020-subset-test_cosine_ndcg@10
0.6870439332160302


In [18]:
from pprint import pprint
pprint(results)

{'BeIR-touche2020-subset-test_cosine_accuracy@1': 0.7959183673469388,
 'BeIR-touche2020-subset-test_cosine_accuracy@10': 1.0,
 'BeIR-touche2020-subset-test_cosine_accuracy@3': 0.9387755102040817,
 'BeIR-touche2020-subset-test_cosine_accuracy@5': 0.9795918367346939,
 'BeIR-touche2020-subset-test_cosine_map@100': 0.3073542398434952,
 'BeIR-touche2020-subset-test_cosine_mrr@10': 0.876530612244898,
 'BeIR-touche2020-subset-test_cosine_ndcg@10': 0.6870439332160302,
 'BeIR-touche2020-subset-test_cosine_precision@1': 0.7959183673469388,
 'BeIR-touche2020-subset-test_cosine_precision@10': 0.6551020408163266,
 'BeIR-touche2020-subset-test_cosine_precision@3': 0.7619047619047619,
 'BeIR-touche2020-subset-test_cosine_precision@5': 0.7224489795918365,
 'BeIR-touche2020-subset-test_cosine_recall@1': 0.01762769800967753,
 'BeIR-touche2020-subset-test_cosine_recall@10': 0.14480169875085985,
 'BeIR-touche2020-subset-test_cosine_recall@3': 0.05070635398280529,
 'BeIR-touche2020-subset-test_cosine_recal