In [1]:
def recall_at_k(target, predict, k):
    """Return the fraction of queries whose true item is in the top-k predictions.

    Args:
        target: Sequence of ground-truth items, one per query.
        predict: Sequence of ranked candidate sequences, aligned with
            ``target`` position by position (best candidate first).
        k: Number of leading candidates inspected for each query.

    Returns:
        ``hits / len(target)`` as a float, or ``0.0`` when ``target`` is empty.
    """
    # len() rather than truthiness so NumPy arrays are accepted as well.
    if len(target) == 0:
        return 0.0
    hits = sum(1 for t, preds in zip(target, predict) if t in preds[:k])
    return hits / len(target)

In [2]:
def mrr(target, predict):
    """Return the mean reciprocal rank of the true item in each ranked list.

    Args:
        target: Sequence of ground-truth items, one per query.
        predict: Sequence of ranked candidate iterables, aligned with
            ``target`` position by position (best candidate first).

    Returns:
        The average over all queries of ``1 / rank`` of the first candidate
        equal to the target (rank is 1-based); a query whose target is not
        among its candidates contributes 0. Returns ``0.0`` when ``target``
        is empty.
    """
    # len() rather than truthiness so NumPy arrays are accepted as well.
    if len(target) == 0:
        return 0.0
    rr_total = 0.0
    for t, preds in zip(target, predict):
        # Single pass that needs only iteration, so lists, tuples and NumPy
        # rows all work (no reliance on a list-only ``.index`` method).
        for rank, candidate in enumerate(preds, start=1):
            if candidate == t:
                rr_total += 1.0 / rank
                break
    return rr_total / len(target)

In [3]:
!pip install --upgrade datasets fsspec

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-

In [4]:
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- TF-IDF retrieval baseline ------------------------------------------------
# Every test answer is a retrieval candidate; the correct candidate for test
# query i is the answer stored in the same row i.
TOP_K = 10        # ranking depth kept per query; also the largest k reported
BATCH_SIZE = 256  # number of queries scored per cosine_similarity call

data = load_dataset("sentence-transformers/natural-questions")
dataset = data['train']

# Fixed seed so the 80/20 split is the same on every run.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_data = split_dataset['train']
test_data = split_dataset['test']

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

# The vectorizer is fitted on training queries and answers only; test text is
# only transformed.
vectorizer = TfidfVectorizer()
vectorizer.fit(pd.concat([train_df['query'], train_df['answer']], ignore_index=True))

test_query = test_df['query'].tolist()
test_answers = test_df['answer'].tolist()

query_vecs = vectorizer.transform(test_query)
answer_vecs = vectorizer.transform(test_answers)

# Score queries in batches instead of issuing one cosine_similarity call per
# query: every call works against the full answer matrix, so fewer calls means
# far less repeated work, while each batch's dense score block stays small.
predict_ids = []
for start in range(0, len(test_query), BATCH_SIZE):
    sims = cosine_similarity(query_vecs[start:start + BATCH_SIZE], answer_vecs)
    # Row-wise ascending argsort, reversed to descending, truncated to TOP_K.
    ranked_idx = np.argsort(sims, axis=1)[:, ::-1][:, :TOP_K]
    predict_ids.extend(ranked_idx.tolist())

target_ids = list(range(len(test_query)))

for k in (1, 3, TOP_K):
    print(f'Recall@{k}:', recall_at_k(target_ids, predict_ids, k=k))
print('MRR:', mrr(target_ids, predict_ids))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.28k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/44.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100231 [00:00<?, ? examples/s]

Train size: 80184
Test size: 20047
Recall@1: 0.4143762158926523
Recall@3: 0.6194941886566568
Recall@10: 0.7883972664239038
MRR: 0.5354475712235042


In [5]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

# --- Dense retrieval with a sentence-embedding model ---------------------------
# Same protocol as the other evaluation in this notebook: every test answer is
# a candidate and the correct one for query i sits in row i.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('intfloat/multilingual-e5-base', device=device)

test_query = test_df['query'].tolist()
test_answers = test_df['answer'].tolist()

# Role prefixes prepended to every text before it is encoded.
q_prefix = "query: "
d_prefix = "passage: "

# Embeddings are requested L2-normalized, so the dot product below is the
# cosine similarity.
query_embs = model.encode([q_prefix + q for q in test_query], batch_size=64, convert_to_numpy=True, device=device, normalize_embeddings=True)
answer_embs = model.encode([d_prefix + d for d in test_answers], batch_size=64, convert_to_numpy=True, device=device, normalize_embeddings=True)

# scores[i, j] = similarity between test query i and test answer j.
scores = np.matmul(query_embs, answer_embs.T)

top_k = 10
rank_batch_size = 1024  # rows of `scores` ranked at a time

# Rank a block of rows at a time: negating and arg-sorting the whole matrix in
# one call would allocate two more arrays with as many entries as `scores`,
# only to keep top_k columns. Each row is still sorted exactly as before.
predict_ids = []
for start in range(0, scores.shape[0], rank_batch_size):
    block = scores[start:start + rank_batch_size]
    predict_ids.extend(np.argsort(-block, axis=1)[:, :top_k].tolist())
target_ids = list(range(len(test_query)))

for k in (1, 3, top_k):
    print(f'Recall@{k}:', recall_at_k(target_ids, predict_ids, k=k))
print('MRR:', mrr(target_ids, predict_ids))

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/179k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Recall@1: 0.6933206963635457
Recall@3: 0.8911557839078166
Recall@10: 0.9687234997755275
MRR: 0.7982060609947716
