In [1]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [3]:
from tqdm.auto import tqdm

In [4]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [4]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x77f365f43e30>

In [5]:
def minsearch_search(query, course):
    boost = {'question': 1.5, 'section': 0.1}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [6]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:14<00:00, 310.27it/s]


In [8]:
hit_rate(relevance_total)

0.848714069591528

In [7]:
from minsearch import VectorSearch

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [26]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [27]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x77f36567c860>

In [14]:
def minsearch_vector(query, course):
    boost = {'question': 1.0}

    results = vindex.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [28]:
def minsearch_vector(query, course):
    #boost = {'question': 1.0}
    
    # Transform the query into the same vector space
    query_vec = pipeline.transform([query])
    
    results = vindex.search(
        query_vec,
        filter_dict={'course': course},
        #boost_dict=boost,
        num_results=5
    )
    return results


In [29]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_vector(query=q['question'],course=q['course'])
    relevance = [d['question'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:06<00:00, 705.21it/s]


In [30]:
mrr(relevance_total)

0.0

In [17]:
len(q['question'])

27

In [32]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_vector(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:06<00:00, 674.58it/s]


In [33]:
mrr(relevance_total)

0.3571284489590088

In [34]:
texts_qA = [f"{d['question']} {d['text']}" for d in documents]

In [35]:
pipeline_qA = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X_qA = pipeline_qA.fit_transform(texts_qA)

In [37]:
#Re-index with minsearch, keeping the order aligned with `documents`
from minsearch import VectorSearch
vindex_qA = VectorSearch(keyword_fields={'course'})
vindex_qA.fit(X_qA, documents)

<minsearch.vector.VectorSearch at 0x77f365670950>

In [38]:
#Vector search function (query gets vectorized with the *qA* pipeline)
def minsearch_vector_qA(query, course):
    qvec = pipeline_qA.transform([query])
    return vindex_qA.search(
        qvec,
        filter_dict={'course': course},
        num_results=5
    )


In [39]:
# Build relevance matrix using ID-to-ID comparison
relevance_total_qA = []
for q in tqdm(ground_truth):
    doc_id = str(q['document'])
    results = minsearch_vector_qA(q['question'], q['course'])
    row = [str(d['id']) == doc_id for d in results]
    relevance_total_qA.append(row)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:07<00:00, 635.03it/s]


In [40]:
# Your hit rate function works as-is
def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt += 1
    return cnt / len(relevance_total)

hitrate_qA = hit_rate(relevance_total_qA)
print("Hit rate (Q + A vectors):", hitrate_qA)

Hit rate (Q + A vectors): 0.8210503566025502


In [41]:
from sentence_transformers import SentenceTransformer

ModuleNotFoundError: No module named 'sentence_transformers'

In [42]:
%pip install -U sentence-transformers qdrant-client

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Downloading sentence_transformers-5.1.0-py3-none-any.whl (483 kB)
Downloading transformers-4.55.0-py3-none-any.whl (11.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (485 kB)
Installing collected packages: safetensors, transformers, sentence-transformers
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [sentence-transformers] [32m2/3[0m [

In [43]:
%pip install -U torch --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (29 kB)
Downloading https://download.pytorch.org/whl/cpu/torch-2.8.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl (183.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m[36m0:00:01[0m:01[0m
[?25hInstalling collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 2.7.1+cpu
    Uninstalling torch-2.7.1+cpu:
      Successfully uninstalled torch-2.7.1+cpu
Successfully installed torch-2.8.0+cpu

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you m

In [6]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from tqdm import tqdm

In [None]:
# Model + data
model_handle = "jinaai/jina-embeddings-v2-small-en"
model = SentenceTransformer(model_handle, trust_remote_code=True)

texts_qA = [f"{d['question']} {d['text']}" for d in documents]
embs = model.encode(texts_qA, normalize_embeddings=True)

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [7]:
5+5

10

In [1]:
# Imports (run this in the same session where you’ll use Qdrant)
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue

# Model (Q+A embeddings)
model = SentenceTransformer("jinaai/jina-embeddings-v2-small-en", trust_remote_code=True)

# Qdrant client — pick ONE of these:
# If you have a Qdrant server running:
# client = QdrantClient(host="localhost", port=6333)

# If you DON'T have a server, use embedded/in-memory:
client = QdrantClient(":memory:")  # or client = QdrantClient(path=":memory:")

# Create/recreate collection (512-dim for this model, cosine distance)
COLL = "qa_qdrant_jina_small"
client.recreate_collection(
    collection_name=COLL,
    vectors_config=VectorParams(size=512, distance=Distance.COSINE),
)

points = [
    PointStruct(
        id=str(doc["id"]),
        vector=embs[i].tolist(),
        payload={"id": str(doc["id"]), "course": doc["course"], "question": doc["question"], "text": doc["text"]},
    )
    for i, doc in enumerate(documents)
]
client.upsert(collection_name=COLL, points=points)

  from .autonotebook import tqdm as notebook_tqdm
  client.recreate_collection(


NameError: name 'documents' is not defined

In [None]:
# Qdrant collection (512-d cosine for this model)
client = QdrantClient(host="localhost", port=6333)
COLL = "qa_qdrant_jina_small"
client.recreate_collection(
    collection_name=COLL,
    vectors_config=VectorParams(size=512, distance=Distance.COSINE),
)

points = [
    PointStruct(
        id=str(doc["id"]),
        vector=embs[i].tolist(),
        payload={"id": str(doc["id"]), "course": doc["course"], "question": doc["question"], "text": doc["text"]},
    )
    for i, doc in enumerate(documents)
]
client.upsert(collection_name=COLL, points=points)

def qdrant_search(question, course, limit=5):
    qvec = model.encode([question], normalize_embeddings=True)[0]
    flt = Filter(must=[FieldCondition(key="course", match=MatchValue(value=course))])
    return client.search(COLL, query_vector=qvec.tolist(), query_filter=flt, with_payload=True, limit=5)

In [None]:
# --- Imports ---
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from tqdm import tqdm
import numpy as np

# --- Sanity checks (expects you already have these) ---
# - documents: list of dicts with keys: id, question, text, course
# - ground_truth: list of dicts with keys: question, course, document (the id of the correct doc)
assert isinstance(documents, list) and len(documents) > 0, "Expected non-empty `documents` list."
assert isinstance(ground_truth, list) and len(ground_truth) > 0, "Expected non-empty `ground_truth` list."

# --- Helper: normalize course values (avoids case/whitespace mismatches) ---
def norm_course(x):
    return (x or "").strip().lower()

# Ensure every doc has a normalized course for filtering
for d in documents:
    d["course_norm"] = norm_course(d.get("course", ""))

# --- Build texts: question + answer ---
texts_qA = [f"{d['question']} {d['text']}" for d in documents]

# --- Load embedding model ---
model_handle = "jinaai/jina-embeddings-v2-small-en"
model = SentenceTransformer(model_handle, trust_remote_code=True)

# --- Encode corpus (normalize for cosine similarity) ---
embs = model.encode(texts_qA, normalize_embeddings=True)
embs = np.asarray(embs)
vector_size = embs.shape[1]  # should be 512 for this model


In [None]:
# --- Qdrant: use in-memory client (switch to host/port if you have a running server) ---
client = QdrantClient(":memory:")  # or QdrantClient(host="localhost", port=6333)

COLL = "qa_qdrant_jina_small"
client.recreate_collection(
    collection_name=COLL,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)

# Optional: payload index for faster filtering by course
# client.create_payload_index(collection_name=COLL, field_name="course_norm", field_schema="keyword")

# --- Upsert points (store your doc id as the point id; keep id types consistent) ---
points = []
for i, doc in enumerate(documents):
    pid = str(doc["id"])
    payload = {
        "id": pid,
        "question": doc["question"],
        "text": doc["text"],
        "course": doc["course"],
        "course_norm": doc["course_norm"],
    }
    points.append(PointStruct(id=pid, vector=embs[i].tolist(), payload=payload))

client.upsert(collection_name=COLL, points=points)

In [None]:
# --- Search helper: encode query, filter by normalized course, limit=5 ---
def qdrant_search(question, course, limit=5):
    qvec = model.encode([question], normalize_embeddings=True)[0].tolist()
    flt = Filter(must=[FieldCondition(key="course_norm", match=MatchValue(value=norm_course(course)))])
    hits = client.search(
        collection_name=COLL,
        query_vector=qvec,
        query_filter=flt,
        with_payload=True,
        limit=limit,
    )
    return hits  # list of ScoredPoint (.id, .score, .payload)

# --- Metric: Mean Reciprocal Rank (MRR) for top-5 ---
def mrr_qdrant(ground_truth, limit=5):
    total = 0.0
    for q in tqdm(ground_truth):
        gold_id = str(q["document"])
        hits = qdrant_search(q["question"], q["course"], limit=limit)
        rank = next((i + 1 for i, h in enumerate(hits) if str(h.id) == gold_id), None)
        total += 0.0 if rank is None else 1.0 / rank
    return total / len(ground_truth)

# --- Compute & print MRR (limit=5 as requested) ---
LIMIT = 5
mrr_value = mrr_qdrant(ground_truth, limit=LIMIT)
print(f"MRR (Qdrant + {model_handle}, question+answer, top-{LIMIT}): {mrr_value:.6f}")
