In [1]:
pip install -U minsearch qdrant_client scikit-learn rouge tqdm pandas requests sentence-transformers


Collecting minsearch
  Downloading minsearch-0.0.4-py3-none-any.whl.metadata (8.1 kB)
Collecting qdrant_client
  Downloading qdrant_client-1.15.0-py3-none-any.whl.metadata (11 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting pandas
  Downloading pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant_client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from t

In [2]:
pip install jina

Collecting jina
  Downloading jina-3.34.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (23 kB)
Collecting opentelemetry-instrumentation-grpc>=0.35b0 (from jina)
  Downloading opentelemetry_instrumentation_grpc-0.56b0-py3-none-any.whl.metadata (1.9 kB)
Collecting docarray>=0.16.4 (from jina)
  Downloading docarray-0.41.0-py3-none-any.whl.metadata (36 kB)
Collecting pathspec (from jina)
  Downloading pathspec-0.12.1-py3-none-any.whl.metadata (21 kB)
Collecting opentelemetry-exporter-prometheus>=0.33b0 (from jina)
  Downloading opentelemetry_exporter_prometheus-0.56b0-py3-none-any.whl.metadata (1.8 kB)
Collecting jcloud>=0.0.35 (from jina)
  Downloading jcloud-0.3.tar.gz (39 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting opentelemetry-instrumentation-fastapi>=0.33b0 (from jina)
  Downloading

In [3]:
pip install qdrant-client



In [4]:
import requests
import pandas as pd

# Base URL of the evaluation data in the course repository.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# FAQ documents, parsed from JSON. Later cells read the `id`, `course`,
# `question`, `text` and `section` keys of each record.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=60)
# Fail here on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth queries as a list of dicts. Later cells read the `question`,
# `course` and `document` keys of each record.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')


In [5]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: one sequence of relevance flags per query.

    Returns:
        The number of flag sequences containing ``True`` divided by the
        number of sequences.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    queries_with_hit = sum(1 for flags in relevance_total if True in flags)
    return queries_with_hit / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    Each query contributes ``1 / rank`` of its first relevant result (ranks
    start at 1), or 0 when none of its results is relevant.

    Args:
        relevance_total: one sequence of relevance flags per query, ordered
            from the top-ranked result downwards.

    Returns:
        The sum of the per-query reciprocal ranks divided by the number of
        queries.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without stopping here a
                # list with several relevant entries would score above 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth queries.

    Args:
        ground_truth: iterable of query records; each record's ``'document'``
            entry is the id of the expected document.
        search_function: callable that takes one query record and returns an
            iterable of results, each exposing an ``'id'`` entry.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    def relevance_flags(record):
        # One flag per returned result: does it match the expected document?
        expected_id = record['document']
        return [hit['id'] == expected_id for hit in search_function(record)]

    per_query_flags = [relevance_flags(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(per_query_flags),
        'mrr': mrr(per_query_flags),
    }

In [6]:
import minsearch
from tqdm.auto import tqdm



# minsearch index over the FAQ documents: `question`, `text` and `section` are
# registered as text fields, `course` and `id` as keyword fields.
index = minsearch.Index(
    keyword_fields=["course", "id"],
    text_fields=["question", "text", "section"],
)
index.fit(documents)

def q1_search(q):
    """Query the minsearch `index` for one ground-truth record.

    Uses the record's question as the query, filters on the record's course,
    boosts the `question` field (1.5) and `section` field (0.1), and asks
    for 5 results.
    """
    question = q["question"]
    course_filter = {"course": q["course"]}
    field_boosts = {"question": 1.5, "section": 0.1}
    return index.search(
        query=question,
        filter_dict=course_filter,
        boost_dict=field_boosts,
        num_results=5,
    )

# For every ground-truth query, flag which returned documents match the
# expected document id, then report the hit rate.
relevance = [
    [doc["id"] == q["document"] for doc in q1_search(q)]
    for q in tqdm(ground_truth)
]

print("Q1 Hit Rate:", hit_rate(relevance))



  0%|          | 0/4627 [00:00<?, ?it/s]

Q1 Hit Rate: 0.848714069591528


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from minsearch import VectorSearch

# Embed every FAQ question with TF-IDF followed by a 128-component truncated SVD.
texts = [doc["question"] for doc in documents]
pipe = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
Xq = pipe.fit_transform(texts)

# Vector index over those embeddings, with `course` as a keyword field.
vindex = VectorSearch(keyword_fields={"course"})
vindex.fit(Xq, documents)

def q2_search(q):
    """Embed the record's question with `pipe` and query `vindex`.

    The search is filtered on the record's course and asks for 5 results.
    """
    query_vector = pipe.transform([q["question"]])
    course_filter = {"course": q["course"]}
    return vindex.search(query_vector, filter_dict=course_filter, num_results=5)

# Flag, per ground-truth query, which returned documents match the expected id.
relevance = []
for q in tqdm(ground_truth):
    relevance.append([doc["id"] == q["document"] for doc in q2_search(q)])

print("Q2 MRR:", mrr(relevance))

  0%|          | 0/4627 [00:00<?, ?it/s]

Q2 MRR: 0.3572833369353793


In [8]:
# Embed question + answer text with the same TF-IDF/SVD recipe as the
# question-only pipeline, but in a separate pipeline object: refitting `pipe`
# in place would silently change what `q2_search` computes afterwards.
texts2 = [d["question"] + " " + d["text"] for d in documents]
pipe2 = make_pipeline(TfidfVectorizer(min_df=3), TruncatedSVD(n_components=128, random_state=1))
Xqt = pipe2.fit_transform(texts2)

vindex2 = VectorSearch(keyword_fields={"course"})
vindex2.fit(Xqt, documents)

def q3_search(q):
    """Embed the record's question with `pipe2` and query `vindex2`.

    The search is filtered on the record's course and asks for 5 results.
    """
    vq = pipe2.transform([q["question"]])
    return vindex2.search(vq, filter_dict={"course": q["course"]}, num_results=5)

# Flag, per ground-truth query, which returned documents match the expected id.
relevance = [[d["id"] == q["document"] for d in q3_search(q)] for q in tqdm(ground_truth)]
print("Q3 Hit Rate:", hit_rate(relevance))

  0%|          | 0/4627 [00:00<?, ?it/s]

Q3 Hit Rate: 0.8210503566025502


In [15]:

# Qdrant client and models
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PointStruct, Filter, FieldCondition, MatchValue

# Embedding model
from sentence_transformers import SentenceTransformer

# Initialize in-memory Qdrant
client = QdrantClient(":memory:")

# Load the Jina embedding model
# NOTE(review): trust_remote_code=True lets code shipped with the model
# repository run locally — confirm the source is trusted.
model_handle = "jinaai/jina-embeddings-v2-small-en"
model = SentenceTransformer(model_handle, trust_remote_code=True)

# (Re)create the collection with cosine distance.
# `recreate_collection` is deprecated in qdrant-client, so any existing
# collection is dropped explicitly before creating a fresh one; the cell can
# still be re-run safely.
dim = model.get_sentence_embedding_dimension()
if client.collection_exists("faq"):
    client.delete_collection("faq")
client.create_collection(
    collection_name="faq",
    vectors_config=VectorParams(size=dim, distance=Distance.COSINE)
)

# Prepare texts for indexing (question + text)
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Encode all at once (efficient batching)
embeddings = model.encode(texts, show_progress_bar=True)

# Upsert points into Qdrant.
# The hex document id is converted to an integer point id; the original id
# string is kept in the payload as `doc_id` so search results can be mapped
# back to documents.
points = [
    PointStruct(
        id=int(doc["id"], 16),
        vector=emb.tolist(),
        payload={"doc_id": doc["id"], "course": doc["course"]}
    )
    for doc, emb in zip(documents, embeddings)
]
client.upsert(collection_name="faq", points=points)

def q4_search(q):
    """Search the `faq` Qdrant collection for one ground-truth record.

    Embeds the record's question with `model`, restricts the search to points
    whose `course` payload equals the record's course, and returns up to 5
    hits as dicts of the form ``{"id": <doc_id from the point payload>}``.
    """
    # Use only the question as query (avoiding perfect self-match)
    qvec = model.encode([q['question']])[0]
    filt = Filter(must=[FieldCondition(key="course", match=MatchValue(value=q['course']))])
    # `client.search` is deprecated (this notebook's recorded output shows a
    # warning on that call); `query_points` is its replacement and wraps the
    # scored hits in `.points`.
    response = client.query_points(
        collection_name="faq",
        query=qvec.tolist(),
        limit=5,
        query_filter=filt
    )
    # Extract hex IDs from payload
    return [{"id": hit.payload["doc_id"]} for hit in response.points]

def hit_rate(rels):
    """Return the fraction of flag lists that contain at least one truthy flag.

    NOTE: this rebinds the `hit_rate` defined earlier in this notebook.

    Raises:
        ZeroDivisionError: if ``rels`` is empty.
    """
    queries_with_hit = 0
    for flags in rels:
        if any(flags):
            queries_with_hit += 1
    return queries_with_hit / len(rels)

def mrr(rels):
    """Return the mean reciprocal rank of the first truthy flag per list.

    A list with no truthy flag contributes 0. Ranks start at 1.

    NOTE: this rebinds the `mrr` defined earlier in this notebook.

    Raises:
        ZeroDivisionError: if ``rels`` is empty.
    """
    score = 0
    for flags in rels:
        # 1-based position of the first relevant result, if there is one.
        first_hit = next((pos for pos, ok in enumerate(flags, 1) if ok), None)
        if first_hit is not None:
            score += 1 / first_hit
    return score / len(rels)

# For every ground-truth query, flag which returned ids match the expected document
relevance = []
for q in tqdm(ground_truth, desc="Evaluating Q4"):
    relevance.append([d["id"] == q["document"] for d in q4_search(q)])

print("Q4 MRR:", round(mrr(relevance), 2))


  client.recreate_collection(


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluating Q4:   0%|          | 0/4627 [00:00<?, ?it/s]

  hits = client.search(


Q4 MRR: 0.85


In [18]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Load the evaluation results from gpt-4o-mini
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

# Create the TF-IDF + SVD embedding pipeline
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
# Fit on the frame loaded above. This previously referenced `df`, a name no
# cell in this notebook defines, so a fresh kernel raised NameError here.
pipeline.fit(
    df_results['answer_llm'] + ' ' + df_results['answer_orig'] + ' ' + df_results['question']
)

# Define cosine similarity using the formula u·v / (|u| * |v|)
def cosine(u, v):
    """Return the cosine similarity of two vectors.

    Both arguments must support ``.dot``. A zero vector makes the
    denominator 0, so the result is undefined in that case.
    """
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

# Embed each answer column in a single batch (one transform call per column
# instead of two per row), then compare the vectors row by row.
v_llm_all = pipeline.transform(df_results['answer_llm'])
v_orig_all = pipeline.transform(df_results['answer_orig'])
cos_similarities = [
    cosine(v_llm, v_orig)
    for v_llm, v_orig in zip(v_llm_all, v_orig_all)
]

# Calculate and display the average cosine similarity
avg_cos = sum(cos_similarities) / len(cos_similarities)
print("Q5 – Average A→Q→A Cosine Similarity:", round(avg_cos, 2))

Q5 – Average A→Q→A Cosine Similarity: 0.84


In [19]:
from rouge import Rouge

# One ROUGE scorer shared by all rows
rouge_scorer = Rouge()

# ROUGE-1 F1 for each (LLM answer, original answer) pair
f1_scores = []
for llm_answer, orig_answer in zip(df_results['answer_llm'], df_results['answer_orig']):
    pair_scores = rouge_scorer.get_scores(llm_answer, orig_answer)[0]
    f1_scores.append(pair_scores['rouge-1']['f'])

# Mean ROUGE-1 F1 over all pairs
avg_f1 = sum(f1_scores) / len(f1_scores)
print("Q6 – Average ROUGE‑1 F1:", round(avg_f1, 2))

Q6 – Average ROUGE‑1 F1: 0.35
