In [1]:
%pip install -U minsearch qdrant_client

Collecting minsearch
  Downloading minsearch-0.0.4-py3-none-any.whl.metadata (8.1 kB)
Collecting pandas (from minsearch)
  Downloading pandas-2.3.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting scikit-learn (from minsearch)
  Downloading scikit_learn-1.7.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting pytz>=2020.1 (from pandas->minsearch)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas->minsearch)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.8.0 (from scikit-learn->minsearch)
  Downloading scipy-1.16.0-cp311-cp311-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn->minsearch)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->minsearch)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading minsearch-0.0.4-py3-none-any.whl (11 kB)
Downloadi

In [1]:
# Import required libraries for HTTP requests and data manipulation
import requests
import pandas as pd

# Define the base URL for the dataset
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Download the documents JSON file from the remote repository
# 'verify=False' disables SSL certificate verification (useful for some environments)
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url, verify=False).json()

# Import StringIO to read CSV content from a string
from io import StringIO

# Download the ground truth CSV file from the remote repository
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
csv_content = requests.get(ground_truth_url, verify=False).text

# Read the CSV content into a pandas DataFrame
df_ground_truth = pd.read_csv(StringIO(csv_content))

# Convert the DataFrame to a list of dictionaries for easier processing
ground_truth = df_ground_truth.to_dict(orient='records')



In [2]:
# Import tqdm for progress bars
from tqdm.auto import tqdm

# Calculate hit rate: proportion of queries where the correct document is found in results
def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt = cnt + 1
    return cnt / len(relevance_total)

# Calculate Mean Reciprocal Rank (MRR): average reciprocal rank of the correct document in results
def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)
    return total_score / len(relevance_total)

# Evaluate search function using ground truth data
def evaluate(ground_truth, search_function):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']  # Ground truth document ID
        results = search_function(q)  # Search results for the query
        relevance = [d['id'] == doc_id for d in results]  # List of booleans: True if result matches ground truth
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
## Q1. Minsearch text
boost = {'question': 1.5, 'section': 0.1}

# Evaluate minsearch with custom boosting parameters
from minsearch import Index

index = Index(
    text_fields=["question", "text", "section"],   # Fields used for full-text search
    keyword_fields=[]                      # Fields used for exact filters
)
index.fit(documents)

def search_function(query):
    results = index.search(
        query['question'],
        boost_dict=boost,
        filter_dict={'course': query['course']}
    )
    return results

# Calculate hit rate and MRR for this approach
results = evaluate(ground_truth, search_function)
print(f"Hit rate: {results['hit_rate']:.3f}")
print(f"MRR: {results['mrr']:.3f}")

100%|██████████| 4627/4627 [00:04<00:00, 1005.47it/s]

Hit rate: 0.860
MRR: 0.690





In [19]:
# Import VectorSearch from minsearch and required sklearn modules
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Prepare a list of questions from the documents for vectorization
texts = []
for doc in documents:
    t = doc['question']  # Extract the question field
    texts.append(t)

# Create a pipeline: TF-IDF vectorizer followed by dimensionality reduction (SVD)
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),           # Vectorize text, ignore terms with low frequency
    TruncatedSVD(n_components=128, random_state=1)  # Reduce to 128 dimensions
)
X = pipeline.fit_transform(texts)  # Fit and transform the text data

In [20]:
## Q2. Vector search for question

# Create a VectorSearch index using the reduced-dimension question vectors and documents
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

# Define a search function for evaluation
# For each query, transform the question using the same pipeline and search with vindex

def search_function(q):
    query_vec = pipeline.transform([q['question']])
    return vindex.search(query_vec, filter_dict={'course': q['course']})  # Filter by course if needed

# Evaluate the vector search method and print the MRR
results = evaluate(ground_truth, search_function)
print(f"Hit rate: {results['hit_rate']:.3f}")
print(f"MRR: {results['mrr']:.3f}")

100%|██████████| 4627/4627 [00:02<00:00, 2021.69it/s]

Hit rate: 0.561
MRR: 0.368





In [None]:
## Q3. Vector search for question and answer
##We only used question in Q2. We can use both question and answer:

texts = []
for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

# Create a pipeline: TF-IDF vectorizer followed by dimensionality reduction (SVD)
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

# Create a VectorSearch index using the combined question+answer vectors and documents
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

# Define a search function for evaluation
def search_function(q):
    query_text = q['question'] + ' ' + q.get('text', '')
    query_vec = pipeline.transform([query_text])[0]
    return vindex.search(query_vec, filter_dict={'course': q['course']})

# Evaluate the vector search method and print the MRR
results = evaluate(ground_truth, search_function)
print(f"Hit rate: {results['hit_rate']:.3f}")
print(f"MRR: {results['mrr']:.3f}")


100%|██████████| 4627/4627 [00:02<00:00, 1611.45it/s]

Hit rate: 0.842
MRR: 0.625





In [None]:
## Q4. Qdrant

# Import Qdrant client and embedding model
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding

# Prepare texts: question + answer
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Use Jina embeddings model with fastembed
model_handle = "jinaai/jina-embeddings-v2-small-en"
embedder = TextEmbedding(model_name=model_handle)

# Generate embeddings for all texts
embeddings = list(embedder.embed(texts))

In [4]:
# Initialize Qdrant in-memory client
client = QdrantClient(':memory:')

# Create collection in Qdrant
client.create_collection(
    collection_name="docs",
    vectors_config=models.VectorParams(size=512, distance="Cosine")
)

True

In [5]:
# Upload documents and vectors to Qdrant
client.upload_collection(
    collection_name="docs",
    vectors=embeddings,
    payload=[doc for doc in documents],
    ids=None,  # Let Qdrant auto-assign IDs
    batch_size=64
)

In [7]:
# Define search function using Qdrant
limit = 5
def search_function(q):
    query_text = q['question'] + ' ' + q.get('text', '')
    query_vec = list(embedder.embed([query_text]))[0]
    hits = client.search(
        collection_name="docs",
        query_vector=query_vec,
        limit=limit,
        query_filter=models.Filter(must=[models.FieldCondition(key="course", match=models.MatchValue(value=q['course']))])
    )
    return [hit.payload for hit in hits]

# Evaluate and print results
results = evaluate(ground_truth, search_function)
print(f"Hit rate: {results['hit_rate']:.3f}")
print(f"MRR: {results['mrr']:.3f}")

  hits = client.search(
  hits = client.search(
100%|██████████| 4627/4627 [00:37<00:00, 123.89it/s]

Hit rate: 0.930
MRR: 0.852





In [None]:
## Q5. Cosine simiarity

# Download results from gpt-4o-mini evaluations
from io import StringIO
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
csv_content = requests.get(results_url, verify=False).text
df_results = pd.read_csv(StringIO(csv_content))

# Fit the pipeline on all text data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Let's fit the vectorizer on all the text data we have:
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

# Cosine similarity function
import numpy as np
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

# Calculate cosine similarity for each answer pair
cosines = []
for _, row in df_results.iterrows():
    v_llm = pipeline.transform([row['answer_llm']])[0]
    v_orig = pipeline.transform([row['answer_orig']])[0]
    cos = cosine(v_llm, v_orig)
    cosines.append(cos)

# Print the average cosine similarity
print(f"Average cosine similarity: {np.mean(cosines):.2f}")



Average cosine similarity: 0.84


In [14]:
## Q6. Rouge

from rouge import Rouge
rouge_scorer = Rouge()

rouge_1_f1_scores = []
for _, row in df_results.iterrows():
    scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    rouge_1_f1_scores.append(scores['rouge-1']['f'])

print(f"Average ROUGE-1 F1: {np.mean(rouge_1_f1_scores):.2f}")

Average ROUGE-1 F1: 0.35


In [13]:
df_results

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp
...,...,...,...,...,...
1825,Some suggested titles for listing the Machine ...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,What are some suggested titles for listing the...,machine-learning-zoomcamp
1826,It is best advised that you do not list the Ma...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Should I list the Machine Learning Zoomcamp ex...,machine-learning-zoomcamp
1827,You can incorporate your Machine Learning Zoom...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,In which LinkedIn sections can I incorporate m...,machine-learning-zoomcamp
1828,The advice on including a project link in a CV...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Who gave advice on including a project link in...,machine-learning-zoomcamp
