In [8]:
pip install -U minsearch qdrant_client

Note: you may need to restart the kernel to use updated packages.


In [9]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [10]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [11]:
import minsearch
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)
index.fit(documents) 

<minsearch.minsearch.Index at 0x70cf2d5d50d0>

In [12]:
def minsearch_search(query, course):
    boost = {'question': 1.5, 'section': 0.1}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results


In [13]:
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))


  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

In [14]:
from minsearch import VectorSearch

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [16]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [17]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x70cf14e078f0>

In [18]:
def vector_search(q):
    # Transform query to embedding
    q_vec = pipeline.transform([q['question']])
    
    # Search vectors with course filter
    results = vindex.search(q_vec, filter_dict={'course': q['course']}, num_results=5)
    return results


In [19]:
results = evaluate(ground_truth, vector_search)
print(results)


  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}


In [20]:
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

X = pipeline.fit_transform(texts)


In [22]:
from minsearch import VectorSearch

vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)


<minsearch.vector.VectorSearch at 0x70cf213b78f0>

In [23]:
def vector_search(q):
    q_vec = pipeline.transform([q['question']])
    results = vindex.search(q_vec, filter_dict={'course': q['course']}, num_results=5)
    return results


In [24]:
results = evaluate(ground_truth, vector_search)
print(results)


  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}


In [25]:
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]


In [26]:
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-small-en")
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en")

def embed_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # mean pooling
    return embeddings[0].numpy()


Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

In [27]:
import numpy as np
texts = [doc['question'] + " " + doc['text'] for doc in documents]

embeddings = []
for text in tqdm(texts, desc="Embedding docs"):
    emb = embed_text(text)
    embeddings.append(emb)
    
embeddings = np.array(embeddings)
embedding_dim = embeddings.shape[1]
print(f"Embedding dimension: {embedding_dim}")


Embedding docs:   0%|          | 0/948 [00:00<?, ?it/s]



Embedding dimension: 512


In [29]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, Filter, FieldCondition, MatchValue

client = QdrantClient(url="http://localhost:6333")

from qdrant_client.http.models import VectorParams

client.recreate_collection(
    collection_name="faq_collection",
    vectors_config=VectorParams(size=embedding_dim, distance="Cosine")
)



  client.recreate_collection(


True

In [30]:
points = [
    PointStruct(id=i, vector=embeddings[i].tolist(), payload=documents[i])
    for i in range(len(documents))
]

client.upsert(collection_name="faq_collection", points=points)


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [34]:
def qdrant_search(query, course):
    vec = embed_text(query)
    hits = client.search(
        collection_name="faq_collection",
        query_vector=vec,
        limit=5,
        with_payload=True,
        query_filter=Filter(
            must=[
                FieldCondition(
                    key="course",
                    match=MatchValue(value=course)
                )
            ]
        )
    )
    return [hit.payload for hit in hits]

In [35]:
def hit_rate(relevance_total):
    count = sum([True in line for line in relevance_total])
    return count / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank, val in enumerate(line, start=1):
            if val:
                total_score += 1 / rank
                break
    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []
    for q in tqdm(ground_truth, desc="Evaluating"):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }


In [37]:
results = evaluate(ground_truth, lambda q: qdrant_search(q['question'], q['course']))
print(results)


Evaluating:   0%|          | 0/4627 [00:00<?, ?it/s]

  hits = client.search(


{'hit_rate': 0.2100713205100497, 'mrr': 0.14752179237807092}


In [44]:
import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, PointStruct, Filter, FieldCondition, MatchValue

# Load tokenizer and model for embeddings
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-small-en")
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en")

def embed_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # mean pooling
    return embeddings[0].numpy()

def normalize(v):
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm

# Prepare combined texts for embeddings including 'section'
texts = [
    doc['question'] + ' ' + doc['text'] + ' ' + doc.get('section', '')
    for doc in documents
]

print("Embedding documents...")
embeddings = np.array([normalize(embed_text(text)) for text in tqdm(texts)])
embedding_dim = embeddings.shape[1]
print(f"Embedding dimension: {embedding_dim}")

# Connect to Qdrant client (adjust url if needed)
client = QdrantClient(url="http://localhost:6333")

# Recreate collection with correct vector config
client.recreate_collection(
    collection_name="faq_collection",
    vectors_config=VectorParams(size=embedding_dim, distance="Cosine")
)

# Upload points with minimal payload (id and course only)
points = [
    PointStruct(
        id=i,
        vector=embeddings[i].tolist(),
        payload={
            "id": documents[i]['id'],
            "course": documents[i]['course']
        }
    )
    for i in range(len(documents))
]
print("Uploading points to Qdrant...")
client.upsert(collection_name="faq_collection", points=points)

# Search function with increased limit to get more candidates
def qdrant_search(query, course):
    query_vec = normalize(embed_text(query))
    hits = client.search(
        collection_name="faq_collection",
        query_vector=query_vec.tolist(),
        limit=10,  # increase limit for better recall
        with_payload=True,
        query_filter=Filter(
            must=[
                FieldCondition(key="course", match=MatchValue(value=course))
            ]
        )
    )
    # Return top 5 for evaluation as before
    return [hit.payload for hit in hits[:5]]

# Define evaluation metrics
def hit_rate(relevance_total):
    return sum(True in line for line in relevance_total) / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank, val in enumerate(line, start=1):
            if val:
                total_score += 1 / rank
                break
    return total_score / len(relevance_total)

# Evaluate function that runs search_function on ground_truth queries
def evaluate(ground_truth, search_function):
    relevance_total = []
    for q in tqdm(ground_truth, desc="Evaluating"):
        doc_id = q['document']
        results = search_function(q['question'], q['course'])
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total)
    }

# Run evaluation and print results
results = evaluate(ground_truth, qdrant_search)
print("Evaluation results:", results)


Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

Embedding documents...


  0%|          | 0/948 [00:00<?, ?it/s]

Embedding dimension: 512
Uploading points to Qdrant...


  client.recreate_collection(


Evaluating:   0%|          | 0/4627 [00:00<?, ?it/s]

  hits = client.search(


Evaluation results: {'hit_rate': 0.17700453857791226, 'mrr': 0.10882141056119836}


In [40]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Define cosine similarity functions
def normalize(u):
    norm = np.sqrt(u.dot(u))
    return u / norm

def cosine(u, v):
    u = normalize(u)
    v = normalize(v)
    return u.dot(v)

# Load the results CSV
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

# Combine all text to fit the embedding pipeline
all_text = df_results['answer_llm'] + ' ' + df_results['answer_orig'] + ' ' + df_results['question']

# Create pipeline: TF-IDF + SVD to 128 dimensions
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Fit pipeline on combined text
pipeline.fit(all_text)

# Transform LLM answers and original answers into embeddings
emb_llm = pipeline.transform(df_results['answer_llm'])
emb_orig = pipeline.transform(df_results['answer_orig'])

# Compute cosine similarities for each pair
cosine_similarities = []
for i in range(len(df_results)):
    sim = cosine(emb_llm[i], emb_orig[i])
    cosine_similarities.append(sim)

# Calculate average cosine similarity
average_cosine = np.mean(cosine_similarities)
print(f"Average cosine similarity: {average_cosine:.2f}")


Average cosine similarity: 0.84


In [43]:
import pandas as pd
from rouge import Rouge
from tqdm.auto import tqdm

# Load the data (if not loaded already)
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

# Initialize the Rouge scorer
rouge_scorer = Rouge()

# List to store all ROUGE-1 F1 scores
rouge1_f1_scores = []

# Iterate over each pair of answers
for _, row in tqdm(df_results.iterrows(), total=len(df_results)):
    answer_llm = row['answer_llm']
    answer_orig = row['answer_orig']
    
    # Compute ROUGE scores between LLM answer and original answer
    scores = rouge_scorer.get_scores(answer_llm, answer_orig)[0]
    
    # Extract ROUGE-1 F1 score
    rouge1_f1 = scores['rouge-1']['f']
    rouge1_f1_scores.append(rouge1_f1)

# Calculate average ROUGE-1 F1 score
average_rouge1_f1 = sum(rouge1_f1_scores) / len(rouge1_f1_scores)
print(f"Average ROUGE-1 F1 score: {average_rouge1_f1:.2f}")


  0%|          | 0/1830 [00:00<?, ?it/s]

Average ROUGE-1 F1 score: 0.35


In [42]:
pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.
