## Homework: Search Evaluation. Author: Glen Lopez

### Evaluation data

In [3]:
import requests
import pandas as pd
from tqdm.auto import tqdm
import minsearch

In [4]:
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [38]:
### evaluating retrieval code

In [127]:
def hit_rate(relevance_total):
    """Calculate the Hit Rate: at least one relevant item in top results."""
    hits = sum(any(line) for line in relevance_total)
    return hits / len(relevance_total) if relevance_total else 0.0

In [40]:
def mrr(relevance_total):
    """Calculate the Mean Reciprocal Rank (MRR)."""
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total) if relevance_total else 0.0

In [41]:
def evaluate(ground_truth, search_function):
    """Evaluate a search function using Hit Rate and MRR metrics."""
    relevance_total = []

    for query in tqdm(ground_truth):
        target_doc_id = query['document']
        results = search_function(query)
        relevance = [doc['id'] == target_doc_id for doc in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

### Q1. Minsearch text

In [7]:
index_doc = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"])

In [8]:
index_doc.fit(documents)

<minsearch.minsearch.Index at 0x12a887350>

In [9]:
def ms_search(query, course):
    boost = {'question': 1.5, 'section': 0.1}
    results = index_doc.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )
    return results

In [10]:
evaluate(ground_truth, lambda query: ms_search(query=query['question'], course=query['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

### Embeddings

In [139]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [11]:
### Let's create embeddings for the "question" field:
texts = [doc['question'] for doc in documents]

In [140]:
def create_text_embedding_pipeline(texts, min_df=3, n_components=128, random_state=1):
    pipeline = make_pipeline(
        TfidfVectorizer(min_df=min_df),
        TruncatedSVD(n_components=n_components, random_state=random_state)
    )
    X = pipeline.fit_transform(texts)
    return pipeline, X

### Q2. Vector search for question

In [13]:
pipeline, X = create_text_embedding_pipeline(texts)
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x10453d880>

In [14]:
def ms_search_v(query, course):
    # Transform the query using the same TF-IDF + SVD pipeline
    query_vec = pipeline.transform([query])
    
    # Search in the vector index
    results = vindex.search(
        query_vec[0],              # vector for the query
        filter_dict={'course': course},
        num_results=5
    )
    return results

In [15]:
evaluate(ground_truth, lambda query: ms_search_v(query=query['question'], course=query['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48195374972984656, 'mrr': 0.3573085512571141}

### Q3. Vector search for question and answer

In [16]:
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]
pipeline, X = create_text_embedding_pipeline(texts)

In [17]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x12e4c5bb0>

In [18]:
evaluate(ground_truth, lambda query: ms_search_v(query=query['question'], course=query['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

### Q4. Qdrant

In [42]:
#from sentence_transformers import SentenceTransformer
import requests 
from qdrant_client import QdrantClient, models

In [113]:
qd_client = QdrantClient("http://localhost:6333")

In [114]:
model = 'jinaai/jina-embeddings-v2-small-en'
EMBEDDING_DIMENSIONALITY = 512

In [115]:
collection_name = 'course-llmxz'

In [116]:
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [117]:
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [74]:
#Check if the collections have elements
#collection_info = qd_client.get_collection(collection_name)
#print(collection_info.points_count)

948


In [118]:
points = []

for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

In [119]:
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [128]:
def evaluate_q(ground_truth, search_function):
    """Evaluate a search function using Hit Rate and MRR metrics."""
    relevance_total = []

    for query in tqdm(ground_truth):
        target_doc_id = query['document']
        results = search_function(query)
        relevance = [point.payload['id'] == target_doc_id for point in results.points]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }


In [121]:
def search(query, course):

    results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model=model
        ),
        limit=5, # top closest matches
        with_payload=True #to get metadata in the results
    )
    return results

In [129]:
evaluate_q(ground_truth, lambda query: search(query=query['question'], course=query['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9120380376053598, 'mrr': 0.8247784741733316}

### Q5. Cosine simiarity

In [131]:
# Function for calculate the cosine
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [5]:
### Now let's use this function to compute the A->Q->A cosine similarity.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [137]:
### Let's create embeddings for the "question" field:
texts_rs = df_results["question"].fillna("").tolist()

In [146]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [147]:
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [149]:
X_embeddings = pipeline.transform(texts_rs)

In [154]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(X_embeddings)


In [155]:
cosine_values = cosine_sim_matrix[np.triu_indices_from(cosine_sim_matrix, k=1)]

In [156]:
avg_cosine = np.mean(cosine_values)

In [162]:
round(avg_cosine, 3)

np.float64(0.084)

### Q6. Rouge

In [6]:
# Let's compute the ROUGE score between the answers at the index 10 of our dataframe (doc_id=5170565b)

In [7]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [8]:
# Let's compute it for the pairs in the entire dataframe. What's the average Rouge-1 F1?
scores = []

for _, row in tqdm(df_results.iterrows(), total=len(df_results)):
    r = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    scores.append(r['rouge-1']['f'])

average_rouge1_f1 = sum(scores) / len(scores)

  0%|          | 0/1830 [00:00<?, ?it/s]

In [9]:
average_rouge1_f1

0.3516946452113944