# Install Required libraries

In [1]:
pip install -U minsearch qdrant_client  --trusted-host pypi.org --trusted-host files.pythonhosted.org

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install rouge --trusted-host pypi.org --trusted-host files.pythonhosted.org

Note: you may need to restart the kernel to use updated packages.


# Evaluation data

In [3]:
import requests
import pandas as pd

In [4]:
import os
os.environ["SSL_CERT_FILE"] = "Fortinet_CA_SSL(15).cer"


In [5]:
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [6]:
from tqdm.auto import tqdm

In [7]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [8]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [9]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

## Q1. Minsearch text

In [10]:
import minsearch

In [11]:
documents[50]

{'text': 'For those who wish to use the backslash as an escape character in Git Bash for Windows (as Alexey normally does), type in the terminal: bash.escapeChar=\\ (no need to include in .bashrc)',
 'section': 'Module 1: Docker and Terraform',
 'question': 'Git Bash - Backslash as an escape character in Git Bash for Windows',
 'course': 'data-engineering-zoomcamp',
 'id': '2f83dbe7'}

In [12]:
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course', 'id']
)

index.fit(documents)

<minsearch.minsearch.Index at 0x70cc6976bd10>

In [13]:
def minsearch_search(query, course):
    boost = {'question': 1.5, 'section': 0.1}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results 

In [14]:
minsearch_evaluate = evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

In [15]:
minsearch_evaluate['hit_rate']

0.848714069591528

# Embeddings

In [16]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [17]:
texts = []
for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [18]:
X

array([[ 0.20189188, -0.19028114, -0.10261914, ...,  0.03719206,
         0.02850986, -0.04641277],
       [ 0.2723704 , -0.33653397, -0.1445361 , ..., -0.0499137 ,
         0.01132394,  0.02318573],
       [ 0.25137243, -0.24366293, -0.11105337, ...,  0.0322307 ,
        -0.02414921, -0.02599206],
       ...,
       [ 0.21850466,  0.2859507 ,  0.13110213, ...,  0.03990522,
        -0.02636175,  0.0350963 ],
       [ 0.01265053,  0.01110092, -0.02217507, ..., -0.02871288,
        -0.01579063, -0.08238173],
       [ 0.19543413, -0.03891868,  0.2853495 , ...,  0.11603444,
         0.03531262, -0.04113139]], shape=(948, 128))

## Q2. Vector search for question

In [19]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x70cc63869460>

In [20]:
def vector_search(query, pipeline, vindex):
    results = vindex.search(
        query_vector=pipeline.transform([query["question"]]),
        filter_dict={'course': query['course']},
        num_results=5)
    return results


In [21]:
vector_search_evaluate = evaluate(ground_truth, lambda q: vector_search(q, pipeline, vindex))


  0%|          | 0/4627 [00:00<?, ?it/s]

In [22]:
vector_search_evaluate['mrr']

0.3572833369353793

## Q3. Vector search for question and answer

In [23]:
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

In [24]:
pipeline_q_a = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline_q_a.fit_transform(texts)

vindex_q_a = VectorSearch(keyword_fields={'course'})
vindex_q_a.fit(X, documents)

<minsearch.vector.VectorSearch at 0x70cc63f0a9c0>

In [25]:
vector_search_evaluate_q_a = evaluate(ground_truth, lambda q: vector_search(q, pipeline_q_a, vindex_q_a))


  0%|          | 0/4627 [00:00<?, ?it/s]

In [26]:
vector_search_evaluate_q_a

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

## Q4. Qdrant

In [27]:
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding


In [28]:
#connecting to local Qdrant instance
qd_client = QdrantClient(url="http://localhost:6333")

In [29]:
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [31]:
# Delete a connection 
qd_client.delete_collection(collection_name=collection_name)

True

In [32]:
collection_name = "evaluation"
#Create the collection

qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size = 512, 
        distance=models.Distance.COSINE 
    )
)



True

In [33]:
qd_client.create_payload_index(
    collection_name=collection_name, 
    field_name="course",
    field_schema="keyword" )

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [34]:
points = []

for i, doc in enumerate(documents):
    text=doc['question'] + '' + doc['text']  # for the index
    vector=models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)


In [35]:
qd_client.upsert(
    collection_name=collection_name,
    points=points
)


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [36]:
def qdrant_search(question, course, limit=5):

    course = course
    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document( 
            text=question,
            model=model_handle
        ),
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=limit, 
        with_payload=True 
    )
    
    results = []
    
    for point in query_points.points:
        results.append(point.payload)

    return results 

In [37]:
qdrant_eval = evaluate(ground_truth, lambda q: qdrant_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

In [38]:
qdrant_eval

{'hit_rate': 0.929327858223471, 'mrr': 0.8499963979540389}

## Q5. Cosine simiarity

In [39]:
import numpy as np

In [40]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [41]:
df_results

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp
...,...,...,...,...,...
1825,Some suggested titles for listing the Machine ...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,What are some suggested titles for listing the...,machine-learning-zoomcamp
1826,It is best advised that you do not list the Ma...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Should I list the Machine Learning Zoomcamp ex...,machine-learning-zoomcamp
1827,You can incorporate your Machine Learning Zoom...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,In which LinkedIn sections can I incorporate m...,machine-learning-zoomcamp
1828,The advice on including a project link in a CV...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Who gave advice on including a project link in...,machine-learning-zoomcamp


In [42]:
results = df_results.to_dict(orient='records')
results[0]

{'answer_llm': 'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).',
 'answer_orig': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'document': '0227b872',
 'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp'}

In [43]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [44]:
answer_gen_pipeline = pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)


In [45]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [46]:
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [47]:
def cos_sim():
    similarities = []
    
    for r in results:
        answer_org = r['answer_orig']
        answer_llm = r['answer_llm']
        v_org = answer_gen_pipeline.transform([answer_org])[0]
        v_llm = answer_gen_pipeline.transform([answer_llm])[0]
        sim = cosine(v_llm, v_org)
        similarities.append(sim)
        
    avg_smiliarity=np.mean(similarities)
    return avg_smiliarity


In [48]:
print("The average cosine similarity: ",cos_sim().round(2))

The average cosine similarity:  0.84


## Q6. Rouge

In [49]:
from rouge import Rouge

In [50]:
rouge_score = Rouge()

In [51]:
r = df_results.iloc[10]
scores = rouge_score.get_scores(r.answer_llm, r.answer_orig)

In [52]:
r

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
Name: 10, dtype: object

In [53]:
scores

[{'rouge-1': {'r': 0.45454545454545453,
   'p': 0.45454545454545453,
   'f': 0.45454544954545456},
  'rouge-2': {'r': 0.21621621621621623,
   'p': 0.21621621621621623,
   'f': 0.21621621121621637},
  'rouge-l': {'r': 0.3939393939393939,
   'p': 0.3939393939393939,
   'f': 0.393939388939394}}]

In [75]:
rouge_sim = []

for r in results:
    scores = rouge_score.get_scores(r['answer_llm'], r['answer_orig'])
    rouge_sim.append(scores[0]['rouge-1']['f'])
    
avg_rouge_smiliarity=np.mean(similarities)
print(avg_rouge_smiliarity.round(2))


np.float64(0.35)