In [2]:
# Uncomment the next line to install/upgrade the search dependencies:
#! pip install -U minsearch qdrant_client
# Print the installed minsearch and qdrant-client versions for reproducibility.
!pip  list | grep minsearch
!pip list | grep qdrant

minsearch                 0.0.4
qdrant-client             1.15.0


In [35]:
import requests

import json
import pprint

import numpy as np
import pandas as pd

from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models

In [36]:

# Base location of the evaluation assets used throughout this notebook.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Documents to be indexed (each record carries an 'id' used for evaluation).
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
# A timeout keeps the cell from hanging forever; raise_for_status turns an HTTP
# error (404, 5xx, ...) into an explicit exception instead of a JSON decode error.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth records: converted to a list of dicts, one per CSV row.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [4]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: sequence of per-query relevance lists; each inner
            list holds one boolean per returned result.

    Returns:
        Number of inner lists containing ``True`` divided by the number of
        queries, or ``0.0`` when there are no queries at all (instead of
        raising ``ZeroDivisionError``).
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the FIRST relevant result contributes ``1 / rank``
    (ranks start at 1); queries without a relevant result contribute 0.

    Args:
        relevance_total: sequence of per-query relevance lists; each inner
            list holds one boolean per returned result, in rank order.

    Returns:
        The average reciprocal rank, or ``0.0`` when there are no queries
        (instead of raising ``ZeroDivisionError``).
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality (not truthiness) is kept on purpose to match how the
            # relevance flags have always been interpreted here.
            if is_relevant == True:  # noqa: E712
                total_score += 1 / rank
                # Stop at the first hit: without this, a list with several
                # True entries would add multiple ranks and could exceed 1.
                break

    return total_score / total

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: iterable of records; each must have a 'document' key
            holding the id of the expected document.
        search_function: callable that receives one record and returns an
            iterable of result dicts, each with an 'id' key.

    Returns:
        Dict with the 'hit_rate' and 'mrr' computed over all records.
    """
    def _relevance_for(record):
        # One boolean per returned result: does it match the expected document?
        expected_id = record['document']
        hits = search_function(record)
        return [hit['id'] == expected_id for hit in hits]

    relevance_total = [_relevance_for(record) for record in tqdm(ground_truth)]

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

In [31]:
import minsearch

# Keyword (text) index over the documents: the three text fields are searched,
# while 'course' and 'id' are keyword fields (the searches below filter on 'course').
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

def minsearch_search(query,
                     course,
                     index,
                     boost={'question': 3.0, 'section': 0.5},
                     num_results=5):
    """Search ``index`` for ``query``, restricted to a single course.

    Args:
        query: whatever ``index.search`` expects as its first argument
            (text for the keyword index, an embedding for the vector index).
        course: value the 'course' field must match.
        index: object exposing ``search(query, filter_dict=..., num_results=...)``
            and, when ``boost`` is given, also ``boost_dict=...``.
        boost: per-field boost weights, or ``None`` to omit ``boost_dict``
            entirely (needed for indexes that do not accept it).
        num_results: maximum number of results to request (default 5).

    Returns:
        Whatever ``index.search`` returns.
    """
    search_kwargs = {
        'filter_dict': {'course': course},
        'num_results': num_results,
    }

    if boost is not None:
        # Pass a copy so the index can never mutate the shared default dict.
        search_kwargs['boost_dict'] = dict(boost)

    return index.search(query, **search_kwargs)
    

In [None]:
# ---------------------------------------------------------------------------
# Q1: keyword search over the minsearch text index with custom boosts.
evaluate(
    ground_truth,
    lambda q: minsearch_search(
        q['question'],
        q['course'],
        index,
        boost={'question': 1.5, 'section': 0.1},
    ),
)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

In [9]:
from minsearch import VectorSearch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline


In [10]:
# Embed only the question text of every document.
texts = [doc['question'] for doc in documents]

# TF-IDF features reduced to 128 dimensions with a seeded truncated SVD.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [12]:
# Sanity check: one row per document, one column per SVD component.
X.shape

(948, 128)

In [13]:
# Vector index over the embeddings in X, paired with the source documents;
# 'course' is registered as a keyword field (used as a filter when searching).
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x796961b56450>

In [None]:
# ---------------------------------------------------------------------------
# Q2: vector search; each query question is embedded with the fitted pipeline
# (transform returns a 2-D array, so row 0 is the query vector).
evaluate(
    ground_truth,
    lambda q: minsearch_search(
        pipeline.transform([q['question']])[0],
        q['course'],
        vindex,
        boost=None,
    ),
)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3571284489590088}

In [None]:
# This time embed the question together with the answer text.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Same TF-IDF + truncated SVD setup, refitted on the combined text.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

# Rebuild the vector index on the new embeddings.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

# ---------------------------------------------------------------------------
# Q3: vector search against the question + text embeddings.
evaluate(
    ground_truth,
    lambda q: minsearch_search(
        pipeline.transform([q['question']])[0],
        q['course'],
        vindex,
        boost=None,
    ),
)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}

In [48]:
# The client below expects a Qdrant server listening on localhost:6333.
# One way to start it locally with Docker:
# docker pull qdrant/qdrant
# docker run --rm -it --network host \
#    -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
#    qdrant/qdrant
client = QdrantClient("http://localhost:6333")

In [49]:
# Embedding model identifier; reused below both for this fastembed model and
# for the Qdrant document/query embeddings.
model_handle = "jinaai/jina-embeddings-v2-small-en"
model = TextEmbedding(model_name=model_handle)


In [50]:
# Embed a throwaway string and read the vector length to get the embedding size.
embed_size = next(iter(model.embed('xxxxxx'))).shape[0]

In [64]:


# Define the collection name
collection_name = "zoomcamp2"

# Create the collection only when it is missing, so re-running this cell does
# not fail because the collection already exists.
# NOTE(review): an existing collection is reused as-is; delete it first if its
# vector size or distance metric should change.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=embed_size,  # Dimensionality of the vectors
            distance=models.Distance.COSINE  # Distance metric for similarity search
        )
    )

# One point per document, numbered 0..N-1 (enumerate avoids a global counter
# named `id`, which would shadow the builtin for the rest of the session).
points = [
    models.PointStruct(
        id=point_id,
        vector=models.Document(text=doc['question'] + ' ' + doc['text'],
                               model=model_handle),
        payload={
            "text": doc['text'],
            "question": doc['question'],
            "section": doc['section'],
            "course": doc['course'],
            "id": doc['id']
        }  # save all needed metadata fields
    )
    for point_id, doc in enumerate(documents)
]

# Upsert is keyed by point id, so repeating it overwrites rather than duplicates.
client.upsert(
    collection_name=collection_name,
    points=points
)


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [72]:
from qdrant_client.models import Filter, FieldCondition, Range, MatchValue

def qdrant_search(query,
                  course,
                  collection_name,
                  limit=1):
    """Query a Qdrant collection and return the payloads of the matches.

    Args:
        query: query text; wrapped in a ``models.Document`` that names
            ``model_handle`` as the embedding model.
        course: value the 'course' payload field must match.
        collection_name: name of the collection to query.
        limit: maximum number of closest matches to return (default 1).

    Returns:
        List with the payload of each returned point, in result order.
    """
    # The query text is embedded using the model named by model_handle.
    query_document = models.Document(text=query, model=model_handle)

    # Required condition: only points from the requested course.
    course_condition = FieldCondition(key='course', match=MatchValue(value=course))
    course_filter = Filter(must=[course_condition])

    response = client.query_points(
        collection_name=collection_name,
        query=query_document,
        query_filter=course_filter,
        limit=limit,
        with_payload=True,   # metadata is what callers consume
        with_vectors=False,
    )

    payloads = []
    for point in response.points:
        payloads.append(point.payload)
    return payloads
# Quick manual check of qdrant_search with a sample question.
results = qdrant_search(
    'Can I join the course',
    'data-engineering-zoomcamp',
    collection_name=collection_name,
    limit=5,
)

In [None]:
# ---------------------------------------------------------------------------
# Q4: vector search through Qdrant, top 5 results per query.
evaluate(
    ground_truth,
    lambda q: qdrant_search(
        q['question'],
        q['course'],
        collection_name=collection_name,
        limit=5,
    ),
)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}

In [75]:
# Load the RAG evaluation results CSV (LLM answers alongside the original answers).
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [76]:
# Display the loaded results (pandas truncates the rendered table).
df_results

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp
...,...,...,...,...,...
1825,Some suggested titles for listing the Machine ...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,What are some suggested titles for listing the...,machine-learning-zoomcamp
1826,It is best advised that you do not list the Ma...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Should I list the Machine Learning Zoomcamp ex...,machine-learning-zoomcamp
1827,You can incorporate your Machine Learning Zoom...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,In which LinkedIn sections can I incorporate m...,machine-learning-zoomcamp
1828,The advice on including a project link in a CV...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Who gave advice on including a project link in...,machine-learning-zoomcamp


In [78]:
# Refit the TF-IDF + SVD pipeline (this rebinds `pipeline`) on the concatenation
# of LLM answer, original answer and question for every row.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)


0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [79]:
# Embed both answer columns with the fitted pipeline; rows stay aligned, so
# row i of each matrix belongs to the same question.
llm_embed=pipeline.transform(df_results.answer_llm)
orig_embed=pipeline.transform(df_results.answer_orig)

In [None]:
def cosine(u, v):
    """Cosine similarity between corresponding vectors of ``u`` and ``v``.

    The reduction runs over the last axis, so both a pair of 2-D matrices
    (row-wise similarity, one value per row) and a pair of 1-D vectors
    (a single scalar) are supported. Array-likes such as lists are accepted.

    Args:
        u: array-like of vectors.
        v: array-like with a shape compatible with ``u``.

    Returns:
        Array of similarities (or a scalar for 1-D input). A zero-norm
        vector yields ``nan`` because of the division by zero.
    """
    u = np.asarray(u)
    v = np.asarray(v)
    # axis=-1 equals the previous axis=1 for 2-D input and also handles 1-D.
    u_norm = np.sqrt((u * u).sum(axis=-1))
    v_norm = np.sqrt((v * v).sum(axis=-1))
    return (u * v).sum(axis=-1) / (u_norm * v_norm)

#################################################################################################
# Q5: mean row-wise cosine similarity between the LLM and original answer embeddings
cosine(llm_embed,orig_embed).mean()

np.float64(0.8415841233490402)

In [101]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was executed with for reproducibility.
%pip install rouge==1.0.1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
[0m

In [102]:
from rouge import Rouge
# Scorer instance reused by the cells below.
rouge_scorer = Rouge()

# Try the scorer on a single row; get_scores returns a list, so take element 0.
r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [103]:
# ROUGE-1 F-score of the LLM answer against the original answer, for every row.
# (`scores` is rebound here from the single-row dict to a list of floats.)
scores = [
    rouge_scorer.get_scores(row.answer_llm, row.answer_orig)[0]['rouge-1']['f']
    for row in tqdm(df_results.itertuples(index=False), total=len(df_results))
]
    

  0%|          | 0/1830 [00:00<?, ?it/s]

In [None]:
#################################################################################################
# Q6: average ROUGE-1 F-score across all rows
np.mean(scores)

np.float64(0.3516946452113943)