In [None]:
%load_ext autoreload
%autoreload 2

import minsearch
import numpy as np
from minsearch import VectorSearch
from rouge import Rouge
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

from llmzmcp.data import (load_eval_documents, load_ground_truth_questions,
                          load_llm_eval_dataframes)
from llmzmcp.module3 import (evaluate_search, minsearch_query,
                             minsearch_vector_query, qdrant_vector_query)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# FAQ documents that every index in this notebook is built from.
documents = load_eval_documents()

# Ground-truth questions, converted to one dict per row (the loader
# presumably returns a DataFrame, given the `to_dict(orient="records")` call).
ground_truth_df = load_ground_truth_questions()
ground_truth = ground_truth_df.to_dict(orient="records")

# LLM evaluation data for the "gpt4o-mini" run; used by the similarity sections.
df_gpt4o_mini = load_llm_eval_dataframes("gpt4o-mini")

### Q1. Minsearch text

In [3]:
# Text index over the FAQ documents: three text fields, two keyword fields.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"],
)
index.fit(documents)


def _boosted_text_search(q):
    """Query the text index for one ground-truth record with field boosts."""
    return minsearch_query(
        index,
        q['question'],
        q['course'],
        boost={'question': 1.5, 'section': 0.1},
    )


ms_res = evaluate_search(ground_truth, _boosted_text_search)

print(f"Hitrate: {ms_res['hit_rate']}")


100%|██████████| 4627/4627 [00:04<00:00, 945.80it/s]


Hitrate: 0.85


### Q2. Vector search for question

In [4]:
# Corpus for this experiment: the question field of every document.
texts = [doc['question'] for doc in documents]

# TF-IDF features followed by a 128-component truncated SVD.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

# Attach a query vector to every ground-truth record. The records are dicts
# mutated in place, so nothing has to be written back into the list.
for gt_record in ground_truth:
    gt_record["vector_q"] = pipeline.transform([gt_record["question"]])

vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

msv_res = evaluate_search(
    ground_truth,
    lambda q: minsearch_vector_query(vindex, q['vector_q'], q['course']),
)
print(f"MRR: {msv_res['mrr']}")


100%|██████████| 4627/4627 [00:01<00:00, 2887.46it/s]

MRR: 0.357





### Q3. Vector search for question and answer

In [5]:
# Corpus for this experiment: question and answer text joined by a space.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Refit the existing `pipeline` object on the new corpus; anything embedded
# with its previous fit is stale from here on.
X = pipeline.fit_transform(texts)

# Re-embed the ground-truth questions with the refit pipeline, overwriting
# any "vector_q" already stored on the records (dicts are mutated in place).
for gt_record in ground_truth:
    gt_record["vector_q"] = pipeline.transform([gt_record["question"]])

vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

msv_res = evaluate_search(
    ground_truth,
    lambda q: minsearch_vector_query(vindex, q['vector_q'], q['course']),
)
print(f"Hitrate: {msv_res['hit_rate']}")


100%|██████████| 4627/4627 [00:01<00:00, 2917.71it/s]

Hitrate: 0.822





### Q4. Qdrant
Now let's evaluate the following settings in Qdrant:

- text = doc['question'] + ' ' + doc['text']
- model_handle = "jinaai/jina-embeddings-v2-small-en"
- limit = 5

What's the MRR?

In [6]:
# Qdrant collection to query and the embedding model used for the questions.
# NOTE(review): nothing in this notebook creates or fills the collection, so
# it presumably has to exist already — confirm before a fresh run.
collection_name = "zoomcamp-faq"
model_handle = "jinaai/jina-embeddings-v2-small-en"

# NOTE(review): the task text asks for limit = 5, but no limit is passed here;
# this relies on whatever default qdrant_vector_query uses — confirm.
qdrt_res = evaluate_search(
    ground_truth,
    lambda q: qdrant_vector_query(
        q['question'],
        collection_name,
        model_handle,
        q['course'],
    ),
)
print(f"MRR: {qdrt_res['mrr']}")

100%|██████████| 4627/4627 [00:47<00:00, 97.99it/s] 

MRR: 0.852





### Q5. Cosine similarity
Compare the answer generated by our system (`gpt-4o-mini`) with the actual answer from the FAQ using cosine similarity.

In [7]:
def cosine_similarity(u, v):
    """Return the cosine similarity between two 1-D vectors.

    Parameters
    ----------
    u, v : numpy.ndarray
        1-D vectors of equal length.

    Returns
    -------
    float
        ``u.dot(v) / (||u|| * ||v||)``. When either vector has zero norm the
        ratio is undefined (0/0 would yield NaN and a RuntimeWarning), so
        ``0.0`` is returned instead; this keeps a single empty embedding from
        turning an aggregate such as ``np.mean`` into NaN.
    """
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    denominator = u_norm * v_norm
    if denominator == 0:
        # At least one vector is all zeros: treat it as "no similarity".
        return 0.0
    return u.dot(v) / denominator


# Fit the vectorizer on generated answers, reference answers and questions
# joined row by row.
text = df_gpt4o_mini.answer_llm + ' ' + df_gpt4o_mini.answer_orig + ' ' + df_gpt4o_mini.question

# NOTE(review): this refits the shared `pipeline`; scikit-learn's `fit`
# is documented to return the estimator itself, so `rag_pipe` is presumably
# the same object as `pipeline` — confirm.
rag_pipe = pipeline.fit(text)

# One similarity per row: generated answer versus reference answer.
cos_sim_list = [
    cosine_similarity(
        rag_pipe.transform([row["answer_llm"]]).flatten(),
        rag_pipe.transform([row["answer_orig"]]).flatten(),
    )
    for _, row in df_gpt4o_mini.iterrows()
]

mean_cos_sim = np.around(np.mean(cos_sim_list), 2)
print(f"Mean cosine similarity: {mean_cos_sim}")

Mean cosine similarity: 0.84


### Q6. Rouge

An alternative way to measure how similar two texts are is ROUGE. <br>

This is a set of metrics that compares two answers based on the overlap of n-grams, word sequences, and word pairs. <br>

It can give a more nuanced view of text similarity than just cosine similarity alone. <br>

There are three scores: `rouge-1`, `rouge-2` and `rouge-l`, and _precision_, _recall_ and _F1 score_ for each.
- `rouge-1` - the overlap of unigrams,
- `rouge-2` - bigrams,
- `rouge-l` - the longest common subsequence

In [8]:
rouge_scorer = Rouge()

# Score the generated answers against the reference answers; the scorer is
# presumably pairing the two columns element by element — confirm.
scores = rouge_scorer.get_scores(
    df_gpt4o_mini.answer_llm,
    df_gpt4o_mini.answer_orig,
)

# NOTE: despite its name, `precision` collects the ROUGE-1 F1 values
# (the "f" entry of each score), not the precision.
precision = []
for score in scores:
    precision.append(score["rouge-1"]["f"])

mean_rouge1_f1 = np.around(np.mean(precision), 2)
print(f"Mean Rouge-1 F1: {mean_rouge1_f1}")

Mean Rouge-1 F1: 0.35
