#### Part 3 - RAG Evaluation Homework

In [1]:
# 1. Import the evaluation data (same as lecture)
import requests
import pandas as pd

# Raw Q&A doc
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

In [2]:
# Ground Truth doc
# Generated by LLM 
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [3]:
ground_truth[:5]

[{'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I receive course announcements?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Where do I join the Slack channel?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'}]

### RAG Evaluation Methods:
1. Offline Method:
- Before Implementation of the System 
- Part 1: Retrival Evaluation: Hit Rate (Recall), Mean Recripical Rank (MRR) 
- Part 2: LLM Answer Evaluation: Cosine Similarity, LLM as Judge  

- Link to full retrival evaluation metrics: https://github.com/DataTalksClub/llm-zoomcamp/blob/main/03-evaluation/search_evaluation/evaluation-metrics.md

2. Online Method:
- A/B Testing
- User Feedback
- Real time usage data evaluation

In [4]:
# 2. Evaluation on Retrival System
from tqdm.auto import tqdm

# 1) Recall 
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

# 2) Mean Recripical Ranking (MRR)
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


# Combined Evaluation Function 
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results] # List of whether the retrived doc is the truth doc for each testing query
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

  from .autonotebook import tqdm as notebook_tqdm


##### Q1: Retrival Evaluation: MinSearch

In [5]:
### Q1. Minsearch text
import pandas as pd 
import numpy as np
import openai
from openai import OpenAI
import os

import minsearch

import io
import requests
import json 


# Index the doc
index = minsearch.Index(text_fields=['question','text','section'],
                keyword_fields=['course','id'])
index.fit(documents)

<minsearch.minsearch.Index at 0x739034996c30>

In [6]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [7]:
ground_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [6]:
# Perform the retrival and evaluation 

# Retrive the most relavent doc for the query 
def search(query, index, course):

    boost = {'question': 1.5, 'section': 0.1}  # Specify the relative importance of the search on topics

    results = index.search(
        query=query,
        filter_dict = {'course':course},
        boost_dict=boost,
        num_results=5
    )

    # Put all the retrived document text into a signle content
    #context = ""
    #for ans in results:
    #    context += ans['text'] + "\n"

    return results

In [7]:
result = search(query = 'When does the course begin?', index = index, course = 'data-engineering-zoomcamp')
result

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'quest

In [10]:
# Result Evaluation - Hit Rate (Recall)
relevance_total = []

for q in tqdm(ground_truth):

    doc_id = q['document']
    results = search(query=q['question'], index = index, course=q['course'])
    relevance = [d['id'] == doc_id for d in results] # if the retrived doc is the truth doc 
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

100%|██████████| 4627/4627 [00:22<00:00, 207.09it/s]


In [11]:
# Hit Rate
hit_rate = hit_rate(relevance_total)
print(hit_rate)

0.848714069591528


##### Q2: Test the performance using Vector Search in MinSearch
TF-IDF and Singular Value Decomposition to create embeddings from texts.

In [8]:
# Test the performance using Vector Search in MinSearch
from minsearch import VectorSearch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

from sentence_transformers import SentenceTransformer

In [10]:
# Create embeddings for the "question" field:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

# Embed the question and answer text 
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)


def minsearch_vector_search(vector, course):
    return vindex.search(
        vector,
        filter_dict={'course': course},
        num_results=5
    )

In [42]:
# Convert the Questions in the Ground Truth data into vectors and perform the evaluation
q = [q['question'] for q in ground_truth]
q_vectors = pipeline.fit_transform(q)

relevance_total = []

for qs in tqdm(range(len(q_vectors))):

    doc_id = ground_truth[qs]['document']
    q_vector = q_vectors[qs]

    results = minsearch_vector_search(vector=q_vector, course = 'data-engineering-zoomcamp')
    relevance = [d['id'] == doc_id for d in results] # if the retrived doc is the truth doc 
    relevance_total.append(relevance)

100%|██████████| 4627/4627 [00:03<00:00, 1233.99it/s]


In [38]:
minsearch_vector_search(vector=q_vectors[0], course = 'data-engineering-zoomcamp')

  'section': 'Module 4: analytics engineering with dbt',
  'question': 'Setup - Failed to clone repository.',
  'course': 'data-engineering-zoomcamp',
  'id': '9c85f3aa'},
 {'text': 'If you encounter data type error on trip_type column, it may due to some nan values that isn’t null in bigquery.\nSolution: try casting it to FLOAT datatype instead of NUMERIC',
  'section': 'Module 4: analytics engineering with dbt',
  'question': 'Data Type Error when running fact table',
  'course': 'data-engineering-zoomcamp',
  'id': '46aebc79'},
 {'text': 'Problem: when injecting data to bigquery, you may face the type error. This is because pandas by default will parse integer columns with missing value as float type.\nSolution:\nOne way to solve this problem is to specify/ cast data type Int64 during the data transformation stage.\nHowever, you may be lazy to type all the int columns. If that is the case, you can simply use convert_dtypes to infer the data type\n# Make pandas to infer correct data 

In [34]:
print(mrr(relevance_total))

0.001300338592320438


#### Q3: Vector search for question and answer

In [35]:
# Use both Question and Text 
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

X = pipeline.fit_transform(texts)

vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x73900c88ed80>

In [36]:
relevance_total = []

for q in tqdm(range(len(q_vectors))):

    doc_id = ground_truth[q]['document']
    q_vector = q_vectors[q]

    results = minsearch_vector_search(vector=q_vector, course = 'data-engineering-zoomcamp')
    relevance = [d['id'] == doc_id for d in results] # if the retrived doc is the truth doc 
    relevance_total.append(relevance)

100%|██████████| 4627/4627 [00:03<00:00, 1198.87it/s]
