# 1 - offline-rag-evaluation
   - Cosine Similarity Metric
   - LLM as a judge

In [1]:
from tqdm.auto import tqdm
import json
import pandas as pd
from sentence_transformers import SentenceTransformer

from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
import os
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

from concurrent.futures import ThreadPoolExecutor
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sb

  from .autonotebook import tqdm as notebook_tqdm


# Load documents with IDs

In [2]:
# Load the FAQ documents; every entry carries an 'id' used for lookups later on.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default text encoding (which is not UTF-8 everywhere and would
# garble the non-ASCII characters present in the answers).
with open('../3 - Vector_DB/documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

# Load ground truth

In [3]:
# Read the ground-truth CSV and convert it to a list of dicts, one per row.
# Later cells read the 'question', 'course' and 'document' keys of each record.
df_ground_truth = pd.read_csv('../3 - Vector_DB/ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [4]:
# Lookup table: document id -> full document, used to fetch the original answer text.
doc_idx = {d['id']: d for d in documents}

# Index data

In [5]:
# Sentence-embedding model used to embed documents, queries and answers.
# NOTE(review): the index mapping declares 384-dimensional vectors; presumably
# that matches this model's output size — confirm if the model is changed.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [6]:
# Index definition: plain text/keyword fields for the FAQ entry plus one indexed
# dense vector field that is compared with cosine similarity.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword type: 'course' is used as an exact-match term filter in the search.
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Embedding of question + answer text, filled in by the indexing cell.
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

# Drop any existing index first so the cell can be re-run from scratch;
# ignore_unavailable avoids an error when the index does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

# Add documents into index

In [7]:
# Embed every FAQ entry (question and answer text joined by a single space),
# attach the vector to the document itself and store the document in the index.
for doc in tqdm(documents):
    combined_text = doc['question'] + ' ' + doc['text']
    doc['question_text_vector'] = model.encode(combined_text)
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:58<00:00, 16.25it/s]


# Best retrieval (search engine)

In [8]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a course-filtered kNN search against the Elasticsearch index.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query embedding compared against ``field``.
        course: Course identifier; only documents whose ``course`` term matches
            are considered.
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that used to be hard-coded.
        num_candidates: Value passed as the kNN ``num_candidates`` option.
            Defaults to 10000, the value that used to be hard-coded.

    Returns:
        list[dict]: The ``_source`` of every hit, in the order Elasticsearch
        returned them, limited to text/section/question/course/id.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        # Leave the stored vector out of the response; only the readable fields are needed.
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

def question_text_vector_knn(q):
    """Embed the record's question and search the 'question_text_vector' field.

    ``q`` is a mapping providing the 'question' text and the 'course' used as
    a filter. Returns whatever ``elastic_search_knn`` returns.
    """
    question_text, course_name = q['question'], q['course']
    query_embedding = model.encode(question_text)
    return elastic_search_knn('question_text_vector', query_embedding, course_name)

In [9]:
question_text_vector_knn(dict(
    question='Are sessions recorded if I miss one?',
    course='machine-learning-zoomcamp'
))

[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '5170565b'},
 {'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'id': '39fda9f0'},
 {'question': 'The same accuracy on epochs',
  'course': 'machine-learning-zoomcamp',
  'section': '8. Neural Networks an

# The RAG flow

In [10]:
def build_prompt(query, search_results):
    """Build the RAG prompt from a question and the retrieved FAQ entries.

    Args:
        query: The question text inserted after ``QUESTION:``.
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys; each one becomes a block of the CONTEXT.

    Returns:
        str: The filled-in prompt with surrounding whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per retrieved entry, each followed by a blank line.
    context = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    return template.format(question=query, context=context).strip()

In [11]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text sent as the content of the only message.
        model: Name of the chat model to call.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [12]:
def rag(query: dict, model='gpt-4o') -> str:
    """Answer a ground-truth record: retrieve, build the prompt, ask the LLM.

    ``query`` is the record passed to the retrieval step; its 'question' value
    is the text placed in the prompt. ``model`` is forwarded to ``llm``.
    """
    retrieved_docs = question_text_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved_docs), model=model)

In [13]:
answer_exemple = rag(ground_truth[10])
print(answer_exemple)

Yes, you can enroll in the course after it starts. Even if you haven't registered, you're still eligible to submit the homeworks. However, keep in mind that there will be deadlines for turning in the final projects, so it's important not to delay until the last minute.


# Cosine similarity metric

<h3>Evaluating GPT 4o</h3>

In [18]:
# Single-record check: compare the answer generated for record 10 with the
# original FAQ answer of the document that record points to.
answer_llm = answer_exemple
answer_orig = doc_idx[ground_truth[10]["document"]]['text']
v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)
# Dot product of the two embeddings, used here as the similarity score.
# NOTE(review): this equals cosine similarity only if the model emits
# unit-length vectors — confirm for the chosen model.
v_llm.dot(v_orig)

0.7702877

In [None]:
# Generate a GPT-4o answer for every ground-truth question, keyed by position.
#
# Keep an `answers` dict left over from an interrupted run so the
# `i in answers` guard below can skip records that are already done.
# Re-creating the dict unconditionally made that guard unreachable and threw
# away all paid-for answers on every re-run. On a fresh kernel the name does
# not exist yet, so we start empty. Run `del answers` to force regeneration.
try:
    answers
except NameError:
    answers = {}

for i, rec in enumerate(tqdm(ground_truth)):
    if i in answers:
        continue

    answer_llm = rag(rec)
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['text']

    answers[i] = {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],
    }

In [None]:
# Line the generated answers up with the ground-truth order; positions without
# a generated answer stay None.
results_gpt4o = [None] * len(ground_truth)

for idx, generated in answers.items():
    # Ground-truth fields are applied last, so they win on shared keys.
    results_gpt4o[idx] = {**generated, **ground_truth[idx]}

In [None]:
# Persist the GPT-4o answers so the expensive generation does not have to be repeated.
df_gpt4o = pd.DataFrame(results_gpt4o)
df_gpt4o.to_csv('results-gpt4o.csv', index=False)

<h3> Evaluating GPT 3.5</h3>

In [None]:
rag(ground_truth[10], model='gpt-3.5-turbo')

In [None]:
# Shared worker pool used to run the RAG calls concurrently.
pool = ThreadPoolExecutor(max_workers=6)

def map_progress(pool, seq, f):
    """Apply ``f`` to each element of ``seq`` on ``pool`` with a progress bar.

    Args:
        pool: Executor providing ``submit``.
        seq: Sized sequence of inputs (its length sets the progress total).
        f: Callable invoked with one element of ``seq``.

    Returns:
        list: Results in the order of ``seq`` (submission order, not
        completion order). Collecting a result re-raises any exception
        raised by ``f`` for that element.
    """
    with tqdm(total=len(seq)) as progress:

        def _tick(_finished):
            # Advance the bar by one each time a task completes.
            progress.update()

        pending = []
        for item in seq:
            task = pool.submit(f, item)
            task.add_done_callback(_tick)
            pending.append(task)

        return [t.result() for t in pending]

In [None]:
def process_record(rec, model='gpt-3.5-turbo'):
    """Generate an answer for one ground-truth record and pair it with the original.

    Args:
        rec: Ground-truth record with 'question', 'course' and 'document' keys.
        model: Chat model forwarded to ``rag``. Defaults to 'gpt-3.5-turbo',
            the value that used to be hard-coded, so single-argument callers
            (e.g. ``map_progress``) behave exactly as before.

    Returns:
        dict: 'answer_llm', 'answer_orig' (text of the referenced document),
        'document', 'question' and 'course'.
    """
    answer_llm = rag(rec, model=model)

    doc_id = rec['document']
    answer_orig = doc_idx[doc_id]['text']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],
    }

In [None]:
process_record(ground_truth[10])

In [None]:
results_gpt35 = map_progress(pool, ground_truth, process_record)

In [None]:
# Persist the GPT-3.5 answers. The file name follows the same 'results-<model>.csv'
# pattern as the other result files (the previous 'dresults-' prefix was a typo).
df_gpt35 = pd.DataFrame(results_gpt35)
df_gpt35.to_csv('results-gpt35.csv', index=False)

<h3>Cosine similarity quality: gpt35 vs gpt4o</h3>
      - A orig-> Q -> A llm
      <br>- cosine(A orig, A llm)

<h4>gpt4o</h4>

In [None]:
results_gpt4o = df_gpt4o.to_dict(orient='records')

In [19]:
def compute_similarity(record):
    """Return the dot product between the generated and original answer embeddings.

    ``record`` must provide 'answer_orig' and 'answer_llm' texts; both are
    embedded with the notebook's sentence-embedding model.
    """
    original_text, generated_text = record['answer_orig'], record['answer_llm']

    generated_vec = model.encode(generated_text)
    original_vec = model.encode(original_text)

    return generated_vec.dot(original_vec)

In [None]:
# Similarity score for every GPT-4o result, in the same order as results_gpt4o.
similarity = [compute_similarity(row) for row in tqdm(results_gpt4o)]

In [None]:
# Attach the scores to the GPT-4o results and summarise their distribution.
df_gpt4o['cosine'] = similarity
df_gpt4o['cosine'].describe()

<h4>gpt35</h4>

In [None]:
# Rebuild the record list from the DataFrame, then score every GPT-3.5 result.
results_gpt35 = df_gpt35.to_dict(orient='records')

similarity_35 = [compute_similarity(row) for row in tqdm(results_gpt35)]

In [None]:
# Attach the scores to the GPT-3.5 results and summarise their distribution.
df_gpt35['cosine'] = similarity_35
df_gpt35['cosine'].describe()

<h4>Compare</h4>

In [None]:
# Overlay the two score distributions on one set of axes for a direct comparison.
fig, ax = plt.subplots()
sb.histplot(df_gpt4o['cosine'], label='4o', ax=ax)
sb.histplot(df_gpt35['cosine'], label='35', ax=ax)

ax.set_title("RAG LLM performance")
ax.set_xlabel("A->Q->A' Cosine Similarity")
ax.legend()

<h3> Evaluating GPT 4o mini</h3>

In [14]:
def process_record_4o_mini(rec, model='gpt-4o-mini'):
    """Generate an answer for one ground-truth record and pair it with the original.

    Args:
        rec: Ground-truth record with 'question', 'course' and 'document' keys.
        model: Chat model forwarded to ``rag``. Defaults to 'gpt-4o-mini', the
            value that used to be hard-coded, so existing single-argument
            calls behave exactly as before.

    Returns:
        dict: 'answer_llm', 'answer_orig' (text of the referenced document),
        'document', 'question' and 'course'.
    """
    answer_llm = rag(rec, model=model)

    doc_id = rec['document']
    answer_orig = doc_idx[doc_id]['text']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],
    }

In [15]:
process_record_4o_mini(ground_truth[10])

{'answer_llm': "Yes, you can enroll in the course after it starts. Even if you don't register, you are still eligible to submit the homeworks. However, be aware that there will be deadlines for turning in the final projects, so it's best not to leave everything for the last minute.",
 'answer_orig': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'document': '7842b56a',
 'question': 'Can I enroll in the course after it starts?',
 'course': 'data-engineering-zoomcamp'}

In [16]:
# Answer every ground-truth question with GPT-4o mini, one record after another.
# The explicit append loop keeps the results gathered so far in
# `results_gpt4omini` if the run is interrupted part-way.
results_gpt4omini = []
for record in tqdm(ground_truth):
    result = process_record_4o_mini(record)
    results_gpt4omini.append(result)

100%|██████████| 4627/4627 [2:15:48<00:00,  1.76s/it]  


In [38]:
# Persist the GPT-4o mini answers.
# NOTE: this file is written with ';' as separator, unlike the other result
# files in this notebook; pass sep=";" when reading it back.
df_gpt4o_mini = pd.DataFrame(results_gpt4omini)
df_gpt4o_mini.to_csv('results-gpt4o-mini.csv', index=False, sep=";")

In [20]:
# Similarity score for every GPT-4o mini result, in the same order as results_gpt4omini.
similarity_4o_mini = [compute_similarity(row) for row in tqdm(results_gpt4omini)]

100%|██████████| 4627/4627 [05:10<00:00, 14.88it/s]


In [21]:
# Attach the scores to the GPT-4o mini results and summarise their distribution.
df_gpt4o_mini['cosine'] = similarity_4o_mini
df_gpt4o_mini['cosine'].describe()

count    4627.000000
mean        0.687921
std         0.210011
min        -0.142060
25%         0.591705
50%         0.738753
75%         0.841162
max         0.987164
Name: cosine, dtype: float64

# LLM-as-a-Judge
Ici, on demande à un LLM de jouer le rôle de juge, c'est-à-dire de comparer les éléments qu'on lui fournit et d'évaluer, par exemple, la similarité entre la réponse originale et la réponse obtenue avec notre RAG-LLM.

<h3>Préparation du prompt de comparaison envoyé au LLM</h3>

In [None]:
# Judge prompt 1 (original answer -> question -> LLM answer): the judge rates the
# generated answer against the original answer.
# Placeholders filled via str.format: answer_orig, question, answer_llm.
# The doubled braces {{ }} are format escapes that render as literal JSON braces.
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

# Judge prompt 2 (question -> LLM answer): the judge rates the generated answer
# against the question only; the original answer is not shown.
# Placeholders filled via str.format: question, answer_llm.
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [23]:
# Work on a sample rather than the full data set: 150 rows, with a fixed
# random_state so the same rows are drawn on every run.
df_sample = df_gpt4o_mini.sample(n=150, random_state=1)
samples = df_sample.to_dict(orient='records')

<h5>un petit exemple</h5>

In [24]:
record = samples[0]
record

{'answer_llm': 'When appending data to a parquet file, the recommended compression to use is "gzip" with the fastparquet engine.',
 'answer_orig': 'pd.read_csv\ndf_iter = pd.read_csv(dataset_url, iterator=True, chunksize=100000)\nThe data needs to be appended to the parquet file using the fastparquet engine\ndf.to_parquet(path, compression="gzip", engine=\'fastparquet\', append=True)',
 'document': '8ab78bee',
 'question': 'Which compression should be used when appending data to a parquet file?',
 'course': 'data-engineering-zoomcamp',
 'cosine': 0.6418646574020386}

In [25]:
prompt = prompt1_template.format(**record)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: pd.read_csv
df_iter = pd.read_csv(dataset_url, iterator=True, chunksize=100000)
The data needs to be appended to the parquet file using the fastparquet engine
df.to_parquet(path, compression="gzip", engine='fastparquet', append=True)
Generated Question: Which compression should be used when appending data to a parquet file?
Generated Answer: When appending data to a parquet file, the recommended compression to use is "gzip" with the fastparquet engine.

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON with

In [26]:
answer = llm(prompt, model='gpt-4o-mini')
print(answer)

{
  "Relevance": "RELEVANT",
  "Explanation": "The generated answer accurately addresses the question regarding which compression should be used when appending data to a parquet file. It specifies 'gzip' and references the use of the fastparquet engine, both of which are mentioned in the original answer."
}


<h3>Judge sample 150</h3>

<h5>prompt 1</h5>

In [27]:
# Ask the judge model to rate every sampled record with prompt 1; the raw
# response strings are collected as-is and parsed in a later cell.
# NOTE: the loop variable `record` stays bound to the last sample after the
# loop, and a later preview cell reads it.
evaluations = []

for record in tqdm(samples):
    prompt = prompt1_template.format(**record)
    evaluation = llm(prompt, model='gpt-4o-mini')
    evaluations.append(evaluation)

100%|██████████| 150/150 [04:41<00:00,  1.88s/it]


In [28]:
# Parse each raw judge response as JSON; a response that is not valid JSON
# raises json.JSONDecodeError.
json_evaluations = [json.loads(raw_eval) for raw_eval in evaluations]

In [29]:
# Tabulate the parsed prompt-1 verdicts and count each relevance class.
df_evaluations = pd.DataFrame(json_evaluations)
df_evaluations.Relevance.value_counts()

Relevance
RELEVANT           124
PARTLY_RELEVANT     19
NON_RELEVANT         7
Name: count, dtype: int64

In [31]:
df_evaluations[df_evaluations.Relevance == 'NON_RELEVANT']

Unnamed: 0,Relevance,Explanation
18,NON_RELEVANT,The generated answer is completely unrelated t...
33,NON_RELEVANT,The generated answer discusses project attempt...
53,NON_RELEVANT,The generated answer discusses missing session...
63,NON_RELEVANT,The generated answer refers to accessing the F...
91,NON_RELEVANT,The generated answer does not address the orig...
97,NON_RELEVANT,The generated answer does not address or relat...
120,NON_RELEVANT,The generated answer does not address the user...


<h5>prompt2</h5>

In [32]:
# Preview the second judge prompt on one record.
# NOTE: `record` is not assigned in this cell; it is whatever the most recent
# `for record in ...` loop left bound, so this is not necessarily samples[0].
prompt = prompt2_template.format(**record)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: Do we need to download 12 separate parquet files for the 2022 green taxi data for homework 3?
Generated Answer: Yes, you need to download all 12 separate parquet files for the 2022 green taxi data for homework 3. The parquet files are available for each month separately, and you need to add all 12 files to your GCS bucket. You can refer to them using the URIs option when creating an external table in BigQuery, and you can also use the wildcard "*" to refer to all 12 files using a single string.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  

In [33]:
# Ask the judge model to rate every sampled record with prompt 2 (question vs.
# generated answer); raw response strings are collected and parsed later.
evaluations_2 = []

for record in tqdm(samples):
    prompt = prompt2_template.format(**record)
    evaluation = llm(prompt, model='gpt-4o-mini')
    evaluations_2.append(evaluation)

100%|██████████| 150/150 [03:56<00:00,  1.58s/it]


In [34]:
# Parse each raw prompt-2 judge response as JSON; a response that is not valid
# JSON raises json.JSONDecodeError.
json_evaluations_2 = [json.loads(raw_eval) for raw_eval in evaluations_2]

In [35]:
# Tabulate the parsed prompt-2 verdicts and count each relevance class.
df_evaluations_2 = pd.DataFrame(json_evaluations_2)
df_evaluations_2.Relevance.value_counts()

Relevance
RELEVANT           129
PARTLY_RELEVANT     18
NON_RELEVANT         3
Name: count, dtype: int64

In [39]:
# Save both sets of judge verdicts (prompt 1 -> 'aqa', prompt 2 -> 'qa'),
# written with ';' as the separator.
df_evaluations.to_csv('evaluations-aqa.csv', index=False, sep=";")
df_evaluations_2.to_csv('evaluations-qa.csv', index=False, sep=";")