In [1]:
import pandas as pd

In [3]:
df = pd.read_csv("../data/ground-truth-retrieval.csv")
df

Unnamed: 0,id,chunk_id,question
0,290,290_20,What are the exact resolutions that Gretchen R...
1,290,290_20,What is the name of Gretchen Rubin's book that...
2,290,290_20,What feeling of helplessness did Gretchen Rubi...
3,290,290_20,What specific tools was Gretchen Rubin only ab...
4,290,290_20,What is the name of the experiment that Gretch...
...,...,...,...
3361,466,466_25,What were the key findings of the book that st...
3362,466,466_25,Could Richard Koch have universally explained ...
3363,466,466_25,"Why was Malcolm Gladwell's thesis about 10,000..."
3364,466,466_25,"How many people out of 20, which Richard Koch ..."


In [5]:
documents = df.to_dict(orient="records")

In [6]:
documents[:5]

[{'id': 290,
  'chunk_id': '290_20',
  'question': 'What are the exact resolutions that Gretchen Rubin is referring to in relation to her daily habits?'},
 {'id': 290,
  'chunk_id': '290_20',
  'question': "What is the name of Gretchen Rubin's book that 'did not find its audience'?"},
 {'id': 290,
  'chunk_id': '290_20',
  'question': 'What feeling of helplessness did Gretchen Rubin experience as a result of her book being a commercial failure?'},
 {'id': 290,
  'chunk_id': '290_20',
  'question': 'What specific tools was Gretchen Rubin only able to access after her book failed?'},
 {'id': 290,
  'chunk_id': '290_20',
  'question': 'What is the name of the experiment that Gretchen Rubin was testing in her book The Happiness Project?'}]

# Retrieval evaluation

Evaluate **keyword search** (default) vs. **semantic search** (using embedding generated by SentenceTransformer model [multi-qa-mpnet-base-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-cos-v1)) in ElasticSearch.

In [7]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)


def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [8]:
from tqdm import tqdm


def evaluate(ground_truth, search_function):
    relevance_overall = []
    relevance_document = []

    for q in tqdm(ground_truth):
        doc_id, chunk_id = q["id"], q["chunk_id"]
        results = search_function(q)
        relevance_chunk = [(d["chunk_id"] == chunk_id) for d in results]
        relevance_doc = [(d["id"] == doc_id) for d in results]
        relevance_overall.append(relevance_chunk)
        relevance_document.append(relevance_doc)

    return {
        "hit_rate_overall": hit_rate(relevance_overall),
        "mrr_overall": mrr(relevance_overall),
        "hit_rate_document": hit_rate(relevance_document),
        "mrr_document": mrr(relevance_document),
    }

In [9]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.path.curdir, "..")))
from utils.rag import elastic_keyword_search, elastic_semantic_search

  from tqdm.autonotebook import tqdm, trange


In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("multi-qa-mpnet-base-cos-v1")



In [11]:
from elasticsearch import Elasticsearch


es_client = Elasticsearch("http://127.0.0.1:9200")
es_client.info()

ObjectApiResponse({'name': '75e558d7ad26', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'hz3rz7BgQ2CimtW7H8da-A', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [12]:
from utils.constants import INDEX_NAME


es_client.indices.exists(index=INDEX_NAME)

HeadApiResponse(True)

In [13]:
keyword_results = evaluate(
    documents, lambda x: elastic_keyword_search(es_client, x["question"])
)
semantic_results = evaluate(
    documents, lambda x: elastic_semantic_search(es_client, x["question"], model)
)

100%|██████████| 3366/3366 [00:31<00:00, 105.22it/s]
100%|██████████| 3366/3366 [06:39<00:00,  8.44it/s]


In [14]:
keyword_results

{'hit_rate_overall': 0.7890671420083185,
 'mrr_overall': 1.0368785898197679,
 'hit_rate_document': 0.8722519310754605,
 'mrr_document': 1.571390374331495}

In [15]:
semantic_results

{'hit_rate_overall': 0.3820558526440879,
 'mrr_overall': 0.44081501287383695,
 'hit_rate_document': 0.6316102198455139,
 'mrr_document': 0.9488710635769462}

In [17]:
import json


with open("../data/keyword_eval.json", "w") as f:
    json.dump(keyword_results, f)

with open("../data/semantic_eval.json", "w") as f:
    json.dump(semantic_results, f)

Keyword search seems to offer better performance than semantic search, and will be used in production.

**Note:** ElasticSearch don't do "true" vector semantic search, but only approximately since it only samples maximum 10000 documents, out of 16397 currently in the the database. This definitely impedes performance.

# RAG Evaluation

Evaluate `llama-3.1-8b-instant` vs. `llama3-8b-8192` from Groq using GPT-4o-mini as a judge.

In [1]:
import pandas as pd
from tqdm import tqdm
import json

In [2]:
df = pd.read_csv("../data/ground-truth-retrieval.csv")

In [3]:
from dotenv import dotenv_values
import os

env_path = os.path.abspath(os.path.join(os.path.curdir, "../.env"))
config = dotenv_values(env_path)

In [4]:
from openai import OpenAI

client = OpenAI(
    api_key=config["OPENAI_API_KEY"],
)

In [5]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.path.curdir, "..")))
from utils.rag import rag

  from tqdm.autonotebook import tqdm, trange


In [6]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch("http://127.0.0.1:9200")
es_client.info()

ObjectApiResponse({'name': '75e558d7ad26', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'hz3rz7BgQ2CimtW7H8da-A', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [7]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [8]:
df_sample = df.sample(n=200, random_state=1)

In [9]:
sample = df_sample.to_dict(orient="records")

In [15]:
def llm_judge(prompt, model="gpt-4o-mini"):
    response = client.chat.completions.create(
        model=model, messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [17]:
llama3_8b_evaluations = []

for record in tqdm(sample):
    question = record["question"]
    answer_llm = rag(es_client, question, "llama3-8b-8192")

    prompt = prompt2_template.format(question=question, answer_llm=answer_llm)

    evaluation = llm_judge(prompt)
    evaluation = json.loads(evaluation)

    llama3_8b_evaluations.append((record, answer_llm, evaluation))

100%|██████████| 200/200 [30:35<00:00,  9.18s/it]


In [19]:
with open("../data/llama3_8b_evaluations.txt", "w") as f:
    json.dump(llama3_8b_evaluations, f)

In [18]:
llama31_8b_evaluations = []

for record in tqdm(sample):
    question = record["question"]
    answer_llm = rag(es_client, question, "llama-3.1-8b-instant")

    prompt = prompt2_template.format(question=question, answer_llm=answer_llm)

    evaluation = llm_judge(prompt)
    evaluation = json.loads(evaluation)

    llama31_8b_evaluations.append((record, answer_llm, evaluation))

  4%|▍         | 9/200 [01:16<27:09,  8.53s/it]


KeyboardInterrupt: 