# My Evaluation Approach


![](assets/my_approach.png)


## Setup


In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# modules before each cell is executed, so edits to local helper modules
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import time
import os
from typing import List, Dict, TypedDict
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import nest_asyncio
from dotenv import load_dotenv

import openai
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import VectorStore
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain.schema import StrOutputParser

In [None]:
# Read environment variables from a local .env file (presumably the OpenAI
# credentials used below); `loaded` keeps load_dotenv's return value.
loaded = load_dotenv()

# Root folder of the benchmark. The trailing slash matters: other cells build
# paths by plain string concatenation onto data_dir.
data_dir = "my_benchmark/"

# Publish the same folder through the environment.
# NOTE(review): presumably read by the utils helpers / chunking notebook — confirm.
os.environ.update({"CHUNKING_BENCHMARK": data_dir})

# 1. Load and Save Documents


Each file is loaded as a single LangChain document, which may be too large to fit into an LLM's context window. Therefore, we need to split these documents into smaller pieces of text for further processing.

In [None]:
from utils.loader import save_documents

# Load every file found directly in <data_dir>/documents as LangChain documents.
#
# Entries are visited in sorted order because os.listdir makes no ordering
# guarantee; sorting keeps the document order (and anything derived from it)
# reproducible across runs and platforms. Non-file entries such as
# sub-directories are skipped instead of being handed to the loader.
documents_dir = os.path.join(data_dir, "documents")

documents: List[Document] = []
for file in sorted(os.listdir(documents_dir)):
    file_path = os.path.join(documents_dir, file)
    if not os.path.isfile(file_path):
        continue
    loader = UnstructuredFileLoader(file_path)
    documents.extend(loader.load())

# Hand the loaded documents to the project's save helper for data_dir.
save_documents(documents, data_dir)

In [None]:
from utils.loader import load_documents
# Re-read the documents through the project's loader helper.
# NOTE(review): presumably this returns what save_documents wrote for data_dir,
# so the notebook can be resumed from here without re-parsing the raw files — confirm.
documents = load_documents(data_dir)

# 2. Apply chunking


In [None]:
# Execute the chunking notebook. The -i flag runs it in this notebook's
# namespace, so it can use names that are already defined here.
# NOTE(review): presumably it produces the chunks that load_chunks reads afterwards — confirm.
%run -i chunking_strategies.ipynb

In [None]:
from utils.loader import load_chunks
# Maps each chunking experiment name to the chunks that experiment produced.
# The values are lists of Documents (not single Documents): they are passed to
# FAISS.from_documents and iterated chunk by chunk in later cells.
split_chunks: Dict[str, List[Document]] = load_chunks(data_dir)

# 3. Ingest into vector store

Using FAISS


In [None]:
# One embedding client is shared by every index built or loaded below.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", show_progress_bar=True)

# Experiment name -> FAISS index over that experiment's chunks.
vector_stores: Dict[str, VectorStore] = {}

for experiment_name, chunks in split_chunks.items():
    index_dir = f"{data_dir}vector_stores/{experiment_name}"
    if not os.path.exists(index_dir):
        # No index on disk yet: embed the chunks and persist the result.
        print("Indexing", experiment_name)
        store = FAISS.from_documents(chunks, embeddings)
        vector_stores[experiment_name] = store
        store.save_local(index_dir)
    else:
        # Reuse the persisted index instead of paying for the embeddings again.
        # NOTE: allow_dangerous_deserialization opts in to unpickling the saved
        # index data; only load index folders that this notebook created itself.
        print("Loading", experiment_name)
        store = FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)
        vector_stores[experiment_name] = store

# 4. Evaluation


For each chunking strategy, we build 3 evaluation datasets, each of which should include the following:

- Questions across Documents
- Ground Truth Chunks (with graded Relevance)
- Ground Truth Answers

There is one dataset each for Simple, Reasoning and Multi-Context questions.


In [None]:
from utils.evaluation import GoldenTestset

# Golden test cases grouped by question type; every question type holds a list
# of GoldenTestset entries.
Questions = TypedDict(
    "Questions",
    {
        "simple": List[GoldenTestset],
        "reasoning": List[GoldenTestset],
        "multi_context": List[GoldenTestset],
    },
)

# Evaluation data per experiment name; filled in by the cells that follow.
gold_dataset: Dict[str, Questions] = dict()


## Question Generation with RAGAS


Generate synthetic Questions across Documents to challenge chunking strategies on multi-context queries


In [None]:
from os import environ

# Set RAGAS_DO_NOT_TRACK before ragas is imported in the next cell
# (per the ragas docs, this flag disables its usage analytics).
environ.update({"RAGAS_DO_NOT_TRACK": "true"})

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# Patch asyncio so event loops can be nested.
# NOTE(review): presumably needed because ragas drives asyncio inside the
# notebook's already-running event loop — confirm.
nest_asyncio.apply()

# Generator and critic share the same model settings but are separate clients.
llm_settings = {"model": "gpt-4o-mini", "temperature": 0}
generator_llm = ChatOpenAI(**llm_settings)
critic_llm = ChatOpenAI(**llm_settings)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Share of each question type among the generated questions.
question_mix = {simple: 0.3, reasoning: 0.3, multi_context: 0.4}
ragas_testset = generator.generate_with_langchain_docs(
    documents, test_size=10, distributions=question_mix
)

# Persist the generated questions as CSV so later cells can reload them from disk.
df = ragas_testset.to_pandas()
df.to_csv(f"{data_dir}ragas_testset.csv", index=False)

In [None]:
# Seed the gold dataset from the RAGAS test set stored as CSV: every
# experiment gets its own copy of each question, bucketed by the question's
# evolution type ("simple", "reasoning" or "multi_context").
ragas_testset = pd.read_csv(data_dir+"ragas_testset.csv")
for experiment_name in split_chunks:
    gold_dataset[experiment_name] = {
        "simple": [],
        "reasoning": [],
        "multi_context": [],
    }
    for _, row in ragas_testset.iterrows():
        # A fresh dict per experiment: ground-truth chunks are experiment-specific.
        testset = {
            "question": row["question"],
            # Mapping of chunk id -> relevance score. It starts as an empty dict
            # (not a list) to match how it is filled by the relevance grading
            # cell and read via .items() by the retrieval evaluation.
            "ground_truth_chunks": {},
            "ground_truth_answer": row["ground_truth"],
        }
        gold_dataset[experiment_name][row["evolution_type"]].append(testset)

## Generate Relevancy Score for each chunk


The relevance prompt is taken from TruLens. The difference is that I apply it to all chunks, whereas TruLens only computes it on the retrieved chunks.


In [None]:
from utils.llm_output_parser import re_0_10_rating

# System prompt for the LLM judge: it asks for a bare 0-10 relevance score of
# a CONTEXT with respect to a QUESTION. The text is sent verbatim (including
# the indentation inside the string), so do not reformat it casually.
system_prompt = """You are a RELEVANCE grader; providing the relevance of the given CONTEXT to the given QUESTION.
    Respond only as a number from 0 to 10 where 0 is the least relevant and 10 is the most relevant. 

    A few additional scoring guidelines:

    - Long CONTEXTS should score equally well as short CONTEXTS.

    - RELEVANCE score should increase as the CONTEXTS provides more RELEVANT context to the QUESTION.

    - RELEVANCE score should increase as the CONTEXTS provides RELEVANT context to more parts of the QUESTION.

    - CONTEXT that is RELEVANT to some of the QUESTION should score of 2, 3 or 4. Higher score indicates more RELEVANCE.

    - CONTEXT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.

    - CONTEXT that is RELEVANT to the entire QUESTION should get a score of 9 or 10. Higher score indicates more RELEVANCE.

    - CONTEXT must be relevant and helpful for answering the entire QUESTION to get a score of 10.

    - Never elaborate."""

# User message template: filled with one question and one chunk's text per
# request; it ends with "RELEVANCE: " so the model answers with the score.
user_prompt = PromptTemplate.from_template(
    """QUESTION: {question}

    CONTEXT: {context}
    
    RELEVANCE: """
)

# Judge model (temperature 0) that make_request_with_backoff sends the
# grading requests to.
critic_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


def make_request_with_backoff(messages, retries=8):
    """Send ``messages`` to ``critic_llm``, retrying on rate-limit errors.

    After the i-th rate-limited attempt (counting from 0) the function sleeps
    ``2**i`` seconds before trying again.

    Args:
        messages: Chat messages passed unchanged to ``critic_llm.invoke``.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The response of the first successful ``critic_llm.invoke`` call.

    Raises:
        ValueError: If ``retries`` is smaller than 1. Previously the loop was
            skipped and ``None`` was returned silently, which only failed
            later when the caller accessed ``.content``.
        openai.RateLimitError: If the final attempt is still rate limited.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    for attempt in range(retries):
        try:
            return critic_llm.invoke(messages)
        except openai.RateLimitError:
            if attempt == retries - 1:
                # Out of attempts: re-raise with the original traceback.
                raise
            wait_time = 2**attempt
            print(f"Rate limit error, retrying in {wait_time} seconds...")
            time.sleep(wait_time)


# Grade every chunk of every experiment against every question and keep the
# non-zero scores as that question's ground-truth chunks.
# The system message is the same for every request, so build it once.
relevance_system_message = SystemMessage(content=system_prompt)

for experiment_name, questions in gold_dataset.items():
    print("Collecting ground truth for", experiment_name)
    for testsets in questions.values():
        for testset in testsets:
            question = testset["question"]
            print("Collecting ground truth for", question)

            # chunk id (as string) -> relevance score returned by the judge
            graded_chunks = {}
            for chunk in tqdm(split_chunks[experiment_name]):
                grading_request = user_prompt.format(
                    question=question, context=chunk.page_content
                )
                reply = make_request_with_backoff(
                    [relevance_system_message, HumanMessage(content=grading_request)]
                )

                score = re_0_10_rating(reply.content)
                if score == 0.0:
                    # Chunks graded 0 are left out of the ground truth.
                    continue
                graded_chunks[str(chunk.metadata["id"])] = score

            testset["ground_truth_chunks"] = graded_chunks

Save Evaluation Dataset


In [None]:
# Write the gold dataset as indented JSON into the benchmark folder.
gold_dataset_path = f"{data_dir}gold_dataset.json"
with open(gold_dataset_path, "w") as json_file:
    json.dump(gold_dataset, json_file, indent=4)

Load Evaluation Dataset


In [None]:
# Reset the in-memory dataset, then replace it with the JSON stored in the
# benchmark folder.
gold_dataset = dict()
with open(f"{data_dir}gold_dataset.json", "r") as json_file:
    gold_dataset = json.loads(json_file.read())

## Evaluate Retrieval

In [None]:
from utils.evaluation import calculate_metrics, calculate_mean_metrics

# Ground-truth chunks graded at or below this relevance are not counted as relevant.
MIN_RELEVANCE = 2

# For every experiment and question type: retrieve as many chunks as there are
# relevant ground-truth chunks, score each question, and print the mean metrics.
for experiment_name, questions in gold_dataset.items():
    if experiment_name not in vector_stores:
        continue

    vector_store = vector_stores[experiment_name]
    # Silence the embedding progress bar once per store rather than per question.
    vector_store.embeddings.show_progress_bar = False

    for question_type, testsets in questions.items():
        metrics = []
        for testset in testsets:
            question = testset["question"]
            # Filter out ground-truth chunks that have a low relevance score.
            ground_truth = {
                chunk_id: relevance
                for chunk_id, relevance in testset["ground_truth_chunks"].items()
                if relevance > MIN_RELEVANCE
            }

            K = len(ground_truth)
            if K == 0:
                # Nothing relevant to find, and a retriever with k=0 cannot run a
                # meaningful search (faiss is expected to reject k=0), so skip.
                print(f"Skipping question without relevant ground truth chunks: {question}")
                continue

            retriever = vector_store.as_retriever(search_kwargs={"k": K})
            retrieved_chunks = retriever.invoke(question)
            retrieved_chunk_ids = [str(doc.metadata["id"]) for doc in retrieved_chunks]
            metrics.append(
                calculate_metrics(
                    retrieved_chunk_ids,
                    ground_truth_chunks=list(ground_truth.keys()),
                    ground_truth_relevancies=list(ground_truth.values()),
                    # NOTE(review): the metric cutoff stays fixed at 10 although only
                    # K chunks are retrieved — confirm this is intended.
                    K=10,
                )
            )

        if not metrics:
            # Avoid averaging over an empty list when no question could be evaluated.
            print(f"No metrics for {experiment_name} {question_type}: no evaluable questions")
            continue

        mean_metrics = calculate_mean_metrics(metrics)
        print(f"Mean metrics for {experiment_name} {question_type}: {mean_metrics}")

## Evaluate Generation

In [None]:

# Patch asyncio so event loops can be nested inside the notebook.
nest_asyncio.apply()

# System prompt for the LLM judge: it asks for a bare 0-10 correctness score of
# a GENERATED ANSWER compared with the GROUND TRUTH ANSWER. Sent verbatim.
answer_correctness_system_prompt = """You are a CORRECTNESS grader; providing the correctness of the given GENERATED ANSWER compared to the given GROUND TRUTH ANSWER.
Respond only as a number from 0 to 10 where 0 is the least correct and 10 is the most correct.

A few additional scoring guidelines:

- Long GENERATED ANSWERS should score equally well as short GENERATED ANSWERS.

- CORRECTNESS score should increase as the GENERATED ANSWER matches more accurately with the GROUND TRUTH ANSWER.

- CORRECTNESS score should increase as the GENERATED ANSWER covers more parts of the GROUND TRUTH ANSWER accurately.

- GENERATED ANSWERS that partially match the GROUND TRUTH ANSWER should score 2, 3, or 4. Higher scores indicate more correctness.

- GENERATED ANSWERS that mostly match the GROUND TRUTH ANSWER should get a score of 5, 6, 7, or 8. Higher scores indicate more correctness.

- GENERATED ANSWERS that fully match the GROUND TRUTH ANSWER should get a score of 9 or 10. Higher scores indicate more correctness.

- GENERATED ANSWERS must be fully accurate and comprehensive to the GROUND TRUTH ANSWER to get a score of 10.

- Never elaborate."""

# User message template: filled with one ground-truth answer and one generated
# answer per request; it ends with "CORRECTNESS: " so the model replies with the score.
answer_correctness_user_prompt = PromptTemplate.from_template(
    """GROUND TRUTH ANSWER: {ground_truth_answer}

GENERATED ANSWER: {generated_answer}

CORRECTNESS: """
)

# RAG answer prompt pulled from the LangChain Hub; the chain built below feeds
# it "context" and "question".
prompt = hub.pull("rlm/rag-prompt")
# Model (temperature 0) that writes the answers being graded.
generator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
def format_docs(docs):
    """Return the ``page_content`` of all ``docs`` joined by blank lines."""
    separator = "\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)
        
# For every experiment: answer each gold question with a RAG chain over that
# experiment's vector store, let the judge grade the answer against the
# ground-truth answer, and print the mean score per question type.
for experiment_name, questions in gold_dataset.items():
    # Skip experiments without a vector store, as the retrieval evaluation
    # does, instead of failing with a KeyError.
    if experiment_name not in vector_stores:
        print("Skipping", experiment_name, "(no vector store)")
        continue

    print("Evaluating", experiment_name)
    vector_store = vector_stores[experiment_name]
    vector_store.embeddings.show_progress_bar = False
    retriever = vector_store.as_retriever(search_kwargs={"k": 10})
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | generator_llm
        | StrOutputParser()
    )

    for question_type, testsets in questions.items():
        if not testsets:
            # No questions of this type: avoid dividing by zero below.
            print(f"Experiment: {experiment_name} Question Type: {question_type} has no questions, skipping")
            continue

        total_answer_correctness = 0
        for testset in testsets:
            generated_answer = rag_chain.invoke(testset["question"])
            answer_correctness_prompt = answer_correctness_user_prompt.format(
                ground_truth_answer=testset["ground_truth_answer"],
                generated_answer=generated_answer,
            )

            llm_messages = [
                SystemMessage(content=answer_correctness_system_prompt),
                HumanMessage(content=answer_correctness_prompt),
            ]
            # Kept in its own variable so the generated answer is not overwritten.
            grader_response = make_request_with_backoff(llm_messages)

            total_answer_correctness += re_0_10_rating(grader_response.content)

        mean_answer_correctness = total_answer_correctness / len(testsets)
        print(f"Experiment: {experiment_name} Question Type: {question_type} Mean Answer Correctness: {mean_answer_correctness}")