# My Evaluation Approach


![](assets/my_approach.png)


## Setup


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import time
import os
from typing import List, Dict, TypedDict
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import nest_asyncio
from dotenv import load_dotenv

import openai
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import VectorStore
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain.schema import StrOutputParser

In [3]:
# Load variables from a local .env file into the process environment;
# override=True lets values from .env replace variables that are already set.
loaded = load_dotenv(override=True)

# Root folder of the benchmark data. Later cells build paths by plain string
# concatenation (data_dir + "documents"), so the trailing slash is required.
data_dir = "my_benchmark/"
# NOTE(review): presumably read by chunking_strategies.ipynb and/or utils — confirm.
os.environ['CHUNKING_BENCHMARK'] = data_dir

# 1. Load and Save Documents


Each document is loaded as a single LangChain document, which may be too large to fit into an LLM's context window. Therefore, we need to split these documents into smaller pieces of text for further processing.

In [23]:
from utils.loader import save_documents

# Read every entry of the benchmark's documents folder with TextLoader and
# collect the resulting LangChain documents in directory-listing order.
documents_dir = data_dir + "documents"
documents: List[Document] = [
    document
    for file_name in os.listdir(documents_dir)
    for document in TextLoader(os.path.join(documents_dir, file_name)).load()
]

# Persist the loaded documents below data_dir so later sessions can reload them.
save_documents(documents, data_dir)

In [4]:
from utils.loader import load_documents
# Reload the documents from data_dir (presumably the ones written by
# save_documents — confirm) so the loading cell does not have to be re-run.
documents = load_documents(data_dir)

# 2. Apply chunking


In [None]:
%run -i chunking_strategies.ipynb

In [92]:
from utils.loader import load_chunks
# Maps each chunking experiment name to its chunks. Later cells iterate the
# values chunk by chunk and pass them to FAISS.from_documents, so every value
# is a list of documents rather than a single document.
split_chunks: Dict[str, List[Document]] = load_chunks(data_dir)

# 3. Ingest into vector store

Using FAISS


In [139]:
from langchain_huggingface import HuggingFaceEmbeddings

vector_stores: Dict[str, VectorStore] = {}

# Embedding model shared by all indexes. A local alternative is
# HuggingFaceEmbeddings with model_name="Snowflake/snowflake-arctic-embed-l" or
# "Alibaba-NLP/gte-large-en-v1.5" and
# model_kwargs={"device": 0, "trust_remote_code": True} (drop "device" for CPU).
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Some embedding classes expose the identifier as `model_name`, others as
# `model`. Slashes are replaced so the identifier is usable as a folder name.
if hasattr(embeddings, 'model_name'):
    raw_model_name = embeddings.model_name
else:
    raw_model_name = embeddings.model
model_name = raw_model_name.replace("/", "_")

vector_store_dir = f"{data_dir}vector_stores/{model_name}"
Path(vector_store_dir).mkdir(parents=True, exist_ok=True)

# One FAISS index per chunking experiment: reuse the index stored on disk when
# there is one, otherwise embed the chunks and persist the new index.
for experiment_name, chunks in split_chunks.items():
    index_path = f"{vector_store_dir}/{experiment_name}"
    if not os.path.exists(index_path):
        print("Indexing", experiment_name)
        store = FAISS.from_documents(chunks, embeddings)
        vector_stores[experiment_name] = store
        store.save_local(index_path)
    else:
        print("Loading", experiment_name)
        # SECURITY: allow_dangerous_deserialization=True unpickles data stored
        # next to the index; only load folders that this notebook created.
        vector_stores[experiment_name] = FAISS.load_local(
            index_path, embeddings, allow_dangerous_deserialization=True
        )

Loading markdown_header_recursive-1024-0
Loading fixed_size-512-200
Loading fixed_size-1024-0
Loading markdown_header_recursive-512-200
Loading semantic_chunks_90
Loading fixed_size-1024-200
Loading markdown_header
Loading fixed_size-2048-0
Loading markdown_header_recursive-2048-200
Loading markdown_header_recursive-2048-0
Loading markdown_header_recursive-1024-200
Loading fixed_size-512-0
Loading recursive-1024-200
Loading markdown_header_parent
Loading fixed_size-2048-200
Loading recursive-2048-0
Loading markdown_header_recursive-512-0
Loading recursive-512-200
Loading recursive-2048-200
Loading semantic_chunks_recursive-95-2048-200
Loading recursive-1024-0
Loading semantic_chunks_95
Loading recursive-512-0


# 5. Evaluation


## Create Golden Datasets

For each chunking strategy, three golden evaluation datasets are created. Each of them should include the following:

- Questions across Documents
- Ground Truth Chunks (with graded Relevance)
- Ground Truth Answers

One golden dataset is created per question type: Simple, Reasoning, and Multi-Context.


In [7]:
from utils.evaluation import GoldenTestset

class Questions(TypedDict):
    """Golden test sets of one chunking experiment, grouped by question type."""

    # One list of golden test sets per question type; the keys match the
    # "evolution_type" values used when the dataset is filled.
    simple: List[GoldenTestset]
    reasoning: List[GoldenTestset]
    multi_context: List[GoldenTestset]

# Golden dataset keyed by chunking experiment name.
gold_dataset: Dict[str, Questions]  = {}


Create the golden dataset from a subset of the documents only, so that the remaining documents stay in the vector stores as irrelevant noise.

In [8]:
# Source paths of the documents that questions are generated from; they are
# compared against the "source" metadata of the loaded documents.
documents_subset_sources = [
    f"{data_dir}documents/{file_name}"
    for file_name in ("sleep.md", "teeth.md", "time_management.md", "mentoring.md")
]

### Question Generation with RAGAS


Generate synthetic Questions across Documents to challenge chunking strategies on multi-context queries


In [9]:
from os import environ

# NOTE(review): presumably opts out of RAGAS usage tracking; it is set before
# ragas is imported in the next cell — confirm against the RAGAS documentation.
environ["RAGAS_DO_NOT_TRACK"] = "true"

In [52]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# nest_asyncio patches asyncio so that an event loop can be run inside the
# notebook's already-running loop.
nest_asyncio.apply()

generator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
critic_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Questions are generated only from the selected subset of documents, with
# 40% simple, 40% reasoning and 20% multi-context questions.
ragas_testset = generator.generate_with_langchain_docs(
    [document for document in documents if document.metadata["source"] in documents_subset_sources],
    test_size=10,
    distributions={simple: 0.4, reasoning: 0.4, multi_context: 0.2},
)
df = ragas_testset.to_pandas()
df = df.drop(columns=["contexts"]) # ground truth contexts/chunks are determined in next step
# orient="records" stores one JSON object per question and leaves out the row
# index. Passing index=False together with the default orient ("columns")
# makes pandas raise a ValueError, which would discard the generated test set.
df.to_json(data_dir+"ragas_testset.json", orient="records")

NameError: name 'documents' is not defined

In [29]:
# Start every chunking experiment with the same questions and answers; the
# experiment-specific ground truth chunks are filled in by a later cell.
ragas_testset = pd.read_json(data_dir+"ragas_testset.json")
for experiment_name in split_chunks:
    questions_by_type = {
        question_type: [] for question_type in ("simple", "reasoning", "multi_context")
    }
    gold_dataset[experiment_name] = questions_by_type
    for _, row in ragas_testset.iterrows():
        # A fresh dict per experiment, so ground truth is never shared
        # between experiments.
        questions_by_type[row["evolution_type"]].append(
            {
                "question": row['question'],
                "source": [metadata["source"] for metadata in row['metadata']],
                "ground_truth_chunks": {},
                "ground_truth_answer": row['ground_truth'],
            }
        )

### Generate Relevancy Score for each chunk


The relevancy prompt is taken from TruLens. The difference is that I apply it to all chunks, whereas TruLens only computes it on the retrieved chunks.


In [18]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils.llm_output_parser import re_0_10_rating

system_prompt = """You are a RELEVANCE grader; providing the relevance of the given CONTEXT to the given QUESTION.
    Respond only as a number from 0 to 10 where 0 is the least relevant and 10 is the most relevant. 

    A few additional scoring guidelines:

    - Long CONTEXTS should score equally well as short CONTEXTS.

    - RELEVANCE score should increase as the CONTEXTS provides more RELEVANT context to the QUESTION.

    - RELEVANCE score should increase as the CONTEXTS provides RELEVANT context to more parts of the QUESTION.

    - CONTEXT that is RELEVANT to some of the QUESTION should score of 2, 3 or 4. Higher score indicates more RELEVANCE.

    - CONTEXT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.

    - CONTEXT that is RELEVANT to the entire QUESTION should get a score of 9 or 10. Higher score indicates more RELEVANCE.

    - CONTEXT must be relevant and helpful for answering the entire QUESTION to get a score of 10.

    - Never elaborate."""

user_prompt = PromptTemplate.from_template(
    """QUESTION: {question}

    CONTEXT: {context}
    
    RELEVANCE: """
)

critic_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


def make_request_with_backoff(messages, retries=8):
    """Invoke ``critic_llm`` with ``messages``, retrying on OpenAI API errors.

    Rate limit errors are retried after an exponentially growing pause
    (1, 2, 4, ... seconds). Other API errors are printed and retried
    immediately. When the last attempt fails as well, its error is re-raised
    instead of silently returning ``None``, which previously made callers fail
    later with an unrelated ``AttributeError`` on ``response.content``.

    Args:
        messages: Chat messages passed unchanged to ``critic_llm.invoke``.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The response object returned by ``critic_llm.invoke``.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        openai.RateLimitError: If the last attempt is still rate limited.
        openai.APIError: If the last attempt fails with another API error.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            return critic_llm.invoke(messages)
        except openai.RateLimitError:
            # Must be handled before APIError, which is its base class.
            if is_last_attempt:
                raise
            wait_time = 2**attempt
            print(f"Rate limited, waiting {wait_time} seconds")
            time.sleep(wait_time)
        except openai.APIError as e:
            print(e)
            if is_last_attempt:
                raise


def process_chunk(chunk, testset):
    """Grade how relevant ``chunk`` is to the question of ``testset``.

    Chunks whose source document is not listed in ``testset["source"]`` are
    skipped without calling the LLM.

    Returns:
        ``(chunk id as str, relevancy)`` when the parsed rating is not 0.0,
        otherwise ``(None, None)``.
    """
    not_graded = (None, None)

    source_matches = chunk.metadata["source"] in testset["source"]
    if not source_matches:
        return not_graded

    grading_messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(
            content=user_prompt.format(
                question=testset["question"], context=chunk.page_content
            )
        ),
    ]
    llm_reply = make_request_with_backoff(grading_messages)
    rating = re_0_10_rating(llm_reply.content)

    # A rating of zero means "not relevant": such chunks are left out.
    if rating == 0.0:
        return not_graded
    return str(chunk.metadata["id"]), rating

# Grade every chunk of every experiment against every question. The chunks of
# one question are graded concurrently by two worker threads.
for experiment_name, questions in gold_dataset.items():
    print("Collecting ground truth for", experiment_name)
    for question_type, testsets in questions.items():
        print("Collecting ground truth for", question_type)
        for testset in tqdm(testsets):
            with ThreadPoolExecutor(max_workers=2) as executor:
                pending = [
                    executor.submit(process_chunk, chunk, testset)
                    for chunk in split_chunks[experiment_name]
                ]
                # Results are consumed in completion order.
                graded = [future.result() for future in as_completed(pending)]

            # Keep only chunks that received an id and a non-zero relevancy.
            ground_truth = {
                chunk_id: relevancy
                for chunk_id, relevancy in graded
                if chunk_id and relevancy
            }

            # Test sets without any relevant chunk keep their empty default.
            if ground_truth:
                testset["ground_truth_chunks"] = ground_truth

Collecting ground truth for markdown-header-recursive-512-200
Collecting ground truth for simple


100%|██████████| 4/4 [00:48<00:00, 12.24s/it]


Collecting ground truth for reasoning


100%|██████████| 4/4 [01:02<00:00, 15.68s/it]


Collecting ground truth for multi_context


100%|██████████| 2/2 [00:23<00:00, 11.51s/it]


Collecting ground truth for markdown-header-parent
Collecting ground truth for simple


100%|██████████| 4/4 [00:14<00:00,  3.52s/it]


Collecting ground truth for reasoning


100%|██████████| 4/4 [00:17<00:00,  4.39s/it]


Collecting ground truth for multi_context


100%|██████████| 2/2 [00:06<00:00,  3.40s/it]


Save Evaluation Dataset


In [26]:
# Write the golden dataset as indented JSON.
# NOTE(review): this writes gold_dataset_1.json while the retrieval evaluation
# loads gold_dataset.json — confirm that the different file name is intended.
with open(f"{data_dir}gold_dataset_1.json", "w") as gold_dataset_file:
    json.dump(gold_dataset, gold_dataset_file, indent=4)

## Evaluate Retrieval

Load Evaluation Dataset


In [100]:
# Start from an empty dataset, then replace it with the stored JSON content.
gold_dataset = {}
with open(f"{data_dir}gold_dataset.json", "r") as gold_dataset_file:
    gold_dataset = json.load(gold_dataset_file)

In [159]:
from utils.evaluation import calculate_metrics, calculate_mean_metrics, EvalApproach

# How many chunks are retrieved per question:
#   FIXED_K        - always FIXED_K chunks
#   GROUND_TRUTH_K - as many chunks as the question has ground truth chunks
#   TOKEN_LIMIT    - as many chunks as fit into roughly TOKEN_LIMIT tokens
SEL_APPROACH: EvalApproach = EvalApproach.FIXED_K
FIXED_K = 5
TOKEN_LIMIT = 3000

results_list = []
for experiment_name, questions in gold_dataset.items():
    # Experiments without a vector store cannot be evaluated.
    if experiment_name not in vector_stores:
        continue

    # Experiment names look like "<strategy>-<chunk size>-<chunk overlap>".
    # Names without numeric size/overlap parts (e.g. "markdown_header") get
    # None for both values. Only the expected parsing errors are caught, so
    # unrelated problems are no longer swallowed by a bare except.
    name_parts = experiment_name.split("-")
    try:
        experiment_chunk_size = int(name_parts[-2])
        experiment_chunk_overlap = int(name_parts[-1])
    except (IndexError, ValueError):
        experiment_chunk_size = None
        experiment_chunk_overlap = None

    for question_type, testsets in questions.items():
        metrics = []
        for testset in testsets:
            question = testset["question"]
            ground_truth = testset["ground_truth_chunks"]

            if SEL_APPROACH == EvalApproach.FIXED_K:
                K = FIXED_K
            elif SEL_APPROACH == EvalApproach.GROUND_TRUTH_K:
                K = len(ground_truth)
            elif SEL_APPROACH == EvalApproach.TOKEN_LIMIT:
                K = 100 # large number to ensure TOKEN_LIMIT is always reached
            else:
                K = 0

            retriever = vector_stores[experiment_name].as_retriever(search_kwargs={"k": K})
            retrieved_chunks = retriever.invoke(question)

            if SEL_APPROACH == EvalApproach.TOKEN_LIMIT:
                # cap the number of retrieved chunks where sum of page_contents are below a fixed context window
                retrieved_chunks_capped = []
                total_context_length = 0
                for chunk in retrieved_chunks:
                    total_context_length += len(chunk.page_content)
                    if total_context_length > TOKEN_LIMIT * 4: # as one token on average is approximately 4 characters
                        break
                    retrieved_chunks_capped.append(chunk)

                retrieved_chunks = retrieved_chunks_capped

            retrieved_chunk_ids = [
                str(doc.metadata["id"]) for doc in retrieved_chunks
            ]
            metrics.append(
                calculate_metrics(
                    retrieved_chunk_ids,
                    ground_truth_chunks=list(ground_truth.keys()),
                    ground_truth_relevancies=list(ground_truth.values()),
                )
            )

        mean_metrics = calculate_mean_metrics(metrics)

        results_list.append(
            [
                name_parts[0],
                experiment_chunk_size,
                experiment_chunk_overlap,
                question_type,
                mean_metrics["precision"],
                mean_metrics["recall"],
                mean_metrics["map"],
                mean_metrics["ndcg"],
            ]
        )

# The evaluation name identifies the run configuration. It is used both as the
# header of the strategy column and as the name of the results file.
if SEL_APPROACH == EvalApproach.FIXED_K:
    eval_name = f"{SEL_APPROACH}-{FIXED_K}-{model_name}"
elif SEL_APPROACH == EvalApproach.GROUND_TRUTH_K:
    eval_name = f"{SEL_APPROACH}-{model_name}"
elif SEL_APPROACH == EvalApproach.TOKEN_LIMIT:
    eval_name = f"{SEL_APPROACH}-{TOKEN_LIMIT}-{model_name}"
else:
    eval_name = ""

results = pd.DataFrame(
    results_list,
    columns=[
        eval_name,
        "Chunk Size",
        "Chunk Overlap",
        "Question Type",
        "Precision",
        "Recall",
        "MAP",
        "NDCG",
    ],
)
# Create the results folder first so that a fresh checkout does not fail when
# the file is written.
Path(f"{data_dir}results").mkdir(parents=True, exist_ok=True)
results.to_csv(f"{data_dir}results/{eval_name}.csv", index=False)

### Best average strategy

In [160]:
# Select which stored evaluation run the tables below are built from.
SEL_APPROACH: EvalApproach = EvalApproach.FIXED_K
FIXED_K = 20
TOKEN_LIMIT = 3000
model_name = "text-embedding-3-small"
# Alternative embedding model: "Alibaba-NLP_gte-large-en-v1.5"

# Rebuild the name under which the results of that run were saved.
if SEL_APPROACH == EvalApproach.FIXED_K:
    eval_name = f"{SEL_APPROACH}-{FIXED_K}-{model_name}"
elif SEL_APPROACH == EvalApproach.GROUND_TRUTH_K:
    eval_name = f"{SEL_APPROACH}-{model_name}"
elif SEL_APPROACH == EvalApproach.TOKEN_LIMIT:
    eval_name = f"{SEL_APPROACH}-{TOKEN_LIMIT}-{model_name}"
else:
    eval_name = ""

In [161]:
# Mean metrics per chunking strategy, best recall first.
results = pd.read_csv(f"{data_dir}results/{eval_name}.csv")
columns_to_ignore = ["Question Type", "Chunk Size", "Chunk Overlap", "MAP"]
results_view = (
    results
    .drop(columns=columns_to_ignore)
    .groupby(eval_name)
    .mean()
    .sort_values(by="Recall", ascending=False)
)
results_view

Unnamed: 0_level_0,Precision,Recall,NDCG
Fixed-K-20-text-embedding-3-small,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
semantic_chunks_95,0.375,1.0,0.967677
semantic_chunks_recursive,0.616667,0.966667,0.941154
markdown_header_parent,0.595833,0.96,0.962851
semantic_chunks_90,0.570833,0.957816,0.935623
markdown_header,0.583333,0.95,0.935702
fixed_size,0.6875,0.824929,0.929322
recursive,0.745833,0.781641,0.918503
markdown_header_recursive,0.821528,0.757926,0.915479


In [153]:
# Mean metrics per (strategy, chunk size, chunk overlap), best recall first.
# dropna=False keeps strategies that have no chunk size or overlap.
results = pd.read_csv(f"{data_dir}results/{eval_name}.csv")
grouping_columns = [eval_name, "Chunk Size", "Chunk Overlap"]
results_view = (
    results
    .drop(columns=["Question Type", "MAP"])
    .groupby(grouping_columns, dropna=False)
    .mean()
    .sort_values(by="Recall", ascending=False)
)
results_view

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Precision,Recall,NDCG
Token-Limit-6000-text-embedding-3-small,Chunk Size,Chunk Overlap,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
fixed_size,2048.0,0.0,0.598825,0.97619,0.949561
fixed_size,1024.0,0.0,0.603901,0.967262,0.944149
semantic_chunks_recursive,2048.0,200.0,0.607676,0.966667,0.93974
recursive,2048.0,0.0,0.660256,0.963542,0.949291
recursive,2048.0,200.0,0.647436,0.963542,0.957929
fixed_size,512.0,0.0,0.592163,0.960606,0.930967
markdown_header_recursive,1024.0,0.0,0.575662,0.953175,0.93484
recursive,1024.0,200.0,0.646141,0.952703,0.917986
recursive,1024.0,0.0,0.639163,0.95205,0.92239
semantic_chunks_95,,,0.645412,0.951389,0.965582


In [154]:
# Store the currently displayed results_view next to the raw results of the run.
results_view.to_csv(f"{data_dir}results/{eval_name}_top_strategies.csv")

### Best chunk size

In [147]:
# Mean metrics per chunk size, best NDCG first. dropna=False keeps the row of
# strategies that have no chunk size.
results = pd.read_csv(f"{data_dir}results/{eval_name}.csv")
columns_to_ignore = [eval_name,"Question Type", "Chunk Overlap", "MAP"]
results_view = (
    results
    .drop(columns=columns_to_ignore)
    .groupby("Chunk Size", dropna=False)
    .mean()
    .sort_values(by="NDCG", ascending=False)
)
results_view

Unnamed: 0_level_0,Precision,Recall,NDCG
Chunk Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0.596542,0.943461,0.951085
2048.0,0.617665,0.959272,0.94143
1024.0,0.636946,0.951664,0.937825
512.0,0.668596,0.929358,0.930594


In [149]:
# Mean metrics per chunk overlap, best NDCG first. Rows without a chunk
# overlap are dropped by the default groupby behaviour.
results = pd.read_csv(f"{data_dir}results/{eval_name}.csv")
columns_to_ignore = [eval_name,"Question Type", "Chunk Size", "MAP"]
results_view = (
    results
    .drop(columns=columns_to_ignore)
    .groupby("Chunk Overlap")
    .mean()
    .sort_values(by="NDCG", ascending=False)
)
results_view

Unnamed: 0_level_0,Precision,Recall,NDCG
Chunk Overlap,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200.0,0.673331,0.93971,0.9392
0.0,0.602622,0.955993,0.934281


## Evaluate Generation

In [None]:

nest_asyncio.apply()

answer_correctness_system_prompt = """You are a CORRECTNESS grader; providing the correctness of the given GENERATED ANSWER compared to the given GROUND TRUTH ANSWER.
Respond only as a number from 0 to 10 where 0 is the least correct and 10 is the most correct.

A few additional scoring guidelines:

- Long GENERATED ANSWERS should score equally well as short GENERATED ANSWERS.

- CORRECTNESS score should increase as the GENERATED ANSWER matches more accurately with the GROUND TRUTH ANSWER.

- CORRECTNESS score should increase as the GENERATED ANSWER covers more parts of the GROUND TRUTH ANSWER accurately.

- GENERATED ANSWERS that partially match the GROUND TRUTH ANSWER should score 2, 3, or 4. Higher scores indicate more correctness.

- GENERATED ANSWERS that mostly match the GROUND TRUTH ANSWER should get a score of 5, 6, 7, or 8. Higher scores indicate more correctness.

- GENERATED ANSWERS that fully match the GROUND TRUTH ANSWER should get a score of 9 or 10. Higher scores indicate more correctness.

- GENERATED ANSWERS must be fully accurate and comprehensive to the GROUND TRUTH ANSWER to get a score of 10.

- Never elaborate."""

answer_correctness_user_prompt = PromptTemplate.from_template(
    """GROUND TRUTH ANSWER: {ground_truth_answer}

GENERATED ANSWER: {generated_answer}

CORRECTNESS: """
)

prompt = hub.pull("rlm/rag-prompt")
generator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
def format_docs(docs):
    """Join the page contents of ``docs`` into one string, separated by blank lines."""
    page_contents = [doc.page_content for doc in docs]
    return "\n\n".join(page_contents)
        
# For every experiment: answer each golden question with a RAG chain built on
# that experiment's vector store, let the critic LLM grade the answer against
# the ground truth answer, and report the mean grade per question type.
for experiment_name, questions in gold_dataset.items():
    # The golden dataset may contain experiments for which no vector store was
    # loaded. The retrieval evaluation skips those as well; without this guard
    # the lookup below raises a KeyError.
    if experiment_name not in vector_stores:
        print("Skipping", experiment_name, "(no vector store loaded)")
        continue

    print("Evaluating", experiment_name)
    vector_store = vector_stores[experiment_name]
    vector_store.embeddings.show_progress_bar = False
    retriever = vector_store.as_retriever(search_kwargs={"k": 10})
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | generator_llm
        | StrOutputParser()
    )
    for question_type, testsets in questions.items():
        # A question type without questions has no mean; dividing by its
        # length would raise a ZeroDivisionError.
        if not testsets:
            print(f"Experiment: {experiment_name} Question Type: {question_type} has no questions, skipping")
            continue

        answer_correctness_scores = []
        for testset in testsets:
            generated_answer = rag_chain.invoke(testset["question"])
            answer_correctness_prompt = answer_correctness_user_prompt.format(
                ground_truth_answer=testset["ground_truth_answer"], generated_answer=generated_answer
            )

            llm_messages = [
                SystemMessage(content=answer_correctness_system_prompt),
                HumanMessage(content=answer_correctness_prompt),
            ]
            # Kept separate from generated_answer: one is the RAG output, the
            # other is the critic's grading of it.
            judge_response = make_request_with_backoff(llm_messages)

            answer_correctness = re_0_10_rating(judge_response.content)
            answer_correctness_scores.append(answer_correctness)

        mean_answer_correctness = sum(answer_correctness_scores) / len(answer_correctness_scores)
        print(f"Experiment: {experiment_name} Question Type: {question_type} Mean Answer Correctness: {mean_answer_correctness}")