# My Evaluation Approach


![](assets/my_approach.png)


## Setup


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import time
import os
from typing import List, Dict, TypedDict
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import nest_asyncio
from dotenv import load_dotenv

import openai
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import VectorStore
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain.schema import StrOutputParser

In [3]:
# Load environment variables from a local .env file.
# NOTE(review): presumably this is where the OpenAI credentials used by the
# ChatOpenAI / OpenAIEmbeddings calls further down come from — confirm.
loaded = load_dotenv()

# Root folder of the benchmark. The trailing slash is required: paths below are
# built by plain string concatenation (e.g. data_dir+"documents").
data_dir = "my_benchmark/"
# Publish the benchmark folder through the environment.
# NOTE(review): presumably read by the utils package / chunking notebook — confirm.
os.environ['CHUNKING_BENCHMARK'] = data_dir

# 1. Load and Save Documents


Each file is loaded as a single LangChain `Document`, which may be too large to fit into an LLM's context window. Therefore, we need to split these documents into smaller pieces of text for further processing.

In [23]:
from utils.loader import save_documents

# Load every file in <data_dir>documents as a LangChain document and persist the
# result with save_documents.
#
# The folder path is deliberately built as data_dir + "documents": the golden
# dataset cell later filters on document.metadata["source"] using strings of
# exactly this shape (presumably TextLoader stores the path it was given as
# the "source" — confirm).
documents_dir = data_dir + "documents"

documents: List[Document] = []
# os.listdir returns entries in arbitrary order; sorting keeps the document
# order (and everything derived from it) stable across runs and machines.
for file in sorted(os.listdir(documents_dir)):
    file_path = os.path.join(documents_dir, file)
    if not os.path.isfile(file_path):
        # Sub-directories (e.g. .ipynb_checkpoints) cannot be read as text files.
        continue
    loader = TextLoader(file_path)
    documents.extend(loader.load())

save_documents(documents, data_dir)

In [4]:
from utils.loader import load_documents
# Read the documents back from data_dir.
# NOTE(review): presumably these are the ones written by
# save_documents(documents, data_dir), so the loading cell above does not have
# to be re-run in every session — confirm.
documents = load_documents(data_dir)

# 2. Apply chunking


In [None]:
%run -i chunking_strategies.ipynb

In [4]:
from utils.loader import load_chunks

# Maps each chunking-experiment name to the chunks produced by that strategy.
# The values are collections of Documents (they are iterated chunk by chunk and
# passed to FAISS.from_documents below), not single Documents.
split_chunks: Dict[str, List[Document]] = load_chunks(data_dir)

# 3. Ingest into vector store

Using FAISS


In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

# One FAISS index per chunking experiment, keyed by experiment name.
vector_stores: Dict[str, VectorStore] = {}

# Embedding model shared by all indexes.
embeddings = HuggingFaceEmbeddings(
    model_name="Snowflake/snowflake-arctic-embed-l",
    model_kwargs={"device": 0, 'trust_remote_code': True},  # Comment out to use CPU
)
# Alternative: embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Indexes are stored per embedding model so that switching models never reuses
# vectors computed by another one ("/" in the model name is not path-safe).
vector_store_dir = f"{data_dir}vector_stores/{embeddings.model_name.replace('/', '-')}"
Path(vector_store_dir).mkdir(parents=True, exist_ok=True)

for experiment_name, chunks in split_chunks.items():
    index_path = f"{vector_store_dir}/{experiment_name}"
    if not os.path.exists(index_path):
        # First run for this experiment: embed the chunks and persist the index.
        print("Indexing", experiment_name)
        vector_stores[experiment_name] = FAISS.from_documents(chunks, embeddings)
        vector_stores[experiment_name].save_local(index_path)
    else:
        # Re-use the index saved by an earlier run.
        # NOTE(review): allow_dangerous_deserialization=True unpickles the stored
        # index; only acceptable because these files are written by this notebook.
        print("Loading", experiment_name)
        vector_stores[experiment_name] = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/84.0k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Indexing markdown-header-recursive-512-0
Indexing fixed-size-2048-0
Indexing markdown-header-recursive-1024-0
Indexing markdown-header-recursive-2048-200
Indexing recursive-1024-200
Indexing fixed-size-512-0
Indexing fixed-size-1024-0
Indexing markdown-header-recursive-1024-200
Indexing markdown-header-recursive-2048-0
Indexing fixed-size-1024-200
Indexing markdown-header
Indexing recursive-2048-0
Indexing fixed-size-2048-200
Indexing recursive-512-200
Indexing recursive-2048-200
Indexing recursive-1024-0
Indexing fixed-size-512-200
Indexing semantic-chunks-95-recursive-2048-200
Indexing semantic-chunks-90
Indexing semantic-chunks-95
Indexing recursive-512-0
Indexing markdown-header-recursive-512-200


# 5. Evaluation


## Create Golden Datasets

Three golden evaluation datasets (one per question type) are created for each chunking strategy. Each should include the following:

- Questions across Documents
- Ground Truth Chunks (with graded Relevance)
- Ground Truth Answers

For Simple, Reasoning and Multi-Context Questions


In [7]:
from utils.evaluation import GoldenTestset

class Questions(TypedDict):
    """Golden test cases of one chunking experiment, grouped by question type."""
    # Keys match the RAGAS evolution types used when generating the questions.
    simple: List[GoldenTestset]
    reasoning: List[GoldenTestset]
    multi_context: List[GoldenTestset]

# Maps chunking-experiment name -> its golden questions; filled by the cells below.
gold_dataset: Dict[str, Questions]  = {}


The golden dataset is created from a subset of the documents only, so that the remaining documents are irrelevant to every question and act as noise during retrieval.

In [8]:
# Source paths of the documents that questions are generated from; they are
# compared against document.metadata["source"] when selecting the subset.
documents_subset_sources = [
    f"{data_dir}documents/{stem}.md"
    for stem in ("sleep", "teeth", "time_management", "mentoring")
]

### Question Generation with RAGAS


Generate synthetic Questions across Documents to challenge chunking strategies on multi-context queries


In [9]:
from os import environ

# Set the RAGAS_DO_NOT_TRACK flag before the ragas imports in the next cell.
# NOTE(review): presumably this opts out of Ragas usage analytics and has to be
# set before ragas is imported — confirm against the Ragas documentation.
environ["RAGAS_DO_NOT_TRACK"] = "true"

In [17]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# NOTE(review): presumably needed so Ragas can run its asyncio code inside the
# notebook's already running event loop — confirm.
nest_asyncio.apply()

generator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
critic_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# NOTE(review): this rebinds the notebook-level `embeddings` (previously the
# HuggingFace model used for the vector stores). Cells executed after this one
# see the OpenAI embeddings under that name.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Only the selected documents are used for question generation.
subset_documents = [
    document
    for document in documents
    if document.metadata["source"] in documents_subset_sources
]
# Share of each question type among the generated questions.
question_mix = {simple: 0.4, reasoning: 0.4, multi_context: 0.2}

ragas_testset = generator.generate_with_langchain_docs(
    subset_documents,
    test_size=10,
    distributions=question_mix,
)

# The generated "contexts" are dropped: ground truth contexts/chunks are
# determined per chunking strategy in the next step.
df = ragas_testset.to_pandas().drop(columns=["contexts"])
df.to_csv(data_dir+"ragas_testset.csv", index=False)

embedding nodes:   0%|          | 0/34 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/10 [00:00<?, ?it/s]

In [23]:
# Build the golden-dataset skeleton: every chunking experiment gets its own copy
# of each generated question, because the ground truth chunks (filled in later)
# differ per chunking strategy.
ragas_testset = pd.read_csv(data_dir+"ragas_testset.csv")
question_types = ("simple", "reasoning", "multi_context")

for experiment_name in split_chunks:
    per_type = {question_type: [] for question_type in question_types}
    gold_dataset[experiment_name] = per_type
    for _, row in ragas_testset.iterrows():
        # "source" holds the CSV's metadata column as read back from disk.
        per_type[row["evolution_type"]].append(
            {
                "question": row['question'],
                "source": row['metadata'],
                "ground_truth_chunks": {},
                "ground_truth_answer": row['ground_truth'],
            }
        )

### Generate Relevancy Score for each chunk


The relevancy prompt is taken from TruLens. The difference is that I apply it to all chunks, whereas TruLens only computes it on the retrieved chunks.


In [25]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils.llm_output_parser import re_0_10_rating

# System message for the LLM judge: instructs it to answer with a single 0-10
# relevance rating of a CONTEXT for a QUESTION. The text is sent to the model
# verbatim, so any edit changes the grading behaviour.
system_prompt = """You are a RELEVANCE grader; providing the relevance of the given CONTEXT to the given QUESTION.
    Respond only as a number from 0 to 10 where 0 is the least relevant and 10 is the most relevant. 

    A few additional scoring guidelines:

    - Long CONTEXTS should score equally well as short CONTEXTS.

    - RELEVANCE score should increase as the CONTEXTS provides more RELEVANT context to the QUESTION.

    - RELEVANCE score should increase as the CONTEXTS provides RELEVANT context to more parts of the QUESTION.

    - CONTEXT that is RELEVANT to some of the QUESTION should score of 2, 3 or 4. Higher score indicates more RELEVANCE.

    - CONTEXT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.

    - CONTEXT that is RELEVANT to the entire QUESTION should get a score of 9 or 10. Higher score indicates more RELEVANCE.

    - CONTEXT must be relevant and helpful for answering the entire QUESTION to get a score of 10.

    - Never elaborate."""

# Human message template; {question} and {context} are filled in per
# (question, chunk) pair by process_chunk.
user_prompt = PromptTemplate.from_template(
    """QUESTION: {question}

    CONTEXT: {context}
    
    RELEVANCE: """
)

# Judge model used by make_request_with_backoff; temperature=0 is set explicitly.
critic_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


def make_request_with_backoff(messages, retries=8):
    """Invoke ``critic_llm`` with ``messages``, retrying on OpenAI API errors.

    Rate-limit errors are retried after an exponentially growing pause
    (1, 2, 4, ... seconds). Any other ``openai.APIError`` is printed and
    retried immediately.

    Args:
        messages: Chat messages passed unchanged to ``critic_llm.invoke``.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The response of the first successful ``critic_llm.invoke`` call.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        openai.RateLimitError: If the last attempt is still rate limited.
        openai.APIError: If the last attempt fails with another API error.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            return critic_llm.invoke(messages)
        except openai.RateLimitError:
            # Must be handled before openai.APIError, which is the more general clause.
            if is_last_attempt:
                raise
            wait_time = 2**attempt
            print(f"Rate limited, waiting {wait_time} seconds")
            time.sleep(wait_time)
        except openai.APIError as e:
            print(e)
            if is_last_attempt:
                # Surface the real error instead of falling out of the loop and
                # returning None, which only fails later on `response.content`.
                raise


def process_chunk(chunk, testset):
    """Grade how relevant ``chunk`` is to the question of ``testset``.

    Args:
        chunk: Object with ``metadata`` (keys ``"source"`` and ``"id"`` are
            read) and ``page_content``.
        testset: Mapping with the keys ``"source"`` and ``"question"``.

    Returns:
        ``(str(chunk id), relevancy)`` when the chunk belongs to the question's
        source and its parsed rating is not ``0.0``; ``(None, None)`` otherwise.
    """
    # NOTE(review): testset["source"] is the CSV "metadata" column, so this is
    # presumably a substring test on its text form — confirm.
    if chunk.metadata["source"] in testset["source"]:
        grading_request = user_prompt.format(
            question=testset["question"], context=chunk.page_content
        )
        reply = make_request_with_backoff(
            [
                SystemMessage(content=system_prompt),
                HumanMessage(content=grading_request),
            ]
        )
        # re_0_10_rating is a project helper that turns the model's text into a rating.
        score = re_0_10_rating(reply.content)
        if score != 0.0:
            return str(chunk.metadata["id"]), score

    # Chunk from an unrelated document, or graded as irrelevant.
    return None, None

# Grade every chunk of every experiment against every golden question and store
# the non-zero relevancy scores as that question's ground truth chunks.
for experiment_name, questions in gold_dataset.items():
    print("Collecting ground truth for", experiment_name)
    for question_type, testsets in questions.items():
        print("Collecting ground truth for", question_type)
        for testset in tqdm(testsets):
            # Two workers keep some parallelism while limiting concurrent API calls.
            with ThreadPoolExecutor(max_workers=2) as executor:
                pending = [
                    executor.submit(process_chunk, chunk, testset)
                    for chunk in split_chunks[experiment_name]
                ]
                graded = [future.result() for future in as_completed(pending)]

            # process_chunk answers (None, None) for chunks that do not count.
            ground_truth = {
                chunk_id: relevancy
                for chunk_id, relevancy in graded
                if chunk_id and relevancy
            }

            # Keep the empty default when no chunk was graded as relevant.
            if ground_truth:
                testset["ground_truth_chunks"] = ground_truth

Collecting ground truth for markdown-header-recursive-512-0
Collecting ground truth for simple


100%|██████████| 4/4 [00:43<00:00, 10.97s/it]


Collecting ground truth for reasoning


100%|██████████| 4/4 [00:59<00:00, 14.86s/it]


Collecting ground truth for multi_context


100%|██████████| 2/2 [00:17<00:00,  8.95s/it]


Collecting ground truth for fixed-size-2048-0
Collecting ground truth for simple


100%|██████████| 4/4 [00:09<00:00,  2.35s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:04<00:12,  4.17s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:14<00:15,  7.58s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 75%|███████▌  | 3/4 [00:23<00:08,  8.59s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:30<00:00,  7.58s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:06<00:06,  6.32s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 2/2 [00:12<00:00,  6.10s/it]


Collecting ground truth for markdown-header-recursive-1024-0
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 25%|██▌       | 1/4 [00:09<00:28,  9.50s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 50%|█████     | 2/4 [00:19<00:19,  9.60s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:26<00:08,  8.67s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:37<00:00,  9.27s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 25%|██▌       | 1/4 [00:06<00:19,  6.48s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:23<00:25, 12.54s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 75%|███████▌  | 3/4 [00:40<00:14, 14.50s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 4/4 [00:49<00:00, 12.37s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:09<00:09,  9.29s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:16<00:00,  8.32s/it]


Collecting ground truth for markdown-header-recursive-2048-200
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:08<00:25,  8.35s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:16<00:16,  8.19s/it]

Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:21<00:06,  6.80s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 4 seconds


100%|██████████| 4/4 [00:33<00:00,  8.32s/it]


Collecting ground truth for reasoning


 25%|██▌       | 1/4 [00:03<00:09,  3.24s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:13<00:14,  7.23s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 4 seconds


 75%|███████▌  | 3/4 [00:25<00:09,  9.38s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 4/4 [00:34<00:00,  8.53s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 50%|█████     | 1/2 [00:07<00:07,  7.80s/it]

Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:12<00:00,  6.05s/it]


Collecting ground truth for recursive-1024-200
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:11<00:34, 11.41s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:21<00:21, 10.65s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:30<00:09,  9.72s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 4/4 [00:40<00:00, 10.13s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:25<00:26, 13.24s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:41<00:14, 14.84s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:52<00:00, 13.06s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:10<00:10, 10.72s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:19<00:00,  9.86s/it]


Collecting ground truth for fixed-size-512-0
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:13<00:40, 13.46s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:52<00:00, 13.11s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:10<00:32, 10.92s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:32<00:34, 17.24s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:54<00:19, 19.22s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [01:07<00:00, 16.98s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:24<00:00, 12.50s/it]


Collecting ground truth for fixed-size-1024-0
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:09<00:29,  9.70s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 50%|█████     | 2/4 [00:19<00:18,  9.49s/it]

Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:25<00:07,  7.93s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:34<00:00,  8.59s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:07<00:23,  7.93s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:21<00:22, 11.33s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:34<00:12, 12.13s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:43<00:00, 10.95s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:08<00:08,  8.85s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 4 seconds


100%|██████████| 2/2 [00:20<00:00, 10.01s/it]


Collecting ground truth for markdown-header-recursive-1024-200
Collecting ground truth for simple


 25%|██▌       | 1/4 [00:07<00:22,  7.47s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 50%|█████     | 2/4 [00:18<00:19,  9.80s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 75%|███████▌  | 3/4 [00:27<00:09,  9.34s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 4 seconds


100%|██████████| 4/4 [00:40<00:00, 10.04s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 25%|██▌       | 1/4 [00:09<00:27,  9.23s/it]

Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:24<00:26, 13.04s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:42<00:15, 15.01s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:54<00:00, 13.50s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:13<00:13, 13.41s/it]

Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:19<00:00,  9.94s/it]


Collecting ground truth for markdown-header-recursive-2048-0
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:08<00:24,  8.26s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:14<00:14,  7.06s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 75%|███████▌  | 3/4 [00:21<00:07,  7.17s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 4/4 [00:29<00:00,  7.29s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:04<00:13,  4.66s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:15<00:16,  8.47s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 4 seconds


100%|██████████| 4/4 [00:32<00:00,  8.20s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:07<00:07,  7.73s/it]

Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:13<00:00,  6.55s/it]


Collecting ground truth for fixed-size-1024-200
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 4 seconds


 25%|██▌       | 1/4 [00:11<00:33, 11.31s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:22<00:22, 11.05s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 75%|███████▌  | 3/4 [00:30<00:09,  9.66s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 4/4 [00:41<00:00, 10.26s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 25%|██▌       | 1/4 [00:10<00:31, 10.59s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:25<00:26, 13.14s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:42<00:14, 15.00s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 4 seconds


100%|██████████| 4/4 [00:55<00:00, 13.96s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:08<00:08,  8.38s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 2/2 [00:18<00:00,  9.06s/it]


Collecting ground truth for markdown-header
Collecting ground truth for simple


100%|██████████| 4/4 [00:19<00:00,  4.97s/it]


Collecting ground truth for reasoning


100%|██████████| 4/4 [00:23<00:00,  5.88s/it]


Collecting ground truth for multi_context


100%|██████████| 2/2 [00:09<00:00,  4.68s/it]


Collecting ground truth for recursive-2048-0
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 25%|██▌       | 1/4 [00:06<00:20,  6.80s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 50%|█████     | 2/4 [00:13<00:13,  6.63s/it]

Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:18<00:05,  5.96s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 4/4 [00:25<00:00,  6.35s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 25%|██▌       | 1/4 [00:06<00:20,  6.92s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 50%|█████     | 2/4 [00:15<00:15,  7.89s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 75%|███████▌  | 3/4 [00:25<00:09,  9.07s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:32<00:00,  8.14s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:06<00:06,  6.60s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:12<00:00,  6.08s/it]


Collecting ground truth for fixed-size-2048-200
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 25%|██▌       | 1/4 [00:08<00:26,  8.75s/it]

Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:14<00:13,  6.87s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 75%|███████▌  | 3/4 [00:20<00:06,  6.78s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 4/4 [00:28<00:00,  7.19s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:03<00:11,  3.83s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 4 seconds


 50%|█████     | 2/4 [00:18<00:20, 10.29s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:26<00:09,  9.02s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:33<00:00,  8.48s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:06<00:06,  6.38s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:11<00:00,  5.98s/it]


Collecting ground truth for recursive-512-200
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:19<00:57, 19.03s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:37<00:37, 18.97s/it]

Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:53<00:17, 17.54s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [01:12<00:00, 18.21s/it]


Collecting ground truth for reasoning


 25%|██▌       | 1/4 [00:15<00:46, 15.53s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:47<00:50, 25.47s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [01:20<00:28, 28.83s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [01:39<00:00, 24.91s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:18<00:18, 18.94s/it]

Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:34<00:00, 17.48s/it]


Collecting ground truth for recursive-2048-200
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 25%|██▌       | 1/4 [00:06<00:20,  6.87s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 50%|█████     | 2/4 [00:14<00:15,  7.59s/it]

Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:18<00:05,  5.83s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 4/4 [00:25<00:00,  6.40s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:05<00:16,  5.47s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 4 seconds


 50%|█████     | 2/4 [00:17<00:18,  9.23s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 75%|███████▌  | 3/4 [00:26<00:09,  9.11s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 4/4 [00:33<00:00,  8.28s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 50%|█████     | 1/2 [00:06<00:06,  6.86s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:12<00:00,  6.36s/it]


Collecting ground truth for recursive-1024-0
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:09<00:27,  9.30s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:20<00:20, 10.27s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:27<00:08,  8.86s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:37<00:00,  9.38s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:08<00:25,  8.52s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:40<00:14, 14.17s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:50<00:00, 12.62s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:10<00:10, 10.15s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:18<00:00,  9.04s/it]


Collecting ground truth for fixed-size-512-200
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:22<01:07, 22.46s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [01:02<00:20, 20.29s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [01:25<00:00, 21.32s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:17<00:51, 17.25s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:52<00:55, 27.99s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [01:27<00:31, 31.26s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [01:50<00:00, 27.52s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:22<00:22, 22.87s/it]

Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:39<00:00, 19.97s/it]


Collecting ground truth for semantic-chunks-95-recursive-2048-200
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 25%|██▌       | 1/4 [00:08<00:24,  8.32s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:17<00:17,  8.63s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 75%|███████▌  | 3/4 [00:24<00:08,  8.03s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


100%|██████████| 4/4 [00:33<00:00,  8.27s/it]


Collecting ground truth for reasoning


 25%|██▌       | 1/4 [00:04<00:13,  4.55s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 4 seconds


 50%|█████     | 2/4 [00:20<00:22, 11.49s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:30<00:10, 10.60s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:38<00:00,  9.71s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds


 50%|█████     | 1/2 [00:09<00:09,  9.01s/it]

Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:14<00:00,  7.28s/it]


Collecting ground truth for semantic-chunks-90
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:08<00:25,  8.60s/it]

Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:17<00:17,  8.53s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:24<00:07,  7.95s/it]

Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:32<00:00,  8.02s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:07<00:21,  7.24s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:18<00:19,  9.76s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 2 seconds
Rate limited, waiting 4 seconds


100%|██████████| 4/4 [00:39<00:00,  9.86s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:08<00:08,  8.46s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:15<00:00,  7.78s/it]


Collecting ground truth for semantic-chunks-95
Collecting ground truth for simple


 25%|██▌       | 1/4 [00:06<00:18,  6.03s/it]

Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:12<00:12,  6.21s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:23<00:00,  5.95s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:04<00:14,  4.99s/it]

Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:14<00:15,  7.82s/it]

Rate limited, waiting 1 seconds


100%|██████████| 4/4 [00:29<00:00,  7.49s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:06<00:06,  6.34s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:11<00:00,  6.00s/it]


Collecting ground truth for recursive-512-0
Collecting ground truth for simple


 25%|██▌       | 1/4 [00:17<00:51, 17.16s/it]

Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:35<00:35, 17.84s/it]

Rate limited, waiting 1 seconds


100%|██████████| 4/4 [01:08<00:00, 17.05s/it]


Collecting ground truth for reasoning


 25%|██▌       | 1/4 [00:14<00:43, 14.50s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [01:12<00:25, 25.99s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [01:30<00:00, 22.69s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:18<00:18, 18.40s/it]

Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:32<00:00, 16.33s/it]


Collecting ground truth for markdown-header-recursive-512-200
Collecting ground truth for simple


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:21<01:04, 21.58s/it]

Rate limited, waiting 1 seconds


 75%|███████▌  | 3/4 [00:59<00:19, 19.40s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [01:21<00:00, 20.44s/it]


Collecting ground truth for reasoning


  0%|          | 0/4 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 25%|██▌       | 1/4 [00:16<00:50, 16.83s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


 50%|█████     | 2/4 [00:49<00:52, 26.16s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 4/4 [01:44<00:00, 26.07s/it]


Collecting ground truth for multi_context


  0%|          | 0/2 [00:00<?, ?it/s]

Rate limited, waiting 1 seconds


 50%|█████     | 1/2 [00:21<00:21, 21.55s/it]

Rate limited, waiting 1 seconds
Rate limited, waiting 1 seconds


100%|██████████| 2/2 [00:38<00:00, 19.37s/it]
100%|██████████| 2/2 [00:38<00:00, 19.37s/it]


Save Evaluation Dataset


In [27]:
# Persist the collected ground-truth dataset as pretty-printed JSON
# (4-space indent) next to the benchmark data.
with open(data_dir + 'gold_dataset.json', mode='w') as gold_dataset_file:
    json.dump(gold_dataset, gold_dataset_file, indent=4)

## Evaluate Retrieval

Load Evaluation Dataset


In [8]:
# Bind the name up front so `gold_dataset` exists (as an empty dict) even when
# reading the file below fails.
gold_dataset = {}
with open(data_dir + 'gold_dataset.json') as gold_dataset_file:
    gold_dataset = json.load(gold_dataset_file)

In [13]:
from utils.evaluation import calculate_metrics, calculate_mean_metrics

# Retrieval evaluation.
# For every experiment (chunking strategy) in the gold dataset, each question is
# sent to that experiment's vector store and the retrieved chunk ids are scored
# against the ground-truth chunks. The per-question metrics are averaged per
# question type and written to `results.csv`.
RESULT_COLUMNS = ["experiment_name", "question_type", "precision", "recall", "ndcg"]

# Rows are collected as plain dicts and turned into a DataFrame once at the
# end, instead of growing an (untyped) empty frame one row at a time.
result_rows = []

for experiment_name, questions in gold_dataset.items():
    # The gold dataset may contain experiments whose vector store is not
    # available in this session; those are skipped.
    if experiment_name not in vector_stores:
        continue

    vector_store = vector_stores[experiment_name]
    for question_type, testsets in questions.items():
        metrics = []
        for testset in testsets:
            question = testset["question"]
            # Mapping of ground-truth chunk id -> relevancy.
            ground_truth = testset["ground_truth_chunks"]
            # Retrieve exactly as many chunks as there are ground-truth chunks,
            # so the cut-off differs per question.
            # NOTE(review): with this cut-off precision@K and recall@K presumably
            # share the same denominator, which would explain the identical
            # precision/recall columns in the saved results - confirm against
            # `calculate_metrics`.
            K = len(ground_truth)
            retriever = vector_store.as_retriever(search_kwargs={"k": K})
            retrieved_chunks = retriever.invoke(question)
            retrieved_chunk_ids = [str(doc.metadata["id"]) for doc in retrieved_chunks]
            metrics.append(
                calculate_metrics(
                    retrieved_chunk_ids,
                    ground_truth_chunks=list(ground_truth.keys()),
                    ground_truth_relevancies=list(ground_truth.values()),
                    K=K,
                )
            )

        mean_metrics = calculate_mean_metrics(metrics)

        result_rows.append(
            {
                "experiment_name": experiment_name,
                "question_type": question_type,
                "precision": mean_metrics["precision"],
                "recall": mean_metrics["recall"],
                "ndcg": mean_metrics["ndcg"],
            }
        )

# Passing `columns` keeps the column order (and the columns themselves) stable
# even when no experiment produced a row.
results = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)

results.to_csv(data_dir+"results.csv", index=False)

In [15]:
# Average the metrics over all question types per experiment and rank the
# experiments by nDCG, best first.
results_view = (
    results
    .drop(columns=["question_type"])
    .groupby("experiment_name")
    .mean()
    .sort_values(by="ndcg", ascending=False)
)
results_view

Unnamed: 0_level_0,precision,recall,ndcg
experiment_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fixed-size-2048-200,0.841915,0.841915,0.884327
fixed-size-2048-0,0.933036,0.933036,0.882625
recursive-2048-0,0.863806,0.863806,0.873765
markdown-header-recursive-2048-200,0.805336,0.805336,0.863841
fixed-size-1024-200,0.767514,0.767514,0.84155
markdown-header-recursive-2048-0,0.724237,0.724237,0.838231
recursive-2048-200,0.828538,0.828538,0.835953
markdown-header,0.626389,0.626389,0.811778
semantic-chunks-95,0.744874,0.744874,0.810884
fixed-size-1024-0,0.762517,0.762517,0.797328


## Evaluate Generation

In [None]:

# Patch asyncio so nested event loops are allowed (a notebook kernel already
# runs its own loop).
nest_asyncio.apply()

# System prompt for the LLM grader: it must answer with a single number from
# 0 to 10 rating how well a generated answer matches the ground-truth answer.
# NOTE(review): the guidelines name score bands 2-4, 5-8 and 9-10 but give no
# guidance for 0-1 - confirm that this is intended.
answer_correctness_system_prompt = """You are a CORRECTNESS grader; providing the correctness of the given GENERATED ANSWER compared to the given GROUND TRUTH ANSWER.
Respond only as a number from 0 to 10 where 0 is the least correct and 10 is the most correct.

A few additional scoring guidelines:

- Long GENERATED ANSWERS should score equally well as short GENERATED ANSWERS.

- CORRECTNESS score should increase as the GENERATED ANSWER matches more accurately with the GROUND TRUTH ANSWER.

- CORRECTNESS score should increase as the GENERATED ANSWER covers more parts of the GROUND TRUTH ANSWER accurately.

- GENERATED ANSWERS that partially match the GROUND TRUTH ANSWER should score 2, 3, or 4. Higher scores indicate more correctness.

- GENERATED ANSWERS that mostly match the GROUND TRUTH ANSWER should get a score of 5, 6, 7, or 8. Higher scores indicate more correctness.

- GENERATED ANSWERS that fully match the GROUND TRUTH ANSWER should get a score of 9 or 10. Higher scores indicate more correctness.

- GENERATED ANSWERS must be fully accurate and comprehensive to the GROUND TRUTH ANSWER to get a score of 10.

- Never elaborate."""

# User-message template for the grader; it is filled with `ground_truth_answer`
# and `generated_answer` and ends with "CORRECTNESS: " so that the reply is the
# score itself.
answer_correctness_user_prompt = PromptTemplate.from_template(
    """GROUND TRUTH ANSWER: {ground_truth_answer}

GENERATED ANSWER: {generated_answer}

CORRECTNESS: """
)

# Prompt used for answer generation in the RAG chain, pulled from the LangChain
# hub (presumably requires network access - confirm).
prompt = hub.pull("rlm/rag-prompt")
# LLM that generates the answers in the RAG chain; temperature 0 to keep the
# output as repeatable as possible.
generator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
def format_docs(docs):
    """Concatenate the ``page_content`` of every document in ``docs``.

    The individual contents are separated by one blank line; an empty
    iterable yields an empty string.
    """
    separator = "\n\n"
    page_contents = [document.page_content for document in docs]
    return separator.join(page_contents)
        
# Generation evaluation.
# For every experiment in the gold dataset a RAG chain (retriever -> prompt ->
# generator LLM -> string) answers each gold question; a grader LLM then rates
# the answer against the ground-truth answer and the mean rating per question
# type is printed.
# `make_request_with_backoff` and `re_0_10_rating` are expected to be defined
# by earlier cells of this notebook.
for experiment_name, questions in gold_dataset.items():
    # Same guard as in the retrieval evaluation: skip experiments without a
    # vector store in this session instead of failing with a KeyError.
    if experiment_name not in vector_stores:
        continue

    print("Evaluating", experiment_name)
    vector_store = vector_stores[experiment_name]
    # Keep the embedding model from printing a progress bar for every query.
    vector_store.embeddings.show_progress_bar = False
    retriever = vector_store.as_retriever(search_kwargs={"k": 10})
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | generator_llm
        | StrOutputParser()
    )
    for question_type, testsets in questions.items():
        # Without test sets there is nothing to average (and dividing by
        # len(testsets) below would raise ZeroDivisionError).
        if not testsets:
            print(f"Experiment: {experiment_name} Question Type: {question_type} has no test sets, skipping")
            continue

        correctness_sum = 0
        for testset in testsets:
            # Answer produced by the RAG chain for this question.
            generated_answer = rag_chain.invoke(testset["question"])
            answer_correctness_prompt = answer_correctness_user_prompt.format(
                ground_truth_answer=testset["ground_truth_answer"],
                generated_answer=generated_answer,
            )

            llm_messages = [
                SystemMessage(content=answer_correctness_system_prompt),
                HumanMessage(content=answer_correctness_prompt),
            ]
            # Reply of the grader LLM, kept separate from the generated answer.
            grader_response = make_request_with_backoff(llm_messages)

            # Presumably extracts the 0-10 rating from the grader's text reply
            # - verify against the definition of `re_0_10_rating`.
            answer_correctness = re_0_10_rating(grader_response.content)
            correctness_sum += answer_correctness
        mean_answer_correctness = correctness_sum / len(testsets)
        print(f"Experiment: {experiment_name} Question Type: {question_type} Mean Answer Correctness: {mean_answer_correctness}")