# My Evaluation Approach


![](assets/my_approach.png)


1. **Document Preparation**: Load the entire document and extract content.
2. **Question Generation**: Use an LLM to generate broad and detailed questions about the document.
3. **Chunking Application**: Apply various chunking methods to the document. The next steps will be executed for each chunking strategy separately.
4. **Chunk-Question Relevancy Scoring**: For each generated question, instruct an LLM to grade all chunks by relevancy to the question. The chunks that receive a non-zero relevancy score become the ground truth for that question and together form our silver evaluation dataset.
5. **Human Annotation**: Human annotators then review and modify the silver dataset to produce the gold dataset.
6. **Chunk Retrieval**: Embed the chunks and retrieve the most similar chunks to each question.
7. **Evaluate Retrieval**: Compare the retrieved chunks with the synthesized ground truth chunks and calculate Precision, Recall and nDCG to analyse the retrieval performance.

## Setup


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import time
import os
from typing import List, Dict, TypedDict
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import nest_asyncio
from dotenv import load_dotenv

import openai
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import VectorStore
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain.schema import StrOutputParser

In [3]:
# Load variables from a .env file; override=True lets values from .env replace
# variables that are already set in the process environment.
loaded = load_dotenv(override=True)

# Root directory of the benchmark data. The trailing slash matters: several
# cells build paths by plain string concatenation (e.g. data_dir + "documents").
data_dir = "my_benchmark/"
# Also expose the directory through the environment.
# NOTE(review): presumably read by the utils.* helpers — confirm.
os.environ['CHUNKING_BENCHMARK'] = data_dir

# 1. Load and Save Documents


Each file is loaded as a single LangChain document, which may be too large to fit into an LLM's context window. Therefore, we need to split these documents into smaller pieces of text for further processing.

In [23]:
from utils.loader import save_documents

# Load every file in <data_dir>/documents as one LangChain document.
# The listing is sorted because os.listdir returns entries in arbitrary order
# and later cells select documents by position (documents[:4]); without the
# sort, the documents used for question generation could differ between runs.
documents_dir = data_dir + "documents"
documents: List[Document] = []
for file_name in sorted(os.listdir(documents_dir)):
    file_path = os.path.join(documents_dir, file_name)
    if not os.path.isfile(file_path):
        # Sub-directories cannot be read by TextLoader; skip them.
        continue
    loader = TextLoader(file_path)
    documents.extend(loader.load())

# Persist the loaded documents so later sections can reload them.
save_documents(documents, data_dir)

# 2. Question Generation


Generate synthetic questions for each document to challenge the chunking strategies with multi-context queries.


In [7]:
from utils.loader import load_documents
# Reload the documents from data_dir so this section can run without
# re-executing the loading cell above.
# NOTE(review): presumably returns what save_documents wrote — confirm.
documents = load_documents(data_dir)

In [49]:
# Question generation based on documents
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Schema of one generated question. It is handed (via Questions) to the
# JsonOutputParser below, which derives the format instructions from it.
class Question(BaseModel):
    question: str = Field(description="The question generated by the model")
    # Free-form label; the prompt asks for conceptual, analytical, application
    # and synthesis questions. The key name "type" is part of the JSON schema.
    type: str = Field(description="The type of question generated")

# Top-level schema of the LLM response: an object wrapping the question list.
class Questions(BaseModel):
    questions: List[Question] = Field(description="The list of questions generated by the model")

# Parses the LLM output as JSON following the Questions schema.
parser = JsonOutputParser(pydantic_object=Questions)

# Prompt that asks for 4 challenging questions about one document.
# {document} is filled per call; {format_instructions} is pre-filled once with
# the parser's instructions so the model answers in the expected JSON shape.
question_generation_prompt = PromptTemplate(
    input_variables=["document"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template="""
You are a highly knowledgeable assistant tasked with generating challenging questions to evaluate different document chunking strategies in retrieval augmented generation (RAG) pipelines.
Your goal is to create questions that require detailed, specific, and nuanced understanding of the document content.
These questions should test the ability of different chunking strategies to retrieve and generate accurate and comprehensive responses.

Here is the document:
{document}

Based on the document provided, generate a set of questions that meet the following criteria:

1. **Complexity:** Questions should be complex and require in-depth understanding of the document, involving multiple facts or concepts interlinked within the document.
2. **Specificity:** Questions should be specific and precise, targeting particular sections or details within the document.
3. **Inference:** Questions should require inferential reasoning, where the answer is not directly stated but can be deduced from the document content.
4. **Variability:** Include a mix of question types, such as:
   - Conceptual questions (e.g., understanding the main ideas or arguments presented)
   - Analytical questions (e.g., comparing or contrasting information within the document)
   - Application questions (e.g., applying the information or concepts to a hypothetical scenario)
   - Synthesis questions (e.g., combining multiple pieces of information to form a comprehensive answer)

Ensure that the questions are designed to challenge the document retrieval and generation capabilities of different chunking strategies.

Please generate 4 such challenging questions based on the provided document as follows:
{format_instructions}
"""
)

# Model that writes the questions; temperature=0 keeps sampling as repeatable
# as the API allows.
generator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# prompt -> LLM -> JSON parser
chain = question_generation_prompt | generator_llm | parser

# Collect the questions of the first four documents, tagging each question
# with the source of the document it was generated from.
questions = []
for document in documents[:4]:
    generated = chain.invoke({"document": document.page_content})
    for question in generated["questions"]:
        question["source"] = document.metadata["source"]
        questions.append(question)

In [50]:
# Persist the generated questions as JSON next to the other benchmark data.
questions_path = f"{data_dir}synthetic_questions.json"
with open(questions_path, "w") as f:
    f.write(json.dumps(questions))

# 3. Apply chunking


In [None]:
%run -i chunking_strategies.ipynb

In [4]:
from utils.loader import load_chunks
# Maps each experiment (chunking strategy) name to the chunks it produced.
# The cells below call len() on and iterate over each value, so the values are
# collections of documents rather than single documents.
split_chunks: Dict[str, List[Document]] = load_chunks(data_dir)

In [21]:
# Per-experiment overview: number of chunks and mean chunk length in characters.
df = pd.DataFrame(columns=["Experiment", "Chunk Count", "Average Chunk Size"])
for experiment_name, chunks in split_chunks.items():
    chunk_count = len(chunks)
    total_characters = sum(len(chunk.page_content) for chunk in chunks)
    df.loc[len(df)] = [
        experiment_name,
        chunk_count,
        round(total_characters / chunk_count),
    ]

# Fewest chunks first; the index column is hidden in the rendered table.
df.sort_values(by="Chunk Count", ascending=True).style.hide(axis="index")

Experiment,Chunk Count,Average Chunk Size
fixed_size-2048-0,65,1915
semantic_chunks_95,67,1848
fixed_size-2048-200,72,1902
recursive-2048-0,72,1728
recursive-2048-200,73,1736
semantic_chunks_90,118,1049
fixed_size-1024-0,127,980
markdown_header,146,844
markdown_header_parent,146,873
fixed_size-1024-200,153,1000


# 4. (5.) Create Evaluation Datasets

### Generate Relevancy Score for each chunk


In [None]:
# Read back the synthetic questions written by the question generation section.
questions_path = Path(f"{data_dir}synthetic_questions.json")
questions = json.loads(questions_path.read_text())

Initialize datasets with questions

In [None]:
from utils.evaluation import Testset

# One independent list of testsets per chunking experiment. Every testset
# starts with an empty ground truth that is filled in by the grading cell.
datasets: Dict[str, List[Testset]] = {
    experiment_name: [
        {
            "question": question["question"],
            "source": question["source"],
            "type": question["type"],
            "ground_truth_chunks": {},
        }
        for question in questions
    ]
    for experiment_name in split_chunks
}

The relevancy prompt is taken from TruLens. The difference is that I apply it to all chunks, whereas TruLens only computes it on the retrieved chunks.


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils.llm_output_parser import re_0_10_rating

# Instructions for the grading model: answer with a single number from 0 to 10
# that rates how relevant a CONTEXT (one chunk) is to a QUESTION.
system_prompt = """You are a RELEVANCE grader; providing the relevance of the given CONTEXT to the given QUESTION.
    Respond only as a number from 0 to 10 where 0 is the least relevant and 10 is the most relevant. 

    A few additional scoring guidelines:

    - Long CONTEXTS should score equally well as short CONTEXTS.

    - RELEVANCE score should increase as the CONTEXTS provides more RELEVANT context to the QUESTION.

    - RELEVANCE score should increase as the CONTEXTS provides RELEVANT context to more parts of the QUESTION.

    - CONTEXT that is RELEVANT to some of the QUESTION should score of 2, 3 or 4. Higher score indicates more RELEVANCE.

    - CONTEXT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.

    - CONTEXT that is RELEVANT to the entire QUESTION should get a score of 9 or 10. Higher score indicates more RELEVANCE.

    - CONTEXT must be relevant and helpful for answering the entire QUESTION to get a score of 10.

    - Never elaborate."""

# Per-request message: filled by process_chunk with one question and the text
# of one chunk; ends with "RELEVANCE: " so the model completes it with the score.
user_prompt = PromptTemplate.from_template(
    """QUESTION: {question}

    CONTEXT: {context}
    
    RELEVANCE: """
)

# Model used as the grader; invoked through make_request_with_backoff.
critic_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


def make_request_with_backoff(messages, retries=8):
    """Invoke ``critic_llm`` with ``messages``, retrying on OpenAI API errors.

    Rate-limit errors are retried after an exponentially growing pause
    (1, 2, 4, ... seconds). Other API errors are printed and retried
    immediately. If the last attempt fails, its exception is re-raised
    instead of silently returning ``None`` (which made callers crash later
    on ``response.content``).

    Args:
        messages: Chat messages passed unchanged to ``critic_llm.invoke``.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The response object returned by ``critic_llm.invoke``.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        openai.RateLimitError: If the last attempt is still rate limited.
        openai.APIError: If the last attempt fails with another API error.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            return critic_llm.invoke(messages)
        # RateLimitError is handled first because it needs the backoff pause.
        except openai.RateLimitError:
            if is_last_attempt:
                raise
            wait_time = 2**attempt
            print(f"Rate limited, waiting {wait_time} seconds")
            time.sleep(wait_time)
        except openai.APIError as e:
            if is_last_attempt:
                raise
            print(e)


def process_chunk(chunk, testset):
    """Ask the grader LLM how relevant ``chunk`` is to the testset's question.

    Args:
        chunk: Document with ``page_content`` and ``metadata`` holding at least
            ``"source"`` and ``"id"``.
        testset: Mapping with the ``"question"`` and ``"source"`` keys.

    Returns:
        ``(chunk id as str, rating)`` when the chunk's source is contained in
        the testset's source and the rating is non-zero; ``(None, None)``
        otherwise.
    """
    # NOTE(review): `in` is a containment test — a substring match when the
    # testset source is a string. Confirm that equality is not intended.
    same_source = chunk.metadata["source"] in testset["source"]
    if not same_source:
        return None, None

    grading_request = [
        SystemMessage(content=system_prompt),
        HumanMessage(
            content=user_prompt.format(
                question=testset["question"], context=chunk.page_content
            )
        ),
    ]
    reply = make_request_with_backoff(grading_request)
    rating = re_0_10_rating(reply.content)

    # Chunks rated 0 are irrelevant and do not belong to the ground truth.
    if rating == 0.0:
        return None, None
    return str(chunk.metadata["id"]), rating

# Grade every chunk of every experiment against every question and store the
# non-zero ratings as that question's ground truth.
# The output directory is created up front: previously a missing directory
# only surfaced when writing the file, after all LLM calls had been made.
datasets_dir = os.path.join(data_dir, "datasets")
os.makedirs(datasets_dir, exist_ok=True)

# The loop variable is named `testsets` so it does not overwrite the
# notebook-level `questions` list loaded from synthetic_questions.json.
for experiment_name, testsets in datasets.items():
    dataset_path = os.path.join(datasets_dir, f"{experiment_name}.json")
    if os.path.exists(dataset_path):
        # Already graded in an earlier run; skip to avoid repeating LLM calls.
        continue

    print("Collecting ground truth for", experiment_name)
    # Only two workers, to stay gentle on the API rate limit.
    with ThreadPoolExecutor(max_workers=2) as executor:
        for testset in tqdm(testsets):
            futures = [
                executor.submit(process_chunk, chunk, testset)
                for chunk in split_chunks[experiment_name]
            ]
            # Results are collected in submission order so the saved dataset
            # has the same key order on every run.
            ground_truth = {}
            for future in futures:
                chunk_id, relevancy = future.result()
                if chunk_id and relevancy:
                    ground_truth[chunk_id] = relevancy

            if ground_truth:
                testset["ground_truth_chunks"] = ground_truth

    with open(dataset_path, "w") as f:
        json.dump(testsets, f)

Collecting ground truth for markdown_header_parent


100%|██████████| 20/20 [01:16<00:00,  3.82s/it]


### Analyse dataset

In [None]:
from utils.loader import load_datasets
datasets = load_datasets(data_dir)

# Per-experiment summary of the ground truth: how many chunks were graded as
# relevant per question and how high their relevancy is on average.
# Averages over an empty collection are reported as NaN instead of raising
# ZeroDivisionError (e.g. an experiment for which no chunk was rated above 0).
nan = float("nan")
df = pd.DataFrame(columns=[ "Experiment", "Chunk Count", "Average Chunk Size", "Average Ground Truth Count","Average Relevancy per Ground Truth"])
for experiment_name, questions in datasets.items():
    relevancies = [
        chunk_relevancy
        for question in questions
        for chunk_relevancy in question["ground_truth_chunks"].values()
    ]
    experiment_chunks = split_chunks[experiment_name]

    average_ground_truth_count = len(relevancies) / len(questions) if questions else nan
    average_relevancy_per_ground_truth = sum(relevancies) / len(relevancies) if relevancies else nan
    average_chunk_size = (
        round(sum(len(chunk.page_content) for chunk in experiment_chunks) / len(experiment_chunks))
        if experiment_chunks
        else nan
    )
    df.loc[len(df)] = [experiment_name, len(experiment_chunks), average_chunk_size, average_ground_truth_count, average_relevancy_per_ground_truth]

# Largest chunks first; the index column is hidden in the rendered table.
df.sort_values(by="Average Chunk Size", ascending=False).style.hide(axis="index")

Experiment,Chunk Count,Average Chunk Size,Average Ground Truth Count,Average Relevancy per Ground Truth
fixed_size-2048-0,65,1915,6.45,4.806202
fixed_size-2048-200,72,1902,6.95,4.784173
semantic_chunks_95,67,1848,6.15,5.0
recursive-2048-200,73,1736,7.15,4.888112
recursive-2048-0,72,1728,6.95,5.100719
semantic_chunks_90,118,1049,11.0,4.172727
fixed_size-1024-200,153,1000,15.4,3.967532
fixed_size-1024-0,127,980,12.6,3.952381
markdown_header_parent,146,873,14.35,3.662021
markdown_header,146,844,14.1,3.723404


Average total ground-truth size per question, in tokens (estimated at roughly 4 characters per token)

In [None]:
# Estimate how many tokens of ground-truth text a question has on average:
# per experiment, sum the characters of all ground-truth chunks over all
# questions, convert to tokens, then average over the experiments.
mean_ground_truth_token_count = 0
for experiment_name, questions in datasets.items():
    ground_truth_characters = sum(
        len(split_chunk.page_content)
        for question in questions
        for split_chunk in split_chunks[experiment_name]
        if str(split_chunk.metadata["id"]) in question["ground_truth_chunks"]
    )
    # Divide by 4 because one token is approximately 4 characters.
    mean_ground_truth_token_count += ground_truth_characters / (len(questions) * 4)

round(mean_ground_truth_token_count / len(datasets))

3343

# 6. - 7. Evaluation


## Ingest Chunks into Vector Store

Using FAISS


In [8]:
# from langchain_huggingface import HuggingFaceEmbeddings

# One FAISS index per chunking experiment, keyed by experiment name.
vector_stores: Dict[str, VectorStore] = {}

# Alternative: local HuggingFace embeddings instead of the OpenAI API.
# embeddings = HuggingFaceEmbeddings(
#     model_name="Snowflake/snowflake-arctic-embed-l",
#     # model_name="Alibaba-NLP/gte-large-en-v1.5",
#     model_kwargs={"device": 0, 'trust_remote_code': True},  # Comment out to use CPU
# )
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# The embedding classes expose the model under different attribute names;
# "/" is replaced so the name can be used as a directory name.
if hasattr(embeddings, 'model_name'):
    raw_model_name = embeddings.model_name
else:
    raw_model_name = embeddings.model
model_name = raw_model_name.replace("/", "_")

vector_store_dir = f"{data_dir}vector_stores/{model_name}"
Path(vector_store_dir).mkdir(parents=True, exist_ok=True)

for experiment_name, chunks in split_chunks.items():
    store_path = f"{vector_store_dir}/{experiment_name}"
    if not os.path.exists(store_path):
        # No cached index yet: embed the chunks and cache the index on disk.
        print("Indexing", experiment_name)
        vector_stores[experiment_name] = FAISS.from_documents(chunks, embeddings)
        vector_stores[experiment_name].save_local(store_path)
        continue

    print("Loading", experiment_name)
    # SECURITY NOTE: loading a saved FAISS store deserializes a pickle file,
    # hence the explicit opt-in flag. Only load index directories that were
    # created by this notebook.
    vector_stores[experiment_name] = FAISS.load_local(
        store_path, embeddings, allow_dangerous_deserialization=True
    )

Loading fixed_size-512-200
Loading fixed_size-1024-0
Loading semantic_chunks_90
Loading fixed_size-1024-200
Loading markdown_header
Loading fixed_size-2048-0
Loading fixed_size-512-0
Loading recursive-1024-200
Loading markdown_header_parent
Loading fixed_size-2048-200
Loading recursive-2048-0
Loading recursive-512-200
Loading recursive-2048-200
Loading recursive-1024-0
Loading semantic_chunks_95
Loading recursive-512-0


## Evaluate Retrieval

Load Evaluation Dataset


In [6]:
from utils.loader import load_datasets
# Reload the evaluation datasets from data_dir so the evaluation can run
# without re-executing the dataset creation section.
datasets = load_datasets(data_dir)

Select Evaluation Approach

In [45]:
class EvalApproach:
    """String constants naming the supported ways of choosing how many chunks to retrieve."""

    FIXED_K = "Fixed-K"  # always retrieve FIXED_K chunks
    GROUND_TRUTH_K = "Ground-Truth-K"  # retrieve as many chunks as the question has ground-truth chunks
    TOKEN_LIMIT = "Token-Limit"  # retrieve chunks until roughly TOKEN_LIMIT tokens are reached
    RATIO_K = "Ratio-K"  # retrieve a RATIO_K share of the experiment's chunks


# The selected approach is one of the string constants above, hence `str`.
SEL_APPROACH: str = EvalApproach.RATIO_K
FIXED_K = 20
TOKEN_LIMIT = 3340
RATIO_K = 0.05

model_name = "text-embedding-3-small"

# Name under which the results are stored: approach, its parameter (if any)
# and the embedding model.
eval_names = {
    EvalApproach.FIXED_K: f"{SEL_APPROACH}-{FIXED_K}-{model_name}",
    EvalApproach.GROUND_TRUTH_K: f"{SEL_APPROACH}-{model_name}",
    EvalApproach.TOKEN_LIMIT: f"{SEL_APPROACH}-{TOKEN_LIMIT}-{model_name}",
    EvalApproach.RATIO_K: f"{SEL_APPROACH}-{RATIO_K}-{model_name}",
}
if SEL_APPROACH not in eval_names:
    # Previously an unknown value silently produced an empty eval_name, so
    # the results would have been written to "results/.csv".
    raise ValueError(f"Unknown evaluation approach: {SEL_APPROACH!r}")
eval_name = eval_names[SEL_APPROACH]

# Reuse results of an earlier run with the same configuration, if present.
if os.path.exists(f"{data_dir}results/{eval_name}.csv"):
    results = pd.read_csv(f"{data_dir}results/{eval_name}.csv")

In [46]:
from utils.evaluation import calculate_metrics, calculate_mean_metrics

# For every experiment: retrieve chunks for each question, compare them with
# the question's ground truth and average the metrics over all questions.
results_list = []
for experiment_name, testsets in datasets.items():
    if experiment_name not in vector_stores:
        continue

    # Number of chunks to retrieve, depending on the selected approach.
    if SEL_APPROACH == EvalApproach.FIXED_K:
        K = FIXED_K
    elif SEL_APPROACH == EvalApproach.RATIO_K:
        # At least one chunk: for experiments with few chunks the ratio would
        # otherwise round down to 0 and nothing would be retrieved.
        K = max(1, round(len(split_chunks[experiment_name]) * RATIO_K))
    elif SEL_APPROACH == EvalApproach.TOKEN_LIMIT:
        K = 200  # large number to ensure TOKEN_LIMIT is always reached
    else:
        K = 0  # Ground-Truth-K: set per question below
    print("Evaluating", experiment_name, "with K =", K if K else "Ground Truth based")

    metrics = []
    for testset in tqdm(testsets):
        ground_truth = testset["ground_truth_chunks"]
        if not ground_truth:
            # Nothing to compare against for this question.
            continue
        if SEL_APPROACH == EvalApproach.GROUND_TRUTH_K:
            K = len(ground_truth)

        retriever = vector_stores[experiment_name].as_retriever(search_kwargs={"k": K})
        retrieved_chunks = retriever.invoke(testset["question"])

        if SEL_APPROACH == EvalApproach.TOKEN_LIMIT:
            # Keep the best-ranked chunks whose combined length stays within
            # the context budget (one token is approximately 4 characters).
            retrieved_chunks_capped = []
            total_context_length = 0
            for chunk in retrieved_chunks:
                total_context_length += len(chunk.page_content)
                if total_context_length > TOKEN_LIMIT * 4:
                    break
                retrieved_chunks_capped.append(chunk)

            retrieved_chunks = retrieved_chunks_capped

        retrieved_chunk_ids = [str(doc.metadata["id"]) for doc in retrieved_chunks]
        metrics.append(
            calculate_metrics(
                retrieved_chunk_ids,
                ground_truth_chunks=list(ground_truth.keys()),
                ground_truth_relevancies=list(ground_truth.values()),
            )
        )

    mean_metrics = (
        calculate_mean_metrics(metrics)
        if metrics
        else {
            "precision": 0.0,
            "recall": 0.0,
            "map": 0.0,
            "ndcg": 0.0,
        }
    )

    # Experiment names like "recursive-1024-200" encode chunk size and overlap;
    # names without that suffix (e.g. "markdown_header") have neither.
    name_parts = experiment_name.split("-")
    try:
        experiment_chunk_size = int(name_parts[-2])
        experiment_chunk_overlap = int(name_parts[-1])
    except (IndexError, ValueError):
        experiment_chunk_size = None
        experiment_chunk_overlap = None

    results_list.append(
        [
            name_parts[0],
            experiment_chunk_size,
            experiment_chunk_overlap,
            mean_metrics["precision"],
            mean_metrics["recall"],
            mean_metrics["map"],
            mean_metrics["ndcg"],
        ]
    )

results = pd.DataFrame(
    results_list,
    columns=[
        eval_name,
        "Chunk Size",
        "Chunk Overlap",
        "Precision",
        "Recall",
        "MAP",
        "NDCG",
    ],
)
# Create the results directory if needed so the evaluation is not lost on a
# missing folder.
results_dir = Path(f"{data_dir}results")
results_dir.mkdir(parents=True, exist_ok=True)
results.to_csv(results_dir / f"{eval_name}.csv", index=False)

Evaluating fixed_size-512-200 with K = 20


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:06<00:00,  3.09it/s]


Evaluating semantic_chunks_90 with K = 6


100%|██████████| 20/20 [00:05<00:00,  3.52it/s]


Evaluating recursive-2048-0 with K = 4


100%|██████████| 20/20 [00:05<00:00,  3.53it/s]


Evaluating recursive-1024-200 with K = 8


100%|██████████| 20/20 [00:07<00:00,  2.66it/s]


Evaluating semantic_chunks_95 with K = 3


100%|██████████| 20/20 [00:05<00:00,  3.66it/s]


Evaluating fixed_size-1024-200 with K = 8


100%|██████████| 20/20 [00:05<00:00,  3.39it/s]


Evaluating recursive-2048-200 with K = 4


100%|██████████| 20/20 [00:06<00:00,  2.88it/s]


Evaluating fixed_size-1024-0 with K = 6


100%|██████████| 20/20 [00:05<00:00,  3.44it/s]


Evaluating recursive-1024-0 with K = 8


100%|██████████| 20/20 [00:05<00:00,  3.58it/s]


Evaluating fixed_size-2048-0 with K = 3


100%|██████████| 20/20 [00:05<00:00,  3.57it/s]


Evaluating recursive-512-200 with K = 19


100%|██████████| 20/20 [00:05<00:00,  3.36it/s]


Evaluating recursive-512-0 with K = 18


100%|██████████| 20/20 [00:05<00:00,  3.53it/s]


Evaluating fixed_size-512-0 with K = 12


100%|██████████| 20/20 [00:05<00:00,  3.38it/s]


Evaluating markdown_header with K = 7


100%|██████████| 20/20 [00:05<00:00,  3.55it/s]


Evaluating markdown_header_parent with K = 7


100%|██████████| 20/20 [00:06<00:00,  3.31it/s]


Evaluating fixed_size-2048-200 with K = 4


100%|██████████| 20/20 [00:05<00:00,  3.64it/s]


### Best Chunking Strategy

In [47]:
# Rank every (strategy, chunk size, overlap) combination by mean Recall.
# dropna=False keeps strategies whose chunk size / overlap are missing (NaN).
results.drop(columns=["MAP"]).groupby([eval_name, "Chunk Size", "Chunk Overlap"], dropna=False).mean().sort_values(by="Recall", ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Precision,Recall,NDCG
Ratio-K-0.05-text-embedding-3-small,Chunk Size,Chunk Overlap,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
semantic_chunks_95,,,1.0,0.661667,0.888422
recursive,2048.0,200.0,0.9375,0.627172,0.908906
fixed_size,2048.0,200.0,0.875,0.625354,0.908574
recursive,2048.0,0.0,0.875,0.625354,0.908903
semantic_chunks_90,,,0.841667,0.616118,0.915729
fixed_size,1024.0,200.0,0.90625,0.597243,0.88917
recursive,1024.0,0.0,0.9,0.58663,0.902028
markdown_header,,,0.964286,0.577365,0.898449
recursive,1024.0,200.0,0.9,0.576862,0.91689
fixed_size,1024.0,0.0,0.916667,0.565604,0.905585


### Best Chunk Size

In [48]:
# Average the metrics per chunk size (NaN = strategies without a fixed size)
# and rank the sizes by mean Recall.
results.drop(columns=[eval_name, "Chunk Overlap", "MAP"]).groupby("Chunk Size", dropna=False).mean().sort_values(by="Recall", ascending=False)

Unnamed: 0_level_0,Precision,Recall,NDCG
Chunk Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2048.0,0.901042,0.60947,0.909941
,0.940774,0.604927,0.900089
1024.0,0.905729,0.581585,0.903418
512.0,0.867639,0.552511,0.860665


### Best Overlap

In [41]:
# Average the metrics per chunk overlap and rank the overlaps by mean NDCG.
# NOTE(review): unlike the two tables above, dropna=False is not passed (rows
# without an overlap are excluded) and the sort key is NDCG, not Recall —
# confirm this is intended.
results.drop(columns=[eval_name, "Chunk Size", "MAP"]).groupby("Chunk Overlap").mean().sort_values(by="NDCG", ascending=False)

Unnamed: 0_level_0,Precision,Recall,NDCG
Chunk Overlap,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.479302,0.979497,0.937772
200.0,0.478943,0.983321,0.934517


## Evaluate Generation (TODO)

In [None]:

# Allow nested use of the already running event loop.
# NOTE(review): no asyncio call is visible in this cell — confirm it is still needed.
nest_asyncio.apply()

# Instructions for the grading model: answer with a single number from 0 to 10
# that rates how well a GENERATED ANSWER matches the GROUND TRUTH ANSWER.
answer_correctness_system_prompt = """You are a CORRECTNESS grader; providing the correctness of the given GENERATED ANSWER compared to the given GROUND TRUTH ANSWER.
Respond only as a number from 0 to 10 where 0 is the least correct and 10 is the most correct.

A few additional scoring guidelines:

- Long GENERATED ANSWERS should score equally well as short GENERATED ANSWERS.

- CORRECTNESS score should increase as the GENERATED ANSWER matches more accurately with the GROUND TRUTH ANSWER.

- CORRECTNESS score should increase as the GENERATED ANSWER covers more parts of the GROUND TRUTH ANSWER accurately.

- GENERATED ANSWERS that partially match the GROUND TRUTH ANSWER should score 2, 3, or 4. Higher scores indicate more correctness.

- GENERATED ANSWERS that mostly match the GROUND TRUTH ANSWER should get a score of 5, 6, 7, or 8. Higher scores indicate more correctness.

- GENERATED ANSWERS that fully match the GROUND TRUTH ANSWER should get a score of 9 or 10. Higher scores indicate more correctness.

- GENERATED ANSWERS must be fully accurate and comprehensive to the GROUND TRUTH ANSWER to get a score of 10.

- Never elaborate."""

# Per-request message: filled with the reference answer and the generated
# answer; ends with "CORRECTNESS: " so the model completes it with the score.
answer_correctness_user_prompt = PromptTemplate.from_template(
    """GROUND TRUTH ANSWER: {ground_truth_answer}

GENERATED ANSWER: {generated_answer}

CORRECTNESS: """
)

# RAG prompt pulled from the LangChain hub under the name "rlm/rag-prompt".
prompt = hub.pull("rlm/rag-prompt")
# Model that answers the questions; rebinds the generator_llm name that the
# question generation section also uses.
generator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
def format_docs(docs):
    """Concatenate the ``page_content`` of ``docs`` into one context string.

    The documents are separated by a blank line; an empty iterable gives "".
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
        
# Generation evaluation (work in progress): answer every question with a RAG
# chain on top of each experiment's vector store and let the critic LLM grade
# the answer against a reference answer, reported per question type.
for experiment_name, testsets in datasets.items():
    print("Evaluating", experiment_name)
    vector_stores[experiment_name].embeddings.show_progress_bar = False
    retriever = vector_stores[experiment_name].as_retriever(search_kwargs={"k": 10})
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | generator_llm
        | StrOutputParser()
    )

    # Group the testsets by question type. The datasets used for the retrieval
    # evaluation are flat lists of testsets carrying a "type" key; a mapping
    # that is already keyed by question type is accepted as well.
    # (The inner loop used to rebind `datasets` while iterating over it and
    # called .items() on a list.)
    if isinstance(testsets, dict):
        testsets_by_type = testsets
    else:
        testsets_by_type = {}
        for testset in testsets:
            testsets_by_type.setdefault(testset["type"], []).append(testset)

    for question_type, typed_testsets in testsets_by_type.items():
        scores = []
        for testset in typed_testsets:
            ground_truth_answer = testset.get("ground_truth_answer")
            if ground_truth_answer is None:
                # TODO: the datasets created above do not contain reference
                # answers yet; such testsets cannot be graded.
                continue

            generated_answer = rag_chain.invoke(testset["question"])
            answer_correctness_prompt = answer_correctness_user_prompt.format(
                ground_truth_answer=ground_truth_answer, generated_answer=generated_answer
            )

            llm_messages = [
                SystemMessage(content=answer_correctness_system_prompt),
                HumanMessage(content=answer_correctness_prompt),
            ]
            grading = make_request_with_backoff(llm_messages)
            scores.append(re_0_10_rating(grading.content))

        if not scores:
            # Avoid dividing by zero when nothing could be graded.
            print(f"Experiment: {experiment_name} Question Type: {question_type} has no testsets with a ground truth answer")
            continue

        mean_answer_correctness = sum(scores) / len(scores)
        print(f"Experiment: {experiment_name} Question Type: {question_type} Mean Answer Correctness: {mean_answer_correctness}")