# Multi Hop RAG Benchmark

![](../assets/multi_hop_rag.png)

Questions and ground truths are provided by the Multi Hop RAG Benchmark.
The ground truths are hard to use because they consist of facts (sentences) that could have been split up by the chunking strategies. This may not be a big problem, though, as sentences shouldn't be split up anyway.

**Can't use gpt-4o-mini because its knowledge cutoff is in 2023, so it may be able to answer the query without needing RAG.**

## Setup

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import time
import os
from typing import List, Dict, TypedDict
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import nest_asyncio
from dotenv import load_dotenv

import tiktoken
import openai
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import VectorStore
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain.schema import StrOutputParser

In [3]:
# Read variables from a .env file into the process environment; `loaded`
# keeps load_dotenv's return value.
loaded = load_dotenv()

# Root folder of the benchmark data, also published through the
# CHUNKING_BENCHMARK_DATADIR environment variable.
data_dir = "data/multi_hop_rag/"
os.environ["CHUNKING_BENCHMARK_DATADIR"] = data_dir

# 1. Load and Save Multi Hop RAG Dataset

In [None]:
from utils.loader import save_documents

# Read the raw benchmark corpus. JSON text is UTF-8, so the encoding is pinned
# rather than left to the platform default.
with open(data_dir + "dataset/corpus.json", "r", encoding="utf-8") as corpus_file:
    load_data = json.load(corpus_file)

# Wrap every corpus entry in a Document: the article body becomes the page
# content; title, publication date and source are kept as metadata.
documents: List[Document] = [
    Document(
        page_content=data["body"],
        metadata={
            "title": data["title"],
            "published_at": data["published_at"],
            "source": data["source"],
        },
    )
    for data in load_data
]

# Hand the documents to the project's save_documents helper together with
# the data directory.
save_documents(documents, data_dir)

In [None]:
# Estimate the embedding cost of the corpus from its total token count.
encoding = tiktoken.encoding_for_model("text-embedding-3-small")
num_tokens = sum(len(encoding.encode(doc.page_content)) for doc in documents)

# NOTE(review): 0.01 is presumably the price in USD per one million tokens;
# confirm against the current OpenAI embedding pricing.
cost = (num_tokens/1000000) * 0.01
print(f"Cost of embedding chunks: {cost} with {num_tokens} tokens")

In [None]:
# Rebind `documents` from the project's load_documents helper, given data_dir.
# NOTE(review): presumably this reads back what save_documents wrote; confirm
# in utils.loader.
from utils.loader import load_documents
documents = load_documents(data_dir)

# 2. Apply chunking

In [None]:
# Execute the chunking notebook with -i, i.e. inside this notebook's own
# namespace, so it sees the variables defined above.
# NOTE(review): presumably it produces the chunks that load_chunks reads in
# the next cell; confirm in chunking_strategies.ipynb.
%run -i ../chunking_strategies.ipynb

In [4]:
# Load the chunked documents for data_dir. The indexing cell iterates the
# result with .items(), treating it as a mapping of experiment name -> chunks.
from utils.loader import load_chunks
split_chunks = load_chunks(data_dir)

# 3. Indexing/Ingestion

In [5]:
# One FAISS vector store per chunking experiment, keyed by experiment name.
vector_stores: Dict[str, VectorStore] = {}

embeddings = OpenAIEmbeddings(model="text-embedding-3-small", show_progress_bar=True)
for experiment_name, chunks in split_chunks.items():
    store_path = f"{data_dir}vector_stores/{experiment_name}"

    # Reuse an index saved by an earlier run instead of embedding again.
    if os.path.exists(store_path):
        print("Loading", experiment_name)
        # NOTE: allow_dangerous_deserialization=True lets FAISS unpickle the
        # stored index; only load index folders that were created locally.
        vector_stores[experiment_name] = FAISS.load_local(
            store_path, embeddings, allow_dangerous_deserialization=True
        )
        continue

    # No saved index yet: embed the chunks, then save the index to disk.
    print("Indexing", experiment_name)
    new_store = FAISS.from_documents(chunks, embeddings)
    vector_stores[experiment_name] = new_store
    new_store.save_local(store_path)

Loading fixed-size-1000-0
Loading recursive-1000-0
Loading semantic-chunks


# Evaluation

In [6]:
# Load the benchmark queries. JSON text is UTF-8, so the encoding is pinned
# rather than left to the platform default.
# NOTE(review): this path does not go through data_dir ("data/multi_hop_rag/")
# the way the corpus path does; confirm which location is intended.
with open("multi_hop_rag/dataset/MultiHopRAG.json", "r", encoding="utf-8") as query_file:
    query_data = json.load(query_file)

In [None]:
# Estimate the embedding cost of the benchmark queries from their total
# token count.
encoding = tiktoken.encoding_for_model("text-embedding-3-small")
num_tokens = sum(len(encoding.encode(data['query'])) for data in query_data)

# NOTE(review): 0.01 is presumably the price in USD per one million tokens;
# confirm against the current OpenAI embedding pricing.
cost = (num_tokens/1000000) * 0.01
# This cell measures the queries, so the message says "queries" rather than
# the "chunks" wording copied from the corpus cost cell.
print(f"Cost of embedding queries: {cost} with {num_tokens} tokens")

## Evaluating Retrieval

In [7]:
from utils.evaluation import calculate_metrics, calculate_mean_metrics

# Score retrieval for every chunking experiment on the first 100 benchmark
# entries and print the averaged metrics.
for experiment_name, vector_store in vector_stores.items():
    print("Generating evaluation dataset for", experiment_name)
    # Turn the embedding progress bar off once per store; it does not depend
    # on the individual query, so it is set outside the query loop.
    vector_store.embeddings.show_progress_bar = False
    metrics = []
    for data in tqdm(query_data[:100]):
        # Entries of type 'null_query' are excluded from retrieval scoring.
        if data['question_type'] == 'null_query':
            continue
        evidence = data["evidence_list"]
        # Retrieve as many chunks as the query has evidence facts.
        retriever = vector_store.as_retriever(search_kwargs={"k": len(evidence)})
        retrieved_chunks = retriever.invoke(data["query"])
        retrieved_chunks_content = [doc.page_content for doc in retrieved_chunks]
        # Every gold fact gets the same relevance weight of 1.0.
        # NOTE(review): the recorded runs report ndcg 0.0 for every strategy;
        # check how calculate_metrics uses these weights.
        ground_truths = {gold["fact"]: 1.0 for gold in evidence}
        metrics.append(calculate_metrics(retrieved_chunks_content, ground_truths))
    mean_metrics = calculate_mean_metrics(metrics)
    print("Mean metrics for", experiment_name, mean_metrics)

    


Generating evaluation dataset for fixed-size-1000-0


100%|██████████| 100/100 [00:30<00:00,  3.30it/s]


Mean metrics for fixed-size-1000-0 {'precision': 0.19230769230769232, 'recall': 0.19230769230769232, 'map': 0.15018315018315018, 'ndcg': 0.0}
Generating evaluation dataset for recursive-1000-0


100%|██████████| 100/100 [00:28<00:00,  3.46it/s]


Mean metrics for recursive-1000-0 {'precision': 0.27289377289377287, 'recall': 0.27289377289377287, 'map': 0.22115384615384612, 'ndcg': 0.0}
Generating evaluation dataset for semantic-chunks


100%|██████████| 100/100 [00:29<00:00,  3.41it/s]

Mean metrics for semantic-chunks {'precision': 0.28388278388278393, 'recall': 0.28388278388278393, 'map': 0.22779304029304026, 'ndcg': 0.0}





## Evaluating Generation

In [None]:
# Apply nest_asyncio's patch so asyncio event loops can be nested.
nest_asyncio.apply()

# Experiment name -> dict of parallel "question" / "answer" / "ground_truth"
# lists, filled in by the generation loop in this cell.
evaluation_datasets = {}
# RAG prompt pulled from the LangChain Hub.
prompt = hub.pull("rlm/rag-prompt")
# NOTE(review): the notebook intro warns that gpt-4o-mini may answer these
# queries without needing RAG; confirm this model choice.
generator_llm = ChatOpenAI(model="gpt-4o-mini")
def format_docs(docs):
    """Return the ``page_content`` of each document, joined by blank lines."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)
        
# Generate an answer with a RAG chain for the first 10 benchmark entries,
# once per chunking experiment.
for experiment_name, vector_store in vector_stores.items():
    print("Evaluating", experiment_name)
    records = {"question": [], "answer": [], "ground_truth": []}
    evaluation_datasets[experiment_name] = records
    vector_store.embeddings.show_progress_bar = False
    retriever = vector_store.as_retriever()

    # Retrieved chunks are formatted into the prompt's context while the
    # question is passed through unchanged.
    chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
    rag_chain = chain_inputs | prompt | generator_llm | StrOutputParser()

    for data in tqdm(query_data[:10]):
        # Entries of type 'null_query' are left out of the dataset.
        if data['question_type'] != 'null_query':
            query = data["query"]
            response = rag_chain.invoke(query)
            records["question"].append(query)
            records["answer"].append(response)
            records["ground_truth"].append(data["answer"])


In [None]:
from datasets import Dataset
from ragas.metrics import answer_correctness
from ragas import evaluate

# Apply nest_asyncio's patch so asyncio event loops can be nested.
nest_asyncio.apply()

# LLM passed to ragas for judging the generated answers.
critic_llm = ChatOpenAI(model="gpt-4o-mini")

# Score each experiment's generated answers against the benchmark answers.
for experiment_name, data in evaluation_datasets.items():
    dataset = Dataset.from_dict(data)
    score = evaluate(dataset, metrics=[answer_correctness], llm=critic_llm)
    # Per-question scores, stored next to the questions they belong to.
    per_question = score.to_pandas()["answer_correctness"]
    answer_correctnesses = per_question.tolist()
    data["answer_correctnesses"] = answer_correctnesses
    # Report the mean score. The earlier `score.values` was never called, so
    # it printed a bound-method repr instead of a number.
    print(f"Answer correctness for {experiment_name}: {per_question.mean()}")