# My Evaluation Approach


![](assets/ragas.png)


## Setup


In [1]:
# Automatically reload imported modules before each cell runs, so edits to the
# local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import time
import os
from typing import List, Dict, TypedDict
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import nest_asyncio
from dotenv import load_dotenv

import openai
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import VectorStore
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain.schema import StrOutputParser

In [3]:
# Read variables from a local .env file into the process environment
# (presumably the API credentials used by the OpenAI clients below — confirm).
loaded = load_dotenv()

# Root folder holding the documents, vector stores, test sets and results
# that the cells of this notebook read and write.
data_dir = "ragas/"
# Export the same folder through CHUNKING_BENCHMARK; presumably consumed by
# the helper code / chunking notebook — TODO confirm against those files.
os.environ['CHUNKING_BENCHMARK'] = data_dir

# 1. Load and Save Documents


Each file is loaded as a single LangChain document, which may be too large to fit into an LLM's context window. Therefore, we need to split these documents into smaller pieces of text for further processing.

In [None]:
from utils.loader import save_documents

# Folder containing the raw source files, one file per document.
documents_dir = data_dir + "documents"

documents: List[Document] = []
# os.listdir returns entries in arbitrary order; sorting keeps the document
# order (and everything derived from it) stable across runs and machines.
for file in sorted(os.listdir(documents_dir)):
    file_path = os.path.join(documents_dir, file)
    loader = TextLoader(file_path)
    # TextLoader.load() returns a list, so extend rather than append.
    documents.extend(loader.load())

# Persist the loaded documents via the project helper so later runs can
# reload them instead of re-reading the raw files.
save_documents(documents, data_dir)

In [None]:
# Reload the documents from data_dir through the project helper
# (presumably the counterpart of save_documents in the cell above — confirm).
from utils.loader import load_documents
documents = load_documents(data_dir)

# 2. Apply chunking


In [None]:
# Execute the chunking notebook inside this kernel's namespace (-i), so it can
# use the variables defined above; presumably it writes the chunks that the
# next cell loads — TODO confirm.
%run -i chunking_strategies.ipynb

In [4]:
from utils.loader import load_chunks

# Maps an experiment (chunking strategy) name to the list of chunks produced
# by that strategy; each value is later iterated and indexed as a whole.
split_chunks: Dict[str, List[Document]] = load_chunks(data_dir)

# 3. Ingest into vector store

Using FAISS


In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

# One FAISS index per chunking experiment, keyed by experiment name.
vector_stores: Dict[str, VectorStore] = {}

# Embedding model used both to build new indexes and to query cached ones.
embeddings = HuggingFaceEmbeddings(
    model_name="Snowflake/snowflake-arctic-embed-l",
    model_kwargs={"device": 0, 'trust_remote_code': True},  # Comment out to use CPU
)

# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Indexes are stored per embedding model, so switching models never mixes
# vectors from different embedding spaces.
vector_store_dir = f"{data_dir}vector_stores/{embeddings.model_name.replace('/', '-')}"
Path(vector_store_dir).mkdir(parents=True, exist_ok=True)

for experiment_name, chunks in split_chunks.items():
    index_path = f"{vector_store_dir}/{experiment_name}"
    if not os.path.exists(index_path):
        # No cached index yet: embed the chunks and persist the result.
        print("Indexing", experiment_name)
        vector_stores[experiment_name] = FAISS.from_documents(chunks, embeddings)
        vector_stores[experiment_name].save_local(index_path)
    else:
        print("Loading", experiment_name)
        # NOTE(security): loading a FAISS store unpickles data from disk; this
        # flag is only acceptable because the index files were written by this
        # notebook. Do not point it at untrusted files.
        vector_stores[experiment_name] = FAISS.load_local(
            index_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )

Loading markdown-header-recursive-512-0
Loading fixed-size-2048-0
Loading markdown-header-recursive-1024-0
Loading markdown-header-recursive-2048-200
Loading recursive-1024-200
Loading fixed-size-512-0
Loading fixed-size-1024-0
Loading markdown-header-recursive-1024-200
Loading markdown-header-recursive-2048-0
Loading fixed-size-1024-200
Loading markdown-header
Loading recursive-2048-0
Loading fixed-size-2048-200
Loading recursive-512-200
Loading recursive-2048-200
Loading recursive-1024-0
Loading fixed-size-512-200
Loading semantic-chunks-95-recursive-2048-200
Loading semantic-chunks-90
Loading semantic-chunks-95
Loading recursive-512-0
Loading markdown-header-recursive-512-200


# 4. Evaluation


## Create Golden Datasets

Create the golden dataset from a subset of the documents, so that the remaining documents act as irrelevant noise during retrieval.

In [None]:
# Source paths of the documents the golden dataset is meant to be built from;
# the values match the form "<data_dir>documents/<file>".
documents_subset_sources = [
    data_dir+"documents/sleep.md",
    data_dir+"documents/teeth.md",
    data_dir+"documents/time_management.md",
    data_dir+"documents/mentoring.md",
]

### Question Generation with RAGAS


Generate synthetic questions that span multiple documents, to challenge the chunking strategies on multi-context queries.


In [6]:
# Set RAGAS_DO_NOT_TRACK before ragas is imported by the cells below
# (going by the variable name, this opts out of ragas usage tracking — confirm).
from os import environ

environ["RAGAS_DO_NOT_TRACK"] = "true"

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# Allow nested use of asyncio inside the notebook's running event loop.
nest_asyncio.apply()

generator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
critic_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# Deliberately NOT named `embeddings`: that name is bound to the embedding
# model of the vector stores and is passed to the evaluation later on.
# Rebinding it here would silently change what later cells use.
generator_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, generator_embeddings)

# Make sure the output folder exists before the first CSV is written.
testset_dir = Path(data_dir + "testsets")
testset_dir.mkdir(parents=True, exist_ok=True)

# One synthetic test set per chunking experiment, generated from that
# experiment's own chunks.
for experiment_name, chunks in split_chunks.items():
    print("Generating", experiment_name)
    ragas_testset = generator.generate_with_langchain_docs(
        # [chunk for chunk in chunks if chunk.metadata["source"] in documents_subset_sources],
        chunks,
        test_size=10,
        distributions={simple: 0.2, reasoning: 0.3, multi_context: 0.5},
    )
    df = ragas_testset.to_pandas()
    df.to_csv(testset_dir / f"{experiment_name}.csv", index=False)

## Evaluate Retrieval

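Load each experiment's test set, retrieve the top-K chunks for every question, and score the retrieved contexts with RAGAS context precision and context recall.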


In [7]:
from ragas.metrics import context_precision, context_recall
from ragas import evaluate
from datasets import Dataset


# Allow nested use of asyncio (used by Ragas)
nest_asyncio.apply()

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Number of chunks retrieved per question; identical for every experiment.
# Alternative (disabled): a per-question K = len(ground_truth).
K = 10
# Maximum number of evaluation attempts per experiment.
retries = 8

# Make sure the output folder exists before the first result is written.
results_dir = Path(data_dir + "results")
results_dir.mkdir(parents=True, exist_ok=True)

for experiment_name, vector_store in vector_stores.items():
    ragas_testset = pd.read_csv(data_dir+f"testsets/{experiment_name}.csv")
    questions = ragas_testset["question"].tolist()
    ground_truths = ragas_testset["ground_truth"].tolist()
    eval_data = {
        "question": questions,
        "ground_truth": ground_truths,
        "contexts": [],
    }

    # Retrieve the top-K chunks for each question from this experiment's index.
    retriever = vector_store.as_retriever(search_kwargs={"k": K})
    for question in tqdm(questions):
        retrieved_chunks = retriever.invoke(question)
        eval_data["contexts"].append([chunk.page_content for chunk in retrieved_chunks])

    # RAGAS expects a Dataset object
    dataset = Dataset.from_dict(eval_data)

    # Retry with exponential backoff (1, 2, 4, ... seconds); the final
    # failure is re-raised with its original traceback.
    for i in range(retries):
        try:
            print(f"Running evaluation for {experiment_name}")
            result = evaluate(
                dataset=dataset,
                metrics=[context_precision, context_recall],
                raise_exceptions=True,
                # Uses whichever `embeddings` object is currently bound in the kernel.
                embeddings=embeddings,
                llm=llm,
            )
            result.to_pandas().to_csv(results_dir / f"{experiment_name}.csv", index=False)
            break
        except Exception:
            if i == retries - 1:
                raise
            wait_time = 2**i
            print(f"Error, waiting {wait_time} seconds")
            time.sleep(wait_time)


    

100%|██████████| 8/8 [00:01<00:00,  5.16it/s]

Running evaluation for markdown-header-recursive-512-0





Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 20.34it/s]

Running evaluation for fixed-size-2048-0





Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

In [9]:
# Report the mean context precision per experiment from the saved result files.
for experiment_name in vector_stores.keys():
    result = pd.read_csv(data_dir+f"results/{experiment_name}.csv")
    # Named `mean_precision` rather than `map` so the builtin `map` is not
    # shadowed for the rest of the kernel session.
    # pandas skips NaN values by default, so a `nan` result means the file
    # holds no usable context_precision values at all.
    mean_precision = result["context_precision"].mean()
    print(f"MAP for {experiment_name}: {mean_precision}")

MAP for markdown-header-recursive-512-0: 0.3295138888743634
MAP for fixed-size-2048-0: 0.6115039682362264
MAP for markdown-header-recursive-1024-0: 0.5172486772339645
MAP for markdown-header-recursive-2048-200: 0.6374603174436049
MAP for recursive-1024-200: 0.6200651927260762
MAP for fixed-size-512-0: 0.7209920634745784
MAP for fixed-size-1024-0: 0.8884501763530135
MAP for markdown-header-recursive-1024-200: 0.5722163170627643
MAP for markdown-header-recursive-2048-0: 0.49289021162476027
MAP for fixed-size-1024-200: 0.5841187956600791
MAP for markdown-header: 0.8235317460111551
MAP for recursive-2048-0: 0.8150881834044512
MAP for fixed-size-2048-200: nan
MAP for recursive-512-200: nan
MAP for recursive-2048-200: nan
MAP for recursive-1024-0: nan
MAP for fixed-size-512-200: nan
MAP for semantic-chunks-95-recursive-2048-200: nan
MAP for semantic-chunks-90: nan
MAP for semantic-chunks-95: nan
MAP for recursive-512-0: nan
MAP for markdown-header-recursive-512-200: nan
