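# RAG Evaluation

This notebook evaluates a retrieval-augmented generation (RAG) pipeline with DeepEval, scoring each generated answer for correctness, faithfulness, and contextual relevancy.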

### Imports and configs

In [1]:
# Standard library
import json
import os
import sys
from typing import List

# Third-party
from deepeval import evaluate
from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCaseParams, LLMTestCase
from dotenv import load_dotenv
from llama_index.core import Settings, SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding

# Make the parent directory importable BEFORE importing project-local modules.
# Doing this after the import (as the cell originally did) cannot help that
# import on a fresh kernel. The membership check keeps re-runs of this cell
# from appending the same entry repeatedly.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# Project-local
from utils import load_or_create_vector_store  # noqa: E402

# --- Configuration ---------------------------------------------------------
EMBED_DIMENSION = 512   # dimensionality requested from the embedding model
CHUNK_SIZE = 250        # forwarded to load_or_create_vector_store
CHUNK_OVERLAP = 25      # forwarded to load_or_create_vector_store

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail fast with a readable message when the key is missing: assigning None to
# os.environ would otherwise raise an opaque "str expected, not NoneType".
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env file."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=EMBED_DIMENSION)

# --- Data loading ----------------------------------------------------------
path = "../data/"
# NOTE: despite its name, `node_parser` is a SimpleDirectoryReader; the name is
# kept so any later cell that refers to it keeps working.
node_parser = SimpleDirectoryReader(input_dir=path, required_exts=['.txt', '.pdf'])
documents = node_parser.load_data()

# --- Vector store ----------------------------------------------------------
CACHE_DIR = "../cache"
VECTOR_STORE_PATH = os.path.join(CACHE_DIR, "faiss_index.pkl")
HASH_PATH = os.path.join(CACHE_DIR, "documents_hash.txt")

# Project helper from `utils`; judging by its arguments it reuses a cached
# index when possible (see utils for the exact behavior).
vector_store_index = load_or_create_vector_store(
    documents,
    EMBED_DIMENSION,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    cache_dir=CACHE_DIR,
    vector_store_path=VECTOR_STORE_PATH,
    hash_path=HASH_PATH,
)
retriever = vector_store_index.as_retriever(similarity_top_k=2)




Loading vector store from cache...


### Evaluation

In [2]:
def create_deep_eval_test_cases(
    questions: List[str],
    gt_answers: List[str],
    generated_answers: List[str],
    retrieved_documents: List[List[str]]
) -> List[LLMTestCase]:
    """
    Create a list of LLMTestCase objects for evaluation.

    The four inputs are parallel lists: position ``i`` of each list describes
    the same question.

    Args:
        questions (List[str]): List of input questions.
        gt_answers (List[str]): List of ground truth answers.
        generated_answers (List[str]): List of generated answers.
        retrieved_documents (List[List[str]]): For each question, the texts of
            the documents retrieved for it; each entry is passed as the test
            case's ``retrieval_context``.

    Returns:
        List[LLMTestCase]: One LLMTestCase per question, in input order.

    Raises:
        ValueError: If the input lists do not all have the same length.
    """
    # zip() would silently truncate to the shortest list and drop test cases,
    # so reject mismatched inputs explicitly.
    lengths = {
        "questions": len(questions),
        "gt_answers": len(gt_answers),
        "generated_answers": len(generated_answers),
        "retrieved_documents": len(retrieved_documents),
    }
    if len(set(lengths.values())) > 1:
        raise ValueError(f"All inputs must have the same length, got {lengths}")

    return [
        LLMTestCase(
            input=question,
            expected_output=gt_answer,
            actual_output=generated_answer,
            retrieval_context=retrieved_document
        )
        for question, gt_answer, generated_answer, retrieved_document in zip(
            questions, gt_answers, generated_answers, retrieved_documents
        )
    ]

In [3]:
# Judge model shared by every LLM-based metric below.
LLM_MODEL = "gpt-4o"

# --- Evaluation metrics ---

# Correctness: G-Eval compares the generated answer with the expected answer.
# No threshold is passed here, so the library default applies.
correctness_metric = GEval(
    name="Correctness",
    evaluation_steps=[
        "Determine whether the actual output is factually correct based on the expected output."
    ],
    evaluation_params=[LLMTestCaseParams.EXPECTED_OUTPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model=LLM_MODEL,
)

# Faithfulness: passes at a score of 0.7 or above; no textual reason is requested.
faithfulness_metric = FaithfulnessMetric(model=LLM_MODEL, threshold=0.7, include_reason=False)

# Contextual relevancy: threshold of 1, i.e. only a perfect score passes;
# the judge's reason is included in the report.
relevance_metric = ContextualRelevancyMetric(model=LLM_MODEL, threshold=1, include_reason=True)

def evaluate_rag(
    query_engine,
    num_questions: int = 5,
    q_a_file_name: str = "../data/q_a.json",
) -> None:
    """
    Evaluate the RAG system using predefined metrics.

    Loads question/answer pairs from a JSON file, queries the engine with each
    question, and runs DeepEval on the resulting test cases using the
    module-level correctness, faithfulness and relevance metrics.

    Args:
        query_engine: Query engine to ask questions and get answers along with retrieved context.
            Its ``query(question)`` result must expose ``response`` and ``source_nodes``.
        num_questions (int): Number of questions to evaluate (default: 5).
        q_a_file_name (str): Path to a JSON file holding a list of objects with
            ``"question"`` and ``"answer"`` keys (default: "../data/q_a.json").

    Raises:
        ValueError: If ``num_questions`` is negative.
    """
    # A negative value would make the slice below mean "all but the last n",
    # which is never what a question count should do.
    if num_questions < 0:
        raise ValueError(f"num_questions must be non-negative, got {num_questions}")

    # Load questions and answers from JSON file, keeping only the first num_questions pairs
    with open(q_a_file_name, "r", encoding="utf-8") as json_file:
        q_a = json.load(json_file)[:num_questions]

    questions = [qa["question"] for qa in q_a]
    ground_truth_answers = [qa["answer"] for qa in q_a]
    generated_answers = []
    retrieved_documents = []

    # Generate answers and retrieve documents for each question
    for question in questions:
        response = query_engine.query(question)
        # One list of source texts per question: the retrieval context for that test case.
        retrieved_documents.append([doc.text for doc in response.source_nodes])
        generated_answers.append(response.response)

    # Create test cases and evaluate
    test_cases = create_deep_eval_test_cases(questions, ground_truth_answers, generated_answers, retrieved_documents)
    evaluate(
        test_cases=test_cases,
        metrics=[correctness_metric, faithfulness_metric, relevance_metric]
    )

In [4]:
# Build a query engine that retrieves the top 2 most similar chunks per question,
# then evaluate it on the first 6 question/answer pairs.
query_engine = vector_store_index.as_query_engine(similarity_top_k=2)
evaluate_rag(query_engine=query_engine, num_questions=6)

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 6 test case(s) in parallel: |██████████|100% (6/6) [Time Taken: 00:06,  1.10s/test case]



Metrics Summary

  - ✅ Correctness (GEval) (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The actual output is factually correct as it matches the expected output verbatim regarding the Green party's promise., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: None, error: None)
  - ✅ Contextual Relevancy (score: 1.0, threshold: 1.0, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the context perfectly aligns with the input. Great job!, error: None)

For test case:

  - input: By how much does the Green party promise to increase the disability benefits?
  - actual output: The Green party promises to increase disability benefits by 5%.
  - expected output: The Green party promises to increase disability benefits by 5%.
  - context: None
  - retrieval context: ['• Increase all disability benefits by 5%.\n• Ensure that pensions are always uprated in line with inflation an


