In [None]:
from pydantic import BaseModel, Field

class TestQuery(BaseModel):
    question: str = Field(..., description = "User prompt")
    keywords: list[str] = Field(..., description = "Words that should present in response")
    reference_answer: str
    category: str

class RetrivalMetrics(BaseModel):
    mean_recursive_rank: float
    discounted_cummulative_gain: float

In [9]:
def load_tests() -> list[TestQuery]:
    path = './tests.jsonl'
    tests = []
    with open(path, 'r', encoding = 'utf-8') as data:
        for test in data:
            tests.append(TestQuery.model_validate_json(json_data = test))
    return tests

In [40]:
import os

from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document

DB_NAME = 'vector_db'
EMBEDDING_MODEL_NAME = 'all-minilm'

embedding = OllamaEmbeddings(model = EMBEDDING_MODEL_NAME)

if os.path.exists(DB_NAME):
    vector_store = Chroma(persist_directory = DB_NAME)
else:
    raise ValueError('No datastore found!')

retriever = vector_store.as_retriever()

In [41]:
def get_chunks_from_db(question) -> list[Document]:
    return retriever.invoke(input = question, k = 5)

# Evaluate Retrieval

In [11]:
from math import log2

In [12]:
def mean_recursive_rank(chunks: list[str], keyword: str) -> float:
    keyword = keyword.lower()
    for rank, chunk in enumerate(chunks, start = 1):
        if keyword in chunk.lower():
            return 1.0 / rank
    return 0.0

In [13]:
def discounted_cummulative_gain(chunk: str, keywords: list[str]) -> float:
    chunk = chunk.lower()
    for keyword in keywords:
        if keyword.lower() in chunk:
            return 1.0
    return 0.0

In [14]:
def compute_discounted_cummulative_gain(relavency_list: list[int]):
    dcg = 0.0
    for rank, relavency in enumerate(relavency_list, start = 2):
        dcg += (relavency / log2(rank))
    return dcg

In [None]:
from tqdm import tqdm

def evaluate_retrieval_metrics() -> RetrivalMetrics:
    tests = load_tests()

    mrr = []
    dcg = []

    for test in tqdm(tests):
        
        keywords = test.keywords
        chunks = get_chunks_from_db(test.question)
        chunks = [chunk.page_content for chunk in chunks]

        mrr_score = 0

        for keyword in keywords:
            mrr_score += mean_recursive_rank(chunks = chunks, keyword = keyword)

        mrr.append(mrr_score / len(keywords))

        dcg_scores = []

        for chunk in chunks:
            dcg_scores.append(discounted_cummulative_gain(chunk = chunk, keywords = keywords))

        normalized_dcg_score = compute_discounted_cummulative_gain(relavency_list = dcg_scores)
        ideal_dcg_score = compute_discounted_cummulative_gain(relavency_list = sorted(dcg_scores, reverse = True))

        dcg.append(normalized_dcg_score / ideal_dcg_score if ideal_dcg_score else 0.0)
    
    mrr_average = sum(mrr) / len(mrr) if mrr else 0.0
    dcg_average = sum(dcg) / len(dcg) if dcg else 0.0

    return RetrivalMetrics(mean_recursive_rank = mrr_average, discounted_cummulative_gain = dcg_average)

evaluate_retrieval_metrics()

100%|██████████| 150/150 [00:09<00:00, 15.33it/s]


RetrivalMetrics(mean_recursive_rank=0.7346018518518519, discounted_cummulative_gain=0.9019096262063332)

# Evaluate Answers

In [None]:
from pydantic import BaseModel, Field

class JudgeResponse(BaseModel):
    accuracy: int = Field(description = 'How factually correct is it compared to the reference answer?')
    completeness: int = Field(description = 'How thoroughly does it address all aspects of the question')
    relevance: int = Field(description = 'How well does it directly answer the specific question asked')

In [62]:
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage
from tqdm import tqdm

OLLAMA_MODEL = 'gpt-oss:latest'
llm = ChatOllama(model = OLLAMA_MODEL)

In [18]:
LLM_JUDGE_PROMPT = """
    You are an expert evaluator assessing the quality of answers. 
    Evaluate the generated answer by comparing it to the reference answer. 
    Only give 5/5 scores for perfect answers.

    Please evaluate the generated answer on three dimensions:
    1. Accuracy: How factually correct is it compared to the reference answer? Only give 5/5 scores for perfect answers.
    2. Completeness: How thoroughly does it address all aspects of the question, covering all the information from the reference answer?
    3. Relevance: How well does it directly answer the specific question asked, giving no additional information?

    Provide detailed feedback and scores from 1 (very poor) to 5 (ideal) for each dimension. 
    If the answer is wrong, then the accuracy score must be 1.
"""

In [19]:
EVALUATION_CONTEXT = """

    Question:
    {question}

    Generated Answer:
    {generated_answer}

    Reference Answer:
    {reference_answer}

"""

In [53]:
def generate_answer(question: str) -> str:

    SYSTEM_PROMPT = """

        You are a knowledgeable, friendly assistant representing the company Insurellm.
        You are chatting with a user about Insurellm.
        If relevant, use the given context to answer any question.
        If you don't know the answer, say so.

        Context:
        {context}

    """

    chunks = get_chunks_from_db(question = question)
    chunks = [chunk.page_content for chunk in chunks]
    context = "\n".join(chunks)

    response = llm.invoke(input = [SystemMessage(content = SYSTEM_PROMPT.format(context = context)), HumanMessage(content = question)])
    return response.content

In [None]:
def evaluate_answers() -> tuple[int, int, int]:
    tests = load_tests()

    accuracy = 0.0
    completeness = 0.0
    relevance = 0.0

    for test in tqdm(tests):

        system = SystemMessage(content = LLM_JUDGE_PROMPT)
        human = HumanMessage(content = EVALUATION_CONTEXT.format(
            question = test.question,
            generated_answer = generate_answer(test.question),
            reference_answer = test.reference_answer
        ))
        judge = llm.with_structured_output(schema = JudgeResponse)
        response = judge.invoke(input = [system, human])
        accuracy += response.accuracy
        completeness += response.completeness
        relevance += response.relevance

    return accuracy, completeness, relevance

In [None]:
evaluate_answers()

  0%|          | 0/150 [00:15<?, ?it/s]


(3.0, 5.0, 3.0)