In [98]:
from pydantic import BaseModel, Field

class TestQuery(BaseModel):
    # One evaluation case; load_tests() builds one instance per line of the JSONL test file.
    # `#` comments are used instead of a docstring so the model's generated schema is unchanged.

    # Passed to retriever.invoke() as the query text.
    question: str = Field(..., description = "User prompt")
    # Matched against retrieved chunks as case-insensitive substrings to compute the metrics.
    keywords: list[str] = Field(..., description = "Words that should present in response")
    # Not read by any code visible in this part of the notebook.
    reference_answer: str
    # Not read by any code visible in this part of the notebook.
    category: str

class RetrivalMetrics(BaseModel):
    # Aggregate retrieval scores returned by evaluate_retrieval_metrics().

    # Average over all tests of the per-test mean of 1/rank values
    # (a reciprocal-rank score, despite the "recursive" in the name).
    mean_recursive_rank: float
    # Average over all tests of DCG divided by the ideal DCG, i.e. a normalized DCG.
    discounted_cummulative_gain: float

In [99]:
def load_tests(path: str = './tests.jsonl') -> list[TestQuery]:
    """Load the evaluation queries from a JSON Lines file.

    Args:
        path: Location of the JSONL file. Every non-blank line must be a JSON
            object that validates as a ``TestQuery``. Defaults to the file the
            notebook has always read.

    Returns:
        The parsed queries, in file order.

    Raises:
        OSError: If the file cannot be opened.
        pydantic.ValidationError: If a non-blank line is not a valid ``TestQuery``.
    """
    tests = []
    with open(path, 'r', encoding = 'utf-8') as data:
        for line in data:
            # Blank lines (e.g. an empty line at the end of the file) are not
            # records; skip them instead of failing JSON validation on them.
            if not line.strip():
                continue
            tests.append(TestQuery.model_validate_json(json_data = line))
    return tests

In [100]:
import os

from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# Directory of the persisted Chroma store and the Ollama model used for embeddings.
DB_NAME = 'vector_db'
EMBEDDING_MODEL_NAME = 'all-minilm'

embedding = OllamaEmbeddings(model = EMBEDDING_MODEL_NAME)

# Fail fast when the persisted store is missing rather than evaluating without data.
if not os.path.exists(DB_NAME):
    raise ValueError('No datastore found!')

# Hand the embedding model to the store so query text is embedded with it.
# NOTE(review): assumes the store was indexed with EMBEDDING_MODEL_NAME — confirm
# against the notebook/script that built `vector_db`.
vector_store = Chroma(persist_directory = DB_NAME, embedding_function = embedding)

retriever = vector_store.as_retriever()

In [101]:
from math import log2

In [102]:
def mean_recursive_rank(chunks: list[str], keyword: str) -> float:
    """Return the reciprocal rank of the first chunk that contains ``keyword``.

    Matching is a case-insensitive substring test. Ranks are 1-based and follow
    the order of ``chunks``.

    Args:
        chunks: Retrieved chunk texts, best match first.
        keyword: Text to look for.

    Returns:
        ``1.0 / rank`` of the first matching chunk, or ``0.0`` when no chunk matches.
    """
    needle = keyword.lower()
    first_hit = next(
        (position for position, text in enumerate(chunks, start = 1) if needle in text.lower()),
        None,
    )
    return 0.0 if first_hit is None else 1.0 / first_hit

In [103]:
def discounted_cummulative_gain(chunk: str, keywords: list[str]) -> float:
    """Return the binary relevance of one chunk with respect to ``keywords``.

    Despite its name, this function applies no discounting: it only reports
    whether the chunk contains at least one keyword (case-insensitive
    substring match).

    Args:
        chunk: Text of a single retrieved chunk.
        keywords: Keywords to look for.

    Returns:
        ``1.0`` if any keyword occurs in the chunk, otherwise ``0.0``.
    """
    haystack = chunk.lower()
    matched = any(term.lower() in haystack for term in keywords)
    return 1.0 if matched else 0.0

In [104]:
def compute_discounted_cummulative_gain(relavency_list: list[int]):
    """Compute the discounted cumulative gain of a ranked relevance list.

    The relevance at 1-based position ``i`` contributes ``relevance / log2(i + 1)``,
    so the first item is undiscounted and later items count progressively less.

    Args:
        relavency_list: Relevance values in ranked order, best rank first.

    Returns:
        The accumulated gain as a float; ``0.0`` for an empty list.
    """
    total = 0.0
    for offset, gain in enumerate(relavency_list):
        # offset is 0-based, so the discount for this position is log2(offset + 2).
        total += gain / log2(offset + 2)
    return total

In [105]:
from tqdm import tqdm

def evaluate_retrieval_metrics() -> RetrivalMetrics:
    """Score the retriever on every test query and average the results.

    For each test the question is sent to the module-level ``retriever`` and
    two scores are computed from the returned chunk texts:

    * the mean, over the test's keywords, of the reciprocal rank of the first
      chunk containing that keyword;
    * the DCG of the chunks' binary keyword relevance divided by the DCG of
      the same relevance values sorted best-first (a normalized DCG). The
      ideal ordering is built from the retrieved chunks only.

    Returns:
        A ``RetrivalMetrics`` holding both scores averaged over all tests
        (``0.0`` for each when there are no tests).
    """
    tests = load_tests()
    RETRIEVED_CHUNKS = 5

    mrr = []
    ndcg = []

    for test in tqdm(tests):
        keywords = test.keywords
        # NOTE(review): whether `k` passed to invoke() is honoured may depend on the
        # installed LangChain version — confirm that RETRIEVED_CHUNKS chunks come back.
        documents = retriever.invoke(input = test.question, k = RETRIEVED_CHUNKS)
        chunks = [document.page_content for document in documents]

        mrr_score = 0
        for keyword in keywords:
            mrr_score += mean_recursive_rank(chunks = chunks, keyword = keyword)

        # A test without keywords has nothing to rank; score it 0.0 (as the nDCG
        # branch below does) instead of raising ZeroDivisionError.
        mrr.append(mrr_score / len(keywords) if keywords else 0.0)

        relevance = [
            discounted_cummulative_gain(chunk = chunk, keywords = keywords)
            for chunk in chunks
        ]

        dcg_score = compute_discounted_cummulative_gain(relavency_list = relevance)
        ideal_dcg_score = compute_discounted_cummulative_gain(relavency_list = sorted(relevance, reverse = True))

        # An ideal DCG of zero means no chunk matched any keyword.
        ndcg.append(dcg_score / ideal_dcg_score if ideal_dcg_score else 0.0)

    mrr_average = sum(mrr) / len(mrr) if mrr else 0.0
    ndcg_average = sum(ndcg) / len(ndcg) if ndcg else 0.0

    return RetrivalMetrics(mean_recursive_rank = mrr_average, discounted_cummulative_gain = ndcg_average)

# Run the evaluation; as the cell's last expression, the returned RetrivalMetrics is displayed.
evaluate_retrieval_metrics()

100%|██████████| 150/150 [00:10<00:00, 14.16it/s]


RetrivalMetrics(mean_recursive_rank=0.7353425925925926, discounted_cummulative_gain=0.9023805709993685)