In [1]:
import sys

# Put the parent directory on the module search path; presumably this is
# where the project packages imported in the cells below live.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate "../" entry to sys.path.
if "../" not in sys.path:
    sys.path.append("../")

In this notebook we compare all retrievers.
We check how they perform relative to one another
and select the 10 best retrievers, which will later be used for our RAG.

To test all retrievers:
1. Get all dense, sparse and hybrid retrievers
2. Query each retriever and get its top 10 results
3. Calculate metrics

REMEMBER TO CACHE WHAT YOU CAN!

In [2]:
from elasticsearch import Elasticsearch
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from cache.cache import Cache

# Clients for the locally running Qdrant and Elasticsearch services, plus the
# cache object that is passed to the repositories created below.
qdrant_client = QdrantClient(host="localhost", port=6333)
es_client = Elasticsearch(hosts=["http://localhost:9200"])
cache = Cache()

In [3]:
from typing import Dict
from common.names import INDEX_NAMES
from repository.es_repository import ESRepository

# One ESRepository per index name, all sharing the same ES client and cache.
es_repositories: Dict[str, ESRepository] = {
    index_name: ESRepository(es_client, index_name, cache)
    for index_name in INDEX_NAMES
}

For ES repositories:
1. Run tests for each of 7 indexes
2. Run tests for each of 2 datasets
3. Run tests for each character splitting type

In [4]:
from dataset.polqa_dataset_getter import PolqaDatasetGetter
from dataset.poquad_dataset_getter import PoquadDatasetGetter

# Build both dataset getters first, then pull the test split from each
# (same construction and call order as two separate statements per step).
poquad_dataset_getter, polqa_dataset_getter = (
    PoquadDatasetGetter(),
    PolqaDatasetGetter(),
)

poquad_dataset, polqa_dataset = (
    poquad_dataset_getter.get_test_dataset(),
    polqa_dataset_getter.get_test_dataset(),
)

In [5]:
from common.names import CHARACTER_SPLITTING_FUNCTION, DATASET_NAMES, MODEL_NAMES, SEMANTIC_TYPES
from common.utils import get_semantic_dataset_key, get_split_dataset_key, replace_slash_with_dash
from evaluation.retriever_evaluator import RetrieverEvaluator

evaluator = RetrieverEvaluator()

# Save score by indexname-datasetkey
# NOTE(review): nothing in this cell writes to `scores`; confirm whether a
# later cell fills it or whether the means computed below should be stored.
scores: Dict[str, float] = {}

# For every Elasticsearch repository and every character-splitting variant,
# query the repository with each PoQuAD test question and report the mean
# nDCG and the mean MRR over the whole test set.
for es_repository in es_repositories.values():
    # poquad - character splitting
    for split in CHARACTER_SPLITTING_FUNCTION:
        # DATASET_NAMES[1] is used here as the PoQuAD dataset name.
        dataset_key = get_split_dataset_key(DATASET_NAMES[1], split)
        run_label = f"{es_repository.index_name}-{dataset_key}"
        ndcgs = []
        mrrs = []

        for entry in poquad_dataset:
            title = entry["title"]
            query = entry["question"]
            result = es_repository.find(query, dataset_key)

            # The entry's title is the target both metrics are computed against.
            ndcgs.append(evaluator.calculate_ndcg(result, title))
            mrrs.append(evaluator.calculate_mrr(result, title))

        # Plain arithmetic means; an empty dataset raises ZeroDivisionError.
        ndcg_score = sum(ndcgs) / len(ndcgs)
        mrr_score = sum(mrrs) / len(mrrs)

        # Name the metric on each line so the two values are distinguishable
        # in the output instead of sharing one identical label.
        print(f"{run_label} nDCG: {ndcg_score}")
        print(f"{run_label} MRR: {mrr_score}")

basic_index-clarin-pl-poquad-character-500: 0.4809383856518945
basic_index-clarin-pl-poquad-character-500: 0.47190580835619
basic_index-clarin-pl-poquad-character-1000: 0.49526676878964876
basic_index-clarin-pl-poquad-character-1000: 0.4775363366489321
basic_index-clarin-pl-poquad-character-2000: 0.504805028250532
basic_index-clarin-pl-poquad-character-2000: 0.48425200753445025
polish_index-clarin-pl-poquad-character-500: 0.5467760715519275
polish_index-clarin-pl-poquad-character-500: 0.5422323700472556
polish_index-clarin-pl-poquad-character-1000: 0.5636051692010742
polish_index-clarin-pl-poquad-character-1000: 0.5507422232135972
polish_index-clarin-pl-poquad-character-2000: 0.5686764296297864
polish_index-clarin-pl-poquad-character-2000: 0.5517567303349745
polish_whitespace_index-clarin-pl-poquad-character-500: 0.5438914759110157
polish_whitespace_index-clarin-pl-poquad-character-500: 0.5391043752684974
polish_whitespace_index-clarin-pl-poquad-character-1000: 0.560268131223659
polish