In [1]:
import sys

# Add the parent directory to the module search path; the project-local imports in
# the cells below (e.g. `common.utils`) presumably resolve through it — confirm.
# The path is relative, so it depends on the kernel's working directory.
sys.path.append("../")

In [2]:
from common.utils import (
    get_all_es_index_combinations,
    get_all_openai_model_combinations,
    get_all_qdrant_model_combinations,
)


# Evaluation grids, one per backend. They are consumed further down:
#   - es_index_combinations     -> run_es_evaluations, unpacked as (index, dataset_key)
#   - qdrant_model_combinations -> run_qdrant_evaluations, unpacked as (model, distance, dataset_key)
#   - openai_model_combinations -> run_openai_evaluations, unpacked as (model, distance, dataset_key)
es_index_combinations = get_all_es_index_combinations()
qdrant_model_combinations = get_all_qdrant_model_combinations()
openai_model_combinations = get_all_openai_model_combinations()

In [3]:
from elasticsearch import Elasticsearch
from qdrant_client import QdrantClient
from cache.cache import Cache


# Shared service handles used by the evaluation functions below. Both services are
# addressed on localhost: Qdrant on port 6333, Elasticsearch on port 9200.
qdrant_client = QdrantClient(host="localhost", port=6333)
es_client = Elasticsearch(
    hosts=["http://localhost:9200"],
)
# Project cache object: it is handed to the repositories and rerankers, and its
# get/set methods are used below to store JSON-encoded scores.
cache = Cache()

In [4]:
from common.names import DATASET_SEED
from dataset.polqa_dataset_getter import PolqaDatasetGetter
from dataset.poquad_dataset_getter import PoquadDatasetGetter
from evaluation.retriever_evaluator import RetrieverEvaluator

poquad_dataset_getter = PoquadDatasetGetter()
polqa_dataset_getter = PolqaDatasetGetter()

# Both datasets are requested with n=500 and the shared DATASET_SEED
# (presumably a seeded random sample of 500 test entries — confirm in the getters).
poquad_dataset = poquad_dataset_getter.get_random_n_test(500, DATASET_SEED)
polqa_dataset = polqa_dataset_getter.get_random_n_test(500, DATASET_SEED)

# Single evaluator instance; run_poquad_evaluation and run_polqa_evaluation read it
# as a notebook-level global.
retriever_evaluator = RetrieverEvaluator()

In [5]:

from typing import Dict

from common.dataset_entry import DatasetEntry
from repository.repository import Repository
from retrievers.retriever import Retriever

def run_poquad_evaluation(dataset: list[DatasetEntry], repository: Repository, retriever: Retriever, dataset_key: str):
    """Average retrieval metrics for `retriever` over the PoQuAD `dataset`.

    For each entry the retriever is queried with `entry.question`, and NDCG, MRR,
    recall and accuracy are computed by the notebook-level `retriever_evaluator`
    against `entry.passage_id`.

    Args:
        dataset: Entries exposing `passage_id` and `question`.
        repository: Used only for `count_relevant_documents(passage_id, dataset_key)`.
        retriever: Object exposing `get_relevant_passages(query)`.
        dataset_key: Passed through to `repository.count_relevant_documents`.

    Returns:
        Dict with the keys "ndcg", "mrr", "recall" and "accuracy", each holding the
        mean of that metric over the scored entries. When no entry was scored (empty
        dataset, or the very first entry has no relevant passages) every value is
        NaN and a warning is printed, instead of raising ZeroDivisionError.

    Note:
        Evaluation stops at the first entry for which the repository reports zero
        relevant passages; the means then cover only the entries scored before it.
    """
    scores: Dict[str, float] = {}

    ndcgs = []
    mrrs = []
    recalls = []
    accuracies = []

    for entry in dataset:
        passage_id = entry.passage_id
        query = entry.question
        result = retriever.get_relevant_passages(query)
        relevant_passages_count = repository.count_relevant_documents(
            passage_id, dataset_key
        )

        if relevant_passages_count == 0:
            print(f"ERROR NO RELEVANT PASSAGES - passage id {passage_id}")
            break

        ndcgs.append(retriever_evaluator.calculate_ndcg(result, passage_id))
        mrrs.append(retriever_evaluator.calculate_mrr(result, passage_id))
        recalls.append(
            retriever_evaluator.calculate_recall(
                result, passage_id, relevant_passages_count
            )
        )
        accuracies.append(retriever_evaluator.calculate_accuracy(result, passage_id))

    if not ndcgs:
        # All four lists grow together, so one emptiness check covers them all.
        print(f"WARNING NO ENTRIES SCORED - dataset key {dataset_key}")

    collected = {
        "ndcg": ndcgs,
        "mrr": mrrs,
        "recall": recalls,
        "accuracy": accuracies,
    }
    for metric_name, values in collected.items():
        # NaN marks "no data" without pretending the retriever scored zero.
        scores[metric_name] = sum(values) / len(values) if values else float("nan")

    return scores

In [6]:
from typing import Dict


def run_polqa_evaluation(dataset: list[DatasetEntry], repository: Repository, retriever: Retriever, dataset_key: str):
    """Average retrieval metrics for `retriever` over the PolQA `dataset`.

    For each entry the retriever is queried with `entry.question`, and NDCG, MRR,
    recall and accuracy are computed by the notebook-level `retriever_evaluator`
    against `entry.passage_id`.

    Args:
        dataset: Entries exposing `passage_id` and `question`.
        repository: Used only for `count_relevant_documents(passage_id, dataset_key)`.
        retriever: Object exposing `get_relevant_passages(query)`; its result must
            expose `passages` as an iterable of 2-tuples (only read when a recall
            above 1 is being reported).
        dataset_key: Passed through to `repository.count_relevant_documents`.

    Returns:
        Dict with the keys "ndcg", "mrr", "recall" and "accuracy", each holding the
        mean of that metric over the scored entries. When no entry was scored every
        value is NaN and a warning is printed, instead of raising ZeroDivisionError.

    Note:
        Evaluation stops early, and the means cover only the entries scored so far,
        when an entry has zero relevant passages, a negative recall, or a recall
        above 1 (the last case also prints the retrieved passages for inspection).
    """
    scores: Dict[str, float] = {}

    ndcgs = []
    mrrs = []
    recalls = []
    accuracies = []

    for entry in dataset:
        passage_id = entry.passage_id
        query = entry.question

        result = retriever.get_relevant_passages(query)
        relevant_passages_count = repository.count_relevant_documents(
            passage_id, dataset_key
        )

        if relevant_passages_count == 0:
            print(f"ERROR NO RELEVANT PASSAGES - passage id {passage_id}")
            break

        ndcg = retriever_evaluator.calculate_ndcg(result, passage_id)
        mrr = retriever_evaluator.calculate_mrr(result, passage_id)
        recall = retriever_evaluator.calculate_recall(
            result, passage_id, relevant_passages_count
        )
        accuracy = retriever_evaluator.calculate_accuracy(result, passage_id)

        # Recall outside [0, 1] indicates inconsistent data; stop rather than average it in.
        if recall < 0:
            break
        if recall > 1:
            print(dataset_key, passage_id, recall, relevant_passages_count)
            for passage, _ in result.passages:
                print(passage)
            break

        ndcgs.append(ndcg)
        mrrs.append(mrr)
        recalls.append(recall)
        accuracies.append(accuracy)

    if not ndcgs:
        # All four lists grow together, so one emptiness check covers them all.
        print(f"WARNING NO ENTRIES SCORED - dataset key {dataset_key}")

    collected = {
        "ndcg": ndcgs,
        "mrr": mrrs,
        "recall": recalls,
        "accuracy": accuracies,
    }
    for metric_name, values in collected.items():
        # NaN marks "no data" without pretending the retriever scored zero.
        scores[metric_name] = sum(values) / len(values) if values else float("nan")

    return scores

In [7]:
from repository.es_repository import ESRepository
from repository.qdrant_repository import QdrantRepository
from rerankers.hf_reranker import HFReranker
from retrievers.es_retriever import ESRetriever

def run_es_evaluations(combinations, datasets, reranker: HFReranker = None):
    """Evaluate the Elasticsearch retriever for every (index, dataset_key) pair.

    Args:
        combinations: Iterable of (index, dataset_key) pairs.
        datasets: Pair (poquad_dataset, polqa_dataset); the PoQuAD one is used when
            "poquad" occurs in the dataset key, the PolQA one otherwise.
        reranker: Optional reranker forwarded to `ESRetriever`.

    Returns:
        Dict mapping "<index>-<dataset_key>" to the metric dict returned by the
        matching evaluation function. Each label and its scores are also printed.
    """
    results: Dict[str, float] = {}
    poquad_entries, polqa_entries = datasets

    for es_index, dataset_key in combinations:
        es_repository = ESRepository(es_client, es_index, cache)
        es_retriever = ESRetriever(es_repository, dataset_key, reranker)

        # The dataset and the evaluation routine are chosen together from the key.
        if "poquad" in dataset_key:
            entries, evaluate = poquad_entries, run_poquad_evaluation
        else:
            entries, evaluate = polqa_entries, run_polqa_evaluation

        metrics = evaluate(entries, es_repository, es_retriever, dataset_key)
        label = f"{es_index}-{dataset_key}"
        results[label] = metrics

        print(label)
        print(metrics)
        print("\n\n")

    return results

In [8]:
import json
from common.names import PASSAGE_PREFIX_MAP, QUERY_PREFIX_MAP
from repository.qdrant_repository import QdrantRepository
from retrievers.qdrant_retriever import QdrantRetriever


def run_qdrant_evaluations(combinations, datasets, reranker: HFReranker = None):
    """Evaluate the Qdrant retriever for every (model, distance, dataset_key) triple.

    Args:
        combinations: Iterable of (model, distance, dataset_key) triples; `model`
            must be a key of PASSAGE_PREFIX_MAP and QUERY_PREFIX_MAP.
        datasets: Pair (poquad_dataset, polqa_dataset); the PoQuAD one is used when
            "poquad" occurs in the dataset key, the PolQA one otherwise.
        reranker: Optional reranker forwarded to `QdrantRetriever`.

    Returns:
        Dict mapping "<model>-<distance>-<dataset_key>" to the metric dict returned
        by the matching evaluation function. Each result is additionally stored in
        the cache under "qdrant_scores:<label>" (JSON-encoded) and printed.
    """
    results: Dict[str, float] = {}
    poquad_entries, polqa_entries = datasets

    for model, distance, dataset_key in combinations:
        vector_repository = QdrantRepository.get_repository(
            qdrant_client,
            model,
            distance,
            cache,
            PASSAGE_PREFIX_MAP[model],
            QUERY_PREFIX_MAP[model],
        )
        vector_retriever = QdrantRetriever(vector_repository, dataset_key, reranker)

        # The dataset and the evaluation routine are chosen together from the key.
        if "poquad" in dataset_key:
            entries, evaluate = poquad_entries, run_poquad_evaluation
        else:
            entries, evaluate = polqa_entries, run_polqa_evaluation

        metrics = evaluate(entries, vector_repository, vector_retriever, dataset_key)
        label = f"{model}-{distance}-{dataset_key}"
        results[label] = metrics

        # Per-combination checkpoint, written as soon as the scores exist.
        cache.set(f"qdrant_scores:{label}", json.dumps(metrics))

        print(label)
        print(metrics)
        print("\n\n")

    return results

In [9]:
import json

# Reuse the scores stored under "score:es" when the cache returns a value;
# otherwise evaluate every ES index combination and store the JSON-encoded result.
es_scores = None
cached_es_scores = cache.get("score:es")

if cached_es_scores is None:
    es_scores = run_es_evaluations(
        es_index_combinations, (poquad_dataset, polqa_dataset)
    )
    cache.set("score:es", json.dumps(es_scores))
else:
    es_scores = json.loads(cached_es_scores)

In [10]:
# Reuse the scores stored under "score:qdrant" when the cache returns a value;
# otherwise evaluate every Qdrant model combination and store the JSON-encoded result.
qdrant_scores = None
cached_qdrant_scores = cache.get("score:qdrant")

if cached_qdrant_scores is None:
    qdrant_scores = run_qdrant_evaluations(
        qdrant_model_combinations, (poquad_dataset, polqa_dataset)
    )
    cache.set("score:qdrant", json.dumps(qdrant_scores))
else:
    qdrant_scores = json.loads(cached_qdrant_scores)

In [11]:
from qdrant_client.models import Distance

# Hybrid (Elasticsearch + Qdrant) setups to evaluate. Each tuple is
# (dataset_key, es_index, qdrant_model, qdrant_distance) — the order in which
# run_hybrid_evaluations unpacks it.
hybrid_combinations = [
    (
        "clarin-pl-poquad-100000",
        "morfologik_index",
        "intfloat/multilingual-e5-large",
        Distance.COSINE,
    ),
    (
        "ipipan-polqa-1000",
        "morfologik_index",
        "sdadas/mmlw-retrieval-roberta-large",
        Distance.COSINE,
    ),
]

In [12]:
from repository import qdrant_repository
from repository.es_repository import ESRepository
from retrievers.es_retriever import ESRetriever
from retrievers.hybrid_retriever import HybridRetriever


def run_hybrid_evaluations(combinations, datasets, alphas: list[float], reranker: HFReranker = None):
    """Evaluate the hybrid (Elasticsearch + Qdrant) retriever over a grid of alphas.

    Args:
        combinations: Iterable of (dataset_key, es_index, qdrant_model,
            qdrant_distance) tuples; `qdrant_model` must be a key of
            PASSAGE_PREFIX_MAP and QUERY_PREFIX_MAP.
        datasets: Pair (poquad_dataset, polqa_dataset); the PoQuAD one is used when
            "poquad" occurs in the dataset key, the PolQA one otherwise.
        alphas: Alpha values forwarded one at a time to `HybridRetriever`
            (the notebook passes floats such as 0.25).
        reranker: Optional reranker forwarded to `HybridRetriever`.

    Returns:
        Dict mapping "<es_index>-<qdrant_model>-<qdrant_distance>-<dataset_key>-<alpha>"
        to the metric dict returned by the matching evaluation function. Each label
        and its scores are also printed.

    Note:
        The two repositories are built once per combination and shared by all
        alphas, since none of their constructor arguments depends on alpha. This
        assumes the repositories keep no per-retriever state — TODO confirm.
    """
    hybrid_scores: Dict[str, Dict[str, float]] = {}

    poquad_dataset, polqa_dataset = datasets

    for dataset_key, es_index, qdrant_model, qdrant_distance in combinations:
        es_repository = ESRepository(es_client, es_index, cache)

        # Named `vector_repository` so the `qdrant_repository` module imported by
        # this cell is not shadowed.
        vector_repository = QdrantRepository.get_repository(
            qdrant_client,
            qdrant_model,
            qdrant_distance,
            cache,
            PASSAGE_PREFIX_MAP[qdrant_model],
            QUERY_PREFIX_MAP[qdrant_model],
        )

        is_poquad = "poquad" in dataset_key
        selected_dataset = poquad_dataset if is_poquad else polqa_dataset
        evaluator_func = run_poquad_evaluation if is_poquad else run_polqa_evaluation

        for alpha in alphas:
            retriever = HybridRetriever(
                es_repository, vector_repository, dataset_key, alpha, reranker
            )

            # The ES repository is the one used to count relevant documents.
            scores = evaluator_func(
                selected_dataset, es_repository, retriever, dataset_key
            )

            label = f"{es_index}-{qdrant_model}-{qdrant_distance}-{dataset_key}-{alpha}"
            hybrid_scores[label] = scores

            print(label)
            print(scores)
            print("\n\n")

    return hybrid_scores

In [13]:
# Reuse the scores stored under "score:hybrid" when the cache returns a value;
# otherwise run the hybrid evaluation over the alpha grid and store the result.
hybrid_scores = None
cached_hybrid_scores = cache.get("score:hybrid")

if cached_hybrid_scores is None:
    hybrid_scores = run_hybrid_evaluations(
        hybrid_combinations,
        (poquad_dataset, polqa_dataset),
        [0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75],
    )
    cache.set("score:hybrid", json.dumps(hybrid_scores))
else:
    hybrid_scores = json.loads(cached_hybrid_scores)

In [14]:
# (index, dataset_key) pairs passed to run_es_evaluations for each reranker.
es_reranker_combinations = [
    ('morfologik_index', 'clarin-pl-poquad-100000'),
    ('basic_index', 'clarin-pl-poquad-500'),
    ('morfologik_index', 'ipipan-polqa-1000'),
    ('basic_index', 'ipipan-polqa-500'),
]

# (model, distance, dataset_key) triples passed to run_qdrant_evaluations for each reranker.
qdrant_reranker_combinations = [
    ("intfloat/multilingual-e5-large", Distance.COSINE, "clarin-pl-poquad-100000"),
    ("sdadas/mmlw-roberta-large", Distance.EUCLID, "clarin-pl-poquad-500"),
    ("sdadas/mmlw-retrieval-roberta-large", Distance.COSINE, "ipipan-polqa-1000"),
    ("sdadas/mmlw-roberta-large", Distance.EUCLID, "ipipan-polqa-500"),
]

In [15]:
from common.names import RERANKER_MODEL_NAMES


def run_reranker_evaluations():
    """Re-run the ES, Qdrant and hybrid evaluations once per reranker model.

    For every name in RERANKER_MODEL_NAMES an `HFReranker` is created and the three
    evaluation runs are executed in the order ES, Qdrant, hybrid (the hybrid run
    uses alphas 0.25, 0.5 and 0.75). All inputs are read from notebook-level globals.

    Returns:
        Dict mapping "<evaluation label>-<reranker model>" to the metric dict
        produced by the corresponding evaluation run.
    """
    combined: Dict[str, float] = {}

    for reranker_model in RERANKER_MODEL_NAMES:
        reranker = HFReranker(reranker_model, cache)

        def with_model_suffix(results):
            # Append the reranker name so labels from different models don't collide.
            return {f"{key}-{reranker_model}": value for key, value in results.items()}

        combined.update(
            with_model_suffix(
                run_es_evaluations(
                    es_reranker_combinations, (poquad_dataset, polqa_dataset), reranker
                )
            )
        )
        combined.update(
            with_model_suffix(
                run_qdrant_evaluations(
                    qdrant_reranker_combinations,
                    (poquad_dataset, polqa_dataset),
                    reranker,
                )
            )
        )
        combined.update(
            with_model_suffix(
                run_hybrid_evaluations(
                    hybrid_combinations,
                    (poquad_dataset, polqa_dataset),
                    [0.25, 0.5, 0.75],
                    reranker,
                )
            )
        )

    return combined

In [16]:
# Reuse the scores stored under "score:reranker" when the cache returns a value;
# otherwise run the reranker evaluations and store the JSON-encoded result.
reranker_scores = None
cached_reranker_scores = cache.get("score:reranker")

if cached_reranker_scores is None:
    reranker_scores = run_reranker_evaluations()
    cache.set("score:reranker", json.dumps(reranker_scores))
else:
    reranker_scores = json.loads(cached_reranker_scores)

In [17]:
import json
from repository.qdrant_openai_repository import QdrantOpenAIRepository
from retrievers.qdrant_retriever import QdrantRetriever


def run_openai_evaluations(combinations, datasets):
    """Evaluate the OpenAI-embedding Qdrant retriever for every combination.

    Args:
        combinations: Iterable of (model, distance, dataset_key) triples.
        datasets: Pair (poquad_dataset, polqa_dataset); the PoQuAD one is used when
            "poquad" occurs in the dataset key, the PolQA one otherwise.

    Returns:
        Dict mapping "<model>-<distance>-<dataset_key>" to the metric dict returned
        by the matching evaluation function. Each result is additionally stored in
        the cache under "openai_scores:<label>" (JSON-encoded) and printed.
    """
    results: Dict[str, float] = {}
    poquad_entries, polqa_entries = datasets

    for model, distance, dataset_key in combinations:
        openai_repository = QdrantOpenAIRepository.get_repository(
            qdrant_client, model, distance, cache
        )
        openai_retriever = QdrantRetriever(openai_repository, dataset_key)

        # The dataset and the evaluation routine are chosen together from the key.
        if "poquad" in dataset_key:
            entries, evaluate = poquad_entries, run_poquad_evaluation
        else:
            entries, evaluate = polqa_entries, run_polqa_evaluation

        metrics = evaluate(entries, openai_repository, openai_retriever, dataset_key)
        label = f"{model}-{distance}-{dataset_key}"
        results[label] = metrics

        # Per-combination checkpoint, written as soon as the scores exist.
        cache.set(f"openai_scores:{label}", json.dumps(metrics))

        print(label)
        print(metrics)
        print("\n\n")

    return results

In [18]:
# Reuse the scores stored under "score:openai" when the cache returns a value;
# otherwise evaluate every OpenAI model combination and store the JSON-encoded result.
openai_scores = None
cached_openai_scores = cache.get("score:openai")

if cached_openai_scores is None:
    openai_scores = run_openai_evaluations(
        openai_model_combinations, (poquad_dataset, polqa_dataset)
    )
    cache.set("score:openai", json.dumps(openai_scores))
else:
    openai_scores = json.loads(cached_openai_scores)

Vectorizer with model text-embedding-3-large initialized
Qdrant openai collection text-embedding-3-large-Cosine repository initialized
text-embedding-3-large-Cosine-ipipan-polqa-500
{'ndcg': 0.9267680443358624, 'mrr': 0.9123190476190476, 'recall': 0.973, 'accuracy': 0.872}



Vectorizer with model text-embedding-3-large initialized
Qdrant openai collection text-embedding-3-large-Euclid repository initialized
text-embedding-3-large-Euclid-ipipan-polqa-500
{'ndcg': 0.9307503429221868, 'mrr': 0.9163190476190476, 'recall': 0.977, 'accuracy': 0.876}



Vectorizer with model text-embedding-3-large initialized
Qdrant openai collection text-embedding-3-large-Cosine repository initialized
text-embedding-3-large-Cosine-ipipan-polqa-1000
{'ndcg': 0.9346719575285637, 'mrr': 0.9195301587301588, 'recall': 0.98, 'accuracy': 0.88}



Vectorizer with model text-embedding-3-large initialized
Qdrant openai collection text-embedding-3-large-Euclid repository initialized
text-embedding-3-large-Euclid-ipipa

In [19]:
# save results as csv
import csv

def save_scores_to_csv(scores, filename):
    """Write one CSV row per scored configuration, using decimal commas.

    Args:
        scores: Mapping of configuration name to a dict with the keys "ndcg",
            "mrr", "recall" and "accuracy".
        filename: Destination path; an existing file is overwritten.

    The header is model, ndcg, mrr, recall, accuracy, sum, avg, where `sum` is the
    total of the four metrics and `avg` is that total divided by 4. Every numeric
    cell is `str(value)` with "." replaced by ","; cells that then contain the
    delimiter are quoted by the csv writer.
    """
    metric_names = ['ndcg', 'mrr', 'recall', 'accuracy']

    # newline='' is what the csv module expects of files it writes to; without it,
    # platforms that translate "\n" emit "\r\r\n" row endings (blank rows).
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['model', *metric_names, 'sum', 'avg'])
        for key, value in scores.items():
            # Same left-to-right addition order as the header columns.
            total = value["ndcg"] + value["mrr"] + value["recall"] + value["accuracy"]
            cells = [value[name] for name in metric_names] + [total, total / 4]
            writer.writerow([key] + [str(cell).replace(".", ",") for cell in cells])

In [20]:
# Split every score table by dataset: a label belongs to PoQuAD when it contains
# "poquad" and to PolQA when it contains "polqa".
es_poquad_scores = {label: metrics for label, metrics in es_scores.items() if "poquad" in label}
es_polqa_scores = {label: metrics for label, metrics in es_scores.items() if "polqa" in label}

qdrant_poquad_scores = {label: metrics for label, metrics in qdrant_scores.items() if "poquad" in label}
qdrant_polqa_scores = {label: metrics for label, metrics in qdrant_scores.items() if "polqa" in label}

hybrid_poquad_scores = {label: metrics for label, metrics in hybrid_scores.items() if "poquad" in label}
hybrid_polqa_scores = {label: metrics for label, metrics in hybrid_scores.items() if "polqa" in label}

reranker_poquad_scores = {label: metrics for label, metrics in reranker_scores.items() if "poquad" in label}
reranker_polqa_scores = {label: metrics for label, metrics in reranker_scores.items() if "polqa" in label}

openai_poquad_scores = {label: metrics for label, metrics in openai_scores.items() if "poquad" in label}
openai_polqa_scores = {label: metrics for label, metrics in openai_scores.items() if "polqa" in label}

In [21]:
# Export each per-dataset score table to its own CSV file under ../../output/
# (relative to the kernel's working directory), one PoQuAD and one PolQA file
# per backend.
save_scores_to_csv(es_poquad_scores, "../../output/es_scores_poquad.csv")
save_scores_to_csv(es_polqa_scores, "../../output/es_scores_polqa.csv")

save_scores_to_csv(qdrant_poquad_scores, "../../output/qdrant_scores_poquad.csv")
save_scores_to_csv(qdrant_polqa_scores, "../../output/qdrant_scores_polqa.csv")

save_scores_to_csv(hybrid_poquad_scores, "../../output/hybrid_scores_poquad.csv")
save_scores_to_csv(hybrid_polqa_scores, "../../output/hybrid_scores_polqa.csv")

save_scores_to_csv(reranker_poquad_scores, "../../output/reranker_scores_poquad.csv")
save_scores_to_csv(reranker_polqa_scores, "../../output/reranker_scores_polqa.csv")

save_scores_to_csv(openai_poquad_scores, "../../output/openai_scores_poquad.csv")
save_scores_to_csv(openai_polqa_scores, "../../output/openai_scores_polqa.csv")