In [1]:
import sys

# Add the parent directory to the module search path so the project imports
# in the following cells can be resolved.
# NOTE(review): "../" is relative to the kernel's working directory — confirm
# the notebook is always started from its own folder.
sys.path.append("../")

In [2]:
from common.utils import (
    get_all_es_index_combinations,
    get_all_openai_model_combinations,
    get_all_qdrant_model_combinations,
)


# Configuration combinations consumed by the evaluation runners below:
#   es_index_combinations      -> unpacked as (index, dataset_key)
#   qdrant_model_combinations  -> unpacked as (model, distance, dataset_key)
#   openai_model_combinations  -> unpacked as (model, distance, dataset_key)
es_index_combinations = get_all_es_index_combinations()
qdrant_model_combinations = get_all_qdrant_model_combinations()
openai_model_combinations = get_all_openai_model_combinations()

In [3]:
from elasticsearch import Elasticsearch
from qdrant_client import QdrantClient
from cache.cache import Cache


# Clients for services expected on localhost: Qdrant on port 6333 and
# Elasticsearch on port 9200.
qdrant_client = QdrantClient(host="localhost", port=6333)
es_client = Elasticsearch(
    hosts=["http://localhost:9200"],
)
# Shared cache instance: handed to repositories and rerankers, and used by the
# score cells to store/reload results.
cache = Cache()

In [4]:
from dataset.polqa_dataset_getter import PolqaDatasetGetter
from dataset.poquad_dataset_getter import PoquadDatasetGetter
from evaluation.polqa_retriever_evaluator import PolqaRetrieverEvaluator
from evaluation.poquad_retriever_evaluator import PoquadRetrieverEvaluator

# One getter per benchmark; each dataset is whatever get_test_dataset() returns.
poquad_dataset_getter = PoquadDatasetGetter()
polqa_dataset_getter = PolqaDatasetGetter()

poquad_dataset = poquad_dataset_getter.get_test_dataset()
polqa_dataset = polqa_dataset_getter.get_test_dataset()

# Module-level evaluators; run_poquad_evaluation and run_polqa_evaluation
# read these globals directly instead of receiving them as arguments.
poquad_evaluator = PoquadRetrieverEvaluator()
polqa_evaluator = PolqaRetrieverEvaluator()

In [5]:
from typing import Dict

def run_poquad_evaluation(dataset, repository, retriever, dataset_key):
    """Evaluate a retriever on PoQuAD-style entries and return mean metrics.

    Every entry must expose ``passage_id`` and ``question``. Metrics are
    computed with the module-level ``poquad_evaluator``.

    Args:
        dataset: Iterable of entries with ``passage_id`` and ``question``.
        repository: Object providing
            ``count_relevant_documents(passage_ids, dataset_key)``.
        retriever: Object providing ``get_relevant_passages(query)``.
        dataset_key: Dataset identifier forwarded to the repository.

    Returns:
        Dict with the mean ``ndcg``, ``mrr``, ``recall`` and ``accuracy`` of
        the scored entries. Every value is ``0.0`` when no entry was scored
        (empty dataset or all entries skipped).
    """

    def _mean(values):
        # An empty list means nothing was scored; avoid ZeroDivisionError.
        return sum(values) / len(values) if values else 0.0

    ndcgs = []
    mrrs = []
    recalls = []
    accuracies = []

    for entry in dataset:
        passage_id = entry.passage_id

        # Check relevance first so that entries which are skipped anyway do
        # not trigger a (potentially expensive) retrieval call.
        relevant_passages_count = repository.count_relevant_documents(
            [passage_id], dataset_key
        )
        if relevant_passages_count == 0:
            continue

        result = retriever.get_relevant_passages(entry.question)

        ndcg = poquad_evaluator.calculate_ndcg(result, passage_id)
        mrr = poquad_evaluator.calculate_mrr(result, passage_id)
        recall = poquad_evaluator.calculate_recall(
            result, passage_id, relevant_passages_count
        )
        accuracy = poquad_evaluator.calculate_accuracy(result, passage_id)

        # A recall outside [0, 1] is treated as a sanity-check failure: the
        # loop stops and the means cover only the entries scored so far.
        if recall < 0:
            break
        if recall > 1:
            print(dataset_key, passage_id, recall, relevant_passages_count)
            for passage, _ in result.passages:
                print(passage)
            break

        ndcgs.append(ndcg)
        mrrs.append(mrr)
        recalls.append(recall)
        accuracies.append(accuracy)

    return {
        "ndcg": _mean(ndcgs),
        "mrr": _mean(mrrs),
        "recall": _mean(recalls),
        "accuracy": _mean(accuracies),
    }

In [6]:
import random
from typing import Dict


def run_polqa_evaluation(dataset, repository, retriever, dataset_key):
    """Evaluate a retriever on PolQA-style entries and return mean metrics.

    Entries sharing the same ``question`` are merged into one evaluation
    whose relevant passage ids are collected from ``metadata["passage_id"]``
    of each of those entries. Metrics are computed with the module-level
    ``polqa_evaluator``.

    Args:
        dataset: Iterable of entries with ``question`` and a ``metadata``
            mapping containing ``"passage_id"``.
        repository: Object providing
            ``count_relevant_documents(passage_ids, dataset_key)``.
        retriever: Object providing ``get_relevant_passages(query)``.
        dataset_key: Dataset identifier forwarded to the repository.

    Returns:
        Dict with the mean ``ndcg``, ``mrr``, ``recall`` and ``accuracy`` of
        the scored questions. Every value is ``0.0`` when no question was
        scored (empty dataset or all questions skipped).
    """

    def _mean(values):
        # An empty list means nothing was scored; avoid ZeroDivisionError.
        return sum(values) / len(values) if values else 0.0

    # Group passage ids by question in a single pass. A dict keeps the
    # first-seen question order and the dataset order of the passage ids.
    # Assumes questions are hashable (e.g. strings).
    passage_ids_by_question = {}
    for entry in dataset:
        passage_ids_by_question.setdefault(entry.question, []).append(
            entry.metadata["passage_id"]
        )

    ndcgs = []
    mrrs = []
    recalls = []
    accuracies = []

    for query, passage_ids in passage_ids_by_question.items():
        # Check relevance first so that questions which are skipped anyway do
        # not trigger a (potentially expensive) retrieval call.
        relevant_passages_count = repository.count_relevant_documents(
            passage_ids, dataset_key
        )
        if relevant_passages_count == 0:
            continue

        result = retriever.get_relevant_passages(query)

        ndcg = polqa_evaluator.calculate_ndcg(result, passage_ids)
        mrr = polqa_evaluator.calculate_mrr(result, passage_ids)
        recall = polqa_evaluator.calculate_recall(
            result, passage_ids, relevant_passages_count
        )
        accuracy = polqa_evaluator.calculate_accuracy(result, passage_ids)

        # A recall outside [0, 1] is treated as a sanity-check failure: the
        # loop stops and the means cover only the questions scored so far.
        if recall < 0:
            break
        if recall > 1:
            print(dataset_key, passage_ids, recall, relevant_passages_count)
            for passage, _ in result.passages:
                print(passage)
            break

        ndcgs.append(ndcg)
        mrrs.append(mrr)
        recalls.append(recall)
        accuracies.append(accuracy)

    return {
        "ndcg": _mean(ndcgs),
        "mrr": _mean(mrrs),
        "recall": _mean(recalls),
        "accuracy": _mean(accuracies),
    }

In [7]:
from repository.es_repository import ESRepository
from repository.qdrant_repository import QdrantRepository
from rerankers.hf_reranker import HFReranker
from retrievers.es_retriever import ESRetriever

def run_es_evaluations(combinations, datasets, reranker: HFReranker = None):
    """Score every Elasticsearch (index, dataset_key) combination.

    Args:
        combinations: Iterable of ``(index, dataset_key)`` pairs.
        datasets: ``(poquad_dataset, polqa_dataset)`` pair; the PoQuAD one is
            used when ``"poquad"`` occurs in the dataset key, the PolQA one
            otherwise.
        reranker: Optional reranker forwarded to ``ESRetriever``.

    Returns:
        Dict mapping ``"{index}-{dataset_key}"`` to the metric dict returned
        by the matching evaluation function. Each result is also printed.
    """
    poquad_entries, polqa_entries = datasets
    results: Dict[str, Dict[str, float]] = {}

    for index, dataset_key in combinations:
        es_repo = ESRepository(es_client, index, cache)
        es_retriever = ESRetriever(es_repo, dataset_key, reranker)

        # The dataset key decides both the dataset and the evaluation routine.
        if "poquad" in dataset_key:
            metrics = run_poquad_evaluation(
                poquad_entries, es_repo, es_retriever, dataset_key
            )
        else:
            metrics = run_polqa_evaluation(
                polqa_entries, es_repo, es_retriever, dataset_key
            )

        label = f"{index}-{dataset_key}"
        results[label] = metrics

        print(label)
        print(metrics)
        print("\n\n")

    return results

In [8]:
import json
from common.names import PASSAGE_PREFIX_MAP, QUERY_PREFIX_MAP
from repository.qdrant_repository import QdrantRepository
from retrievers.qdrant_retriever import QdrantRetriever


def run_qdrant_evaluations(combinations, datasets, reranker: HFReranker = None):
    """Score every Qdrant (model, distance, dataset_key) combination.

    Args:
        combinations: Iterable of ``(model, distance, dataset_key)`` triples.
        datasets: ``(poquad_dataset, polqa_dataset)`` pair; the PoQuAD one is
            used when ``"poquad"`` occurs in the dataset key, the PolQA one
            otherwise.
        reranker: Optional reranker forwarded to ``QdrantRetriever``.

    Returns:
        Dict mapping ``"{model}-{distance}-{dataset_key}"`` to the metric dict
        returned by the matching evaluation function. Each result is also
        printed and stored in the cache under ``"qdrant_scores:<key>"``.
    """
    poquad_entries, polqa_entries = datasets
    results: Dict[str, Dict[str, float]] = {}

    for model, distance, dataset_key in combinations:
        # Passage/query prefixes are looked up per embedding model.
        qdrant_repo = QdrantRepository.get_repository(
            qdrant_client,
            model,
            distance,
            cache,
            PASSAGE_PREFIX_MAP[model],
            QUERY_PREFIX_MAP[model],
        )
        qdrant_retriever = QdrantRetriever(qdrant_repo, dataset_key, reranker)

        if "poquad" in dataset_key:
            metrics = run_poquad_evaluation(
                poquad_entries, qdrant_repo, qdrant_retriever, dataset_key
            )
        else:
            metrics = run_polqa_evaluation(
                polqa_entries, qdrant_repo, qdrant_retriever, dataset_key
            )

        label = f"{model}-{distance}-{dataset_key}"
        results[label] = metrics

        # Persist each combination right away, independent of the full run.
        cache.set(f"qdrant_scores:{label}", json.dumps(metrics))

        print(label)
        print(metrics)
        print("\n\n")

    return results

In [9]:
import json

# Reuse the Elasticsearch scores stored under "score:es" when present;
# otherwise run the full evaluation and store its result as JSON.
es_scores = None
cached_es_scores = cache.get("score:es")

if cached_es_scores is None:
    es_scores = run_es_evaluations(
        es_index_combinations,
        (poquad_dataset, polqa_dataset),
    )
    cache.set("score:es", json.dumps(es_scores))
else:
    es_scores = json.loads(cached_es_scores)

In [10]:
# Reuse the Qdrant scores stored under "score:qdrant" when present;
# otherwise run the full evaluation and store its result as JSON.
qdrant_scores = None
cached_qdrant_scores = cache.get("score:qdrant")

if cached_qdrant_scores is None:
    qdrant_scores = run_qdrant_evaluations(
        qdrant_model_combinations,
        (poquad_dataset, polqa_dataset),
    )
    cache.set("score:qdrant", json.dumps(qdrant_scores))
else:
    qdrant_scores = json.loads(cached_qdrant_scores)

In [11]:
print("ES Scores")
print(es_scores)

ES Scores
{'basic_index-ipipan-polqa-500': {'ndcg': 0.9033158644767426, 'mrr': 0.9116864212934082, 'recall': 0.7845671310736813, 'accuracy': 0.8820960698689956}, 'basic_index-ipipan-polqa-1000': {'ndcg': 0.9031070620942475, 'mrr': 0.9116864212934082, 'recall': 0.7938795314341166, 'accuracy': 0.8820960698689956}, 'basic_index-ipipan-polqa-2000': {'ndcg': 0.9031070620942475, 'mrr': 0.9116864212934082, 'recall': 0.7938795314341166, 'accuracy': 0.8820960698689956}, 'basic_index-ipipan-polqa-100000': {'ndcg': 0.9031070620942475, 'mrr': 0.9116864212934082, 'recall': 0.7938795314341166, 'accuracy': 0.8820960698689956}, 'basic_index-clarin-pl-poquad-500': {'ndcg': 0.6758059968759683, 'mrr': 0.6565473218743314, 'recall': 0.50520201840097, 'accuracy': 0.5888666999500749}, 'basic_index-clarin-pl-poquad-1000': {'ndcg': 0.6962586758152063, 'mrr': 0.6673856279073452, 'recall': 0.73139457480446, 'accuracy': 0.5998502246630055}, 'basic_index-clarin-pl-poquad-2000': {'ndcg': 0.7089806720640973, 'mrr': 

In [12]:
# Sort ES scores by the sum of all metric values of each configuration,
# highest total first. The result is a list of (key, metrics) pairs.
es_scores_sorted = sorted(es_scores.items(), key=lambda x: sum(x[1].values()), reverse=True)

In [13]:
print([x[0] for x in es_scores_sorted])

['morfologik_stopwords_index-ipipan-polqa-1000', 'morfologik_stopwords_index-ipipan-polqa-2000', 'morfologik_stopwords_index-ipipan-polqa-100000', 'morfologik_whitespace_index-ipipan-polqa-1000', 'morfologik_whitespace_index-ipipan-polqa-2000', 'morfologik_whitespace_index-ipipan-polqa-100000', 'morfologik_index-ipipan-polqa-1000', 'morfologik_index-ipipan-polqa-2000', 'morfologik_index-ipipan-polqa-100000', 'morfologik_stopwords_index-ipipan-polqa-500', 'morfologik_whitespace_index-ipipan-polqa-500', 'morfologik_index-ipipan-polqa-500', 'polish_index-ipipan-polqa-1000', 'polish_index-ipipan-polqa-2000', 'polish_index-ipipan-polqa-100000', 'polish_index-ipipan-polqa-500', 'polish_whitespace_index-ipipan-polqa-1000', 'polish_whitespace_index-ipipan-polqa-2000', 'polish_whitespace_index-ipipan-polqa-100000', 'polish_whitespace_index-ipipan-polqa-500', 'polish_stopwords_index-ipipan-polqa-1000', 'polish_stopwords_index-ipipan-polqa-2000', 'polish_stopwords_index-ipipan-polqa-100000', 'pol

In [14]:
# Sort Qdrant scores by the sum of all metric values of each configuration,
# highest total first. The result is a list of (key, metrics) pairs.
qdrant_scores_sorted = sorted(qdrant_scores.items(), key=lambda x: sum(x[1].values()), reverse=True)

In [15]:
print([x[0] for x in qdrant_scores_sorted])

['sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000', 'sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-2000', 'sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-100000', 'sdadas/mmlw-retrieval-roberta-large-Euclid-ipipan-polqa-1000', 'sdadas/mmlw-retrieval-roberta-large-Euclid-ipipan-polqa-2000', 'sdadas/mmlw-retrieval-roberta-large-Euclid-ipipan-polqa-100000', 'BAAI/bge-m3-Cosine-ipipan-polqa-1000', 'BAAI/bge-m3-Euclid-ipipan-polqa-1000', 'BAAI/bge-m3-Cosine-ipipan-polqa-2000', 'BAAI/bge-m3-Euclid-ipipan-polqa-2000', 'BAAI/bge-m3-Cosine-ipipan-polqa-100000', 'BAAI/bge-m3-Euclid-ipipan-polqa-100000', 'intfloat/multilingual-e5-large-Cosine-ipipan-polqa-1000', 'intfloat/multilingual-e5-large-Euclid-ipipan-polqa-1000', 'intfloat/multilingual-e5-large-Cosine-ipipan-polqa-2000', 'intfloat/multilingual-e5-large-Euclid-ipipan-polqa-2000', 'intfloat/multilingual-e5-large-Cosine-ipipan-polqa-100000', 'intfloat/multilingual-e5-large-Euclid-ipipan-polqa-100000', 'sdadas/m

In [16]:
from qdrant_client.models import Distance

# Top entry of the sorted Qdrant ranking printed above:
#   sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000

# Each tuple is (dataset_key, es_index, qdrant_model, qdrant_distance), the
# order in which run_hybrid_evaluations unpacks it.
hybrid_combinations = [
    (
        "clarin-pl-poquad-100000",
        "morfologik_index",
        "intfloat/multilingual-e5-large",
        Distance.COSINE,
    ),
    (
        "ipipan-polqa-1000",
        "morfologik_index",
        "sdadas/mmlw-retrieval-roberta-large",
        Distance.COSINE,
    ),
]

In [17]:
from repository import qdrant_repository
from repository.es_repository import ESRepository
from retrievers.es_retriever import ESRetriever
from retrievers.hybrid_retriever import HybridRetriever


def run_hybrid_evaluations(combinations, datasets, alphas: list[float], reranker: HFReranker = None):
    """Score hybrid (Elasticsearch + Qdrant) retrieval for several alpha values.

    Args:
        combinations: Iterable of
            ``(dataset_key, es_index, qdrant_model, qdrant_distance)`` tuples.
        datasets: ``(poquad_dataset, polqa_dataset)`` pair; the PoQuAD one is
            used when ``"poquad"`` occurs in the dataset key, the PolQA one
            otherwise.
        alphas: Alpha values forwarded one at a time to ``HybridRetriever``
            (floats such as ``0.25``).
        reranker: Optional reranker forwarded to ``HybridRetriever``.

    Returns:
        Dict mapping
        ``"{es_index}-{qdrant_model}-{qdrant_distance}-{dataset_key}-{alpha}"``
        to the metric dict of that run. Each result is also printed.
    """
    hybrid_scores: Dict[str, Dict[str, float]] = {}

    poquad_dataset, polqa_dataset = datasets

    for dataset_key, es_index, qdrant_model, qdrant_distance in combinations:
        # Both repositories depend only on the combination, not on alpha, so
        # they are built once per combination instead of once per alpha.
        es_repository = ESRepository(es_client, es_index, cache)
        # Named qdrant_repo so the imported `qdrant_repository` module is not
        # shadowed inside this function.
        qdrant_repo = QdrantRepository.get_repository(
            qdrant_client,
            qdrant_model,
            qdrant_distance,
            cache,
            PASSAGE_PREFIX_MAP[qdrant_model],
            QUERY_PREFIX_MAP[qdrant_model],
        )

        is_poquad = "poquad" in dataset_key
        selected_dataset = poquad_dataset if is_poquad else polqa_dataset
        evaluator_func = run_poquad_evaluation if is_poquad else run_polqa_evaluation

        for alpha in alphas:
            retriever = HybridRetriever(
                es_repository, qdrant_repo, dataset_key, alpha, reranker
            )

            # Relevant-document counts are taken from the ES repository.
            scores = evaluator_func(
                selected_dataset, es_repository, retriever, dataset_key
            )

            score_key = (
                f"{es_index}-{qdrant_model}-{qdrant_distance}-{dataset_key}-{alpha}"
            )
            hybrid_scores[score_key] = scores

            print(score_key)
            print(scores)
            print("\n\n")

    return hybrid_scores

In [18]:
# Reuse the hybrid scores stored under "score:hybrid" when present;
# otherwise evaluate every hybrid combination for each alpha value below and
# store the result as JSON.
hybrid_scores = None
cached_hybrid_scores = cache.get("score:hybrid")

if cached_hybrid_scores is None:
    hybrid_scores = run_hybrid_evaluations(
        hybrid_combinations,
        (poquad_dataset, polqa_dataset),
        [0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75],
    )
    cache.set("score:hybrid", json.dumps(hybrid_scores))
else:
    hybrid_scores = json.loads(cached_hybrid_scores)

In [19]:
# (es_index, dataset_key) pairs evaluated with each reranker.
es_reranker_combinations = [
    ("morfologik_index", "clarin-pl-poquad-100000"),
    ("basic_index", "clarin-pl-poquad-500"),
    ("morfologik_index", "ipipan-polqa-1000"),
    ("basic_index", "ipipan-polqa-500"),
]

# (qdrant_model, distance, dataset_key) triples evaluated with each reranker.
qdrant_reranker_combinations = [
    ("intfloat/multilingual-e5-large", Distance.COSINE, "clarin-pl-poquad-100000"),
    ("sdadas/mmlw-roberta-large", Distance.EUCLID, "clarin-pl-poquad-500"),
    ("sdadas/mmlw-retrieval-roberta-large", Distance.COSINE, "ipipan-polqa-1000"),
    ("sdadas/mmlw-roberta-large", Distance.EUCLID, "ipipan-polqa-500"),
]

In [20]:
from common.names import RERANKER_MODEL_NAMES


def run_reranker_evaluations(sample_size: int = 500, seed: int = 42):
    """Evaluate every reranker on a random subset of both datasets.

    For each model in ``RERANKER_MODEL_NAMES`` the ES, Qdrant and hybrid
    evaluations are run on the same sampled datasets, using the module-level
    ``es_reranker_combinations``, ``qdrant_reranker_combinations`` and
    ``hybrid_combinations``.

    Args:
        sample_size: Maximum number of entries drawn from each dataset. When a
            dataset is smaller, all of its entries are used instead of
            raising ``ValueError``.
        seed: Seed of the private random generator used for sampling, so that
            repeated runs draw the same subset. Pass ``None`` for a
            non-deterministic sample.

    Returns:
        Dict mapping ``"<evaluation key>-<reranker model>"`` to the metric
        dict of that run.
    """
    reranker_scores: Dict[str, Dict[str, float]] = {}

    # A private generator keeps sampling reproducible without touching the
    # global `random` state.
    rng = random.Random(seed)
    random_poquad_dataset = rng.sample(
        poquad_dataset, min(sample_size, len(poquad_dataset))
    )
    # NOTE(review): PolQA entries are sampled individually, so entries that
    # share a question may be only partially present in the sample — confirm
    # this is intended, since run_polqa_evaluation groups entries by question.
    random_polqa_dataset = rng.sample(
        polqa_dataset, min(sample_size, len(polqa_dataset))
    )

    datasets = (random_poquad_dataset, random_polqa_dataset)

    for reranker_model in RERANKER_MODEL_NAMES:
        reranker = HFReranker(reranker_model, cache)

        es_reranker_scores = run_es_evaluations(
            es_reranker_combinations, datasets, reranker
        )
        for key, value in es_reranker_scores.items():
            reranker_scores[f"{key}-{reranker_model}"] = value

        qdrant_reranker_scores = run_qdrant_evaluations(
            qdrant_reranker_combinations,
            datasets,
            reranker,
        )
        for key, value in qdrant_reranker_scores.items():
            reranker_scores[f"{key}-{reranker_model}"] = value

        hybrid_reranker_scores = run_hybrid_evaluations(
            hybrid_combinations,
            datasets,
            [0.25, 0.5, 0.75],
            reranker,
        )
        for key, value in hybrid_reranker_scores.items():
            reranker_scores[f"{key}-{reranker_model}"] = value

    return reranker_scores

In [21]:
# Reuse the reranker scores stored under "score:reranker" when present;
# otherwise run the reranker evaluation and store its result as JSON.
reranker_scores = None
cached_reranker_scores = cache.get("score:reranker")

if cached_reranker_scores is None:
    reranker_scores = run_reranker_evaluations()
    cache.set("score:reranker", json.dumps(reranker_scores))
else:
    reranker_scores = json.loads(cached_reranker_scores)

In [22]:
print(reranker_scores)

{'morfologik_index-clarin-pl-poquad-100000-sdadas/polish-reranker-large-ranknet': {'ndcg': 0.9331009295026591, 'mrr': 0.9288095238095238, 'recall': 0.9457142857142857, 'accuracy': 0.9171428571428571}, 'basic_index-clarin-pl-poquad-500-sdadas/polish-reranker-large-ranknet': {'ndcg': 0.8172343604147756, 'mrr': 0.8199761904761905, 'recall': 0.5564965986394558, 'accuracy': 0.8085714285714286}, 'morfologik_index-ipipan-polqa-1000-sdadas/polish-reranker-large-ranknet': {'ndcg': 0.6349354667257144, 'mrr': 0.5295826932923707, 'recall': 0.956989247311828, 'accuracy': 0.3118279569892473}, 'basic_index-ipipan-polqa-500-sdadas/polish-reranker-large-ranknet': {'ndcg': 0.5760611674495155, 'mrr': 0.4836277521761393, 'recall': 0.8333333333333334, 'accuracy': 0.2903225806451613}, 'intfloat/multilingual-e5-large-Cosine-clarin-pl-poquad-100000-sdadas/polish-reranker-large-ranknet': {'ndcg': 0.9414064410232674, 'mrr': 0.9317210884353742, 'recall': 0.9714285714285714, 'accuracy': 0.9114285714285715}, 'sdad

In [23]:
import json
from repository.qdrant_openai_repository import QdrantOpenAIRepository
from retrievers.qdrant_retriever import QdrantRetriever


def run_openai_evaluations(combinations, datasets):
    """Score every OpenAI-embedding (model, distance, dataset_key) combination.

    Args:
        combinations: Iterable of ``(model, distance, dataset_key)`` triples.
        datasets: ``(poquad_dataset, polqa_dataset)`` pair; the PoQuAD one is
            used when ``"poquad"`` occurs in the dataset key, the PolQA one
            otherwise.

    Returns:
        Dict mapping ``"{model}-{distance}-{dataset_key}"`` to the metric dict
        returned by the matching evaluation function. Each result is also
        printed and stored in the cache under ``"openai_scores:<key>"``.
    """
    poquad_entries, polqa_entries = datasets
    results: Dict[str, Dict[str, float]] = {}

    for model, distance, dataset_key in combinations:
        openai_repo = QdrantOpenAIRepository.get_repository(
            qdrant_client, model, distance, cache
        )
        openai_retriever = QdrantRetriever(openai_repo, dataset_key)

        if "poquad" in dataset_key:
            metrics = run_poquad_evaluation(
                poquad_entries, openai_repo, openai_retriever, dataset_key
            )
        else:
            metrics = run_polqa_evaluation(
                polqa_entries, openai_repo, openai_retriever, dataset_key
            )

        label = f"{model}-{distance}-{dataset_key}"
        results[label] = metrics

        # Persist each combination right away, independent of the full run.
        cache.set(f"openai_scores:{label}", json.dumps(metrics))

        print(label)
        print(metrics)
        print("\n\n")

    return results

In [24]:
# Reuse the OpenAI scores stored under "score:openai" when present;
# otherwise run the full evaluation and store its result as JSON.
openai_scores = None
cached_openai_scores = cache.get("score:openai")

if cached_openai_scores is None:
    openai_scores = run_openai_evaluations(
        openai_model_combinations,
        (poquad_dataset, polqa_dataset),
    )
    cache.set("score:openai", json.dumps(openai_scores))
else:
    openai_scores = json.loads(cached_openai_scores)

In [25]:
# save results as csv
import csv

def save_scores_to_csv(scores, filename):
    """Write evaluation scores to a CSV file, one row per configuration.

    Columns are ``model, ndcg, mrr, recall, accuracy, sum, avg`` where ``sum``
    and ``avg`` are the total and the mean of the four metrics. Numbers are
    written with a decimal comma instead of a decimal point; because the
    column delimiter is also a comma, the csv writer quotes those fields.

    Args:
        scores: Mapping of configuration key to a dict containing the keys
            ``"ndcg"``, ``"mrr"``, ``"recall"`` and ``"accuracy"``.
        filename: Path of the CSV file; an existing file is overwritten.
    """
    metric_names = ("ndcg", "mrr", "recall", "accuracy")

    def _decimal_comma(number):
        # Render a number with "," as the decimal separator.
        return str(number).replace(".", ",")

    # newline="" lets the csv module control line endings; without it every
    # row is followed by a blank line on Windows. The encoding is explicit so
    # the output does not depend on the platform default.
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["model", *metric_names, "sum", "avg"])
        for key, value in scores.items():
            total = value["ndcg"] + value["mrr"] + value["recall"] + value["accuracy"]
            writer.writerow(
                [
                    key,
                    *(_decimal_comma(value[name]) for name in metric_names),
                    _decimal_comma(total),
                    _decimal_comma(total / 4),
                ]
            )

In [27]:
def _split_scores_by_dataset(scores):
    """Return the (poquad, polqa) sub-dicts of ``scores``.

    An item goes into the first dict when its key contains "poquad" and into
    the second when its key contains "polqa"; item order is preserved.
    """
    poquad_part = {key: value for key, value in scores.items() if "poquad" in key}
    polqa_part = {key: value for key, value in scores.items() if "polqa" in key}
    return poquad_part, polqa_part


es_poquad_scores, es_polqa_scores = _split_scores_by_dataset(es_scores)
qdrant_poquad_scores, qdrant_polqa_scores = _split_scores_by_dataset(qdrant_scores)
hybrid_poquad_scores, hybrid_polqa_scores = _split_scores_by_dataset(hybrid_scores)
reranker_poquad_scores, reranker_polqa_scores = _split_scores_by_dataset(reranker_scores)
openai_poquad_scores, openai_polqa_scores = _split_scores_by_dataset(openai_scores)

In [28]:
# Export every per-dataset score table to its own CSV file, in this order.
_csv_exports = [
    (es_poquad_scores, "../../output/es_scores_poquad.csv"),
    (es_polqa_scores, "../../output/es_scores_polqa.csv"),
    (qdrant_poquad_scores, "../../output/qdrant_scores_poquad.csv"),
    (qdrant_polqa_scores, "../../output/qdrant_scores_polqa.csv"),
    (hybrid_poquad_scores, "../../output/hybrid_scores_poquad.csv"),
    (hybrid_polqa_scores, "../../output/hybrid_scores_polqa.csv"),
    (reranker_poquad_scores, "../../output/reranker_scores_poquad.csv"),
    (reranker_polqa_scores, "../../output/reranker_scores_polqa.csv"),
    (openai_poquad_scores, "../../output/openai_scores_poquad.csv"),
    (openai_polqa_scores, "../../output/openai_scores_polqa.csv"),
]

for _score_table, _csv_path in _csv_exports:
    save_scores_to_csv(_score_table, _csv_path)