In [1]:
import sys

sys.path.append("../")

In [2]:
from elasticsearch import Elasticsearch
from qdrant_client import QdrantClient
from cache.cache import Cache


qdrant_client = QdrantClient(host="localhost", port=6333)
es_client = Elasticsearch(
    hosts=["http://localhost:9200"],
)
cache = Cache()

We'll run tests, but not for every possible combination.
We want to focus more on the differences between the worst and the best models we have.
We'll select few models, starting with the worst to the best.

Combinations for poquad dataset
Best one - morfologik_index-intfloat/multilingual-e5-large-Cosine-clarin-pl-poquad-100000-0.5-sdadas/polish-reranker-large-ranknet

50 percentile - BAAI/bge-m3-Euclid-clarin-pl-poquad-500

Worst one - basic_index-clarin-pl-poquad-500

Combinations for polqa dataset
Best one - morfologik_index-sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000-0.05

50 percentile - sdadas/mmlw-roberta-large-Cosine-ipipan-polqa-2000

Worst one - basic_index-ipipan-polqa-500-sdadas/polish-reranker-large-ranknet


Each dataset will be run with 2 QA models and 2 Instruction models

To run tests faster we'll select smaller batches of questions. Probably 500.

In [3]:
from common.names import  OPENAI_EMBEDDING_MODEL_NAMES, PASSAGE_PREFIX_MAP, QUERY_PREFIX_MAP
from repository.es_repository import ESRepository
from repository.qdrant_openai_repository import QdrantOpenAIRepository
from repository.qdrant_repository import QdrantRepository
from qdrant_client.models import Distance

from rerankers.hf_reranker import HFReranker
from retrievers.es_retriever import ESRetriever
from retrievers.hybrid_retriever import HybridRetriever
from retrievers.qdrant_retriever import QdrantRetriever
from retrievers.retriever import Retriever


def get_best_poquad_retriever() -> tuple[Retriever, str]:
    dataset_key = "clarin-pl-poquad-100000"
    es_index = "morfologik_index"
    qdrant_model = "intfloat/multilingual-e5-large"
    reranker_model = "sdadas/polish-reranker-large-ranknet"
    alpha = 0.5

    es_repository = ESRepository(es_client, es_index, cache)
    passage_prefix = PASSAGE_PREFIX_MAP[qdrant_model]
    query_prefix = QUERY_PREFIX_MAP[qdrant_model]
    qdrant_repository = QdrantRepository.get_repository(
        qdrant_client,
        qdrant_model,
        Distance.COSINE,
        cache,
        passage_prefix,
        query_prefix,
    )
    reranker = HFReranker(reranker_model, cache)

    retriever = HybridRetriever(
        es_repository, qdrant_repository, dataset_key, alpha, reranker
    )

    return (retriever, "morfologik_index-intfloat/multilingual-e5-large-Cosine-clarin-pl-poquad-100000-0.5-sdadas/polish-reranker-large-ranknet")


def get_50p_poquad_retriever() -> tuple[Retriever, str]:
    dataset_key = "clarin-pl-poquad-1000"
    qdrant_model = "sdadas/mmlw-retrieval-roberta-large"

    passage_prefix = PASSAGE_PREFIX_MAP[qdrant_model]
    query_prefix = QUERY_PREFIX_MAP[qdrant_model]
    qdrant_repository = QdrantRepository.get_repository(
        qdrant_client,
        qdrant_model,
        Distance.EUCLID,
        cache,
        passage_prefix,
        query_prefix,
    )

    retriever = QdrantRetriever(qdrant_repository, dataset_key)

    return (
        retriever,
        "sdadas/mmlw-retrieval-roberta-large-Euclid-clarin-pl-poquad-1000",
    )


def get_worst_poquad_retriever() -> tuple[Retriever, str]:
    dataset_key = "clarin-pl-poquad-500"
    es_index = "basic_index"

    es_repository = ESRepository(es_client, es_index, cache)

    retriever = ESRetriever(es_repository, dataset_key)

    return (retriever, "basic_index-clarin-pl-poquad-500")


def get_best_poquad_openai_retriever() -> tuple[Retriever, str]:
    repository = QdrantOpenAIRepository.get_repository(
        qdrant_client, OPENAI_EMBEDDING_MODEL_NAMES[0], Distance.COSINE, cache
    )

    retriever = QdrantRetriever(repository, "clarin-pl-poquad-2000")

    return (retriever, "text-embedding-3-large-Cosine-clarin-pl-poquad-2000")


def get_worst_poquad_openai_retriever() -> tuple[Retriever, str]:
    repository = QdrantOpenAIRepository.get_repository(
        qdrant_client, OPENAI_EMBEDDING_MODEL_NAMES[0], Distance.COSINE, cache
    )

    retriever = QdrantRetriever(repository, "clarin-pl-poquad-500")

    return (retriever, "text-embedding-3-large-Cosine-clarin-pl-poquad-500")


def get_best_polqa_retriever() -> tuple[Retriever, str]:
    dataset_key = "ipipan-polqa-1000"
    es_index = "morfologik_index"
    qdrant_model = "sdadas/mmlw-retrieval-roberta-large"
    reranker_model = "sdadas/polish-reranker-large-ranknet"
    alpha = 0.75

    es_repository = ESRepository(es_client, es_index, cache)
    passage_prefix = PASSAGE_PREFIX_MAP[qdrant_model]
    query_prefix = QUERY_PREFIX_MAP[qdrant_model]
    qdrant_repository = QdrantRepository.get_repository(
        qdrant_client,
        qdrant_model,
        Distance.COSINE,
        cache,
        passage_prefix,
        query_prefix,
    )
    reranker = HFReranker(reranker_model, cache)

    retriever = HybridRetriever(
        es_repository, qdrant_repository, dataset_key, alpha, reranker
    )

    return (
        retriever,
        "morfologik_index-sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000-0.5-sdadas/polish-reranker-large-ranknet",
    )


def get_50p_polqa_retriever() -> tuple[Retriever, str]:
    dataset_key = "ipipan-polqa-1000"
    es_index = "morfologik_index"
    qdrant_model = "sdadas/mmlw-retrieval-roberta-large"
    alpha = 0.75

    es_repository = ESRepository(es_client, es_index, cache)
    passage_prefix = PASSAGE_PREFIX_MAP[qdrant_model]
    query_prefix = QUERY_PREFIX_MAP[qdrant_model]
    qdrant_repository = QdrantRepository.get_repository(
        qdrant_client,
        qdrant_model,
        Distance.COSINE,
        cache,
        passage_prefix,
        query_prefix,
    )

    retriever = HybridRetriever(
        es_repository, qdrant_repository, dataset_key, alpha
    )

    return (
        retriever,
        "morfologik_index-sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000-0.75",
    )


def get_worst_polqa_retriever() -> tuple[Retriever, str]:
    dataset_key = "ipipan-polqa-500"
    es_index = "basic_index"

    es_repository = ESRepository(es_client, es_index, cache)

    retriever = ESRetriever(
        es_repository, dataset_key
    )

    return (
        retriever,
        "basic_index-ipipan-polqa-500",
    )


def get_best_polqa_openai_retriever() -> tuple[Retriever, str]:
    repository = QdrantOpenAIRepository.get_repository(
        qdrant_client, OPENAI_EMBEDDING_MODEL_NAMES[0], Distance.EUCLID, cache
    )

    retriever = QdrantRetriever(repository, "ipipan-polqa-2000")

    return (retriever, "text-embedding-3-large-Euclid-ipipan-polqa-2000")


def get_worst_polqa_openai_retriever() -> tuple[Retriever, str]:
    repository = QdrantOpenAIRepository.get_repository(
        qdrant_client, OPENAI_EMBEDDING_MODEL_NAMES[0], Distance.COSINE, cache
    )

    retriever = QdrantRetriever(repository, "ipipan-polqa-500")

    return (retriever, "text-embedding-3-large-Cosine-ipipan-polqa-500")

In [4]:
poquad_retriever_functions = [
    get_best_poquad_retriever,
    get_50p_poquad_retriever,
    get_worst_poquad_retriever,
]

poquad_openai_retriever_functions = [
    get_best_poquad_openai_retriever,
    get_worst_poquad_openai_retriever,
]

polqa_retriever_functions = [
    get_best_polqa_retriever,
    get_50p_polqa_retriever,
    get_worst_polqa_retriever,
]

polqa_openai_retriever_functions = [
    get_best_polqa_openai_retriever,
    get_worst_polqa_openai_retriever,
]

In [5]:
from common.names import DATASET_SEED
from dataset.polqa_dataset_getter import PolqaDatasetGetter
from dataset.poquad_dataset_getter import PoquadDatasetGetter


poquad_dataset_getter = PoquadDatasetGetter()
polqa_dataset_getter = PolqaDatasetGetter()

poquad_dataset = poquad_dataset_getter.get_random_n_test(500, DATASET_SEED)[:100]
polqa_dataset = polqa_dataset_getter.get_random_n_test(500, DATASET_SEED)[:100]

In [6]:
ns = [1, 5]

In [7]:
from ast import Dict
from common.dataset_entry import DatasetEntry
from evaluation.qa_evaluator import QAEvaluator
from generators.generator import Generator

qa_evaluator = QAEvaluator()


def run_qa_evaluation(retriever: Retriever, generator: Generator, dataset: list[DatasetEntry], name, generator_name, n):
    scores: Dict[str, float] = {}

    em = []
    f1 = []

    i = 0

    for entry in dataset:
        if i  % 10 == 0:
            print(f"Processing {i} out of {len(dataset)}")

        question = entry.question
        correct_answers = entry.answers

        retriever_result = retriever.get_relevant_passages(question)
        passages = [passage for (passage, _) in retriever_result.passages]
        top_n_passages = passages[:n]

        answer = generator.generate_answer(question, top_n_passages)

        exact_match_score = qa_evaluator.calculate_em(answer, correct_answers)
        f1_score = qa_evaluator.calculate_f1_score(answer, correct_answers)

        em.append(exact_match_score)
        f1.append(f1_score)

        i += 1

    scores["em"] = sum(em) / len(em)
    scores["f1"] = sum(f1) / len(f1)

    print(f"name: {name}, generator: {generator_name}, n: {n}, em: {scores['em']}, f1: {scores['f1']}")

    return scores

In [8]:
from evaluation.hallucination_evaluator import HallucinationEvaluator


def run_instruction_evaluation(retriever: Retriever, generator: Generator, dataset: list[DatasetEntry], name, generator_name, n):
    # hallucination_evaluator = HallucinationEvaluator(cache)

    scores: Dict[str, float] = {}

    confidence_score = []
    above = 0
    below = 0

    i = 0

    for entry in dataset:
        if i  % 10 == 0:
            print(f"Processing {i} out of {len(dataset)}")

        question = entry.question

        retriever_result = retriever.get_relevant_passages(question)
        passages = [passage for (passage, _) in retriever_result.passages]
        top_n_passages = passages[:n]

        answer = generator.generate_answer(question, top_n_passages)

        # score = hallucination_evaluator.calculate(question, answer, top_n_passages)
        score = 0.0

        confidence_score.append(score)
        if score >= 0.5:
            above += 1
        else:
            below += 1

        i += 1

    scores["confidence"] = sum(confidence_score) / len(confidence_score)
    scores["above"] = above
    scores["below"] = below

    print(f"name: {name}, generator: {generator_name}, n: {n}, confidence: {scores['confidence']}, above: {scores['above']}, below: {scores['below']}")

    return scores

In [9]:
from ast import Dict

from common.names import INST_MODEL_PATHS, QA_MODEL_NAMES
from generators.instruction_generator import InstructionGenerator
from generators.openai_generator import OpenAIGenerator
from generators.question_answering_generator import QuestionAnsweringGenerator


def run_poquad_evaluations():
    poquad_scores: Dict[str, float] = {}

    for retriever_func in poquad_retriever_functions:
        (retriever, name) = retriever_func()

        for n in ns:
            for qa_model_name in QA_MODEL_NAMES:
                generator = QuestionAnsweringGenerator(qa_model_name, cache)
                scores = run_qa_evaluation(
                    retriever, generator, poquad_dataset, name, qa_model_name, n
                )
                poquad_scores[f"{name}-QA-{qa_model_name}-{n}"] = scores

            for instruction_models in INST_MODEL_PATHS:
                generator = InstructionGenerator(instruction_models, cache)

                scores = run_instruction_evaluation(
                    retriever, generator, poquad_dataset, name, instruction_models, n
                )

                poquad_scores[f"{name}-INST-{instruction_models}-{n}"] = scores

    for retriever_func in poquad_openai_retriever_functions:
        (retriever, name) = retriever_func()

        for n in ns:
            generator = OpenAIGenerator(cache)
            scores = run_instruction_evaluation(
                retriever, generator, poquad_dataset, name, "gpt-4o-mini", n
            )
            poquad_scores[f"{name}-QA-gpt-4o-mini-{n}"] = scores

    return poquad_scores

In [10]:
def run_polqa_evaluations():
    polqa_scores: Dict[str, float] = {}

    for retriever_func in polqa_retriever_functions:
        (retriever, name) = retriever_func()

        for n in ns:
            for qa_model_name in QA_MODEL_NAMES:
                generator = QuestionAnsweringGenerator(qa_model_name, cache)
                scores = run_qa_evaluation(retriever, generator, polqa_dataset, name, qa_model_name, n)
                polqa_scores[f"{name}-QA-{qa_model_name}-{n}"] = scores

            for instruction_models in INST_MODEL_PATHS:
                generator = InstructionGenerator(instruction_models, cache)

                scores = run_instruction_evaluation(
                    retriever, generator, polqa_dataset, name, instruction_models, n
                )

                polqa_scores[f"{name}-INST-{instruction_models}-{n}"] = scores

    for retriever_func in polqa_openai_retriever_functions:
        (retriever, name) = retriever_func()

        for n in ns:
            generator = OpenAIGenerator(cache)
            scores = run_instruction_evaluation(
                retriever, generator, polqa_dataset, name, "gpt-4o-mini", n
            )
            polqa_scores[f"{name}-QA-gpt-4o-mini-{n}"] = scores

    return polqa_scores

In [11]:
import json

polqa_scores = None
cached_polqa_scores = cache.get("score:generator_polqa")

if cached_polqa_scores is not None:
    polqa_scores = json.loads(cached_polqa_scores)
else:
    polqa_scores = run_polqa_evaluations()
    cache.set("score:generator_polqa", json.dumps(polqa_scores))

Vectorizer with model sdadas/mmlw-retrieval-roberta-large initialized
Qdrant collection sdadas-mmlw-retrieval-roberta-large-Cosine repository initialized
Vectorizer with model sdadas/polish-reranker-large-ranknet initialized
Processing 0 out of 100
Processing 10 out of 100
Processing 20 out of 100
Processing 30 out of 100
Processing 40 out of 100
Processing 50 out of 100
Processing 60 out of 100
Processing 70 out of 100
Processing 80 out of 100
Processing 90 out of 100
name: morfologik_index-sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000-0.5-sdadas/polish-reranker-large-ranknet, generator: radlab/polish-qa-v2, n: 1, em: 0.28, f1: 0.41923809523809524




Processing 0 out of 100
Processing 10 out of 100
Processing 20 out of 100
Processing 30 out of 100
Processing 40 out of 100
Processing 50 out of 100
Processing 60 out of 100
Processing 70 out of 100
Processing 80 out of 100
Processing 90 out of 100
name: morfologik_index-sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000-0.5-sdadas/polish-reranker-large-ranknet, generator: timpal0l/mdeberta-v3-base-squad2, n: 1, em: 0.24, f1: 0.40114285714285713
Processing 0 out of 100
Processing 10 out of 100
Processing 20 out of 100
Processing 30 out of 100
Processing 40 out of 100
Processing 50 out of 100
Processing 60 out of 100
Processing 70 out of 100
Processing 80 out of 100
Processing 90 out of 100
name: morfologik_index-sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000-0.5-sdadas/polish-reranker-large-ranknet, generator: ../../models/Bielik-11B-v2.2-Instruct-q4, n: 1, confidence: 0.0, above: 0, below: 100
Processing 0 out of 100
Processing 10 out of 100
Processing 20 out o

In [12]:

poquad_scores = None
cached_poquad_scores = cache.get("score:generator_poquad")

if cached_poquad_scores is not None:
    poquad_scores = json.loads(cached_poquad_scores)
else:
    poquad_scores = run_poquad_evaluations()
    cache.set("score:generator_poquad", json.dumps(poquad_scores))

In [13]:
# save results as csv
import csv


def save_scores_to_csv(scores, filename):
    with open(filename, mode="w") as file:
        writer = csv.writer(file)
        writer.writerow(["model", "em", "f1"])
        
        for key, value in scores.items():
            print(key, value)
            if "em" in value and "f1" in value:            
                writer.writerow(
                    [
                        key,
                        str(value["em"]).replace(".", ","),
                        str(value["f1"]).replace(".", ","),
                    ]
                )

In [14]:
# save_scores_to_csv(poquad_scores, "../../output/generator_poquad_scores.csv")
save_scores_to_csv(polqa_scores, "../../output/generator_polqa_scores.csv")

morfologik_index-sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000-0.5-sdadas/polish-reranker-large-ranknet-QA-radlab/polish-qa-v2-1 {'em': 0.28, 'f1': 0.41923809523809524}
morfologik_index-sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000-0.5-sdadas/polish-reranker-large-ranknet-QA-timpal0l/mdeberta-v3-base-squad2-1 {'em': 0.24, 'f1': 0.40114285714285713}
morfologik_index-sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000-0.5-sdadas/polish-reranker-large-ranknet-INST-../../models/Bielik-11B-v2.2-Instruct-q4-1 {'confidence': 0.0, 'above': 0, 'below': 100}
morfologik_index-sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000-0.5-sdadas/polish-reranker-large-ranknet-INST-../../models/Mistral-7B-Instruct-v0.2-q4-1 {'confidence': 0.0, 'above': 0, 'below': 100}
morfologik_index-sdadas/mmlw-retrieval-roberta-large-Cosine-ipipan-polqa-1000-0.5-sdadas/polish-reranker-large-ranknet-QA-radlab/polish-qa-v2-5 {'em': 0.25, 'f1': 0.3872380952380952}
morfologik