# Evaluation Reranking

In [None]:
import copy
import json
import os
import sys
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
from ranx import Qrels, Run
from utils import *
from reranking import Reranker
from retrieving import Retriever
from evaluation_ranx import EvaluationRanx


# Directory holding the generated question files; read from the environment
# (populated from a .env file by load_dotenv above). None when the variable is unset.
QUESTIONS_PATH = os.environ.get("QUESTIONS_PATH")

## Loading Questions

In [2]:
# Seed that identifies which generated question set to evaluate.
seed = 2

# Load the question list for this seed. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open for the
# lifetime of the kernel.
questions_file = os.path.join(QUESTIONS_PATH, f"questions_seed_{seed}.json")
with open(questions_file, encoding="utf-8") as questions_fh:
    questions = json.load(questions_fh)

In [None]:
# Zero-based index of the question under evaluation.
q_idx = 24
selected_entry = questions[q_idx]
question = selected_entry["question"]
source = selected_entry["source"]

# Rewrite the source URI into the form compared against doc.metadata["source"]
# further down: drop the "file:///" prefix, turn every "//" into a single
# backslash, and replace "Source" with "Data".
source_without_scheme = source.removeprefix("file:///")
source_backslashed = source_without_scheme.replace("//", "\\")
source_edited = source_backslashed.replace("Source", "Data")

(question, source, source_edited)

## Create Qrels

### Initialization

In [246]:
# Shared settings for this evaluation run.
n = 40
chunking_type = ChunkingType.TITLE

# The same n is passed for both the retriever and the reranker.
reranking = Reranker(n_retriever=n, n_reranker=n, chunking_type=chunking_type)

In [247]:
# Metrics to report: ndcg, mrr and map, each at cut-offs 3 and 5
# (order: ndcg@3, ndcg@5, mrr@3, mrr@5, map@3, map@5).
eval_metrics = [
    f"{metric_name}@{cutoff}"
    for metric_name in ("ndcg", "mrr", "map")
    for cutoff in (3, 5)
]
eval_ranx = EvaluationRanx(metrics=eval_metrics)

### Create Docs

In [None]:
# Run retrieval plus Cohere reranking for the selected question. The cells
# below read the results back from reranking.documents and
# reranking.reranker_documents.
retriever_type = RetrieverType.MULTIQUERY

reranked = reranking.rerank(
    query=question,
    retriever_type=retriever_type,
    reranker_type=RerankerType.COHERE,
    index_docs=True,
    add_desc_scores=True,
)
# Slice down to nothing; presumably done to keep the cell output short.
reranked[:0]

In [None]:
# Retriever output, plus an independent deep copy. The copy is what the
# relevant-docs cell mutates (it deletes metadata["source"]), so base_docs
# itself stays untouched.
base_docs = reranking.documents
base_docs_copy = copy.deepcopy(base_docs)

# Preview: first five retriever documents and the total count.
(base_docs[:5], len(base_docs))

In [None]:
# Documents stored on the reranker object by the rerank() call above
# (made with RerankerType.COHERE).
cohere_docs = reranking.reranker_documents

# Preview: first five Cohere-reranked documents and the total count.
(cohere_docs[:5], len(cohere_docs))

In [None]:
# Rerank the already-retrieved base documents with FlashRank.
flashrank_docs = reranking.rerank_with_documents(
    query=question, documents=base_docs, reranker_type=RerankerType.FLASHRANK
)

# Preview: first five FlashRank-reranked documents and the total count.
(flashrank_docs[:5], len(flashrank_docs))

In [None]:
# Collect the retrieved documents whose source matches the question's source.
# Works on the deep copy so the metadata edits below never reach base_docs.
relevant_docs = []
for candidate in base_docs_copy:
    if candidate.metadata["source"] == source_edited:
        relevant_docs.append(candidate)

# Drop the "source" entry from each match (it is the same for all of them).
# NOTE(review): this mutates base_docs_copy in place, so re-running the cell
# without rebuilding the copy will presumably fail on the missing key - confirm.
for doc in relevant_docs:
    del doc.metadata["source"]

(question, relevant_docs)

In [None]:
# Show the raw source next to the 1-based question number used in the q_<n> file names.
(source, q_idx + 1)

### Qrels

In [None]:
# Create and save the qrels for the selected question, build one run per
# document list, then compare the runs against the qrels.

# Key for this question in the qrels/run dictionaries and prefix of its file names.
query_key = f"q_{q_idx + 1}"

# Output location: one folder per retriever/chunking combination inside the seed folder.
run_tag = f"{retriever_type.value}_{chunking_type.value}"
out_dir = f"Data/json/qrels/seed_{seed}/{run_tag}"
qrels_save_path = f"{out_dir}/{query_key}_qrels_{run_tag}.json"
report_save_path = f"{out_dir}/{query_key}_report_{run_tag}.json"

# Document lists to compare; the order must match run_names below.
docs = [base_docs, cohere_docs, flashrank_docs]

# Hand-picked relevance judgements for this question: ids of the relevant
# documents (matched against doc.metadata["id"] in the next cell) and the
# scores passed alongside them.
rel_ids = [0]
rel_scores = [10]

qrels = eval_ranx.create_qrels(
    rel_ids=rel_ids,
    rel_scores=rel_scores,
    dict_key_name=query_key,
    save_path=qrels_save_path,
)

runs_list = eval_ranx.create_runs(
    docs_list=docs,
    run_names=["base", "cohere", "flashrank"],
    dict_key_name=query_key,
)

eval_ranx.compare_query(
    qrels=qrels,
    runs=runs_list,
    save_path=report_save_path,
)

Save the page content for each relevant ID

In [480]:
# Map each relevant document id to the page content of the first relevant
# document carrying that id, nested under this question's key.
question_key = f"q_{q_idx+1}"
page_content_by_doc = {}
for rel_id in rel_ids:
    matching_contents = [
        doc.page_content for doc in relevant_docs if doc.metadata["id"] == rel_id
    ]
    # Raises IndexError when no relevant document has this id.
    page_content_by_doc[f"d_{rel_id}"] = matching_contents[0]
rel_ids_dic = {question_key: page_content_by_doc}

save_path = f"Data/json/qrels/seed_{seed}/{retriever_type.value}_{chunking_type.value}/q_{q_idx+1}_qrels_{retriever_type.value}_{chunking_type.value}_pagecontent.json"
# NOTE(review): the meaning of the third argument is not visible here - confirm
# against EvaluationRanx.save_as_json.
eval_ranx.save_as_json(rel_ids_dic, save_path, "asdsd")

## Compare All

### Comparing with Base

In [None]:
# Folder with the per-question qrels for the base retriever / basic chunking;
# the combined report is written into that same folder.
qrels_path = "D://Studium//Informatik//Module//Bachelorarbeit//Project//Pipeline//Utility//classes//Data//json//qrels//seed_2//base_basic"
save_paths = [f"{qrels_path}//q_all_report_base_basic.json"]

# NOTE(review): chunking_type comes from the initialization cell
# (ChunkingType.TITLE there) while the paths point at "base_basic" -
# confirm the two are meant to match before running.
reports = eval_ranx.compare_all_queries_base(
    queries=questions,
    qrels_path=qrels_path,
    chunking_types=[chunking_type],
    save_paths=save_paths,
)

reports

In [None]:
# Average the reports of both base-retriever variants. Each variant has its
# own folder under the seed directory, and the combined report is saved
# inside that folder.
seed_dir = "D://Studium//Informatik//Module//Bachelorarbeit//Project//Pipeline//Utility//classes//Data//json//qrels//seed_2"
base_variants = ("base_basic", "base_by_title")

report_paths = [f"{seed_dir}//{variant}" for variant in base_variants]
save_paths = [
    f"{seed_dir}//{variant}//q_all_report_{variant}.json"
    for variant in base_variants
]

reports = eval_ranx.compute_average_with_reports(
    report_paths=report_paths, save_paths=save_paths
)
reports

### Comparing with Multiquery

In [None]:
# Average the reports of both multi-query variants. Each variant has its
# own folder under the seed directory, and the combined report is saved
# inside that folder.
seed_dir = "D://Studium//Informatik//Module//Bachelorarbeit//Project//Pipeline//Utility//classes//Data//json//qrels//seed_2"
multiquery_variants = ("multiquery_basic", "multiquery_by_title")

report_paths = [f"{seed_dir}//{variant}" for variant in multiquery_variants]
save_paths = [
    f"{seed_dir}//{variant}//q_all_report_{variant}.json"
    for variant in multiquery_variants
]

reports = eval_ranx.compute_average_with_reports(
    report_paths=report_paths, save_paths=save_paths
)
reports