### Defining the Dataset

In [10]:
from data.query import Query
from helper_functions import DownloadData, LoadData, LoadQueries, LoadRelevants

# Fetch the dataset (DownloadData) and load the corpus, the queries and the
# query/document relevance table through the project helpers.
DownloadData()
data = LoadData()
queries_raw = LoadQueries()
relevants_raw = LoadRelevants()

# Group the relevance table: query id -> list of corpus ids listed for that query.
# NOTE(review): assumes the loaders return pandas DataFrames (the original code
# indexed them through `.iloc`) - confirm against helper_functions.
relevants: dict[str, list] = {}
for _, row in relevants_raw.iterrows():
    query_id = row["query-id"]
    corpus_id = row["corpus-id"]
    relevants.setdefault(query_id, []).append(corpus_id)

# Plain-dict view of every corpus row, in the shape the models receive as `documents`.
documents = [
    {"title": row["title"], "text": row["text"], "id": row["_id"]}
    for _, row in data.iterrows()
]

# Queries without any entry in `relevants` are dropped, so every Query object
# carries a non-empty list of relevant document ids.
queries = {
    row["_id"]: Query(
        text=row["text"],
        id=row["_id"],
        relevant_document_ids=relevants[row["_id"]],
    )
    for _, row in queries_raw.iterrows()
    if row["_id"] in relevants
}

Dataset already exists in /Users/andreasbigom/Documents/dtu/computational_tools/02807_final_project/datasets/nfcorpus


### Defining the Models

In [11]:
from models.bm25 import BM25
from models.builers.retriever import Retriever
from models.cure import CURE
from models.k_means import KMeans
from models.tfidf import TFIDF
from models.dpr import DPR
from models.dpr_crossencoder import DPRCrossencoder

# Retrievers under evaluation, keyed by a display name.
# The commented-out entries are alternative models; uncomment one to include it
# in the experiment loop.
models: dict[str, Retriever] = {
    "TF-IDF": TFIDF(documents=documents),
    # "DPR": DPR(documents=documents),
    # "Crossencoder": DPRCrossencoder(documents=documents, n=25),
    # "KMeans": KMeans(documents=documents, k = 4),
    # "CURE": CURE(documents=documents, k = 2, n=2, shrinkage_fraction=0.2),
    #  "BM25": BM25(documents=documents),
}

### Perform Experiment

In [3]:
import time

def TimeFunction(function, *args, **kwargs):
    """Call ``function`` once and measure how long the call takes.

    Args:
        function: Callable to execute.
        *args: Positional arguments forwarded to ``function``.
        **kwargs: Keyword arguments forwarded to ``function``.

    Returns:
        A tuple ``(elapsed, output)`` where ``elapsed`` is the difference between
        two ``time.perf_counter()`` readings taken around the call (fractional
        seconds) and ``output`` is whatever ``function`` returned.

    Any exception raised by ``function`` propagates unchanged.
    """
    time_before = time.perf_counter()
    output = function(*args, **kwargs)
    time_after = time.perf_counter()
    return time_after - time_before, output

In [4]:
def RetrieveQueryAndGetScore(model: Retriever, query: Query, k: int):
    """Run one query through ``model`` and flag each retrieved document.

    Args:
        model: Retriever whose ``Lookup(query=..., k=...)`` is called.
        query: Query object supplying the query via ``GetQuery()`` and the
            relevance check via ``IsDocumentRelevant(document)``.
        k: Value forwarded to ``Lookup`` as ``k``.

    Returns:
        A list of booleans, one per document returned by ``Lookup`` and in the
        same order; ``True`` where the query reports the document as relevant.
    """
    hits = model.Lookup(query=query.GetQuery(), k=k)
    # bool() mirrors a truthiness test, so only True/False ever end up in the list.
    return [bool(query.IsDocumentRelevant(hit)) for hit in hits]

In [5]:
def MeanReciprocalRank(relevancies):
    """Reciprocal rank of the first truthy entry in ``relevancies``.

    Despite the name, this scores a single ranked result list; no averaging
    over several queries happens here.

    Args:
        relevancies: Ranked sequence of relevance flags, best rank first.

    Returns:
        ``1 / rank`` (1-based) of the first truthy flag, or ``0`` when no flag
        is truthy (including an empty sequence).
    """
    first_hit_rank = next(
        (rank for rank, flag in enumerate(relevancies, start=1) if flag),
        None,
    )
    if first_hit_rank is None:
        return 0
    return 1 / first_hit_rank

def Precision(relevancies):
    """Fraction of entries in ``relevancies`` that are truthy.

    Args:
        relevancies: Sequence of relevance flags, one per retrieved document.

    Returns:
        Number of truthy flags divided by the number of flags, or ``0.0`` when
        the sequence is empty (nothing was retrieved) instead of raising
        ``ZeroDivisionError``.
    """
    retrieved = len(relevancies)
    if retrieved == 0:
        return 0.0
    return sum(1 for relevancy in relevancies if relevancy) / retrieved

def Recall(relevancies, query: Query):
    """Truthy flags in ``relevancies`` relative to the best achievable count.

    The denominator is ``min(len(relevancies), query.GetNumberOfRelevantDocuments())``,
    i.e. the count is capped by how many documents were retrieved.

    Args:
        relevancies: Sequence of relevance flags, one per retrieved document.
        query: Query object providing ``GetNumberOfRelevantDocuments()``.

    Returns:
        Number of truthy flags divided by the capped denominator, or ``0.0``
        when that denominator is zero (nothing retrieved, or the query reports
        no relevant documents) instead of raising ``ZeroDivisionError``.
    """
    found = sum(1 for relevancy in relevancies if relevancy)
    attainable = min(len(relevancies), query.GetNumberOfRelevantDocuments())
    if attainable == 0:
        return 0.0
    return found / attainable

In [9]:
from data.document import Document

# NOTE: this free-text query is not consumed by the loop below, which scores the
# dataset query stored under the id 'PLAIN-3'.
query = "The aims of this study were to determine the concentrations of 4-nonylphenol (NP)"
k = 10

# For every configured model: time one retrieval + relevance labelling pass,
# then print the elapsed time followed by the reciprocal rank of the first hit.
for model_type, model in models.items():
    time_perf, relevancies = TimeFunction(
        RetrieveQueryAndGetScore,
        model=model,
        query=queries['PLAIN-3'],
        k=k,
    )
    print(time_perf)
    print(MeanReciprocalRank(relevancies))

0.001147707982454449
0
