## Imports 

In [9]:
import os
import torch
from models.builers.retriever import Retriever
from models.DPR import DPR

In [10]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')

## Getting dataset

The below code downloads and unzips a specified dataset and saves it to data/datasets/*name_of_dataset*/

In [11]:
from data.dataloader import DataLoader
import configparser

# load config.ini 
config = configparser.ConfigParser()
config.read('config.ini')
data_handler = DataLoader(config)
dataset = "fiqa"
corpus, queries = data_handler.get_dataset(dataset)


Loading dataset from data/datasets/fiqa
data/datasets/fiqa/corpus.jsonl


In [12]:
relevant_doc_ids_for_queries = data_handler.get_relevants(dataset)
# print first 5 relevants from the dictionary
print("First 5 relevants from the dictionary")
for i in list(relevant_ids_for_all_queries.keys())[:5]:
    print("  ", i, ":", relevant_ids_for_all_queries[i])

print("\nFirst query and document from corpus:")
print("  Corpus[0]:  ", corpus[0])
print("  Query[0]: ", queries[0])

First 5 relevants from the dictionary


NameError: name 'relevant_ids_for_all_queries' is not defined

### Create Queries 

Here we build the queries form the downloaded data.

In [None]:
from data.query import Query

queries_ = data_handler.get_queries() # Get queries in the correct format
queries: list[Query] = []
for query_id, relevant_doc_ids in relevant_doc_ids_for_queries.items():
    relevant_doc_ids = [r[0] for r in relevant_doc_ids[1]]
    query = queries_[query_id]
    queries.append(Query(text=query['text'], 
                         id=query_id, 
                         relevant_document_ids=relevant_doc_ids))

# output an example
queries[0].GetQuery(), queries[0].GetNumberOfRelevantDocuments()

## Create or load models

In [None]:
# DELETE WHEN RUNNNING FULL
documents = corpus[:10]
del corpus

First we can save a lot of computations, by pre-calulating the embeddings used in the DRP, DRP Crossencoder, CURE and K-means models.

In [None]:
embedding_index_path = "indexes/embedding_index.pickle"
embedding_model = "bert-base-uncased"

embedder = DPR(documents=documents, model_name=embedding_model)
embedder.SaveIndex(embedding_index_path)
del embedder

In [None]:
from models.model_loader_helpers import create_models

models_to_create = {"TF-IDF": {},
                    "BM25": {},
                    "DPR": {},
                    "Crossencoder": {"n":25},
                    "KMeans": {"k":4},
                    "CURE": {"k": 2, "n": 2, "shrinkage_fraction":0.2}}

create_models(documents=documents, 
              dataset_name=dataset, 
              models=models_to_create, 
              save=True,
              embedding_index_path=embedding_index_path)

Creating TF-IDF model
GetCorpusVocabulary Elapsed: 0.0001628398895263672s
GetInverseDocumentFrequencies Elapsed: 0.0009119510650634766s
GetDocumentsTFIDFVectors Elapsed: 0.003918886184692383s
Creating directory: models/pickled_models
Saving model 'TF-IDF' at: models/pickled_models/fiqa/TF-IDF.pickle
Creating BM25 model
GetCorpusVocabulary Elapsed: 0.0001552104949951172s
GetInverseDocumentFrequencies Elapsed: 0.0003941059112548828s
GetDocumentLengths Elapsed: 7.390975952148438e-05s
GetDocumentBM25Vectors Elapsed: 0.0024192333221435547s
Saving model 'BM25' at: models/pickled_models/fiqa/BM25.pickle
Creating DPR model
Saving model 'DPR' at: models/pickled_models/fiqa/DPR.pickle
Crossencoder model
Saving model 'Crossencoder' at: models/pickled_models/fiqa/Crossencoder_n25.pickle
KMeans model
Saving model 'KMeans' at: models/pickled_models/fiqa/KMeans_k4.pickle
CURE model
Saving model 'CURE' at: models/pickled_models/fiqa/CURE_k2_n2_shrinkage_fraction0.2.pickle


{'TF-IDF': <models.TFIDF.TFIDF at 0x29af4d510>,
 'BM25': <models.BM25.BM25 at 0x29c31de90>,
 'DPR': <models.DPR.DPR at 0x29c2f6bd0>,
 'Crossencoder': <models.DPR_crossencoder.DPRCrossencoder at 0x29c37b650>,
 'KMeans': <models.k_means.KMeans at 0x2cd60c510>,
 'CURE': <models.CURE.CURE at 0x2d09c3250>}

In [None]:
from models.model_loader_helpers import load_models

models_to_load = {"TF-IDF": {},
                    "BM25": {},
                    "DPR": {}}
models = load_models("fiqa", models_to_load)

### Perform Experiment

In [16]:
def retrieveQueryAndGetRelevancies(model: Retriever, query: Query, k: int):
    retrieved_documents = model.Lookup(query=query.GetQuery(), k=k)
    relevancies = []
    for document in retrieved_documents:
        if query.IsDocumentRelevant(document):
            relevancies.append(True)
        else:
            relevancies.append(False)
    return relevancies

## Define evaluation functions

In [17]:
def reciprocalRank(relevancies):
    for i, relevancy in enumerate(relevancies):
        if relevancy:
            return 1/(i+1)
    return 0

def meanReciprocalRank(scores):
    reciprocal_ranks = []
    for score in scores:
        reciprocal_ranks.append(reciprocalRank(score))

    return sum(reciprocal_ranks) / len(reciprocal_ranks)

def precision(relevancies):
    return sum([1 if relevancy else 0 for relevancy in relevancies]) / len(relevancies)

def recall(relevancies, query: Query):
    return sum([1 if relevancy else 0 for relevancy in relevancies]) / min(len(relevancies), query.GetNumberOfRelevantDocuments())

def calculate_metrics(results):
    """ Calculates precision, recall and accuracy based on the results of a query. True or false values are used to indicate whether a document is relevant or not.
    
    Args:
        results (list): A list of lists of booleans, where each list of booleans represents the retrieved documents for a query

    Returns:
        precision (float): Precision of the results
        recall (float): Recall of the results
        accuracy (float): Accuracy of the results
    """
    total_true_positives = 0  # Relevant documents correctly retrieved
    total_false_positives = 0 # Non-relevant documents incorrectly retrieved
    total_false_negatives = 0 # Relevant documents missed
    total_true_negatives = 0  # Non-relevant documents correctly not retrieved

    for query_results in results:
        true_positives = sum(query_results)
        false_positives = len(query_results) - true_positives
        # Assuming the length of the list is the total number of relevant documents for the query
        false_negatives = len(query_results) - true_positives  
        # True negatives can't be calculated without knowing the total number of non-relevant documents

        total_true_positives += true_positives
        total_false_positives += false_positives
        total_false_negatives += false_negatives

    # Calculating metrics
    precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0
    recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0
    accuracy = (total_true_positives + total_true_negatives) / (total_true_positives + total_false_positives + total_false_negatives + total_true_negatives) if (total_true_positives + total_false_positives + total_false_negatives + total_true_negatives) > 0 else 0

    mrr = meanReciprocalRank(results)

    return precision, recall, accuracy, mrr

In [18]:
import time

def timeFunction(function, **args):
    time_before = time.perf_counter()
    output = function(**args)
    time_after = time.perf_counter()
    return time_after - time_before, output

In [41]:
scores = {dataset:{}}
times = {dataset: {}}
for model_name in ["TF-IDF", "BM25", "DPR"]:
# for model_name in list(models_to_load.keys()):
    scores[dataset][model_name] = list()
    times[dataset][model_name] = list()
    for query in queries[:1000]:
        t_, s_ = timeFunction(retrieveQueryAndGetRelevancies, **{"model": models[model_name], "query": query, "k": query.GetNumberOfRelevantDocuments()})
        scores[dataset][model_name].append(s_)
        times[dataset][model_name].append(t_)

In [40]:
print("Precision, Recall, Accuracy, MRR")
print("TF-IDF:", calculate_metrics(scores[dataset]["TF-IDF"]), "Avg. time:", sum(times[dataset]["TF-IDF"])/len(times[dataset]["TF-IDF"]))
print("BM25:", calculate_metrics(scores[dataset]["BM25"]), ", Avg. time:", sum(times[dataset]["BM25"])/len(times[dataset]["BM25"]))
print("DPR:", calculate_metrics(scores[dataset]["DPR"]), ", Avg. time:", sum(times[dataset]["DPR"])/len(times[dataset]["DPR"]))

Precision, Recall, Accuracy, MRR
TF-IDF: (0.0, 0.0, 0.0, 0.0) Avg. time: 7.994100000587423e-05
BM25: (0.0, 0.0, 0.0, 0.0) , Avg. time: 4.232699999647593e-05
DPR: (0.0, 0.0, 0.0, 0.0) , Avg. time: 0.027545723999980964


In [22]:
def create_latex_table(scores, times, caption="Experiment results.", label="tab:results-table"):
    """Prints a latex table from the scores and times dictionaries.
    
    Args:
        scores (dict): A dictionary containing the scores for each model and dataset
        times (dict): A dictionary containing the times for each model and dataset
        caption (str, optional): Caption of the table. Defaults to "Experiment results.".
        label (str, optional): Label of the table. Defaults to "tab:results-table".
    
    Returns:
        None
    """
    print("\\begin{table}[h]")
    print("\\begin{tabular}{ll|lllll}")
    print("\\textbf{Dataset} & \\textbf{Models} & \\textbf{Time} & \\textbf{Accuracy} & \\textbf{Precision} & \\textbf{Recall} & \\textbf{MRR} \\\\ \\hline")

    dataset_names = list(scores.keys())

    for dname in dataset_names:
        models_names = list(scores[dname].keys())
        # Loop through scores for all the models 
        for i, model_name in enumerate(models_names):
            precision, recall, accuracy, mrr = calculate_metrics(scores[dname][model_name])
            t_ = sum(times[dname][model_name])/len(times[dname][model_name])

            n_dec = 4
            # round numbers 
            precision = round(precision, n_dec)
            recall = round(recall, n_dec)
            accuracy = round(accuracy, n_dec)
            mrr = round(mrr, n_dec)
            t_ = round(t_, n_dec)

            if i==0:
                stri = f"\multirow{{{len(models_names)}}}{{*}}{{\\rotatebox[origin=c]{{90}}{{{dname}}}}} & {model_name} & {t_} s & {accuracy} & {precision} & {recall} & {mrr} \\\\"
            else:
                stri = f" & {model_name} & {t_} s & {accuracy} & {precision} & {recall} & {mrr} \\\\"
            print(stri)    
        
        print("\\hline")

    print("\\end{tabular}")
    print(f"\\caption{{{caption}}}")
    print(f"\\label{{{label}}}")
    print("\\end{table}")

In [23]:
create_latex_table(scores,times)

\begin{table}[h]
\begin{tabular}{ll|lllll}
\textbf{Dataset} & \textbf{Models} & \textbf{Time} & \textbf{Accuracy} & \textbf{Precision} & \textbf{Recall} & \textbf{MRR} \\ \hline
\multirow{2}{*}{\rotatebox[origin=c]{90}{fiqa}} & TF-IDF & 0.0001 s & 0.0 & 0.0001 & 0.0001 & 0.0001 \\
 & BM25 & 0.0 s & 0.0001 & 0.0002 & 0.0002 & 0.0005 \\
\hline
\end{tabular}
\caption{Experiment results.}
\label{tab:results-table}
\end{table}


## Run experiments on all datasets

!!! The following has not been tested yet

In [None]:
create_new_models = False # Set to True to create new models. False to load existing models

In [None]:
from data.dataloader import DataLoader
import configparser

# load config.ini 
config = configparser.ConfigParser()
config.read('config.ini')
all_datasets = list(config["DATASETS"])
print("All datasets:", all_datasets)

# Store scores and retrieval times in dictionaries
scores = {}
times = {}

for dataset in all_datasets:
    print("Dataset:", dataset)

    # Get the dataset
    corpus, queries = data_handler.get_dataset(dataset)
    relevant_ids_for_all_queries = data_handler.get_relevants(dataset)
    documents = corpus
    del corpus
    

    # Define models with parameters
    models_to_create = {"TF-IDF": {},
                            "BM25": {},
                            "DPR": {},
                            "Crossencoder": {"n":25},
                            "KMeans": {"k":4},
                            "CURE": {"k": 2, "n": 2, "shrinkage_fraction":0.2}}
    
    # Either load or create models
    if create_new_models:
        from models.model_loader_helpers import create_models

        models = create_models(documents=documents, dataset_name=dataset, models=models_to_create, save=True)
    else:
        from models.model_loader_helpers import load_models
        
        models = load_models(dataset, models_to_create)

    # Get queries in the correct format
    queries_ = data_handler.get_queries() # Get queries in the correct format
    queries = []
    for rel in relevant_ids_for_all_queries.items():
        id = rel[0]
        rels = [r[0] for r in rel[1]]
        query = queries_[id]
        queries.append(Query(text=query['text'], id=id, relevant_document_ids=rels))

    # Now calculate the scores
    scores[dataset] = {}
    times[dataset] = {}
    for model_name in list(models_to_create.keys()):
        scores[dataset][model_name] = list()
        times[dataset][model_name] = list()
        for query in queries:
            t_, s_ = timeFunction(retrieveQueryAndGetRelevancies, **{"model": models[model_name], "query": query, "k": query.GetNumberOfRelevantDocuments()})
            scores[dataset][model_name].append(s_)
            times[dataset][model_name].append(t_)
        print(f"{model_name}:", calculate_metrics(scores[dataset][model_name]), ", Avg. time:", sum(times[dataset][model_name])/len(times[dataset][model_name]))
    
    print("\n")

In [None]:
# Create the latex table with the results
create_latex_table(scores,times)