## 0 Imports and Device

Here we import all necessary helper functions and classes.
We also define the device to run the models on (GPU or CPU).

In [37]:
import os
import torch
from models.DPR import DPR
import configparser
from utils.data_utils import getCorpus, getQueries
from data.dataloader import DataLoader
from models.model_loader_helpers import createModels, loadModels
from utils.metrics_uitls import timeFunction, calculateMetrics
from utils.lookup_utils import retrieveQueryAndGetRelevancies
from utils.latex_utils import createLatexTable
from utils.misc import batch
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## 1 Prepare Experiments

### 1.1 Define Experiment Configuration 

Here we define the configuration of the experiment:
both the datasets to run the experiment on and the model configurations.

Set the `load_saved_models` variable to `True` to load locally saved models instead of creating them during the experiment.

In [43]:
# Dataset configuration: the option names of the [DATASETS] section of the
# config file are the datasets the experiment runs on.
config = configparser.ConfigParser()
config.read('configs/config.ini')
datasets = list(config['DATASETS'])
data_loader = DataLoader(config)

# Models to evaluate, keyed by model name; each value holds that model's settings.
model_descriptions = {
    "TF-IDF": {},
    "BM25": {},
    "DPR": {},
    "Crossencoder": {"n": 25},
    "KMeans": {"k": 3},
}

# Alternative configuration kept for reference (different KMeans k, plus CURE):
# model_descriptions = {"TF-IDF": {},
#         "BM25": {},
#         "DPR": {},
#         "Crossencoder": {"n":25},
#         "KMeans": {"k":4},
#         "CURE": {"k": 2, "n": 2, "shrinkage_fraction":0.2}}

# Tunable experiment settings.
top_k = 25                                  # passed as "k" to the retrieval helper
batch_size = 20                             # chunk size used when batching the queries
embedding_model_name = "bert-base-uncased"  # model name handed to DPR
embedding_index_folder_path = "indexes"     # root folder of the per-dataset embedding indexes

# True: load models via loadModels; False: build them via createModels(save=True).
load_saved_models = True

### 1.2 Define Function to Pre-compute Embeddings

This function saves a lot of computation by pre-computing the embeddings offline and loading them online, instead of computing them multiple times (once for each model that relies on embeddings).

In [44]:
def preComputeEmbeddings(dataset: str, 
                         documents: list[dict], 
                         embedding_model_name: str, 
                         embedding_index_folder_path: str):
    """Build a DPR embedder for `documents` and save its index to disk.

    Args:
        dataset: Dataset name; used as the sub-folder of the index path.
        documents: Corpus documents handed to the DPR constructor.
        embedding_model_name: Passed to DPR as `model_name`.
        embedding_index_folder_path: Root folder under which indexes are stored.

    Returns:
        The path the index was saved to, i.e.
        `<embedding_index_folder_path>/<dataset>/embedding_index.pickle`.
    """
    embedding_index_path = getPreComputedEmbeddingsPath(dataset, embedding_index_folder_path)

    # Create the per-dataset folder up front so that saving cannot fail on a
    # missing directory after the embedder has already been built.
    # NOTE(review): harmless if SaveIndex already creates the folder itself.
    index_dir = os.path.dirname(embedding_index_path)
    if index_dir:  # os.makedirs("") raises, so skip when the path has no folder part
        os.makedirs(index_dir, exist_ok=True)

    embedder = DPR(documents, model_name=embedding_model_name)
    embedder.SaveIndex(embedding_index_path)
    return embedding_index_path

def getPreComputedEmbeddingsPath(dataset: str, embedding_index_folder_path: str):
    """Return `<embedding_index_folder_path>/<dataset>/embedding_index.pickle`.

    Only builds the path string; nothing is created or checked on disk.
    """
    path_parts = (embedding_index_folder_path, dataset, "embedding_index.pickle")
    return os.path.join(*path_parts)


## 2 Run Experiment

Here we define the experiment itself.
We iterate over all datasets and perform retrieval for each query with each model.
We then return the score metrics, which are the mean precision, recall, reciprocal rank and time for each model.

In [45]:
def runExperiment(data_loader: DataLoader, 
                  datasets: list[str], 
                  model_descriptions: dict[str, dict],
                  embedding_model_name: str,
                  embedding_index_folder_path: str,
                  top_k: int):
    """Evaluate every configured model on every dataset.

    For each dataset the corpus and queries are loaded, the models are either
    loaded from disk or created (and saved), and each model is run over the
    queries in batches.

    NOTE: besides its parameters, this function reads two notebook-level
    globals: `load_saved_models` (load vs. create models) and `batch_size`
    (number of queries per batch).

    Args:
        data_loader: Loader passed to getCorpus / getQueries.
        datasets: Names of the datasets to evaluate.
        model_descriptions: Model name -> settings, passed to loadModels / createModels.
        embedding_model_name: Model name used when embeddings must be pre-computed.
        embedding_index_folder_path: Root folder of the pre-computed embedding indexes.
        top_k: Passed as `k` to the retrieval helper.

    Returns:
        Nested dict `score_metrics[dataset][model_name]` with the keys
        "precision", "recall", "reciprocal_rank" and "time". "time" is the mean
        of the per-batch timings returned by timeFunction (0.0 if no batch ran).
    """
    score_metrics: dict[str, dict[str, dict[str, float]]] = {}
    print(f'{load_saved_models}')
    for dataset in datasets:
        score_metrics[dataset] = {}
        documents, relevant_doc_ids_for_all_queries = getCorpus(data_loader, dataset)
        queries = getQueries(data_loader, relevant_doc_ids_for_all_queries)
        if load_saved_models:
            print(f'Loading saved models!')
            models = loadModels(dataset, model_descriptions)
            print("Models loaded!")
        else:
            # Compute embeddings if not done already
            embedding_index_path = getPreComputedEmbeddingsPath(dataset, embedding_index_folder_path)
            if not os.path.exists(embedding_index_path):
                preComputeEmbeddings(dataset,
                                     documents,
                                     embedding_model_name,
                                     embedding_index_folder_path)
                print('Finished computing embeddings!')

            models = createModels(documents=documents, 
                                  dataset_name=dataset, 
                                  models=model_descriptions, 
                                  embedding_index_path=embedding_index_path,
                                  save=True)

        for model_name, model in models.items():
            print(f'Computing results for {model_name}')
            results = []
            batch_times = []

            for query_batch in batch(queries, batch_size):
                # NOTE(review): the last recorded run of this notebook failed here with
                # "TypeError: retrieveQueryAndGetRelevancies() got an unexpected keyword
                # argument 'queries'". Confirm the helper's parameter name for the batch
                # of queries and adjust this keyword accordingly.
                elapsed, relevancies = timeFunction(retrieveQueryAndGetRelevancies,
                                                    model=model,
                                                    queries=query_batch,
                                                    k=top_k)
                # extend, not append: append(e for e in ...) stored one generator
                # object per batch instead of the individual entries.
                # Assumes `relevancies` holds one entry per query of the batch, in
                # order - TODO confirm against retrieveQueryAndGetRelevancies.
                results.extend(relevancies)
                batch_times.append(elapsed)

            precision, recall, reciprocal_rank = calculateMetrics(results, queries)
            score_metrics[dataset][model_name] = {
                "precision": precision,
                "recall": recall,
                "reciprocal_rank": reciprocal_rank,
                # Guard against a dataset without queries (no batches -> no timings).
                "time": sum(batch_times) / len(batch_times) if batch_times else 0.0,
            }
    return score_metrics

Here we perform the actual experiment.

In [46]:
# Run the full experiment with the configuration defined above.
score_metrics = runExperiment(
    data_loader=data_loader,
    datasets=datasets,
    model_descriptions=model_descriptions,
    embedding_model_name=embedding_model_name,
    embedding_index_folder_path=embedding_index_folder_path,
    top_k=top_k,
)

True

Loading dataset from data/datasets\fiqa
data/datasets\fiqa\corpus.jsonl
Loading saved models!
Models loaded!
Computing results for TF-IDF


TypeError: retrieveQueryAndGetRelevancies() got an unexpected keyword argument 'queries'

In [None]:
# Turn the collected score metrics into a LaTeX table (number_of_decimal_points=4).
createLatexTable(score_metrics, caption="Experiment results.", number_of_decimal_points=4)

\begin{table}[h]
\begin{tabular}{ll|lllll}
\textbf{Dataset} & \textbf{Models} & \textbf{Time} & \textbf{Precision} & \textbf{Recall} & \textbf{MRR} \\ \hline
\multirow{6}{*}{\rotatebox[origin=c]{90}{fiqa}} & TF-IDF & 0.0002 s & 0.0 & 0.0 & 0.0 \\
 & BM25 & 0.0001 s & 0.0 & 0.0 & 0.0 \\
 & DPR & 0.0376 s & 0.0 & 0.0 & 0.0 \\
 & Crossencoder & 2.0107 s & 0.0 & 0.0 & 0.0 \\
 & KMeans & 0.0301 s & 0.0 & 0.0 & 0.0 \\
 & CURE & 0.0299 s & 0.0 & 0.0 & 0.0 \\
\hline
\end{tabular}
\caption{Experiment results.}
\label{tab:results-table}
\end{table}
