## 0 Imports and Device

Here we import all necessary helper functions and classes.
We also define the device to run the models on (GPU or CPU).

In [None]:
import os
import torch
from models.DPR import DPR
import configparser
from utils.data_utils import getCorpus, getQueries
from data.dataloader import DataLoader
from models.model_loader_helpers import createModels, loadModels
from utils.metrics_uitls import timeFunction, calculateMetrics
from utils.lookup_utils import retrieveQueryAndGetRelevancies
from utils.latex_utils import createLatexTable
from utils.misc import batch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## 1 Prepare Experiments

### 1.1 Define Experiment Configuration 

Here we define the configuration of the experiment:
both the datasets to run the experiment on and the model configurations.

Set the `load_saved_models` variable to `True` to load locally saved models instead of creating them during the experiment.

In [None]:
# Read the experiment settings; each key of the [DATASETS] section becomes one dataset entry.
config = configparser.ConfigParser()
config.read('configs/config.ini')
datasets = list(config['DATASETS'])
data_loader = DataLoader(config)

# Set to True to load locally saved models instead of creating them during the experiment.
load_saved_models = False

# Model name handed to the DPR embedder.
# Other candidates: 'ProsusAI/finbert', "bert-base-uncased"
embedding_model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
# Root folder under which the per-dataset embedding indexes are stored.
embedding_index_folder_path = "indexes"
# Passed as `k` to every retrieval call.
top_k = 100
# Number of queries handed to a model per retrieval call.
batch_size = 50
# Forwarded to calculateMetrics (presumably divisors of top_k for sub-list metrics -- TODO confirm).
subset_factors = [1, 2, 4]

# One entry per model to evaluate: model name -> constructor options.
model_descriptions = {
    "TF-IDF": {},
    "BM25": {},
    "DPR": {},
    "Crossencoder": {
        "n": top_k * 2,
    },
    "KMeans": {
        "k": 3,
    },
    "CURE": {
        "n": 25,  # Representative points
        "shrinkage_fraction": 0.1,  # Fraction of points to be removed
        "threshold": 0.35,  # Threshold for merging clusters
        "initial_clusters": 50,  # Initial number of clusters
        "subsample_fraction": 0.5,  # Fraction of points to be used for clustering
        "similarity_measure": "cosine",
    },
}

### 1.2 Define Function to Pre-compute Embeddings

This function saves a lot of computation by precomputing the embeddings offline and loading them at run time, instead of computing them multiple times (once for each model that relies on embeddings).

In [None]:
def preComputeEmbeddings(
    dataset: str,
    documents: list[dict],
    embedding_model_name: str,
    embedding_index_folder_path: str,
):
    """Build a DPR embedder over `documents` and save its index to disk.

    Args:
        dataset: Dataset name; selects the sub-folder the index is stored in.
        documents: Corpus documents handed to the DPR embedder.
        embedding_model_name: Model name passed to DPR as `model_name`.
        embedding_index_folder_path: Root folder for all embedding indexes.

    Returns:
        The path the index was saved to.
    """
    index_path = getPreComputedEmbeddingsPath(dataset, embedding_index_folder_path)
    DPR(documents, model_name=embedding_model_name).SaveIndex(index_path)
    return index_path

def getPreComputedEmbeddingsPath(dataset: str, embedding_index_folder_path: str):
    """Return the path of the pickled embedding index for `dataset`.

    The index is located at
    `<embedding_index_folder_path>/<dataset>/embedding_index.pickle`.
    """
    dataset_folder = os.path.join(embedding_index_folder_path, dataset)
    return os.path.join(dataset_folder, "embedding_index.pickle")

def InitializeModels(models: dict, device: str) -> None:
    """Attach `device` to every retriever and move encoder-backed ones onto it.

    Every retriever receives a `device` attribute. A retriever that exposes a
    `model` attribute additionally has that model moved to `device`, and its
    index's embedding matrix is fetched and moved to `device` as well.
    Retrievers without a `model` attribute (this includes anything that only
    exposes a `crossencoder`) are otherwise left untouched.

    Args:
        models: Mapping of model name to retriever object; the retrievers are
            mutated in place.
        device: Target device, e.g. 'cuda' or 'cpu'.
    """
    # Only the retriever objects are needed, so iterate the values directly.
    for retriever in models.values():
        retriever.device = device  # give attribute device to model
        if not hasattr(retriever, 'model'):
            continue
        retriever.model.to(device)  # send Encoder to device
        # Return value is unused; presumably this populates
        # `index.embedding_matrix` -- TODO confirm against the index class.
        retriever.index.GetEmbeddingMatrix()
        # NOTE: `.to()` does not work in place here, hence the re-assignment.
        retriever.index.embedding_matrix = retriever.index.embedding_matrix.to(device)

## 2 Run Experiment

Here we define the experiment itself.
We iterate over all datasets and perform retrieval for each query with each model.
We then return the score metrics, which are the mean precision, recall, reciprocal rank and time for each model.

In [None]:
def runExperiment(data_loader: DataLoader, 
                  datasets: list[str], 
                  model_descriptions: dict[str, dict],
                  embedding_model_name: str,
                  embedding_index_folder_path: str,
                  top_k: int) -> dict[str, dict[str, dict[str, float]]]:
    """Run every configured model on every dataset and collect its metrics.

    For each dataset the corpus and queries are loaded, the models are either
    loaded from disk or created (pre-computing the embedding index first when
    it does not exist yet), and every model retrieves `top_k` results for all
    queries in batches.

    Besides its parameters, this function reads the module-level globals
    `load_saved_models`, `device`, `batch_size` and `subset_factors`.

    Args:
        data_loader: Loader used to obtain the corpus and queries.
        datasets: Names of the datasets to evaluate.
        model_descriptions: Model name -> options, forwarded to
            `createModels` / `loadModels`.
        embedding_model_name: Model name used when pre-computing embeddings.
        embedding_index_folder_path: Root folder of the embedding indexes.
        top_k: Passed as `k` to every retrieval call.

    Returns:
        Nested dict `dataset -> model name -> metrics`. The metrics are those
        returned by `calculateMetrics` plus a "time" entry holding the mean
        elapsed time per batch (0.0 when no batch was processed).

    Raises:
        AssertionError: If a corpus does not hold more than `top_k` documents.
    """
    score_metrics: dict[str, dict[str, dict[str, float]]] = {}
    print(f'Load saved models: {load_saved_models}')
    for dataset in datasets:
        score_metrics[dataset] = {}
        documents, relevant_doc_ids_for_all_queries = getCorpus(data_loader, dataset)

        assert len(documents) > top_k, "top_k cannot be longer than the amount of documents in the corpus!"

        queries = getQueries(data_loader, relevant_doc_ids_for_all_queries)

        if load_saved_models:
            print('Loading saved models!')
            models = loadModels(dataset, model_descriptions)
            InitializeModels(models, device=device)
            print("Models loaded!")
        else:
            embedding_index_path = getPreComputedEmbeddingsPath(dataset, embedding_index_folder_path)
            # Compute embeddings if not done already
            if not os.path.exists(embedding_index_path):
                preComputeEmbeddings(dataset,
                                     documents,
                                     embedding_model_name,
                                     embedding_index_folder_path)
                print('Finished computing embeddings!')
            print(f'Embedding index path: {embedding_index_path}')
            models = createModels(documents=documents, 
                                   dataset_name=dataset, 
                                   models=model_descriptions, 
                                   embedding_index_path=embedding_index_path,
                                   save=True)
        
        for model_name, model in models.items():
            if model_name == 'Crossencoder':
                print(f'\nActual device: {device}\nCrossencoder target device: {model.crossencoder._target_device}\nIf these do not match, consider running CreateModel as opposed to LoadModel!\n')
            print(f'Computing results for {model_name}')
            results = []
            times = []

            itt = 0
            for query_batch in batch(queries, batch_size):
                elapsed, relevancies = timeFunction(retrieveQueryAndGetRelevancies,
                                                    model=model,
                                                    queries=query_batch,
                                                    k=top_k)
                results.extend(relevancies)
                times.append(elapsed)
                itt += batch_size
                if itt % 500 == 0:
                    # The last batch may be short, so cap the reported count
                    # at the real number of queries.
                    print(f"Iter: {min(itt, len(queries))}/{len(queries)}")

            model_metrics = calculateMetrics(results, queries, subset_factors=subset_factors)
            # Mean elapsed time per batch; guard against an empty query set,
            # where no batch was timed and the division would fail.
            model_metrics["time"] = sum(times) / len(times) if times else 0.0
            score_metrics[dataset][model_name] = model_metrics
    return score_metrics

Here we perform the actual experiment. This might take a while to run, depending on the number of models and datasets.

In [None]:
# Run the full experiment with the configuration defined above.
score_metrics = runExperiment(
    data_loader=data_loader,
    datasets=datasets,
    model_descriptions=model_descriptions,
    embedding_model_name=embedding_model_name,
    embedding_index_folder_path=embedding_index_folder_path,
    top_k=top_k,
)

In [None]:
# Check results: show the collected metrics as the cell output.
# Layout as built by runExperiment: dataset name -> model name -> metrics.
score_metrics