## 0 Imports and Device

Here we import all necessary helper functions and classes.
We also define the device to run the models on (GPU or CPU).

In [11]:
from data.phishing import LoadPhishingDataset
from models.builers.retriever import Retriever
from data.dataloader import DataLoader
from models.model_loader_helpers import createModels, loadModels
from utils.phishing_utils import getPhishingQueries
from models.DPR import DPR
from utils.metrics_uitls import timeFunction
from utils.phishing_utils import calculatePhishingAccuracy, evaluatePhishingByMajorityVote
import configparser
import torch
import os
# Prefer the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

## 1 Prepare Experiments

### 1.1 Define Experiment Configuration 

Here we define the configuration of the experiment:
the data loader for the datasets, the model configurations, the embedding settings, and the number of documents to retrieve per query (`top_k`).

Set the `load_saved_models` variable to `True` to load locally saved models instead of creating them during the experiment.

In [12]:
# Parse the project configuration file and hand it to the data loader.
# Note: ConfigParser.read() silently skips files it cannot open.
config = configparser.ConfigParser()
config.read('configs/config.ini')
data_loader = DataLoader(config)

# Models to evaluate, keyed by model name; each value holds that model's settings.
model_descriptions = {
    "TF-IDF": {},
    "BM25": {},
    "DPR": {},
    "Crossencoder": {"n": 25},
    "KMeans": {"k": 4},
    "CURE": {"k": 2, "n": 2, "shrinkage_fraction": 0.2},
}

# False: create (and save) the models during the run. True: load saved models.
load_saved_models = False

# Embedding model name and the folder where pre-computed embedding indexes live.
embedding_model_name = "bert-base-uncased"
embedding_index_folder_path = "indexes"

# Passed to each model's Lookup as `k`.
top_k = 25

### 1.2 Define Function to Pre-compute Embeddings

This function saves a lot of computation: the embeddings are pre-computed once, offline, and then loaded from disk, instead of being recomputed by every model that relies on them.

In [13]:
def preComputeEmbeddings(dataset: str, 
                         documents: list[dict], 
                         embedding_model_name: str, 
                         embedding_index_folder_path: str) -> str:
    """Embed `documents` once and persist the resulting index to disk.

    Args:
        dataset: Dataset name; used as the sub-folder of the index location.
        documents: Document dictionaries handed to the DPR embedder.
        embedding_model_name: Model name forwarded to DPR as `model_name`.
        embedding_index_folder_path: Root folder for embedding indexes.

    Returns:
        The path the embedding index was saved to.
    """
    embedding_index_path = getPreComputedEmbeddingsPath(dataset, embedding_index_folder_path)
    # Make sure the target folder exists *before* the expensive embedding step,
    # so a fresh checkout does not compute everything and then fail on save.
    # (SaveIndex is not shown here; creating the folder is harmless either way.)
    index_directory = os.path.dirname(embedding_index_path)
    if index_directory:
        os.makedirs(index_directory, exist_ok=True)
    embedder = DPR(documents, model_name=embedding_model_name)
    embedder.SaveIndex(embedding_index_path)
    return embedding_index_path

def getPreComputedEmbeddingsPath(dataset: str, embedding_index_folder_path: str):
    """Return the pickle path of the pre-computed embedding index for `dataset`.

    The layout is `<embedding_index_folder_path>/<dataset>/embedding_index.pickle`.
    """
    index_file_name = "embedding_index.pickle"
    dataset_folder = os.path.join(embedding_index_folder_path, dataset)
    return os.path.join(dataset_folder, index_file_name)

## 2 Run Experiment

Here we define the experiment itself.
We load the phishing dataset and, for each model, perform retrieval for each query and predict the query's label from the labels of the retrieved documents (majority vote).
We then return the score metrics, which are the accuracy and the mean lookup time per query for each model.

In [14]:
def runPhishingExperiment( datasets_path: str, 
                  model_descriptions: dict[str, dict],
                  embedding_model_name: str,
                  embedding_index_folder_path: str,
                  top_k: int,
                  max_queries: "int | None" = 30,
                  max_documents: "int | None" = 25):
    """Evaluate every configured retrieval model on the phishing dataset.

    For each model and each query, the model's `Lookup` is timed, the labels of
    the retrieved documents are reduced to one prediction by
    `evaluatePhishingByMajorityVote`, and the predictions are scored against
    the query labels with `calculatePhishingAccuracy`.

    Whether models are loaded from disk or created (and saved) is controlled by
    the module-level `load_saved_models` flag from the configuration cell.

    Args:
        datasets_path: Path passed to `LoadPhishingDataset`.
        model_descriptions: Model name -> model settings.
        embedding_model_name: Model used to pre-compute the embeddings.
        embedding_index_folder_path: Root folder for the embedding index.
        top_k: Passed to each model's `Lookup` as `k`.
        max_queries: Evaluate only the first `max_queries` queries; `None`
            uses all of them. Defaults to the previously hard-coded 30.
        max_documents: Index only the first `max_documents` documents; `None`
            uses all of them. Defaults to the previously hard-coded 25.

    Returns:
        `{model_name: {"accuracy": ..., "time": mean lookup time per query}}`.

    Raises:
        ValueError: If there are no queries to evaluate.
    """
    score_metrics: dict[str, dict[str, float]] = {}
    dataset = LoadPhishingDataset(datasets_path)

    queries = getPhishingQueries(dataset)
    if max_queries is not None:
        queries = queries[:max_queries]
    if not queries:
        # Fail before building any model instead of hitting a division by
        # zero when averaging the lookup times below.
        raise ValueError("runPhishingExperiment needs at least one query to evaluate")

    documents = dataset.GetDocumentDicts()
    if max_documents is not None:
        # NOTE(review): with the default cap of 25 documents and top_k = 25 as
        # configured in this notebook, a lookup may return the whole corpus,
        # which would make the vote identical for every query -- confirm the
        # caps are only meant for quick smoke runs.
        documents = documents[:max_documents]

    if load_saved_models:
        # NOTE(review): createModels below is given dataset_name="phishing",
        # while loadModels receives the dataset object -- confirm that this
        # is what loadModels expects.
        models = loadModels(dataset, model_descriptions)
    else:
        embedding_index_path = preComputeEmbeddings(
                            "phishing", 
                            documents,
                            embedding_model_name,
                            embedding_index_folder_path)
        models: dict[str, Retriever] = createModels(documents=documents, 
                                dataset_name="phishing", 
                                models=model_descriptions, 
                                embedding_index_path=embedding_index_path,
                                save=True)

    for model_name, model in models.items():
        preds = []
        labels = []
        lookup_times = []
        for query in queries:
            # timeFunction returns the elapsed time together with Lookup's result.
            elapsed, retrieved_documents = timeFunction(model.Lookup,
                                                        query=query.getQuery(),
                                                        k=top_k)
            retrieved_labels = [dataset.GetLabelFromId(document.GetId())
                                for document in retrieved_documents]
            preds.append(evaluatePhishingByMajorityVote(retrieved_labels))
            labels.append(query.getLabel())
            lookup_times.append(elapsed)
        score_metrics[model_name] = {
            "accuracy": calculatePhishingAccuracy(preds, labels),
            "time": sum(lookup_times) / len(lookup_times),
        }
    return score_metrics

In [10]:
# Run the experiment with the settings from the configuration cell and
# print the resulting per-model metrics dictionary.
score_metrics = runPhishingExperiment(
    datasets_path="datasets/Phishing_Email.csv",
    model_descriptions=model_descriptions,
    embedding_model_name=embedding_model_name,
    embedding_index_folder_path=embedding_index_folder_path,
    top_k=top_k,
)
print(score_metrics)

Creating TF-IDF model
GetCorpusVocabulary Elapsed: 0.0011951923370361328s
GetInverseDocumentFrequencies Elapsed: 0.002248048782348633s
GetDocumentsTFIDFVectors Elapsed: 0.00797414779663086s
Saving model 'TF-IDF' at: models/pickled_models/phishing/TF-IDF.pickle
Creating BM25 model
GetCorpusVocabulary Elapsed: 0.0007119178771972656s
GetInverseDocumentFrequencies Elapsed: 0.0019381046295166016s
GetDocumentLengths Elapsed: 0.0003120899200439453s
GetDocumentBM25Vectors Elapsed: 0.007869958877563477s
Saving model 'BM25' at: models/pickled_models/phishing/BM25.pickle
Creating DPR model
Saving model 'DPR' at: models/pickled_models/phishing/DPR.pickle
Crossencoder model
Saving model 'Crossencoder' at: models/pickled_models/phishing/Crossencoder_n25.pickle
KMeans model
Saving model 'KMeans' at: models/pickled_models/phishing/KMeans_k4.pickle
CURE model
Saving model 'CURE' at: models/pickled_models/phishing/CURE_k2_n2_shrinkage_fraction0.2.pickle
QueryToVector Elapsed: 0.004414081573486328s
Calcu