## 0 Imports and Device

Here we import all necessary helper functions and classes.
We also define the device to run the models on (GPU or CPU).

In [1]:
from data.phishing import LoadPhishingDataset
from models.builers.retriever import Retriever
from data.dataloader import DataLoader
from models.model_loader_helpers import createModels, loadModels
from utils.phishing_utils import getPhishingQueries
from models.DPR import DPR
from utils.metrics_uitls import timeFunction
from utils.phishing_utils import calculatePhishingAccuracy, evaluatePhishingByMajorityVote
import configparser
import torch
import os
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## 1 Prepare Experiments

### 1.1 Define Experiment Configuration 

Here we define the configuration of the experiment.
This covers both the datasets to perform the experiment on and the model configurations.

Set the `load_saved_models` variable to `True` to load locally saved models instead of creating them during the experiment.

In [2]:
# Read the INI configuration (path is relative to the working directory) and
# hand it to the project's DataLoader.
# Note: ConfigParser.read() silently skips files it cannot open.
config = configparser.ConfigParser()
config.read('configs/config.ini')
data_loader = DataLoader(config)

# Retrieval models to evaluate, keyed by model name. Each value is the settings
# dict passed along for that model (empty where no settings are given here).
model_descriptions = {
    "TF-IDF": {},
    "BM25": {},
    "DPR": {},
    "Crossencoder": {"n": 25},
    "KMeans": {"k": 4},
    "CURE": {
        "n": 25,
        "initial_clusters": 25,
        "threshold": 0.35,
        "subsample_fraction": 0.5,
        "shrinkage_fraction": 0.2,
    },
}

# False: build the models during the experiment. True: load saved models instead.
load_saved_models = False

# Model used when pre-computing document embeddings, and the folder under which
# the resulting index is stored.
embedding_model_name = "bert-base-uncased"
embedding_index_folder_path = "indexes"

# Number of documents requested from each model per query.
top_k = 25

### 1.2 Define Function to Pre-compute Embeddings

This function saves a lot of computation by precomputing the embeddings offline and loading them online, instead of computing them multiple times (once for each model that relies on embeddings).

In [3]:
def preComputeEmbeddings(dataset: str, 
                         documents: list[dict], 
                         embedding_model_name: str, 
                         embedding_index_folder_path: str) -> str:
    """Embed `documents` with a DPR model and save the resulting index to disk.

    Args:
        dataset: Name of the dataset; used as a sub-folder for the index file.
        documents: Document dicts passed straight to the `DPR` constructor.
        embedding_model_name: Model name forwarded to `DPR` as `model_name`.
        embedding_index_folder_path: Root folder under which indexes are stored.

    Returns:
        The path the index was saved to, as produced by
        `getPreComputedEmbeddingsPath`.
    """
    # Resolve the destination and make sure its folder exists *before* doing
    # the expensive embedding work, so a bad path fails fast and a fresh
    # checkout (without the per-dataset folder) can still save the index.
    embedding_index_path = getPreComputedEmbeddingsPath(dataset, embedding_index_folder_path)
    index_directory = os.path.dirname(embedding_index_path)
    if index_directory:
        os.makedirs(index_directory, exist_ok=True)

    embedder = DPR(documents, model_name=embedding_model_name)
    embedder.SaveIndex(embedding_index_path)
    return embedding_index_path

def getPreComputedEmbeddingsPath(dataset: str, embedding_index_folder_path: str) -> str:
    """Return the index file path `<folder>/<dataset>/embedding_index.pickle`."""
    path_parts = (embedding_index_folder_path, dataset, "embedding_index.pickle")
    return os.path.join(*path_parts)

## 2 Run Experiment

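Here we define the experiment itself.
We load the phishing dataset and, for each model, retrieve the top-k documents for every query; each query is then classified by a majority vote over the labels of its retrieved documents.
We then return the score metrics, which are the classification accuracy and the mean retrieval time per query for each model.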

In [4]:
def runPhishingExperiment( datasets_path: str, 
                  model_descriptions: dict[str, dict],
                  embedding_model_name: str,
                  embedding_index_folder_path: str,
                  top_k: int,
                  max_queries: "int | None" = 25,
                  max_documents: "int | None" = 100) -> dict[str, dict[str, float]]:
    """Evaluate every configured retrieval model on the phishing dataset.

    For each model, the top `top_k` documents are retrieved for all queries in
    one batched `Lookup` call. The labels of the retrieved documents are passed
    to `evaluatePhishingByMajorityVote` to obtain predictions, which are scored
    against the query labels with `calculatePhishingAccuracy`.

    Whether models are loaded from disk or created (and saved) is controlled by
    the notebook-level `load_saved_models` flag.

    Args:
        datasets_path: Path handed to `LoadPhishingDataset`.
        model_descriptions: Model name -> settings dict, forwarded to
            `createModels` / `loadModels`.
        embedding_model_name: Model used to pre-compute document embeddings.
        embedding_index_folder_path: Root folder for the pre-computed index.
        top_k: Number of documents to retrieve per query.
        max_queries: Use only the first `max_queries` queries (None = all).
        max_documents: Use only the first `max_documents` documents (None = all).

    Returns:
        `{model_name: {"accuracy": ..., "time": ...}}`, where "time" is the
        elapsed lookup time divided by the number of queries.

    Raises:
        ValueError: If there are no queries to evaluate.
    """
    dataset_name = "phishing"

    dataset = LoadPhishingDataset(datasets_path)
    queries = getPhishingQueries(dataset)[:max_queries]
    documents = dataset.GetDocumentDicts()[:max_documents]

    # Fail before any model is built; otherwise the per-query time below
    # would end in a ZeroDivisionError after all the expensive work.
    if len(queries) == 0:
        raise ValueError("No queries to evaluate; check the dataset and max_queries.")

    if load_saved_models:
        # Models are saved under the dataset *name* (see createModels below), so
        # look them up by that name rather than by the loaded dataset object.
        # NOTE(review): loadModels is not visible here - confirm it expects the name.
        models = loadModels(dataset_name, model_descriptions)
    else:
        embedding_index_path = preComputeEmbeddings(
                            dataset_name, 
                            documents,
                            embedding_model_name,
                            embedding_index_folder_path)
        # NOTE(review): the recorded run output shows the index being built with
        # `embedding_model_name` while the DPR retriever reports a different
        # model - confirm createModels encodes queries with the same model.
        models: dict[str, Retriever] = createModels(documents=documents, 
                                dataset_name=dataset_name, 
                                models=model_descriptions, 
                                embedding_index_path=embedding_index_path,
                                save=True)

    # Identical for every model, so build them once outside the loop.
    query_texts = [query.getQuery() for query in queries]
    labels = [query.getLabel() for query in queries]

    score_metrics: dict[str, dict[str, float]] = {}
    for model_name, model in models.items():
        elapsed, retrieved_documents = timeFunction(model.Lookup,
                                                    queries=query_texts,
                                                    k=top_k)
        # One list of labels per query, in the order returned by Lookup.
        retrieved_labels = [[dataset.GetLabelFromId(document.GetId()) for document in query_results]
                            for query_results in retrieved_documents]
        preds = evaluatePhishingByMajorityVote(retrieved_labels)

        score_metrics[model_name] = {
            "accuracy": calculatePhishingAccuracy(preds, labels),
            "time": elapsed/len(queries),
        }
    return score_metrics

In [5]:
# Run the experiment with the settings from the configuration cell and print
# the per-model metrics.
score_metrics = runPhishingExperiment(
    datasets_path="data/datasets/Phishing_Email.csv",
    model_descriptions=model_descriptions,
    embedding_model_name=embedding_model_name,
    embedding_index_folder_path=embedding_index_folder_path,
    top_k=top_k,
)
print(score_metrics)

Initializing retrieval model!


  from .autonotebook import tqdm as notebook_tqdm


Building embedding index using device: cpu. Running this on GPU is strongly adviced!
__BuildIndex Elapsed: 14.826141119003296s
DPR running on cpu
Embedding model is:
bert-base-uncased
Creating TF-IDF model
GetCorpusVocabulary Elapsed: 0.0024399757385253906s
GetInverseDocumentFrequencies Elapsed: 0.005352973937988281s
GetDocumentsTFIDFVectors Elapsed: 0.024586915969848633s
Saving model 'TF-IDF' at: models/pickled_models/phishing/TF-IDF.pickle
Creating BM25 model
GetCorpusVocabulary Elapsed: 0.002263784408569336s
GetInverseDocumentFrequencies Elapsed: 0.005182981491088867s
GetDocumentLengths Elapsed: 0.0009260177612304688s
GetDocumentBM25Vectors Elapsed: 0.025222063064575195s
Saving model 'BM25' at: models/pickled_models/phishing/BM25.pickle
Creating DPR model
Initializing retrieval model!
Embedding matrix initialized to cpu!
DPR running on cpu
Embedding model is:
sentence-transformers/multi-qa-mpnet-base-dot-v1
Saving model 'DPR' at: models/pickled_models/phishing/DPR.pickle
Crossencode