## 0 Imports and Device

Here we import all necessary helper functions and classes.
We also define the device to run the models on (GPU or CPU).

In [1]:
from data.phishing import LoadPhishingDataset
from models.builers.retriever import Retriever
from data.dataloader import DataLoader
from data.phishing import PhishingDataset
from models.model_loader_helpers import createModels, loadModels
from utils.phishing_utils import getPhishingQueries
from models.DPR import DPR
from utils.metrics_uitls import timeFunction
from utils.phishing_utils import calculatePhishingAccuracy, evaluatePhishingByMajorityVote
from utils.misc import batch
import configparser
import torch
import os
import pickle
device = ('cuda' if torch.cuda.is_available() else 'cpu')

## 1 Prepare Experiments

### 1.1 Define Experiment Configuration 

Here we define the configuration of the experiment.
Both the datasets to perform the experiment on and the model configurations.

Change the load_saved_models variable to True, to load locally saved models, instead of creating them during the experiment.

In [2]:
config = configparser.ConfigParser()
config.read('configs/config.ini')
data_loader = DataLoader(config)

top_k = 25
test_split = 0.2
batch_size=25

model_descriptions = {
        "TF-IDF": {},
        "BM25": {},
        "DPR": {},
        "Crossencoder": {"n":2*top_k},
        "KMeans": {"k":3},
        "CURE": {"n": 25,
                "shrinkage_fraction" : 0.1,
                "threshold": 0.25,
                "initial_clusters": 50,
                "subsample_fraction": 0.5,
                "similarity_measure": "cosine"}}

load_saved_models = False

embedding_model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"#"bert-base-uncased"
embedding_index_folder_path = "indexes"
phishing_dataset_path = "data/datasets/phishing_dataset.pickle"
datasets_path = "data/datasets/Phishing_Email.csv"

### 1.2 Define Function to Pre-compute Embeddings

This function helps us reduce a lot of computations, by pre computing the embeddings offline and loading them online, instead of computing them multiple times (one time for each model that relies on embeddings).

In [3]:
def preComputeEmbeddings(dataset: str, 
                         documents: list[dict], 
                         embedding_model_name: str, 
                         embedding_index_folder_path: str):
    embedder = DPR(documents, model_name=embedding_model_name)
    embedding_index_path = getPreComputedEmbeddingsPath(dataset, embedding_index_folder_path)
    embedder.SaveIndex(embedding_index_path)
    return embedding_index_path

def getPreComputedEmbeddingsPath(dataset: str, embedding_index_folder_path: str):
    return os.path.join(embedding_index_folder_path,dataset,"embedding_index.pickle")

### Load or create Phishing Dataset

This is required since unique IDs are generated for each document whenever a phishing dataset is made. Therefore, you also need to create new models that are compatible with your dataset by default.

In [4]:
def CreatePhishingDataset(datasets_path: str, save: bool = True):
    dataset = LoadPhishingDataset(datasets_path)
    dataset.Shuffle()
    if save:
        with open("data/datasets/phishing_dataset.pickle", 'wb') as f:
            pickle.dump(dataset, f)
    return dataset

def LoadPrecomputedPhishingDataset(phishing_dataset_path: str):
    # with open("data/datasets/phishing_dataset.pickle", 'rb') as f:
    with open(phishing_dataset_path, 'rb') as f:
        dataset = pickle.load(f)
    return dataset


In [5]:
# PhishingData = CreatePhishingDataset(datasets_path, save=True)
PhishingData = LoadPrecomputedPhishingDataset(phishing_dataset_path)

## 2 Run Experiemnt

Here we define the experiment itself.
We itterate over all datasets and perform retrieval for each query for each model.
We then return the score metrics, which are the mean precision, recall, reciprocal rank and time for each model.

In [6]:
def runPhishingExperiment(dataset: PhishingDataset, 
                  model_descriptions: dict[str, dict],
                  embedding_model_name: str,
                  embedding_index_folder_path: str,
                  top_k: int,
                  test_split: float):
    score_metrics: dict[str, dict[str, float]] = {}
    queries = getPhishingQueries(dataset)
    queries = queries[:int(len(queries)*test_split)]
    documents = dataset.GetDocumentDicts()
    documents = documents[int(len(queries)*test_split):]
    if load_saved_models:
        models = loadModels("phishing", model_descriptions)
    else:
        embedding_index_path = preComputeEmbeddings(
                            "phishing", 
                            documents,
                            embedding_model_name,
                            embedding_index_folder_path)
        models: dict[str, Retriever] = createModels(documents=documents, 
                                dataset_name="phishing", 
                                models=model_descriptions, 
                                embedding_index_path=embedding_index_path,
                                save=True)
    
    for model_name, model in models.items():
        retrieved_documents = []
        preds = []
        labels = []
        score_metrics[model_name] = {}
        total_time = 0
        print(f'Computing phishing results for {model_name}')
        iter_count = 0
        for query_batch in batch(queries, batch_size):
            time, retrieved_docs = timeFunction(model.Lookup, 
                                                **{"queries": [query.getQuery() for query in query_batch], 
                                                "k": top_k})
            retrieved_documents.extend(retrieved_docs)
            total_time += time
            iter_count += batch_size
            if iter_count % 250 == 0:
                print(f'Iter {iter_count}/{len(queries)}')
        
        retrieved_labels = [[dataset.GetLabelFromId(document.GetId()) for document in query] for query in retrieved_documents]
        preds = evaluatePhishingByMajorityVote(retrieved_labels)
        labels = [query.getLabel() for query in queries]
        
        score_metrics[model_name]["accuracy"] = calculatePhishingAccuracy(preds, labels)
        score_metrics[model_name]["time"] = total_time/len(queries)
    return score_metrics

In [7]:
score_metrics = runPhishingExperiment(PhishingData, 
                  model_descriptions,
                  embedding_model_name,
                  embedding_index_folder_path,
                  top_k,
                  test_split)
print(score_metrics)

Initializing retrieval model!


  from .autonotebook import tqdm as notebook_tqdm


Building embedding index using device: cuda. Running this on GPU is strongly adviced!
iter: 5000/17904
iter: 10000/17904
iter: 15000/17904
__BuildIndex Elapsed: 353.1629993915558s
DPR running on cuda
Embedding model is:
sentence-transformers/multi-qa-mpnet-base-dot-v1
Creating TF-IDF model
GetCorpusVocabulary Elapsed: 1.254044532775879s
GetInverseDocumentFrequencies Elapsed: 1.556990146636963s
GetDocumentsTFIDFVectors Elapsed: 12.727002620697021s
Saving model 'TF-IDF' at: models/pickled_models/phishing/TF-IDF.pickle
Creating BM25 model
GetCorpusVocabulary Elapsed: 1.1569676399230957s
GetInverseDocumentFrequencies Elapsed: 1.5650336742401123s
GetDocumentLengths Elapsed: 0.449967622756958s
GetDocumentBM25Vectors Elapsed: 13.621031284332275s
Saving model 'BM25' at: models/pickled_models/phishing/BM25.pickle
Creating DPR model
Initializing retrieval model!
Embedding matrix initialized to cuda!
DPR running on cuda
Embedding model is:
sentence-transformers/multi-qa-mpnet-base-dot-v1
Saving m