In [1]:
from models.builers.retriever import Retriever
from data.phisherman import LoadPhishingDataset

from utils.data_utils import getCorpus, getQueries
from models.model_loader_helpers import createModels, loadModels
from utils.metrics_uitls import timeFunction, calculateMetrics
from utils.lookup_utils import retrieveQueryAndGetRelevancies
from utils.latex_utils import createLatexTable

In [2]:
# Load the phishing dataset once. `dataset` is reused by the later cells:
# it is passed to createModels, iterated to build the queries, and indexed by
# query id during evaluation to read each entry's label.
dataset = LoadPhishingDataset()

In [3]:
# NOTE: createModels is already imported in the first cell; this re-import is redundant.
from models.model_loader_helpers import createModels

# Models to build, keyed by model name. The empty dicts carry no per-model
# settings (NOTE(review): presumably createModels falls back to defaults — confirm).
models_to_create = {"TF-IDF": {},
                    "BM25": {}}

# Build every model in `models_to_create` from the dataset. With save=True the
# cell output reports each model being pickled under
# models/pickled_models/Phisher/<model name>.pickle.
createModels(documents=dataset, dataset_name="Phisher", models=models_to_create, save=True)

Creating TF-IDF model
GetCorpusVocabulary Elapsed: 5.0884480476379395s
GetInverseDocumentFrequencies Elapsed: 21.464350938796997s
GetDocumentsTFIDFVectors Elapsed: 105.33353400230408s
Saving model 'TF-IDF' at: models/pickled_models/Phisher/TF-IDF.pickle
Creating BM25 model
GetCorpusVocabulary Elapsed: 5.311795234680176s
GetInverseDocumentFrequencies Elapsed: 13.972733974456787s
GetDocumentLengths Elapsed: 3.4158408641815186s


In [None]:
# NOTE: loadModels is already imported in the first cell; this re-import is redundant.
from models.model_loader_helpers import loadModels

# Load the models named in `models_to_create` for the "Phisher" dataset
# (presumably the pickles written by createModels(save=True) — confirm).
# Later cells index `models` by model name (e.g. models['TF-IDF']) and iterate
# over models.items().
models = loadModels("Phisher", models_to_create)

In [None]:
from data.query import Query

# Build one Query per dataset entry: the entry's text is the query, its Id is
# the query id, and its relevant documents are whatever the dataset reports as
# related to that entry.
# A comprehension is used so that no helper names leak into the notebook
# namespace; in particular the builtin `id` is no longer shadowed for later cells.
queries = [
    Query(
        text=rel.text,
        id=rel.Id,
        relevant_document_ids=dataset.getRelatedDocuments(rel),
    )
    for rel in dataset
]

In [None]:
# Spot-check the constructed queries: show getQuery() of the second query and
# the relevant-document count of the first.
# NOTE(review): the two values come from different queries (index 1 vs 0) — confirm this is intended.
queries[1].getQuery(), queries[0].getNumberOfRelevantDocuments()

In [None]:
def retrieveQueryAndGetScore(model: Retriever, query: Query, k: int):
    """Run the query through the model and flag each returned document as relevant or not.

    Args:
        model (Retriever): Retriever whose ``Lookup`` is called with the query and ``k``.
        query (Query): Supplies the lookup input via ``getQuery()`` and judges
            each returned document via ``isDocumentRelevant()``.
        k (int): Forwarded unchanged to ``Lookup`` as ``k``.

    Returns:
        list[bool]: One flag per document returned by ``Lookup``, in the same
            order; True where the query considers the document relevant.
    """
    hits = model.Lookup(query=query.getQuery(), k=k)
    # bool() collapses any truthy/falsy answer to a strict True/False flag.
    return [bool(query.isDocumentRelevant(hit)) for hit in hits]

In [None]:
import time

def timeFunction(function, **args):
    """Call ``function`` with the given keyword arguments and measure how long it takes.

    Note: this definition shadows the ``timeFunction`` imported from
    ``utils.metrics_uitls`` in the first cell.

    Args:
        function: Callable to invoke.
        **args: Keyword arguments forwarded unchanged to ``function``.

    Returns:
        tuple: ``(elapsed, output)`` where ``elapsed`` is the difference of two
            ``time.perf_counter()`` readings taken around the call (fractional
            seconds) and ``output`` is whatever ``function`` returned.
    """
    started = time.perf_counter()
    result = function(**args)
    elapsed = time.perf_counter() - started
    return elapsed, result

In [None]:
# Time a TF-IDF lookup for each of the first five queries, retrieving as many
# documents as the query has relevant ones, and print the elapsed time of each.
for i in range(5):
    t_, s_ = timeFunction(
        retrieveQueryAndGetScore,
        model=models['TF-IDF'],
        query=queries[i],
        k=queries[i].getNumberOfRelevantDocuments(),
    )
    print(t_)

In [None]:
def major_vote(relevancies: list[bool]) -> str:
    """Turn the relevance flags of the retrieved documents into a single class label.

    Args:
        relevancies (list[bool]): Relevance flag of each retrieved document.

    Returns:
        str: "Phishing Email" when strictly more flags are True than False;
            otherwise "Safe Email" (this includes ties and an empty list).
    """
    votes_for = relevancies.count(True)
    votes_against = relevancies.count(False)
    if votes_for > votes_against:
        return "Phishing Email"
    return "Safe Email"

In [None]:
def calculate_accuracy(preds: list[str], labels: list[str]) -> float:
    """Return the fraction of predictions that equal their label.

    Args:
        preds (list[str]): Predicted class for each item.
        labels (list[str]): True class for each item, aligned by index with
            ``preds``; must be at least as long as ``preds``.

    Returns:
        float: Number of positions where ``preds[i] == labels[i]`` divided by
            ``len(preds)``, or 0.0 when ``preds`` is empty.

    Raises:
        IndexError: If ``labels`` is shorter than ``preds``.
    """
    total = len(preds)
    if total == 0:
        # Nothing was predicted: report 0.0 instead of dividing by zero.
        return 0.0
    # Indexed on purpose: a too-short `labels` raises IndexError instead of
    # being silently truncated the way zip() would.
    correct = sum(1 for i in range(total) if preds[i] == labels[i])
    return correct / total

In [None]:
from tqdm import tqdm

# Per-model results: score_metrics[model_name] = {"accuracy": ..., "time": ...}
score_metrics = {}
load_saved_models = True
TOP_K = 100  # number of documents retrieved per query for the majority vote

if load_saved_models:
    models = loadModels("Phisher", models_to_create)
else:
    # Rebuild and pickle the models the same way the model-creation cell does,
    # then load them back so `models` has the same shape in both branches.
    createModels(documents=dataset, dataset_name="Phisher", models=models_to_create, save=True)
    models = loadModels("Phisher", models_to_create)

# NOTE(review): major_vote maps a majority of relevant hits to "Phishing Email".
# Confirm that "relevant" means "phishing" for every query (rather than "same
# label as the query") and that dataset labels use the same two strings that
# major_vote returns.
for model_name, model in models.items():
    results = {"y_true": [], "y_pred": []}
    times = []
    score_metrics[model_name] = {}
    for query in tqdm(queries, desc=model_name):
        elapsed, relevancies = timeFunction(retrieveQueryAndGetRelevancies,
                                            model=model,
                                            query=query,
                                            k=TOP_K)

        truth = dataset[query.id].label
        pred = major_vote(relevancies)

        # The prediction is the majority vote over the retrieved documents.
        # It must be recorded as y_pred; recording the ground truth there
        # makes the accuracy trivially 1.0.
        results["y_true"].append(truth)
        results["y_pred"].append(pred)
        times.append(elapsed)
    accuracy = calculate_accuracy(results["y_pred"], results["y_true"])
    score_metrics[model_name]["accuracy"] = accuracy
    # Mean lookup time per query.
    score_metrics[model_name]["time"] = sum(times)/len(times)