In [1]:
from models.builers.retriever import Retriever
from data.phisherman import LoadPhishingDataset

from utils.data_utils import getCorpus, getQueries
from models.model_loader_helpers import createModels, loadModels
from utils.metrics_uitls import timeFunction, calculateMetrics
from utils.lookup_utils import retrieveQueryAndGetRelevancies
from utils.latex_utils import createLatexTable

In [2]:
# Load the phishing dataset. The same object is reused below both as the
# document collection handed to createModels and as the source of the queries.
dataset = LoadPhishingDataset()

In [3]:
from models.model_loader_helpers import createModels

# Models to build, keyed by model name. The empty dict values are presumably
# per-model options (none needed here) -- confirm against createModels.
models_to_create = {
    "TF-IDF": {},
    "BM25": {},
}

# Build every requested model for the "Phisher" dataset; save=True also
# persists them (the log output reports one pickle per model).
createModels(
    documents=dataset,
    dataset_name="Phisher",
    models=models_to_create,
    save=True,
)

Creating TF-IDF model
GetCorpusVocabulary Elapsed: 2.8280093669891357s
GetInverseDocumentFrequencies Elapsed: 4.044939994812012s
GetDocumentsTFIDFVectors Elapsed: 43.274760007858276s
Saving model 'TF-IDF' at: models/pickled_models/Phisher/TF-IDF.pickle
Creating BM25 model
GetCorpusVocabulary Elapsed: 2.8798651695251465s
GetInverseDocumentFrequencies Elapsed: 4.285123109817505s
GetDocumentLengths Elapsed: 0.9555890560150146s
GetDocumentBM25Vectors Elapsed: 40.61435103416443s
Saving model 'BM25' at: models/pickled_models/Phisher/BM25.pickle


{'TF-IDF': <models.TFIDF.TFIDF at 0x10f0f8280>,
 'BM25': <models.BM25.BM25 at 0x123114b80>}

In [4]:
from models.model_loader_helpers import loadModels

# Load the models named in models_to_create for the "Phisher" dataset
# (presumably the pickles written by createModels(save=True) -- confirm in loadModels).
models = loadModels("Phisher", models_to_create)

In [5]:
from data.query import Query

# Turn every dataset entry into a Query carrying its own text, its id and the
# ids of the documents the dataset reports as related to it.
# Built with a comprehension so that no helper variable named `id` is left
# behind at notebook scope, where it would shadow the builtin id() for every
# later cell.
queries = [
    Query(
        text=rel.text,
        id=rel.Id,
        relevant_document_ids=dataset.getRelatedDocuments(rel),
    )
    for rel in dataset
]

In [6]:
# Quick sanity check of the constructed queries.
# NOTE(review): the text shown comes from queries[1] while the relevant-document
# count comes from queries[0] -- confirm the mixed indices are intentional.
queries[1].getQuery(), queries[0].getNumberOfRelevantDocuments()

('the other side of galicismos galicismo is a spanish term which names the improper introduction of french words which are spanish sounding and thus very deceptive to the ear galicismo is often considered to be a barbarismo what would be the term which designates the opposite phenomenon that is unlawful words of spanish origin which may have crept into french can someone provide examples thank you joseph m kozono kozonoj gunet georgetown edu',
 11321)

In [12]:
def retrieveQueryAndGetScore(model: Retriever, query: Query, k: int):
    """Run a lookup for one query and flag each retrieved document as relevant or not.

    Args:
        model (Retriever): Object whose Lookup(query=..., k=...) yields the retrieved documents
        query (Query): Provides the query text (getQuery) and the relevance check (isDocumentRelevant)
        k (int): Forwarded unchanged to the model's Lookup as `k`

    Returns:
        relevancies (list): One boolean per retrieved document, in the order the lookup returned them
    """
    hits = model.Lookup(query=query.getQuery(), k=k)
    # Normalise whatever isDocumentRelevant returns to a strict True/False flag.
    return [True if query.isDocumentRelevant(hit) else False for hit in hits]

In [8]:
def reciprocalRank(relevancies):
    """Return 1/rank of the first relevant entry, where rank is its 1-based position.

    Args:
        relevancies (list): Relevance flags of the retrieved documents in ranked order;
            any truthy entry counts as relevant

    Returns:
        The reciprocal rank (float) of the first truthy entry, or 0 when there is none
    """
    return next(
        (1 / rank for rank, is_relevant in enumerate(relevancies, start=1) if is_relevant),
        0,
    )

def meanReciprocalRank(scores):
    """Average the reciprocal rank over a collection of per-query relevance lists.

    Args:
        scores (iterable): One list of relevance flags per query, each in ranked order

    Returns:
        The mean of reciprocalRank(...) over all queries, or 0 when `scores` is
        empty (instead of raising ZeroDivisionError), mirroring the zero guards
        used for the other metrics in calculate_metrics
    """
    reciprocal_ranks = [reciprocalRank(score) for score in scores]
    if not reciprocal_ranks:
        # No queries at all: there is nothing to average.
        return 0
    return sum(reciprocal_ranks) / len(reciprocal_ranks)

def precision(relevancies):
    """Fraction of the retrieved documents that are relevant.

    Args:
        relevancies (list): Relevance flags of the retrieved documents; truthy means relevant

    Returns:
        Number of truthy flags divided by the number of flags, or 0 when nothing
        was retrieved (an empty list previously raised ZeroDivisionError)
    """
    if len(relevancies) == 0:
        return 0
    return sum(1 for relevancy in relevancies if relevancy) / len(relevancies)

def recall(relevancies, query: Query):
    """Fraction of the attainable relevant documents that were retrieved.

    Args:
        relevancies (list): Relevance flags of the retrieved documents; truthy means relevant
        query (Query): Query whose getNumberOfRelevantDocuments() gives the number of
            relevant documents known for it

    Returns:
        Number of truthy flags divided by min(number retrieved, number relevant),
        or 0 when that denominator is 0
    """
    retrieved_relevant = sum(1 for relevancy in relevancies if relevancy)
    # The accessor is spelled getNumberOfRelevantDocuments (lower-case "g")
    # everywhere else in this notebook; the capitalised spelling used here
    # before did not match those call sites.
    # The denominator is capped at the number of retrieved documents, so a
    # short result list is not penalised for documents it could never contain.
    attainable = min(len(relevancies), query.getNumberOfRelevantDocuments())
    if attainable == 0:
        return 0
    return retrieved_relevant / attainable

def calculate_metrics(results):
    """ Calculates precision, recall, accuracy and mean reciprocal rank based on the results of a set of queries. True or false values are used to indicate whether a document is relevant or not.

    Caveats of the current counting scheme:
        - False negatives are counted exactly like false positives (retrieved but
          not relevant), which assumes each result list is as long as the number of
          relevant documents of its query. Precision and recall are therefore always equal.
        - True negatives are never counted (the total number of non-relevant documents
          is unknown here), so accuracy reduces to TP / (TP + FP + FN).

    Args:
        results (list): A list of lists of booleans, where each list of booleans represents the retrieved documents for a query

    Returns:
        precision (float): Precision of the results
        recall (float): Recall of the results
        accuracy (float): Accuracy of the results
        mrr (float): Mean reciprocal rank over all queries

        Every value is 0 when there is nothing to compute it from (e.g. empty results).
    """
    total_true_positives = 0  # Relevant documents correctly retrieved
    total_false_positives = 0 # Non-relevant documents incorrectly retrieved
    total_false_negatives = 0 # Relevant documents missed
    total_true_negatives = 0  # Non-relevant documents correctly not retrieved (never counted, see docstring)
    query_count = 0

    for query_results in results:
        query_count += 1
        true_positives = sum(query_results)
        # Retrieved documents that are not relevant. The same number doubles as the
        # count of missed relevant documents under the list-length assumption above.
        not_relevant = len(query_results) - true_positives

        total_true_positives += true_positives
        total_false_positives += not_relevant
        total_false_negatives += not_relevant

    retrieved_total = total_true_positives + total_false_positives
    relevant_total = total_true_positives + total_false_negatives
    judged_total = retrieved_total + total_false_negatives + total_true_negatives

    # Calculating metrics; each one falls back to 0 when its denominator is 0
    precision = total_true_positives / retrieved_total if retrieved_total > 0 else 0
    recall = total_true_positives / relevant_total if relevant_total > 0 else 0
    accuracy = (total_true_positives + total_true_negatives) / judged_total if judged_total > 0 else 0

    # Same guard for the mean reciprocal rank: averaging over zero queries
    # would otherwise raise ZeroDivisionError.
    mrr = meanReciprocalRank(results) if query_count > 0 else 0

    return precision, recall, accuracy, mrr

In [9]:
import time

def timeFunction(function, **args):
    """Call `function(**args)` and measure how long the call takes.

    Relies on the module-level `time` import: rebinding the name `time` to
    something else in the notebook breaks this helper.

    Args:
        function (callable): The callable to run
        **args: Keyword arguments forwarded unchanged to `function`

    Returns:
        A tuple (elapsed, output): the time.perf_counter() difference around the
        call, in seconds, and whatever `function` returned
    """
    started_at = time.perf_counter()
    result = function(**args)
    elapsed = time.perf_counter() - started_at
    return elapsed, result

In [13]:
# Smoke test: time a TF-IDF lookup for the first five queries, asking for as
# many documents as each query has relevant ones, and print the elapsed seconds.
for i in range(5):
    t_, s_ = timeFunction(
        retrieveQueryAndGetScore,
        model=models['TF-IDF'],
        query=queries[i],
        k=queries[i].getNumberOfRelevantDocuments(),
    )
    print(t_)

QueryToVector Elapsed: 0.0024721622467041016s
CalculateScores Elapsed: 0.04094982147216797s
54.22045431600418 [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 

In [None]:
from tqdm import tqdm
# Per-model evaluation results: model name -> {"precision", "recall", "reciprocal_rank", "time"}
score_metrics = {}
# Earlier loading path, kept for reference:
# documents, relevant_doc_ids_for_all_queries = getCorpus(data_loader, dataset)
# queries = getQueries(data_loader, relevant_doc_ids_for_all_queries)
load_saved_models = True

if load_saved_models:
    models = loadModels("Phisher", models_to_create)
else:
    # NOTE(review): preComputeEmbeddings, documents, embedding_model_name,
    # embedding_index_folder_path and model_descriptions are not defined or
    # imported in the visible part of this notebook, and dataset_name receives
    # the dataset object where the earlier createModels call passed "Phisher".
    # Confirm all of these before setting load_saved_models = False.
    embedding_index_path = preComputeEmbeddings(dataset,
                                                documents,
                                                embedding_model_name,
                                                embedding_index_folder_path)
    models = createModels(documents=documents,
                          dataset_name=dataset,
                          models=model_descriptions,
                          embedding_index_path=embedding_index_path,
                          save=True)

for model_name, model in models.items():
    results = []
    times = []
    score_metrics[model_name] = {}
    for query in tqdm(queries):
        # The duration is bound to `elapsed`, not `time`: the timeFunction defined
        # in this notebook reads the global `time` module, so rebinding `time` to a
        # float made every call after the first fail with AttributeError.
        elapsed, relevancies = timeFunction(retrieveQueryAndGetRelevancies,
                                            model=model,
                                            query=query,
                                            k=query.getNumberOfRelevantDocuments())
        results.append(relevancies)
        times.append(elapsed)
    # Distinct names so the notebook-level precision() / recall() functions
    # are not overwritten by plain numbers.
    model_precision, model_recall, model_reciprocal_rank = calculateMetrics(results, queries)
    score_metrics[model_name]["precision"] = model_precision
    score_metrics[model_name]["recall"] = model_recall
    score_metrics[model_name]["reciprocal_rank"] = model_reciprocal_rank
    # Mean lookup time per query; 0.0 when there were no queries to time.
    score_metrics[model_name]["time"] = sum(times) / len(times) if times else 0.0