# Install libraries (if needed)

In [1]:
"""
!pip install beir
!pip install fasttext
!pip install spacy
!pip install scikit-learn
!pip install rank_bm25
"""

'\n!pip install beir\n!pip install fasttext\n!pip install spacy\n!pip scikit-learn\n!pip install rank_bm25\n'

# Import libraries

In [2]:
import pandas as pd

import beir 
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

from utils_func import corpus_processing, matrix_creation, clustering, retriever_model, vector_creation

import os
import multiprocessing
multiprocessing.set_start_method("spawn", force=True)

# fasttext is imported on a best-effort basis: when the import fails the
# notebook keeps going and only reports the problem.
try:
    import fasttext
    import fasttext.util
except Exception as import_error:
    # `Exception` (rather than a bare `except:`) keeps the best-effort
    # behaviour while letting KeyboardInterrupt / SystemExit propagate.
    print('fasttext not imported')
    print(f'Reason: {import_error!r}')

# Must sit at column 0: an indented top-level statement here makes the whole
# cell fail with an IndentationError.
from tqdm.autonotebook import tqdm


# Run experiment

## Choose parameters

In [None]:
# ----------------------------------------------------------------------------
# Dataset
# ----------------------------------------------------------------------------
# Dataset to use; it has to be available in the BEIR benchmark:
# https://github.com/beir-cellar/beir
dataset = "nfcorpus"

# ----------------------------------------------------------------------------
# fastText
# ----------------------------------------------------------------------------
# Whether to use fasttext to handle unseen words.
use_ft = True
# Path to the fasttext model. If empty and use_ft is true, the model will be
# downloaded into the current directory.
path_ft = "cc.en.100.bin"

# ----------------------------------------------------------------------------
# Outputs (an empty string disables the corresponding save)
# ----------------------------------------------------------------------------
# Path to save the cleaned corpus; if empty, the corpus will not be saved.
save_cleaned_corpus = ""
# Path to save the scores; if empty, the scores will not be saved.
save_scores = ""

# ----------------------------------------------------------------------------
# Inputs / caches
# ----------------------------------------------------------------------------
# Path to load the cleaned corpus from; if empty, the corpus will be cleaned.
load_cleaned_corpus = ""
# Path to load the word vectors from; if the file does not exist, the vectors
# will be created.
load_vectors = f"word_vectors/word_vectors_{dataset}.csv"
# Dimension of the word vectors.
vector_dimension = 100
# Path to save the fasttext model trained on the corpus; if empty, the model
# will not be saved.
path_to_save_model = ""

# Whether to drop the original corpus from memory once it has been cleaned.
remove_original_corpus = True

# ----------------------------------------------------------------------------
# Model hyper-parameters
# ----------------------------------------------------------------------------
# Number of neighbors considered when filling the similarity matrix.
best_n_neighbors = 75
# Alpha parameter, balancing the importance of similarity versus coexistence.
best_alpha = 0.76
# Threshold above which a word is considered replaceable by another one.
best_thresh = 0.75
# Metric used to compute the similarity matrix.
metric = "cosine"
# Parameters of the BM25 algorithm.
k1 = 1.5
b = 0.75
# Threshold below which a value of the coexistence matrix is treated as 0.
thresh_prob = 0.005

# Method used for the k-nearest-neighbors search, either 'faiss' or 'exact'.
knn_method = "faiss"

## Run an experiment

In [4]:
# Archive of the selected dataset on the BEIR mirror, and its local location.
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
datasets_dir = "datasets"
data_path = f"datasets/{dataset}"

# exist_ok avoids the separate (and racy) existence check.
os.makedirs(datasets_dir, exist_ok=True)

# Download and unpack the archive only when the dataset folder is missing, so
# an already-present dataset is used as is and the notebook also runs from a
# fresh checkout.
if not os.path.isdir(data_path):
    util.download_and_unzip(url, datasets_dir)

# Load the test split: documents, queries and relevance judgements.
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

100%|██████████| 3633/3633 [00:00<00:00, 152607.98it/s]


In [None]:
# Load the fastText model used for unseen words, or leave it as None when
# fastText is disabled through `use_ft`.
fasttext_model = None

if use_ft:
    try:
        fasttext_model = fasttext.load_model(path_ft)
    except Exception as load_error:
        # Fallback: the model at `path_ft` could not be loaded (missing file,
        # empty path, ...). `Exception` instead of a bare `except:` keeps the
        # fallback but no longer traps KeyboardInterrupt / SystemExit.
        print('Model not found')
        print(f'Reason: {load_error!r}')

        print('Downloading model...')
        fasttext.util.download_model('en', if_exists='ignore')  # English
        fasttext_model = fasttext.load_model('cc.en.300.bin')

        print('Reducing model...')
        if vector_dimension != 300:
            fasttext.util.reduce_model(fasttext_model, vector_dimension)

        print('Saving model...')
        default_model_path = f'cc.en.{vector_dimension}.bin'
        if path_ft != '':
            fasttext_model.save_model(path_ft)
        # Skip the second write when it would target the very same file.
        if path_ft != default_model_path:
            fasttext_model.save_model(default_model_path)
        print('Model saved.')

In [6]:
# Build `cleaned_corpus` (doc_id -> text), either by preprocessing the loaded
# corpus or by reading a previously saved CSV with `doc_id` and `text` columns.
if load_cleaned_corpus == '':
    cleaned_corpus = corpus_processing.preprocess_corpus_dict(corpus)
    if save_cleaned_corpus != '':
        corpus_processing.save_processed_corpus(cleaned_corpus, save_cleaned_corpus)
else:
    # Read both columns as plain strings:
    #  * numeric-looking doc ids must not be converted to integers
    #    (presumably they have to match the string ids of the qrels -- verify);
    #  * empty texts or texts such as "nan" / "null" must not become float NaN.
    cleaned_corpus_df = pd.read_csv(
        load_cleaned_corpus,
        dtype={'doc_id': str, 'text': str},
        keep_default_na=False,
    )
    cleaned_corpus = dict(zip(cleaned_corpus_df['doc_id'], cleaned_corpus_df['text']))

# Optionally release the raw corpus; only the cleaned version is used below.
if remove_original_corpus:
    corpus = None

Prétraitement du corpus: 100%|██████████| 3633/3633 [00:28<00:00, 125.54it/s]


In [None]:
# Obtain the word embeddings: create them when no vector file exists at
# `load_vectors`, otherwise read them back from that file.
if not os.path.exists(load_vectors):
    embeddings = vector_creation.create_vectors(
        cleaned_corpus,
        vector_dimension,
        path_to_save_vectors=load_vectors,
        path_to_save_model=path_to_save_model,
        fasttext_model=fasttext_model,
        epochs=5,
        model='skipgram',
    )
else:
    # Space-separated file with the first column used as the index; only empty
    # fields count as missing, and rows with missing values are dropped.
    embeddings = pd.read_csv(
        load_vectors,
        sep=' ',
        na_values=[''],
        keep_default_na=False,
        index_col=0,
    ).dropna()
    # NOTE(review): the first remaining column is not cast to float here
    # (the slice starts at 1) -- confirm this is intended given index_col=0.
    columns_to_cast = list(embeddings.columns)[1:]
    embeddings = embeddings.astype({column: float for column in columns_to_cast})


Creating file to train fasttext model:   0%|          | 0/3633 [00:00<?, ?it/s]

Creating file to train fasttext model: 100%|██████████| 3633/3633 [00:00<00:00, 171607.71it/s]
Getting unique words: 100%|██████████| 3633/3633 [00:00<00:00, 45191.67it/s]


In [None]:
scores = {}

# Build the UCFIRe retriever from the embeddings and the chosen parameters,
# then fit it on the already-cleaned corpus.
retriever = retriever_model.UCFIRe(
    embeddings,
    fasttext_model,
    n_neighbors=best_n_neighbors,
    alpha=best_alpha,
    thresh=best_thresh,
    metric=metric,
    k1=k1,
    b=b,
    thresh_prob=thresh_prob,
)
retriever.fit(cleaned_corpus, is_clean=True, knn_method=knn_method)

# Wrap the retriever in the BEIR evaluation helper and run the test queries
# against the tokenized corpus held by the retriever.
retriever_okapi = EvaluateRetrieval(retriever, score_function="cos_sim")  # or "dot" if you wish dot-product
results_okapi = retriever_okapi.retrieve(retriever.tokenized_corpus, queries)

# Score the retrieved results against the relevance judgements at each cut-off
# in `k_values`.
scores = retriever_okapi.evaluate(qrels, results_okapi, retriever_okapi.k_values)

# Persist the scores as their plain string representation when a path is set.
if save_scores != '':
    with open(save_scores, 'w') as scores_file:
        scores_file.write(str(scores))

print(scores)


Getting unique words: 100%|██████████| 3633/3633 [00:00<00:00, 49678.24it/s]


Normalizing embeddings for cosine similarity...
Embeddings normalized.
Creating Faiss index...
Faiss index created.
getting distances
end of getting distances


filling similarity matrix: 100%|██████████| 28549/28549 [00:03<00:00, 8426.76it/s]
Getting unique words: 100%|██████████| 3633/3633 [00:00<00:00, 59999.55it/s]
Getting word presence: 100%|██████████| 3633/3633 [00:00<00:00, 27311.89it/s]
Calculating coexistence probability: 100%|██████████| 28549/28549 [00:26<00:00, 1070.19it/s]
Getting replaceable words: 100%|██████████| 28549/28549 [00:08<00:00, 3343.32it/s]


finding cycles...
end of finding cycles


Creating clusters dict: 100%|██████████| 21543/21543 [00:03<00:00, 7135.27it/s]
Rewriting corpus: 100%|██████████| 3633/3633 [00:06<00:00, 531.71it/s]
tests in progress: 100%|██████████| 323/323 [00:14<00:00, 22.26it/s]

({'NDCG@1': 0.43034, 'NDCG@3': 0.39118, 'NDCG@5': 0.36141, 'NDCG@10': 0.33487, 'NDCG@100': 0.28527, 'NDCG@1000': 0.34877}, {'MAP@1': 0.05798, 'MAP@3': 0.09424, 'MAP@5': 0.10771, 'MAP@10': 0.12535, 'MAP@100': 0.15023, 'MAP@1000': 0.16125}, {'Recall@1': 0.05798, 'Recall@3': 0.10209, 'Recall@5': 0.12967, 'Recall@10': 0.1636, 'Recall@100': 0.26176, 'Recall@1000': 0.47483}, {'P@1': 0.43963, 'P@3': 0.36326, 'P@5': 0.30341, 'P@10': 0.24427, 'P@100': 0.06882, 'P@1000': 0.01782})



