# Install libraries (if needed)

In [1]:
"""
!pip install beir
!pip install fasttext
!pip install spacy
!pip install scikit-learn
!pip install rank_bm25
!python -m spacy download en_core_web_sm
"""

'\n!pip install beir\n!pip install fasttext\n!pip install spacy\n!pip scikit-learn\n!pip install rank_bm25\n!pip install faiss-cpu\n'

# Import libraries

In [2]:
import pandas as pd

import beir 
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

from utils_func import corpus_processing, matrix_creation, clustering, retriever_model, vector_creation

import os
import multiprocessing
multiprocessing.set_start_method("spawn", force=True)

# fasttext is treated as optional here: a failed import is reported instead of
# aborting the whole import cell.
try:
    import fasttext
    import fasttext.util
except ImportError:
    # Only a missing/broken installation is tolerated; any other error surfaces.
    print('fasttext not imported')

from tqdm.autonotebook import tqdm


# Run experiment

## Choose parameters

In [3]:
# --- Dataset -----------------------------------------------------------------
# Dataset to use; it has to be available in the BEIR benchmark:
# https://github.com/beir-cellar/beir
dataset = "nfcorpus"

# --- fastText (handling of unseen words) -------------------------------------
# Whether to use fastText to handle unseen words.
use_ft = True
# Path to the fastText model; if it cannot be loaded and use_ft is True, the
# model is downloaded into the current directory.
path_ft = 'cc.en.100.bin'

# --- Saving / loading intermediate artefacts ---------------------------------
# Path where the cleaned corpus is saved; empty means it is not saved.
save_cleaned_corpus = ''
# Path where the scores are saved; empty means they are not saved.
save_scores = ''
# Path of an already cleaned corpus to load; empty means the corpus is cleaned.
load_cleaned_corpus = ''
# Path of the word vectors to load; if the file does not exist, the vectors
# are created.
load_vectors = f'word_vectors/word_vectors_{dataset}.csv'
# Dimension of the word vectors.
vector_dimension = 100
# Path where the fastText model trained on the corpus is saved; empty means it
# is not saved.
path_to_save_model = ''

# Whether to drop the original corpus from memory once it has been cleaned.
remove_original_corpus = False

# --- Retriever hyper-parameters ----------------------------------------------
# Number of neighbors considered when filling the similarity matrix.
best_n_neighbors = 75
# Alpha parameter, balancing the importance of similarity versus coexistence.
best_alpha = 0.76
# Threshold above which a word is considered replaceable by another one.
best_thresh = 0.75
# Metric used to compute the similarity matrix.
metric = 'cosine'
# BM25 parameters.
k1 = 1.5
b = 0.75
# Threshold below which a value of the coexistence matrix is treated as 0.
thresh_prob = 0.05

# Method used to compute the k-nearest neighbors, either 'faiss' or 'exact'.
knn_method = 'faiss'

## Run an experiment

In [4]:
# Download the BEIR dataset archive once into ./datasets, then load its test split.
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
data_path = f"datasets/{dataset}"

# exist_ok avoids the separate existence check for the parent folder.
os.makedirs("datasets/", exist_ok=True)

# Skip the download when the dataset folder is already present.
if not os.path.exists(data_path):
    util.download_and_unzip(url, "datasets")

corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

100%|██████████| 3633/3633 [00:00<00:00, 100892.57it/s]


In [5]:
# Load the fastText model used for unseen words. When the local file cannot be
# loaded, download the English vectors, optionally reduce them to
# `vector_dimension`, and save the result for later runs.
fasttext_model = None
if use_ft:
    try:
        fasttext_model = fasttext.load_model(path_ft)
    except Exception:
        # Best-effort fallback; unlike a bare `except`, this lets
        # KeyboardInterrupt / SystemExit propagate.
        print('Model not found')
        print('Downloading model...')
        fasttext.util.download_model('en', if_exists='ignore')  # English
        downloaded_path = 'cc.en.300.bin'
        fasttext_model = fasttext.load_model(downloaded_path)

        written_paths = set()
        if vector_dimension != 300:
            print('Reducing model...')
            fasttext.util.reduce_model(fasttext_model, vector_dimension)
        else:
            # The unreduced model is the file that was just loaded; no need to rewrite it.
            written_paths.add(os.path.abspath(downloaded_path))

        print('Saving model...')
        # Save under the requested path and the conventional name, but write
        # each distinct file only once (the two often point to the same file).
        for candidate_path in (path_ft, f'cc.en.{vector_dimension}.bin'):
            if candidate_path == '':
                continue
            resolved_path = os.path.abspath(candidate_path)
            if resolved_path in written_paths:
                continue
            fasttext_model.save_model(candidate_path)
            written_paths.add(resolved_path)
        print('Model saved.')

In [6]:
# Build the {doc_id: cleaned text} mapping, either by preprocessing the raw
# corpus or by loading a previously saved CSV with `doc_id` and `text` columns.
if load_cleaned_corpus == '':
    cleaned_corpus = corpus_processing.preprocess_corpus_dict(corpus)
    if save_cleaned_corpus != '':
        corpus_processing.save_processed_corpus(cleaned_corpus, save_cleaned_corpus)
else:
    # Read ids and texts as strings: without an explicit dtype, numeric-looking
    # ids are parsed as integers and empty texts become NaN.
    cleaned_corpus_df = pd.read_csv(
        load_cleaned_corpus,
        dtype={'doc_id': str, 'text': str},
        keep_default_na=False,
    )
    cleaned_corpus = dict(zip(cleaned_corpus_df['doc_id'], cleaned_corpus_df['text']))

# Optionally release the raw corpus to save memory.
if remove_original_corpus:
    corpus = None

Prétraitement du corpus: 100%|██████████| 3633/3633 [00:34<00:00, 105.18it/s]


In [7]:
# Reuse the word vectors stored at `load_vectors` when that file exists;
# otherwise create them from the cleaned corpus.
if not os.path.exists(load_vectors):
    embeddings = vector_creation.create_vectors(
        cleaned_corpus,
        vector_dimension,
        path_to_save_vectors=load_vectors,
        path_to_save_model=path_to_save_model,
        epochs=5,
        model='skipgram',
    )
else:
    embeddings = pd.read_csv(
        load_vectors,
        sep=' ',
        na_values=[''],
        keep_default_na=False,
        index_col=0,
    ).dropna()
    # Cast every column except the first one to float.
    # NOTE(review): the first column after the index is left as parsed — confirm this is intended.
    float_columns = list(embeddings.columns)[1:]
    embeddings = embeddings.astype({column: float for column in float_columns})


In [8]:
# Build the retriever from the embeddings and the configured hyper-parameters.
retriever = retriever_model.UCFIRe(
    embeddings,
    fasttext_model,
    n_neighbors=best_n_neighbors,
    alpha=best_alpha,
    thresh=best_thresh,
    metric=metric,
    k1=k1,
    b=b,
    thresh_prob=thresh_prob,
)
retriever.fit(cleaned_corpus, is_clean=True, knn_method=knn_method)

# Run every query through BEIR's evaluation wrapper and score the results
# against the relevance judgements at the wrapper's k values.
retriever_okapi = EvaluateRetrieval(retriever, score_function="cos_sim")
results_okapi = retriever_okapi.retrieve(retriever.tokenized_corpus, queries)
scores = retriever_okapi.evaluate(qrels, results_okapi, retriever_okapi.k_values)

# Persist the scores only when a destination path was configured.
if save_scores != '':
    with open(save_scores, 'w') as f:
        f.write(str(scores))

print(scores)


Getting unique words: 100%|██████████| 3633/3633 [00:00<00:00, 27465.34it/s]


Normalizing embeddings for cosine similarity...
Embeddings normalized.
Creating Faiss index...
Faiss index created.
getting distances
end of getting distances


filling similarity matrix: 100%|██████████| 28157/28157 [00:06<00:00, 4365.94it/s]
Getting unique words: 100%|██████████| 3633/3633 [00:00<00:00, 28995.75it/s]
Getting word presence: 100%|██████████| 3633/3633 [00:00<00:00, 19201.81it/s]
Calculating coexistence probability: 100%|██████████| 28157/28157 [00:24<00:00, 1172.30it/s]
Getting replaceable words: 100%|██████████| 28157/28157 [00:09<00:00, 2841.41it/s]


finding graph components...
graph components found


Creating clusters dict: 100%|██████████| 21439/21439 [00:02<00:00, 8481.13it/s]
Rewriting corpus: 100%|██████████| 3633/3633 [00:08<00:00, 425.17it/s] 
tests in progress: 100%|██████████| 323/323 [00:25<00:00, 12.71it/s]


({'NDCG@1': 0.42724, 'NDCG@3': 0.38821, 'NDCG@5': 0.36029, 'NDCG@10': 0.33389, 'NDCG@100': 0.28547, 'NDCG@1000': 0.3482}, {'MAP@1': 0.05822, 'MAP@3': 0.09384, 'MAP@5': 0.1074, 'MAP@10': 0.12476, 'MAP@100': 0.15008, 'MAP@1000': 0.161}, {'Recall@1': 0.05822, 'Recall@3': 0.10164, 'Recall@5': 0.12946, 'Recall@10': 0.16255, 'Recall@100': 0.26206, 'Recall@1000': 0.47349}, {'P@1': 0.43653, 'P@3': 0.36017, 'P@5': 0.30279, 'P@10': 0.24334, 'P@100': 0.06947, 'P@1000': 0.01786})


### Results without handling missing words

In [9]:
# Re-run the same evaluation after passing None as the retriever's fastText model.
# NOTE(review): the retriever is not switched back afterwards, so later cells
# presumably run without fastText as well — confirm this is intended.
retriever.switch_fasttext_model(None)

retriever_okapi = EvaluateRetrieval(retriever, score_function="cos_sim")
results_okapi = retriever_okapi.retrieve(
    retriever.tokenized_corpus,
    queries,
)
scores = retriever_okapi.evaluate(
    qrels,
    results_okapi,
    retriever_okapi.k_values,
)
scores

tests in progress: 100%|██████████| 323/323 [00:04<00:00, 78.69it/s]


({'NDCG@1': 0.4257,
  'NDCG@3': 0.38622,
  'NDCG@5': 0.35801,
  'NDCG@10': 0.33163,
  'NDCG@100': 0.28329,
  'NDCG@1000': 0.34507},
 {'MAP@1': 0.05651,
  'MAP@3': 0.09239,
  'MAP@5': 0.10594,
  'MAP@10': 0.12316,
  'MAP@100': 0.14835,
  'MAP@1000': 0.15918},
 {'Recall@1': 0.05651,
  'Recall@3': 0.10016,
  'Recall@5': 0.12787,
  'Recall@10': 0.16063,
  'Recall@100': 0.26028,
  'Recall@1000': 0.46912},
 {'P@1': 0.43653,
  'P@3': 0.36017,
  'P@5': 0.30217,
  'P@10': 0.24241,
  'P@100': 0.06941,
  'P@1000': 0.01776})

# Run a search

In [10]:
n_doc = 5  # number of documents to retrieve

# Use the first loaded query as the example; reading it through an iterator
# avoids building the full list of (id, text) pairs.
first_query_id, first_query_text = next(iter(queries.items()))
query = {first_query_id: first_query_text}
print(query)

{'PLAIN-2': 'Do Cholesterol Statin Drugs Cause Breast Cancer?'}


In [11]:
# Example search: ask the retriever for `n_doc` documents for the example query.
results = retriever.search(
    cleaned_corpus,
    query,
    n_doc,
    'cos_sim',
)
results

tests in progress: 100%|██████████| 1/1 [00:00<00:00, 39.73it/s]


{'PLAIN-2': {'MED-10': 20.928497095049888,
  'MED-14': 20.651564093589315,
  'MED-1193': 17.809039262489723,
  'MED-2429': 17.765721752415434,
  'MED-2431': 16.97744472842882}}

In [12]:
# Print each query followed by its retrieved documents and their scores.
# `corpus` is set to None when `remove_original_corpus` is enabled, so fall
# back to the cleaned corpus in that case instead of failing on None.
documents = corpus if corpus is not None else cleaned_corpus

for quer_id, doc_scores in results.items():
    print(f'Query: {queries[quer_id]}')
    for doc_id, doc_score in doc_scores.items():
        print('\n')
        print(f'\tDocument: {documents[doc_id]}')
        print(f'\tScore: {doc_score}')

    print('\n')

Query: Do Cholesterol Statin Drugs Cause Breast Cancer?


	Document: {'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants died, of which 3,61