In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

import beir 
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

from utils_func import corpus_processing, matrix_creation, clustering, retriever_model

import optuna

import os

  from tqdm.autonotebook import tqdm


In [2]:
# Example corpus and queries (replace with your actual data)

dataset = "nfcorpus"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
data_path = util.download_and_unzip(url, "datasets")
data_path = f"datasets/{dataset}"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

100%|██████████| 3633/3633 [00:00<00:00, 140406.59it/s]


In [3]:
embeddings = pd.read_csv('word_vectors_nfcorpus.txt', sep=' ', header=None).dropna()
embeddings.columns = ['word'] + [str(i) for i in range(100)]
for i in list(embeddings.columns)[1:]:
    embeddings[i] = embeddings[i].astype(float)

embeddings = embeddings.set_index('word')

In [4]:
len(embeddings)

29458

# Corpus cleaning

In [5]:
cleaned_corpus = corpus_processing.preprocess_corpus_dict(corpus)

100%|██████████| 3633/3633 [00:21<00:00, 165.62it/s]


In [9]:
unique_words_corpus = set([word for doc in cleaned_corpus.values() for word in doc.split(" ")])

In [13]:
len(unique_words_corpus)

29460

# Matrix creation

In [None]:
#test = words_coexistence_probability({i:cleaned_corpus[i] for i in list(cleaned_corpus.keys())[:10]})
#words_coexistence_probability = matrix_creation.words_coexistence_probability
coexistence_matrix_path = 'words_coexistence_matrix_nfcorpus_test.csv'
if os.path.exists(coexistence_matrix_path):
    words_coexistence_matrix = pd.read_csv(coexistence_matrix_path, na_values=[''])
    #words_coexistence_matrix.set_index('level_0', inplace=True)
    words_coexistence_matrix.index = words_coexistence_matrix.columns
else:
    words_coexistence_matrix = matrix_creation.words_coexistence_probability(cleaned_corpus)
    words_coexistence_matrix.to_csv('words_coexistence_matrix_nfcorpus_test.csv', index=False)

KeyboardInterrupt: 

In [8]:
words_in_common = list(set(words_coexistence_matrix.columns).intersection(set(embeddings.index)))

In [9]:
embeddings = embeddings.loc[words_in_common]

In [None]:
sim_mat = matrix_creation.get_similarity_matrix(embeddings[:], metric='euclidean', n_neighbors=20)

In [10]:
def get_final_similarity_matrix(similarity_matrix, coexistence_matrix, alpha=0.5):
    sim_mat_vals = similarity_matrix.values
    coex_mat_vals = coexistence_matrix.values
    final_matrix = alpha*sim_mat_vals + (1-alpha)*coex_mat_vals
    return pd.DataFrame(final_matrix, index=similarity_matrix.index, columns=similarity_matrix.columns)

In [8]:
words_in_common = list(set(words_coexistence_matrix.columns).intersection(set(embeddings.index)))

In [9]:
words_in_common

['rye',
 'mexiletine',
 'nonfirearm',
 'pregnenolone',
 'hotdog',
 'norfolk',
 'build',
 'lcds',
 'igs',
 'reperfusion',
 'electrophoretic',
 'grain',
 'uic',
 'antimetatstatic',
 'intervention',
 'athlete',
 'biotransforming',
 'seguimiento',
 'calciuria',
 'saharan',
 'scour',
 'uninfected',
 'microplate',
 'hydrolase',
 'nonlethal',
 'suspect',
 'disentangle',
 'qald',
 'folgende',
 'hypernatremia',
 'sy5y',
 'discrepant',
 'purchase',
 'faba',
 'modulatory',
 '48',
 'mutton',
 'relative',
 'immunosorbent',
 'papillomaviruse',
 'borobudur',
 'internationally',
 '2074',
 'tubo',
 'ardèche',
 'alt',
 'fractionate',
 '224',
 'tracer',
 '137',
 'extracorporeal',
 'aortic',
 'outrage',
 'ss',
 'impending',
 'everywhere',
 'wheezer',
 'congeners',
 '633',
 'evidently',
 'pessimistic',
 'scavenger',
 'insight',
 'cosponsor',
 'clip',
 'opposing',
 'underscore',
 'to',
 'tsnas',
 'cortisol',
 'skill',
 'horizontally',
 'resilience',
 'convincingly',
 'decarboxylase',
 '8014',
 'legitimate',

In [10]:
replaceable_words = clustering.get_replaceable_words(sim_mat, words_coexistence_matrix, alpha=0.5, thresh=0.8)

100%|██████████| 22156/22156 [12:46<00:00, 28.92it/s]


In [11]:
word_graph = clustering.Graph(replaceable_words)

In [12]:
clusters = word_graph.find_all_cycles()
clusters

[{'rye'},
 {'mexiletine'},
 {'firearm', 'nonfirearm'},
 {'pregnenolone'},
 {'hotdog'},
 {'norfolk'},
 {'build'},
 {'lcds'},
 {'igs'},
 {'reperfusion'},
 {'electrophoretic'},
 {'uic'},
 {'grain'},
 {'antimetatstatic'},
 {'intervention'},
 {'athlete'},
 {'biotransforming'},
 {'navarra', 'seguimiento', 'universidad'},
 {'calciuria'},
 {'saharan'},
 {'scour'},
 {'uninfected'},
 {'microplate'},
 {'hydrolase'},
 {'nonlethal'},
 {'suspect'},
 {'disentangle'},
 {'qald'},
 {'erkrankungen', 'folgende', 'roggen', 'zeigen'},
 {'hypernatremia'},
 {'sy5y'},
 {'discrepant'},
 {'purchase'},
 {'faba'},
 {'modulatory'},
 {'48'},
 {'mutton'},
 {'relative'},
 {'immunosorbent'},
 {'papillomaviruse'},
 {'borobudur'},
 {'internationally'},
 {'2074'},
 {'tubo'},
 {'ardèche'},
 {'alt'},
 {'fractionate'},
 {'224'},
 {'tracer'},
 {'137'},
 {'extracorporeal'},
 {'aortic'},
 {'outrage'},
 {'ss'},
 {'impending'},
 {'everywhere'},
 {'wheezer'},
 {'congeners'},
 {'633'},
 {'evidently'},
 {'pessimistic'},
 {'scavenger

In [40]:
clusters.sort(key=lambda x: len(x), reverse=True)

In [41]:
clusters

[{'574',
  'accru',
  'ainsi',
  'algunos',
  'algún',
  'alor',
  'altas',
  'alto',
  'amplia',
  'asma',
  'asociaciones',
  'asociaron',
  'asoció',
  'autorreferida',
  'bajo',
  'basó',
  'cociente',
  'conclusión',
  'confianza',
  'connus',
  'conocido',
  'consumo',
  'contexte',
  'corporal',
  'corporelle',
  'diario',
  'diferencias',
  'différence',
  'débile',
  'embargo',
  'encuesta',
  'enfermedad',
  'entre',
  'estimer',
  'estimé',
  'estudio',
  'existen',
  'facteur',
  'factore',
  'familia',
  'familles',
  'feuille',
  'geográficas',
  'hebdomadaire',
  'hojas',
  'hombre',
  'hombres',
  'homme',
  'hortalizas',
  'ic95',
  'igual',
  'lait',
  'laitier',
  'logistique',
  'logística',
  'lácteo',
  'légumes',
  'marco',
  'marquées',
  'mujeres',
  'méthodes',
  'nacional',
  'observaron',
  'padecer',
  'padecerla',
  'parmi',
  'parte',
  'pequeña',
  'pollo',
  'posibilidades',
  'prevalencia',
  'probabilidad',
  'proporción',
  'prévalence',
  'quelconqu

In [13]:
clust_dict = clustering.clusters_dict(clusters)
clust_dict

{'rye': 0,
 'mexiletine': 1,
 'nonfirearm': 2,
 'firearm': 2,
 'pregnenolone': 3,
 'hotdog': 4,
 'norfolk': 5,
 'build': 6,
 'lcds': 7,
 'igs': 8,
 'reperfusion': 9,
 'electrophoretic': 10,
 'uic': 11,
 'grain': 12,
 'antimetatstatic': 13,
 'intervention': 14,
 'athlete': 15,
 'biotransforming': 16,
 'seguimiento': 17,
 'navarra': 17,
 'universidad': 17,
 'calciuria': 18,
 'saharan': 19,
 'scour': 20,
 'uninfected': 21,
 'microplate': 22,
 'hydrolase': 23,
 'nonlethal': 24,
 'suspect': 25,
 'disentangle': 26,
 'qald': 27,
 'roggen': 28,
 'folgende': 28,
 'erkrankungen': 28,
 'zeigen': 28,
 'hypernatremia': 29,
 'sy5y': 30,
 'discrepant': 31,
 'purchase': 32,
 'faba': 33,
 'modulatory': 34,
 '48': 35,
 'mutton': 36,
 'relative': 37,
 'immunosorbent': 38,
 'papillomaviruse': 39,
 'borobudur': 40,
 'internationally': 41,
 '2074': 42,
 'tubo': 43,
 'ardèche': 44,
 'alt': 45,
 'fractionate': 46,
 '224': 47,
 'tracer': 48,
 '137': 49,
 'extracorporeal': 50,
 'aortic': 51,
 'outrage': 52,
 's

In [14]:
#rewritten = rewrite_corpus({key:cleaned_corpus[key] for key in tqdm(list(cleaned_corpus.keys())[:10])}, clust_dict)
rewritten = clustering.rewrite_corpus(cleaned_corpus, clust_dict)

100%|██████████| 3633/3633 [01:14<00:00, 48.90it/s]


In [15]:
rewritten

{'MED-10': '9467 11211 14766 1850 18623 8133 18738 17979 8989 17816 6158 20737 721 17816 4332 2833 13907 9467 14152 11841 2069 8013 6631 785 13785 785 20982 7489 14357 1306 20631 18073 1850 18623 19089 14273 785 17537 17784 8642 4401 7489 1767 5059 12160 21211 7281 785 1850 18623 1060 6920 9467 19947 6631 18738 9736 21010 8989 785 1850 18623 4755 785 17816 8989 17443 12072 17997 18166 1850 18623 4755 6631 20737 11145 3607 11704 6158 785 2731 18623 16494 10137 17784 9467 11211 3609 14766 13768 785 21005 16124 5752 6158 18738 9437 6790 9315 12160 11211 785 10124 1489 1218 114 1765 67 129 7489 6920 9467 19947 12515 9467 11211 5111 21053 20507 17991 18738 2531 785 11906 4332 11211 9467 11145 785 19764 21269 15875 785 5220 13768 785 21005 5234 5220 11906 10402 785 20820 16124 13424 67 1850 18623 13768 130 13140 13523 3349 9443 14766 2932 1578 9532 7067 956 14766 1297 956 9467 11211 16124 18973 12515 17256 7281 785 1850 18623 1060 6362 169 2589 14766 6362 169 2589 20609 785 7281 5229 4992 70

# Retrieval

## BM25

In [14]:
from rank_bm25 import BM25Okapi
import optuna

class BM25_model:
  def __init__(self, corpus, k1=1.5, b=0.75):
    cleaned_corpus = corpus_processing.preprocess_corpus_dict(corpus)
    self.tokenized_corpus = [cleaned_corpus[key].split(" ") for key in corpus.keys()]
    self.bm25_model = BM25Okapi(self.tokenized_corpus, k1=k1, b=b)
    self.keys = list(corpus.keys())

  def search(self, corpus: dict[str, dict[str, str]], queries: dict[str, str], top_k: int, score_function,**kwargs) -> dict[str, dict[str, float]]:
    results = {}
    for query_id, query in tqdm(queries.items(), desc="tests in progress"):
        # Process the query
        #cleaned_query = preprocess_corpus([query])
        cleaned_query = corpus_processing.clean_tokens(corpus_processing.nlp(query))
        tokenized_query = cleaned_query.split(" ")
        # Apply BM25 to get scores
        scores = self.bm25_model.get_scores(tokenized_query)
        # Sort the scores in descending order and save the results
        ordered_keys_index = np.argsort(scores)[::-1][:top_k]
        sorted_scores = {self.keys[i] : scores[i] for i in ordered_keys_index}
        results[query_id] = sorted_scores
    return results
  

In [7]:
def objective(trial):
    # Suggest values for k1 and b
    k1 = trial.suggest_float("k1", 1.2, 2.0)  # Adjust range as needed
    b = trial.suggest_float("b", 0.0, 1.0)    # BM25's b typically lies between 0 and 1

    # Instantiate the retriever model with the suggested parameters
    retriever = BM25_model(cleaned_corpus, k1=k1, b=b)
    retriever_okapi = EvaluateRetrieval(retriever, score_function="cos_sim") # or "dot" if you wish dot-product
    results_okapi = retriever_okapi.retrieve(retriever.tokenized_corpus, queries)
    # Evaluate the model (implement your own evaluation logic, e.g., compute mean reciprocal rank)
    score = retriever_okapi.evaluate(qrels, results_okapi, retriever_okapi.k_values)[0]['NDCG@10']  # Replace this with your evaluation metric

    return score  # Higher score is better (Optuna minimizes by default)

study = optuna.create_study(direction="maximize")  # Use "minimize" if your score is a loss
study.optimize(objective, n_trials=100)

print(study.best_params)
print(study.best_value)

[I 2025-01-09 08:51:11,391] A new study created in memory with name: no-name-c149680f-eebe-4ccf-bdf4-8cc357b02ed7
tests in progress: 100%|██████████| 323/323 [00:03<00:00, 88.31it/s] 
[I 2025-01-09 08:51:15,806] Trial 0 finished with value: 0.30397 and parameters: {'k1': 1.689151112584689, 'b': 0.3743315999040079}. Best is trial 0 with value: 0.30397.
tests in progress: 100%|██████████| 323/323 [00:03<00:00, 98.46it/s] 
[I 2025-01-09 08:51:19,592] Trial 1 finished with value: 0.30198 and parameters: {'k1': 1.4126695191350203, 'b': 0.9045533710793888}. Best is trial 0 with value: 0.30397.
tests in progress: 100%|██████████| 323/323 [00:02<00:00, 108.99it/s]
[I 2025-01-09 08:51:23,051] Trial 2 finished with value: 0.30077 and parameters: {'k1': 1.3655081127949722, 'b': 0.023618988706922472}. Best is trial 0 with value: 0.30397.
tests in progress: 100%|██████████| 323/323 [00:02<00:00, 109.46it/s]
[I 2025-01-09 08:51:26,479] Trial 3 finished with value: 0.30204 and parameters: {'k1': 1.37

{'k1': 1.865819183429756, 'b': 0.48273398944742674}
0.30749


In [15]:
#k1=1.865819183429756, b=0.48273398944742674
model_bm25Okapi = BM25_model(corpus, k1=1.5, b=0.75)

100%|██████████| 3633/3633 [00:27<00:00, 133.62it/s]


In [16]:
retriever_bm25Okapi = EvaluateRetrieval(model_bm25Okapi, score_function="cos_sim") # or "dot" if you wish dot-product

#### Retrieve dense results (format of results is identical to qrels)
#results = retriever.retrieve(corpus_to_give, {list(queries.keys())[i]:queries[list(queries.keys())[i]] for i in range(50)})
results_bm25Okapi = retriever_bm25Okapi.retrieve(model_bm25Okapi.tokenized_corpus, queries)

#### Evaluate your retrieval using NDCG@k, MAP@K ...
ndcg_bm25Okapi, _map_bm25Okapi, recall_bm25Okapi, precision_bm25Okapi = retriever_bm25Okapi.evaluate(qrels, results_bm25Okapi, retriever_bm25Okapi.k_values)

tests in progress: 100%|██████████| 323/323 [00:02<00:00, 107.93it/s]


In [17]:
ndcg_bm25Okapi, _map_bm25Okapi, recall_bm25Okapi, precision_bm25Okapi

({'NDCG@1': 0.41641,
  'NDCG@3': 0.38023,
  'NDCG@5': 0.34791,
  'NDCG@10': 0.31565,
  'NDCG@100': 0.26908,
  'NDCG@1000': 0.33146},
 {'MAP@1': 0.05604,
  'MAP@3': 0.09369,
  'MAP@5': 0.10305,
  'MAP@10': 0.1181,
  'MAP@100': 0.14077,
  'MAP@1000': 0.15156},
 {'Recall@1': 0.05604,
  'Recall@3': 0.10531,
  'Recall@5': 0.1209,
  'Recall@10': 0.15052,
  'Recall@100': 0.24624,
  'Recall@1000': 0.45475},
 {'P@1': 0.43034,
  'P@3': 0.35397,
  'P@5': 0.29226,
  'P@10': 0.22601,
  'P@100': 0.06582,
  'P@1000': 0.0174})

## Mine

In [16]:
def objective(trial):
    # Suggest values for k1 and b
    k1 = trial.suggest_float("k1", 1.2, 2.0)  # Adjust range as needed
    #b = trial.suggest_float("b", 0.0, 1.0)    # BM25's b typically lies between 0 and 1
    b = 0.75

    # Instantiate the retriever model with the suggested parameters
    retriever = retriever_model.Retriever(rewritten, clust_dict, k1=k1, b=b)
    retriever_okapi = EvaluateRetrieval(retriever, score_function="cos_sim") # or "dot" if you wish dot-product
    results_okapi = retriever_okapi.retrieve(retriever.tokenized_corpus, queries)
    # Evaluate the model (implement your own evaluation logic, e.g., compute mean reciprocal rank)
    score = retriever_okapi.evaluate(qrels, results_okapi, retriever_okapi.k_values)[0]['NDCG@10']  # Replace this with your evaluation metric

    return score  # Higher score is better (Optuna minimizes by default)

study = optuna.create_study(direction="maximize")  # Use "minimize" if your score is a loss
study.optimize(objective, n_trials=100)

print(study.best_params)
print(study.best_value)


[I 2025-01-10 14:55:19,815] A new study created in memory with name: no-name-931bfa38-066e-439d-bbdf-2aadd0f4c0fe
tests in progress: 100%|██████████| 323/323 [00:05<00:00, 57.87it/s]
[I 2025-01-10 14:55:26,214] Trial 0 finished with value: 0.29995 and parameters: {'k1': 1.2287223545403996}. Best is trial 0 with value: 0.29995.
tests in progress: 100%|██████████| 323/323 [00:04<00:00, 65.16it/s]
[I 2025-01-10 14:55:31,717] Trial 1 finished with value: 0.30066 and parameters: {'k1': 1.2750775582117548}. Best is trial 1 with value: 0.30066.
tests in progress: 100%|██████████| 323/323 [00:04<00:00, 64.90it/s]
[I 2025-01-10 14:55:37,240] Trial 2 finished with value: 0.30235 and parameters: {'k1': 1.963075281282854}. Best is trial 2 with value: 0.30235.
tests in progress: 100%|██████████| 323/323 [00:05<00:00, 63.59it/s]
[I 2025-01-10 14:55:42,921] Trial 3 finished with value: 0.30364 and parameters: {'k1': 1.5474879988686085}. Best is trial 3 with value: 0.30364.
tests in progress: 100%|███

{'k1': 1.609218899336134}
0.30398


In [23]:
# k1= 1.8891134483579508, b=0.4800276968019695
# base : k1 = 1.5, b = 0.75
model_okapi = retriever_model.Retriever(rewritten, clust_dict, k1= 1.61, b=0.75)

In [24]:
retriever_okapi = EvaluateRetrieval(model_okapi, score_function="cos_sim") # or "dot" if you wish dot-product

#### Retrieve dense results (format of results is identical to qrels)
#results = retriever.retrieve(corpus_to_give, {list(queries.keys())[i]:queries[list(queries.keys())[i]] for i in range(50)})
results_okapi = retriever_okapi.retrieve(model_okapi.tokenized_corpus, queries)

#### Evaluate your retrieval using NDCG@k, MAP@K ...
ndcg_okapi, _map_okapi, recall_okapi, precision_okapi = retriever_okapi.evaluate(qrels, results_okapi, retriever_okapi.k_values)


tests in progress: 100%|██████████| 323/323 [00:03<00:00, 88.91it/s] 


In [25]:
ndcg_okapi, _map_okapi, recall_okapi, precision_okapi

({'NDCG@1': 0.40093,
  'NDCG@3': 0.36413,
  'NDCG@5': 0.33404,
  'NDCG@10': 0.30398,
  'NDCG@100': 0.26349,
  'NDCG@1000': 0.32944},
 {'MAP@1': 0.05252,
  'MAP@3': 0.09096,
  'MAP@5': 0.10017,
  'MAP@10': 0.11407,
  'MAP@100': 0.13616,
  'MAP@1000': 0.14687},
 {'Recall@1': 0.05252,
  'Recall@3': 0.10464,
  'Recall@5': 0.11888,
  'Recall@10': 0.14666,
  'Recall@100': 0.24512,
  'Recall@1000': 0.46568},
 {'P@1': 0.40867,
  'P@3': 0.33746,
  'P@5': 0.2805,
  'P@10': 0.21734,
  'P@100': 0.06458,
  'P@1000': 0.01771})

In [None]:
indexes = list(ndcg_okapi.keys()) + list(_map_okapi.keys()) + list(recall_okapi.keys()) + list(precision_okapi.keys())

values_mine_okapi = list(ndcg_okapi.values()) + list(_map_okapi.values()) + list(recall_okapi.values()) + list(precision_okapi.values())
values_bm25Okapi = list(ndcg_bm25Okapi.values()) + list(_map_bm25Okapi.values()) + list(recall_bm25Okapi.values()) + list(precision_bm25Okapi.values())

comparison = pd.DataFrame({'indexes':indexes, 'mine_okapi':values_mine_okapi, 'bm25Okapi':values_bm25Okapi})
comparison.columns = ['metrics', 'mine_okapi', 'bm25Okapi']
comparison.set_index('metrics', inplace=True)

comparison['diff'] = comparison['mine_okapi'] - comparison['bm25Okapi']
comparison

In [65]:
indexes = list(ndcg_okapi.keys()) + list(_map_okapi.keys()) + list(recall_okapi.keys()) + list(precision_okapi.keys())

values_mine_okapi = list(ndcg_okapi.values()) + list(_map_okapi.values()) + list(recall_okapi.values()) + list(precision_okapi.values())
#values_bm25Okapi = list(ndcg_bm25Okapi.values()) + list(_map_bm25Okapi.values()) + list(recall_bm25Okapi.values()) + list(precision_bm25Okapi.values())

comparison = pd.DataFrame({'indexes':indexes, 'mine_okapi':values_mine_okapi})#, 'bm25Okapi':values_bm25Okapi})
comparison.columns = ['metrics', 'mine_okapi']#, 'bm25Okapi']
comparison.set_index('metrics', inplace=True)

#comparison['diff'] = comparison['mine_okapi'] - comparison['bm25Okapi']
comparison.T.to_csv('comparison_nfcorpus.csv', index=False, mode = 'a')

In [35]:
print('Model better than bm25 for ' + str(comparison[comparison['diff'] > 0]['diff'].count()) + '/' + str(len(indexes))+ ' metrics')
print('Model worse than bm25 for ' + str(comparison[comparison['diff'] < 0]['diff'].count()) + '/' + str(len(indexes))+ ' metrics')
print('Model equal to bm25 for ' + str(comparison[comparison['diff'] == 0]['diff'].count()) + '/' + str(len(indexes))+ ' metrics')

Model better than bm25 for 5/24 metrics
Model worse than bm25 for 19/24 metrics
Model equal to bm25 for 0/24 metrics


## Finetuning

In [8]:
def objective(trial):
    # Suggest values for k1 and b
    #k1 = trial.suggest_float("k1", 1.2, 2.0)  # Adjust range as needed
    #b = trial.suggest_float("b", 0.0, 1.0)    # BM25's b typically lies between 0 and 1
    n_neighbors = trial.suggest_int("n_neighbors", 5, 100)
    alpha = trial.suggest_float("alpha", 0.3, 1.0)
    thresh = trial.suggest_float("thresh", 0.4, 1.0)
    #metric = trial.suggest_categorical("metric", ['cosine', 'euclidean'])
    metric = 'cosine'
    
    # Instantiate the retriever model with the suggested parameters
    retriever = retriever_model.FullRetriever(embeddings, n_neighbors = n_neighbors, alpha=alpha, thresh = thresh, metric = metric, k1 = 1.5, b = 0.75, coexistence_matrix = words_coexistence_matrix)
    retriever.fit(cleaned_corpus, is_clean=True)

    retriever_okapi = EvaluateRetrieval(retriever, score_function="cos_sim") # or "dot" if you wish dot-product
    results_okapi = retriever_okapi.retrieve(retriever.tokenized_corpus, queries)
    # Evaluate the model (implement your own evaluation logic, e.g., compute mean reciprocal rank)
    scores = retriever_okapi.evaluate(qrels, results_okapi, retriever_okapi.k_values)  # Replace this with your evaluation metric
    score = scores[0]['NDCG@10']

    ndcg_okapi, _map_okapi, recall_okapi, precision_okapi = scores
    indexes = list(ndcg_okapi.keys()) + list(_map_okapi.keys()) + list(recall_okapi.keys()) + list(precision_okapi.keys())

    values_mine_okapi = list(ndcg_okapi.values()) + list(_map_okapi.values()) + list(recall_okapi.values()) + list(precision_okapi.values())

    comparison = pd.DataFrame({'indexes':indexes, f'n_neighbors={n_neighbors} alpha={alpha} thresh={thresh} metric={metric}':values_mine_okapi})
    comparison.columns = ['metrics', f'n_neighbors={n_neighbors} alpha={alpha} thresh={thresh} metric={metric}']
    comparison.set_index('metrics', inplace=True)

    comparison.T.to_csv('comparison_nfcorpus.csv', index=True, mode = 'a', header=False)
    
    return score  # Higher score is better (Optuna minimizes by default)

study = optuna.create_study(direction="maximize")  # Use "minimize" if your score is a loss
study.optimize(objective, n_trials=100)

print(study.best_params)
print(study.best_value)

[I 2025-01-11 22:38:38,371] A new study created in memory with name: no-name-2364f87f-014c-4169-9c47-c1572b5f9bca
100%|██████████| 22156/22156 [32:07<00:00, 11.49it/s]   
100%|██████████| 3633/3633 [00:48<00:00, 75.36it/s] 
tests in progress: 100%|██████████| 323/323 [00:03<00:00, 91.99it/s] 
[I 2025-01-11 23:11:52,930] Trial 0 finished with value: 0.27838 and parameters: {'n_neighbors': 45, 'alpha': 0.8859024180037227, 'thresh': 0.7974211837509664}. Best is trial 0 with value: 0.27838.
100%|██████████| 22156/22156 [08:28<00:00, 43.57it/s]
100%|██████████| 3633/3633 [00:43<00:00, 83.05it/s] 
tests in progress: 100%|██████████| 323/323 [00:03<00:00, 100.36it/s]
[I 2025-01-11 23:21:21,751] Trial 1 finished with value: 0.30364 and parameters: {'n_neighbors': 51, 'alpha': 0.3946918808963106, 'thresh': 0.7789542766435167}. Best is trial 1 with value: 0.30364.
100%|██████████| 22156/22156 [08:18<00:00, 44.41it/s]
100%|██████████| 3633/3633 [00:44<00:00, 81.81it/s] 
tests in progress: 100%|██

{'n_neighbors': 73, 'alpha': 0.7618809756468575, 'thresh': 0.7543853066734008}
0.30843


In [11]:
'''
best_n_neighbors = study.best_params['n_neighbors']
best_alpha = study.best_params['alpha']
best_thresh = study.best_params['thresh']

best_n_neighbors, best_alpha, best_thresh
'''
best_n_neighbors = 73
best_alpha = 0.7618809756468575
best_thresh = 0.7543853066734008

In [12]:
retriever = retriever_model.FullRetriever(embeddings, n_neighbors = best_n_neighbors, alpha=best_alpha, thresh = best_thresh, metric = 'cosine', k1 = 1.5, b = 0.75, coexistence_matrix = words_coexistence_matrix)
retriever.fit(cleaned_corpus, is_clean=True)

retriever_okapi = EvaluateRetrieval(retriever, score_function="cos_sim") # or "dot" if you wish dot-product
results_okapi = retriever_okapi.retrieve(retriever.tokenized_corpus, queries)
# Evaluate the model (implement your own evaluation logic, e.g., compute mean reciprocal rank)
scores = retriever_okapi.evaluate(qrels, results_okapi, retriever_okapi.k_values)  # Replace this with your evaluation metric
scores

100%|██████████| 29451/29451 [15:20<00:00, 31.98it/s]
100%|██████████| 3633/3633 [00:09<00:00, 369.48it/s]
tests in progress: 100%|██████████| 323/323 [00:03<00:00, 87.54it/s] 


({'NDCG@1': 0.4257,
  'NDCG@3': 0.38534,
  'NDCG@5': 0.35501,
  'NDCG@10': 0.3296,
  'NDCG@100': 0.28155,
  'NDCG@1000': 0.34342},
 {'MAP@1': 0.05684,
  'MAP@3': 0.09297,
  'MAP@5': 0.10547,
  'MAP@10': 0.12257,
  'MAP@100': 0.14765,
  'MAP@1000': 0.1585},
 {'Recall@1': 0.05684,
  'Recall@3': 0.10235,
  'Recall@5': 0.12574,
  'Recall@10': 0.1589,
  'Recall@100': 0.25796,
  'Recall@1000': 0.46758},
 {'P@1': 0.43963,
  'P@3': 0.35913,
  'P@5': 0.29907,
  'P@10': 0.24149,
  'P@100': 0.06885,
  'P@1000': 0.01767})

In [44]:
from rank_bm25 import BM25Okapi, BM25L, BM25Plus
def objective(trial):
    # Suggest values for k1 and b
    k1 = trial.suggest_float("k1", 0.0, 2.0)  # Adjust range as needed
    b = trial.suggest_float("b", 0.0, 1.0)    # BM25's b typically lies between 0 and 1
    epsilon = trial.suggest_float("epsilon", 0.0, 1.0)
    #epsilon = 0.0
    #delta = 0.75

    # Instantiate the retriever model with the suggested parameters

    retriever.retriever.bm25_model = BM25Okapi(retriever.tokenized_corpus, k1=k1, b=b, epsilon=epsilon)
    retriever_okapi = EvaluateRetrieval(retriever, score_function="cos_sim") # or "dot" if you wish dot-product
    results_okapi = retriever_okapi.retrieve(retriever.tokenized_corpus, queries)
    # Evaluate the model (implement your own evaluation logic, e.g., compute mean reciprocal rank)
    score = retriever_okapi.evaluate(qrels, results_okapi, retriever_okapi.k_values)[0]['NDCG@10']  # Replace this with your evaluation metric

    return score  # Higher score is better (Optuna minimizes by default)

study = optuna.create_study(direction="maximize")  # Use "minimize" if your score is a loss
study.optimize(objective, n_trials=100)

print(study.best_params)
print(study.best_value)

[I 2025-01-13 21:47:08,012] A new study created in memory with name: no-name-16f7e08d-f51f-4803-9b7a-ba7e9dfc6298
tests in progress: 100%|██████████| 323/323 [00:02<00:00, 112.06it/s]
[I 2025-01-13 21:47:11,142] Trial 0 finished with value: 0.31755 and parameters: {'k1': 1.8905898179102507, 'b': 0.005728593236105861, 'epsilon': 0.8696500411402038}. Best is trial 0 with value: 0.31755.
tests in progress: 100%|██████████| 323/323 [00:02<00:00, 108.55it/s]
[I 2025-01-13 21:47:14,343] Trial 1 finished with value: 0.30584 and parameters: {'k1': 0.3443797360852101, 'b': 0.6003319036056877, 'epsilon': 0.0042630522135381765}. Best is trial 0 with value: 0.31755.
tests in progress: 100%|██████████| 323/323 [00:02<00:00, 111.26it/s]
[I 2025-01-13 21:47:17,478] Trial 2 finished with value: 0.31709 and parameters: {'k1': 1.065819738796474, 'b': 0.4518048659084476, 'epsilon': 0.3893223609095169}. Best is trial 0 with value: 0.31755.
tests in progress: 100%|██████████| 323/323 [00:02<00:00, 114.39it

{'k1': 1.9054211598230129, 'b': 0.5971507702716906, 'epsilon': 0.7992503732505253}
0.32375


In [16]:
from rank_bm25 import BM25Okapi, BM25L, BM25Plus

#retriever.retriever.bm25_model = BM25Okapi(retriever.tokenized_corpus, k1=study.best_params['k1'], b=study.best_params['b'], epsilon=study.best_params['epsilon'])
retriever.retriever.bm25_model = BM25Okapi(retriever.tokenized_corpus, k1=1.9, b=0.6, epsilon=0.4)

retriever_okapi = EvaluateRetrieval(retriever, score_function="cos_sim") # or "dot" if you wish dot-product
results_okapi = retriever_okapi.retrieve(retriever.tokenized_corpus, queries)
# Evaluate the model (implement your own evaluation logic, e.g., compute mean reciprocal rank)
scores = retriever_okapi.evaluate(qrels, results_okapi, retriever_okapi.k_values)  # Replace this with your evaluation metric
scores

tests in progress: 100%|██████████| 323/323 [00:03<00:00, 103.95it/s]


({'NDCG@1': 0.4226,
  'NDCG@3': 0.38452,
  'NDCG@5': 0.35531,
  'NDCG@10': 0.33212,
  'NDCG@100': 0.28225,
  'NDCG@1000': 0.3437},
 {'MAP@1': 0.05687,
  'MAP@3': 0.09342,
  'MAP@5': 0.10564,
  'MAP@10': 0.12351,
  'MAP@100': 0.14834,
  'MAP@1000': 0.15914},
 {'Recall@1': 0.05687,
  'Recall@3': 0.1027,
  'Recall@5': 0.12585,
  'Recall@10': 0.16076,
  'Recall@100': 0.25891,
  'Recall@1000': 0.46706},
 {'P@1': 0.43653,
  'P@3': 0.36017,
  'P@5': 0.30155,
  'P@10': 0.24427,
  'P@100': 0.06901,
  'P@1000': 0.01766})

# Anserini 

In [19]:
import os
 
# Définir JAVA_HOME dans le script Python
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk-23'  # Remplacez par votre chemin exact
os.environ['PATH'] = os.environ['JAVA_HOME'] + r'\bin;' + os.environ['PATH']

from pyserini.search.lucene import LuceneSearcher

Exception: Unable to create jni env, no jvm dll found.