In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

import beir 
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

from utils_func import matrix_creation, corpus_processing, clustering, retriever_model

import os

  from tqdm.autonotebook import tqdm


In [2]:
# Example corpus and queries (replace with your actual data)

dataset = "nfcorpus"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
data_path = util.download_and_unzip(url, "datasets")
data_path = f"datasets/{dataset}"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

100%|██████████| 3633/3633 [00:00<00:00, 12391.48it/s]


In [3]:
embeddings = pd.read_csv('word_vectors_nfcorpus_100dim.txt', sep=' ', header=None).dropna()
embeddings.columns = ['word'] + [str(i) for i in range(100)]
for i in list(embeddings.columns)[1:]:
    embeddings[i] = embeddings[i].astype(float)

embeddings = embeddings.set_index('word')

# Corpus cleaning

In [4]:
cleaned_corpus = corpus_processing.preprocess_corpus_dict(corpus)

100%|██████████| 3633/3633 [00:28<00:00, 126.88it/s]


# Matrix creation

In [None]:
#test = words_coexistence_probability({i:cleaned_corpus[i] for i in list(cleaned_corpus.keys())[:10]})
#words_coexistence_probability = matrix_creation.words_coexistence_probability
coexistence_matrix_path = 'words_coexistence_matrix_nfcorpus_test.csv'
if os.path.exists(coexistence_matrix_path):
    words_coexistence_matrix = pd.read_csv(coexistence_matrix_path, na_values=[''])
    words_coexistence_matrix.set_index('level_0', inplace=True)
else:
    words_coexistence_matrix = matrix_creation.words_coexistence_probability(cleaned_corpus)
    words_coexistence_matrix.to_csv('words_coexistence_matrix_nfcorpus_test.csv', index=False)

In [6]:
words_in_common = list(set(words_coexistence_matrix.columns).intersection(set(embeddings.index)))

In [7]:
#embeddings = embeddings.loc[words_in_common]
sim_mat = matrix_creation.get_similarity_matrix(embeddings[:], metric='euclidean', n_neighbors=20)

found 0 physical cores < 1
  File "c:\Users\maxim\anaconda3\envs\research_ir\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [8]:
def get_final_similarity_matrix(similarity_matrix, coexistence_matrix, alpha=0.5):
    sim_mat_vals = similarity_matrix.values
    coex_mat_vals = coexistence_matrix.values
    final_matrix = alpha*sim_mat_vals + (1-alpha)*coex_mat_vals
    return pd.DataFrame(final_matrix, index=similarity_matrix.index, columns=similarity_matrix.columns)

In [9]:
words_in_common = list(set(words_coexistence_matrix.columns) - (set(embeddings.index)))

In [10]:
words_in_common

['null']

In [None]:
replaceable_words = clustering.get_replaceable_words(sim_mat, words_coexistence_matrix, alpha=0.5, thresh=0.8)

100%|██████████| 29715/29715 [18:07<00:00, 27.33it/s]


In [16]:
word_graph = clustering.Graph(replaceable_words)

In [17]:
clusters = word_graph.find_all_cycles()
clusters

[{'subprimal'},
 {'polyethylene'},
 {'boundary'},
 {'myc'},
 {'secretion'},
 {'revisit'},
 {'ipss'},
 {'unbuilde'},
 {'2.3+/-2.3'},
 {'praldiet', 'praltotal'},
 {'8.18'},
 {'4.91'},
 {'delineating'},
 {'660'},
 {'acetal'},
 {'dyspnea'},
 {'0.25–0.58', '0.25–0.65'},
 {'or)=1.1', 'or=1.2', 'or=1.3', 'or=2.1', 'or=2.6', 'or=2.9'},
 {'nasogastric'},
 {'nct00438425'},
 {'surgeon'},
 {'p=0.035'},
 {'triggerfish'},
 {'undercounte', 'undercounting', 'underreporte'},
 {'8.6'},
 {'hydrophilic'},
 {'bioaccumulative'},
 {'bioreactor'},
 {'cloudy'},
 {'0.89–1.25', '0.89–1.43', '0.99–1.43'},
 {'sui'},
 {'0.177'},
 {'purchasing'},
 {'helicobacter', 'pylori'},
 {'donald'},
 {'burning'},
 {'mimolette'},
 {'658'},
 {'or=0.18', 'or=0.30', 'or=0.43', 'or]=0.21'},
 {'830'},
 {'sulfamethoxazole', 'trimethoprim'},
 {'enormously'},
 {'hale'},
 {'contraction'},
 {'purell'},
 {'25.1'},
 {'p=0.031'},
 {'hyperproliferation'},
 {'856'},
 {'necrotize'},
 {'résumé'},
 {'insulinotropic'},
 {'cvd-'},
 {'1,569'},
 {'ne

In [18]:
clusters.sort(key=lambda x: len(x), reverse=True)

In [19]:
clusters

[{'1,21',
  '1,30',
  '1,36',
  '1,39',
  '1,8',
  'asma',
  'confianza',
  'connus',
  'conocido',
  'diferencias',
  'différence',
  'd’asthme',
  'enfermedad',
  'familia',
  'familles',
  'hojas',
  'hombre',
  'hombres',
  'homme',
  'ic95',
  'légumes',
  'l’asthme',
  'marquées',
  'mujeres',
  'méthodes',
  'observaron',
  'padecer',
  'padecerla',
  'pequeña',
  'posibilidades',
  'prevalencia',
  'probabilidad',
  'proporción',
  'prévalence',
  'quelconque',
  'referencia',
  'riesgo',
  'sobre',
  'souffrir',
  'variacione'},
 {'ausschließt',
  'complianceraten',
  'daher',
  'denen',
  'erkrankungen',
  'etwa',
  'faktoren',
  'folgende',
  'führen',
  'gerste',
  'jedoch',
  'klinischen',
  'konzentriert',
  'konzepte',
  'lebensmittel',
  'mögliche',
  'möglicherweise',
  'nicht',
  'nichtdiätetische',
  'nichtdiätetischen',
  'resultiert',
  'roggen',
  'schlecht',
  'sehr',
  'selbst',
  'unsurprise',
  'werden',
  'zeigen',
  'übersicht'},
 {'0·35–0·75',
  '0·36–0·89'

In [21]:
clust_dict = clustering.clusters_dict(clusters)
clust_dict

{'familles': 0,
 'marquées': 0,
 'hombre': 0,
 'diferencias': 0,
 'prévalence': 0,
 'homme': 0,
 'connus': 0,
 'padecerla': 0,
 'asma': 0,
 'pequeña': 0,
 'conocido': 0,
 'quelconque': 0,
 'prevalencia': 0,
 'referencia': 0,
 'confianza': 0,
 '1,8': 0,
 'riesgo': 0,
 'posibilidades': 0,
 'familia': 0,
 'légumes': 0,
 'mujeres': 0,
 'observaron': 0,
 'probabilidad': 0,
 'différence': 0,
 'variacione': 0,
 'ic95': 0,
 '1,36': 0,
 'padecer': 0,
 '1,30': 0,
 'l’asthme': 0,
 'hombres': 0,
 'enfermedad': 0,
 'hojas': 0,
 'd’asthme': 0,
 '1,21': 0,
 'proporción': 0,
 'méthodes': 0,
 'souffrir': 0,
 'sobre': 0,
 '1,39': 0,
 'nichtdiätetischen': 1,
 'schlecht': 1,
 'daher': 1,
 'möglicherweise': 1,
 'gerste': 1,
 'klinischen': 1,
 'erkrankungen': 1,
 'konzentriert': 1,
 'roggen': 1,
 'mögliche': 1,
 'lebensmittel': 1,
 'führen': 1,
 'nicht': 1,
 'konzepte': 1,
 'resultiert': 1,
 'nichtdiätetische': 1,
 'werden': 1,
 'unsurprise': 1,
 'faktoren': 1,
 'jedoch': 1,
 'übersicht': 1,
 'ausschließt':

In [36]:
#rewritten = rewrite_corpus({key:cleaned_corpus[key] for key in tqdm(list(cleaned_corpus.keys())[:10])}, clust_dict)
rewritten = clustering.rewrite_corpus(cleaned_corpus, clust_dict)

  0%|          | 0/3633 [00:00<?, ?it/s]

100%|██████████| 3633/3633 [00:13<00:00, 266.17it/s]


In [37]:
rewritten

{'MED-10': '18579 10107 23379 24597 4782 13935 9744 17488 1506 4489 7086 14440 22682 4489 9823 23131 14561 18579 3200 19787 21351 23972 286 286 8558 286 23957 8196 21369 6686 25852 14578 24597 4782 1986 1514 286 27248 2674 17452 13771 8196 4505 16463 10469 3961 16521 286 24597 4782 18366 9313 18579 14426 286 9744 5423 22352 1506 286 24597 4782 2937 286 4489 1506 18853 6168 18665 26568 24597 4782 2937 286 14440 17839 27594 17222 12296 14847 7086 286 15865 4782 1321 14450 2674 18579 10107 27449 23379 10191 286 26728 16444 12800 7086 9744 13090 21566 23573 10469 10107 286 4720 15198 11995 12483 15977 3741 22388 8196 9313 18579 14426 12443 18579 10107 24056 18323 22582 14162 9744 9857 286 15145 14415 9823 10107 18579 17839 286 21132 18856 12126 286 7449 7804 10191 286 26728 6094 7189 7804 15866 14415 11422 286 4211 2649 26842 16444 4972 3741 24597 4782 10191 26982 24451 17270 23971 12833 23379 15753 7789 12660 19823 21592 23379 3117 21592 18579 10107 16444 10159 12443 11484 16521 286 24597

# Retrieval

## BM25

In [161]:
from rank_bm25 import BM25Okapi

class BM25_model:
  def __init__(self, corpus, k1=0.9, b=0.4):
    cleaned_corpus = corpus_processing.preprocess_corpus_dict(corpus)
    self.tokenized_corpus = [cleaned_corpus[key].split(" ") for key in corpus.keys()]
    self.bm25_model = BM25Okapi(self.tokenized_corpus, k1=k1, b=b)
    self.keys = list(corpus.keys())

  def search(self, corpus: dict[str, dict[str, str]], queries: dict[str, str], top_k: int, score_function,**kwargs) -> dict[str, dict[str, float]]:
    results = {}
    for query_id, query in tqdm(queries.items(), desc="tests in progress"):
        # Process the query
        #cleaned_query = preprocess_corpus([query])
        cleaned_query = corpus_processing.clean_tokens(corpus_processing.nlp(query))
        tokenized_query = cleaned_query.split(" ")
        # Apply BM25 to get scores
        scores = self.bm25_model.get_scores(tokenized_query)
        # Sort the scores in descending order and save the results
        ordered_keys_index = np.argsort(scores)[::-1][:top_k]
        sorted_scores = {self.keys[i] : scores[i] for i in ordered_keys_index}
        results[query_id] = sorted_scores
    return results
  

In [162]:
model_bm25Okapi = BM25_model(corpus)




[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A




In [163]:
retriever_bm25Okapi = EvaluateRetrieval(model_bm25Okapi, score_function="cos_sim") # or "dot" if you wish dot-product

#### Retrieve dense results (format of results is identical to qrels)
#results = retriever.retrieve(corpus_to_give, {list(queries.keys())[i]:queries[list(queries.keys())[i]] for i in range(50)})
results_bm25Okapi = retriever_bm25Okapi.retrieve(model_bm25Okapi.tokenized_corpus, queries)

#### Evaluate your retrieval using NDCG@k, MAP@K ...
ndcg_bm25Okapi, _map_bm25Okapi, recall_bm25Okapi, precision_bm25Okapi = retriever_bm25Okapi.evaluate(qrels, results_bm25Okapi, retriever_bm25Okapi.k_values)




[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


tests in progress: 100%|██████████| 323/323 [00:03<00:00, 101.37it/s]


In [164]:
ndcg_bm25Okapi, _map_bm25Okapi, recall_bm25Okapi, precision_bm25Okapi

({'NDCG@1': 0.39628,
  'NDCG@3': 0.35956,
  'NDCG@5': 0.3313,
  'NDCG@10': 0.2997,
  'NDCG@100': 0.25972,
  'NDCG@1000': 0.32634},
 {'MAP@1': 0.05118,
  'MAP@3': 0.08803,
  'MAP@5': 0.09692,
  'MAP@10': 0.11152,
  'MAP@100': 0.13345,
  'MAP@1000': 0.14411},
 {'Recall@1': 0.05118,
  'Recall@3': 0.09922,
  'Recall@5': 0.11237,
  'Recall@10': 0.14359,
  'Recall@100': 0.24364,
  'Recall@1000': 0.46633},
 {'P@1': 0.40557,
  'P@3': 0.33333,
  'P@5': 0.27988,
  'P@10': 0.21517,
  'P@100': 0.06443,
  'P@1000': 0.01779})

## Mine

In [165]:
class Mine:
  def __init__(self, corpus:dict[str, str], clusters_dict:dict[str,str], k1:float=0.9, b:float=0.4):
    cleaned_corpus = corpus
    self.tokenized_corpus = [cleaned_corpus[key].split(" ") for key in corpus.keys()]
    self.bm25_model = BM25Okapi(self.tokenized_corpus, k1=k1, b=b)    
    self.keys = list(corpus.keys())
    self.clusters_dict = clusters_dict


  def search(self, corpus: dict[str, dict[str, str]], queries: dict[str, str], top_k: int, score_function,**kwargs) -> dict[str, dict[str, float]]:
    results = {}
    for query_id, query in tqdm(queries.items(), desc="tests in progress"):
        # Process the query
        #cleaned_query = preprocess_corpus([query])
        cleaned_query = corpus_processing.clean_tokens(corpus_processing.nlp(query))
        cleaned_query = rewrite_text(cleaned_query, self.clusters_dict)
        tokenized_query = cleaned_query.split(" ")
        # Apply BM25 to get scores
        scores = self.bm25_model.get_scores(tokenized_query)
        # Sort the scores in descending order and save the results
        ordered_keys_index = np.argsort(scores)[::-1][:top_k]
        sorted_scores = {self.keys[i] : scores[i] for i in ordered_keys_index}
        results[query_id] = sorted_scores
    return results

In [166]:
model_okapi = Mine(rewritten, clust_dict)

In [167]:
retriever_okapi = EvaluateRetrieval(model_okapi, score_function="cos_sim") # or "dot" if you wish dot-product

#### Retrieve dense results (format of results is identical to qrels)
#results = retriever.retrieve(corpus_to_give, {list(queries.keys())[i]:queries[list(queries.keys())[i]] for i in range(50)})
results_okapi = retriever_okapi.retrieve(model_okapi.tokenized_corpus, queries)

#### Evaluate your retrieval using NDCG@k, MAP@K ...
ndcg_okapi, _map_okapi, recall_okapi, precision_okapi = retriever_okapi.evaluate(qrels, results_okapi, retriever_okapi.k_values)





[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


tests in progress: 100%|██████████| 323/323 [00:04<00:00, 69.25it/s]


In [168]:
ndcg_okapi, _map_okapi, recall_okapi, precision_okapi

({'NDCG@1': 0.39628,
  'NDCG@3': 0.35885,
  'NDCG@5': 0.33155,
  'NDCG@10': 0.30002,
  'NDCG@100': 0.26025,
  'NDCG@1000': 0.32704},
 {'MAP@1': 0.05118,
  'MAP@3': 0.08781,
  'MAP@5': 0.09708,
  'MAP@10': 0.11172,
  'MAP@100': 0.13369,
  'MAP@1000': 0.14436},
 {'Recall@1': 0.05118,
  'Recall@3': 0.09865,
  'Recall@5': 0.11335,
  'Recall@10': 0.14447,
  'Recall@100': 0.24481,
  'Recall@1000': 0.46755},
 {'P@1': 0.40557,
  'P@3': 0.3323,
  'P@5': 0.27988,
  'P@10': 0.21517,
  'P@100': 0.06437,
  'P@1000': 0.01782})

In [169]:
indexes = list(ndcg_okapi.keys()) + list(_map_okapi.keys()) + list(recall_okapi.keys()) + list(precision_okapi.keys())

values_mine_okapi = list(ndcg_okapi.values()) + list(_map_okapi.values()) + list(recall_okapi.values()) + list(precision_okapi.values())
values_bm25Okapi = list(ndcg_bm25Okapi.values()) + list(_map_bm25Okapi.values()) + list(recall_bm25Okapi.values()) + list(precision_bm25Okapi.values())

comparison = pd.DataFrame({'indexes':indexes, 'mine_okapi':values_mine_okapi, 'bm25Okapi':values_bm25Okapi})
comparison.columns = ['metrics', 'mine_okapi', 'bm25Okapi']
comparison.set_index('metrics', inplace=True)

comparison['diff'] = comparison['mine_okapi'] - comparison['bm25Okapi']
comparison

Unnamed: 0_level_0,mine_okapi,bm25Okapi,diff
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NDCG@1,0.39628,0.39628,0.0
NDCG@3,0.35885,0.35956,-0.00071
NDCG@5,0.33155,0.3313,0.00025
NDCG@10,0.30002,0.2997,0.00032
NDCG@100,0.26025,0.25972,0.00053
NDCG@1000,0.32704,0.32634,0.0007
MAP@1,0.05118,0.05118,0.0
MAP@3,0.08781,0.08803,-0.00022
MAP@5,0.09708,0.09692,0.00016
MAP@10,0.11172,0.11152,0.0002


In [170]:
print('Model better than bm25 for ' + str(comparison[comparison['diff'] > 0]['diff'].count()) + '/' + str(len(indexes))+ ' metrics')
print('Model worse than bm25 for ' + str(comparison[comparison['diff'] < 0]['diff'].count()) + '/' + str(len(indexes))+ ' metrics')
print('Model equal to bm25 for ' + str(comparison[comparison['diff'] == 0]['diff'].count()) + '/' + str(len(indexes))+ ' metrics')

Model better than bm25 for 13/24 metrics
Model worse than bm25 for 5/24 metrics
Model equal to bm25 for 6/24 metrics
