In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

import beir 
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

from utils_func import corpus_processing, matrix_creation, clustering, retriever_model

import os

  from tqdm.autonotebook import tqdm


In [2]:
# Example corpus and queries (replace with your actual data)

dataset = "nfcorpus"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
data_path = util.download_and_unzip(url, "datasets")
data_path = f"datasets/{dataset}"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

  0%|          | 0/3633 [00:00<?, ?it/s]

100%|██████████| 3633/3633 [00:00<00:00, 63904.26it/s]


In [3]:
import io

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in tqdm(fin):
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
        data[tokens[0]] = list(data[tokens[0]])
    return data

# Load the vectors
vectors = load_vectors("wiki-news-300d-1M-subword.vec/wiki-news-300d-1M-subword.vec")

999994it [02:24, 6940.90it/s] 


In [4]:
embeddings = pd.DataFrame(vectors).T

In [None]:
from sklearn.decomposition import PCA

vals = embeddings.values
pca = PCA(n_components=100)
pca.fit(vals)
pca_vals = pca.transform(vals)

embeddings = pd.DataFrame(pca_vals, index=embeddings.index)

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
",",0.0203,-0.0123,-0.0076,0.0187,0.0173,-0.0052,0.0147,0.1364,-0.0435,0.0033,...,0.0043,0.0269,-0.0427,-0.0267,0.0277,0.0025,0.0063,-0.0154,0.0306,0.0047
the,0.0129,0.0026,0.0098,0.0063,0.0102,-0.0002,-0.0056,-0.0829,-0.0420,0.0064,...,-0.0066,0.0213,0.0053,0.0097,-0.0035,-0.0046,-0.0099,-0.0186,0.0098,0.0019
.,0.0049,-0.0030,0.0672,0.0136,-0.0624,0.0077,-0.0257,0.2816,-0.1042,-0.0288,...,-0.0015,0.0286,-0.0633,0.0325,-0.0189,-0.0202,-0.0086,-0.0458,-0.0115,0.0011
and,0.0012,-0.0021,-0.0039,0.0048,0.0152,0.0051,0.0023,-0.0461,-0.0189,0.0139,...,-0.0093,0.0114,0.0190,0.0039,-0.0019,0.0004,0.0060,-0.0001,0.0021,-0.0117
of,0.0045,-0.0415,0.0522,0.0169,-0.0966,-0.0374,-0.0040,-0.1236,0.0400,0.0147,...,-0.0013,0.0072,0.0099,-0.0093,-0.0058,0.0045,-0.0028,-0.0119,-0.0263,0.0068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
whitespotted,-0.0049,-0.0013,-0.0015,-0.0045,-0.0073,0.0080,-0.0059,-0.0295,0.0029,0.0038,...,-0.0202,0.0085,-0.0108,-0.0093,0.0108,-0.0013,-0.0049,0.0069,0.0012,0.0048
sacoglossan,-0.0111,-0.0081,-0.0072,0.0027,-0.0014,-0.0063,-0.0251,-0.0162,-0.0109,0.0163,...,0.0055,0.0105,-0.0023,-0.0353,0.0230,-0.0167,-0.0042,0.0021,-0.0069,0.0231
Iseya,-0.0024,0.0045,0.0041,0.0090,0.0080,-0.0298,-0.0222,-0.0167,0.0017,0.0124,...,-0.0063,0.0210,-0.0015,-0.0305,0.0220,0.0022,0.0273,0.0049,0.0088,0.0042
Bayyah,-0.0142,-0.0066,-0.0136,0.0116,-0.0025,-0.0201,-0.0132,-0.0206,0.0020,0.0122,...,0.0176,-0.0076,0.0286,-0.0027,0.0066,-0.0031,0.0216,-0.0032,-0.0098,0.0093


In [None]:
embeddings = pd.read_csv('wiki-news-300d-1M-subword.vec/wiki-news-300d-1M-subword.vec', sep=' ', header=None).dropna()
embeddings.columns = ['word'] + [str(i) for i in range(300)]
for i in list(embeddings.columns)[1:]:
    embeddings[i] = embeddings[i].astype(float)

embeddings = embeddings.set_index('word')

# Corpus cleaning

In [5]:
cleaned_corpus = corpus_processing.preprocess_corpus_dict(corpus)

100%|██████████| 3633/3633 [00:34<00:00, 104.01it/s]


# Matrix creation

In [7]:
#test = words_coexistence_probability({i:cleaned_corpus[i] for i in list(cleaned_corpus.keys())[:10]})
#words_coexistence_probability = matrix_creation.words_coexistence_probability
coexistence_matrix_path = 'words_coexistence_matrix_nfcorpus_test.csv'
if os.path.exists(coexistence_matrix_path):
    words_coexistence_matrix = pd.read_csv(coexistence_matrix_path, na_values=[''])
    words_coexistence_matrix.set_index('level_0', inplace=True)
else:
    words_coexistence_matrix = matrix_creation.words_coexistence_probability(cleaned_corpus)
    words_coexistence_matrix.to_csv('words_coexistence_matrix_nfcorpus_test.csv', index=False)

In [8]:
words_in_common = list(set(words_coexistence_matrix.columns).intersection(set(embeddings.index)))

In [9]:
embeddings = embeddings.loc[words_in_common]
sim_mat = matrix_creation.get_similarity_matrix(embeddings[:], metric='euclidean', n_neighbors=20)

found 0 physical cores < 1
  File "c:\Users\maxim\anaconda3\envs\research_ir\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [10]:
def get_final_similarity_matrix(similarity_matrix, coexistence_matrix, alpha=0.5):
    sim_mat_vals = similarity_matrix.values
    coex_mat_vals = coexistence_matrix.values
    final_matrix = alpha*sim_mat_vals + (1-alpha)*coex_mat_vals
    return pd.DataFrame(final_matrix, index=similarity_matrix.index, columns=similarity_matrix.columns)

In [11]:
words_in_common = list(set(words_coexistence_matrix.columns).intersection(set(embeddings.index)))

In [12]:
words_in_common

['homa-%b',
 'trend)=0.018',
 'p=.03',
 'p=0.04',
 'taqman',
 'diphyllobothriidea',
 '80.3+/-11.3',
 'nonpickle',
 'btub',
 'chlorophenyl)ethane',
 'oestrogen+progestin',
 'antiadhesion',
 'turmerone',
 'free+sulfate',
 'hrql',
 "3',5,7",
 'smr=1.8',
 'auxotrophy',
 '1.40%-3.64',
 '0·45',
 'ipss',
 'p=0·07',
 'phytoestrogens)-and',
 'n’est',
 'd=1.12',
 'intake/1,000',
 'benzo(alpha)pyrene',
 'fbr',
 'δsh4',
 'mmp2',
 'glucosylrutinoside',
 'py1068',
 'h-6',
 'hba(1c',
 'cycle‐ergometer',
 "cow's",
 'ohsaki',
 '36.64',
 'hualien',
 '−0.065',
 'smsfa',
 '17/30',
 'inoreal',
 '0·70',
 'oligofructose',
 'tetg',
 '1.998',
 'prp.',
 'dihomo',
 'or=1.41',
 'arrhenius',
 'j96',
 'ntg',
 '0.813',
 'transmethylation',
 'octylphenol',
 '137,682',
 'heteroresistance',
 'adenomateous',
 'b(12)-enriche',
 '.0016',
 'mrc5',
 'cvd)-free',
 'ptrend=0.02',
 '0.482',
 'malassezia',
 'p=.040',
 '35,698',
 '121.41',
 'x(l',
 'mmp)-3',
 '47,896',
 'citratus',
 'cd161(+',
 'pwawg',
 'phospho',
 'nonadvanced

In [12]:
replaceable_words = clustering.get_replaceable_words(sim_mat, words_coexistence_matrix, alpha=0.5, thresh=0.8)

100%|██████████| 20270/20270 [17:23<00:00, 19.42it/s]   


In [13]:
word_graph = clustering.Graph(replaceable_words)

In [14]:
clusters = word_graph.find_all_cycles()
clusters

[{'recognize'},
 {'arabinose'},
 {'sons'},
 {'dendrogram'},
 {'administration'},
 {'distillation'},
 {'1.17'},
 {'upwind'},
 {'crippling'},
 {'english'},
 {'epsilon'},
 {'mustard'},
 {'swim'},
 {'liberated'},
 {'latinos'},
 {'916'},
 {'white-'},
 {'water'},
 {'pes'},
 {'epigastric'},
 {'least'},
 {'equally'},
 {'underestimation'},
 {'-4.5'},
 {'2240', '4130', '4240'},
 {'balbisiana', 'superbum'},
 {'giving'},
 {'spur'},
 {'fibrosis'},
 {'cellulite'},
 {'unabsorbed'},
 {'medroxyprogesterone'},
 {'immunologically'},
 {'1366'},
 {'flavonoid'},
 {'78.3'},
 {'saving'},
 {'infectivity'},
 {'ambulance'},
 {'undergoes'},
 {'strong'},
 {'sub'},
 {'806', '812'},
 {'thymic'},
 {'careful'},
 {'aspects'},
 {'constricted'},
 {'educate'},
 {'inconsistent'},
 {'photometry'},
 {'logic'},
 {'5066'},
 {'crypt'},
 {'alliance'},
 {'globe'},
 {'accroître',
  'comme',
  'communauté',
  'consulté',
  'contre',
  'contrôle',
  'efficace',
  'entraîné',
  'essentielle',
  'indiquant',
  'intervenant',
  'liées'

In [15]:
clusters.sort(key=lambda x: len(x), reverse=True)

In [16]:
clusters

[{'accru',
  'actuellement',
  'adulte',
  'adultos',
  'ainsi',
  'algunos',
  'algún',
  'altas',
  'alto',
  'amplia',
  'ans',
  'análisis',
  'asma',
  'asociaciones',
  'año',
  'bajo',
  'carne',
  'cas',
  'casos',
  'compte',
  'conclusión',
  'confianza',
  'connus',
  'conocido',
  'consumo',
  'contexte',
  'corporelle',
  'corregir',
  'del',
  'diario',
  'diferencias',
  'différence',
  'diversos',
  'edad',
  'encuesta',
  'enfermedad',
  'enquête',
  'entre',
  'estimer',
  'estimé',
  'estudio',
  'examiné',
  'excepción',
  'existen',
  'facteur',
  'familia',
  'familles',
  'feuille',
  'fois',
  'fruta',
  'fue',
  'fueron',
  'función',
  'geográficas',
  'géographique',
  'hebdomadaire',
  'hojas',
  'hombre',
  'hombres',
  'homme',
  'igual',
  'inclus',
  'inde',
  'indiens',
  'lait',
  'largement',
  'leche',
  'logistique',
  'légumes',
  'marco',
  'marquées',
  'masa',
  'mediante',
  'menor',
  'momento',
  'mujeres',
  'más',
  'méthodes',
  'métodos',

In [17]:
clust_dict = clustering.clusters_dict(clusters)
clust_dict

{'hebdomadaire': 0,
 'régression': 0,
 'parmi': 0,
 'âgés': 0,
 'análisis': 0,
 'fueron': 0,
 'géographique': 0,
 'resumen': 0,
 'tra': 0,
 'participaron': 0,
 'métodos': 0,
 'méthodes': 0,
 'mediante': 0,
 'algunos': 0,
 'igual': 0,
 'seulement': 0,
 'feuille': 0,
 'parte': 0,
 'inde': 0,
 'poulet': 0,
 'région': 0,
 'hojas': 0,
 'diferencias': 0,
 'mujeres': 0,
 'hombres': 0,
 'marquées': 0,
 'zonas': 0,
 'referencia': 0,
 'verde': 0,
 'tercera': 0,
 'del': 0,
 'nous': 0,
 'posibilidades': 0,
 'examiné': 0,
 'pollo': 0,
 'fue': 0,
 'indiens': 0,
 'hombre': 0,
 'enquête': 0,
 'asma': 0,
 'marco': 0,
 'légumes': 0,
 'élevés': 0,
 'fruta': 0,
 'amplia': 0,
 'una': 0,
 'excepción': 0,
 'año': 0,
 'menor': 0,
 'estudio': 0,
 'geográficas': 0,
 'connus': 0,
 'altas': 0,
 'petite': 0,
 'masa': 0,
 'casos': 0,
 'inclus': 0,
 'función': 0,
 'différence': 0,
 'estimé': 0,
 'momento': 0,
 'productos': 0,
 'utilisé': 0,
 'riesgo': 0,
 'fois': 0,
 'existen': 0,
 'facteur': 0,
 'urbanas': 0,
 'índ

In [18]:
#rewritten = rewrite_corpus({key:cleaned_corpus[key] for key in tqdm(list(cleaned_corpus.keys())[:10])}, clust_dict)
rewritten = clustering.rewrite_corpus(cleaned_corpus, clust_dict)

100%|██████████| 3633/3633 [01:10<00:00, 51.67it/s]


In [19]:
rewritten

{'MED-10': '12139 4821 428 1824 13523 13946 5915 6899 1488 1879 8649 13744 4103 1879 7846 3760 16212 12139 4606 14974 18629 3972 138 138 7116 138 7042 4178 15241 12464 17169 10281 1824 13523 14704 6167 138 14531 2638 18215 14999 4178 10258 16564 17307 7597 8236 138 1824 13523 15894 16074 12139 7864 138 5915 17103 16405 1488 138 1824 13523 1358 138 1879 1488 2722 8247 14186 19028 1824 13523 1358 138 13744 5690 1134 3325 8649 138 7327 13523 6864 1142 2638 12139 4821 5514 428 11993 138 6387 18966 15137 8649 5915 6036 6466 18509 17307 4821 138 11507 6336 4009 4226 16293 5084 6612 4178 16074 12139 7864 4477 12139 4821 18363 9179 13218 6710 5915 9741 138 9236 11502 7846 4821 12139 5690 138 14129 9421 17305 138 13691 5189 11993 138 6387 16860 5189 5245 11502 10716 138 16761 3346 8181 18966 9864 5084 1824 13523 11993 3824 428 17211 6524 17244 428 2232 13418 4123 13305 3141 428 11229 3141 12139 4821 18966 15595 4477 11021 8236 138 1824 13523 15894 1091 12983 3754 2680 428 1091 10015 3754 2680 1

# Retrieval

## BM25

In [20]:
from rank_bm25 import BM25Okapi

class BM25_model:
  def __init__(self, corpus, k1=0.9, b=0.4):
    cleaned_corpus = corpus_processing.preprocess_corpus_dict(corpus)
    self.tokenized_corpus = [cleaned_corpus[key].split(" ") for key in corpus.keys()]
    self.bm25_model = BM25Okapi(self.tokenized_corpus, k1=k1, b=b)
    self.keys = list(corpus.keys())

  def search(self, corpus: dict[str, dict[str, str]], queries: dict[str, str], top_k: int, score_function,**kwargs) -> dict[str, dict[str, float]]:
    results = {}
    for query_id, query in tqdm(queries.items(), desc="tests in progress"):
        # Process the query
        #cleaned_query = preprocess_corpus([query])
        cleaned_query = corpus_processing.clean_tokens(corpus_processing.nlp(query))
        tokenized_query = cleaned_query.split(" ")
        # Apply BM25 to get scores
        scores = self.bm25_model.get_scores(tokenized_query)
        # Sort the scores in descending order and save the results
        ordered_keys_index = np.argsort(scores)[::-1][:top_k]
        sorted_scores = {self.keys[i] : scores[i] for i in ordered_keys_index}
        results[query_id] = sorted_scores
    return results
  

In [None]:
model_bm25Okapi = BM25_model(corpus, k1=1.5, b=0.75)

  0%|          | 0/3633 [00:00<?, ?it/s]

In [163]:
retriever_bm25Okapi = EvaluateRetrieval(model_bm25Okapi, score_function="cos_sim") # or "dot" if you wish dot-product

#### Retrieve dense results (format of results is identical to qrels)
#results = retriever.retrieve(corpus_to_give, {list(queries.keys())[i]:queries[list(queries.keys())[i]] for i in range(50)})
results_bm25Okapi = retriever_bm25Okapi.retrieve(model_bm25Okapi.tokenized_corpus, queries)

#### Evaluate your retrieval using NDCG@k, MAP@K ...
ndcg_bm25Okapi, _map_bm25Okapi, recall_bm25Okapi, precision_bm25Okapi = retriever_bm25Okapi.evaluate(qrels, results_bm25Okapi, retriever_bm25Okapi.k_values)




[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


tests in progress: 100%|██████████| 323/323 [00:03<00:00, 101.37it/s]


In [164]:
ndcg_bm25Okapi, _map_bm25Okapi, recall_bm25Okapi, precision_bm25Okapi

({'NDCG@1': 0.39628,
  'NDCG@3': 0.35956,
  'NDCG@5': 0.3313,
  'NDCG@10': 0.2997,
  'NDCG@100': 0.25972,
  'NDCG@1000': 0.32634},
 {'MAP@1': 0.05118,
  'MAP@3': 0.08803,
  'MAP@5': 0.09692,
  'MAP@10': 0.11152,
  'MAP@100': 0.13345,
  'MAP@1000': 0.14411},
 {'Recall@1': 0.05118,
  'Recall@3': 0.09922,
  'Recall@5': 0.11237,
  'Recall@10': 0.14359,
  'Recall@100': 0.24364,
  'Recall@1000': 0.46633},
 {'P@1': 0.40557,
  'P@3': 0.33333,
  'P@5': 0.27988,
  'P@10': 0.21517,
  'P@100': 0.06443,
  'P@1000': 0.01779})

## Mine

In [166]:
model_okapi = retriever_model.Retriever(rewritten, clust_dict, k1= 1.5, b=0.75)

In [167]:
retriever_okapi = EvaluateRetrieval(model_okapi, score_function="cos_sim") # or "dot" if you wish dot-product

#### Retrieve dense results (format of results is identical to qrels)
#results = retriever.retrieve(corpus_to_give, {list(queries.keys())[i]:queries[list(queries.keys())[i]] for i in range(50)})
results_okapi = retriever_okapi.retrieve(model_okapi.tokenized_corpus, queries)

#### Evaluate your retrieval using NDCG@k, MAP@K ...
ndcg_okapi, _map_okapi, recall_okapi, precision_okapi = retriever_okapi.evaluate(qrels, results_okapi, retriever_okapi.k_values)





[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


tests in progress: 100%|██████████| 323/323 [00:04<00:00, 69.25it/s]


In [168]:
ndcg_okapi, _map_okapi, recall_okapi, precision_okapi

({'NDCG@1': 0.39628,
  'NDCG@3': 0.35885,
  'NDCG@5': 0.33155,
  'NDCG@10': 0.30002,
  'NDCG@100': 0.26025,
  'NDCG@1000': 0.32704},
 {'MAP@1': 0.05118,
  'MAP@3': 0.08781,
  'MAP@5': 0.09708,
  'MAP@10': 0.11172,
  'MAP@100': 0.13369,
  'MAP@1000': 0.14436},
 {'Recall@1': 0.05118,
  'Recall@3': 0.09865,
  'Recall@5': 0.11335,
  'Recall@10': 0.14447,
  'Recall@100': 0.24481,
  'Recall@1000': 0.46755},
 {'P@1': 0.40557,
  'P@3': 0.3323,
  'P@5': 0.27988,
  'P@10': 0.21517,
  'P@100': 0.06437,
  'P@1000': 0.01782})

In [169]:
indexes = list(ndcg_okapi.keys()) + list(_map_okapi.keys()) + list(recall_okapi.keys()) + list(precision_okapi.keys())

values_mine_okapi = list(ndcg_okapi.values()) + list(_map_okapi.values()) + list(recall_okapi.values()) + list(precision_okapi.values())
values_bm25Okapi = list(ndcg_bm25Okapi.values()) + list(_map_bm25Okapi.values()) + list(recall_bm25Okapi.values()) + list(precision_bm25Okapi.values())

comparison = pd.DataFrame({'indexes':indexes, 'mine_okapi':values_mine_okapi, 'bm25Okapi':values_bm25Okapi})
comparison.columns = ['metrics', 'mine_okapi', 'bm25Okapi']
comparison.set_index('metrics', inplace=True)

comparison['diff'] = comparison['mine_okapi'] - comparison['bm25Okapi']
comparison

Unnamed: 0_level_0,mine_okapi,bm25Okapi,diff
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NDCG@1,0.39628,0.39628,0.0
NDCG@3,0.35885,0.35956,-0.00071
NDCG@5,0.33155,0.3313,0.00025
NDCG@10,0.30002,0.2997,0.00032
NDCG@100,0.26025,0.25972,0.00053
NDCG@1000,0.32704,0.32634,0.0007
MAP@1,0.05118,0.05118,0.0
MAP@3,0.08781,0.08803,-0.00022
MAP@5,0.09708,0.09692,0.00016
MAP@10,0.11172,0.11152,0.0002


In [170]:
print('Model better than bm25 for ' + str(comparison[comparison['diff'] > 0]['diff'].count()) + '/' + str(len(indexes))+ ' metrics')
print('Model worse than bm25 for ' + str(comparison[comparison['diff'] < 0]['diff'].count()) + '/' + str(len(indexes))+ ' metrics')
print('Model equal to bm25 for ' + str(comparison[comparison['diff'] == 0]['diff'].count()) + '/' + str(len(indexes))+ ' metrics')

Model better than bm25 for 13/24 metrics
Model worse than bm25 for 5/24 metrics
Model equal to bm25 for 6/24 metrics


({'NDCG@1': 0.39628,
  'NDCG@3': 0.35885,
  'NDCG@5': 0.33155,
  'NDCG@10': 0.30002,
  'NDCG@100': 0.26025,
  'NDCG@1000': 0.32704},
 {'MAP@1': 0.05118,
  'MAP@3': 0.08781,
  'MAP@5': 0.09708,
  'MAP@10': 0.11172,
  'MAP@100': 0.13369,
  'MAP@1000': 0.14436},
 {'Recall@1': 0.05118,
  'Recall@3': 0.09865,
  'Recall@5': 0.11335,
  'Recall@10': 0.14447,
  'Recall@100': 0.24481,
  'Recall@1000': 0.46755},
 {'P@1': 0.40557,
  'P@3': 0.3323,
  'P@5': 0.27988,
  'P@10': 0.21517,
  'P@100': 0.06437,
  'P@1000': 0.01782})