>[Libraries](#scrollTo=DM9Y5EEq50cW)

>[Read data set](#scrollTo=jiNJjv1y00kj)

>[Get familiar with the data](#scrollTo=P6y-bjBZ03vX)

>[First part of assignment](#scrollTo=NXjb0Y_3GWG2)

>>[BM25](#scrollTo=H0COPLtn08Uo)

>>>[Download data](#scrollTo=A_66bkIIjTxh)

>>[all-MiniLM-L6-v2](#scrollTo=F8KsrWcV40iV)

>>>[Download data](#scrollTo=B63zRXr6j_BU)

>>[Ground truth MIPS](#scrollTo=cFJMSZVIKo3Y)

>>>[Download data](#scrollTo=nWRvsjsBOK3y)

>[Second part of assignment](#scrollTo=XKLI5Dqnigl7)

>>[TAAT for sparse vectors](#scrollTo=69vobk4CjAOi)

>>>[Download data](#scrollTo=6YpQ0AXe5axN)

>>[FAISS for dense vectors](#scrollTo=HqWRiD5733zb)

>>>[Download data](#scrollTo=-_BbpufnH8E7)

>>[Ground truth](#scrollTo=uLVmYvFLLwUJ)

>>>[Download data](#scrollTo=4ID2VxF-LwUQ)



# Libraries

In [None]:
# hide output
%%capture 
!pip install beir
!pip install -U sentence-transformers
!pip install faiss-gpu
#!pip install faiss-cpu

# Read data set

In [None]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader

# name of dataset
dataset = "scifact"
# url of dataset 
URL = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
# donwload the dataset and put in a specific folder
data_path = util.download_and_unzip(URL, "dataset")
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
# remove zip
!rm ./dataset/*.zip

  from tqdm.autonotebook import tqdm


dataset/scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

  0%|          | 0/5183 [00:00<?, ?it/s]

# Get familiar with the data

In [None]:
# number of query ids that appear in the relevance judgments (qrels) of the test split
len(qrels)

300

In [None]:
# show only a few (query_id, {doc_id: relevance}) pairs instead of dumping the whole dictionary
list(qrels.items())[:5]

dict_items([('1', {'31715818': 1}), ('3', {'14717500': 1}), ('5', {'13734012': 1}), ('13', {'1606628': 1}), ('36', {'5152028': 1, '11705328': 1}), ('42', {'18174210': 1}), ('48', {'13734012': 1}), ('49', {'5953485': 1}), ('50', {'12580014': 1}), ('51', {'45638119': 1}), ('53', {'45638119': 1}), ('54', {'49556906': 1}), ('56', {'4709641': 1}), ('57', {'4709641': 1}), ('70', {'5956380': 1, '4414547': 1}), ('72', {'6076903': 1}), ('75', {'4387784': 1}), ('94', {'1215116': 1}), ('99', {'18810195': 1}), ('100', {'4381486': 1}), ('113', {'6157837': 1}), ('115', {'33872649': 1}), ('118', {'6372244': 1}), ('124', {'4883040': 1}), ('127', {'21598000': 1}), ('128', {'8290953': 1}), ('129', {'27768226': 1}), ('130', {'27768226': 1}), ('132', {'7975937': 1}), ('133', {'38485364': 1, '6969753': 1, '17934082': 1, '16280642': 1, '12640810': 1}), ('137', {'26016929': 1}), ('141', {'6955746': 1, '14437255': 1}), ('142', {'10582939': 1}), ('143', {'10582939': 1}), ('146', {'10582939': 1}), ('148', {'108

The structure shown above is composed of:<br>
(query_id, corpus_id, score)<br>
NB: a corpus_id must be read from right to left: every group of digits except the leftmost one is made of 3 digits<br>
**This is the ground truth!**

In [None]:
# number of queries in the test split
len(queries)

300

In [None]:
# show only a few (query_id, query text) pairs instead of dumping the whole dictionary
list(queries.items())[:5]

dict_items([('1', '0-dimensional biomaterials show inductive properties.'), ('3', '1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants.'), ('5', '1/2000 in UK have abnormal PrP positivity.'), ('13', '5% of perinatal mortality is due to low birth weight.'), ('36', 'A deficiency of vitamin B12 increases blood levels of homocysteine.'), ('42', 'A high microerythrocyte count raises vulnerability to severe anemia in homozygous alpha (+)- thalassemia trait subjects.'), ('48', 'A total of 1,000 people in the UK are asymptomatic carriers of vCJD infection.'), ('49', 'ADAR1 binds to Dicer to cleave pre-miRNA.'), ('50', 'AIRE is expressed in some skin tumors.'), ('51', 'ALDH1 expression is associated with better breast cancer outcomes.'), ('53', 'ALDH1 expression is associated with poorer prognosis in breast cancer.'), ('54', 'AMP-activated protein kinase (AMPK) activation increases inflammation-relat

The structure shown above is composed of:<br>
(query_id, text of the query)<br>
NB: query_id is the key of the dictionary

In [None]:
# number of documents in the corpus
len(corpus)

5183

In [None]:
# show only a few (doc_id, {"title": ..., "text": ...}) pairs: the full dump is too large to display
list(corpus.items())[:3]

Output hidden; open in https://colab.research.google.com to view.

The structure shown above is composed of:<br>
(doc_id, title, text)<br>
NB: title and text are the keys of a nested dictionary

# First part of assignment

## BM25

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from scipy.sparse import csr_matrix
from tqdm.notebook import tqdm
from operator import itemgetter
from google.colab import files

In [None]:
# precompute BM25 for all docs
# return precomputed tau and idf for all terms
def precompute_BM25(bow_doc, B = 0.75, K = 1.2):
  """Precompute the query-independent parts of BM25 for a whole collection.

  The computation touches only the stored (non-zero) term counts, so the
  |D| x |V| matrix is never turned into a dense array.

  Args:
    bow_doc: |D| x |V| matrix of term counts (one row per document); anything
      accepted by ``csr_matrix`` works.
    B: document-length normalization strength.
    K: term-frequency saturation constant.

  Returns:
    (tau, idf) where
      tau is a |D| x |V| csr_matrix with tau[d, t] = F / (K + F) and
        F = tf[d, t] / (1 - B + B * len(d) / avg_len);
      idf is a 1-D array of length |V| with
        log10(1 + (|D| - df + 0.5) / (df + 0.5)).
  """
  bow_doc = csr_matrix(bow_doc)
  n_docs, n_terms = bow_doc.shape
  # coordinate view: one (row, col, count) triple per stored entry
  counts = bow_doc.tocoo()
  ## FOR COMPUTING TAU
  # length of every document and average length over the collection
  doc_length = np.asarray(bow_doc.sum(axis=1), dtype=np.float64).ravel()
  total_length = doc_length.sum()
  # an empty collection has no average length: fall back to 1 to avoid dividing by zero
  avg_length = total_length / n_docs if total_length > 0 else 1.0
  # per-document length normalization factor
  length_norm = 1 - B + B * (doc_length / avg_length)
  tf = counts.data.astype(np.float64)
  # F / (K + F) with F = tf / norm, rewritten as tf / (K * norm + tf)
  tau_values = tf / (K * length_norm[counts.row] + tf)
  tau = csr_matrix((tau_values, (counts.row, counts.col)), shape=(n_docs, n_terms))
  # drop explicitly stored zeros so that only real postings remain
  tau.eliminate_zeros()
  ## FOR COMPUTING IDF
  # for each term, the number of docs where this term is present
  doc_f = np.bincount(counts.col[counts.data != 0], minlength=n_terms)
  # compute inverse document frequency
  idf_for_all_terms = np.log10(1 + ((n_docs - doc_f + 0.5) / (doc_f + 0.5)))
  return tau, idf_for_all_terms


# compute the final score for a specific query
# return a dictionary ordered composed by {doc_id: score}
def compute_score_BM25(tau, idf, query, doc_ids):
  """Score every document against one query with BM25.

  The score of a document is the sum, over the distinct terms of the query,
  of tau[doc, term] * idf[term]; how often a term occurs in the query is
  ignored. The idf weights stay in float64 (no float32 round trip), so the
  result matches what a term-at-a-time accumulation over the same tau/idf
  produces.

  Args:
    tau: |D| x |V| sparse matrix of precomputed tau values.
    idf: 1-D array with the idf of every term.
    query: sparse bag-of-words matrix; only its first row is used.
    doc_ids: document ids, where doc_ids[i] belongs to row i of tau.

  Returns:
    dict {doc_id: score} holding only the documents with a non-zero score,
    ordered from the highest to the lowest score.
  """
  # columns of the terms used in the query
  query_terms = query[0, :].nonzero()[1]
  # dense weight vector: idf on the query terms, 0 everywhere else
  weights = np.zeros(tau.shape[1], dtype=np.float64)
  weights[query_terms] = idf[query_terms]
  # sparse matrix times dense vector gives one dense score per document
  scores = np.asarray(tau.dot(weights)).ravel()
  # keep only the documents that share at least one term with the query
  ranked = [(doc_ids[pos], scores[pos]) for pos in np.flatnonzero(scores)]
  ranked.sort(key=itemgetter(1), reverse=True)
  return dict(ranked)

In [None]:
# create list of questions (same order as the keys of `queries`)
list_queries = [queries[key_q] for key_q in queries]
# create list of possible answers: the "text" field of every document, in corpus order
list_docs = [corpus[key_d]["text"] for key_d in corpus]
# create a unique list with the queries first and the documents after them
complete_list = [*list_queries, *list_docs]

In [None]:
# instance of BOW (bag-of-words term counter)
# NB: strip_accents='unicode' removes accents and performs other character normalization
# during the preprocessing step; lower-casing is left to CountVectorizer's default settings
BOW_transformer = CountVectorizer(strip_accents='unicode')
# train BOW (define dictionary): the vocabulary is learned on queries and documents together,
# so both are later encoded with the same columns
BOW_transformer.fit(complete_list)

In [None]:
# encode all queries and all docs as BOW matrices that share the fitted vocabulary
bow_queries, bow_docs = (BOW_transformer.transform(texts) for texts in (list_queries, list_docs))

In [None]:
# precompute, once for the whole collection, the tau matrix (one row per document)
# and the idf of every term
pre_tau, pre_idf = precompute_BM25(bow_docs)

In [None]:
query_ids = list(queries.keys())
# the document ids do not change between queries: build the list once, not once per query
corpus_ids = list(corpus.keys())
# dictionary has as key the query id and inside
# there is another dictionary that contains as key doc id and as value the score
d_score_BM25 = {}
# compute queries (row n of bow_queries belongs to query_ids[n]);
# wrapping the iterable lets tqdm close the bar when the loop ends
for query_id, query in tqdm(zip(query_ids, bow_queries), total=bow_queries.shape[0], desc="Loading..."):
  d_score_BM25[query_id] = compute_score_BM25(pre_tau, pre_idf, query, corpus_ids)

### Download data


In [None]:
# save dictionary as file
# NOTE: np.save stores a Python dict as a pickled object array, so reading it back
# requires np.load('d_score_BM25.npy', allow_pickle=True).item()
np.save('d_score_BM25.npy', d_score_BM25)
# download file (from the Colab runtime to the local machine)
files.download('./d_score_BM25.npy')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## all-MiniLM-L6-v2

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from operator import itemgetter
from tqdm.notebook import tqdm
from google.colab import files

In [None]:
# compute the final score for a specific query
# return a dictionary ordered composed by {doc_id: score}
def compute_score_TRANS(model, docs_embed, query_sentence, doc_ids):
  """Rank every document by the dot product with the embedded query.

  Args:
    model: object exposing ``encode(text)`` that returns the query embedding.
    docs_embed: array of document embeddings (presumably one row per
      document, aligned with doc_ids).
    query_sentence: text of the query.
    doc_ids: document ids, where doc_ids[i] belongs to entry i of the scores.

  Returns:
    dict {doc_id: score} with every document, ordered from the highest to
    the lowest score.
  """
  # dot product between the document embeddings and the query embedding
  similarities = docs_embed.dot(model.encode(query_sentence))
  # pair every score with its document id
  ranked = []
  for idx in range(similarities.size):
    ranked.append((doc_ids[idx], similarities[idx]))
  # best score first
  ranked.sort(key=itemgetter(1), reverse=True)
  return dict(ranked)

In [None]:
# download NN model: the pretrained all-MiniLM-L6-v2 sentence-embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# create the dense embedding of every doc (row i belongs to list_docs[i])
doc_embeddings = model.encode(list_docs)
# create the dense embedding of every query (used later by the FAISS search)
query_embeddings = model.encode(list_queries)

In [None]:
# the document ids do not change between queries: build the list once, not once per query
corpus_ids = list(corpus.keys())
# dictionary has as key the query id and inside
# there is another dictionary that contains as key doc id and as value the score
d_score_TRANS = {}
# compute queries
for key_q in tqdm(queries.keys(), desc="Loading..."):
  d_score_TRANS[key_q] = compute_score_TRANS(model, doc_embeddings, queries[key_q], corpus_ids)

Loading...:   0%|          | 0/300 [00:00<?, ?it/s]

### Download data

In [None]:
# save dictionary as file
# NOTE: np.save stores a Python dict as a pickled object array, so reading it back
# requires np.load('d_score_TRANS.npy', allow_pickle=True).item()
np.save('d_score_TRANS.npy', d_score_TRANS) 
# download file (from the Colab runtime to the local machine)
files.download('./d_score_TRANS.npy')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Ground truth MIPS

In [None]:
# ground truth: for every query, the score of a document is its BM25 score plus its
# dense score (a document missing from one of the two rankings counts 0 there)
d_score_ground_truth = {}
for key_q in queries.keys():
  sparse_scores = d_score_BM25[key_q]
  dense_scores = d_score_TRANS[key_q]
  totals = [(key_d, sparse_scores.get(key_d, 0) + dense_scores.get(key_d, 0)) for key_d in corpus.keys()]
  # best total first
  totals.sort(key=itemgetter(1), reverse=True)
  d_score_ground_truth[key_q] = dict(totals)

### Download data

In [None]:
# save dictionary as file
# NOTE: np.save stores a Python dict as a pickled object array, so reading it back
# requires np.load('d_score_ground_truth.npy', allow_pickle=True).item()
np.save('d_score_ground_truth.npy', d_score_ground_truth) 
# download file (from the Colab runtime to the local machine)
files.download('./d_score_ground_truth.npy')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Second part of assignment

## TAAT for sparse vectors

In [None]:
import numpy as np
from operator import itemgetter
from tqdm.notebook import tqdm
from google.colab import files

In [None]:
# precompute TAAT defining a vector where for each term there is a dictionary composed by (doc_id: tau_score)
def precomputed_TAAT(p_tau, d_keys: np.array):
  """Build the inverted index used by term-at-a-time (TAAT) scoring.

  The postings are read straight from the column-compressed storage of the
  tau matrix and written into a preallocated array, instead of growing the
  result with np.append (which copies the whole array at every term) and
  reading the values one sparse element at a time.

  Args:
    p_tau: |D| x |V| SciPy sparse matrix of tau values (one row per document).
    d_keys: array of document ids, where d_keys[i] belongs to row i of p_tau.

  Returns:
    1-D object array of length |V|; entry t is a dict {doc_id: tau_score}
    with the documents whose tau for term t is non-zero, in ascending row
    order.
  """
  # column t of the CSC matrix holds the postings of term t;
  # sorted_indices() returns a copy whose rows are in ascending order
  tau_by_term = p_tau.tocsc().sorted_indices()
  n_terms = tau_by_term.shape[1]
  d_keys = np.asarray(d_keys)
  # one slot per term, filled in place
  inv_ind_structure = np.empty(n_terms, dtype=object)
  for term in tqdm(range(n_terms), desc="Loading..."):
    start, end = tau_by_term.indptr[term], tau_by_term.indptr[term + 1]
    # positions of the docs stored for this term and their tau values
    positions = tau_by_term.indices[start:end]
    values = tau_by_term.data[start:end]
    # ignore explicitly stored zeros
    keep = values != 0
    #                             doc_id : tau score
    inv_ind_structure[term] = dict(zip(d_keys[positions[keep]], values[keep]))
  return inv_ind_structure

# compute TAAT
def compute_TAAT(query, inv_ind, idf, all_doc_keys: list):
  """Score every document against one query with term-at-a-time accumulation.

  Documents that share no term with the query get score 0 and are added in
  the order of all_doc_keys, so the tail of the ranking is the same on
  every run (iterating over a set would make it depend on hash order).

  Args:
    query: sparse bag-of-words matrix of the query.
    inv_ind: array where entry t is a dict {doc_id: tau_score} for term t.
    idf: 1-D array with the idf of every term.
    all_doc_keys: ids of all the documents of the collection.

  Returns:
    dict {doc_id: score} with every document, ordered from the highest to
    the lowest score.
  """
  # terms that are not zero
  query_terms = query.nonzero()[1]
  # contains docs with final scores
  accumulators = {}
  # for each term of the query: its postings and its idf
  for postings, term_idf in zip(inv_ind[query_terms], idf[query_terms]):
    # for each doc that contains the term, add the contribution of the term
    for doc_key, tau_score in postings.items():
      accumulators[doc_key] = accumulators.get(doc_key, 0) + tau_score * term_idf
  # add missed documents with score equal 0, in collection order
  for doc_key in all_doc_keys:
    accumulators.setdefault(doc_key, 0)
  return dict(sorted(accumulators.items(), reverse=True, key=itemgetter(1)))

In [None]:
# precomputed TAAT: build the inverted index (one {doc_id: tau} dictionary per term)
# from the tau matrix computed for BM25
pre_TAAT = precomputed_TAAT(pre_tau, np.array(list(corpus.keys())))

Loading...:   0%|          | 0/35557 [00:00<?, ?it/s]

In [None]:
query_ids = list(queries.keys())
# the document ids do not change between queries: build the list once, not once per query
corpus_ids = list(corpus.keys())
# dictionary has as key the query id and inside
# there is another dictionary that contains as key doc id and as value the score
d_score_TAAT = {}
# compute queries (row n of bow_queries belongs to query_ids[n]);
# wrapping the iterable lets tqdm close the bar when the loop ends
for query_id, query in tqdm(zip(query_ids, bow_queries), total=bow_queries.shape[0], desc="Loading..."):
  d_score_TAAT[query_id] = compute_TAAT(query, pre_TAAT, pre_idf, corpus_ids)

Loading...:   0%|          | 0/300 [00:00<?, ?it/s]

### Download data

In [None]:
# save dictionary as file
# NOTE: np.save stores a Python dict as a pickled object array, so reading it back
# requires np.load('d_score_TAAT.npy', allow_pickle=True).item()
np.save('d_score_TAAT.npy', d_score_TAAT)
# download file (from the Colab runtime to the local machine)
files.download('./d_score_TAAT.npy')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## FAISS for dense vectors

In [None]:
from faiss import IndexFlatL2
from tqdm.notebook import tqdm
from google.colab import files

In [None]:
# flat L2-distance index, initialized with the dimension of the dense vectors
index = IndexFlatL2(doc_embeddings.shape[1])
# add all docs (presumably position i in the index corresponds to row i of doc_embeddings)
index.add(doc_embeddings)

In [None]:
# how many neighbours to retrieve per query: all the documents, i.e. a complete ranking
K = doc_embeddings.shape[0]
# search all the queries at once: D holds the distances, I the positions of the matching docs
# NOTE(review): per the FAISS docs IndexFlatL2 reports *squared* Euclidean distances — confirm
D, I = index.search(query_embeddings, K)

In [None]:
d_keys = list(corpus.keys())
q_keys = list(queries.keys())
# {query_id: {doc_id: score}}, docs kept in the order returned by the search
d_score_FAISS = {}
pbar = tqdm(total=bow_queries.shape[0], desc="Loading...")
for n, (i_row, d_row) in enumerate(zip(I, D)):
  ranking = {}
  for doc_pos, distance in zip(i_row, d_row):
    # the distance between query and doc is negated, so that the smallest distance
    # becomes the largest score and can be used as a measure of quality
    ranking[d_keys[doc_pos]] = -distance
  d_score_FAISS[q_keys[n]] = ranking
  pbar.update()

Loading...:   0%|          | 0/300 [00:00<?, ?it/s]

### Download data

In [None]:
# save dictionary as file
# NOTE: np.save stores a Python dict as a pickled object array, so reading it back
# requires np.load('d_score_FAISS.npy', allow_pickle=True).item()
np.save('d_score_FAISS.npy', d_score_FAISS)
# download file (from the Colab runtime to the local machine)
files.download('./d_score_FAISS.npy')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Ground truth

### Download data

In [None]:
# save the relevance judgments (qrels) dictionary as file
# NOTE: np.save stores a Python dict as a pickled object array, so reading it back
# requires np.load('qrels.npy', allow_pickle=True).item()
np.save('qrels.npy', qrels) 
# download file (from the Colab runtime to the local machine)
files.download('./qrels.npy')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>