In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from nltk.corpus import words
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
#nltk.download('words')
#nltk.download('stopwords')
#nltk.download('punkt')

data_dir="datasets/"
dataset_name_dir="nfcorpus/"
#pd.read_csv(data_dir+dataset_name_dir+qrels/test.tsv", sep='\t', header=0)

In [2]:
corpus=pd.read_json(data_dir+dataset_name_dir+"corpus.jsonl", lines=True, dtype_backend='pyarrow')
queries=pd.read_json(data_dir+dataset_name_dir+"queries.jsonl", lines=True, dtype_backend='pyarrow')

### Sparse representation

In [3]:
#Preprocessing for sparse representation
corpus["text_lower"]=corpus["text"].apply(str.lower)
queries["text_lower"]=queries["text"].apply(str.lower)

tokenizer=RegexpTokenizer(r'\w+')
corpus["tokens"]=corpus["text_lower"].apply(tokenizer.tokenize)
queries["tokens"]=queries["text_lower"].apply(tokenizer.tokenize)

stemmer = PorterStemmer()
corpus["stem_tokens"] = corpus["tokens"].apply(lambda x: [stemmer.stem(y) for y in x])
queries["stem_tokens"] = queries["tokens"].apply(lambda x: [stemmer.stem(y) for y in x])

In [4]:
def compute_sparse_repr(vocab: np.array, corpus: pd.DataFrame, queries: pd.DataFrame):
    bm25 = BM25Okapi(corpus["stem_tokens"])
    doc_reps=bm25.get_scores(vocab)
    queries["sparse_emb"]=queries["stem_tokens"].apply(lambda x: [1 if t in x else 0 for t in vocab])
    return doc_reps

In [8]:
len(corpus)

3633

In [5]:
vocab=np.array(words.words())
doc_reps=compute_sparse_repr(vocab, corpus, queries)
print("Done repr")
#Here it's basically computed sparse_score=<q_sparse, d_sparse>
queries["sparse_scores"]=queries["sparse_emb"].apply(lambda x: np.dot(x, doc_reps))

Done repr


ValueError: shapes (236736,) and (3633,) not aligned: 236736 (dim 0) != 3633 (dim 0)

### Dense representation

In [None]:
transformers = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
corpus["dense_emb"]=transformers.encode(corpus["text"])
queries["dense_emb"]=transformers.encode(queries["text"])

In [None]:
#Here it's basically computed dense_score=<q_dense, d_dense>
queries["dense_scores"]=queries["dense_emb"].apply(lambda x: np.dot(x, corpus["dense_emb"]))

In [None]:
corpus.head(2)

In [None]:
queries.head(2)

## Top k retrieval

In [None]:
k=5
queries["top_k_total_score"]=queries["sparse_scores"]+queries["dense_score"]

In [None]:
queries["ground_truth_docs"]=queries["top_k_total_score"].apply(lambda x: corpus.loc[np.argsort(x)[:k], "_id"])

## Top k' retrieval (approximate case)

In [None]:
k_prime=k
queries["index_sparse_top_k_prime"]=queries["sparse_score"].apply(lambda x: (np.argsort(x)[:k_prime], corpus.loc[np.argsort(x)[:k_prime], "_id"]))
queries["index_dense_top_k_prime"]=queries["dense_score"].apply(lambda x: (np.argsort(x)[:k_prime], corpus.loc[np.argsort(x)[:k_prime], "_id"])))

In [None]:
queries["index_union_top_k_prime"]=queries.apply(lambda x: np.unique(np.concatenate(x["index_sparse_top_k_prime"], x["index_dense_top_k_prime"])))

queries["top_k_prime_total_score"]=queries["index_union_top_k_prime"].apply(
    lambda x: [(x["sparse_score"][idx]+x["dense_score"][idx], docid) for idx, docid in x])

queries["approx_docs-score"]=queries["top_k_prime_total_score"].apply(lambda x: sorted(x)[:k])

In [17]:
""" queries["score_dense_top_k"]=queries.apply(lambda x: [
    (i, x["dense_score"][x["index_sparse_top_k"][i]]+
        x["dense_score"][x["index_dense_top_k"][i]]) for i in range(k_prime)])

#TODO: DEVO farlo per ogni riga!
tmp=queries[["index_sparse_top_k"], ["index_dense_top_k"], ["sparse_score"], ["dense_score"]]
for idx_s, idx_d, d_s, s_s in tmp.itertuples():
    for i in range(k_prime):
        i_d_s=idx_s[i]
        i_s_s=idx_d[i]

        d_s[i_d_s]+d_s[i_s_s]
        s_s[i_s_s]+s_s[i_s_s]


queries["total_score"].apply(lambda x: corpus.loc[np.argsort(x)[:k], "_id"])
 """;

(0, 's')

## Evaluations

## TEST

In [None]:
def get_top_k_sparse(corpus: pd.DataFrame, queries: pd.DataFrame, k: int):
    bm25 = BM25Okapi(corpus["stem_tokens"])
    queries["bm25_scores"]=queries["stem_tokens"].apply(bm25.get_scores)
    queries["bm25_topk_docIDs"]=queries["bm25_scores"].apply(lambda x: corpus.loc[np.argsort(x)[:k], "_id"])


def get_top_k_dense(corpus: pd.DataFrame, queries: pd.DataFrame, k: int, model):

    queries["transf_scores"]=queries["dense_emb"].apply(lambda x: np.dot(x, corpus["dense_emb"]))
    queries["transf_topk_docIDs"]=queries["transf_scores"].apply(lambda x: corpus.loc[np.argsort(x)[:k], "_id"])
    
#def get_top_k(corpus: pd.DataFrame, queries: pd.DataFrame, k: int):