In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import words
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#nltk.download('words')
#nltk.download('stopwords')
#nltk.download('punkt')

# Root folder of the datasets and sub-folder of the dataset used in this notebook.
# Both end with "/" because file paths are built by plain string concatenation.
data_dir = "datasets/"
dataset_name_dir = "nfcorpus/"
#pd.read_csv(data_dir+dataset_name_dir+"qrels/test.tsv", sep='\t', header=0)

In [None]:
# Load the documents and the queries. Both files are read as JSON Lines
# (one JSON object per line), hence lines=True.
corpus = pd.read_json(f"{data_dir}{dataset_name_dir}corpus.jsonl", lines=True)
queries = pd.read_json(f"{data_dir}{dataset_name_dir}queries.jsonl", lines=True)

In [None]:
# Print a concise summary of the corpus frame (columns, non-null counts, dtypes).
corpus.info()

In [None]:
# Print a concise summary of the queries frame (columns, non-null counts, dtypes).
queries.info()

### Sparse representation

In [None]:
def compute_sparse_repr(vocab: np.array, corpus: pd.DataFrame, queries: pd.DataFrame):
    """Build sparse bag-of-words matrices for the documents and the queries.

    Documents are vectorized with TF-IDF weights and queries with raw term
    counts. Both vectorizers receive the same fixed ``vocab`` and the same
    tokenization settings, so the two matrices share one column layout.

    Parameters
    ----------
    vocab
        Iterable of unique terms that defines the columns of both matrices.
    corpus
        Frame whose ``"text"`` column holds the document texts.
    queries
        Frame whose ``"text"`` column holds the query texts.

    Returns
    -------
    tuple
        ``(sparse_doc, sparse_q)``: the document matrix followed by the
        query matrix, as returned by the vectorizers' ``fit_transform``.
    """
    # Settings shared by both vectorizers: lower-case the text, keep every
    # run of word characters as a token, and do not drop stop words.
    shared_options = dict(
        lowercase=True,
        vocabulary=vocab,
        stop_words=None,
        token_pattern=r'\w+',
    )

    document_vectorizer = TfidfVectorizer(**shared_options)
    query_vectorizer = CountVectorizer(**shared_options)

    document_matrix = document_vectorizer.fit_transform(corpus["text"])
    query_matrix = query_vectorizer.fit_transform(queries["text"])
    return document_matrix, query_matrix

In [None]:
# Vocabulary: the distinct lower-cased entries of the NLTK `words` corpus.
vocab = np.unique(np.char.lower(words.words()))
sparse_doc, sparse_q = compute_sparse_repr(vocab, corpus, queries)

# sparse_score = <q_sparse, d_sparse> for every (query, document) pair.
# The product is taken with `@` instead of np.dot: np.dot is not aware of
# scipy sparse matrices and only works on them by accident, whereas `@` is
# the matrix product for both sparse matrices and sparse arrays.
# The result is densified and split into one row of per-document scores per query.
queries["sparse_scores"] = list((sparse_q @ sparse_doc.transpose()).toarray())

### Dense representation

In [None]:
def compute_dense_repr(corpus: pd.DataFrame, queries: pd.DataFrame):
    """Embed the document and query texts with a sentence-transformer model.

    Parameters
    ----------
    corpus
        Frame whose ``"text"`` column holds the document texts.
    queries
        Frame whose ``"text"`` column holds the query texts.

    Returns
    -------
    tuple
        ``(dense_c, dense_q)``: the document embeddings followed by the
        query embeddings, both requested as NumPy arrays, with rows in the
        same order as the input frames.
    """
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # encode() is documented for a string or a list of strings. Passing the
    # pandas Series directly makes it index the texts with `[]`, which is a
    # label lookup and only matches row positions for a default RangeIndex.
    # A plain list removes that dependency on the frame's index.
    dense_c = model.encode(corpus["text"].tolist(), convert_to_numpy=True)
    dense_q = model.encode(queries["text"].tolist(), convert_to_numpy=True)

    return dense_c, dense_q

In [None]:
dense_c, dense_q = compute_dense_repr(corpus, queries)

# dense_score = <q_dense, d_dense> for every (query, document) pair.
# The product is a 2-D (queries x documents) array. Assigning a 2-D array to
# a single DataFrame column raises a ValueError, so it is split with list()
# into one 1-D score row per query, the same layout used for "sparse_scores".
queries["dense_scores"] = list(np.dot(dense_q, dense_c.transpose()))

In [None]:
# Show the first two documents as a quick sanity check.
corpus.head(2)

In [None]:
# Show the first two queries, including the score columns added so far.
queries.head(2)

## Top k retrieval

In [None]:
# Number of documents to retrieve per query.
k = 5

# Per-query total score of every document: element-wise sum of the sparse
# and dense score rows stored for that query.
queries["total_score_top_k"] = queries["sparse_scores"] + queries["dense_scores"]

In [None]:
# Exact top-k: for each query, rank every document by its total score and keep
# the k best as (score, document _id) pairs, best first. This is the same pair
# layout as "approx_docs-score", so both columns can be post-processed alike.
# (Previously the column held the tuple (all_scores, top_k_ids), and the
# `elem[1]` extraction below then picked the score of document 1 and the
# second id instead of the k document ids.)
queries["ground_truth_docs-score"] = queries.apply(
    lambda row: sorted(
        [
            (row["total_score_top_k"][idx], corpus["_id"].iloc[idx])
            for idx in np.argsort(row["total_score_top_k"])[-k:]
        ],
        reverse=True,
    ),
    axis=1,
)

# Keep only the document ids, as a set, for the recall computation.
queries["ground_truth_docs"] = queries["ground_truth_docs-score"].apply(
    lambda pairs: {doc_id for _, doc_id in pairs}
)

## Top k' retrieval (approximate case)

In [None]:
# Number of candidates taken from each representation before merging.
k_prime = k

# For every query, positions of the k' highest-scoring documents under the
# sparse scores and under the dense scores (argsort is ascending, so the
# best candidates are the last k' entries).
queries["index_sparse_top_k_prime"] = queries["sparse_scores"].apply(
    lambda scores: np.argsort(scores)[-k_prime:]
)
queries["index_dense_top_k_prime"] = queries["dense_scores"].apply(
    lambda scores: np.argsort(scores)[-k_prime:]
)

In [None]:
# Candidate pool per query: sorted, de-duplicated union of the sparse and
# dense top-k' document positions.
queries["index_union_top_k_prime"] = queries.apply(
    lambda row: np.unique(
        np.concatenate((row["index_sparse_top_k_prime"], row["index_dense_top_k_prime"]))
    ),
    axis=1,
)

# Re-rank the candidates by total (sparse + dense) score and keep the k best
# as (score, document _id) pairs, best first.
queries["approx_docs-score"] = queries.apply(
    lambda row: sorted(
        [
            (row["sparse_scores"][pos] + row["dense_scores"][pos], corpus.iloc[pos]["_id"])
            for pos in row["index_union_top_k_prime"]
        ],
        reverse=True,
    )[:k],
    axis=1,
)

# Keep only the document ids, as a set, for the recall computation.
queries["approx_docs"] = queries["approx_docs-score"].apply(
    lambda ranked: {pair[1] for pair in ranked}
)

## Evaluations

In [None]:
# Recall per query: share of the exact top-k documents that the approximate
# retrieval also returned.
queries["recall"] = queries.apply(
    lambda row: len(row["ground_truth_docs"] & row["approx_docs"]) / len(row["ground_truth_docs"]),
    axis=1,
)

# Mean recall over all queries (displayed as the cell output).
np.mean(queries["recall"])