In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import words
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# One-time setup: uncomment to download the NLTK resources (this also requires `import nltk`).
#nltk.download('words')
#nltk.download('stopwords')
#nltk.download('punkt')

# Path pieces for the dataset files. They are joined by plain string
# concatenation when the files are loaded, hence the trailing slashes.
data_dir = "datasets/"
dataset_name_dir = "nfcorpus/"

In [None]:
# Both files are read as JSON Lines (one JSON record per line), hence lines=True.
dataset_path = data_dir + dataset_name_dir
corpus = pd.read_json(dataset_path + "corpus.jsonl", lines=True)
queries = pd.read_json(dataset_path + "queries.jsonl", lines=True)

In [None]:
# Sanity check of the loaded corpus: print the DataFrame summary (columns, dtypes, non-null counts).
corpus.info()

In [None]:
# Sanity check of the loaded queries: print the DataFrame summary (columns, dtypes, non-null counts).
queries.info()

### Sparse representation

In [None]:
def compute_sparse_repr(vocab: np.array, corpus: pd.DataFrame, queries: pd.DataFrame):
    """Build sparse bag-of-words representations of documents and queries.

    Documents are vectorised with TF-IDF weights and queries with raw term
    counts. Both vectorizers receive the same fixed ``vocab`` and the same
    tokenisation settings (lower-casing, ``\\w+`` tokens, no stop-word removal).

    Parameters
    ----------
    vocab : array of str
        Fixed vocabulary handed to both vectorizers.
    corpus : pd.DataFrame
        Must contain a ``"text"`` column with the document texts.
    queries : pd.DataFrame
        Must contain a ``"text"`` column with the query texts.

    Returns
    -------
    tuple
        ``(sparse_doc, sparse_q)``: the matrices returned by ``fit_transform``
        for the documents and for the queries, in that order.
    """
    # One option set for both vectorizers, so documents and queries are tokenised identically.
    shared_options = dict(lowercase=True, vocabulary=vocab, stop_words=None, token_pattern=r'\w+')

    document_matrix = TfidfVectorizer(**shared_options).fit_transform(corpus["text"])
    query_matrix = CountVectorizer(**shared_options).fit_transform(queries["text"])

    return document_matrix, query_matrix

In [None]:
# Vocabulary: the unique, lower-cased entries of the NLTK word list.
vocab = np.unique(np.char.lower(words.words()))
sparse_doc, sparse_q = compute_sparse_repr(vocab, corpus, queries)

# sparse_score[q, d] = <q_sparse, d_sparse>: inner product of every query vector
# with every document vector.
# The product uses the sparse matrices' own `@` operator: NumPy's np.dot is not
# aware of SciPy sparse containers and only works on them through a legacy fallback.
sparse_score_df = pd.DataFrame(
    (sparse_q @ sparse_doc.T).toarray(),
    index=queries["_id"],
    columns=corpus["_id"],
)

### Dense representation

In [None]:
def compute_dense_repr(corpus: pd.DataFrame, queries: pd.DataFrame):
    """Embed documents and queries with the all-MiniLM-L6-v2 sentence encoder.

    Parameters
    ----------
    corpus : pd.DataFrame
        Must contain a ``"text"`` column with the document texts.
    queries : pd.DataFrame
        Must contain a ``"text"`` column with the query texts.

    Returns
    -------
    tuple
        ``(dense_c, dense_q)``: NumPy arrays (``convert_to_numpy=True``) holding
        the document embeddings and the query embeddings, in that order.
    """
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # encode() is documented for a string or a list of strings. A pandas Series
    # is indexed by label rather than by position, so passing it directly only
    # works while the frame has a default RangeIndex. Plain lists avoid that.
    dense_c = model.encode(corpus["text"].tolist(), convert_to_numpy=True)
    dense_q = model.encode(queries["text"].tolist(), convert_to_numpy=True)

    return dense_c, dense_q

In [None]:
dense_c, dense_q = compute_dense_repr(corpus, queries)

# dense_score[q, d] = <q_dense, d_dense>: inner product of every query embedding
# with every document embedding.
dense_score_df = pd.DataFrame(
    np.dot(dense_q, dense_c.T),
    index=queries["_id"],
    columns=corpus["_id"],
)

In [None]:
# Persist both query-by-document score tables so they can be reloaded without recomputing them.
# NOTE: the dense file name has no underscore before "nfcorpus"; the cell that reads the
# files back uses the same spelling, so rename both places together if this is ever fixed.
sparse_score_df.to_parquet("sparse_score_df_nfcorpus.parquet")
dense_score_df.to_parquet("dense_score_dfnfcorpus.parquet")

## Top k retrieval

In [None]:
# Reload the score tables from the Parquet files written earlier in the notebook
# (same file names, including the dense one without an underscore before "nfcorpus").
sparse_score_df=pd.read_parquet("sparse_score_df_nfcorpus.parquet")
dense_score_df=pd.read_parquet("dense_score_dfnfcorpus.parquet")

In [None]:
def compute_exact_retrieval(sparse_score_df: pd.DataFrame, dense_score_df: pd.DataFrame, k: int):
    """Exact top-k retrieval on the sum of sparse and dense scores.

    Parameters
    ----------
    sparse_score_df, dense_score_df : pd.DataFrame
        Score tables with one row per query and one column per document.
        They are added element-wise (pandas aligns them on index and columns).
    k : int
        Number of documents to keep per query.

    Returns
    -------
    total_score_df : pd.DataFrame
        ``sparse_score_df + dense_score_df``.
    top_k_exact_docs : np.ndarray
        Array of shape ``(n_queries, min(k, n_docs))`` holding, for each query,
        the column labels of the highest-scoring documents, best first.
    """
    total_score_df = sparse_score_df + dense_score_df

    # Sort the raw NumPy values: np.argsort applied to a DataFrame is
    # version-dependent and may return a DataFrame, which cannot be sliced with [:, ...].
    total_scores = total_score_df.to_numpy()

    # argsort is ascending, so reading the last k positions backwards gives
    # the k best documents in descending score order.
    idx_exact_top_k = np.argsort(total_scores, axis=1)[:, :-k-1:-1]

    # Indexing the label array with the 2-D index array keeps the
    # (n_queries, k) layout without a reshape that breaks when k > n_docs.
    top_k_exact_docs = total_score_df.columns.to_numpy()[idx_exact_top_k]

    return total_score_df, top_k_exact_docs

In [None]:
# Number of documents to retrieve per query. This module-level value is also picked up
# by compute_approx_retrieval and by the evaluation loop further down.
k=5
# Exact top-k over the summed (sparse + dense) scores; total_score_df is reused by later cells.
total_score_df, top_k_exact_docs=compute_exact_retrieval(sparse_score_df, dense_score_df, k)

## Top k' retrieval (approximate case)

In [None]:
def compute_approx_retrieval(sparse_score_df: pd.DataFrame, dense_score_df: pd.DataFrame, total_score_df: pd.DataFrame, k_prime: int, k: int = None):
    """Approximate top-k retrieval from the union of the sparse and dense top-k' lists.

    For every query (row), the k' best documents by sparse score and the k'
    best by dense score are merged into one candidate set. Only those
    candidates are ranked by total score, and the best k are kept.

    Parameters
    ----------
    sparse_score_df, dense_score_df, total_score_df : pd.DataFrame
        Score tables with one row per query and one column per document.
        Column positions found in the sparse/dense tables are used to index
        ``total_score_df``, so all three must share the same column order.
    k_prime : int
        Number of candidates taken from each of the two rankings.
    k : int, optional
        Number of documents to return per query. When omitted, the
        notebook-level global ``k`` is used, which is the value this function
        read before ``k`` became a parameter.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_queries, k)`` with the column labels of the selected
        documents, best first. Every query must have at least ``k`` candidates,
        which is guaranteed when ``k <= k_prime``.
    """
    if k is None:
        # Backward compatibility: existing calls do not pass k and rely on the global.
        try:
            k = globals()["k"]
        except KeyError:
            raise NameError("name 'k' is not defined; pass k explicitly") from None

    # Work on raw NumPy values: np.argsort applied to a DataFrame is
    # version-dependent and may return a DataFrame, which cannot be sliced with [:, ...].
    sparse_scores = sparse_score_df.to_numpy()
    dense_scores = dense_score_df.to_numpy()
    total_scores = total_score_df.to_numpy()

    # argsort is ascending, so the last k' positions read backwards are the k' best.
    idx_sparse_scores = np.argsort(sparse_scores, axis=1)[:, :-k_prime-1:-1]
    idx_dense_scores = np.argsort(dense_scores, axis=1)[:, :-k_prime-1:-1]

    top_rows = []
    for row, (from_sparse, from_dense) in enumerate(zip(idx_sparse_scores, idx_dense_scores)):
        # Candidate set: union of both top-k' lists (np.unique sorts and removes duplicates).
        candidates = np.unique(np.concatenate((from_sparse, from_dense)))
        # Rank only the candidates by total score and keep the k best, best first.
        best_first = np.argsort(total_scores[row, candidates])[:-k-1:-1]
        top_rows.append(candidates[best_first])
    idx_approx_top_k = np.asarray(top_rows, dtype=np.intp)

    top_k_approx_docs = total_score_df.columns.to_numpy()[idx_approx_top_k]

    return top_k_approx_docs

## Evaluations

In [None]:
# For every candidate-list size k' from k up to the corpus size, measure how much of the
# exact top-k the approximate retrieval recovers, averaged over all queries.
mean_recalls = []
n_queries = len(top_k_exact_docs)
for k_prime in range(k, len(corpus) + 1):
    top_k_approx_docs = compute_approx_retrieval(sparse_score_df, dense_score_df, total_score_df, k_prime)

    # Per-query recall: share of the exact top-k documents that also appear in the approximate top-k.
    recall = []
    for i in range(n_queries):
        shared_docs = np.intersect1d(top_k_exact_docs[i], top_k_approx_docs[i], assume_unique=True)
        recall.append(len(shared_docs) / k)
    mean_recalls.append(np.mean(recall))

In [16]:
#TODO: apply stemming

0.8797652147049738