## Learning with Massive Data
### Assignment 2 - Studying Sparse-Dense Retrieval
#### Giovanni Costa - 880892

Contents:
- [Sparse representation](#s_repr)
- [Dense representation](#d_repr)
- [Top k retrieval](#exact_retr)
- [Top k\' retrieval (approximate case)](#approx_retr)
- [Evaluations](#eval)

In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import words
from nltk.stem import PorterStemmer
from matplotlib import pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#Root folder of the datasets and per-dataset sub-folders; the trailing slash is kept
#because the file paths are built by plain string concatenation
data_dir = "datasets/"
dataset1_name_dir = "trec-covid/"
dataset2_name_dir = "scifact/"

In [None]:
#Load corpus and queries of the first dataset (JSON Lines files, hence lines=True)
corpus_df1 = pd.read_json(f"{data_dir}{dataset1_name_dir}corpus.jsonl", lines=True)
queries_df1 = pd.read_json(f"{data_dir}{dataset1_name_dir}queries.jsonl", lines=True)

In [None]:
corpus_df1.info()

In [None]:
corpus_df1.head(2)

In [None]:
queries_df1.info()

In [None]:
queries_df1.head(2)

In [None]:
#Load corpus and queries of the second dataset (JSON Lines files, hence lines=True)
corpus_df2 = pd.read_json(f"{data_dir}{dataset2_name_dir}corpus.jsonl", lines=True)
queries_df2 = pd.read_json(f"{data_dir}{dataset2_name_dir}queries.jsonl", lines=True)

In [None]:
corpus_df2.info()

In [None]:
corpus_df2.head(2)

In [None]:
queries_df2.info()

In [None]:
queries_df2.head(2)

<a id='s_repr'></a>
### Sparse representation

In [None]:
#Extension of sk-learn's "TfidfVectorizer" that stems every token produced by the parent analyzer
class StemmedTfidfVectorizer(TfidfVectorizer):
    #Porter stemmer stored on the class, hence shared by all the instances
    stemmer = PorterStemmer()

    def build_analyzer(self):
        """Return a callable that analyzes a document and yields its stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            #Generator expression: the tokens are stemmed lazily while they are consumed
            return (StemmedTfidfVectorizer.stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_tokens
    
#Extension of sk-learn's "CountVectorizer" that stems every token produced by the parent analyzer
class StemmedTfidfCountVectorizer(CountVectorizer):
    #Porter stemmer stored on the class, hence shared by all the instances
    stemmer = PorterStemmer()

    def build_analyzer(self):
        """Return a callable that analyzes a document and yields its stemmed tokens."""
        #Zero-argument super() starts the lookup from this class' direct parent.
        #The previous super(CountVectorizer, self) started *after* CountVectorizer in the MRO,
        #so a build_analyzer defined by CountVectorizer itself would have been bypassed
        analyzer = super().build_analyzer()
        return lambda doc: (StemmedTfidfCountVectorizer.stemmer.stem(w) for w in analyzer(doc))
    
    
def compute_sparse_repr(vocab: np.ndarray, corpus: pd.DataFrame, queries: pd.DataFrame):
    """Compute the sparse representations of documents and queries over a fixed vocabulary.

    Documents are encoded with the stemmed TF-IDF vectorizer and queries with the stemmed
    count vectorizer. Both receive the same ``vocab``, so the two matrices are built over
    the same set of terms.

    Parameters
    ----------
    vocab : vocabulary handed to both vectorizers (``vocabulary=`` argument).
    corpus : DataFrame whose "text" column holds the documents.
    queries : DataFrame whose "text" column holds the queries.

    Returns
    -------
    (sparse_doc, sparse_q) : the outputs of ``fit_transform`` on the documents and on the
    queries respectively.
    """
    #Settings shared by the two vectorizers: lowercase the text, tokens are runs of word
    #characters (letters, digits, underscore), no stop-word removal, fixed custom vocabulary
    shared_params = dict(lowercase=True, vocabulary=vocab, stop_words=None, token_pattern=r'\w+')
    doc_tfidf = StemmedTfidfVectorizer(**shared_params)
    q_counter = StemmedTfidfCountVectorizer(**shared_params)

    #Computation of the sparse embeddings
    sparse_doc = doc_tfidf.fit_transform(corpus["text"])
    sparse_q = q_counter.fit_transform(queries["text"])

    return sparse_doc, sparse_q

In [None]:
#Lowercase and stem every entry of the NLTK word list; np.unique then removes the duplicates
stemmer = PorterStemmer()
vocab = np.unique(list(map(stemmer.stem, np.char.lower(words.words()))))

In [None]:
sparse_doc, sparse_q = compute_sparse_repr(vocab, corpus_df1, queries_df1)

#Here it's basically computed sparse_score=<q_sparse, d_sparse> for every (query, document) pair.
#The product uses the sparse "@" operator instead of np.dot, because NumPy functions are not
#aware of SciPy sparse matrices
sparse_score_df = pd.DataFrame((sparse_q @ sparse_doc.T).toarray(), index=queries_df1["_id"], columns=corpus_df1["_id"])
sparse_score_df.to_parquet("sparse_score_df_"+dataset1_name_dir.split("/")[0]+".parquet")

In [None]:
sparse_doc, sparse_q = compute_sparse_repr(vocab, corpus_df2, queries_df2)

#Here it's basically computed sparse_score=<q_sparse, d_sparse> for every (query, document) pair.
#The product uses the sparse "@" operator instead of np.dot, because NumPy functions are not
#aware of SciPy sparse matrices
sparse_score_df = pd.DataFrame((sparse_q @ sparse_doc.T).toarray(), index=queries_df2["_id"], columns=corpus_df2["_id"])
sparse_score_df.to_parquet("sparse_score_df_"+dataset2_name_dir.split("/")[0]+".parquet")

<a id='d_repr'></a>
### Dense representation

In [None]:
def compute_dense_repr(corpus: pd.DataFrame, queries: pd.DataFrame):
    """Compute the dense embeddings of documents and queries.

    Both sets of texts are encoded with the 'sentence-transformers/all-MiniLM-L6-v2' model,
    which is instantiated at every call.

    Parameters
    ----------
    corpus : DataFrame whose "text" column holds the documents.
    queries : DataFrame whose "text" column holds the queries.

    Returns
    -------
    (dense_c, dense_q) : the outputs of ``encode`` (requested as NumPy arrays) for the
    documents and for the queries respectively.
    """
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    #Computation of the dense embeddings.
    #encode() is documented for a string or a list of strings, so plain lists are passed
    #instead of pandas Series (whose [] indexing is label-based rather than positional)
    dense_c = model.encode(corpus["text"].tolist(), convert_to_numpy=True)
    dense_q = model.encode(queries["text"].tolist(), convert_to_numpy=True)

    return dense_c, dense_q

In [None]:
dense_c, dense_q = compute_dense_repr(corpus_df1, queries_df1)

#dense_score=<q_dense, d_dense>: rows are labelled by query id, columns by document id
dense_score_df = pd.DataFrame(dense_q.dot(dense_c.T), index=queries_df1["_id"], columns=corpus_df1["_id"])
dense_score_df.to_parquet(f"dense_score_df_{dataset1_name_dir.split('/')[0]}.parquet")

In [None]:
dense_c, dense_q = compute_dense_repr(corpus_df2, queries_df2)

#dense_score=<q_dense, d_dense>: rows are labelled by query id, columns by document id
dense_score_df = pd.DataFrame(dense_q.dot(dense_c.T), index=queries_df2["_id"], columns=corpus_df2["_id"])
dense_score_df.to_parquet(f"dense_score_df_{dataset2_name_dir.split('/')[0]}.parquet")

<a id='exact_retr'></a>
## Top k retrieval

In [None]:
#Reload the sparse and dense score matrices saved for the first dataset
sparse_score_df1 = pd.read_parquet(f"sparse_score_df_{dataset1_name_dir.split('/')[0]}.parquet")
dense_score_df1 = pd.read_parquet(f"dense_score_df_{dataset1_name_dir.split('/')[0]}.parquet")

In [None]:
sparse_score_df1.info()

In [None]:
dense_score_df1.info()

In [None]:
#Reload the sparse and dense score matrices saved for the second dataset
sparse_score_df2 = pd.read_parquet(f"sparse_score_df_{dataset2_name_dir.split('/')[0]}.parquet")
dense_score_df2 = pd.read_parquet(f"dense_score_df_{dataset2_name_dir.split('/')[0]}.parquet")

In [None]:
sparse_score_df2.info()

In [None]:
dense_score_df2.info()

In [None]:
def compute_exact_retrieval(sparse_score_df: pd.DataFrame, dense_score_df: pd.DataFrame, k: int):
    """Retrieve, for every query, the exact top-k documents by total (sparse + dense) score.

    Parameters
    ----------
    sparse_score_df : sparse scores, one row per query and one column per document.
    dense_score_df : dense scores, laid out like ``sparse_score_df``.
    k : number of documents to retrieve per query (expected to be positive).

    Returns
    -------
    (total_score_df, top_k_exact_docs) : the DataFrame of the summed scores and a 2-D array
    with one row per query holding the column labels of its best documents, from the highest
    to the lowest total score. A row holds fewer than ``k`` labels only when there are fewer
    than ``k`` documents.
    """
    #Sum the two scores
    total_score_df = sparse_score_df + dense_score_df

    #Get the exact top-k documents indexes.
    #The sort runs on the plain ndarray: np.argsort applied to a DataFrame may be wrapped back
    #into a DataFrame (depending on the library versions), which does not support the 2-D
    #slicing used here
    ascending_idx = np.argsort(total_score_df.to_numpy(), axis=1)
    idx_exact_top_k = ascending_idx[:, :-k-1:-1]

    #Get the exact top-k documents ids; indexing with the 2-D index array keeps the
    #(queries, k) shape without any reshape
    top_k_exact_docs = total_score_df.columns.to_numpy()[idx_exact_top_k]

    return total_score_df, top_k_exact_docs

In [None]:
#Number of documents retrieved for every query
k = 5
total_score_df1, top_k_exact_docs_df1 = compute_exact_retrieval(sparse_score_df1, dense_score_df1, k=k)

In [None]:
total_score_df1.head(2)

In [None]:
top_k_exact_docs_df1[0]

In [None]:
#Number of documents retrieved for every query
k = 5
total_score_df2, top_k_exact_docs_df2 = compute_exact_retrieval(sparse_score_df2, dense_score_df2, k=k)

In [None]:
total_score_df2.head(2)

In [None]:
top_k_exact_docs_df2[0]

<a id='approx_retr'></a>
## Top k' retrieval (approximate case)

In [None]:
def compute_approx_retrieval(sparse_score_df: pd.DataFrame, dense_score_df: pd.DataFrame, total_score_df: pd.DataFrame, k_prime: int, k: "int | None" = None):
    """Retrieve, for every query, the approximate top-k documents.

    For each query the top-k' documents by sparse score and the top-k' documents by dense
    score are merged into one candidate set; the candidates are then ranked by their total
    score and the best ``k`` are returned.

    Parameters
    ----------
    sparse_score_df : sparse scores, one row per query and one column per document.
    dense_score_df : dense scores, laid out like ``sparse_score_df``.
    total_score_df : total scores, laid out like ``sparse_score_df``; documents are referred
        to by column position, so the three frames must share the same column order.
    k_prime : number of documents taken from each of the two rankings.
    k : number of documents to return per query. When omitted it defaults to ``k_prime``.
        It used to be read from a global variable named ``k``, which silently ignored the
        value of k used by the caller.

    Returns
    -------
    2-D array of shape (queries, k) with the column labels of ``total_score_df`` of the
    retrieved documents, from the highest to the lowest total score.

    Raises
    ------
    ValueError : if ``k`` is not between 1 and min(k_prime, number of documents); outside
        that range a candidate set may hold fewer than ``k`` documents.
    """
    if k is None:
        k = k_prime

    #Plain ndarrays: positional indexing and np.argsort behave uniformly on them
    sparse_scores = sparse_score_df.to_numpy()
    dense_scores = dense_score_df.to_numpy()
    total_scores = total_score_df.to_numpy()
    doc_ids = total_score_df.columns.to_numpy()

    n_queries, n_docs = total_scores.shape
    if not 1 <= k <= min(k_prime, n_docs):
        raise ValueError(f"k must be between 1 and min(k_prime, number of documents), got k={k}, k_prime={k_prime}")

    #Get the indexes of top-k' docs for the sparse representations
    idx_sparse_scores = np.argsort(sparse_scores, axis=1)[:, :-k_prime-1:-1]
    #Get the indexes of top-k' docs for the dense representations
    idx_dense_scores = np.argsort(dense_scores, axis=1)[:, :-k_prime-1:-1]

    #Every candidate set holds at least min(k_prime, n_docs) >= k documents,
    #so each row below is filled with exactly k indexes
    idx_approx_top_k = np.empty((n_queries, k), dtype=np.intp)
    for q in range(n_queries):
        #Merge the two top-k' sets of the query and drop the duplicates
        candidates = np.unique(np.concatenate((idx_sparse_scores[q], idx_dense_scores[q])))
        #Rank the candidates by total score and keep the best k, highest first
        best_first = np.argsort(total_scores[q, candidates])[:-k-1:-1]
        idx_approx_top_k[q] = candidates[best_first]

    #Get the doc ids from the retrieved indexes
    return doc_ids[idx_approx_top_k]

In [None]:
#Approximate retrieval on the first dataset, with k' equal to k
top_k_approx_docs_df1 = compute_approx_retrieval(sparse_score_df1, dense_score_df1, total_score_df1, k_prime=k)

In [None]:
top_k_approx_docs_df1[0]

In [None]:
#Approximate retrieval on the second dataset, with k' equal to k
top_k_approx_docs_df2 = compute_approx_retrieval(sparse_score_df2, dense_score_df2, total_score_df2, k_prime=k)

In [None]:
top_k_approx_docs_df2[0]

<a id='eval'></a>
## Evaluations

In [None]:
def _rank_descending(scores: np.ndarray) -> np.ndarray:
    """Return, row by row, the column positions ordered from the highest to the lowest score."""
    return np.argsort(scores, axis=1)[:, ::-1]


def _mean_recall(exact_top_k: np.ndarray, sparse_rank: np.ndarray, dense_rank: np.ndarray, total_scores: np.ndarray, k: int, k_prime: int):
    """Return the mean, over the queries, of the recall of the approximate top-k w.r.t. the exact top-k."""
    recalls = []
    for q, exact_row in enumerate(exact_top_k):
        #Candidate set: union of the top-k' documents of the two rankings
        candidates = np.unique(np.concatenate((sparse_rank[q, :k_prime], dense_rank[q, :k_prime])))
        #Approximate top-k: the best k candidates by total score
        approx_row = candidates[np.argsort(total_scores[q, candidates])[:-k-1:-1]]
        recalls.append(len(np.intersect1d(exact_row, approx_row, assume_unique=True))/k)
    return np.mean(recalls)


def evaluation_fun(sparse_score_df: pd.DataFrame, dense_score_df: pd.DataFrame, corpus_len: int, k_list: list, step:int=1, epsilon:float=0.01):
    """Measure how the recall of the approximate retrieval grows with k', for several k.

    For every k in ``k_list`` the values k' = k, k+step, ... (up to ``corpus_len``) are tried
    until the mean recall of the approximate top-k w.r.t. the exact top-k reaches
    ``1-epsilon``. Documents are compared by column position, so the two score frames must
    share the same column order.

    The three rankings (sparse, dense, total) depend neither on k nor on k', therefore they
    are computed once up front instead of at every iteration.

    Parameters
    ----------
    sparse_score_df : sparse scores, one row per query and one column per document.
    dense_score_df : dense scores, laid out like ``sparse_score_df``.
    corpus_len : largest k' to try.
    k_list : values of k to evaluate (any number of positive integers).
    step : increment of k' between two consecutive attempts.
    epsilon : the search for a given k stops when the mean recall is at least ``1-epsilon``.

    Returns
    -------
    (mean_recalls_list, top_k_exact_approx_lists, max_k_prime_list) where, for the i-th k,
    ``mean_recalls_list[i]`` holds the mean recall of every k' tried and
    ``max_k_prime_list[i]`` the last k' tried (the one that reached the threshold or, if it
    was never reached, the largest one). ``top_k_exact_approx_lists`` holds one empty list
    per k: it is never filled and is returned only to keep the three-value interface.
    A k greater than ``corpus_len`` yields no attempt and no entry in ``max_k_prime_list``.
    """
    total_scores = (sparse_score_df + dense_score_df).to_numpy()
    sparse_rank = _rank_descending(sparse_score_df.to_numpy())
    dense_rank = _rank_descending(dense_score_df.to_numpy())
    total_rank = _rank_descending(total_scores)

    #One result list per k (the size was previously hard-coded to three values of k)
    mean_recalls_list = [[] for _ in k_list]
    top_k_exact_approx_lists = [[] for _ in k_list]
    max_k_prime_list = []

    for recalls_of_k, k in zip(mean_recalls_list, k_list):
        exact_top_k = total_rank[:, :k]
        last_k_prime = None
        for k_prime in range(k, corpus_len+1, step):
            last_k_prime = k_prime
            mean = _mean_recall(exact_top_k, sparse_rank, dense_rank, total_scores, k, k_prime)
            recalls_of_k.append(mean)
            if mean >= 1-epsilon:
                break

        #Record the last k' tried even when the threshold was never reached, so that the
        #result lists stay aligned with k_list
        if last_k_prime is not None:
            print(last_k_prime)
            max_k_prime_list.append(last_k_prime)

    return mean_recalls_list, top_k_exact_approx_lists, max_k_prime_list


In [None]:
def print_plot(k_list, mean_recalls_list, top_k_exact_approx_lists, max_k_prime_list, step=1):
    """Plot, for every k, the mean recall as a function of k'.

    Parameters
    ----------
    k_list : values of k that were evaluated.
    mean_recalls_list : for the i-th k, the mean recalls obtained for k' = k, k+step, ...
    top_k_exact_approx_lists : not used by the plot; accepted to mirror the evaluation output.
    max_k_prime_list : for the i-th k, the last k' that was evaluated.
    step : increment of k' that was used during the evaluation.
    """
    all_recalls = np.concatenate(mean_recalls_list)

    plt.figure(figsize=(4, 3))
    plt.ylabel('Recall scores')
    plt.xlabel('K\' values')
    plt.ylim(np.min(all_recalls), 1)
    #Dashed reference line at the highest recall that was reached
    plt.hlines(np.max(all_recalls), np.min(k_list), np.max(max_k_prime_list), linewidth=2, linestyles="dashed", colors="grey")

    for i, k in enumerate(k_list):
        #The name of a curve is given with "label": "legend" is not a valid keyword of plt.plot
        plt.plot(range(k, max_k_prime_list[i]+1, step), mean_recalls_list[i], linewidth=2, label=str(i))
    plt.grid()
    plt.legend()
    plt.show()

In [None]:
#Values of k to evaluate; the evaluation of the first dataset tries k' up to the corpus size
k_list = [5, 50, 100]
mean_recalls_list_df1, top_k_exact_approx_lists_df1, max_k_prime_list_df1 = evaluation_fun(
    sparse_score_df1, dense_score_df1, corpus_len=len(corpus_df1), k_list=k_list)

In [None]:
print_plot(k_list, mean_recalls_list_df1, top_k_exact_approx_lists_df1, max_k_prime_list_df1)

In [None]:
#Evaluation of the second dataset, trying k' up to the corpus size
mean_recalls_list_df2, top_k_exact_approx_lists_df2, max_k_prime_list_df2 = evaluation_fun(
    sparse_score_df2, dense_score_df2, corpus_len=len(corpus_df2), k_list=k_list)

In [None]:
print_plot(k_list, mean_recalls_list_df2, top_k_exact_approx_lists_df2, max_k_prime_list_df2)

In [None]:
#Report of the first dataset: one summary per value of k.
#The loop variable is not called "k" on purpose, to leave the notebook-level k untouched
for i, k_value in enumerate(k_list):
    print("K: ", k_value)
    best_recall = np.max(mean_recalls_list_df1[i])
    print("Exit threshold: ", 1-0.01)
    print("Highest recall w.r.t. exact solution: ", best_recall, ",")
    print("obtained with k\': ", max_k_prime_list_df1[i])
    print()


In [None]:
#Report of the second dataset: one summary per value of k.
#The loop variable is not called "k" on purpose, to leave the notebook-level k untouched
for i, k_value in enumerate(k_list):
    print("K: ", k_value)
    best_recall = np.max(mean_recalls_list_df2[i])
    print("Exit threshold: ", 1-0.01)
    print("Highest recall w.r.t. exact solution: ", best_recall, ",")
    print("obtained with k\': ", max_k_prime_list_df2[i])
    print()
