# Learning with Massive Data
<p>
Assignment 3 - Similarity search for document pairs<br>
Giovanni Costa - 880892
</p>

Contents:
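- [Document sparse representation](#Document-sparse-representation)
- [Sequential implementation](#Sequential-Implementation)
  - [Exact similarity search](#Exact-similarity-search)
  - [Approximate similarity search](#Approximate-similarity-search)
- [Parallel implementation](#Parallel-implementation)
- [Results](#Results)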

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.sparse import load_npz, save_npz
from nltk.corpus import words
from nltk.stem import PorterStemmer
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.random_projection import SparseRandomProjection

## Document sparse representation

In [None]:
# Relative path of the corpus file (JSON Lines format) read by the next cell.
dataset = "datasets/nfcorpus/corpus.jsonl"

In [None]:
# Parse the corpus file with one JSON record per line into a DataFrame.
df_docs = pd.read_json(path_or_buf=dataset, lines=True)
# Print the frame summary (columns, dtypes, non-null counts).
df_docs.info()

In [None]:
# Preview the first three rows of the corpus.
df_docs.head(n=3)

In [None]:
#Sk-learn's "TfidfVectorizer" extension to provide the stemming feature
class StemmedTfidfVectorizer(TfidfVectorizer):
    """``TfidfVectorizer`` whose analyzer additionally stems every token."""

    # A single Porter stemmer shared by all instances of the class.
    stemmer = PorterStemmer()

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stemmed tokens.

        The parent analyzer is built first; its tokens are then passed one by
        one through the class-level Porter stemmer.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            stem = StemmedTfidfVectorizer.stemmer.stem
            return (stem(token) for token in base_analyzer(doc))

        return stemmed_analyzer
    
    
def compute_sparse_repr(corpus: pd.DataFrame):
    """Fit a stemmed TF-IDF model on ``corpus["text"]`` and embed every document.

    Tokens are runs of word characters (``\\w+``), lowercased and stemmed; no
    stop words are removed. ``binary=True`` makes the term-frequency part of
    TF-IDF a presence flag instead of a count.

    Args:
        corpus: DataFrame with a ``"text"`` column holding the raw documents.

    Returns:
        A ``(sparse_doc, vocab)`` tuple: the sparse document-term matrix
        produced by ``fit_transform`` and the fitted ``vocabulary_`` mapping.
    """
    vectorizer = StemmedTfidfVectorizer(
        token_pattern=r'\w+',
        lowercase=True,
        stop_words=None,
        binary=True,
    )

    # Learn the vocabulary/IDF weights and embed the corpus in a single pass.
    embeddings = vectorizer.fit_transform(corpus["text"])

    return embeddings, vectorizer.vocabulary_

In [None]:
# Embed the whole corpus, keeping the fitted vocabulary for later cells.
sparse_repr, vocab = compute_sparse_repr(corpus=df_docs)
# Persist the sparse matrix so later sections can reload it from disk.
save_npz(file="sparse_repr_nfcorpus.npz", matrix=sparse_repr)

## Sequential Implementation 
### Exact similarity search

In [None]:
# Minimum cosine similarity for two documents to count as a similar pair.
threshold = 0.8

In [None]:
# Reload the persisted document-term matrix.
sparse_repr = load_npz("sparse_repr_nfcorpus.npz")
print(sparse_repr.shape)
# Density = explicitly non-zero entries divided by the total number of cells.
n_docs, n_terms = sparse_repr.shape
print("Density ratio:", sparse_repr.count_nonzero() / (n_docs * n_terms))

In [None]:
#%%timeit
# All-pairs cosine similarity; dense_output=False keeps the result sparse.
cosine_scores = cosine_similarity(sparse_repr, dense_output=False)
# np.fill_diagonal() writes through ndarray-style `.flat`, which SciPy sparse
# matrices do not provide, so the self-similarities are masked with the
# sparse-native setdiag() instead.
cosine_scores.setdiag(-1)
# The similarity matrix is symmetric, so every unordered pair is counted twice;
# integer division keeps the pair count an integer.
num_of_pairs = (cosine_scores >= threshold).sum() // 2

In [None]:
# Show the number of document pairs whose cosine similarity reaches the threshold.
print(num_of_pairs)

In [None]:
# Disabled debugging snippet, kept as a string literal so none of it executes.
# It would print the id pairs whose score is >= 0.5.
# NOTE(review): the line that builds `tmp` is itself commented out inside the
# string, so `tmp` would be undefined if the snippet were re-enabled as-is.
""" 
#tmp=pd.DataFrame(cosine_scores, columns=df_docs["_id"], index=df_docs["_id"])
x, y=np.where(tmp>=0.5)
for i, j in zip(tmp.index[x], tmp.columns[y]):
    print(i, j) """

### Approximate similarity search
(using Sparse Random Projection)

In [None]:
# `eps` value handed to SparseRandomProjection in the next cell.
epsilon = 0.1

In [None]:
# Build the projector and fit it on the TF-IDF matrix in one step
# (fit() returns the estimator itself); the seed is fixed for reproducibility.
sr_proj = SparseRandomProjection(eps=epsilon, random_state=32).fit(sparse_repr)

In [None]:
# Report the fitted target dimensionality and projection-matrix density, one per line.
print(sr_proj.n_components_, sr_proj.density_, sep="\n")

In [None]:
# Project the documents into the lower-dimensional space.
sparse_repr_approx_srp = sr_proj.transform(sparse_repr)
print(sparse_repr_approx_srp.shape)
# Density = explicitly non-zero entries divided by the total number of cells.
n_rows, n_cols = sparse_repr_approx_srp.shape
print("Density ratio:", sparse_repr_approx_srp.count_nonzero() / (n_rows * n_cols))

In [None]:
#%%timeit
# The projection is repeated here so that timing this cell covers the whole
# approximate pipeline (projection + similarity).
sparse_repr_approx_srp = sr_proj.transform(sparse_repr)
cosine_scores_approx_srp = cosine_similarity(sparse_repr_approx_srp, dense_output=False)
# np.fill_diagonal() writes through ndarray-style `.flat`, which SciPy sparse
# matrices do not provide, so the self-similarities are masked with the
# sparse-native setdiag() instead.
cosine_scores_approx_srp.setdiag(-1)
# The similarity matrix is symmetric, so every unordered pair is counted twice;
# integer division keeps the pair count an integer.
num_of_pairs_approx_srp = (cosine_scores_approx_srp >= threshold).sum() // 2

In [None]:
# Display the pair count obtained on the randomly projected representation.
num_of_pairs_approx_srp

## Parallel implementation

In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import RowMatrix
import pyspark.pandas as ps

# Interpreter paths exported for PySpark through environment variables.
# Raw strings are required: in a normal literal the backslashes of these
# Windows paths form invalid escape sequences (\P, \m, \e, \M, \S, \i), which
# Python flags with a warning and intends to reject in the future.
os.environ['PYSPARK_PYTHON'] = r'C:\ProgramData\mambaforge\envs\ML-base\python.exe'
os.environ['PYSPARK_DRIVER_PYTHON'] = r'C:\ProgramData\mambaforge\envs\ML-base\Scripts\ipython.exe'

# Create (or reuse) the Spark session and keep a handle to its context.
spark = SparkSession.builder.appName("MyApp").getOrCreate()
sc = spark.sparkContext
spark

In [None]:
# Reload the persisted document-term matrix.
sparse_repr = load_npz("sparse_repr_nfcorpus.npz")
print(sparse_repr.shape)

# Order the (term, index) entries of the vocabulary by index and keep the terms,
# so that position i of `vocab_terms` is the term stored with index i.
sorted_pairs = sorted(vocab.items(), key=lambda item: item[1])
vocab_terms = [term for term, _ in sorted_pairs]

In [None]:
# NOTE(review): `sparse_df` is only assigned inside a string-quoted (disabled)
# cell later in the notebook; run top to bottom this raises NameError — confirm intent.
sparse_df

In [None]:
# NOTE(review): `a` and `b` are not defined anywhere in the visible notebook, so
# this cell raises NameError as written — presumably an exploratory sketch; confirm.
cumsum = np.cumsum(np.dot(a,b))  # running sum over the result of dot(a, b)
# argmax on a boolean array gives the position of its first True entry
# (and 0 when no entry is True).
index = np.argmax(cumsum < threshold)
result = cumsum[index] #b(d) value


# NOTE(review): unfinished draft — the `if` condition below is cut off after `>`,
# so this cell is a SyntaxError until the comparison and the loop body are written.
# NOTE(review): the parameter `id` shadows the builtin of the same name.
def my_map(id, doc_repr, sorted_idx, ):
    for idx in sorted_idx:
        if doc_repr[idx]>

In [None]:
# Disabled experiment, kept as a string literal so none of it executes: it would
# wrap the sparse matrix in a pandas DataFrame and convert it to a
# pandas-on-Spark DataFrame.
# NOTE(review): the final `sdf.apply()` is called without a function argument.
""" sparse_df=pd.DataFrame.sparse.from_spmatrix(sparse_repr)
sdf = ps.from_pandas(sparse_df)
sdf.max(axis=0)

sdf.apply() """

In [None]:
#https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/api/pyspark.pandas.DataFrame.applymap.html

In [48]:
# Stop the Spark session once the parallel experiments are done.
spark.stop()

## Results

In [None]:
# Disabled evaluation helper, kept as a string literal so none of it executes.
# NOTE(review): it calls compute_exact_retrieval / compute_approx_retrieval, which
# are not defined in the visible notebook — confirm before re-enabling.
# NOTE(review): the stop condition tests `k==corpus_len`; `k_prime==corpus_len`
# looks like the intended check — confirm. `top_k_exact_approx_lists` is returned
# but never filled.
""" def evaluation_fun(total_score_df: pd.DataFrame, 
                   idx_sparse: np.array, idx_dense: np.array, idx_total: np.array,                   
                   corpus_len: int, k_list: list, step:int=1, epsilon:float=0.01):
    mean_recalls_list=[list() for i in range(len(k_list))]
    top_k_exact_approx_lists=[list() for i in range(len(k_list))]
    max_k_prime_list=[]
    i=0
    for k in k_list:
        top_k_exact_docs=compute_exact_retrieval(total_score_df, idx_total, k)
        for k_prime in range(k, corpus_len+1, step):
            top_k_approx_docs=compute_approx_retrieval(idx_sparse, idx_dense, total_score_df, k, k_prime)
            recalls=[len(np.intersect1d(top_k_exact_docs[i], top_k_approx_docs[i], assume_unique=True))/k for i in range(len(top_k_exact_docs))]
            mean=np.mean(recalls)
            mean_recalls_list[i].append(mean)

            if mean>=1-epsilon or k==corpus_len:
                print(k_prime)
                max_k_prime_list.append(k_prime)
                break
        i+=1
    
    return mean_recalls_list, top_k_exact_approx_lists, max_k_prime_list
 """

In [None]:
# Disabled plotting helper, kept as a string literal so none of it executes.
# It would draw one recall-vs-k' curve per k value, plus a dashed horizontal
# line at the highest mean recall.
""" def print_plot(k_list, mean_recalls_list, max_k_prime_list, step=1):
    plt.figure(figsize=(7, 5))
    plt.ylabel('Recall scores')
    plt.xlabel('K\' values')
    plt.ylim(np.min(np.concatenate(mean_recalls_list)), 1)
    plt.hlines(np.max(np.concatenate(mean_recalls_list)), np.min(k_list), np.max(max_k_prime_list), linewidth=2, linestyles="dashed", colors="grey")

    for i in range(len(k_list)):    
        plt.plot(range(k_list[i], max_k_prime_list[i]+1, step), mean_recalls_list[i], linewidth=2, label="k="+str(k_list[i]))
    plt.grid()
    plt.legend()
    plt.show() """