In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import words
from nltk.stem import PorterStemmer
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Run configuration
# Path of the JSON-lines corpus file that gets loaded into df_docs
dataset = "datasets/nfcorpus/corpus.jsonl"
# NOTE(review): `threshold` is not referenced by any later cell shown here;
# the similarity filter further down hard-codes 0.5 instead — confirm which value is intended.
threshold = 0.9

In [None]:
# Parse the corpus file as JSON Lines (one JSON object per line) into a DataFrame
df_docs = pd.read_json(path_or_buf=dataset, lines=True)
# Print the frame summary (columns, dtypes, non-null counts)
df_docs.info()

In [None]:
# Preview the first three documents
df_docs.head(n=3)

In [None]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    """scikit-learn ``TfidfVectorizer`` whose analyzer Porter-stems every token it produces."""

    # Single stemmer instance stored on the class and shared by all vectorizers
    stemmer = PorterStemmer()

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer on a document and stems each token."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            # Generator expression: tokens are stemmed lazily as the caller iterates
            return (StemmedTfidfVectorizer.stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_tokens
    
    
def compute_sparse_repr(vocab: np.ndarray, corpus: pd.DataFrame, text_column: str = "text"):
    """Compute stemmed TF-IDF vectors for one text column of ``corpus`` over a fixed vocabulary.

    Args:
        vocab: Terms passed to the vectorizer as its fixed ``vocabulary``; because the
            analyzer stems every token, the terms are expected to be stemmed already.
        corpus: DataFrame holding the documents, one per row.
        text_column: Name of the column whose strings are vectorized. Defaults to
            ``"text"``, the column the original implementation always used.

    Returns:
        The result of ``StemmedTfidfVectorizer.fit_transform`` on that column
        (per scikit-learn's documentation, a sparse document-term matrix with one
        row per document and one column per vocabulary term).

    Raises:
        KeyError: If ``text_column`` is not a column of ``corpus``.
    """
    # Tokens are runs of word characters (\w+), lowercased before stemming; no stop-word
    # filtering. The fixed vocabulary keeps the feature space independent of the corpus.
    vectorizer = StemmedTfidfVectorizer(
        lowercase=True,
        vocabulary=vocab,
        stop_words=None,
        token_pattern=r'\w+',
    )

    # Fit the IDF weights on this corpus and return its sparse embedding
    return vectorizer.fit_transform(corpus[text_column])

In [None]:
# Vocabulary: lowercase every word of the NLTK `words` corpus, Porter-stem it,
# then keep the sorted distinct stems (np.unique removes the duplicates)
stemmer = PorterStemmer()
vocab = np.unique(list(map(stemmer.stem, np.char.lower(words.words()))))

In [None]:
# Stemmed TF-IDF representation of every document in the corpus
sparse_repr = compute_sparse_repr(vocab=vocab, corpus=df_docs)

In [None]:
# Pairwise cosine similarity of the document vectors (no second matrix given,
# so the rows of sparse_repr are compared with each other)
cosine_scores = cosine_similarity(X=sparse_repr)

In [None]:
# Overwrite the diagonal (each document compared with itself) with -1, in place
np.fill_diagonal(cosine_scores, val=-1)
# Label both axes of the score matrix with the document ids
tmp = pd.DataFrame(data=cosine_scores, index=df_docs["_id"], columns=df_docs["_id"])


In [None]:
# Row/column positions of every score that is at least 0.5
# NOTE(review): the config cell defines `threshold` (0.9) but this filter uses a literal 0.5 — confirm which is intended.
x, y = np.nonzero(tmp.to_numpy() >= 0.5)

In [None]:
# Print "<row id> <column id>" for every selected position of the score matrix
# NOTE(review): the matrix compares the corpus with itself, so it is symmetric and
# each pair is listed twice, as (a, b) and as (b, a).
for row_id, col_id in zip(tmp.index[x], tmp.columns[y]):
    print(row_id, col_id)

# Results

In [None]:
# Disabled code: the definition below is wrapped in a triple-quoted string, so
# `evaluation_fun` is never defined — the cell only evaluates a string literal.
# As written, for each k in k_list it would grow k_prime from k (in increments of `step`)
# until the mean recall of the approximate top-k against the exact top-k reaches 1-epsilon.
# It calls compute_exact_retrieval / compute_approx_retrieval, which are not defined in
# the cells shown here.
# NOTE(review): `top_k_exact_approx_lists` is created and returned but never filled, and the
# stop condition tests `k==corpus_len` (presumably `k_prime==corpus_len` was meant) —
# confirm both before re-enabling.
""" def evaluation_fun(total_score_df: pd.DataFrame, 
                   idx_sparse: np.array, idx_dense: np.array, idx_total: np.array,                   
                   corpus_len: int, k_list: list, step:int=1, epsilon:float=0.01):
    mean_recalls_list=[list() for i in range(len(k_list))]
    top_k_exact_approx_lists=[list() for i in range(len(k_list))]
    max_k_prime_list=[]
    i=0
    for k in k_list:
        top_k_exact_docs=compute_exact_retrieval(total_score_df, idx_total, k)
        for k_prime in range(k, corpus_len+1, step):
            top_k_approx_docs=compute_approx_retrieval(idx_sparse, idx_dense, total_score_df, k, k_prime)
            recalls=[len(np.intersect1d(top_k_exact_docs[i], top_k_approx_docs[i], assume_unique=True))/k for i in range(len(top_k_exact_docs))]
            mean=np.mean(recalls)
            mean_recalls_list[i].append(mean)

            if mean>=1-epsilon or k==corpus_len:
                print(k_prime)
                max_k_prime_list.append(k_prime)
                break
        i+=1
    
    return mean_recalls_list, top_k_exact_approx_lists, max_k_prime_list
 """

In [None]:
# Disabled code: the definition below is wrapped in a triple-quoted string, so
# `print_plot` is never defined — the cell only evaluates a string literal.
# As written, it would plot one recall-vs-K' curve per entry of k_list (x values run from
# k to that k's entry in max_k_prime_list), with a dashed grey horizontal line at the
# highest recall and the y-axis spanning from the lowest recall up to 1.
""" def print_plot(k_list, mean_recalls_list, max_k_prime_list, step=1):
    plt.figure(figsize=(7, 5))
    plt.ylabel('Recall scores')
    plt.xlabel('K\' values')
    plt.ylim(np.min(np.concatenate(mean_recalls_list)), 1)
    plt.hlines(np.max(np.concatenate(mean_recalls_list)), np.min(k_list), np.max(max_k_prime_list), linewidth=2, linestyles="dashed", colors="grey")

    for i in range(len(k_list)):    
        plt.plot(range(k_list[i], max_k_prime_list[i]+1, step), mean_recalls_list[i], linewidth=2, label="k="+str(k_list[i]))
    plt.grid()
    plt.legend()
    plt.show() """