In [1]:
import os
import numpy as np
import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.ml.linalg import SparseVector
from scipy.sparse import load_npz


# Tell PySpark which Python executables to use. Raw strings keep the Windows
# backslashes literal: sequences such as "\P" or "\m" are not valid escapes and
# trigger an invalid-escape warning in regular string literals.
os.environ['PYSPARK_PYTHON'] = r'C:\ProgramData\mambaforge\envs\ML-base\python.exe'
os.environ['PYSPARK_DRIVER_PYTHON'] = r'C:\ProgramData\mambaforge\envs\ML-base\Scripts\ipython.exe'

# Create (or reuse) the Spark session and keep a handle on its SparkContext.
spark = SparkSession.builder.appName("MyApp").getOrCreate()
sc=spark.sparkContext
spark



In [2]:
# Disabled loaders for the precomputed *_nfcorpus.parquet id/term tables;
# the notebook currently runs on the toy corpus defined further down.
#df_id=pd.read_parquet("ids_nfcorpus.parquet")
#vocab=pd.read_parquet("terms_nfcorpus.parquet")
# Minimum cosine similarity for two documents to be reported as a similar pair.
threshold=0.7

In [3]:
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
# Extension of scikit-learn's TfidfVectorizer that stems every token.
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose analyzer applies Porter stemming to each token."""

    # A single stemmer instance shared by all vectorizers of this class.
    stemmer = PorterStemmer()

    def build_analyzer(self):
        """Wrap the parent analyzer so that it yields stemmed tokens.

        Returns a callable that takes a document and produces a generator of
        the stemmed form of every token emitted by the parent analyzer.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return (StemmedTfidfVectorizer.stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_analyzer
    
    
def compute_sparse_repr(corpus: pd.DataFrame):
    """Fit a stemmed TF-IDF model on ``corpus["text"]`` and vectorize it.

    Tokens are runs of word characters (letters, digits, underscore),
    lower-cased and stemmed; no stop words are removed. ``binary=True`` makes
    the term-frequency part of the weight 0/1 (presence instead of count).

    Returns
    -------
    tuple
        ``(matrix, vocabulary)``: the sparse document-term matrix produced by
        ``fit_transform`` and the fitted ``vocabulary_`` mapping each term to
        its column index.
    """
    vectorizer = StemmedTfidfVectorizer(
        lowercase=True,
        stop_words=None,
        token_pattern=r'\w+',
        binary=True,
    )

    # Fitting and transforming in one pass also populates `vocabulary_`.
    tfidf_matrix = vectorizer.fit_transform(corpus["text"])

    return tfidf_matrix, vectorizer.vocabulary_

In [4]:
# Two-document toy corpus (a text column and an integer id column) small enough to verify by hand.
toy_df=pd.DataFrame([["Hi how are you John?", 100], ["Hi how are you John my bro?", 101]], columns=["text", "id"])

In [5]:
# Display the toy corpus.
toy_df

Unnamed: 0,text,id
0,Hi how are you John?,100
1,Hi how are you John my bro?,101


In [6]:
# TF-IDF matrix (one row per toy document) and the term -> column-index vocabulary.
sparse_repr, vocab=compute_sparse_repr(toy_df)

In [7]:
# Brute-force reference: cosine similarity between every pair of documents.
cosine_scores=cosine_similarity(sparse_repr)
print(cosine_scores)
# Overwrite the self-similarities on the diagonal (in place) so they are never counted as pairs.
np.fill_diagonal(cosine_scores, -1)
# Each qualifying pair shows up twice, as (i, j) and (j, i), so halve the count.
num_of_pairs=(cosine_scores>=threshold).sum()/2
print(num_of_pairs)

[[1.         0.74740735]
 [0.74740735 1.        ]]
1.0


In [8]:
#sparse_repr=load_npz("sparse_repr_nfcorpus.npz")

In [9]:
def csr_to_sparse_vector(row):
    """Convert a single-row sparse matrix into a Spark ``SparseVector``.

    The vector size is the row's column count (``row.shape[1]``); its entries
    are the ``(index, value)`` pairs taken from ``row.indices`` and ``row.data``.
    """
    dimension = row.shape[1]
    index_value_pairs = list(zip(row.indices, row.data))
    return SparseVector(dimension, index_value_pairs)

# One Spark SparseVector per row of the TF-IDF matrix, in row order.
docs_sparse_forSpark = [csr_to_sparse_vector(sparse_repr.getrow(i)) for i in range(sparse_repr.shape[0])]
# Document identifiers taken from the toy corpus (the df_id alternative is commented out).
doc_ids = toy_df["id"] #df_id["_id"]

In [10]:
def sparse_argsort(matrix, idx):
    """Return the column indices stored in row ``idx``, ordered by decreasing value.

    Only the entries explicitly stored in the sparse row are considered.

    Parameters
    ----------
    matrix : sparse matrix exposing ``getrow``
    idx : int
        Row to inspect.

    Returns
    -------
    list of int
    """
    selected_row = matrix.getrow(idx)
    values = selected_row.data
    columns = selected_row.indices
    # argsort is ascending, so reverse it to rank the largest value first.
    descending_positions = np.argsort(values)[::-1]
    return columns[descending_positions].tolist()
# For every document (row), its stored term indices ordered from highest to lowest weight.
sorted_index_term_doc=[sparse_argsort(sparse_repr, idx) for idx in range(sparse_repr.shape[0])]

In [11]:
# Dense view of the TF-IDF matrix (fine for the toy corpus only).
sparse_repr.toarray()

array([[0.4472136 , 0.        , 0.4472136 , 0.4472136 , 0.4472136 ,
        0.        , 0.4472136 ],
       [0.33425073, 0.46977774, 0.33425073, 0.33425073, 0.33425073,
        0.46977774, 0.33425073]])

In [12]:
# Display the per-document term orderings.
sorted_index_term_doc

[[2, 3, 0, 6, 4], [5, 1, 2, 3, 0, 6, 4]]

In [13]:
# d_star[t] = largest weight that term (column) t takes in any document:
# max over axis 0 is the column-wise maximum, then flattened to a 1-D array.
# The values displayed for d_star agree with the column maxima of the dense matrix shown above.
d_star=sparse_repr.max(axis=0).toarray().reshape(-1)

In [14]:
# Broadcast d_star so the functions run by Spark can read it through d_star_sc.value.
d_star_sc=sc.broadcast(d_star)
# One RDD record per document: (doc_id, (sparse_vector, term indices sorted by decreasing weight)).
# NOTE(review): doc_ids[i] is a label lookup on the pandas Series; it matches row i only
# while the Series keeps the default 0..n-1 index — confirm before switching data sources.
rdd_forMap=sc.parallelize([(doc_ids[i], (docs_sparse_forSpark[i], sorted_index_term_doc[i])) for i in range(sparse_repr.shape[0])])

In [15]:
# Display the per-term maximum weights.
d_star

array([0.4472136 , 0.46977774, 0.4472136 , 0.4472136 , 0.4472136 ,
       0.46977774, 0.4472136 ])

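Map( (doc_id, (doc, sorted_index_term_doc)) )
    compute the boundary b_d of the document, then for every term placed after the boundary
    return (index_term, (doc_id, doc) )

GroupBy: done by Spark (records are grouped by index_term)

Reduce( (index_term,  list( (doc_id, doc) )) )
    return the list of (doc_id_1, doc_id_2, similarity) whose cosine similarity is >= threshold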



In [16]:
def b_d(sparse_repr, term_order):
    """Return the boundary position of a document within ``term_order``.

    Walking the terms in the given order, the products
    ``sparse_repr[t] * d_star[t]`` (``d_star`` read from the broadcast
    ``d_star_sc``) are accumulated. The boundary is the last position whose
    running sum is still strictly below the global ``threshold``.

    Returns
    -------
    int
        ``p - 1`` where ``p`` is the first position at which the running sum
        reaches ``threshold`` (so ``-1`` when the first term already does);
        the last position when the threshold is never reached; ``0`` for an
        empty ``term_order``.
    """
    max_weights = d_star_sc.value
    contributions = [sparse_repr[term] * max_weights[term] for term in term_order]

    running_total = 0
    boundary = 0
    for position, contribution in enumerate(contributions):
        if running_total + contribution >= threshold:
            # This term would push the bound to the threshold: stop one position earlier.
            return position - 1
        running_total += contribution
        boundary = position

    return boundary


def my_map(elem):
    """Map step: emit one record per term located after the document's boundary.

    Parameters
    ----------
    elem : tuple
        ``(doc_id, (sparse_repr, sorted_index))`` where ``sorted_index`` lists
        the document's term indices in the order used to compute the boundary.

    Returns
    -------
    list
        ``(term_index, (doc_id, sparse_repr))`` for every term whose position
        in ``sorted_index`` is greater than ``b_d(sparse_repr, sorted_index)``.
    """
    doc_id=elem[0]
    sparse_repr=elem[1][0]
    sorted_index=elem[1][1]

    # The boundary depends only on the document, so compute it once
    # instead of once per term (the original re-ran b_d on every iteration).
    boundary=b_d(sparse_repr, sorted_index)

    return [
        (t_idx, (doc_id, sparse_repr))
        for i, t_idx in enumerate(sorted_index)
        if i>boundary
    ]

# Map phase: flatMap concatenates the per-document lists returned by my_map
# into a single RDD of (term_index, (doc_id, vector)) records.
rdd_forReduce=rdd_forMap.flatMap(my_map)
#iter=map(my_map, rdd_forMap)

In [17]:
#for i in iter:
#    print()

In [18]:
# Run the map phase and bring its output back to the driver as a Python list.
tmp=rdd_forReduce.collect()

In [19]:
# Display the collected (term_index, (doc_id, vector)) records.
tmp

[(6,
  (100,
   SparseVector(7, {0: 0.4472, 2: 0.4472, 3: 0.4472, 4: 0.4472, 6: 0.4472}))),
 (4,
  (100,
   SparseVector(7, {0: 0.4472, 2: 0.4472, 3: 0.4472, 4: 0.4472, 6: 0.4472}))),
 (3,
  (101,
   SparseVector(7, {0: 0.3343, 1: 0.4698, 2: 0.3343, 3: 0.3343, 4: 0.3343, 5: 0.4698, 6: 0.3343}))),
 (0,
  (101,
   SparseVector(7, {0: 0.3343, 1: 0.4698, 2: 0.3343, 3: 0.3343, 4: 0.3343, 5: 0.4698, 6: 0.3343}))),
 (6,
  (101,
   SparseVector(7, {0: 0.3343, 1: 0.4698, 2: 0.3343, 3: 0.3343, 4: 0.3343, 5: 0.4698, 6: 0.3343}))),
 (4,
  (101,
   SparseVector(7, {0: 0.3343, 1: 0.4698, 2: 0.3343, 3: 0.3343, 4: 0.3343, 5: 0.4698, 6: 0.3343})))]

In [20]:
# Display the term -> column-index mapping to interpret the term indices above.
vocab

{'hi': 2, 'how': 3, 'are': 0, 'you': 6, 'john': 4, 'my': 5, 'bro': 1}

In [21]:
def max_of_intersection(list1, list2, default=0):
    """Return the largest element common to two ascending-sorted sequences.

    Both inputs must be sorted in ascending order (as the ``indices`` of a
    Spark ``SparseVector`` are). The sequences are scanned from their tails,
    so the first match found is the largest common element and the search
    stops there.

    Parameters
    ----------
    list1, list2 : indexable sequences sorted ascending
    default : value returned when the sequences share no element.
        Defaults to ``0`` for backward compatibility; pass e.g. ``-1`` to
        tell "no common element" apart from "largest common element is 0".

    Returns
    -------
    The largest common element, or ``default`` if there is none.
    """
    i = len(list1) - 1
    j = len(list2) - 1

    while i >= 0 and j >= 0:
        elem1 = list1[i]
        elem2 = list2[j]

        if elem1 == elem2:
            return elem1
        if elem1 > elem2:
            # elem1 exceeds everything left in list2, so it cannot be shared.
            i -= 1
        else:
            j -= 1

    return default

def my_reduce(elem):
    """Reduce step: find the similar document pairs within one term group.

    A pair is evaluated only in the group whose term index equals the largest
    term index shared by the two documents, so a pair is scored in at most
    one group.

    NOTE(review): this rule finds a pair only if both documents were emitted
    under that largest shared term. Whether the map step guarantees that
    depends on the term ordering it uses — verify.

    Parameters
    ----------
    elem : tuple
        ``(term_index, docs)`` where ``docs`` is an iterable of
        ``(doc_id, vector)`` and each vector exposes ``indices``, ``dot`` and
        ``norm`` (e.g. a Spark ``SparseVector``).

    Returns
    -------
    list
        ``(id1, id2, sim)`` for every pair whose cosine similarity is at least
        the global ``threshold``; ``id1`` is the document that appears first
        in ``docs``.
    """
    from itertools import combinations

    term_index = elem[0]
    result=[]

    # combinations() visits each unordered pair exactly once, in the same order
    # in which the original nested loops first met it, so no "already seen"
    # bookkeeping is needed and no pair is scored twice.
    for (id1, d1), (id2, d2) in combinations(elem[1], 2):
        if id1==id2:
            continue
        if term_index!=max_of_intersection(d1.indices, d2.indices):
            continue

        sim=d1.dot(d2)/(d1.norm(2)*d2.norm(2))
        if sim>=threshold:
            result.append((id1, id2, sim))
    return result
                
from collections import OrderedDict

def reduce_by_key(ls):
    """Group ``(key, value)`` pairs by key, keeping first-seen key order.

    Returns a list of ``(key, values)`` tuples where ``values`` is the list of
    all values seen for that key, in input order.
    """
    grouped = OrderedDict()
    for key, value in ls:
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(value)
    return [(key, values) for key, values in grouped.items()]
# Group the collected map output by term index on the driver.
# NOTE(review): the name `iter` shadows the built-in iter() for the rest of the session.
iter=reduce_by_key(tmp)

In [22]:
# Lazily apply the reduce step to every (term_index, docs) group;
# `iter` is the grouped list built earlier, not the built-in function.
res=map(my_reduce, iter)

In [23]:
# Materialise the lazy reduce results: one list of (id1, id2, sim) tuples per term group.
lista = list(res)


In [24]:
# Display the similar pairs found for each term group.
lista

[[(100, 101, 0.7474073540060464)], [], [], []]

In [25]:
# NOTE(review): `a` is not defined anywhere in the visible notebook, so this cell raises
# NameError (see its recorded output) and halts a "Run All" before the final spark.stop() cell.
# Presumably a deliberate stop marker — confirm, or delete the cell.
a

NameError: name 'a' is not defined

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()