In [97]:
import os
import numpy as np
import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.ml.linalg import SparseVector
from scipy.sparse import load_npz


os.environ['PYSPARK_PYTHON'] = 'C:\ProgramData\mambaforge\envs\ML-base\python.exe'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'C:\ProgramData\mambaforge\envs\ML-base\Scripts\ipython.exe'

spark = SparkSession.builder.appName("MyApp").getOrCreate()
sc=spark.sparkContext
spark

In [98]:
#df_id=pd.read_parquet("ids_nfcorpus.parquet")
#vocab=pd.read_parquet("terms_nfcorpus.parquet")
threshold=0.6

In [99]:
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
#Sk-learn's "TfidfVectorizer" extension to provide the stemming feature
class StemmedTfidfVectorizer(TfidfVectorizer):
    stemmer = PorterStemmer()
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (StemmedTfidfVectorizer.stemmer.stem(w) for w in analyzer(doc))
    
    
def compute_sparse_repr(corpus: pd.DataFrame):
    #Extract only the word and the numbers, made a lowercase transformation and usage of custom vocabulary to make representations independent
    doc_tfidf=StemmedTfidfVectorizer(lowercase=True, stop_words=None, token_pattern=r'\w+', binary=True)

    #Computation of the sparse embedding
    sparse_doc=doc_tfidf.fit_transform(corpus["text"])
    vocab=doc_tfidf.vocabulary_
    
    return sparse_doc, vocab

In [100]:
toy_df=pd.DataFrame([["Hi how are you John?", 100], ["Hi how are you John my bro?", 101], ["Hi how are you?", 102], ["Bro call me John!", 103]], columns=["text", "id"])

In [101]:
toy_df

Unnamed: 0,text,id
0,Hi how are you John?,100
1,Hi how are you John my bro?,101
2,Hi how are you?,102
3,Bro call me John!,103


In [102]:
sparse_repr, vocab=compute_sparse_repr(toy_df)

In [103]:
cosine_scores=cosine_similarity(sparse_repr)
print(cosine_scores)
np.fill_diagonal(cosine_scores, -1)
num_of_pairs=(cosine_scores>=threshold).sum()/2
print(num_of_pairs)

[[1.         0.74617541 0.89442719 0.1640141 ]
 [0.74617541 1.         0.66739957 0.30910547]
 [0.89442719 0.66739957 1.         0.        ]
 [0.1640141  0.30910547 0.         1.        ]]
3.0


In [104]:
#sparse_repr=load_npz("sparse_repr_nfcorpus.npz")

In [105]:
def csr_to_sparse_vector(row):
    return SparseVector(row.shape[1], list(zip(row.indices, row.data)))

docs = [csr_to_sparse_vector(sparse_repr.getrow(i)) for i in range(sparse_repr.shape[0])]
doc_ids = toy_df["id"] #df_id["_id"]

In [106]:
def sparse_argsort(matrix, idx):
    row=matrix.getrow(idx)
    sorted_indices = np.argsort(row.data)[::-1]
    return row.indices[sorted_indices]
sorted_index_term_doc=[sparse_argsort(sparse_repr, idx) for idx in range(sparse_repr.shape[0])]

In [107]:
sparse_repr.toarray()

array([[0.4472136 , 0.        , 0.        , 0.4472136 , 0.4472136 ,
        0.4472136 , 0.        , 0.        , 0.4472136 ],
       [0.33369979, 0.41218562, 0.        , 0.33369979, 0.33369979,
        0.33369979, 0.        , 0.5228052 , 0.33369979],
       [0.5       , 0.        , 0.        , 0.5       , 0.5       ,
        0.        , 0.        , 0.        , 0.5       ],
       [0.        , 0.4530051 , 0.57457953, 0.        , 0.        ,
        0.36674667, 0.57457953, 0.        , 0.        ]])

In [125]:
vocab

{'hi': 3,
 'how': 4,
 'are': 0,
 'you': 8,
 'john': 5,
 'my': 7,
 'bro': 1,
 'call': 2,
 'me': 6}

In [108]:
sorted_index_term_doc

[array([3, 4, 0, 8, 5]),
 array([7, 1, 3, 4, 0, 8, 5]),
 array([3, 4, 0, 8]),
 array([2, 6, 1, 5])]

In [109]:
#For debugging
#np.max(sparse_repr.getcol(0))
#np.max(sparse_repr, axis=0).toarray()

In [110]:
sparse_repr.toarray()[:, 0]

array([0.4472136 , 0.33369979, 0.5       , 0.        ])

In [111]:
#TODO check if is right
d_star=sparse_repr.max(axis=0).tocsr()
b_d=[]
for i in range(sparse_repr.shape[0]):
    term_order=sorted_index_term_doc[i]
    sparse_repr_tmp=sparse_repr[i, term_order]
    d_star_tmp=d_star[0, term_order]
    mult = sparse_repr_tmp.multiply(d_star_tmp).toarray()
    cum_sum=np.cumsum(mult)
    index = np.argmax(cum_sum[cum_sum < threshold])
    b_d.append(index) #b(d) value or term_order[index]?

    print("sorted_index: ", term_order)
    print("SR: ",sparse_repr_tmp.toarray())
    print("D*: ",d_star_tmp.toarray())
    print("Mult:", mult)
    print("CumSum: ", cum_sum)
    print("index: ", index)
    print()

b_d=np.array(b_d)

sorted_index:  [3 4 0 8 5]
SR:  [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
D*:  [[0.5       0.5       0.5       0.5       0.4472136]]
Mult: [[0.2236068 0.2236068 0.2236068 0.2236068 0.2      ]]
CumSum:  [0.2236068  0.4472136  0.67082039 0.89442719 1.09442719]
index:  1

sorted_index:  [7 1 3 4 0 8 5]
SR:  [[0.5228052  0.41218562 0.33369979 0.33369979 0.33369979 0.33369979
  0.33369979]]
D*:  [[0.5228052 0.4530051 0.5       0.5       0.5       0.5       0.4472136]]
Mult: [[0.27332528 0.18672219 0.16684989 0.16684989 0.16684989 0.16684989
  0.14923508]]
CumSum:  [0.27332528 0.46004746 0.62689736 0.79374725 0.96059714 1.12744704
 1.27668212]
index:  1

sorted_index:  [3 4 0 8]
SR:  [[0.5 0.5 0.5 0.5]]
D*:  [[0.5 0.5 0.5 0.5]]
Mult: [[0.25 0.25 0.25 0.25]]
CumSum:  [0.25 0.5  0.75 1.  ]
index:  1

sorted_index:  [2 6 1 5]
SR:  [[0.57457953 0.57457953 0.4530051  0.36674667]]
D*:  [[0.57457953 0.57457953 0.4530051  0.4472136 ]]
Mult: [[0.33014163 0.33014163 0.20521362 0.1640141 ]]

In [112]:
d_star.toarray()

array([[0.5       , 0.4530051 , 0.57457953, 0.5       , 0.5       ,
        0.4472136 , 0.57457953, 0.5228052 , 0.5       ]])

In [113]:
b_d

array([1, 1, 1, 0], dtype=int64)

In [114]:
rdd_forMap=sc.parallelize([(doc_ids[i], (docs[i], sorted_index_term_doc[i], b_d[i])) for i in range(sparse_repr.shape[0])])

In [115]:
rdd_forMap.first()

(100,
 (SparseVector(9, {0: 0.4472, 3: 0.4472, 4: 0.4472, 5: 0.4472, 8: 0.4472}),
  array([3, 4, 0, 8, 5]),
  1))

Map( (doc_id, (doc, sorted_index_term_doc, b_d)) )
    return (index_term, (doc_id, doc) )

GroupBy: done by Spark

Reduce( (index_term,  list( (doc_id, doc) )) )



In [130]:
def my_map(elem):
    result=[]
    for i, t_idx in enumerate(elem[1][1]):
        if i>elem[1][2]:
            result.append( (t_idx, (elem[0], elem[1][0])) )
    return result
rdd_forReduce=rdd_forMap.flatMap(my_map)

In [131]:
tmp=rdd_forReduce.collect()

In [118]:
tmp

[(0,
  (100,
   SparseVector(9, {0: 0.4472, 3: 0.4472, 4: 0.4472, 5: 0.4472, 8: 0.4472}))),
 (8,
  (100,
   SparseVector(9, {0: 0.4472, 3: 0.4472, 4: 0.4472, 5: 0.4472, 8: 0.4472}))),
 (5,
  (100,
   SparseVector(9, {0: 0.4472, 3: 0.4472, 4: 0.4472, 5: 0.4472, 8: 0.4472}))),
 (3,
  (101,
   SparseVector(9, {0: 0.3337, 1: 0.4122, 3: 0.3337, 4: 0.3337, 5: 0.3337, 7: 0.5228, 8: 0.3337}))),
 (4,
  (101,
   SparseVector(9, {0: 0.3337, 1: 0.4122, 3: 0.3337, 4: 0.3337, 5: 0.3337, 7: 0.5228, 8: 0.3337}))),
 (0,
  (101,
   SparseVector(9, {0: 0.3337, 1: 0.4122, 3: 0.3337, 4: 0.3337, 5: 0.3337, 7: 0.5228, 8: 0.3337}))),
 (8,
  (101,
   SparseVector(9, {0: 0.3337, 1: 0.4122, 3: 0.3337, 4: 0.3337, 5: 0.3337, 7: 0.5228, 8: 0.3337}))),
 (5,
  (101,
   SparseVector(9, {0: 0.3337, 1: 0.4122, 3: 0.3337, 4: 0.3337, 5: 0.3337, 7: 0.5228, 8: 0.3337}))),
 (0, (102, SparseVector(9, {0: 0.5, 3: 0.5, 4: 0.5, 8: 0.5}))),
 (8, (102, SparseVector(9, {0: 0.5, 3: 0.5, 4: 0.5, 8: 0.5}))),
 (6, (103, SparseVector(9,

In [120]:
def max_of_intersection(list1, list2):
    max=0
    i = 0
    j = 0
    while i < len(list1) and j < len(list2):
        elem1=list1[i]
        elem2=list2[j]

        if elem1 == elem2:
            if elem1>max:
                max=elem1
            i += 1
            j += 1
        elif elem1 < elem2:
            i += 1
        else:
            j += 1

    return max

def my_reduce(elem):
    result=[]
    for id1, d1 in elem[1]:
        for id2, d2 in elem[1]:
            if id1!=id2 and elem[0]==max_of_intersection(d1.indices, d2.indices):
                #TODO: review the comparison with the max (they are index!) check this
                sim=d1.dot(d2)/(d1.norm(2)*d2.norm(2))
                if sim>=threshold:
                    result.append((id1, id2, sim))
    return result
                
from collections import OrderedDict

def reduce_by_key(ls):
    d = OrderedDict()
    for key, sublist in ls:
        d.setdefault(key, []).extend([sublist])
    return list(d.items())
iter=reduce_by_key(tmp)

In [121]:
res=map(my_reduce, iter)

In [122]:
lista=[]
for e in res:
    lista.append(e)


In [123]:
lista

[[],
 [(100, 101, 0.7461754056903326),
  (100, 102, 0.8944271909999159),
  (101, 100, 0.7461754056903326),
  (101, 102, 0.6673995721048268),
  (102, 100, 0.8944271909999159),
  (102, 101, 0.6673995721048268)],
 [],
 [],
 [],
 [],
 []]

In [124]:
a

NameError: name 'a' is not defined

In [None]:
spark.stop()