In [1]:
import os
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.linalg import SparseVector
from scipy.sparse import load_npz
import utils


# Python interpreters used by the Spark workers and by the driver.
# Raw strings keep the Windows backslashes literal: sequences such as "\P" or
# "\m" are invalid escape sequences in a normal string literal and trigger a
# warning on recent Python versions. The resulting values are unchanged.
# NOTE(review): these absolute paths are machine-specific — adjust per environment.
os.environ['PYSPARK_PYTHON'] = r'C:\ProgramData\mambaforge\envs\ML-base\python.exe'
os.environ['PYSPARK_DRIVER_PYTHON'] = r'C:\ProgramData\mambaforge\envs\ML-base\Scripts\ipython.exe'

# Create (or reuse) the Spark session and keep a handle on its SparkContext.
spark = SparkSession.builder.appName("MyApp").getOrCreate()
sc = spark.sparkContext
spark

In [2]:
#df_id=pd.read_parquet("ids_nfcorpus.parquet")
# Similarity cut-off: used by b_d as the pruning bound on the cumulative
# weight and by my_reduce as the minimum similarity for a pair to be emitted.
threshold=0.5

In [3]:
# Three-document toy corpus: one row per document, with its text and its id.
toy_df = pd.DataFrame(
    [
        ["Hi how are you John?", 100],
        ["Hi how are you John my bro?", 101],
        ["Bro, what girls?", 102],
    ],
    columns=["text", "id"],
)

In [4]:
# Display the toy corpus.
toy_df

Unnamed: 0,text,id
0,Hi how are you John?,100
1,Hi how are you John my bro?,101
2,"Bro, what girls?",102


In [5]:
# Reference result computed with the project's `utils` helpers; the pairs shown
# here serve as the baseline for the Spark pipeline below.
# NOTE(review): the last argument (0.0) is presumably the similarity threshold,
# so that every pair is returned — confirm against utils.compute_cosine_similarity.
sparse_repr, _=utils.compute_sparse_repr(toy_df["text"])
pairs, num_of_pairs=utils.compute_cosine_similarity(sparse_repr, toy_df["id"], 0.0)
pairs

[[1.     0.8043 0.    ]
 [0.8043 1.     0.1704]
 [0.     0.1704 1.    ]]


[(100, 101, 0.8043), (100, 102, 0.0), (101, 102, 0.1704)]

In [6]:
# Second value returned by utils.compute_cosine_similarity (3 for the toy corpus).
num_of_pairs

3

In [7]:
def csr_to_sparse_vector(row):
    """Convert a single-row sparse matrix into a pyspark.ml SparseVector.

    Args:
        row: a sparse row exposing `shape`, `indices` and `data`
            (here, the result of `getrow` on the document-term matrix).

    Returns:
        A SparseVector whose size is the number of columns of `row` and whose
        entries are the (column index, value) pairs stored in `row`.
    """
    dimension = row.shape[1]
    entries = list(zip(row.indices, row.data))
    return SparseVector(dimension, entries)

# One Spark SparseVector per row of the document-term matrix, in row order.
docs_sparse_forSpark = [
    csr_to_sparse_vector(sparse_repr.getrow(row_idx))
    for row_idx in range(sparse_repr.shape[0])
]
# Document ids with a fresh 0..n-1 index, so doc_ids[i] can be looked up by row position.
doc_ids = toy_df["id"].reset_index(drop=True)

In [8]:
# Inspect the Spark vector built for the second document (row 1).
docs_sparse_forSpark[1]

SparseVector(9, {0: 0.3597, 1: 0.3597, 3: 0.3597, 4: 0.3597, 5: 0.3597, 6: 0.473, 8: 0.3597})

In [9]:
# Raw storage of row 1: column indices and the matching values, in storage
# order (the indices printed below are not sorted by column).
print(sparse_repr[1].indices)
print(sparse_repr[1].data)

[1 6 5 8 0 4 3]
[0.35970039 0.47296278 0.35970039 0.35970039 0.35970039 0.35970039
 0.35970039]


In [10]:
# Weight of term 6 in document row 1.
sparse_repr[1,6]

0.47296278161597527

In [11]:
# Dense view of document row 1, for comparison with the sparse storage above.
sparse_repr[1].toarray()

array([[0.35970039, 0.35970039, 0.        , 0.35970039, 0.35970039,
        0.35970039, 0.47296278, 0.        , 0.35970039]])

In [12]:
def sparse_argsort(matrix, idx):
    """Return the column indices stored in row `idx`, ordered by decreasing value.

    Only the entries explicitly stored in the sparse row are considered.

    Args:
        matrix: sparse matrix supporting `getrow`.
        idx: row number.

    Returns:
        A plain Python list of column indices (empty when the row stores nothing).
    """
    doc_row = matrix.getrow(idx)
    # argsort is ascending; reversing it yields the largest weights first.
    ascending = np.argsort(doc_row.data)
    descending = ascending[::-1]
    return doc_row.indices[descending].tolist()
# For every document row, its term indices from heaviest to lightest weight.
sorted_index_term_doc = [
    sparse_argsort(sparse_repr, row_idx)
    for row_idx in range(sparse_repr.shape[0])
]

In [13]:
# Per-document term orderings (one list per document row).
sorted_index_term_doc

[[3, 4, 0, 8, 5], [6, 3, 4, 0, 8, 5, 1], [7, 2, 1]]

In [14]:
# d_star[t] is the largest weight that term t takes in any document.
d_star = (
    sparse_repr
    .max(axis=0)    # maximum over the rows, one value per column
    .toarray()
    .reshape(-1)    # flatten to a 1-D array
)

In [15]:
# Dense view of the whole document-term matrix (small enough for the toy corpus).
sparse_repr.toarray()

array([[0.4472136 , 0.        , 0.        , 0.4472136 , 0.4472136 ,
        0.4472136 , 0.        , 0.        , 0.4472136 ],
       [0.35970039, 0.35970039, 0.        , 0.35970039, 0.35970039,
        0.35970039, 0.47296278, 0.        , 0.35970039],
       [0.        , 0.4736296 , 0.62276601, 0.        , 0.        ,
        0.        , 0.        , 0.62276601, 0.        ]])

In [16]:
# Weights of term 8 across all documents (a single column of the matrix).
sparse_repr[:,8].toarray()

array([[0.4472136 ],
       [0.35970039],
       [0.        ]])

In [17]:
# Per-term maximum weight over all documents.
d_star
# For reference, document row 0 has weight 0.4472 on terms {0, 3, 4, 5, 8}
# and its term ordering in sorted_index_term_doc is [3, 4, 0, 8, 5].

array([0.4472136 , 0.4736296 , 0.62276601, 0.4472136 , 0.4472136 ,
       0.4472136 , 0.47296278, 0.62276601, 0.4472136 ])

In [18]:
# Broadcast the per-term maxima so that b_d can read them through d_star_sc.value.
d_star_sc=sc.broadcast(d_star)
# One record per document: (doc_id, (SparseVector, term indices ordered by decreasing weight)).
rdd_forMap=sc.parallelize([(doc_ids[i], (docs_sparse_forSpark[i], sorted_index_term_doc[i])) for i in range(sparse_repr.shape[0])])

In [19]:
def b_d(sparse_repr, term_order):
    """Compute the pruning bound of a document along its term ordering.

    Walks `term_order` accumulating weight(term) * d_star(term), where d_star is
    read from the broadcast variable `d_star_sc`, and stops at the first
    position where the running total reaches the global `threshold`.

    Args:
        sparse_repr: the document vector, indexable by term index.
        term_order: list of term indices of the document (in this notebook,
            the decreasing-weight ordering produced by sparse_argsort).

    Returns:
        The position just before the one where the running total first reaches
        `threshold` (so -1 if the very first term is enough); the last position
        if the threshold is never reached; 0 if `term_order` is empty.
    """
    running_total = 0
    for position, term in enumerate(term_order):
        running_total += sparse_repr[term] * d_star_sc.value[term]
        if running_total >= threshold:
            # Terms up to position - 1 stay below the threshold.
            return position - 1
    # Threshold never reached: the bound is the last position (0 for an empty list).
    return max(len(term_order) - 1, 0)


def my_map(elem):
    """Map step: emit one (term, (doc_id, vector)) record per non-pruned term.

    Args:
        elem: a record shaped as (doc_id, (vector, ordered_term_indices)).

    Returns:
        A list of (term_index, (doc_id, vector)) tuples, one for each term
        located after the bound returned by b_d in the document's term ordering.
    """
    doc_id = elem[0]
    vector, ordered_terms = elem[1][0], elem[1][1]

    # Everything up to and including the bound is pruned.
    first_kept = b_d(vector, ordered_terms) + 1
    return [(term, (doc_id, vector)) for term in ordered_terms[first_kept:]]

# Flatten the per-document lists returned by my_map into a single RDD of
# (term_index, (doc_id, vector)) records.
rdd_forReduce=rdd_forMap.flatMap(my_map)


In [20]:
""" from itertools import chain

def flatmap(func, iterable):
    return list(chain.from_iterable(map(func, iterable)))

iter=flatmap(my_map, rdd_forMap) """

' from itertools import chain\n\ndef flatmap(func, iterable):\n    return list(chain.from_iterable(map(func, iterable)))\n\niter=flatmap(my_map, rdd_forMap) '

In [21]:
#iter

In [22]:
# Sanity check: dot product between the first two document vectors; rounded to
# four decimals it matches the (100, 101) similarity of the reference pairs.
docs_sparse_forSpark[0].dot(docs_sparse_forSpark[1])

0.804314515185695

In [23]:
def max_of_intersection(list1, list2):
    """Return the largest element shared by two sequences.

    Uses a two-pointer merge walk, which finds every common element only when
    both sequences are sorted in ascending order.

    Returns:
        The largest common element found, or 0 when none is found. A result of
        0 is therefore ambiguous: it is also returned when 0 is the only
        common element.
    """
    largest = 0
    pos1, pos2 = 0, 0
    len1, len2 = len(list1), len(list2)

    while pos1 < len1 and pos2 < len2:
        left, right = list1[pos1], list2[pos2]

        if left == right:
            # Common element: remember it if it beats the current best.
            if left > largest:
                largest = left
            pos1 += 1
            pos2 += 1
        elif left < right:
            pos1 += 1
        else:
            pos2 += 1

    return largest


def my_reduce(elem):
    """Reduce step: emit the similar document pairs found under one term key.

    Args:
        elem: a (key, values) record, where `key` is a term index and `values`
            is a list of (doc_id, vector) tuples grouped under that term.

    Returns:
        A list of (id1, id2, sim) tuples with sim >= threshold. A pair is only
        evaluated under the key equal to the largest index shared by the two
        vectors, so it is not produced again under other shared keys. Both
        orientations (a, b) and (b, a) are emitted, because the loops visit
        every ordered pair.

    NOTE(review): max_of_intersection is taken over all indices of both
    vectors, not only over the terms emitted by my_map — confirm that the
    largest shared index is always among the emitted keys.
    """
    key = elem[0]
    values = elem[1]
    result = []

    for id1, d1 in values:
        for id2, d2 in values:
            if id1 == id2:
                continue
            if key != max_of_intersection(d1.indices, d2.indices):
                continue
            # The dot product is used directly as the similarity; the original
            # author notes that the vectors are already normalized.
            sim = round(d1.dot(d2), 4)
            if sim >= threshold:
                result.append((id1, id2, sim))

    return result

#result_pairs=rdd_forReduce.groupByKey().flatMap(my_reduce)


# Disabled driver-side emulation of the group-by-key + reduce step, kept inside
# a string literal so it is never executed. It reads `iter`, which is only
# assigned inside the other disabled (string-literal) snippet of this notebook.
""" from itertools import groupby
grouped_data = groupby(iter, key=lambda x: x[0])

for key, group in grouped_data:
    pairs = list(group)
    #print(pairs)
    #print([e[1] for e in pairs])
    print(my_reduce((key, [e[1] for e in pairs]))) """


' from itertools import groupby\ngrouped_data = groupby(iter, key=lambda x: x[0])\n\nfor key, group in grouped_data:\n    pairs = list(group)\n    #print(pairs)\n    #print([e[1] for e in pairs])\n    print(my_reduce((key, [e[1] for e in pairs]))) '

In [24]:
# Run the job: group the emitted records by term, materialize each group as a
# list, apply my_reduce to every group and collect the pairs on the driver.
risultato = (
    rdd_forReduce
    .groupByKey()
    .mapValues(list)
    .flatMap(my_reduce)
    .collect()
)
print(risultato)

[(100, 101, 0.8043), (101, 100, 0.8043)]


In [25]:
# Number of emitted tuples (each similar pair appears in both orientations).
len(risultato)

2

In [26]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()