In [1]:
from pyspark import SparkContext, SparkConf
from fnvhash import fnv1a_32
from glob import glob
import numpy as np

# Configure pyspark. getOrCreate reuses an already-running context, so this
# cell can be re-executed in the same kernel without failing because a
# SparkContext already exists.
conf = SparkConf().setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=conf)

# Create one RDD per document. glob returns paths in arbitrary order, so sort
# them to keep the document order (and thus the column order of everything
# derived from it) stable between runs.
all_documents = sorted(glob('*.txt'))
rdd_documents = [sc.textFile(document) for document in all_documents]

# Set the value k (the length of the shingles); k-shingles
k = 9

In [2]:
def Shingling(rdd):
    """Return the set of hashed k-shingles found in the elements of ``rdd``.

    Every element of ``rdd`` (a string) is cut into all of its substrings of
    length ``k`` (the module-level shingle length) and each substring is
    hashed with ``fnv1a_32``. Shingles are taken per element, so none spans
    two elements, and an element shorter than ``k`` contributes nothing.
    """
    def hashed_shingles(line):
        # One hash per k-character window of this element.
        return [fnv1a_32(line[start:start + k].encode()) for start in range(len(line) - k + 1)]

    return set(rdd.flatMap(hashed_shingles).distinct().collect())

def CompareSets(setA, setB):
    """Compute the Jaccard similarity of two shingle sets.

    The similarity is ``len(A & B) / len(A | B)``.

    Parameters:
        setA: a set of (hashed) shingles.
        setB: a set (or any iterable) of (hashed) shingles.

    Returns:
        A float in [0, 1]. Two empty sets are treated as identical and give
        1.0 instead of raising ZeroDivisionError.
    """
    union_size = len(setA.union(setB))
    if union_size == 0:
        # Both sets are empty: 0/0 is undefined, use the "identical" convention.
        return 1.0
    return len(setA.intersection(setB)) / union_size


In [3]:
# Create the hashed shingle set of every document (same order as rdd_documents)
shingled_documents = [Shingling(rdd_doc) for rdd_doc in rdd_documents]

# Get the Jaccard similarity of two documents, e.g.:
# CompareSets(shingled_documents[0], shingled_documents[1])

In [4]:
# Create the union of the shingle sets of all documents
union_shingles = set().union(*shingled_documents)
    
def createSimilarityMatrix(document,union):
    """Build the 0/1 membership column of ``document`` over ``union``.

    Returns a list with one entry per shingle of ``union``, in the iteration
    order of ``union``: 1 when the shingle is in ``document``, otherwise 0.
    """
    return [1 if shingle in document else 0 for shingle in union]

# Fix the row order once so the hash column and every membership column are
# built from exactly the same sequence of shingles.
shingle_order = list(union_shingles)
columns = [createSimilarityMatrix(document, shingle_order) for document in shingled_documents]

# Column 0 holds the shingle hashes; column j (j >= 1) is the 0/1 membership
# vector of shingled_documents[j - 1]. Stacking once avoids re-copying the
# whole matrix for every document.
matrix = np.column_stack([shingle_order] + columns)

matrix

array([[1777336321,          0,          1,          1,          0],
       [  10223617,          0,          1,          1,          0],
       [2160066563,          0,          1,          1,          0],
       ...,
       [ 252182522,          1,          0,          0,          0],
       [1053294587,          0,          1,          1,          0],
       [1083703295,          0,          1,          1,          0]])

In [5]:
def MinHash(matrix, k, seed=None):
    """Compute a MinHash signature matrix using random row permutations.

    Parameters
    ----------
    matrix : array-like, shape (n_rows, 1 + n_documents)
        Column 0 is ignored (in this notebook it carries the shingle hashes).
        Every other column belongs to one document; a non-zero entry means
        the shingle of that row occurs in the document.
    k : int
        Number of random permutations, i.e. the length of each signature.
        (This is not the shingle length.)
    seed : int, optional
        Seed for a dedicated random generator, for reproducible signatures.
        When omitted, NumPy's global random state is used.

    Returns
    -------
    numpy.ndarray, shape (k, n_documents)
        Entry [i, j] is the 1-based position of the first row, under the i-th
        permutation, in which document j has a non-zero entry. A document
        without any non-zero entry gets ``n_rows + 1``, a value no real
        position can take, so the result is never ragged.

    The input matrix is not modified.
    """
    matrix = np.asarray(matrix)
    membership = matrix[:, 1:] != 0
    n_rows, n_documents = membership.shape

    # Pre-fill with the sentinel used for documents that contain no shingle.
    signature = np.full((k, n_documents), n_rows + 1, dtype=np.int64)
    non_empty = membership.any(axis=0)
    if not non_empty.any():
        # Nothing to hash (no rows, or only empty documents).
        return signature

    rng = np.random if seed is None else np.random.default_rng(seed)
    for permutation_index in range(k):
        # Permute a copy of the rows instead of shuffling the caller's matrix.
        permuted = membership[rng.permutation(n_rows)]
        # argmax over booleans yields the index of the first True per column.
        first_hit = permuted.argmax(axis=0) + 1
        signature[permutation_index, non_empty] = first_hit[non_empty]
    return signature
# Build signatures from 100 random permutations (one column per document),
# then show the signature of the second document.
signatures = MinHash(matrix=matrix, k=100)
signatures[:, 1]

array([1, 1, 1, 1, 1, 6, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 3, 1, 1, 1,
       1, 1, 4, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 3, 1,
       5, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3,
       4, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 4, 1, 3, 3, 1, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [6]:
def jaccardApproximation(signature1, signature2):
    """Estimate the Jaccard similarity of two documents from their signatures.

    The estimate is the fraction of positions at which the two signatures
    hold the same value.

    Parameters:
        signature1, signature2: 1-D arrays or sequences of equal length.

    Returns:
        A float in [0, 1]; 0.0 when both signatures are empty (no evidence of
        similarity) instead of dividing by zero.

    Raises:
        ValueError: if the signatures do not have the same shape.
    """
    first = np.asarray(signature1)
    second = np.asarray(signature2)
    if first.shape != second.shape:
        raise ValueError(
            "signatures must have the same shape, got %s and %s" % (first.shape, second.shape)
        )
    if first.size == 0:
        return 0.0
    return np.sum(first == second) / first.shape[0]

In [7]:
# Preview the first five rows of the signature matrix.
signatures[:5]

array([[5, 1, 1, 9],
       [3, 1, 1, 3],
       [6, 1, 1, 6],
       [6, 1, 1, 6],
       [2, 1, 1, 2]])

In [8]:
def compareSignatures(signatures, threshold):
    """Print every pair of documents whose estimated similarity exceeds ``threshold``.

    ``signatures`` holds one column per document. Each unordered pair of
    columns is compared once with ``jaccardApproximation``; pairs scoring
    strictly above ``threshold`` are reported using the file names found at
    the same positions in the module-level ``all_documents`` list.
    """
    document_count = signatures.shape[1]
    for first in range(document_count):
        for second in range(first + 1, document_count):
            similarity = jaccardApproximation(signatures[:, first], signatures[:, second])
            if not similarity > threshold:
                continue
            print("Found two similar documents:", all_documents[first], all_documents[second],
                  "Similarity:", similarity)
# Report every document pair whose estimated similarity is above 0.8.
compareSignatures(signatures, threshold=0.8)

Found two similar documents: example2.txt example2(kopia).txt Similarity: 0.97
