# Similarity Join

In [23]:
import numpy as np

## Basic

In [24]:
def compute_similarity(document1, document2):
    """Return the dot-product similarity of two sparse documents.

    Each document is an iterable of entries whose item 0 is the term and
    item 1 is its weight. Every pair of entries (one from each document)
    with equal terms contributes the product of the two weights.

    Returns 0 when the documents share no term.
    """
    return sum(
        entry_a[1] * entry_b[1]
        for entry_a in document1
        for entry_b in document2
        if entry_a[0] == entry_b[0]
    )

In [25]:
# Four toy documents, each a list of (term, weight) pairs.
d1 = [("andrew", 6), ("teach", 4), ("logic", 2), ("database", 5)]
d2 = [("peter", 6), ("study", 2), ("python", 6), ("logic", 3)]
d3 = [("andrew", 5), ("teach", 1), ("python", 5)]
d4 = [("peter", 5), ("study", 4), ("like", 2), ("python", 5)]

# Similarity of every unordered pair of documents.
d1d2 = compute_similarity(d1, d2)
d1d3 = compute_similarity(d1, d3)
d1d4 = compute_similarity(d1, d4)
d2d3 = compute_similarity(d2, d3)
d2d4 = compute_similarity(d2, d4)
d3d4 = compute_similarity(d3, d4)

# Report the scores in the same order they were computed.
for pair_label, pair_score in [
    ("d1d2:", d1d2),
    ("d1d3:", d1d3),
    ("d1d4:", d1d4),
    ("d2d3:", d2d3),
    ("d2d4:", d2d4),
    ("d3d4:", d3d4),
]:
    print(pair_label, pair_score)

d1d2: 6
d1d3: 34
d1d4: 0
d2d3: 30
d2d4: 68
d3d4: 25


## Map Reduce

In [26]:
def construct_maximum_vector(documents, dimension_indexes):
    """Return a dict mapping each term to its largest weight in any document.

    ``documents`` is an iterable of documents, each an iterable of entries
    whose item 0 is the term and item 1 is the weight. Keys appear in the
    order the terms are first encountered.

    ``dimension_indexes`` is accepted but not used by this function.
    """
    largest = {}
    for doc in documents:
        for entry in doc:
            term, weight = entry[0], entry[1]
            # Existing value goes first so that ties keep the stored weight.
            largest[term] = max(largest.get(term, weight), weight)
    return largest


def construct_signature(document, dimension_indexes, maximum_vector, threshold):
    """Return the part of ``document`` that survives prefix pruning.

    Terms are visited in the order given by ``dimension_indexes`` (item 0 of
    each element is the term). For every visited term present in the
    document, ``weight * maximum_vector[term]`` is added to a running bound.
    Terms visited before the bound first exceeds ``threshold`` are pruned;
    the signature is every remaining document entry, in document order.

    The document does not have to be stored in dimension order: pruning is
    decided by term, not by position inside ``document``.

    Parameters:
        document: sequence of entries whose item 0 is the term and item 1
            is the weight.
        dimension_indexes: sequence defining the global term order.
        maximum_vector: mapping from term to its maximum weight.
        threshold: the bound that the running sum must strictly exceed.

    Returns:
        A list of the surviving entries, or an empty list when the bound
        never exceeds ``threshold``.

    Raises:
        KeyError: if a term shared by the document and ``dimension_indexes``
            is missing from ``maximum_vector``.
    """
    pruned_terms = set()
    upper_bound = 0
    for dimension in dimension_indexes:
        term = dimension[0]
        for entry in document:
            if entry[0] != term:
                continue
            upper_bound += entry[1] * maximum_vector[term]
            if upper_bound > threshold:
                # Slicing the document at this entry's position would drop
                # entries stored earlier whose terms come later in dimension
                # order, so filter by the pruned terms instead.
                return [kept for kept in document if kept[0] not in pruned_terms]
        pruned_terms.add(term)
    return []


def is_candidate_pair(document1, document2):
    """Return True when the two documents have at least one term in common.

    Only item 0 (the term) of each entry is compared; weights are ignored.
    """
    return any(
        entry_a[0] == entry_b[0]
        for entry_a in document1
        for entry_b in document2
    )

In [27]:
# Global term order used when pruning document prefixes.
dimension_indexes = [("andrew", 1), ("peter", 2), ("teach", 3), ("study", 4), ("like", 5), ("logic", 6), ("database", 7), ("python", 8)]
threshold = 37

documents = [d1, d2, d3, d4]
print("Maximum Vector:")
maximum_vector = construct_maximum_vector(documents, dimension_indexes)
print(maximum_vector)
print()
print("Signatures:")
signatures = [
    construct_signature(document, dimension_indexes, maximum_vector, threshold)
    for document in documents
]
d1_signature, d2_signature, d3_signature, d4_signature = signatures
# Label each signature with its own document number.
for number, signature in enumerate(signatures, start=1):
    print(f"d{number}:", signature)
print()
print("Candidate Pairs:")
# Visit each unordered pair once (i < j) and report pairs whose signatures
# share a term.
for i, doc1 in enumerate(signatures):
    for j, doc2 in enumerate(signatures[i + 1:], start=i + 1):
        if is_candidate_pair(doc1, doc2):
            print(f"(d{i + 1}, d{j + 1})")


Maximum Vector:
{'andrew': 6, 'teach': 4, 'logic': 3, 'database': 5, 'peter': 6, 'study': 4, 'python': 6, 'like': 2}

Signatures:
d1: [('teach', 4), ('logic', 2), ('database', 5)]
d1: [('study', 2), ('python', 6), ('logic', 3)]
d1: [('python', 5)]
d1: [('study', 4), ('like', 2), ('python', 5)]

Candidate Pairs:
(d1, d2)
(d2, d3)
(d2, d4)
(d3, d4)
