# Shingling with Jaccard

Comparing document similarities where the set of objects is word or character ngrams taken over a sliding window from the document (shingles). The set of shingles is used to determine the document similarity, Jaccard similarity, between a pair of documents.

In [80]:
from tabulate import tabulate


shingle_size = 5

In [81]:
def shingler(doc, size):
    return [doc[i:i+size] for i in range(len(doc))][:-size]

In [82]:
def jaccard_dist(shingle1, shingle2):
    return len(set(shingle1) & set(shingle2)) / len(set(shingle1) | set(shingle2))

In [83]:
document1 = """An elephant slept in his bunk
              And in slumber his chest rose and sunk
              But he snored how he snored
              All the other beasts roared
              So his wife tied a knot in his trunk"""

document2 = """A large red cow
               Tried to make a bow
               But did not know how
               They say
               For her legs got mixed
               And her horns got fixed
               And her tail would get
               In her way"""

document3 = """An walrus slept in his bunk
              And in slumber his chest rose and sunk
              But he snored how he snored
              All the other beasts roared
              So his wife tied a knot in his whiskers"""

In [84]:
# shingle and discard the last x as these are just the last n<x characters from the document
shingle1 = shingler(document1, shingle_size)
shingle1[0:10]

['An el',
 'n ele',
 ' elep',
 'eleph',
 'lepha',
 'ephan',
 'phant',
 'hant ',
 'ant s',
 'nt sl']

In [85]:
# shingle and discard the last x as these are just the last n<x characters from the document
shingle2 = shingler(document2, shingle_size)
shingles[0:10]

['An wa',
 'n wal',
 ' walr',
 'walru',
 'alrus',
 'lrus ',
 'rus s',
 'us sl',
 's sle',
 ' slep']

In [86]:
# shingle and discard the last x as these are just the last n<x characters from the document
shingle3 = shingler(document3, shingle_size)
shingles[0:10]

['An wa',
 'n wal',
 ' walr',
 'walru',
 'alrus',
 'lrus ',
 'rus s',
 'us sl',
 's sle',
 ' slep']

In [87]:
# Jaccard distance is the size of set intersection divided by the size of set union
print(f"Document 1 and Document 2 Jaccard Distance: {jaccard_dist(shingle1, shingle2)}")

Document 1 and Document 2 Jaccard Distance: 0.047619047619047616


In [88]:
# Jaccard distance is the size of set intersection divided by the size of set union
print(f"Document 1 and Document 3 Jaccard Distance: {jaccard_dist(shingle1, shingle3)}")

Document 1 and Document 3 Jaccard Distance: 0.8181818181818182


In [89]:
# Jaccard distance is the size of set intersection divided by the size of set union
print(f"Document 2 and Document 3 Jaccard Distance: {jaccard_dist(shingle2, shingle3)}")

Document 2 and Document 3 Jaccard Distance: 0.047619047619047616


In [90]:
shingle_sizes = [1,2,3,4,5,6,7,8,9,10,11,12,13,15]
jaccard_list = []
for s in shingle_sizes:
    temp_shingle_1 = shingler(document1, s)
    temp_shingle_2 = shingler(document2, s)
    temp_shingle_3 = shingler(document3, s)
    j1 = jaccard_dist(temp_shingle_1, temp_shingle_2)
    j2 = jaccard_dist(temp_shingle_2, temp_shingle_3)
    j3 = jaccard_dist(temp_shingle_1, temp_shingle_3)
    temp_list = []
    temp_list.append(j1)
    temp_list.append(j2)
    temp_list.append(j3)
    temp_list.append(s)
    jaccard_list.append(temp_list)
    

In [98]:
print("1:2\t\t2:3\t1:3\tShingle Size")
print(tabulate(jaccard_list))

1:2		2:3	1:3	Shingle Size
---------  ---------  --------  --
0.727273   0.727273   1          1
0.300699   0.314685   0.858586   2
0.146018   0.150442   0.816327   3
0.0817844  0.0817844  0.809524   4
0.047619   0.047619   0.818182   5
0.0451613  0.0451613  0.825137   6
0.0434783  0.0433437  0.825397   7
0.042042   0.0419162  0.829897   8
0.0406977  0.0405797  0.834171   9
0.0394366  0.0393258  0.838235  10
0.0383562  0.0382514  0.841346  11
0.0374332  0.0373333  0.843602  12
0.0365535  0.0364583  0.845794  13
0.032419   0.0323383  0.849315  15
---------  ---------  --------  --
