# Shingling with Jaccard

Comparing document similarities where the set of objects is word or character ngrams taken over a sliding window from the document (shingles). The set of shingles is used to determine the document similarity, Jaccard similarity, between a pair of documents.

In [None]:
shingle_size = 10

In [50]:
def shingler(doc, size):
    return [doc[i:i+size] for i in range(len(doc))][:-size]

In [51]:
def jaccard_dist(shingle1, shingle2):
    return len(set(shingle1) & set(shingle2)) / len(set(shingle1) | set(shingle2))

In [52]:
document1 = """An elephant slept in his bunk,
              And in slumber his chest rose and sunk.
              But he snored how he snored!
              All the other beasts roared,
              So his wife tied a knot in his trunk."""

document2 = """A large red cow
               Tried to make a bow,
               But did not know how,
               They say.
               For her legs got mixed,
               And her horns got fixed,
               And her tail would get
               In her way."""

document3 = """An walrus slept in his bunk,
              And in slumber his chest rose and sunk.
              But he snored how he snored!
              All the other beasts roared,
              So his wife tied a knot in his whiskers."""

In [53]:
# shingle and discard the last x as these are just the last n<x characters from the document
shingle1 = shingler(document1, shingle_size)
shingle1[0:10]

['An elephan',
 'n elephant',
 ' elephant ',
 'elephant s',
 'lephant sl',
 'ephant sle',
 'phant slep',
 'hant slept',
 'ant slept ',
 'nt slept i']

In [54]:
# shingle and discard the last x as these are just the last n<x characters from the document
shingle2 = shingler(document2, shingle_size)
shingles[0:10]

['An wa',
 'n wal',
 ' walr',
 'walru',
 'alrus',
 'lrus ',
 'rus s',
 'us sl',
 's sle',
 ' slep']

In [55]:
# shingle and discard the last x as these are just the last n<x characters from the document
shingle3 = shingler(document3, shingle_size)
shingles[0:10]

['An wa',
 'n wal',
 ' walr',
 'walru',
 'alrus',
 'lrus ',
 'rus s',
 'us sl',
 's sle',
 ' slep']

In [56]:
# Jaccard distance is the size of set intersection divided by the size of set union
print(f"Document 1 and Document 2 Jaccard Distance: {jaccard_dist(shingle1, shingle2)}")

Document 1 and Document 2 Jaccard Distance: 0.03943661971830986


In [57]:
# Jaccard distance is the size of set intersection divided by the size of set union
print(f"Document 1 and Document 3 Jaccard Distance: {jaccard_dist(shingle1, shingle3)}")

Document 1 and Document 3 Jaccard Distance: 0.8382352941176471


In [58]:
# Jaccard distance is the size of set intersection divided by the size of set union
print(f"Document 2 and Document 3 Jaccard Distance: {jaccard_dist(shingle2, shingle3)}")

Document 2 and Document 3 Jaccard Distance: 0.03932584269662921


In [65]:
shingle_sizes = [1,2,3,4,5,6,7,8,9,10,11,12,13,15]
jaccard_list = []
for s in shingle_sizes:
    temp_shingle_1 = shingler(document1, s)
    temp_shingle_2 = shingler(document2, s)
    temp_shingle_3 = shingler(document3, s)
    j1 = jaccard_dist(temp_shingle_1, temp_shingle_2)
    j2 = jaccard_dist(temp_shingle_2, temp_shingle_3)
    j3 = jaccard_dist(temp_shingle_1, temp_shingle_3)
    temp_list = []
    temp_list.append(j1)
    temp_list.append(j2)
    temp_list.append(j3)
    temp_list.append(s)
    jaccard_list.append(temp_list)
    

In [66]:
for e in jaccard_list:
    print(e)

[0.7272727272727273, 0.7272727272727273, 1.0, 1]
[0.3006993006993007, 0.3146853146853147, 0.8585858585858586, 2]
[0.14601769911504425, 0.1504424778761062, 0.8163265306122449, 3]
[0.08178438661710037, 0.08178438661710037, 0.8095238095238095, 4]
[0.047619047619047616, 0.047619047619047616, 0.8181818181818182, 5]
[0.04516129032258064, 0.04516129032258064, 0.825136612021858, 6]
[0.043478260869565216, 0.043343653250773995, 0.8253968253968254, 7]
[0.042042042042042045, 0.041916167664670656, 0.8298969072164949, 8]
[0.040697674418604654, 0.04057971014492753, 0.8341708542713567, 9]
[0.03943661971830986, 0.03932584269662921, 0.8382352941176471, 10]
[0.038356164383561646, 0.03825136612021858, 0.8413461538461539, 11]
[0.0374331550802139, 0.037333333333333336, 0.8436018957345972, 12]
[0.03655352480417755, 0.036458333333333336, 0.8457943925233645, 13]
[0.032418952618453865, 0.03233830845771144, 0.8493150684931506, 15]
