In [1]:
from datasketch import MinHash, MinHashLSH

# Small MinHash Example

In [13]:
# Three small token sets for the toy comparison.
# set2 swaps two tokens of set1 ('probabilistic' -> 'probability',
# 'datasets' -> 'documents'); set3 is set2 without the token 'a'.
set1 = {
    'minhash', 'is', 'a', 'probabilistic', 'data', 'structure',
    'for', 'estimating', 'the', 'similarity', 'between', 'datasets',
}
set2 = {
    'minhash', 'is', 'a', 'probability', 'data', 'structure',
    'for', 'estimating', 'the', 'similarity', 'between', 'documents',
}
set3 = {
    'minhash', 'is', 'probability', 'data', 'structure',
    'for', 'estimating', 'the', 'similarity', 'between', 'documents',
}

In [14]:
# Show the first token set. Sets are unordered, so the element order in the
# printed output is arbitrary.
print(set1)

{'between', 'is', 'structure', 'for', 'datasets', 'probabilistic', 'similarity', 'minhash', 'estimating', 'data', 'the', 'a'}


In [15]:
# One independent, empty MinHash sketch per token set, all built with the same
# num_perm.
m1, m2, m3 = (MinHash(num_perm=128) for _ in range(3))

In [16]:
# Feed every token of each set into its matching sketch; MinHash.update takes
# bytes, so each token is UTF-8 encoded first.
for sketch, tokens in ((m1, set1), (m2, set2), (m3, set3)):
    for token in tokens:
        sketch.update(token.encode('utf8'))

In [17]:
# Build an LSH index holding the sketches of set2 and set3 (keys "m2"/"m3"),
# then query it with the sketch of set1.
lsh = MinHashLSH(num_perm=128, threshold=0.5)
for key, sketch in (("m2", m2), ("m3", m3)):
    lsh.insert(key, sketch)
result = lsh.query(m1)
print("Approximate neighbours with Jaccard similarity > 0.5", result)

Approximate neighbours with Jaccard similarity > 0.5 ['m3', 'm2']


# MinHash LSH with Poem Docs

In [19]:
# Poem 1: the elephant limerick. The newlines and the indentation of the
# continuation lines sit inside the triple-quoted literal, so they are part of
# the string value.
document1 = """An elephant slept in his bunk
              And in slumber his chest rose and sunk
              But he snored how he snored
              All the other beasts roared
              So his wife tied a knot in his trunk"""

# Poem 2: a different poem (the cow), with different text from poem 1.
document2 = """A large red cow
               Tried to make a bow
               But did not know how
               They say
               For her legs got mixed
               And her horns got fixed
               And her tail would get
               In her way"""

# Poem 3: same text as document1 except "elephant" -> "walrus" and
# "trunk" -> "whiskers".
document3 = """An walrus slept in his bunk
              And in slumber his chest rose and sunk
              But he snored how he snored
              All the other beasts roared
              So his wife tied a knot in his whiskers"""

# Order matters: later cells index the lists derived from this one by position.
doc_list = [document1, document2, document3]

In [20]:
# Tokenise each poem into its set of distinct words, one set per document and
# in the same order as doc_list.
# str.split() with no argument splits on any run of whitespace, so the newlines
# and indentation embedded in the triple-quoted poems do not leak into the
# tokens (splitting on ' ' yields entries such as 'bunk\n' and '').
doc_set = [set(d.split()) for d in doc_list]

In [25]:
def _sketch_of(tokens):
    """Return a fresh MinHash (num_perm=128) updated with every token in `tokens`."""
    sketch = MinHash(num_perm=128)
    for token in tokens:
        # MinHash.update takes bytes, so encode each token first.
        sketch.update(token.encode('utf8'))
    return sketch


# One sketch per document, in the same order as doc_set.
minhash_list = [_sketch_of(tokens) for tokens in doc_set]

In [29]:
# Index the sketches of the second and third documents under the keys
# "m2"/"m3", then query with the sketch of the first document.
lsh = MinHashLSH(num_perm=128, threshold=0.75)
for key, position in (("m2", 1), ("m3", 2)):
    lsh.insert(key, minhash_list[position])
result = lsh.query(minhash_list[0])
print("Approximate neighbours with Jaccard similarity > 0.75", result)

Approximate neighbours with Jaccard similarity > 0.75 ['m3']
