**MinHash LSH (locality-sensitive hashing): enables hashing similar texts into the same hash bucket**

In [5]:
# 11th

# needed to get prob structures needed to process and search large datset fast and accurately
!pip install datasketch



In [7]:
from datasketch import MinHash, MinHashLSH  # minHashLSH: min hash locallity sensitive hashing

from nltk import ngrams
from nltk import word_tokenize

In [10]:
# Small demo corpus: four unrelated sentences.
text_array = [
    "A bird in hand is worth two in the bush.",
    "Good things come to those who hustle.",
    "There are other fish in the sea.",
    "The ball is in your court.",
]

In [11]:
# step 1: tokenize every sentence into a list of word tokens
word_token_array = list(map(word_tokenize, text_array))

# display the token lists (one inner list per sentence)
word_token_array

[['A', 'bird', 'in', 'hand', 'is', 'worth', 'two', 'in', 'the', 'bush', '.'],
 ['Good', 'things', 'come', 'to', 'those', 'who', 'hustle', '.'],
 ['There', 'are', 'other', 'fish', 'in', 'the', 'sea', '.'],
 ['The', 'ball', 'is', 'in', 'your', 'court', '.']]

In [13]:
# step 2: shingling (generate n-grams of words)
# Trigrams are used here; per the original note, n of about 8-10 is a nice
# value for this process in practice.

for index, word_tokens in enumerate(word_token_array):
    word_trigrams = ngrams(word_tokens, 3)  # word-level trigrams of one sentence
    for n_gram in word_trigrams:
        print(index, n_gram)  # sentence index, then the trigram tuple

0 ('A', 'bird', 'in')
0 ('bird', 'in', 'hand')
0 ('in', 'hand', 'is')
0 ('hand', 'is', 'worth')
0 ('is', 'worth', 'two')
0 ('worth', 'two', 'in')
0 ('two', 'in', 'the')
0 ('in', 'the', 'bush')
0 ('the', 'bush', '.')
1 ('Good', 'things', 'come')
1 ('things', 'come', 'to')
1 ('come', 'to', 'those')
1 ('to', 'those', 'who')
1 ('those', 'who', 'hustle')
1 ('who', 'hustle', '.')
2 ('There', 'are', 'other')
2 ('are', 'other', 'fish')
2 ('other', 'fish', 'in')
2 ('fish', 'in', 'the')
2 ('in', 'the', 'sea')
2 ('the', 'sea', '.')
3 ('The', 'ball', 'is')
3 ('ball', 'is', 'in')
3 ('is', 'in', 'your')
3 ('in', 'your', 'court')
3 ('your', 'court', '.')


In [14]:
# step 3: locality sensitive hashing
# threshold: Jaccard-index value the LSH index is built around.
# num_perm:  number of permutations; more permutations increase hashing
#            accuracy, but performance will be slower.
min_hash_lsh = MinHashLSH(
    threshold=0.5,
    num_perm=128,
)

In [17]:
# step 4: compute a MinHash signature for every sentence and index it in the LSH
# NOTE: ngrams() is applied to the raw sentence string here, so the shingles
# are CHARACTER trigrams (not the word trigrams printed in step 2).
min_hashes = {}

for index, text in enumerate(text_array):
    # one MinHash object per sentence; num_perm is the same value (128)
    # that the LSH index was created with
    min_hash = MinHash(num_perm=128)

    for n_gram in ngrams(text, 3):
        shingle = "".join(n_gram)  # tuple of 3 characters -> 3-character string
        min_hash.update(shingle.encode("utf-8"))  # fold this shingle into the signature

    min_hash_lsh.insert(index, min_hash)  # register the sentence under its index
    min_hashes[index] = min_hash  # keep the signature for querying later
        

In [18]:
# display the sentence-index -> MinHash mapping built in the previous cell
min_hashes

{0: <datasketch.minhash.MinHash at 0x1f71a091df0>,
 1: <datasketch.minhash.MinHash at 0x1f71a06d760>,
 2: <datasketch.minhash.MinHash at 0x1f7196ac1f0>,
 3: <datasketch.minhash.MinHash at 0x1f7196b7310>}

In [20]:
# iterate through the MinHash signature of every input sentence
for i, signature in min_hashes.items():
    result = min_hash_lsh.query(signature)  # keys of the candidate similar sentences
    print("Candidate tri gram with jaccard sim index > 0.5 for input", i, ";", result)

    # with this corpus every sentence is reported as similar only to itself

Candidate tri gram with jaccard sim index > 0.5 for input 0 ; [0]
Candidate tri gram with jaccard sim index > 0.5 for input 1 ; [1]
Candidate tri gram with jaccard sim index > 0.5 for input 2 ; [2]
Candidate tri gram with jaccard sim index > 0.5 for input 3 ; [3]


In [21]:
# second corpus: contains pairs of near-duplicate sentences
# (including misspelled words) so that the LSH has something to match
text_array = [
    "A bird in the hand is worth two in the bush.",
    "A bird in hands is worth three in the bushes.",
    "Good things come to those who wait.",
    "Good tpings cxme to those who wait long.",
    "There are other fish in the sea.",
    "The ball is in your court",
]

In [25]:
# The threshold is 0.5, so sentences with a Jaccard-index similarity of 0.5
# or higher are supposed to land in the same hash bucket.
# A fresh index is created so entries from the first corpus are not reused.

min_hash_lsh = MinHashLSH(
    threshold=0.5,
    num_perm=128,
)

In [26]:
# step 4: compute a MinHash signature for every sentence and index it in the LSH
# NOTE: ngrams() is applied to the raw sentence string here, so the shingles
# are CHARACTER trigrams rather than word trigrams.
min_hashes = {}

for index, text in enumerate(text_array):
    # one MinHash object per sentence; num_perm is the same value (128)
    # that the LSH index was created with
    min_hash = MinHash(num_perm=128)

    for n_gram in ngrams(text, 3):
        shingle = "".join(n_gram)  # tuple of 3 characters -> 3-character string
        min_hash.update(shingle.encode("utf-8"))  # fold this shingle into the signature

    min_hash_lsh.insert(index, min_hash)  # register the sentence under its index
    min_hashes[index] = min_hash  # keep the signature for querying later
        

In [27]:
# iterate through the MinHash signature of every input sentence
for i, signature in min_hashes.items():
    result = min_hash_lsh.query(signature)  # keys of the candidate similar sentences
    print("Candidate tri gram with jaccard sim index > 0.5 for input", i, ";", result)

    # e.g. sentence 0 is reported as similar to sentences 0 and 1, and so on

Candidate tri gram with jaccard sim index > 0.5 for input 0 ; [0, 1]
Candidate tri gram with jaccard sim index > 0.5 for input 1 ; [0, 1]
Candidate tri gram with jaccard sim index > 0.5 for input 2 ; [2, 3]
Candidate tri gram with jaccard sim index > 0.5 for input 3 ; [2, 3]
Candidate tri gram with jaccard sim index > 0.5 for input 4 ; [4]
Candidate tri gram with jaccard sim index > 0.5 for input 5 ; [5]
