### Task: Implement Minhash algoritm

In [1]:
!pip install kshingle
import glob
import re
import kshingle as ks
import numpy as np
import pandas as pd
from google.colab import drive
from functools import partial
from random import randint
from tqdm import tqdm



In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!cp '/content/drive/MyDrive/gutenberg.zip' .

In [4]:
!unzip -qq gutenberg.zip

MinHash is algorithm used for finding similarity between large documents (or large number of documents). It's idea is to encode similarity into signatures and then compute similarity of these signatures. We first create incidence matrix (shingles x documents) and we find first non-zero value, in order of permutation. We will get signature matrix of size (number of permutations x documents). To test MinHash, we will use documents from Gutenberg Project.

In [6]:
def truncate_text(text, start, end): #cut unnecessary parts that will artificially boost similarity of documents
  start_index = re.search(start, text).end()
  end_index = re.search(end, text).start()
  extracted_text = text[start_index:end_index]
  return extracted_text

start = "START OF (THE|THIS) PROJECT GUTENBERG EBOOK"
end = "END OF (THE|THIS) PROJECT GUTENBERG EBOOK"

In [7]:
def jaccard_index(A, B):
  return len(A.intersection(B))/len(A.union(B))

In [8]:
#book function extracts neccesary data from books
def book(directory):
    book_shinglets = {} #shinglets in each book
    titles = [] #titles of books
    shinglets = set() #all shinglets from all books
    book_data = [] #list of dictionaries shinglet:1

    for f in glob.glob(f"{directory}/*.txt"):
        titles.append(f)
        data = open(f, encoding='utf-8').read()
        trunc_data = truncate_text(data, start, end)
        book_shinglets[f] = set(ks.shingleseqs_range(trunc_data, 10, 10)[0]) #shinglets of length 10
        shinglets = shinglets.union(book_shinglets[f])

        s = {shinglet:1 for shinglet in book_shinglets[f]}
        book_data.append(s)

    return book_shinglets, titles, shinglets, book_data

In [9]:
book_shinglets, titles, shinglets, book_data = book('gutenberg')

In [14]:
#incidence matrix - might be more efficient with bitmap
incidence = pd.DataFrame(book_data).T
incidence.fillna(0, inplace=True)
incidence = incidence.reset_index()
incidence

Unnamed: 0,index,0,1,2,3,4,5,6
0,me\nlike yo,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,winter we,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,\nher terro,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ill all go,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,engines--’,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2215490,til and th,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2215491,ic arms on,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2215492,of\nhis kn,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2215493,0: See the,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [15]:
def hash_abp(x, a, b, p ,N):
  return ((a*x+b) % p) % N

In [19]:
#initialize 50 hash functions - these will be our permutations
H = []
for i in range(50):
  h = partial(hash_abp, a = randint(1, 1000), b = randint(0, 1000), p = 14161729, N = len(shinglets)) # set p to be large prime number, preferably p >> N
  H.append(h)

In [20]:
signatures = np.array([[len(shinglets)+1 for _ in range(len(titles))] for _ in range(len(H))])
#for min to work properly, we set inital values of signatures to "infinity" - here "infinity" means more than number of shinglets

In [21]:
for i, h in tqdm(enumerate(H)):
  perm = incidence.index.map(h)
  for j in range(len(titles)):
    signatures[i, j] = perm[incidence[j] == 1].min()

50it [01:47,  2.15s/it]


In [22]:
signatures #we get 50x7 signature matrix - now we can compute similarities on documents of length 50

array([[  2,   1,   0,  19,  19,   0,   0],
       [  6,   1,   1,   5,  27,   0,   0],
       [  6,   1,   0,   2,   3,   0,   0],
       [ 13,   1,   0,  19,   1,   0,   0],
       [ 19,   1,   0,  36,   3,   1,   1],
       [ 16,   4,   0,  16,   7,   1,   1],
       [ 10,   3,   0,  41,   4,   0,   0],
       [  6,   0,   3,   9,  11,   1,   1],
       [128,   1,   8,   6,   2,   0,   0],
       [  0,   1,   3,  18,  87,   1,   1],
       [ 10,   2,   3,   4,   7,   0,   0],
       [  6,   1,   3,  14,   8,   1,   1],
       [  3,   0,   0,  10,  14,   0,   0],
       [ 18,   2,   2,  35,  36,   1,   1],
       [  4,  10,   0,  55,   6,   1,   1],
       [ 51,   3,   0, 102,  10,   2,   2],
       [ 11,   0,   3,  23,  11,   0,   0],
       [  8,   0,   1,  21,  13,   0,   0],
       [  0,   7,   0,  47,  11,   3,   3],
       [ 17,   1,   1,  26,   4,   0,   0],
       [  5,   8,   2,   5,   1,   3,   3],
       [ 13,   4,   0,   0,  20,   0,   0],
       [  5,   2,   1,  13,   2,

In [23]:
def jaccard_signatures(doc1_idx, doc2_idx):
  #return [signatures[:, doc1_idx]-signatures[:, doc2_idx] == 0]
  return np.mean([signatures[:, doc1_idx]-signatures[:, doc2_idx] == 0])

In [24]:
sig_sim = np.zeros((len(titles), len(titles)))
for i in range(len(titles)):
  for j in range(i+1, len(titles)):
    sig_sim[i,j] = jaccard_signatures(i, j)
    sig_sim[j,i] = jaccard_signatures(i, j)
  sig_sim[i,i] = jaccard_signatures(i, i)
sig_sim = pd.DataFrame(sig_sim)
sig_sim.index = titles
sig_sim.columns = titles
sig_sim

Unnamed: 0,gutenberg/12-0.txt,gutenberg/pg22764.txt,gutenberg/56734-0.txt,gutenberg/pg35688.txt,gutenberg/pg5001.txt,gutenberg/pg732.txt,gutenberg/pg731.txt
gutenberg/12-0.txt,1.0,0.06,0.04,0.1,0.04,0.02,0.02
gutenberg/pg22764.txt,0.06,1.0,0.2,0.06,0.08,0.18,0.18
gutenberg/56734-0.txt,0.04,0.2,1.0,0.04,0.08,0.28,0.28
gutenberg/pg35688.txt,0.1,0.06,0.04,1.0,0.04,0.06,0.06
gutenberg/pg5001.txt,0.04,0.08,0.08,0.04,1.0,0.02,0.02
gutenberg/pg732.txt,0.02,0.18,0.28,0.06,0.02,1.0,1.0
gutenberg/pg731.txt,0.02,0.18,0.28,0.06,0.02,1.0,1.0


In [25]:
#we will compute true similarities to see how well MinHash works
jac_sim = np.zeros((len(titles), len(titles)))
for i in range(len(titles)):
  for j in range(i+1, len(titles)):
    jac_sim[i,j] = jaccard_index(book_shinglets[titles[i]], book_shinglets[titles[j]])
    jac_sim[j,i] = jaccard_index(book_shinglets[titles[i]], book_shinglets[titles[j]])
  jac_sim[i,i] = jaccard_index(book_shinglets[titles[i]], book_shinglets[titles[i]])
jaccard_sim = pd.DataFrame(jac_sim)
jaccard_sim.index = titles
jaccard_sim.columns = titles
jaccard_sim

Unnamed: 0,gutenberg/12-0.txt,gutenberg/pg22764.txt,gutenberg/56734-0.txt,gutenberg/pg35688.txt,gutenberg/pg5001.txt,gutenberg/pg732.txt,gutenberg/pg731.txt
gutenberg/12-0.txt,1.0,0.015418,0.013061,0.062369,0.014845,0.011694,0.011694
gutenberg/pg22764.txt,0.015418,1.0,0.045754,0.008466,0.036997,0.056758,0.056758
gutenberg/56734-0.txt,0.013061,0.045754,1.0,0.006377,0.020157,0.089387,0.089387
gutenberg/pg35688.txt,0.062369,0.008466,0.006377,1.0,0.011391,0.005629,0.005629
gutenberg/pg5001.txt,0.014845,0.036997,0.020157,0.011391,1.0,0.022538,0.022538
gutenberg/pg732.txt,0.011694,0.056758,0.089387,0.005629,0.022538,1.0,1.0
gutenberg/pg731.txt,0.011694,0.056758,0.089387,0.005629,0.022538,1.0,1.0


In [43]:
np.sqrt(((jaccard_sim - sig_sim)**2).values.mean())

0.07764271937014905

RMSE of similarity is around 0.078 - it's not terrible, but considering that the highest similarity (not including when it is $1$) is around 0.09 it's not the best either. The result may be better with different shinglet length or more hash functions.