In [2]:
%config IPCompleter.greedy=True

In [3]:
import numpy as np
from string import punctuation
import random

## Exercise 18.2

In [4]:
shingle_size = 3

In [5]:
def process_words(words):
    filtered_words = [word.lower().translate(str.maketrans('','', punctuation)) for word in words]
    return filtered_words

In [6]:
with open('../data/chapter1.txt') as f:
    data1 = [word for line in f for word in line.split()]
    data1 = process_words(data1)

In [7]:
with open('../data/chapter2.txt') as f:
    data2 = [word for line in f for word in line.split()]
    data2 = process_words(data2)

In [8]:
def get_shingles(text):
    shingles = []

    for i in range(len(text)):
        shingle = []

        if i + shingle_size >= len(text):
            break

        for j in range(i, i + shingle_size):
            shingle.append(text[j])

        shingles.append(' '.join(shingle))
        
    return shingles

In [9]:
shingles1 = get_shingles(data1)
shingles2 = get_shingles(data2)

In [10]:
s1 = set(shingles1)
s2 = set(shingles2)

jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))

In [11]:
print(jaccard)

0.0033644859813084112


## Exercise 19

In [12]:
class MinHash:
    def __init__(self, k, seed=10):
        self._k = k
        self._seed = seed
        
        min_int = np.iinfo(np.int64).min
        max_int = np.iinfo(np.int64).max
        
        self._masks = np.random.RandomState(seed=self._seed).randint(min_int, max_int, self._k)
        
        self._hashes = np.empty(self._k, dtype=np.int64)
        self._hashes.fill(max_int)
        
    def add(self, v):
        hashes = np.bitwise_xor(self._masks, hash(v))
        self._hashes = np.minimum(self._hashes, hashes)
        
    def jaccard(self, other):
        if np.any(self._masks != other._masks):
            raise Exception('Can only calculate similarity between min-hashes with the same hash functions!')
            
        return (self._hashes == other._hashes).sum() / float(self._k)

In [13]:
a = [2, 1, 3, 7]
b = [6, 9, 4, 2]

# number of hashesh
ks = [50, 100, 250]

In [14]:
def compare(left, right, k):
    left_min_hash = MinHash(k)
    right_min_hash = MinHash(k)
    
    for l in left:
        left_min_hash.add(l)
        
    for r in right:
        right_min_hash.add(r)
    
    return left_min_hash.jaccard(right_min_hash)

In [15]:
for k in ks:
    jaccard = compare(a, b, k)
    print(jaccard)

0.04
0.05
0.048


In [16]:
# bigger sets
big_a = []
big_b = []

for i in range(5000):
    big_a.append(random.randint(0, 10000))
    big_b.append(random.randint(0, 10000))

In [17]:
for k in ks:
    jaccard = compare(big_a, big_b, k)
    print(jaccard)

0.3
0.3
0.244


In [19]:
for k in ks:
    jaccard = compare(data1, data2, k)
    print(jaccard)

0.16
0.13
0.12
