In [None]:
!pip install numpy
!pip install ordered_set

In [1]:
import numpy as np
import os
import hashlib
from ordered_set import OrderedSet

In [3]:
#set of shingles without duplicates, in order of insertion
#generate the same hash values for equivalent shingles each time this function is called
#(using hashlib, generate hexadecimal values)
def Shingling(k, file_path):
    with open(file_path, 'r') as file:
        words = [word for line in file for word in line.strip().split()]
    ordered_shingles = OrderedSet()
    #shingles = []
    for i in range(len(words)-k):
        # need to convert sublist to string in order to use encode:
        shingle = ' '.join(words[i:i+k-1])
        """
        hash_value = hashlib.md5(shingle.encode()).hexdigest() #hash(tuple(shingle))
        ordered_shingles.add(hash_value)
        shingles.append(hash_value)
        """
        hash_value_0 = hashlib.md5(shingle.encode())
        # converting to integer within range (0,2^(32)-1)
        #hash_value = int(hash_value_0.hexdigest(), 16) % (2**32)
        hash_value = int(hash_value_0.hexdigest(), 16) % (2**16)
        ordered_shingles.add(hash_value)
    
    return ordered_shingles

In [4]:

def CompareSets(hashed_shingle_set1, hashed_shingle_set2):
    # Compute the jaccard similarity of the hash values(shingles) in the sets:
    jacc = hashed_shingle_set1.intersection(hashed_shingle_set2)/hashed_shingle_set1.union(hashed_shingle_set2)
    return jacc    

In [5]:
# generate a list of all documents; ordered sets of integers (hashed shingles)
k = 7 # (de använder 9 i boken)
folder_path = 'data/'
hashed_shingle_sets = [] # length (nr od documents): 99, ranging in length between 23 and 4505 unique shingles
#universal_set = [] # length: 40094
for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        ordered_shingles = Shingling(k, file_path)
        hashed_shingle_sets.append(ordered_shingles)
        #universal_set.extend(shingles)
        
        
#universal_set = OrderedSet(universal_set)


In [11]:
# builds minHash signatures from given set of integers (i.e. documents)
# The minhash value of any column is the number of
# the first row, in the permuted order, in which the column has a 1.
def MinHashing(sets_of_integers):
    """
    * Length of signatures, n, can be selected randomly
    * Theoretically: from the column representing set S, construct the minhash signature for S, the
      vector [h1(S), h2(S), . . . , hn(S)]
    * In practise: instead of picking n random permutations of rows, we pick n randomly
      chosen hash functions h1, h2, . . . , hn on the rows.
    * If the number of rows in the characteristic matrix is not a prime number,
      there will be collisions, where two rows get the same hash value.
    * Compression, n = 50 (att gå från 40094 till 1000 känns bättre)
    * The minHash signature can be a set or array
    """
    #enabling minHash generation: reconstruct the list of hashed_shingle_sets into a characteristic matrix:
    #dimensions: (len(universal_set), len(hashed_shingle_sets)) = (2^(32), 99) (2^32=4294967296, 2^16=65536, har 40094 unika shingles)
    #characteristic_matrix = np.zeros((2**32,len(sets_of_integers)), dtype=int)
    characteristic_matrix = np.zeros((2**16,len(sets_of_integers)), dtype=int)
    for i in range(len(sets_of_integers)):
        characteristic_matrix[sets_of_integers[i],i] = 1
        
    #heuristic functions: 
    # ai, bi randomly set integers, p prime number
    #hi = (ai*x+bi) % p
    n=10
    scales = np.random.choice(a=10*n, size=n, replace=False)
    bias = np.random.choice(a=10*n, size=n, replace=False)
    #chose a prime number close to 10*n, n=50: 503, 509, 521
    prime_numb = 509

    #signature matrix:
    SIG = np.full((n, len(sets_of_integers)), np.inf)
    print(SIG[1,1])
    for r in range(len(characteristic_matrix)):
        h_r = np.zeros(n)
        for i in range(n):
            h_r[i] = (scales[i]*r+bias[i])%prime_numb
        
        for c in range(len(sets_of_integers)):
            if characteristic_matrix[r,c] == 1:
                #decide if should change value in the signature matrix:
                for i in range(n):
                    if h_r[i]<SIG[i,c]:
                       SIG[i,c] = h_r[i]
                    
    return SIG

In [12]:
SIG = MinHashing(hashed_shingle_sets)
#print(SIG)

inf
[[ 0.  4.  8.  4.  0.  2.  7.  4.  5.  1.  2.  0.  0.  0.  5.  1.  0.  0.
   1.  0.  0.  0. 18.  0.  3.  4.  0.  0.  6.  1.  0.  1.  8.  1.  0.  9.
   2.  0.  0.  1.  0. 11.  8.  7.  5.  0.  8.  1.  1.  0.  2.  0.  6.  2.
   0. 10.  2.  0.  4.  0.  4.  0.  1.  0.  0.  0.  8.  1.  0.  0.  0.  2.
  10.  4.  4.  0.  0. 26.  3.  0.  0.  0.  2.  0.  3.  3.  0. 10.  8.  0.
  10.  2.  0.  6.  1.  4.  6.  0.  6.]
 [ 0.  2.  0.  5.  3.  0.  5.  2.  1.  4.  1.  8.  0.  1.  1.  5.  0.  3.
   2.  1.  0.  0.  7.  0.  1.  0.  0.  0.  0.  0.  0.  4.  5.  1.  0.  3.
   6.  3.  2.  1.  0. 12.  3.  4.  2.  0.  0.  0. 15. 11.  2.  0.  3.  1.
  32.  4.  2.  0.  2.  1.  8.  4.  2.  2.  3.  1.  5.  1.  1.  6.  1.  8.
  14.  0.  1.  0.  0. 71.  0.  1.  0.  0.  5.  8.  0.  8.  2.  2.  2.  0.
  11. 20.  0.  2.  0.  0.  1.  0.  1.]
 [ 1.  1. 18.  5.  6.  2.  2.  3.  1.  0.  0.  0.  1.  0.  0.  7.  0.  0.
   4.  5.  0.  0. 13.  0.  2. 16.  0.  0.  0.  4.  0.  0.  6.  0.  0.  0.
   3.  5.  3.  1.  2.  2.  7. 

In [19]:
"""
Estimating the Jaccard similarity between two sets of integers:
  * the probability that two columns have the same value in a given row of
    the signature matrix equals the Jaccard similarity of the sets corresponding to
    those columns
  * the expected number of rows in
    which two columns agree equals the Jaccard similarity of their corresponding
    sets
"""
def CompareSignatures(sig_1, sig_2, n):
    similarity = len(np.intersect1d(sig_1, sig_2))/n

In [20]:
print(SIG.shape)
sig_1 = SIG[:,30]
sig_2 = SIG[:,60]
n = 10
similarity = CompareSignatures(sig_1, sig_2, n)
print(f"similarity: {similarity}")

(10, 99)
similarity: None
