In [None]:
!pip install numpy
!pip install ordered_set

In [19]:
import numpy as np
import os
import hashlib
from ordered_set import OrderedSet
from collections import defaultdict

In [20]:
#set of shingles without duplicates, in order of insertion
#generate the same hash values for equivalent shingles each time this function is called
#(using hashlib, generate hexadecimal values)
def Shingling(k, file_path):
    with open(file_path, 'r') as file:
        words = [word for line in file for word in line.strip().split()]
    ordered_shingles = OrderedSet()
    #shingles = []
    for i in range(len(words)-k):
        # need to convert sublist to string in order to use encode:
        shingle = ' '.join(words[i:i+k])
        #generating a 128 bit hash value:
        hash_value_0 = hashlib.md5(shingle.encode())
        #using only 4B:
        #converting hexadecimal string to integer from range (0,2^(128)-1) to (0,2^(32)-1) using mdulo
        hash_value = int(hash_value_0.hexdigest(), 16) % (2**32)
        ordered_shingles.add(hash_value)
    
    return ordered_shingles

In [21]:

def CompareSets(hashed_shingle_set1, hashed_shingle_set2):
    # Compute the jaccard similarity of the hash values(shingles) in the sets:
    jacc = len(hashed_shingle_set1.intersection(hashed_shingle_set2))/len(hashed_shingle_set1.union(hashed_shingle_set2))
    return jacc    

In [22]:
# builds minHash signatures from given set of integers (i.e. documents)
# The minhash value of any column is the number of
# the first row, in the permuted order, in which the column has a 1.
def MinHashing(universal_set, sets_of_integers, n):
    """
    * Length of signatures, n, can be selected randomly
    * Theoretically: from the column representing set S, construct the minhash signature for S, the
      vector [h1(S), h2(S), . . . , hn(S)]
    * In practise: instead of picking n random permutations of rows, we pick n randomly
      chosen hash functions h1, h2, . . . , hn on the rows.
    * If the number of rows in the characteristic matrix is not a prime number,
      there will be collisions, where two rows get the same hash value.
    * Compression, n = 50 (att gå från 40094 till 1000 känns bättre)
    * The minHash signature can be a set or array
    """
    #enabling minHash generation: reconstruct the list of hashed_shingle_sets into a characteristic matrix:
    #dimensions: (len(universal_set), len(hashed_shingle_sets)) 
    characteristic_matrix = np.zeros((len(universal_set),len(sets_of_integers)), dtype=int)
    for i in range(len(sets_of_integers)):
        #get array of all indices in universal_set that sets_of_integers[i] has the same value as
        overlaps_i = np.where(np.isin(universal_set, list(sets_of_integers[i])))[0]
        characteristic_matrix[overlaps_i,i] = 1
        
    #heuristic functions: 
    # ai, bi randomly set integers, p prime number
    #hi = (ai*x+bi) % p
    
    scales = np.random.choice(a=10*n, size=n, replace=False)
    bias = np.random.choice(a=10*n, size=n, replace=False)
    #chose a prime number close to 10*n, n=50: 503, 509, 521
    prime_numb = 509

    #signature matrix:
    SIG = np.full((n, len(sets_of_integers)), np.inf)
    
    for r in range(len(characteristic_matrix)):
        h_r = np.zeros(n)
        for i in range(n):
            h_r[i] = (scales[i]*r+bias[i])%prime_numb
        
        for c in range(len(sets_of_integers)):
            if characteristic_matrix[r,c] == 1:
                #decide if should change value in the signature matrix:
                for i in range(n):
                    if h_r[i]<SIG[i,c]:
                       SIG[i,c] = h_r[i]
                    
    return SIG

In [23]:
# generate a list of all documents; ordered sets of integers (hashed shingles)
k = 9 # (de använder 9 i boken)
folder_path = 'data/'
hashed_shingle_sets = [] # length (nr of documents): 99, ranging in length between 23 and 4505 unique shingles
universal_set = OrderedSet() # length: 40094
for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        ordered_shingles = Shingling(k, file_path)
        hashed_shingle_sets.append(ordered_shingles)
        universal_set.update(ordered_shingles)
        
universal_set = np.array(list(universal_set))       

In [24]:
n=50
SIG = MinHashing(universal_set, hashed_shingle_sets, n)
print(SIG)

[[ 0. 22. 15. ... 18.  0. 37.]
 [ 0.  1. 11. ... 16.  0. 11.]
 [ 1.  0. 10. ...  0.  1.  4.]
 ...
 [ 5. 12. 19. ... 17.  0.  8.]
 [ 3.  6.  0. ...  0.  1.  4.]
 [ 1.  0.  2. ...  6.  1.  4.]]


In [25]:
"""
Estimating the Jaccard similarity between two sets of integers:
  * the probability that two columns have the same value in a given row of
    the signature matrix equals the Jaccard similarity of the sets corresponding to
    those columns
  * the expected number of rows in
    which two columns agree equals the Jaccard similarity of their corresponding
    sets
"""
def CompareSignatures(sig_1, sig_2):
    n = len(sig_1)
    similarity = len(np.where(sig_1==sig_2)[0])/n
    return similarity

In [12]:
"""
folder_path = 'data/'
files = sorted(os.listdir(folder_path))
for i in range(SIG.shape[1]):
    for j in range(SIG.shape[1]):
        if i != j:
            compr_int_set_1 = SIG[:,i]
            compr_int_set_2 = SIG[:,j]
            similarity = CompareSignatures(compr_int_set_1, compr_int_set_2)
            shingle_set_1 = hashed_shingle_sets[i]
            shingle_set_2 = hashed_shingle_sets[j]
            jacc = CompareSets(shingle_set_1, shingle_set_2)
            t = 0.3
            if jacc > t:
                print(f"signature similarity: {similarity}")
                print(f"jaccard similarity: {jacc}")
                
                file_path_1 = os.path.join(folder_path, files[i])
                file_path_2 = os.path.join(folder_path, files[i_2])
                if os.path.exists(file_path_1):
                    with open(file_path_1, 'r') as file1:
                        print(f"Contents of {files[i]}:")
                        print(file1.read())
        
                if os.path.exists(file_path_2):
                    with open(file_path_2, 'r') as file2:
                        print(f"Contents of {files[i_2]}:")
                        print(file2.read())
"""               

signature similarity: 0.84
jaccard similarity: 0.36048526863084923
Contents of medical_529.txt:
In article <1993Apr21.143910.5826@wvnvms.wvnet.edu> 
pk115050@wvnvms.wvnet.edu writes:
> My girlfriend is in pain from kidney stones. She says that because she 
has no
> medical insurance, she cannot get them removed.
> My question: Is there any way she can treat them herself, or at least 
mitigate
> their effects? Any help is deeply appreciated. (Advice, referral to 
literature,
> etc...)
> Thank you,
> Dave Carvell
> pk115050@wvnvms.wvnet.edu
First, let me offer you my condolences.  I've had kidney stones 4 times 
and I know the pain she is going through.  First, it is best that she see 
a doctor.  However, every time I had kidney stones, I saw my doctor and the
only thing they did was to prescribe some pain killers and medication for a
urinary tract infection.  The pain killers did nothing for me...kidney stones
are extremely painful.  My stones were judged passable, so we just waited it


In [30]:
#uses a hash function to divide its sub-columns into different buckets
def LSH(b):
    bucket_array = defaultdict(list)
    for i in range(b.shape[1]):
        hash_value_0 = hashlib.md5(b[:,i].tobytes())
        hash_value = int(hash_value_0.hexdigest(), 16) % (2**32)
        bucket_array[hash_value].append(b[:,i]) 
    return bucket_array

In [31]:
"""
* LSH: Locality Sensitivity Hashing, a theory for investigating only the pairs that are likely to be similar
* Candidate pairs: documents hashed to the same bucket
    - False positives: candidate pairs that are dissimilar (estimately low fraction)
    - False negatives: similar pairs that that aren't hashed to the same bucket (estimately low fraction)
    - Normally assume that to sub-columns hash to the same bucket only if they are identical
* Procedure:
    - Choose a threshold t that defines how similar documents have to be in
      order for them to be regarded as a desired “similar pair.”
    - Pick a number of bands b and a number of rows r such that br = n, and the threshold
      t is approximately (1/b)^(1/r).
    - Divide the signature matrix into b nr of bands, each consisting of r rows
    - Each band uses a hash function to divide its sub-columns into different buckets (can use the same hash function as long as use different bucket arrays)
"""
# n = 50
b = 5
r = 10
t = 0.85 
for i in range(b):
    b_i = SIG[i:i+r]
    bucket_array_i = LSH(b_i)
    #loop over key-value pairs
    #check if value=list has more then 1 item
    for hash_value, compr_int_set in bucket_array.items():
        if len(compr_int_set)>1:
            for i in range(len(compr_int_set)):
                for j in range(len(compr_int_set)):
                    if i != j:
                        compr_int_set_1 = SIG[:,i]
                        compr_int_set_2 = SIG[:,j]
                        similarity = CompareSignatures(compr_int_set_1, compr_int_set_2)
                        shingle_set_1 = hashed_shingle_sets[i]
                        shingle_set_2 = hashed_shingle_sets[j]
                        jacc = CompareSets(shingle_set_1, shingle_set_2)
                        if jacc > t:
                            print(f"signature similarity: {similarity}")
                            print(f"jaccard similarity: {jacc}")
                            
                            file_path_1 = os.path.join(folder_path, files[i])
                            file_path_2 = os.path.join(folder_path, files[i_2])
                            if os.path.exists(file_path_1):
                                with open(file_path_1, 'r') as file1:
                                    print(f"Contents of {files[i]}:")
                                    print(file1.read())
                    
                            if os.path.exists(file_path_2):
                                with open(file_path_2, 'r') as file2:
                                    print(f"Contents of {files[i_2]}:")
                                    print(file2.read())
                        