In [None]:
import os
import itertools
import numpy as np

In [98]:
def read_documents(path):
    documents = []
    for file in os.listdir(path):
        if file.endswith('.txt'):
            with open(os.path.join(path, file), 'r') as f:
                documents.append(f.read())
    return documents
    

In [99]:
class Tokenizer:
    def __init__(self):
        self.vocabulary = {}
    
    def fit(self, documents):
        for doc in documents:
            doc = doc.lower()
            for word in doc.split():
                if word not in self.vocabulary:
                    self.vocabulary[word] = len(self.vocabulary)
        
    def encode(self, text):
        return [self.vocabulary[word.lower()] for word in text.split()]

    def decode(self, tokens):
        return ' '.join([self.vocabulary[token] for token in tokens])

In [100]:

class Shingling:
    def __init__(self):
        self.shingles = set()
        self.hashed_shingles = set()
    
    def create_shingles(self, text, k):

        # Create k-shingles as tuples
        self.shingles = {tuple(text[i:i+k]) for i in range(len(text) - k + 1)}
        
        # Hash each shingle and store in sorted order
        self.hashed_shingles = {hash(shingle) for shingle in self.shingles}
        
        return self.hashed_shingles


In [101]:
class CompareSets:
    def jaccard(self, a, b):
        return len(a.intersection(b)) / len(a.union(b))


In [84]:
from typing import List

class MinHashing:
    def __init__(self):
        self.unique_shingles = set()
        
    def fit(self, sets:List[set]):
        for set_ in sets:
            self.unique_shingles.update(set_)

    def characteristic_vector(self, shingles: set):
        shingles_array = np.array(list(self.unique_shingles))
        return np.isin(shingles_array, list(shingles)).astype(np.int8)
        

    def signature(self, n, shingles:set):
        np.random.seed(n)
        
        permutations = np.array([np.random.permutation(len(self.unique_shingles)) for _ in range(n)])
        characteristic_vector = self.characteristic_vector(shingles)
        permuted_shingles = permutations * characteristic_vector
        minhash = np.array([np.min(row[row != 0]) if np.any(row != 0) else 0 for row in permuted_shingles])
        return minhash
    

In [85]:
class CompareSignatures:
    def jaccard(self, a:np.array, b:np.array):
        return np.mean(a == b)


In [138]:
class LSH:
    def candidate_pairs(self, signatures:np.array, b, r):
        # Use a set to store unique pairs
        unique_pairs = set()
        
        for i in range(b):
            band = signatures[:, i*r:(i+1)*r]
            for j in range(len(band)):
                for k in range(j+1, len(band)):
                    if np.array_equal(band[j], band[k]):
                        unique_pairs.add((j, k))
        
        yield from unique_pairs
        
    

In [153]:
N = 50
B = 10
R = 5

docs = read_documents("data/medical")

t = Tokenizer()
t.fit(docs)

s = Shingling()
mh = MinHashing()

shingled_docs = [s.create_shingles(t.encode(doc), 3) for doc in docs]
mh.fit(shingled_docs)

signatures = [(mh.signature(N, doc)) for doc in shingled_docs]

c = CompareSignatures()

sigs = np.array(signatures)
candidate_pairs = list(LSH().candidate_pairs(sigs, b=B, r=R))
for x, y in candidate_pairs:
    print(f"Candidate pair: {x}, {y} \t MinHash Jaccard similarity: {c.jaccard(signatures[x], signatures[y])} \t Shingle Jaccard similarity: {CompareSets().jaccard(shingled_docs[x], shingled_docs[y])}")



Candidate pair: 64, 77 	 MinHash Jaccard similarity: 0.56 	 Shingle Jaccard similarity: 0.4943609022556391
Candidate pair: 46, 77 	 MinHash Jaccard similarity: 0.52 	 Shingle Jaccard similarity: 0.5170068027210885


Here we've discovered some similarity for medical docs 64 and 77, and 46 and 77. Suprisingly, 46 is not considered similar to 64

In [157]:
def view_documents(docs, x, y):
    doc1_lines = docs[x].split("\n")
    doc2_lines = docs[y].split("\n")
    
    # Get max length for padding
    max_len = max(len(max(doc1_lines, key=len)), len(max(doc2_lines, key=len)))
    
    print(f"\n{'='*120}\nComparing documents {x} and {y}\n{'='*120}")
    print(f"{'Document ' + str(x):<{max_len+5}} | {'Document ' + str(y)}")
    print(f"{'-'*(max_len+5)}-+-{'-'*max_len}")
    
    for line1, line2 in itertools.zip_longest(doc1_lines, doc2_lines, fillvalue=""):
        print(f"{line1:<{max_len+5}} | {line2}")
    print()

for x, y in candidate_pairs:
    view_documents(docs, x, y)



Comparing documents 64 and 77
Document 64                                                                                        | Document 77
---------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------
In article <1993Apr22.202051.1@vms.ocom.okstate.edu>, banschbach@vms.ocom.okstate.edu writes:      | In article <1993Apr22.202051.1@vms.ocom.okstate.edu>,
> In article <1r6g8fINNe88@ceti.cs.unc.edu>, jge@cs.unc.edu (John Eyles) writes:                   | banschbach@vms.ocom.okstate.edu wrote:
>> A friend has what is apparently a fairly minor case of Crohn's                                  | > In article <1r6g8fINNe88@ceti.cs.unc.edu>, jge@cs.unc.edu (John Eyles) writes:
>> disease.                                                                                        | > > A friend has what is apparently a fairly minor case of Crohn's
>> But she can't

In [158]:
view_documents(docs, 46, 77)



Comparing documents 46 and 77
Document 46                                                                           | Document 77
--------------------------------------------------------------------------------------+---------------------------------------------------------------------------------
In article <1r6g8fINNe88@ceti.cs.unc.edu>, jge@cs.unc.edu (John Eyles) writes:        | In article <1993Apr22.202051.1@vms.ocom.okstate.edu>,
> A friend has what is apparently a fairly minor case of Crohn's                      | banschbach@vms.ocom.okstate.edu wrote:
> disease.                                                                            | > In article <1r6g8fINNe88@ceti.cs.unc.edu>, jge@cs.unc.edu (John Eyles) writes:
> But she can't seem to eat certain foods, such as fresh vegetables,                  | > > A friend has what is apparently a fairly minor case of Crohn's
> without discomfort, and of course she wants to avoid a recurrence.                  | > > disease.
> Her 

Lets investigate the minhash similarities

In [150]:
# compute the pair wise signature similarities
pairs = list(itertools.combinations(range(len(signatures)), 2))
pair_similarities = [((x,y), c.jaccard(signatures[x], signatures[y])) for x, y in pairs]

pair_similarities.sort(key=lambda x: x[1], reverse=True)

pair_similarities[:10]



[((64, 77), np.float64(0.56)),
 ((46, 77), np.float64(0.52)),
 ((46, 64), np.float64(0.46)),
 ((69, 97), np.float64(0.38)),
 ((40, 49), np.float64(0.34)),
 ((76, 98), np.float64(0.34)),
 ((3, 12), np.float64(0.32)),
 ((56, 73), np.float64(0.32)),
 ((6, 67), np.float64(0.24)),
 ((61, 85), np.float64(0.24))]

Apparently we found the most similar documents earlier, with similarities higher than 0.5