In [98]:
import os

def read_documents(path):
    documents = []
    for file in os.listdir(path):
        if file.endswith('.txt'):
            with open(os.path.join(path, file), 'r') as f:
                documents.append(f.read())
    return documents
    

In [99]:
class Tokenizer:
    def __init__(self):
        self.vocabulary = {}
    
    def fit(self, documents):
        for doc in documents:
            doc = doc.lower()
            for word in doc.split():
                if word not in self.vocabulary:
                    self.vocabulary[word] = len(self.vocabulary)
        
    def encode(self, text):
        return [self.vocabulary[word.lower()] for word in text.split()]

    def decode(self, tokens):
        return ' '.join([self.vocabulary[token] for token in tokens])

In [100]:

class Shingling:
    def __init__(self):
        self.shingles = set()
        self.hashed_shingles = set()
    
    def create_shingles(self, text, k):
        """Create k-shingles from text and store their hashed values.
        
        Args:
            text: List of tokens or string to create shingles from
            k: Length of each shingle
            
        Returns:
            Set of hashed k-shingles in sorted order
        """
        # Create k-shingles as tuples
        self.shingles = {tuple(text[i:i+k]) for i in range(len(text) - k + 1)}
        
        # Hash each shingle and store in sorted order
        self.hashed_shingles = {hash(shingle) for shingle in self.shingles}
        
        return self.hashed_shingles



In [101]:
class CompareSets:
    def jaccard(self, a, b):
        return len(a.intersection(b)) / len(a.union(b))



In [84]:
from typing import List
import numpy as np

class MinHashing:
    def __init__(self):
        self.unique_shingles = set()
        
    def fit(self, sets:List[set]):
        for set_ in sets:
            self.unique_shingles.update(set_)

    def characteristic_vector(self, shingles: set):
        shingles_array = np.array(list(self.unique_shingles))
        return np.isin(shingles_array, list(shingles)).astype(np.int8)
        

    def signature(self, n, shingles:set):
        np.random.seed(n)
        
        permutations = np.array([np.random.permutation(len(self.unique_shingles)) for _ in range(n)])
        characteristic_vector = self.characteristic_vector(shingles)
        permuted_shingles = permutations * characteristic_vector
        minhash = np.array([np.min(row[row != 0]) if np.any(row != 0) else 0 for row in permuted_shingles])
        return minhash
    

In [85]:
class CompareSignatures:
    def jaccard(self, a:np.array, b:np.array):
        return np.mean(a == b)


In [97]:
docs = [
    "The cat in the hat comes back",
    "The cat in the hat comes tomorrow",
    "I will read this book",
    "I will read this book again",
    "This is a book",
]

t = Tokenizer()
t.fit(docs)

s = Shingling()
mh = MinHashing()

shingled_docs = [s.create_shingles(t.encode(doc), 3) for doc in docs]
mh.fit(shingled_docs)

signatures = [(mh.signature(50, doc)) for doc in shingled_docs]

c = CompareSignatures()

[c.jaccard(signatures[0], signatures[i]) for i in range(len(signatures))]


[np.float64(1.0),
 np.float64(0.62),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0)]