In [1]:
import os

def read_documents(path, encoding=None):
    """Read every ``.txt`` file located directly inside ``path``.

    Args:
        path: Directory to scan. Sub-directories are not descended into.
        encoding: Text encoding forwarded to ``open``. ``None`` (the
            default) keeps the platform's default encoding.

    Returns:
        List holding the full text of each ``.txt`` file, ordered by
        file name.
    """
    documents = []
    # os.listdir yields entries in arbitrary order; sorting keeps positional
    # access such as docs[0] / docs[1] stable across runs and machines.
    for file_name in sorted(os.listdir(path)):
        if file_name.endswith('.txt'):
            with open(os.path.join(path, file_name), 'r', encoding=encoding) as f:
                documents.append(f.read())
    return documents
    

In [2]:
# Load the text of every .txt file in data/business into a list of strings.
docs = read_documents("data/business")

In [3]:
class Tokenizer:
    """Whitespace tokenizer that maps lower-cased words to integer IDs.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # word -> integer ID, assigned in order of first appearance.
        self.vocabulary = {}
        # integer ID -> word; the reverse lookup needed by decode().
        self._inverse_vocabulary = {}

    def fit(self, documents):
        """Add every unseen word in ``documents`` to the vocabulary.

        Args:
            documents: Iterable of strings. Each is lower-cased and split
                on whitespace; punctuation stays attached to the words.
        """
        for doc in documents:
            for word in doc.lower().split():
                if word not in self.vocabulary:
                    token_id = len(self.vocabulary)
                    self.vocabulary[word] = token_id
                    self._inverse_vocabulary[token_id] = word

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: String to encode; normalised the same way as in fit().

        Returns:
            List of integer IDs, one per whitespace-separated word.

        Raises:
            KeyError: If a word is not in the vocabulary.
        """
        return [self.vocabulary[word] for word in text.lower().split()]

    def decode(self, tokens):
        """Convert token IDs back into a space-joined, lower-cased string.

        Args:
            tokens: Iterable of integer IDs produced by encode().

        Returns:
            The corresponding words joined by single spaces.

        Raises:
            KeyError: If an ID is not in the vocabulary.
        """
        # Rebuild the reverse map if ``vocabulary`` was edited directly
        # rather than through fit().
        if len(self._inverse_vocabulary) != len(self.vocabulary):
            self._inverse_vocabulary = {
                token_id: word for word, token_id in self.vocabulary.items()
            }
        return ' '.join(self._inverse_vocabulary[token] for token in tokens)

In [4]:
# Build the vocabulary from the whole corpus, then encode the first document.
t = Tokenizer()
t.fit(docs)
# Bare last expression: the notebook displays the full list of token IDs.
t.encode(docs[0])


[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 8,
 14,
 15,
 16,
 0,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 0,
 3,
 36,
 17,
 12,
 37,
 38,
 39,
 8,
 40,
 41,
 42,
 31,
 5,
 43,
 44,
 45,
 46,
 47,
 48,
 8,
 49,
 0,
 50,
 51,
 52,
 53,
 54,
 55,
 22,
 56,
 23,
 12,
 43,
 8,
 14,
 57,
 22,
 58,
 8,
 29,
 30,
 59,
 31,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 8,
 30,
 13,
 15,
 16,
 67,
 68,
 69,
 70,
 51,
 8,
 71,
 72,
 73,
 74,
 8,
 75,
 76,
 77,
 78,
 32,
 33,
 25,
 79,
 80,
 66,
 81,
 43,
 82,
 26,
 83,
 84,
 31,
 85,
 86,
 87,
 1,
 3,
 88,
 89,
 90,
 8,
 91,
 22,
 15,
 39,
 25,
 92,
 93,
 13,
 94,
 95,
 12,
 96,
 97,
 88,
 98,
 99,
 37,
 13,
 8,
 75,
 100,
 14,
 101,
 85,
 102,
 103,
 104,
 105,
 106,
 107,
 11,
 39,
 108,
 73,
 109,
 110,
 111,
 37,
 112,
 113,
 114,
 8,
 0,
 115,
 116,
 48,
 108,
 3,
 117,
 43,
 118,
 22,
 119,
 66,
 8,
 120,
 13,
 15,
 121,
 25,
 59,
 27,
 8,
 122,
 123,
 110,
 30,
 43,
 124,
 31,


In [5]:
# Display the word -> ID mapping built by fit().
t.vocabulary

{'christmas': 0,
 'shoppers': 1,
 'flock': 2,
 'to': 3,
 'tills': 4,
 'shops': 5,
 'all': 6,
 'over': 7,
 'the': 8,
 'uk': 9,
 'reported': 10,
 'strong': 11,
 'sales': 12,
 'on': 13,
 'last': 14,
 'saturday': 15,
 'before': 16,
 'with': 17,
 'some': 18,
 'claiming': 19,
 'record-breaking': 20,
 'numbers': 21,
 'of': 22,
 'festive': 23,
 'shoppers.': 24,
 'a': 25,
 'spokesman': 26,
 'for': 27,
 "manchester's": 28,
 'trafford': 29,
 'centre': 30,
 'said': 31,
 'it': 32,
 'was': 33,
 '"the': 34,
 'biggest': 35,
 'date"': 36,
 'up': 37,
 '5%.': 38,
 'and': 39,
 'regent': 40,
 'street': 41,
 'association': 42,
 'in': 43,
 'central': 44,
 'london': 45,
 'were': 46,
 'also': 47,
 'expecting': 48,
 '"best': 49,
 'ever".': 50,
 'that': 51,
 'picture': 52,
 'comes': 53,
 'despite': 54,
 'reports': 55,
 'disappointing': 56,
 'couple': 57,
 'weeks.': 58,
 'spokeswoman': 59,
 'about': 60,
 '8,500': 61,
 'thousand': 62,
 'vehicles': 63,
 'had': 64,
 'arrived': 65,
 'at': 66,
 '1130': 67,
 'gmt.': 68

In [27]:

class Shingling:
    """Builds the set of k-shingles of a sequence and their hash values."""

    def __init__(self):
        # Shingles (tuples of k consecutive items) from the latest call.
        self.shingles = set()
        # hash() of each shingle from the latest call.
        self.hashed_shingles = set()

    def create_shingles(self, text, k):
        """Create k-shingles from ``text`` and store their hashed values.

        Each call replaces ``self.shingles`` and ``self.hashed_shingles``
        with freshly built sets.

        Args:
            text: Sequence to shingle: a list of tokens (word-level
                shingles) or a string (character-level shingles).
            k: Number of consecutive items per shingle; must be >= 1.

        Returns:
            Unordered set with the built-in ``hash()`` of every distinct
            shingle. Empty when ``text`` has fewer than ``k`` items.

        Raises:
            ValueError: If ``k`` is less than 1.

        Note:
            ``hash()`` is salted per process for ``str`` values, so hashes
            of string shingles are only comparable within one interpreter
            session. Hashes of integer-token shingles are not salted.
        """
        # k == 0 would yield a single empty shingle and negative k would
        # slice the wrong windows, so reject both.
        if k < 1:
            raise ValueError("k must be a positive integer")

        # Slide a window of width k over the sequence; tuples are hashable.
        self.shingles = {tuple(text[i:i + k]) for i in range(len(text) - k + 1)}

        # Keep only the hash of each shingle.
        self.hashed_shingles = {hash(shingle) for shingle in self.shingles}

        return self.hashed_shingles



In [29]:
# Hash the 7-token shingles of the first document's token IDs.
s = Shingling()
shingles = s.create_shingles(t.encode(docs[0]), 7)



In [35]:
class CompareSets:
    """Similarity measures between sets."""

    def jaccard(self, set1, set2):
        """Return the Jaccard similarity of two sets.

        The similarity is the size of the intersection divided by the
        size of the union.

        Args:
            set1: A ``set`` (or ``frozenset``).
            set2: A set or any iterable of hashable items.

        Returns:
            Float in [0.0, 1.0]. Two empty sets are treated as identical
            and give 1.0.
        """
        union_size = len(set1.union(set2))
        # Both inputs empty (e.g. two documents shorter than the shingle
        # length): avoid dividing by zero.
        if union_size == 0:
            return 1.0
        return len(set1.intersection(set2)) / union_size

c = CompareSets()
# Similarity between the 7-shingle sets of the first two documents. Reusing `s` is
# safe: create_shingles rebinds its attributes to new sets, so `shingles` is untouched.
c.jaccard(shingles, s.create_shingles(t.encode(docs[1]), 7))



0.0