In [1]:
import pandas as pd
import re
import time
import binascii
import random

In [2]:
# Relative path of the CSV file that holds the articles.
small_dataset = 'data/news_articles_small.csv'

# Read the whole CSV into a DataFrame; later cells work on this frame.
df_small_dataset = pd.read_csv(filepath_or_buffer=small_dataset)

In [3]:
"""
    Pre-process data:
        1. convert all to lowercase
        2. remove punctuation
"""

# Matches runs of characters that are neither word characters nor whitespace,
# i.e. the punctuation to strip.
p = re.compile(r'[^\w\s]+')

# Convert to lowercase, then remove punctuation, in a single vectorised pass.
# The .str accessor propagates missing values, so an article that is NaN stays
# NaN instead of raising a TypeError inside re.sub.
# regex=True is required when the pattern is a compiled regex.
df_small_dataset['article'] = (
    df_small_dataset['article']
    .str.lower()
    .str.replace(p, '', regex=True)
)

In [4]:
"""
    Split each document into a list of words

    small_dataset_split = [
        [documentID, document_text]
    ]
"""

# Columns are read by position: the first is used as the document id and the
# second as the text to split on whitespace. Explicit .iloc access replaces
# row[0] / row[1], whose positional fallback on a label-indexed Series is
# deprecated in recent pandas releases.
small_dataset_split = [
    [doc_id, text.split()]
    for doc_id, text in zip(
        df_small_dataset.iloc[:, 0],
        df_small_dataset.iloc[:, 1],
    )
]

In [5]:
"""
    createShingles

    To create the hashed shingles for the articles
    @:param small_dataset_split - List of [documentID, list_of_words] entries, one per article
"""

def createShingles(small_dataset_split, ngram=4):
    """
    createShingles

    Build the set of hashed word shingles for every article.
    Source: https://github.com/chrisjmccormick/MinHash/blob/master/runMinHashExample.py

    @:param small_dataset_split - iterable of [docId, words] pairs, where
        words is the list of words of one article
    @:param ngram - number of consecutive words per shingle (default 4)
    @:return tuple (shingledDocs, docIds, totalShingles)
        shingledDocs  - dict mapping each docId to the set of CRC32 hashes
                        of its shingles
        docIds        - list of the docIds in input order
        totalShingles - number of shingles generated over all articles,
                        counted before de-duplication
    @:raise ValueError - if ngram is smaller than 1

    Prints the elapsed time as a side effect.
    """
    if ngram < 1:
        raise ValueError("ngram must be at least 1")

    shingledDocs = {}
    docIds = []
    totalShingles = 0

    t0 = time.time()

    for docId, article in small_dataset_split:
        # An article with fewer than `ngram` words yields no shingles; clamp
        # at 0 so a short article cannot subtract from the running total.
        shingleCount = max(0, len(article) - ngram + 1)

        # Hash each shingle down to 32 bits; the set drops repeated shingles.
        shingledDocs[docId] = {
            binascii.crc32(" ".join(article[i:i + ngram]).encode()) & 0xffffffff
            for i in range(shingleCount)
        }
        docIds.append(docId)
        totalShingles += shingleCount

    t1 = time.time()
    print('Time spent: ', t1-t0)
    return shingledDocs, docIds, totalShingles

Time spent:  0.3011922836303711


In [7]:
"""
    randomHash

    Hash a value with CRC32 and mask the result with a random value
    @:param value - integer to hash
    @:param rand_value - integer AND-ed with the CRC32 of value
"""
def randomHash(value, rand_value):
    """
    randomHash

    Hash an integer with CRC32 and mask the result with rand_value.

    @:param value - integer to hash; it must be non-negative and fit in
        32 bytes, otherwise int.to_bytes raises OverflowError
    @:param rand_value - integer that is AND-ed with the CRC32 of value
    @:return the masked CRC32 as an int
    """
    # The value is serialised as 32 little-endian bytes before hashing.
    raw_bytes = value.to_bytes(32, "little")
    checksum = binascii.crc32(raw_bytes)
    # NOTE(review): AND-ing keeps only the bits set in rand_value, so distinct
    # checksums can collapse to the same result - confirm this is the intended
    # way to derive the different hash functions.
    return checksum & rand_value

"""
    randomList

    To create a list of random 32-bit integers
    @:param n - number of integers to generate
    @:param seed
"""
def randomList(n, seed=10):
    """
    randomList

    Generate a reproducible list of random 32-bit integers.

    @:param n - number of integers to generate
    @:param seed - seed passed to random.seed (default 10); the same seed
        always produces the same list
    @:return list of n integers in the range [0, 2**32)

    Re-seeds the module-level random generator as a side effect.
    """
    # Honour the caller's seed; the default still reproduces the previous
    # hard-coded seed of 10.
    random.seed(seed)
    return [random.getrandbits(32) for _ in range(n)]

In [None]:
# Build the hashed shingle sets for every article, together with the document
# ids in input order and the total number of shingles generated.
shingledDocs, docIds, totalShingles = createShingles(
    small_dataset_split=small_dataset_split,
)

In [None]:
print("Generating random hash functions...")
# Number of hash functions
M = 5
# One random value per hash function. Sized from M so the list can never be
# shorter than the number of hash functions the MinHash loop indexes into.
random_values = randomList(M)
signatures = []

In [None]:
"""
    MinHashing from shingles
"""
t0 = time.time()

# Start from an empty list so that re-running this cell does not append a
# second copy of every signature to a list left over from an earlier run.
signatures = []

for doc in docIds:
    doc_shingles = shingledDocs[doc]
    signature = []
    for hash_fun in range(M):
        random_value = random_values[hash_fun]
        # MinHash value for this hash function: the smallest hash over the
        # document's shingles. 1e11 is the value recorded for a document with
        # no shingles (same sentinel the manual minimum search started from).
        min_value = min(
            (randomHash(shingle, random_value) for shingle in doc_shingles),
            default=1e11,
        )
        signature.append(min_value)
    # Signatures are stored in the same order as docIds.
    signatures.append(signature)

t1 = time.time()

print('Time spent: ', t1-t0)