In [1]:
import pandas as pd
import re
import time
import binascii
import random
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset = 'data/news_articles_large.csv'
df_dataset = pd.read_csv(dataset)

In [3]:
"""
    Pre-process data:
        1. convert all to lowercase
        2. remove punctuation
"""

#Convert to lowercase.
df_dataset['article'] = df_dataset['article'].str.lower()

#Remove punctuation
p = re.compile(r'[^\w\s]+')
df_dataset['article'] = [p.sub('', x) for x in df_dataset['article'].tolist()]

In [4]:
"""
    Split each document in a list of words

    small_dataset_split = [
        [documentID, document_text]
    ]
"""

dataset_split = []
for idx, row in df_dataset.iterrows():
    dataset_split.append([row[0], row[1].split()])

In [5]:
"""
    createShingles

    To create the shingles for the articles in the dataframe
    @:param small_dataset_split - The dataframe with the articles
"""

def createShingles(small_dataset_split):
#Add shingles with ngram 5
#Source: https://github.com/chrisjmccormick/MinHash/blob/master/runMinHashExample.py
    shingledDocs = {}
    docIds = []

    t0 = time.time()

    totalShingles = 0
    for docId, article in small_dataset_split:
        shingles = set()
        for i in range(0, len(article) - 5):
            shingle = article[i]+ " " + article[i + 1] + " " + article[i + 2] + " " + article[i + 3] + " " + article[i + 4] 

            crc =  binascii.crc32(shingle.encode()) & 0xffffffff
            shingles.add(crc)

        shingledDocs[docId]= shingles
        docIds.append(docId)
        totalShingles = totalShingles + (len(article) - 5)

    t1 = time.time()
    print('Time spent: ', t1-t0)
    return shingledDocs, docIds, totalShingles

In [6]:
"""
    randomHash

    To create random hash functions
    @:param value
    @:param rand_value
"""
def randomHash(value, rand_value):
    return binascii.crc32(value.to_bytes(32, "little")) ^ rand_value

"""
    randomList

    To create random hash functions
    @:param value
    @:param seed
"""
def randomList(n, seed=10):
    random.seed(10)
    l = []
    for i in range(n):
        r = random.getrandbits(32)
        l.append(r)
    return l


In [7]:
shingledDocs, docIds, totalShingles = createShingles(dataset_split)

Time spent:  4.267371892929077


In [8]:
print("Generating random hash functions...")
# Number of hash functions
M = 576
random_values = randomList(M)

Generating random hash functions...


In [9]:
"""
    MinHashing from shingles
"""
signatures = []
t0 = time.time()
for doc in docIds:
    signature = []
    for hash_fun in range(M):
        min_value = 0
        random_value = random_values[hash_fun]
        for shingle in shingledDocs[doc]:
            hash_value = randomHash(shingle, random_value)
            if hash_value < min_value or min_value == 0:
                min_value = hash_value
        
        signature.append(min_value)
    signatures.append(signature)

t1= time.time()

print('Time spent: ', t1-t0)

Time spent:  1349.1122035980225


## Method 1


In [10]:
from numpy import long

"""
    LSH
"""
from itertools import combinations

class LSH:
    def __init__(self, b, r):
        self.counter = 0
        self.b = b
        self.r = r
        self.hash_tables= {}

        self.create_hash_tables()

    def create_hash_tables(self):
        # The hash function must be (s1, s2, p1, p2) sensitive
        # We need to have r hash functions
        # Per band we have a hash table

        # Add hash tables
        for i in range(self.b):
            self.hash_tables[i] = {}

    def hash(self, i, subvec):
        acc = ""
        for value in subvec:
            acc += str(int(float(value)))
        return acc

    def add_to_hash_table(self, i, subvec):
        if subvec not in self.hash_tables[i]:
            self.hash_tables[i][subvec] = []
            self.hash_tables[i][subvec].append(self.counter)
        else:
            self.hash_tables[i][subvec].append(self.counter)

    def make_subvecs(self, signature):
        l = len(signature)
        assert l % self.b == 0
        r = self.r
        # break signature into subvectors
        subvecs = []
        for i in range(0, l, r):
            subvecs.append(signature[i:i+r])
        return np.stack(subvecs)

    def add_hash(self, signature):
        subvecs = self.make_subvecs(signature).astype(str)
        for i, subvec in enumerate(subvecs):
            #Hash every subvector using another hash function
            #and add it to a different hash table that corresponds
            #to that hash function
            hashed_subvec = self.hash(i+1, subvec)
            self.add_to_hash_table(i, hashed_subvec)
        self.counter += 1

    def check_candidates(self):
        candidates = []
        for i in self.hash_tables:
            keys = self.hash_tables[i].keys()
            for bucket in keys:
                hits = self.hash_tables[i][bucket]
                if len(hits) > 1:
                    candidates.extend(combinations(hits, 2))
        return set(candidates)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.


In [11]:
t0 = time.time()
b = 96
r = 6
lsh = LSH(b,r)
l = b * r
for signature in signatures:
    lsh.add_hash(signature[:l])

candidate_pairs = lsh.check_candidates()
print(f"Number of candidate pairs are {len(candidate_pairs)}")
t1 = time.time()
print("time spent: ", t1-t0)

Number of candidate pairs are 342
time spent:  29.346665859222412


In [14]:
pairs = pd.DataFrame(list(candidate_pairs), columns=["doc1", "doc2"])

In [15]:
pairs.shape

(342, 2)

In [16]:
pairs.head()

Unnamed: 0,doc1,doc2
0,2062,5649
1,2435,8217
2,3714,5000
3,1544,7730
4,725,5515


In [17]:
pairs.to_csv("final_pairs.csv")