In [1]:
from icecream import ic
import numpy as np
import random
import math
from collections import defaultdict

In [40]:
# Load the four sample documents. Each file is opened in a context manager so
# its handle is closed as soon as the contents are in memory, instead of being
# left open until the garbage collector gets to it.
with open('t1.txt', 'r') as fh:
    text1 = fh.read()
with open('t2.txt', 'r') as fh:
    text2 = fh.read()
# NOTE(review): these two files are read in binary mode, so text3/text4 are
# `bytes` while text1/text2 are `str`. Shingles taken from them are therefore
# bytes slices and can never equal a str shingle — confirm this is intended.
with open('ca1851-match.txt', 'rb') as fh:
    text3 = fh.read()
with open('ny1850-match.txt', 'rb') as fh:
    text4 = fh.read()

In [41]:
# Corpus for the steps below; a document's position in this list is the index
# later used to address its shingles and signature.
do = [text1, text2, text3, text4]

In [44]:
# Eyeball the second document (index 1) of the corpus.
print(do[1])

A CSV file (Comma Separated Values File) is Kobe James by spreadsheet programs such as Microsoft Excel or OpenOffice Calc. If I want you to test download of a file which contains plain text data sets separated by commas
(database) just ddownload this file. I am on this way that.


### Shingling

In [2]:
class K_Shingles():
    """Character-level k-shingling of a collection of documents.

    Every document is cut into all of its overlapping length-``k`` slices
    (shingles). Each distinct shingle seen anywhere in the collection is given
    an integer id, so a document can be represented as a set of ids.
    """

    def __init__(self, documents, k):
        """
        documents: the input documents; each one must support ``len`` and
            slicing (``str`` or ``bytes``)
        k: length of every shingle, i.e. how many consecutive characters
            (or bytes) make up one shingle; must be >= 1

        Raises ValueError when ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer, got %r" % (k,))

        self.documents = documents
        self.k = k

        # One list of shingles per document, in document order. A document
        # shorter than k yields an empty list.
        self._k_shingle_ = [
            [d[i:i + k] for i in range(len(d) - k + 1)]
            for d in documents
        ]

        # Ids are handed out in first-seen order rather than by iterating a
        # set: set order of strings changes between interpreter runs, which
        # made the ids (and everything hashed from them) irreproducible.
        self.dict_ = {}
        for doc_shingles in self._k_shingle_:
            for shingle in doc_shingles:
                if shingle not in self.dict_:
                    self.dict_[shingle] = len(self.dict_)

    def get_hash_shingle(self):
        """Return, per document, the sorted list of its distinct shingle ids."""
        return [
            sorted(self.dict_[shingle] for shingle in set(doc_shingles))
            for doc_shingles in self._k_shingle_
        ]

    def compare_sets(self, s1, s2):
        """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two id collections.

        Two empty collections are treated as identical (similarity 1.0)
        instead of raising ZeroDivisionError.
        """
        s1 = set(s1)
        s2 = set(s2)
        union = s1.union(s2)
        if not union:
            return 1.0
        return len(s1.intersection(s2)) / len(union)

In [6]:
# Cut every document in `do` into overlapping slices of length 5.
shingle_documents = K_Shingles(do, 5)

In [7]:
# One sorted list of distinct integer shingle ids per document.
hashed_shingle = shingle_documents.get_hash_shingle()

In [9]:
# Exact Jaccard similarity between documents 0 and 1, computed on their shingle-id sets.
ja = shingle_documents.compare_sets(hashed_shingle[0], hashed_shingle[1])
print(ja)

0.7412587412587412


In [10]:
# Exact Jaccard similarity between documents 2 and 3, computed on their shingle-id sets.
ja34 = shingle_documents.compare_sets(hashed_shingle[2], hashed_shingle[3])
print(ja34)

0.5007718431614696


### MinHashing

In [3]:
class MinHashing():
    """MinHash signatures built from linear hash functions ``(a*x + b) % c``.

    Each document is a collection of integer shingle ids. Its signature has
    one entry per hash function: the smallest hash value over the document's
    ids. The fraction of positions on which two signatures agree estimates
    the Jaccard similarity of the underlying id sets.
    """

    def __init__(self, func_num, num_shingle, random_generate=True, hash_functions=None, seed=None):
        '''
        func_num: number of hash functions
        num_shingle: total number of all shingles
        random_generate: if False, the hash_functions will be the pre-defined ones
        hash_functions: list of strings shaped like '(ax+b)%c', e.g. '(3x+1)%5';
            only read when random_generate is False
        seed: optional seed for the random parameters; None (default) draws
            from the global ``random`` state as before

        Raises ValueError when pre-defined functions are requested but missing,
        their count differs from func_num, or a string cannot be parsed.
        '''
        self.func_num = func_num
        self.num_shingle = num_shingle
        self.functions_params = []

        if not random_generate:
            if hash_functions is None:
                raise ValueError("hash_functions is required when random_generate is False")
            if self.func_num != len(hash_functions):
                raise ValueError(
                    "func_num (%d) does not match the number of hash_functions (%d)"
                    % (self.func_num, len(hash_functions))
                )
            for hash_function in hash_functions:
                self.functions_params.append(self._parse_hash_function(hash_function))
        else:
            rng = random.Random(seed) if seed is not None else random
            # The modulus must be a prime >= the number of shingle ids. Only
            # then is x -> (a*x + b) % c, with 1 <= a < c, a bijection on
            # [0, c), so distinct ids never collide. A composite modulus lets
            # `a` share a factor with it, which merges ids and biases the
            # similarity estimate.
            c = self._next_prime(max(int(num_shingle), 2))
            for _ in range(func_num):
                self.functions_params.append((rng.randint(1, c - 1), rng.randint(0, c - 1), c))

    @staticmethod
    def _next_prime(n):
        """Return the smallest prime >= n, found by trial division."""
        candidate = max(n, 2)
        while True:
            divisor = 2
            is_prime = True
            while divisor * divisor <= candidate:
                if candidate % divisor == 0:
                    is_prime = False
                    break
                divisor += 1
            if is_prime:
                return candidate
            candidate += 1

    @staticmethod
    def _parse_hash_function(hash_function):
        """Turn a string like '(3x+1)%5' into the tuple (a, b, c); a missing `a` means 1."""
        import re  # local import so the class does not rely on a later notebook cell

        match = re.fullmatch(
            r'\(\s*(-?\d+)?\s*\*?\s*x\s*\+\s*(-?\d+)\s*\)\s*%\s*(-?\d+)',
            hash_function.strip(),
        )
        if match is None:
            raise ValueError("hash function %r is not of the form '(ax+b)%%c'" % (hash_function,))
        a_text, b_text, c_text = match.groups()
        c = int(c_text)
        if c == 0:
            raise ValueError("hash function %r has a zero modulus" % (hash_function,))
        return (int(a_text) if a_text else 1, int(b_text), c)

    def min_hashing(self, labeled_shingles):
        """Return one signature (list of func_num minima) per document.

        labeled_shingles: one list of integer shingle ids per document.
        A document without any shingle gets the modulus ``c`` in every
        position. No real id can hash to that value, so empty documents only
        ever agree with other empty documents.
        """
        signatures = []
        for shingle_ids in labeled_shingles:
            signatures.append([
                min(((a * x + b) % c for x in shingle_ids), default=c)
                for a, b, c in self.functions_params
            ])
        return signatures

    def compare_signatures(self, s1, s2):
        """Return the fraction of positions where the two signatures are equal.

        This is the MinHash estimate of the Jaccard similarity. Raises
        ValueError when the signatures differ in length; two empty signatures
        give 0.0.
        """
        if len(s1) != len(s2):
            raise ValueError("signatures must have the same length")
        if len(s1) == 0:
            return 0.0
        same_c = sum(1 for left, right in zip(s1, s2) if left == right)
        return same_c / len(s1)

In [12]:
# Example hand-written hash functions in the '(ax+b)%c' format. They are passed
# through below but NOT used: MinHashing only reads them when random_generate
# is False, and it is True here.
functions = ['(x+1)%5', '(3x+1)%5', '(4x+11)%3']
func_num = 1500
# Size of the shingle vocabulary built above.
num_shingle = len(shingle_documents.dict_)
random_generate = True
k = MinHashing(func_num, num_shingle, random_generate, functions)

In [13]:
# k.functions_params

In [14]:
# One MinHash signature per document, in the same order as `hashed_shingle`.
signatures = k.min_hashing(hashed_shingle)

In [15]:
# signatures

In [16]:
# Fraction of signature positions on which documents 2 and 3 agree.
similarites = k.compare_signatures(signatures[2], signatures[3])

In [17]:
# Display the signature-based similarity computed in the previous cell.
similarites

0.6666666666666666

## LSH

In [4]:
class LSH():
    """Locality-sensitive hashing over MinHash signatures using the banding technique.

    The signature columns are split into ``b`` consecutive bands of ``r``
    columns each. Two documents become a candidate pair when they agree on
    every column of at least one band.
    """

    def __init__(self, t, func_num, mode, signatures):
        '''
        t: similarity threshold
        n / func_num: number of functions used in minhashing
        mode: 'false_negatives' to avoid false negatives; any other value
            avoids false positives
        signatures: one signature (sequence of func_num integers) per document
        '''
        self.t = t
        self.n = func_num
        self.mode = mode
        self.signatures = np.array(signatures)

        self.b, self.r = self.chose_b_r()

    def chose_b_r(self):
        """Pick the number of bands ``b`` and rows per band ``r = n // b``.

        A banding scheme has an approximate similarity threshold of
        ``(1/b) ** (1/r)``. Every b in 1..n is evaluated and the one whose
        threshold is closest to ``t`` on the wanted side is returned:
        at or below ``t`` for mode 'false_negatives', at or above ``t``
        otherwise. If no b lies on the wanted side, the overall closest one is
        used. The scan is bounded by n, so it always terminates — the former
        binary search could spin forever when no integer b fell inside its
        tolerance window.

        Returns the tuple (b, r). Raises ValueError when n < 1.
        """
        t = self.t
        n = int(self.n)
        if n < 1:
            raise ValueError("func_num must be a positive integer")

        avoid_false_negatives = self.mode == 'false_negatives'
        best_key = None
        best_b_r = None
        for b in range(1, n + 1):
            r = n // b
            threshold = (1 / b) ** (1 / r)
            if avoid_false_negatives:
                on_wanted_side = threshold <= t
            else:
                on_wanted_side = threshold >= t
            # Sort order: wanted side first, then distance to t, then smaller b.
            key = (not on_wanted_side, abs(threshold - t), b)
            if best_key is None or key < best_key:
                best_key = key
                best_b_r = (b, r)
        return best_b_r

    def find_candidate_pairs(self):
        """Return a defaultdict mapping each document index to its candidate partners.

        The mapping is symmetric and only contains documents that have at
        least one candidate. Columns left over after the ``b`` full bands
        (when ``b * r < n``) form one additional, narrower band.
        """
        candidate_dict = defaultdict(set)
        if self.signatures.ndim != 2:
            # Empty or malformed input: there is nothing to band.
            return candidate_dict

        num_cols = self.signatures.shape[1]
        # Band i covers columns [i*r, (i+1)*r). Slicing i:i+r instead would
        # slide the window by one column, so bands would overlap and most of
        # the signature would never be looked at.
        bounds = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
        if self.b * self.r < self.n:
            bounds.append((self.b * self.r, num_cols))

        for start, stop in bounds:
            band = self.signatures[:, start:stop]
            if band.shape[1] == 0:
                # A zero-width band would make every pair of documents match.
                continue
            # Documents with an identical band land in the same bucket.
            buckets = defaultdict(list)
            for doc_id, row in enumerate(band):
                buckets[tuple(row.tolist())].append(doc_id)
            for members in buckets.values():
                if len(members) < 2:
                    continue
                for doc_id in members:
                    candidate_dict[doc_id].update(m for m in members if m != doc_id)

        return candidate_dict

In [19]:
# Build the LSH index over the signatures with a similarity threshold of 0.5.
# 1500 must match the number of hash functions used to build `signatures`.
# The alternative mode value is 'false_negatives'.
mode = 'false_positives' # 'false_negatives'
lsh = LSH(0.5, 1500, mode, signatures)

In [25]:
# Map each document index to the set of document indices it shares a band with.
candidate_dict = lsh.find_candidate_pairs()

In [26]:
# Display the candidate pairs found by LSH.
candidate_dict

defaultdict(set, {0: {1}, 1: {0}, 2: {3}, 3: {2}})

## Test for scalability

In [5]:
import time
import re
import pandas as pd

from bs4 import BeautifulSoup
from urllib.request import urlopen
import random
import requests

In [6]:
# Browser-like User-Agent sent with every request made by craw().
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36"
headers = {'User-Agent':user_agent}
def craw(url, timeout=10):
    """Download `url` and return its body parsed as a BeautifulSoup document.

    url: address of the page to fetch
    timeout: seconds to wait for the server before giving up; without one,
        requests waits indefinitely and a stalled server would hang the notebook

    Raises requests.exceptions.HTTPError for 4xx/5xx responses, so an error
    page is never silently treated as document content, and the usual
    requests exceptions on connection failures or timeouts.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    return soup

In [27]:
# Local documents. Each file is opened in a context manager so its handle is
# closed as soon as the contents are read, instead of being leaked.
with open('t1.txt', 'r') as fh:
    text1 = fh.read()
with open('t2.txt', 'r') as fh:
    text2 = fh.read()
with open('ca1851-match.txt', 'r') as fh:
    text3 = fh.read()
with open('ny1850-match.txt', 'r') as fh:
    text4 = fh.read()
# Two web pages, kept as the string form of the parsed page.
# NOTE(review): str(soup) is the HTML markup rather than the visible text, so
# shared page boilerplate contributes to the measured similarity — confirm
# this is intended.
text5  = str(craw("https://en.wikipedia.org/wiki/Boston_Celtics"))
text6 = str(craw('https://en.wikipedia.org/wiki/Los_Angeles_Lakers'))
with open('ca1851-nomatch.txt', 'r') as fh:
    text7 = fh.read()

In [28]:
# Corpus for the scalability run: the local files plus the two crawled pages.
# A document's position in this list is its index in the results below.
do = [text1, text2, text3, text4, text5, text6, text7]

In [44]:
# End-to-end run: shingling -> MinHash signatures -> LSH candidates -> verification.
NUM_SHINGLES = 5   # passed as k to K_Shingles, i.e. the length of each shingle
FUNC_NUM = 1000    # number of MinHash functions (= signature length)
THRESHOLD = 0.4    # minimum signature similarity for two documents to count as similar

# perf_counter is a monotonic high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
time_begin = time.perf_counter()

shingle_documents = K_Shingles(do, NUM_SHINGLES)
hashed_shingle = shingle_documents.get_hash_shingle()

num_shingle = len(shingle_documents.dict_)
random_generate = True
min_hashing = MinHashing(FUNC_NUM, num_shingle, random_generate)
signatures = min_hashing.min_hashing(hashed_shingle)

mode = 'false_negatives' # 'false_positives'
lsh = LSH(THRESHOLD, FUNC_NUM, mode, signatures)
candidate_dict = lsh.find_candidate_pairs()

# Keep only the candidate pairs whose signature similarity exceeds THRESHOLD.
# The loop variable is deliberately not called `k`, so it does not overwrite
# the MinHashing instance bound to `k` in an earlier cell.
similar_doc = defaultdict(set)
for doc_id, candidates in candidate_dict.items():
    for candidate in candidates:
        # .get() avoids inserting an empty entry for documents that end up
        # with no confirmed partner.
        if candidate in similar_doc.get(doc_id, ()):
            continue

        similarites = min_hashing.compare_signatures(signatures[doc_id], signatures[candidate])
        if similarites > THRESHOLD:
            similar_doc[doc_id].add(candidate)
            similar_doc[candidate].add(doc_id)

# Stop the clock before printing so output rendering is not part of the timing.
time_end = time.perf_counter()

print(similar_doc)

defaultdict(<class 'set'>, {0: {1}, 1: {0}, 2: {3}, 3: {2}, 4: {5}, 5: {4}})


In [50]:
# Average processing time per document, in seconds, for the run above.
scalability = (time_end - time_begin) / len(do)
print(scalability)

7.218701975686209
