In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
from functions import *
import numpy as np
import os
import pandas as pd
import zlib
from functools import reduce
from itertools import combinations

In [18]:
document1 = "This is a sample text containing some random words"
document2 = "This is a sample text with different random words"
s1 = shingle(document1, k=9)
s2 = shingle(document2, k=9)
print(f'Jaccard similarity: {compareSets(s1, s2)}')

Jaccard similarity: 0.21818181818181817


In [28]:
min_hasher = MinHash(n=200)
sig1 = min_hasher.create_signature(s1)
sig2 = min_hasher.create_signature(s2)
print(f'Estimated Jaccard similarity: {compare_signatures(sig1, sig2)}')

Estimated Jaccard similarity: 0.215


Document 'MIX_OHIO_BROWN_CHEMLAWN.txt' is a mix of documents 'OHIO_MATTRESS.txt', 'BROWN_FORMAN.txt', and 'CHEMLAWN.txt'

Document 'MIX_CHEMLAWN_SHAMROCK.txt' is 'DIAMOND_SHAMROCK.txt' inserted into 'CHEMLAWN.txt'.

In [41]:
data = dict()
print('Documents in dataset:\n')
for i, filename in enumerate(os.listdir("dataset")):
    print(f'{i} {filename}')
    with open(os.path.join("dataset", filename), 'r') as f:
        data[filename[:-4]] = f.read()



Documents in dataset:

0 COMPUTER_TERMINAL_SYSTEMS.txt
1 OHIO_MATTRESS.txt
2 ARGENTINA_COULD_SUSPEND_DEBT PAYMENTS.txt
3 MIX_OHIO_BROWN_CHEMLAWN.txt
4 BRAZIL_ANTI_INFLATION_PLAN.txt
5 KEY_US_TAX_WRITERS_SEEK_ESTATE_TAX_CURBS.txt
6 TOWER_REPORT_DIMINISHES_REAGANS_HOPES_OF_REBOUND.txt
7 DEBT_DOWGRADED_BY_MOODYS.txt
8 CHEMLAWN.txt
9 US_BANK_DISCOUNT_BORROWINGS_310_MLN_DLRS.txt
10 QUEBECOR_HEAD_SEES_NEW_VENTURES_LIKELY.txt
11 DIAMOND_SHAMROCK.txt
12 MIX_CHEMLAWN_SHAMROCK.txt
13 BROWN_FORMAN.txt
14 TALKING_POINT_BANKAMERICA.txt


In [73]:
#Shingling the dataset
list_of_data = [c for c in data]
shingled_data = dict()
for c in data:
    shingled_data[c] = shingle(data[c], k=9)
    
#Creating signatures for the dataset
min_hasher = MinHash(n=200)
signed_data = dict()
for c in data:
    signed_data[c] = min_hasher.create_signature(shingled_data[c])

In [50]:
#Code for searching for similar pairs
def jaccard(c1, c2):
    return compareSets(shingled_data[c1], shingled_data[c2])

def jaccard_estimate(c1, c2):
    return compare_signatures(signed_data[c1], signed_data[c2])

def calculate_similarity_of_all_pairs(method):
    result = pd.DataFrame(columns=list_of_data)
    for row in list_of_data:
        result[row] = [method(row, col) for col in list_of_data]
    return result

In [61]:
def find_pairs_above_threshold(df, threshold):
    checked = set()
    result = set()
    for col in df:
        checked.add(col)
        for i, sim in enumerate(df[col]):
            curr = list_of_data[i]
            if curr in checked: continue
            if sim > threshold: result.add((col, curr))
    return result
            

In [74]:
#Calculating the Jaccard similarity for every pair of documents
jaccard_sim_map = calculate_similarity_of_all_pairs(jaccard)

print(jaccard_sim_map)
gt_result = find_pairs_above_threshold(jaccard_sim_map, 0.07)
print(gt_result)
    

    COMPUTER_TERMINAL_SYSTEMS  OHIO_MATTRESS  \
0                    1.000000       0.032844   
1                    0.032844       1.000000   
2                    0.027453       0.023948   
3                    0.041905       0.137717   
4                    0.031505       0.028986   
5                    0.034010       0.022397   
6                    0.053157       0.034633   
7                    0.025797       0.023445   
8                    0.039463       0.028425   
9                    0.037639       0.026209   
10                   0.043356       0.036420   
11                   0.025580       0.010811   
12                   0.046276       0.028271   
13                   0.037304       0.026866   
14                   0.046143       0.033172   

    ARGENTINA_COULD_SUSPEND_DEBT PAYMENTS  MIX_OHIO_BROWN_CHEMLAWN  \
0                                0.027453                 0.041905   
1                                0.023948                 0.137717   
2                    

In [98]:
#Estimating the Jaccard similarity for every pair of documents
jaccard_sim_estim_map = calculate_similarity_of_all_pairs(jaccard_estimate)

print(jaccard_sim_estim_map)
est_result = find_pairs_above_threshold(jaccard_sim_estim_map, 0.07)
print(est_result)

print('Results of estimate')
acum = 0
for p in est_result:
    if p in gt_result or (p[1], p[0]) in gt_result: acum += 1
print(f'len of est_result: {len(est_result)}')
print(f'found {acum}/{len(gt_result)}')

    COMPUTER_TERMINAL_SYSTEMS  OHIO_MATTRESS  \
0                       1.000          0.020   
1                       0.020          1.000   
2                       0.025          0.025   
3                       0.025          0.120   
4                       0.030          0.015   
5                       0.030          0.020   
6                       0.040          0.045   
7                       0.050          0.005   
8                       0.030          0.015   
9                       0.040          0.040   
10                      0.025          0.015   
11                      0.010          0.025   
12                      0.030          0.015   
13                      0.030          0.020   
14                      0.045          0.035   

    ARGENTINA_COULD_SUSPEND_DEBT PAYMENTS  MIX_OHIO_BROWN_CHEMLAWN  \
0                                   0.025                    0.025   
1                                   0.025                    0.120   
2                    

COMPUTER_TERMINAL_SYSTEMS
OHIO_MATTRESS
ARGENTINA_COULD_SUSPEND_DEBT PAYMENTS
MIX_OHIO_BROWN_CHEMLAWN
BRAZIL_ANTI_INFLATION_PLAN
KEY_US_TAX_WRITERS_SEEK_ESTATE_TAX_CURBS
TOWER_REPORT_DIMINISHES_REAGANS_HOPES_OF_REBOUND
DEBT_DOWGRADED_BY_MOODYS
CHEMLAWN
US_BANK_DISCOUNT_BORROWINGS_310_MLN_DLRS
QUEBECOR_HEAD_SEES_NEW_VENTURES_LIKELY
DIAMOND_SHAMROCK
MIX_CHEMLAWN_SHAMROCK
BROWN_FORMAN
TALKING_POINT_BANKAMERICA


In [102]:
class LSH:
    def __init__(self, signatures, num_of_bands, threshold):
        self.sig = signatures
        self.num_of_bands = num_of_bands
        self.threshold = threshold
        self.buckets = dict()
        for k in self.sig: 
            assert len(self.sig[k]) % self.num_of_bands == 0
            self.width_of_band = int(len(self.sig[k]) / self.num_of_bands)
    
    def clear_buckets(self):
        for k in self.buckets:
            self.buckets[k].clear()
    
    def add_to_bucket(self, bucket, candidate):
        if not bucket in self.buckets: self.buckets[bucket] = set()
        self.buckets[bucket].add(candidate)
    
    def hash_band(self, band):
        reduced_band = reduce(lambda accum, x: f'{accum}{x}', band)
        return zlib.adler32(reduced_band.encode('utf8'))
    
    def get_candidates(self):
        result = set()
        for i in range(self.num_of_bands):
            self.clear_buckets()
            for doc in self.sig:
                #print(f'doc:{doc}, i:{i}, band:{self.sig[doc]}')
                hashed_band = self.hash_band(self.sig[doc][i*self.width_of_band:((i+1)*self.width_of_band)])
                self.add_to_bucket(bucket=hashed_band, candidate=doc)
            for b in self.buckets:
                result.update(set(combinations(self.buckets[b],2)))
        #print(f'len of result: X : {len(result)}')
        return result
    
    def get_candidates_above_threshold(self):
        result = set()
        candidates = self.get_candidates()
        for c in candidates:
            if jaccard(c[0], c[1]) > self.threshold:
                result.add(c)
        return result
                
            
        

In [109]:
lsh = LSH(signatures=signed_data, num_of_bands=100, threshold=0.07)
lsh_result = lsh.get_candidates_above_threshold()
print(lsh_result)

print('Results of LSH')
acum = 0
for p in lsh_result:
    if p in gt_result or (p[1], p[0]) in gt_result: acum += 1
print(f'len of est_result: {len(lsh_result)}')
print(f'found {acum}/{len(gt_result)}')

len of result: X : 20
{('MIX_OHIO_BROWN_CHEMLAWN', 'OHIO_MATTRESS'), ('DIAMOND_SHAMROCK', 'MIX_CHEMLAWN_SHAMROCK'), ('TALKING_POINT_BANKAMERICA', 'QUEBECOR_HEAD_SEES_NEW_VENTURES_LIKELY'), ('TOWER_REPORT_DIMINISHES_REAGANS_HOPES_OF_REBOUND', 'MIX_OHIO_BROWN_CHEMLAWN'), ('BROWN_FORMAN', 'MIX_OHIO_BROWN_CHEMLAWN'), ('MIX_CHEMLAWN_SHAMROCK', 'CHEMLAWN'), ('TALKING_POINT_BANKAMERICA', 'TOWER_REPORT_DIMINISHES_REAGANS_HOPES_OF_REBOUND'), ('QUEBECOR_HEAD_SEES_NEW_VENTURES_LIKELY', 'TOWER_REPORT_DIMINISHES_REAGANS_HOPES_OF_REBOUND'), ('CHEMLAWN', 'MIX_OHIO_BROWN_CHEMLAWN'), ('MIX_CHEMLAWN_SHAMROCK', 'MIX_OHIO_BROWN_CHEMLAWN')}
Results of LSH
len of est_result: 10
found 10/15
