## Importing packages

In [1]:
import json
import pandas as pd
import re
import time
import numpy as np
import random
import collections
import itertools
import math
from difflib import SequenceMatcher

## Importing and cleaning the data

In [2]:
#import json data
with open('TVs-all-merged.json') as file:
    original_data=json.load(file)

new_data = {}
i = 1
for key in original_data.keys():
    for description in original_data[key]:
        new_data[i] = description
        i+=1
print(len(new_data.keys()))

1624


In [3]:
#Obtain bootstrapped data (about 60%)
bootData={}
numBootstraps=int(len(new_data.keys())*0.63) #63% of data for bootstrap
for i in range(1, numBootstraps+1):
    index=random.randint(1, len(new_data.keys()))
    #bootData[i]=new_data[index].items()
    bootData[i]=(new_data[index])

data=bootData

## Cleaning title and featuresMap input for hashing

In [4]:
#universal term for 'inch' and 'hz' and make everything lower case
def preprocess(text):
    CleaningDictionary = {'inch':['Inch', 'inches', '"', '-inch', ' inch', 'inch']
                    ,'hz':['Hertz', 'hertz', 'Hz', 'HZ', ' hz', '-hz', 'hz']}
    for normVal in CleaningDictionary.keys():
        possible=CleaningDictionary[normVal]
        for notNormVal in possible:
            text=text.replace(notNormVal, normVal)
    title=text.lower()
    title=re.sub("[^a-zA-Z0-9\s\.]","",title)
    return title

for i in range(1, len(data.keys())+1):
    data[i]['title']=preprocess(data[i]['title'])
    for feature in data[i]['featuresMap']:
        data[i]['featuresMap'][feature]=preprocess(data[i]['featuresMap'][feature])


In [5]:
pattern='([a-zA-Z0-9]*(([0-9]+)|([0-9]+))[a-zA-Z0-9]*)'
shingles_title=[]
num_sig=512

for i in range(1, len(data.keys())+1):
    for titleWords in re.findall(pattern, data[i]['title']):
        shingles_title.append(titleWords[0].strip()) #collection of all words
        
shingles=list(set(shingles_title))

#add all shingles of lenght 8 or longer twice; modelID or specific inch; let count more
for i in range(0, len(shingles)-1):
    if len(shingles[i])>=8:
        shingles.append(shingles[i])

for i in range(1, len(data.keys())+1):
    all_prods=[]
    bin_vecs=[]
    
    for titleWords in re.findall(pattern, data[i]['title']):
        all_prods.append(titleWords[0].strip())
    for word in shingles:
        if word in all_prods:
            bin_vecs.append(1)
        else:
            bin_vecs.append(0)
            
    data[i]['vector']=bin_vecs
    #data[i]['signature']=[int(len(bin_vecs)/2)] * (int(len(bin_vecs) /2)) 
    data[i]['signature']=[num_sig] * (num_sig)

In [510]:
shingles

['e423',
 '46k316',
 'led3dtv5586',
 'kdl32r400a',
 '45910inch',
 'un55d7000',
 'lc40le835u',
 'le32f2220',
 '58',
 '50l1350u',
 'led42c45rq',
 '46',
 'avk10s22w',
 'ln46c750',
 'e291ia1',
 '2158inch',
 '32le5300',
 'lc60le835u',
 'un55es7500',
 'm190mv',
 '50k360g',
 'un75f7100',
 '19',
 '55lm7600',
 'e550ia0',
 'un60es6500',
 '50ln5700',
 'tcp42s30',
 'un39eh5003',
 'sled3900',
 '232750inch',
 'le40d3281',
 'pn50c7000',
 'le55fhdf3310',
 'un65f7050a',
 '2010',
 '60la6200',
 '50pm9700',
 'pn51f8500afxza',
 '29l1350u',
 'le39f32800',
 '55lk520',
 'lc90le657u',
 'un55d8000',
 'pn43d490',
 '40l2200u',
 'pled1960a',
 '64',
 'sc462tc',
 'un46f5500',
 '42lt770h',
 'un75f7100afxza',
 'un40f5500afxza',
 '47la6900',
 'm550sl',
 '55lm6200',
 '50l5200u',
 'lc60le550u',
 'e463',
 'tcl32e3',
 '46pfl3708f7',
 '32ls349c',
 '50ln5750',
 '22lb45rqd',
 'un40es6580',
 '9inchdiagonal',
 'ln32d403fxza',
 'un22f5000afxza',
 'un32eh4003fxza',
 '27inch',
 '60la7400',
 '42pj350c',
 'e470vle',
 'un50eh5300rb',

In [637]:
shingles_title[0]

'50l2300u'

[]

## MinHash algorithm, creating signatures

In [513]:
def isPrime(n):
    for i in range(2,int(n**0.5)+1):
        if n%i==0:
            return False
    if n < int(len(bin_vecs) /2):
        return False
        
    return True

def HashFunction(a,b,p,r):
    return (a + b*r) % p 

primes = [i for i in range(0,len(shingles)) if isPrime(i)]

HashFrame = pd.DataFrame({'HashFunction':list(range(0,num_sig))
              ,'a': np.random.randint(0,len(shingles),size=num_sig)
              ,'b': np.random.randint(0,len(shingles),size=num_sig)
              ,'p': random.choices(primes,k=num_sig)})


integercounter = 0

for i in range(1, len(data.keys())+1):
    if (integercounter == 50):
            print('50 done')
            integercounter = 0
    for index,row in HashFrame.iterrows():
        x = 0
        for entry in data[i]['vector']:
            if entry == 1:
                data[i]['signature'][index] = min(data[i]['signature'][index],
                    HashFunction(int(row['a'])
                    ,int(row['b'])
                    ,int(row['p'])
                    ,x))
            x=x+1
    integercounter+=1

50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done
50 done


In [514]:
signatures=pd.DataFrame(None)
for i in range(1, len(data.keys())+1):
    signatures=pd.concat([signatures, pd.DataFrame({i:data[i]['signature']})], axis=1)

In [515]:
signatures

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,244,285,259,244,512,512,153,512,303,13,...,24,153,22,58,235,399,190,242,244,491
1,512,351,19,45,378,186,22,45,0,36,...,440,36,36,178,71,109,313,371,435,121
2,43,43,34,43,43,43,43,43,43,43,...,3,43,43,43,196,43,43,43,43,41
3,26,237,50,26,52,55,164,473,182,83,...,319,512,99,26,116,47,0,26,26,16
4,197,95,27,492,28,281,81,95,186,33,...,7,88,281,122,61,95,95,116,492,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,380,475,56,431,41,289,43,512,204,349,...,329,57,6,103,99,399,120,298,83,135
508,16,16,160,42,64,241,3,264,16,11,...,268,11,11,38,39,16,69,42,42,175
509,107,111,206,206,111,129,27,111,87,206,...,155,206,14,8,309,70,111,162,177,206
510,255,255,148,68,255,159,99,68,224,161,...,255,161,161,199,26,74,111,105,104,46


In [516]:
len(data[1]['vector'])

1263

## LSH procedure using the generated signatures

In [646]:
#code based on https://towardsdatascience.com/locality-sensitive-hashing-how-to-find-similar-items-in-a-large-set-with-precision-d907c52b05fc
#don't forget to cite

def LSHCandidates(sigs, b, r):
    n, prod=sigs.shape
    assert(n==b*r)
    hashbuckets=collections.defaultdict(set)
    bands=np.array_split(sigs, b, axis=0) #split the signatures into the b bands
    for i,band in enumerate(bands):
        for j in range(prod):
            
            #hashbuckets[tuple(bands[j])].append(j)
            
            band_id=tuple(list(band.iloc[:,j])+[str(i)])
            hashbuckets[band_id].add(j) #add bands to buckets with same band_ids
    candidate_pairs=set()
    for bucket in hashbuckets.values():
        if len(bucket)>1:
            for pair in itertools.combinations(bucket,2): #for all productbands in same bucket, make candidate pairs
                candidate_pairs.add(pair)
    return candidate_pairs


candidate_pairs=LSHCandidates(signatures, 32, 16)
len(candidate_pairs)

#elements in candidate_pairs need to be incremented by 1 to relate back to "data"
cands=list(candidate_pairs)
cands1=np.array(cands)
candidates=[x+1 for x in cands1] #array of candidate pairs
candidates=[c.tolist() for c in candidates]

same=[]
for i in range(0, len(candidates)-1):
    for j in range(0, len(candidates)-1):
        if candidates[i][0]==candidates[j][1] and candidates[i][1]==candidates[j][0] and i!=j:
            same.append(candidates[i])

In [647]:
test=[]
for item in candidates:
    if item not in same:
        test.append(item)
        
len(test)
candidates=test

In [648]:
def keyToString(key):
    key=re.sub(r'[^\w\s]','',key)
    string=key.lower()
    string=string.replace(" ", "")
    return string

similarities=[]
for pair in range(0, len(candidates)-1):
    init1=preprocess(data[candidates[pair][0]]['title'])
    init2=preprocess(data[candidates[pair][1]]['title'])
    title1=keyToString(init1)
    title2=keyToString(init2)
    sim=SequenceMatcher(None, title1, title2).ratio()
    similarities.append(sim)
    
sameProducts=[i for i,e in enumerate(similarities) if e>=0]
    
vectorOfPairs=[]
for index, value in enumerate(sameProducts):
    vectorOfPairs.append(candidates[value])
    
modelIDs=[]
for i in range(1, len(data.keys())+1):
    #modelIDs[i]=data[i]['modelID']
    modelIDs.append(data[i]['modelID'])
    
def listDuplicates(modelIDs):
    dups=collections.defaultdict(set)
    for index, product in enumerate(modelIDs):
        dups[product].add(index)
    duplicates=set()
    for match in dups.values():
        if len(match)>1:
            for pair in itertools.combinations(match,2):
                duplicates.add(pair)
    return duplicates

duplicates=listDuplicates(modelIDs)

TP=0
truePos=[]
trueMod=[]
for i in range(0, len(vectorOfPairs)-1):
    if data[vectorOfPairs[i][0]]['modelID']==data[vectorOfPairs[i][1]]['modelID']:
        TP=TP+1 #number of true positives
        trueMod.append(data[vectorOfPairs[i][1]]['modelID'])
        truePos.append(vectorOfPairs[i])

FP=len(vectorOfPairs)-TP #number of false positives
FN=len(duplicates)-TP

pairCompl=TP/len(duplicates)
pairQual=TP/len(candidates)
precision=TP/(TP+FP)
recall=TP/(TP+FN)
F1=2*(precision*recall)/(precision+recall)
F1star=2*(pairQual*pairCompl)/(pairQual+pairCompl)

In [649]:
print('Pair completenes is: ' + str(pairCompl))
print('Pair quality is: ' + str(pairQual))
print('F1-measure is: ' + str(F1))
print('F1*-measure is: ' + str(F1star))

Pair completenes is: 0.7006651884700665
Pair quality is: 0.5039872408293461
F1-measure is: 0.5868152274837511
F1*-measure is: 0.5862708719851577


0.7006651884700665

In [625]:
k=preprocess(data[112]['title'])
keyToString(k)

'dynex40inchclass1080p60hzlcdhdtv'

In [626]:
def keyToString(key):
    key=re.sub(r'[^\w\s]','',key)
    string=key.lower()
    string=string.replace(" ", "")
    return string

similarities=[]
for pair in range(0, len(candidates)-1):
    init1=preprocess(data[candidates[pair][0]]['title'])
    init2=preprocess(data[candidates[pair][1]]['title'])
    title1=keyToString(init1)
    title2=keyToString(init2)
    sim=SequenceMatcher(None, title1, title2).ratio()
    similarities.append(sim)
    
sameProducts=[i for i,e in enumerate(similarities) if e>=0.70]
    
vectorOfPairs=[]
for index, value in enumerate(sameProducts):
    vectorOfPairs.append(candidates[value])
    
modelIDs=[]
for i in range(1, len(data.keys())+1):
    #modelIDs[i]=data[i]['modelID']
    modelIDs.append(data[i]['modelID'])
    
def listDuplicates(modelIDs):
    dups=collections.defaultdict(set)
    for index, product in enumerate(modelIDs):
        dups[product].add(index)
    duplicates=set()
    for match in dups.values():
        if len(match)>1:
            for pair in itertools.combinations(match,2):
                duplicates.add(pair)
    return duplicates

duplicates=listDuplicates(modelIDs)

TP=0
truePos=[]
trueMod=[]
for i in range(0, len(vectorOfPairs)-1):
    if data[vectorOfPairs[i][0]]['modelID']==data[vectorOfPairs[i][1]]['modelID']:
        TP=TP+1 #number of true positives
        trueMod.append(data[vectorOfPairs[i][1]]['modelID'])
        truePos.append(vectorOfPairs[i])

FP=len(vectorOfPairs)-TP #number of false positives
FN=len(duplicates)-TP

pairCompl=TP/len(duplicates)
pairQual=TP/len(candidates)
precision=TP/(TP+FP)
recall=TP/(TP+FN)
F1=2*(precision*recall)/(precision+recall)
F1star=2*(pairQual*pairCompl)/(pairQual+pairCompl)

In [627]:
print('Pair completenes is: ' + str(pairCompl))
print('Pair quality is: ' + str(pairQual))
print('F1-measure is: ' + str(F1))
print('F1*-measure is: ' + str(F1star))

Pair completenes is: 0.2660753880266075
Pair quality is: 0.0024052916416115455
F1-measure is: 0.0487408610885459
F1*-measure is: 0.004767485747204068


In [591]:
print('kk ' + str(TP))

kk 266


In [None]:
def keyToString(key):
    key=re.sub(r'[^\w\s]','',key)
    string=key.lower()
    string=string.replace(" ", "")
    return string

similarities=[]
for pair in range(0, len(candidates)-1):
    title1=keyToString(data[candidates[pair][0]]['title'])
    title2=keyToString(data[candidates[pair][1]]['title'])
    sim=SequenceMatcher(None, title1, title2).ratio()
    similarities.append(sim)
    
sameProducts=[i for i,e in enumerate(similarities) if e>=0.75]

vectorOfPairs=[]
for same in range(0, len(sameProducts)-1):
    vectorOfPairs.append(candidates[sameProducts[same]])
    
modelIDs=[]
for i in range(1, len(data.keys())+1):
    #modelIDs[i]=data[i]['modelID']
    modelIDs.append(data[i]['modelID'])

def listDuplicates(modelIDs):
    duplicates=collections.defaultdict(list)
    for index, product in enumerate(modelIDs):
        duplicates[product].append(index)
    return((key,locs) for key,locs in duplicates.items() if len(locs)>1)

duplicates=[]
for dup in sorted(listDuplicates(modelIDs)):
    duplicates.append(dup)
    
duplicates_combinations=[]
for i in range(0, len(duplicates)-1):
    if len(duplicates[i][1])==2:
        duplicates_combinations.append(duplicates[i])
    else:
        for pair in itertools.combinations(duplicates[i][1],2):
            duplicates_combinations.append(pair)
    
TP=0
truePos=[]
for i in range(0, len(vectorOfPairs)-1):
    if data[vectorOfPairs[i][0]]['modelID']==data[vectorOfPairs[i][1]]['modelID']:
        TP=TP+1 #number of true positives
        #truePos.append(data[vectorOfPairs[i][1]]['modelID'])
        truePos.append(vectorOfPairs[i])

FP=len(vectorOfPairs)-TP #number of false positives

measure=TP/len(duplicates_combinations)
measure

## Similarity

In [19]:
#1024; (1,1024), (2,512), (4,256), (8,128), (16,64), (32,32), etc.

for i in range(0,11):
    b=2**i
    c=2**(11-i)
    #candidate_pairs=LSHCandidates(signatures, , 4)
    print(b)
    print(c)


1
2048
2
1024
4
512
8
256
16
128
32
64
64
32
128
16
256
8
512
4
1024
2
