In [0]:
from pyspark.sql import functions as f
from pyspark.sql import Window as w
from typing import Dict, List
import logging
import re
import time
import unicodedata
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import itertools
import collections
from sklearn.utils import resample
from scipy.spatial.distance import cosine, jaccard
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

How many true duplicates 2422

In [0]:
df_master = spark.table('data_user_hien.master_data_lsh').select('shop','title','modelID')


In [0]:
display(df_master)

shop,title,modelID
bestbuy.com,"""Philips 4000 Series 29"""" Class 2812"""" Diag. LED 720p 60Hz HDTV 29PFL4508F7 - Best Buy""",29PFL4508/F7
newegg.com,"""Newegg.com - SuperSonic 32"""" 720p LED HDTV SC-3211""",SC-3211
bestbuy.com,"""Sharp AQUOS 90"""" Class 90"""" Diag. LED 1080p 120Hz Smart 3D HDTV LC-90LE657U - Best Buy""",LC-90LE657U
bestbuy.com,"""Philips 2000 Series 39"""" Class 3858"""" Diag. LED 1080p 60Hz Smart HDTV 39PFL2908F7 - Best Buy""",39PFL2908/F7
newegg.com,"""Newegg.com - Sharp Aquos 70"""" Class (69.5"""" Diagonal) 1080p 120Hz LED-LCD HDTV - LC70LE550U""",LC70LE550U
newegg.com,"""Newegg.com - Refurbished: Samsung 40"""" 1080p 120Hz Smart LED HDTV - UN40F6350A""",UN40F6350A
bestbuy.com,"""Samsung 40"""" Class 40"""" Diag. LEDLCD TV 1080p HDTV 1080p HG40NA577LF - Best Buy""",HG40NA577LF
bestbuy.com,"""Samsung 46"""" Class 46"""" Diag. LEDLCD TV 1080p HDTV 1080p Rose Black UN46F5000AF - Best Buy""",UN46F5000AF
bestbuy.com,"""NEC Refurbished 46"""" Class 46"""" Diag. LEDLCD TV 1080p HDTV 1080p E463 - Best Buy""",E463
amazon.com,Samsung UN46ES6580 46-Inch 1080p 120Hz 3D Slim LED HDTV (Black),UN46ES6580


Which columns we select to generate the input text -> select title

In [0]:
def generate_text(df, merge_columns):
    return df[merge_columns].apply(lambda row: ' '.join(row.values.astype(str)).lower(), axis=1)

df_input         = df_master.orderBy(f.rand()).toPandas()
df_input['text'] = generate_text(df_input,  ["title"])


In [0]:
df_input

Unnamed: 0,shop,title,modelID,text
0,bestbuy.com,Naxa Nt1506 16 In. Widescreen HD LED Televisio...,NT-1506,naxa nt1506 16 in. widescreen hd led televisio...
1,newegg.com,"""Sansui 29"""" 720p 60Hz LED-LCD HDTV SLED2900 -...",SLED2900,"""sansui 29"""" 720p 60hz led-lcd hdtv sled2900 -..."
2,newegg.com,"""Newegg.com - Samsung 55"""" 1080p LED 3D TV""",UN55F6400,"""newegg.com - samsung 55"""" 1080p led 3d tv"""
3,bestbuy.com,"""LG 60"""" Class 60"""" Diag. LEDLCD TV 1080p 120 ...",60GA6400,"""lg 60"""" class 60"""" diag. ledlcd tv 1080p 120 ..."
4,newegg.com,"""Newegg.com - Samsung 60"""" Class (59.9"""" Diago...",PN60F5300AFXZA,"""newegg.com - samsung 60"""" class (59.9"""" diago..."
...,...,...,...,...
1619,bestbuy.com,"""Supersonic 19"""" Class 19"""" Diag. LEDLCD TV 72...",SC-1911,"""supersonic 19"""" class 19"""" diag. ledlcd tv 72..."
1620,bestbuy.com,"""RCA 22"""" Class 2112"""" Diag. LED 1080p 60Hz HD...",LED22B45RQD,"""rca 22"""" class 2112"""" diag. led 1080p 60hz hd..."
1621,newegg.com,"""Newegg.com - Refurbished: LG 55"""" Class (54.6...",55G2,"""newegg.com - refurbished: lg 55"""" class (54.6..."
1622,newegg.com,"""Newegg.com - Samsung 478 Series 32"""" Direct-L...",HG32NA478PFXZA,"""newegg.com - samsung 478 series 32"""" direct-l..."


Support Cleaning Text Function

In [0]:
def clean_text(sentence):
    SUBSTITUTE_REGEX = re.compile(r' +')
    KEEP_REGEX = re.compile(r'[a-zA-Z0-9\s]')
    MAX_CHARACTERS_ALLOWED_IN_THE_TITLE = 1000

    text = unicodedata.normalize('NFD', sentence)
    text = text.encode('ascii', 'ignore').decode('utf-8').lower().replace('-',
                                                                          ' ')  # Remove accents and convert to lower case
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = remove_stopwords(text)
    text = ''.join(KEEP_REGEX.findall(text))  # Extract only alphanumeric characters
    text = SUBSTITUTE_REGEX.sub(' ', text).strip()  # Replace multiple spaces with single space
    text = text[: MAX_CHARACTERS_ALLOWED_IN_THE_TITLE].strip()
    text = text.lower()

    return text

    
def remove_stopwords(text):
    text = text.replace('neweggcom',"")
    text = text.replace('best buy',"")
    text = text.replace('thenerdsnet',"")
    stop_words = set(stopwords.words('english'))
    for stop_word in stop_words:
      if stop_word in text.lower():
         text.replace(stop_word, "")
    return text

def correctDuplicates(id_1,id_2):
  if id_1 == id_2:
    isDup = 1
  else:
    isDup = 0
  return isDup

In [0]:
df_input['text'] = df_input.text.apply(lambda x: clean_text(x))


In [0]:
df_input

Unnamed: 0,shop,title,modelID,text
0,bestbuy.com,Naxa Nt1506 16 In. Widescreen HD LED Televisio...,NT-1506,naxa nt1506 16 in widescreen hd led television...
1,newegg.com,"""Sansui 29"""" 720p 60Hz LED-LCD HDTV SLED2900 -...",SLED2900,sansui 29 720p 60hz led lcd hdtv sled2900
2,newegg.com,"""Newegg.com - Samsung 55"""" 1080p LED 3D TV""",UN55F6400,samsung 55 1080p led 3d tv
3,bestbuy.com,"""LG 60"""" Class 60"""" Diag. LEDLCD TV 1080p 120 ...",60GA6400,lg 60 class 60 diag ledlcd tv 1080p 120 hz 3d ...
4,newegg.com,"""Newegg.com - Samsung 60"""" Class (59.9"""" Diago...",PN60F5300AFXZA,samsung 60 class 599 diagonal size 1080p 600hz...
...,...,...,...,...
1619,bestbuy.com,"""Supersonic 19"""" Class 19"""" Diag. LEDLCD TV 72...",SC-1911,supersonic 19 class 19 diag ledlcd tv 720p hdt...
1620,bestbuy.com,"""RCA 22"""" Class 2112"""" Diag. LED 1080p 60Hz HD...",LED22B45RQD,rca 22 class 2112 diag led 1080p 60hz hdtv dvd...
1621,newegg.com,"""Newegg.com - Refurbished: LG 55"""" Class (54.6...",55G2,refurbished lg 55 class 546 diag 3 d ready 108...
1622,newegg.com,"""Newegg.com - Samsung 478 Series 32"""" Direct-L...",HG32NA478PFXZA,samsung 478 series 32 direct lit hospitality l...


Collect Shingles List

In [0]:
config ={
        'perms': 1000,
        'shingle': True,
        'strip': True,
        'k': 5,
    }

In [0]:
def preprocess(text,params):
    shingle = params.get("shingle", False)
    skip_cleaning =  params.get("skip_cleaning", False) 
    text = clean_text(text)
    if not shingle:
        tokens = text.split()
    else:
        k = params['k']
        tokens = to_shingle(text, k, params['strip'])
    return tokens

def to_shingle(text: str, k: int = 3, strip=True):
  if strip:
      text = re.compile(r' +').sub('', text)
  shingles = []
  limit = len(text)-k+1
  for i in range(limit):
      shingles += [text[i:i+k]]
  return shingles

def shingling(data:List,params):
    shingles = []
    for text in data:
        tokens = preprocess(text,params)
        shingles.append(tokens)
    return shingles

shingles = shingling(df_input['text'],config)

Construct model words

In [0]:
def extract_model_words(shingle_sets):
  full_set = {item for set_ in shingle_sets for item in set_}
  model_words = {}
  for i, shingle in enumerate(list(full_set)):
      model_words[shingle] = i
  return model_words

In [0]:
# model_words = extract_model_words(shingles)
# len(model_words)

after generating model words, use model words and shingles list to build Binary Vectors

In [0]:
def one_hot_encoder(shingles: set, model_words: dict):
    vec = np.zeros(len(model_words))
    for shingle in shingles:
        index = model_words[shingle]
        vec[index] = 1
    return vec

In [0]:
# binary_vectors = []
# for shingle in shingles:
#     binary_vectors.append(one_hot_encoder(shingle, model_words))

# binary_vectors = np.stack(binary_vectors)
# binary_vectors.shape

Binary Vectors to Signature Matrix

In [0]:
def minhash_permutation_rand(model_words: dict,perms):
    length = len(model_words.keys())
    row_hash = np.zeros((perms, length))
    for i in range(perms):
        permutation = np.random.permutation(len(model_words)) + 1
        row_hash[i,:] = permutation.copy()
    return row_hash.astype(int)

In [0]:
def get_signature(minhash, vector):
    # get index of 1 value in vector
    index = np.nonzero(vector)[0].tolist()
    shingles = minhash[:, index]
    # find minimum value in each hash vector
    signature = np.min(shingles, axis=1)
    return signature

In [0]:
# signatures = []
# for binary_vector in binary_vectors:
#      signatures.append(get_signature(row_hash, binary_vector))

# # merge signatures into single array
# signature_matrix = np.transpose(np.stack(signatures))
# signature_matrix.shape

In [0]:
def candidate_pairs(signature_matrix, b, r):
    #n: number of perms (hash functions)
    n,d = signature_matrix.shape
    assert(n==b*r)
    hashbuckets = collections.defaultdict(set)
    bands = np.array_split(signature_matrix, b, axis=0)
    
    for i,band in enumerate(bands):
        for j in range(d):
            band_id = tuple(list(band[:,j])+[str(i)])
            hashbuckets[band_id].add(j)
            
    candidate_pairs = set()
    for bucket in hashbuckets.values():
        if len(bucket) > 1:
            for pair in itertools.combinations(bucket, 2):
                candidate_pairs.add(pair)
    return candidate_pairs
                
def lsh_pairs(signature_matrix,candidate_pairs,t):
    lsh_pairs = set()
    for (i, j) in candidate_pairs:
      set_i = set(signature_matrix[:,i].flatten())
      set_j = set(signature_matrix[:,j].flatten())
      if jaccard(set_i,set_j) > t:
           lsh_pairs.add((i, j))
    return lsh_pairs

Fraction of comparisons

In [0]:
# count_possible_comparisons = candidate_pairs(signature_matrix,1000,6)
# len(count_possible_comparisons)

In [0]:
# count_comparisons_made     = lsh_pairs(signature_matrix,count_possible_comparisons,0.4)
# len(count_comparisons_made)

In [0]:
# fraction_comparisons = len(count_comparisons_made)/len(count_possible_comparisons)
# print(fraction_comparisons)

output of LSH

In [0]:
def create_output_dataframe(df_input,count_comparisons_made):
    df_a = pd.DataFrame()
    df_b = pd.DataFrame()
    for a,b in count_comparisons_made:
        df_temp_a = df_input[df_input['row_index'] == a]
        df_temp_b = df_input[df_input['row_index'] == b]
        df_a = df_a.append(df_temp_a)
        df_b = df_b.append(df_temp_b)

    df_a = df_a.add_prefix('left_')
    df_b = df_b.add_prefix('right_')

    df_a.reset_index(drop=True, inplace=True)
    df_b.reset_index(drop=True, inplace=True)
    df_output = pd.concat([df_a, df_b], axis=1)
    df_output["isDup"] = df_output.apply(lambda x: correctDuplicates(x["left_modelID"], x["right_modelID"]), axis = 1)
    
    return df_output

In [0]:
def minhash_jaccard_similarity(text1: str, text2: str):
    list1 = text1.split()
    list2 = text2.split()
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection    
    return float(intersection) / union                                                                                      

In [0]:
def xgboost(df_output):
  df_output = df_output[['left_text','right_text','isDup']]
  df_output['len_left_text'] = df_output.left_text.apply(lambda x: len(str(x)))
  df_output['len_right_text'] = df_output.right_text.apply(lambda x: len(str(x)))
  df_output['diff_len'] = df_output.len_left_text - df_output.len_right_text
  df_output['len_char_left_text'] = df_output.left_text.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
  df_output['len_char_right_text'] = df_output.right_text.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
  df_output['len_word_left_text'] = df_output.left_text.apply(lambda x: len(str(x).split()))
  df_output['len_word_right_text'] = df_output.right_text.apply(lambda x: len(str(x).split()))
  df_output['common_words'] = df_output.apply(lambda x: len(set(str(x['left_text']).lower().split()).intersection(set(str(x['right_text']).lower().split()))), axis=1)
  df_output = df_output.reset_index()
  df_output = df_output[~df_output.isin([np.inf, -np.inf])]
  df_output['minhash_jaccard'] = df_output.apply(lambda x: minhash_jaccard_similarity(x["left_text"],x["right_text"]),axis = 1)
  df_output.drop(["left_text", "right_text"], axis=1, inplace=True)
  X = df_output.loc[:, df_output.columns != 'isDup']
  y = df_output.loc[:, df_output.columns == 'isDup']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
  model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train.values.ravel()) 
  model.fit(X_train,y_train)
  prediction = model.predict(X_test)
  cm=confusion_matrix(y_test, prediction)
  duplicates = cm.sum(axis=1) - np.diag(cm)
  duplicates_found = np.diag(cm)
  return duplicates, duplicates_found

In [0]:
def bootstrap(df_input):
  t_fraction_comparisons = []
  t_pair_quality = []
  t_pair_completeness =[]
  for threshold in np.arange(0.05,1, 0.05):
    print('starting', threshold)
    b_fraction_comparisons = []
    b_pair_quality = []
    b_pair_completeness = []
    for i in range(5):
      boot = df_input.sample(1000, replace=True)
      boot['row_index'] = np.arange(len(boot))
      boot['text'] = boot.text.apply(lambda x: clean_text(x))
      shingles = shingling(boot['text'],config)
      model_words = extract_model_words(shingles)
      binary_vectors = []
      for shingle in shingles:
          binary_vectors.append(one_hot_encoder(shingle, model_words))

      binary_vectors = np.stack(binary_vectors)
      row_hash = minhash_permutation_rand(model_words, perms= 6000)
      signatures = []
      for binary_vector in binary_vectors:
           signatures.append(get_signature(row_hash, binary_vector))
          
      signature_matrix = np.transpose(np.stack(signatures))
      count_possible_comparisons = candidate_pairs(signature_matrix,1000,6)
      count_comparisons_made     = lsh_pairs(signature_matrix,count_possible_comparisons,threshold)
      fraction_comparisons       = len(count_comparisons_made)/len(count_possible_comparisons)
      b_fraction_comparisons.append(fraction_comparisons)
      
      df_output = create_output_dataframe(boot,count_comparisons_made)
      duplicates,duplicates_found  = xgboost(df_output)
      pair_quality               = len(duplicates_found)/len(count_comparisons_made)
      pair_completeness          = len(duplicates_found)/len(duplicates)
      b_pair_quality.append(pair_quality)
      b_pair_completeness.append(pair_completeness)
    
    avg_fraction_comparisons = sum(b_fraction_comparisons)/5
    avg_pair_quality = sum(b_pair_quality)/5
    avg_pair_completeness = sum(b_pair_completeness)/5
    
    t_fraction_comparisons.append(avg_fraction_comparisons)
    t_pair_quality.append(avg_pair_quality)
    t_pair_completeness.append(avg_pair_completeness)
    print('done', threshold)
  return t_fraction_comparisons,t_pair_quality,t_pair_completeness

In [0]:
a,b,c = bootstrap(df_input)

In [0]:
print(a,b,c)

In [0]:
#t_fraction_comparisons
print(a)

In [0]:
#t_pair_quality
print(b)


In [0]:
#t_pair_completeness
print(c)


In [0]:
def calculateF1(PQ,PC):
  F1_list = []
  for index in range(len(PQ)):
    first_part = 2*PQ[index]*PC[index]
    second_part = PQ[index] + PC[index]
    F1 = first_part/second_part
    F1_list.append(F1)
  return F1_list

In [0]:
F1 = calculateF1(b,c)
print(F1)