In [1]:
import pandas as pd
import numpy as np

In [2]:
amazon_train = pd.read_csv("preprocessed_amazon_train.csv", delimiter = ",")
google_train = pd.read_csv("preprocessed_google_train.csv", delimiter = ",")
amazon_test = pd.read_csv("preprocessed_amazon_test.csv", delimiter = ",")
google_test = pd.read_csv("preprocessed_google_test.csv", delimiter = ",")
train_perfect_matching = pd.read_csv("preprocessed_train_perfect_matching.csv", delimiter = ",")
test_perfect_matching = pd.read_csv("preprocessed_test_perfect_matching.csv", delimiter = ",")

In [21]:
# Calculate the jaccard distance
def jaccard_similarity(str1,str2):
    a = set(str1.split())
    b = set(str2.split())
    c = a.intersection(b)
    jaccard_similarity = float(len(c)) / (len(a) + len(b) - len(c))
    return jaccard_similarity

In [22]:
# Get the distance matrix
def similarity_matrix_generator(amazon_key,google_key):
    similarity_matrix = np.zeros((len(amazon_key),len(google_key)))
    for i in range(0,len(amazon_key)):
        for j in range(0,len(google_key)):
            similarity_matrix[i][j] = jaccard_similarity(amazon_key[i],google_key[j])
    return similarity_matrix

# Get the potential candidate
def potential_matching(amazon_data,google_data,similarity_matrix,threshold):
    candidate_index = np.where(similarity_matrix >threshold)
    # retrieve index for each set
    amazon_index = candidate_index[0]
    google_index = candidate_index[1]
    print("length of amazon index: "+str(len(amazon_index)))
    print("length of google index: "+str(len(google_index)))
    # retrieve id for each set
    amazon_id = (amazon_data["idAmazon"][amazon_index]).tolist()
    google_id = (google_data["idGoogle"][google_index]).tolist()
    # calculate the similarity for each pair
    similarity = []
    for i in range(0,len(amazon_index)):
         similarity.append(round(similarity_matrix[amazon_index[i]][google_index[i]],2))
    # potential candidate
    potential_pairs = pd.DataFrame({"idAmazon":amazon_id,"idGoogle":google_id,"similarity":similarity})
    return potential_pairs

# Generate the labels
def negatives_generator(perfect_matching,potential_matching):
    # check the quality of blcoking
    auxiliary = pd.merge(perfect_matching,potential_matching, on=["idAmazon","idGoogle"], how="outer", indicator=True)
    print("true positve/recall: "+str(len(*np.where(auxiliary["_merge"]=="both"))))
    print("false positive/- samples: "+str(len(*np.where(auxiliary["_merge"]=="right_only"))))
    print("false negative/+ lost: "+str(len(*np.where(auxiliary["_merge"]=="left_only")))+"\n")
    # labelling
    auxiliary["label"] = np.where(auxiliary["_merge"]=="both",1,0) 
    print("No. of positives: "+str(len(*np.where(auxiliary["label"]==1))))
    print("No. of negatives: "+str(len(*np.where(auxiliary["label"]==0)))+"\n")
    auxiliary = auxiliary[["similarity","idAmazon","idGoogle","label"]]
    auxiliary['similarity'].fillna(1,inplace=True) 
    return auxiliary

In [23]:
# jaccard similarity matrix for training sets
train_jac_sim = similarity_matrix_generator(amazon_train["name"],google_train["name"])
# jaccard similarity matrix for testing sets
test_jac_sim = similarity_matrix_generator(amazon_test["name"],google_test["name"])

In [36]:
# potential candidates for training sets
train_potential_matching = potential_matching(amazon_train,google_train,train_jac_sim,0.5)
print("No. of potential pairs in training set: "+str(len(train_potential_matching)))
# potential candidates for testing sets
test_potential_matching = potential_matching(amazon_test,google_test,test_jac_sim,0.5)
print("No. of potential pairs in testing set: "+str(len(test_potential_matching)))
# [idAmazon, idGoogle, similarity]

length of amazon index: 843
length of google index: 843
No. of potential pairs in training set: 843
length of amazon index: 124
length of google index: 124
No. of potential pairs in testing set: 124


In [37]:
# Label
train_index_labels = negatives_generator(train_perfect_matching,train_potential_matching)
# train_index_labels
test_index_labels = negatives_generator(test_perfect_matching,test_potential_matching)
#[similarity, idAmazon, idGoogle, label]

true positve/recall: 413
false positive/- samples: 430
false negative/+ lost: 653

No. of positives: 413
No. of negatives: 1083

true positve/recall: 103
false positive/- samples: 21
false negative/+ lost: 131

No. of positives: 103
No. of negatives: 152



### Training Set

In [43]:
# Threshold 0.1, 0.2, 0.3, 0.4, 0.5
total_pairs = len(amazon_train)*len(google_train)
total_generated = np.array([61177,9877,4124,1904,843])
true_positive = np.array([1045,953,821,641,413])
false_positive = np.array([60132,8924,3303,1263,430])
false_negative = np.array([21,113,245,425,653])

In [59]:
# Reduction Rate
reduction_rate = 1 - total_generated/total_pairs
# Pair Completeness
recall = true_positive/(true_positive+false_negative)
# pairs quality
precision = true_positive/total_generated
# harmonic_mean
f = recall*precision/(recall+precision)

In [60]:
print("No. of total pairs: "+str(total_pairs))
print("Reduction Rate: "+str(reduction_rate))
print("Pair Completeness: "+str(recall))
print("Pairs Quality: "+str(precision))
print("Harmonic Mean: "+str(f))

No. of total pairs: 2880444
Reduction Rate: [0.97876126 0.99657101 0.99856828 0.99933899 0.99970734]
Pair Completeness: [0.98030019 0.89399625 0.77016886 0.60131332 0.38742964]
Pairs Quality: [0.01708158 0.09648679 0.19907856 0.33665966 0.48991696]
Harmonic Mean: [0.01678904 0.08708764 0.15818882 0.21582492 0.21634364]


In [61]:
threshold = [0.1,0.2,0.3,0.4,0.5]
jaccard_analysis = pd.DataFrame({"threshold":threshold, "reduction_rate":reduction_rate,"recall":recall,"precision":precision,"harmonic_mean":f})
jaccard_analysis

Unnamed: 0,threshold,reduction_rate,recall,precision,harmonic_mean
0,0.1,0.978761,0.9803,0.017082,0.016789
1,0.2,0.996571,0.893996,0.096487,0.087088
2,0.3,0.998568,0.770169,0.199079,0.158189
3,0.4,0.999339,0.601313,0.33666,0.215825
4,0.5,0.999707,0.38743,0.489917,0.216344


In [62]:
jaccard_analysis.to_csv("jaccard_analysis.csv",index=False)