In [1]:
import pandas as pd
import numpy as np

In [2]:
amazon_train = pd.read_csv("preprocessed_amazon_train.csv", delimiter = ",")
google_train = pd.read_csv("preprocessed_google_train.csv", delimiter = ",")
amazon_test = pd.read_csv("preprocessed_amazon_test.csv", delimiter = ",")
google_test = pd.read_csv("preprocessed_google_test.csv", delimiter = ",")
train_perfect_matching = pd.read_csv("preprocessed_train_perfect_matching.csv", delimiter = ",")
test_perfect_matching = pd.read_csv("preprocessed_test_perfect_matching.csv", delimiter = ",")

In [4]:
import math
import re
from collections import Counter

WORD = re.compile(r"\w+")

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)


def cosine_similarity(str1, str2):
    vec1 = text_to_vector(str1)
    vec2 = text_to_vector(str2)
    cosine_similarity = get_cosine(vec1, vec2)
    return cosine_similarity

In [5]:
# Get the distance matrix
def similarity_matrix_generator(amazon_key,google_key):
    similarity_matrix = np.zeros((len(amazon_key),len(google_key)))
    for i in range(0,len(amazon_key)):
        for j in range(0,len(google_key)):
            similarity_matrix[i][j] = cosine_similarity(amazon_key[i],google_key[j])
    return similarity_matrix

# Get the potential candidate
def potential_matching(amazon_data,google_data,similarity_matrix,threshold):
    candidate_index = np.where(similarity_matrix >threshold)
    # retrieve index for each set
    amazon_index = candidate_index[0]
    google_index = candidate_index[1]
    print("length of amazon index: "+str(len(amazon_index)))
    print("length of google index: "+str(len(google_index)))
    # retrieve id for each set
    amazon_id = (amazon_data["idAmazon"][amazon_index]).tolist()
    google_id = (google_data["idGoogle"][google_index]).tolist()
    # calculate the similarity for each pair
    similarity = []
    for i in range(0,len(amazon_index)):
         similarity.append(round(similarity_matrix[amazon_index[i]][google_index[i]],2))
    # potential candidate
    potential_pairs = pd.DataFrame({"idAmazon":amazon_id,"idGoogle":google_id,"similarity":similarity})
    return potential_pairs

# Generate the labels
def negatives_generator(perfect_matching,potential_matching):
    # check the quality of blcoking
    auxiliary = pd.merge(perfect_matching,potential_matching, on=["idAmazon","idGoogle"], how="outer", indicator=True)
    print("true positve/recall: "+str(len(*np.where(auxiliary["_merge"]=="both"))))
    print("false positive/- samples: "+str(len(*np.where(auxiliary["_merge"]=="right_only"))))
    print("false negative/+ lost: "+str(len(*np.where(auxiliary["_merge"]=="left_only")))+"\n")
    # labelling
    auxiliary["label"] = np.where(auxiliary["_merge"]=="both",1,0) 
    print("No. of positives: "+str(len(*np.where(auxiliary["label"]==1))))
    print("No. of negatives: "+str(len(*np.where(auxiliary["label"]==0)))+"\n")
    auxiliary = auxiliary[["similarity","idAmazon","idGoogle","label"]]
    auxiliary['similarity'].fillna(1,inplace=True) 
    return auxiliary

In [8]:
# cosine similarity matrix for training sets
train_cos_sim = similarity_matrix_generator(amazon_train["name"],google_train["name"])
# cosine similarity matrix for testing sets
test_cos_sim = similarity_matrix_generator(amazon_test["name"],google_test["name"])

In [10]:
train_cos_sim.shape

(1113, 2588)

In [23]:
# potential candidates for training sets
train_potential_matching = potential_matching(amazon_train,google_train,train_cos_sim,0.5)
print("No. of potential pairs in training set: "+str(len(train_potential_matching)))
# potential candidates for testing sets
test_potential_matching = potential_matching(amazon_test,google_test,test_cos_sim,0.5)
print("No. of potential pairs in testing set: "+str(len(test_potential_matching)))
# [idAmazon, idGoogle, similarity]

length of amazon index: 3833
length of google index: 3833
No. of potential pairs in training set: 3833
length of amazon index: 325
length of google index: 325
No. of potential pairs in testing set: 325


In [24]:
# Label
train_index_labels = negatives_generator(train_perfect_matching,train_potential_matching)
# train_index_labels
test_index_labels = negatives_generator(test_perfect_matching,test_potential_matching)
#[similarity, idAmazon, idGoogle, label]

true positve/recall: 806
false positive/- samples: 3027
false negative/+ lost: 260

No. of positives: 806
No. of negatives: 3287

true positve/recall: 173
false positive/- samples: 152
false negative/+ lost: 61

No. of positives: 173
No. of negatives: 213



### Training Set

In [25]:
# Threshold 0.1, 0.2, 0.3, 0.4, 0.5
total_pairs = len(amazon_train)*len(google_train)
total_generated = np.array([170886,61589,18613,8387,3833])
true_positive = np.array([1055,1043,999,945,806])
false_positive = np.array([169831,60546,17614,7442,3027])
false_negative = np.array([11,23,67,121,260])

In [26]:
# Reduction Rate
reduction_rate = 1 - total_generated/total_pairs
# Pair Completeness
recall = true_positive/(true_positive+false_negative)
# pairs quality
precision = true_positive/total_generated
# harmonic_mean
f = recall*precision/(recall+precision)

In [27]:
print("No. of total pairs: "+str(total_pairs))
print("Reduction Rate: "+str(reduction_rate))
print("Pair Completeness: "+str(recall))
print("Pairs Quality: "+str(precision))
print("Harmonic Mean: "+str(f))

No. of total pairs: 2880444
Reduction Rate: [0.94067373 0.97861823 0.99353815 0.9970883  0.9986693 ]
Pair Completeness: [0.98968105 0.97842402 0.93714822 0.88649156 0.75609756]
Pairs Quality: [0.00617371 0.01693484 0.05367216 0.11267438 0.21027915]
Harmonic Mean: [0.00613543 0.01664672 0.05076477 0.09996826 0.16452337]


In [29]:
threshold = [0.1,0.2,0.3,0.4,0.5]
cosine_analysis = pd.DataFrame({"threshold":threshold, "reduction_rate":reduction_rate,"recall":recall,"precision":precision,"harmonic_mean":f})
cosine_analysis

Unnamed: 0,threshold,reduction_rate,recall,precision,harmonic_mean
0,0.1,0.940674,0.989681,0.006174,0.006135
1,0.2,0.978618,0.978424,0.016935,0.016647
2,0.3,0.993538,0.937148,0.053672,0.050765
3,0.4,0.997088,0.886492,0.112674,0.099968
4,0.5,0.998669,0.756098,0.210279,0.164523


In [30]:
cosine_analysis.to_csv("cosine_analysis.csv",index=False)