In [95]:
# Import the packages required
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from collections import Counter

# Read data

In [None]:
path = "../input/amazon-google/"
amazon_train = pd.read_csv(path+"Amazon_train.csv")
google_train = pd.read_csv(path+"Google_train.csv")
amazon_test = pd.read_csv(path+"Amazon_test.csv")
google_test = pd.read_csv(path+"Google_test.csv")
train_perfect_matching = pd.read_csv(path+"AG_perfect_matching_train.csv")
test_perfect_matching = pd.read_csv(path+"AG_perfect_matching_test.csv")

In [None]:
# Drop the first column of train-test datasets
amazon_train = amazon_train.drop(amazon_train.columns[0], axis=1)
google_train = google_train.drop(google_train.columns[0], axis=1)
amazon_test = amazon_test.drop(amazon_test.columns[0], axis=1)
google_test = google_test.drop(google_test.columns[0], axis=1)
# Drop the first column of train-test matching datasets
train_perfect_matching = train_perfect_matching.drop(train_perfect_matching.columns[0], axis=1)
test_perfect_matching = test_perfect_matching.drop(test_perfect_matching.columns[0], axis=1)

# Unify the ID col_name (idAmazon & idGoogle)
amazon_train = amazon_train.rename(columns={"id":"idAmazon","title": "name"})
google_train = google_train.rename(columns={"id":"idGoogle"})
amazon_test = amazon_test.rename(columns = {"id":"idAmazon","title":"name"})
google_test = google_test.rename(columns={"id":"idGoogle"})
train_perfect_matching = train_perfect_matching.rename(columns={"idGoogleBase":"idGoogle"})
test_perfect_matching = test_perfect_matching.rename(columns={"idGoogleBase":"idGoogle"})

# The overview of the size of three training sets
print("No. of Samples in Amazon_Train: "+str(len(amazon_train)))
print("No. of Samples in Google_Train: "+str(len(google_train)))
print("No. of Matching Samples in AG_Train: "+str(len(train_perfect_matching)))
print("")

# The overview of the size of three testing sets
print("No. of Samples in Amazon_Test: "+str(len(amazon_test)))
print("No. of Samples in Google_Test: "+str(len(google_test)))
print("No. of Matching Samples in AG_Test: "+str(len(test_perfect_matching)))

# Preprocess text and numeric data

In [None]:
def text_preprocess(text):
    # remove numbers and convert to lowercase
    text_1 = (re.sub(r'\d+','',str(text))).lower()
    # replace NaN with blank
    if(text_1 == "nan"):
        return " "
    # remove punctuation
    text_2 = "".join([c for c in text_1 if c not in string.punctuation])
    # remove multiple space
    text_3 = re.sub(' +', ' ', text_2)
    # remove Stopwords
    text_tokens = word_tokenize(text_3)
    text_4 = " ".join([word for word in text_tokens if word not in stopwords.words('english')])
#     # Lemmatization
#     # initantiate lemmatizer
#     lemmatizer = WordNetLemmatizer()
#     text_5 = [lemmatizer.lemmatize(i) for i in text_4]
#     # Stem
#     # instantiate Stemmer
#     stemmer = PorterStemmer()
#     text_6 = " ".join([stemmer.stem(j) for j in text_5])
    return text_4.strip()

def num_preprocess(df_col):
    if(df_col.dtype != "float64"):
        # remove char in any
        num_string = '0123456789.'
        df_col = df_col.apply(lambda x: "".join([c for c in x if c in num_string]))
        # change the data type to numeric
        df_col = pd.to_numeric(df_col)
#     # normalization
#     df_col = (df_col - np.mean(df_col))/np.std(df_col)
    return df_col

In [None]:
# Preprocessing for text data
amazon_train["name"] = amazon_train["name"].apply(lambda x: text_preprocess(x))
amazon_train["description"] = amazon_train["description"].apply(lambda x: text_preprocess(x))
amazon_test["name"] = amazon_test["name"].apply(lambda x: text_preprocess(x))
amazon_test["description"] = amazon_test["description"].apply(lambda x: text_preprocess(x))

google_train["name"] = google_train["name"].apply(lambda x: text_preprocess(x))
google_train["description"] = google_train["description"].apply(lambda x: text_preprocess(x))
google_test["name"] = google_test["name"].apply(lambda x: text_preprocess(x))
google_test["description"] = google_test["description"].apply(lambda x: text_preprocess(x))

# Join the name and description data
amazon_train["amazon_text"] = amazon_train["name"]+" "+amazon_train["description"]
amazon_test["amazon_text"] = amazon_test["name"]+" "+amazon_test["description"]
google_train["google_text"] = google_train["name"]+" "+google_train["description"]
google_test["google_text"] = google_test["name"]+" "+google_test["description"]

# Standaization for numerical data
amazon_train["price"] = num_preprocess(amazon_train["price"])
amazon_test["price"] = num_preprocess(amazon_test["price"])

google_train["price"] = num_preprocess(google_train["price"])
google_test["price"] = num_preprocess(google_test["price"])

# Blocking / Indexing

In [None]:
# Calculate the jaccard distance
def jaccard_distance(str1,str2):
    a = set(str1.split())
    b = set(str2.split())
    c = a.intersection(b)
    jaccard_dist = float(len(c)) / (len(a) + len(b) - len(c))
    return jaccard_dist

# Get the distance matrix
def distance_matrix_generator(amazon_key,google_key):
    distance_matrix = np.zeros((len(amazon_key),len(google_key)))
    for i in range(0,len(amazon_key)):
        for j in range(0,len(google_key)):
            distance_matrix[i][j] = jaccard_distance(amazon_key[i],google_key[j])
    return distance_matrix

# Get the potential candidate
def potential_matching(amazon_data,google_data,dist_matrix,threshold):
    candidate_index = np.where(dist_matrix >threshold)
    # retrieve index for each set
    amazon_index = candidate_index[0]
    google_index = candidate_index[1]
    print("length of amazon index: "+str(len(amazon_index)))
    print("length of google index: "+str(len(google_index)))
    # retrieve id for each set
    amazon_id = (amazon_data["idAmazon"][amazon_index]).tolist()
    google_id = (google_data["idGoogle"][google_index]).tolist()
    # calculate the similarity for each pair
    jaccard_similarity = []
    for i in range(0,len(amazon_index)):
         jaccard_similarity.append(round(dist_matrix[amazon_index[i]][google_index[i]],2))
    # potential candidate
    potential_pairs = pd.DataFrame({"idAmazon":amazon_id,"idGoogle":google_id,"similarity":jaccard_similarity})
    return potential_pairs

# Generate the labels
def negatives_generator(perfect_matching,potential_matching):
    # check the quality of blcoking
    auxiliary = pd.merge(perfect_matching,potential_matching, on=["idAmazon","idGoogle"], how="outer", indicator=True)
    print("true positve/recall: "+str(len(*np.where(auxiliary["_merge"]=="both"))))
    print("false positive/- samples: "+str(len(*np.where(auxiliary["_merge"]=="right_only"))))
    print("false negative/+ lost: "+str(len(*np.where(auxiliary["_merge"]=="left_only")))+"\n")
    # labelling
    auxiliary["label"] = np.where(auxiliary["_merge"]=="both",1,0) 
    print("No. of positives: "+str(len(*np.where(auxiliary["label"]==1))))
    print("No. of negatives: "+str(len(*np.where(auxiliary["label"]==0)))+"\n")
    auxiliary = auxiliary[["similarity","idAmazon","idGoogle","label"]]
    auxiliary['similarity'].fillna(1,inplace=True) 
    return auxiliary

# Add fields
def add_fields(potential_index_labels, amazon_data, google_data):
    potential_pairs = potential_index_labels
    amazon_name = []
    amazon_description = []
    amazon_info = []
    amazon_prices = []
    google_name = []
    google_description = []
    google_info = []
    google_prices = []
    for i in range(0, len(potential_index_labels)):
        amazon_name.append((amazon_data.loc[amazon_data["idAmazon"]==potential_index_labels["idAmazon"][i],"name"]).item())
        amazon_description.append((amazon_data.loc[amazon_data["idAmazon"]==potential_index_labels["idAmazon"][i],"description"]).item())
        amazon_info.append((amazon_data.loc[amazon_data["idAmazon"]==potential_index_labels["idAmazon"][i],"amazon_text"]).item())
        amazon_prices.append((amazon_data.loc[amazon_data["idAmazon"]==potential_index_labels["idAmazon"][i],"price"]).item())
        google_name.append((google_data.loc[google_data["idGoogle"]==potential_index_labels["idGoogle"][i],"name"]).item())
        google_description.append((google_data.loc[google_data["idGoogle"]==potential_index_labels["idGoogle"][i],"description"]).item())
        google_info.append((google_data.loc[google_data["idGoogle"]==potential_index_labels["idGoogle"][i],"google_text"]).item())
        google_prices.append((google_data.loc[google_data["idGoogle"]==potential_index_labels["idGoogle"][i],"price"]).item())
    potential_pairs["amazon_name"] = amazon_name
    potential_pairs["google_name"] = google_name
    potential_pairs["amazon_description"] = amazon_description
    potential_pairs["google_description"] = google_description
    potential_pairs["amazon_info"] = amazon_info
    potential_pairs["google_info"] = google_info
    potential_pairs["amazon_price"] = amazon_prices
    potential_pairs["google_price"] = google_prices
    potential_pairs["price_diff"] = np.abs(potential_pairs["amazon_price"] - potential_pairs["google_price"])/potential_pairs[["amazon_price","google_price"]].max(axis=1)
    potential_pairs = potential_pairs[["similarity","idAmazon","idGoogle","amazon_name","google_name","amazon_description","google_description","amazon_info","google_info","amazon_price","google_price", "price_diff","label"]]
    return potential_pairs

In [None]:
# jaccard distance matrix for training sets
train_jaccard_dist = distance_matrix_generator(amazon_train["name"],google_train["name"])
# jaccard distance matrix for testing sets
test_jaccard_dist = distance_matrix_generator(amazon_test["name"],google_test["name"])

In [None]:
# potential candidates for training sets
train_potential_matching = potential_matching(amazon_train,google_train,train_jaccard_dist,0.1)
print("No. of potential pairs in training set: "+str(len(train_potential_matching)))
# potential candidates for testing sets
test_potential_matching = potential_matching(amazon_test,google_test,test_jaccard_dist,0.1)
print("No. of potential pairs in testing set: "+str(len(test_potential_matching)))
# [idAmazon, idGoogle, similarity]

print("")

# Label
train_index_labels = negatives_generator(train_perfect_matching,train_potential_matching)
# train_index_labels
test_index_labels = negatives_generator(test_perfect_matching,test_potential_matching)
#[similarity, idAmazon, idGoogle, label]

In [None]:
# Construct the complete training and testing dataset
train_data = add_fields(train_index_labels, amazon_train, google_train)
test_data = add_fields(test_index_labels, amazon_test, google_test)

#[similarity, idAmazon, idGoogle, amazon_info, google_info, amazon_price, google_price, price_diff, label]

# Split val set out of the train set first

In [114]:
path = '../input/prepared-data/'
train_data = pd.read_csv(path+'train_data.csv')
test_data = pd.read_csv(path+'test_data.csv')

## Create Price NAN indicator

In [116]:
def price_nan_indicator(amazon_price, google_price):
    if amazon_price == 0 or google_price == 0:
        return 1
    else:
        return 0
    
train_data["price_nan_indicator"] = train_data.apply(lambda x: price_nan_indicator(x.amazon_price, x.google_price), axis=1)
test_data["price_nan_indicator"] = test_data.apply(lambda x: price_nan_indicator(x.amazon_price, x.google_price), axis=1)

In [118]:
X = train_data[["similarity","label","amazon_name","google_name","amazon_description","google_description","amazon_info","google_info","price_diff","price_nan_indicator"]]
y = train_data["label"]

X, y = shuffle(X, y)
train_X, val_X, train_y, val_y = train_test_split(X, y,test_size=0.2,random_state=42)

print("val set: ",len(val_y))
print(Counter(val_y))
print("train set: ",len(train_y))
print(Counter(train_y))

val set:  12240
Counter({0: 12039, 1: 201})
train set:  48958
Counter({0: 48114, 1: 844})


In [104]:
train_set = pd.concat([train_X, train_y], axis=1)
val_set = pd.concat([val_X, val_y], axis=1)

In [108]:
train_set.columns

Index(['similarity', 'label', 'amazon_name', 'google_name',
       'amazon_description', 'google_description', 'amazon_info',
       'google_info', 'price_diff', 'price_nan_indicator', 'label'],
      dtype='object')

In [119]:
test_set = test_data[['similarity', 'label', 'amazon_name', 'google_name',
       'amazon_description', 'google_description', 'amazon_info',
       'google_info', 'price_diff', 'price_nan_indicator']]

In [121]:
test_set.head(1)

Unnamed: 0,similarity,label,amazon_name,google_name,amazon_description,google_description,amazon_info,google_info,price_diff,price_nan_indicator
0,1.0,1,clickart premier image pack dvdrom,clickart premier image pack dvdrom,,massive collection images fonts design needs o...,clickart premier image pack dvdrom,clickart premier image pack dvdrom massive col...,1.0,1


In [107]:
train_set.head(1)

Unnamed: 0,similarity,label,amazon_name,google_name,amazon_description,google_description,amazon_info,google_info,price_diff,price_nan_indicator,label.1
26488,0.12,0,quickbooks premier manufacturer wholesale edition,backyard soccer mls edition,quickbooks premier manufacturing wholesale edi...,kids get chance play alongside junior versions...,quickbooks premier manufacturer wholesale edit...,backyard soccer mls edition kids get chance pl...,0.977622,0,0


In [123]:
# Save train, val, test set
train_X.to_csv("train_set.csv", index = False)
val_X.to_csv("val_set.csv", index = False)
test_set.to_csv("test_set.csv", index = False)

# =============================Ignore Below=============================

In [None]:
data_amazon_text = train_data["amazon_info"].tolist()
data_google_text = train_data["google_info"].tolist()
# Combine all text data
all_data_text = data_amazon_text + data_google_text

# Retrieve all the tokens in the dataset
def create_tokenizer(all_text):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_text)
    word_index = tokenizer.word_index
    print("Found %s unique tokens"%len(word_index))
    return tokenizer

# Create tokenizer according to all the words in train data
tokenizer = create_tokenizer(all_data_text)

def tokenization (tokenizer, text, maxlen):
    #convert to integer lists
    sequences = tokenizer.texts_to_sequences(text)
    # padding
    sequences = pad_sequences(sequences, maxlen)
    return sequences

In [None]:
# amazon_name = train_data["amazon_name"].tolist()
# amazon_name_sequence = tokenization(tokenizer,amazon_name, maxlen=200)
# google_name = train_data["google_name"].tolist()
# google_name_sequence = tokenization(tokenizer,google_name, maxlen=200)
# amazon_description = train_data["amazon_description"].tolist()
# amazon_desc_sequence =  tokenization(tokenizer,amazon_description, maxlen=200)
# google_description = train_data["google_description"].tolist()
# google_desc_sequence = tokenization(tokenizer,google_description, maxlen=200)
# amazon_info = train_data["amazon_info"].tolist()
# amazon_info_sequence = tokenization(tokenizer,amazon_info, maxlen=200)
# google_info = train_data["google_info"].tolist()
# google_info_sequence = tokenization(tokenizer,google_info, maxlen=200)

In [None]:
# train_data = train_data.drop("amazon_name_seq", axis=1)
# train_data = train_data.drop("google_name_seq", axis=1)
# train_data = train_data.drop("amazon_desc_seq", axis=1)
# train_data = train_data.drop("google_desc_seq", axis=1)
# train_data = train_data.drop("amazon_info_seq", axis=1)
# train_data = train_data.drop("google_info_seq", axis=1)

In [None]:
# amazon_name_list = []
# amazon_desc_list = []
# amazon_info_list = []
# google_name_list = []
# google_desc_list = []
# google_info_list = []
# for i in range(0,len(train_data)):
#     amazon_name_list.append(amazon_name_sequence[i].tolist())
#     amazon_desc_list.append(amazon_desc_sequence[i].tolist())
#     amazon_info_list.append(amazon_info_sequence[i].tolist())
#     google_name_list.append(google_name_sequence[i].tolist())
#     google_desc_list.append(google_desc_sequence[i].tolist())
#     google_info_list.append(google_info_sequence[i].tolist())
    
# amazon_name_list 
# train_data["amazon_name_seq"] = amazon_name_list
# train_data["google_name_seq"] = google_name_list
# train_data["amazon_desc_seq"] = amazon_desc_list
# train_data["google_desc_seq"] = google_desc_list
# train_data["amazon_info_seq"] = amazon_info_list
# train_data["google_info_seq"] = google_info_list

In [None]:
# train_data["amazon_name_seq"] = train_data["amazon_name_seq"].apply(lambda x: np.asarray(x))
# train_data["google_name_seq"] = train_data["google_name_seq"].apply(lambda x: np.asarray(x))
# train_data["amazon_desc_seq"] = train_data["amazon_desc_seq"].apply(lambda x: np.asarray(x))
# train_data["google_desc_seq"] = train_data["google_desc_seq"].apply(lambda x: np.asarray(x))
# train_data["amazon_info_seq"] = train_data["amazon_info_seq"].apply(lambda x: np.asarray(x))
# train_data["google_info_seq"] = train_data["google_info_seq"].apply(lambda x: np.asarray(x))

In [None]:
# train_data["amazon_name_seq"] = amazon_name_sequence.tolist()
# train_data["amazon_desc_seq"] = amazon_desc_sequence.tolist()
# train_data["google_name_seq"] = google_name_sequence.tolist()
# train_data["google_desc_seq"] = google_desc_sequence.tolist()
# train_data["amazon_info_seq"] = amazon_info_sequence.tolist()
# train_data["amazon_info_seq"] = amazon_info_sequence.tolist()

In [None]:
# train_data.to_csv("train_data_full.csv",index=False)

In [None]:
# test_amazon_name = test_data["amazon_name"].tolist()
# test_amazon_name_sequence = tokenization(tokenizer,test_amazon_name, maxlen=200)
# test_google_name = test_data["google_name"].tolist()
# test_google_name_sequence = tokenization(tokenizer,test_google_name, maxlen=200)
# test_amazon_description = test_data["amazon_description"].tolist()
# test_amazon_desc_sequence =  tokenization(tokenizer,test_amazon_description, maxlen=200)
# test_google_description = test_data["google_description"].tolist()
# test_google_desc_sequence = tokenization(tokenizer,test_google_description, maxlen=200)
# test_amazon_info = test_data["amazon_info"].tolist()
# test_amazon_info_sequence = tokenization(tokenizer,test_amazon_info, maxlen=200)
# test_google_info = test_data["google_info"].tolist()
# test_google_info_sequence = tokenization(tokenizer,test_google_info, maxlen=200)

In [None]:
# test_data["amazon_name_seq"] = test_amazon_name_sequence.tolist()
# test_data["amazon_desc_seq"] = test_amazon_desc_sequence.tolist()
# test_data["google_name_seq"] = test_google_name_sequence.tolist()
# test_data["google_desc_seq"] = test_google_desc_sequence.tolist()
# test_data["amazon_info_seq"] = test_amazon_info_sequence.tolist()
# test_data["amazon_info_seq"] = test_amazon_info_sequence.tolist()

In [None]:
print(test_data.shape)
test_data.head(1)

In [None]:
test_data.to_csv("test_data_full.csv", index=False)