In [1]:
# Import the packages required
import os
from pathlib import Path
import pandas as pd
import numpy as np
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [2]:
# Folder that holds the Amazon/Google train-test CSV files.
# Set the AG_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original local path is used.
data_folder = Path(
    os.environ.get(
        "AG_DATA_DIR",
        "/Users/jingjing/SEM2/Dissertations/Project 2/Data/Amazon_Google_train_test/",
    )
)

# Product tables for each source and split (comma is read_csv's default separator).
amazon_train = pd.read_csv(data_folder / "Amazon_train.csv")
google_train = pd.read_csv(data_folder / "Google_train.csv")
amazon_test = pd.read_csv(data_folder / "Amazon_test.csv")
google_test = pd.read_csv(data_folder / "Google_test.csv")

# Amazon-Google matching tables for each split.
train_perfect_matching = pd.read_csv(data_folder / "AG_perfect_matching_train.csv")
test_perfect_matching = pd.read_csv(data_folder / "AG_perfect_matching_test.csv")

In [3]:
# For every product table: discard the leading column, then unify the
# identifier column name (idAmazon / idGoogle). Amazon's "title" column is
# also renamed to "name".
amazon_train = (
    amazon_train
    .drop(columns=amazon_train.columns[0])
    .rename(columns={"id": "idAmazon", "title": "name"})
)
google_train = (
    google_train
    .drop(columns=google_train.columns[0])
    .rename(columns={"id": "idGoogle"})
)
amazon_test = (
    amazon_test
    .drop(columns=amazon_test.columns[0])
    .rename(columns={"id": "idAmazon", "title": "name"})
)
google_test = (
    google_test
    .drop(columns=google_test.columns[0])
    .rename(columns={"id": "idGoogle"})
)

# Same treatment for the matching tables: drop the leading column and rename
# "idGoogleBase" to "idGoogle".
train_perfect_matching = (
    train_perfect_matching
    .drop(columns=train_perfect_matching.columns[0])
    .rename(columns={"idGoogleBase": "idGoogle"})
)
test_perfect_matching = (
    test_perfect_matching
    .drop(columns=test_perfect_matching.columns[0])
    .rename(columns={"idGoogleBase": "idGoogle"})
)

# Row counts of the three training tables
print(f"No. of Samples in Amazon_Train: {len(amazon_train)}")
print(f"No. of Samples in Google_Train: {len(google_train)}")
print(f"No. of Matching Samples in AG_Train: {len(train_perfect_matching)}")
print()

# Row counts of the three testing tables
print(f"No. of Samples in Amazon_Test: {len(amazon_test)}")
print(f"No. of Samples in Google_Test: {len(google_test)}")
print(f"No. of Matching Samples in AG_Test: {len(test_perfect_matching)}")

No. of Samples in Amazon_Train: 1113
No. of Samples in Google_Train: 2588
No. of Matching Samples in AG_Train: 1066

No. of Samples in Amazon_Test: 250
No. of Samples in Google_Test: 638
No. of Matching Samples in AG_Test: 234


In [4]:
def text_preprocess(text):
    """Normalise one free-text cell value.

    Processing steps, in order: remove digits, lowercase, remove ASCII
    punctuation, collapse runs of spaces, tokenize with NLTK and drop
    English stopwords. Lemmatization and stemming are not applied.

    Parameters
    ----------
    text : object
        The cell value. Non-string values are converted with ``str``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, or ``" "`` when the
        value is missing (``None``, NaN, ``pd.NA``).
    """
    # Detect missing values on the raw input. Comparing the cleaned string
    # with "nan" would also blank genuine text that reduces to "nan" once
    # digits are stripped, and would let None through as the word "none".
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return " "

    # remove numbers and convert to lowercase
    lowered = re.sub(r'\d+', '', str(text)).lower()
    # remove punctuation
    no_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    # remove multiple space
    collapsed = re.sub(' +', ' ', no_punct)

    # Build the stopword set once per call; evaluating stopwords.words()
    # inside the comprehension reloaded the list for every single token.
    stop_words = set(stopwords.words('english'))
    kept_tokens = [tok for tok in word_tokenize(collapsed) if tok not in stop_words]
    return " ".join(kept_tokens).strip()

def standardization(df_col):
    """Z-score standardise a column, parsing numbers out of text if needed.

    Parameters
    ----------
    df_col : pandas.Series
        Either a numeric column or a column of strings that contain a number
        mixed with other characters (e.g. ``"19.99 gbp"``).

    Returns
    -------
    pandas.Series
        ``(value - mean) / std`` using the population standard deviation
        (``ddof=0``, the same value ``np.std`` produces). Missing entries
        stay NaN and are ignored when computing the mean and deviation.

    Raises
    ------
    ValueError
        If a string entry cannot be parsed as a number after every character
        other than digits and ``.`` has been removed (e.g. ``"1.2.3"``).
    """
    # Any numeric dtype can be standardised directly. Testing only for
    # "float64" pushed integer columns into the text branch, where iterating
    # over an int raised TypeError.
    if not pd.api.types.is_numeric_dtype(df_col):
        # remove char in any
        num_string = '0123456789.'
        # Only strings are filtered; NaN and already-numeric entries pass
        # through untouched so pd.to_numeric can handle them.
        df_col = df_col.apply(
            lambda x: "".join(c for c in x if c in num_string) if isinstance(x, str) else x
        )
        # change the data type to numeric
        df_col = pd.to_numeric(df_col)
    # standardization
    return (df_col - df_col.mean()) / df_col.std(ddof=0)

In [5]:
# Clean the three text columns of every product table
for frame in (amazon_train, amazon_test, google_train, google_test):
    for column in ("name", "description", "manufacturer"):
        frame[column] = frame[column].apply(text_preprocess)

# Concatenate cleaned name and description into one text column per source
for frame, text_column in (
    (amazon_train, "amazon_text"),
    (amazon_test, "amazon_text"),
    (google_train, "google_text"),
    (google_test, "google_text"),
):
    frame[text_column] = frame["name"] + " " + frame["description"]

# Standardize the numerical price column; each table is scaled with its own
# mean and standard deviation
for frame in (amazon_train, amazon_test, google_train, google_test):
    frame["price"] = standardization(frame["price"])

In [6]:
# Write each table to a CSV file in the current working directory,
# leaving out the DataFrame index.
csv_exports = [
    ("preprocessed_amazon_train.csv", amazon_train),
    ("preprocessed_amazon_test.csv", amazon_test),
    ("preprocessed_google_train.csv", google_train),
    ("preprocessed_google_test.csv", google_test),
    ("preprocessed_train_perfect_matching.csv", train_perfect_matching),
    ("preprocessed_test_perfect_matching.csv", test_perfect_matching),
]
for csv_name, table in csv_exports:
    table.to_csv(csv_name, index=False)

In [7]:
# Display the Amazon training table after text cleaning and price standardization
amazon_train

Unnamed: 0,idAmazon,name,description,manufacturer,price,amazon_text
0,b0006zf55o,ca international arcserve lapdesktop oem pk,oem arcserve backup v win u laptops desktops,computer associates,-0.112693,ca international arcserve lapdesktop oem pk oe...
1,b00004tkvy,noahs ark activity center jewel case ages,,victory multimedia,-0.112693,noahs ark activity center jewel case ages
2,b000g80lqo,peachtree sage premium accounting nonprofits,peachtree premium accounting nonprofits afford...,sage software,0.062647,peachtree sage premium accounting nonprofits p...
3,b0006se5bq,singing coach unlimited,singing coach unlimited electronic learning pr...,carryatune technologies,-0.083472,singing coach unlimited singing coach unlimite...
4,b000ehpzv8,emc retrospect disk disk windows,emc retrospect disk diskcromwindows,dantz,-0.112693,emc retrospect disk disk windows emc retrospec...
...,...,...,...,...,...,...
1108,b000in8mj0,photostory cd dvd,magix photostory cd dvd answers question photo...,magix entertainment,-0.106851,photostory cd dvd magix photostory cd dvd answ...
1109,b000cs3s2c,flash remoting alp ret eng cd u,marketing information macromedia flash remotin...,adobe,0.855812,flash remoting alp ret eng cd u marketing info...
1110,b00005bigp,shapes,,school zone,-0.109773,shapes
1111,b000h1df7w,dragon naturally speaking standard v,dragon naturallyspeaking standard edition give...,nuance communications inc,-0.083472,dragon naturally speaking standard v dragon na...
