In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.shell import spark
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import matplotlib.pyplot as plt


# Build (or fetch) the SparkSession used to read the merchant table.
# Settings requested: eager DataFrame rendering in the notebook, Parquet
# metadata caching, a UTC session time zone and 4g of driver memory.
# NOTE(review): the `from pyspark.shell import spark` import above appears to
# start a session already (see the shell banner in this cell's output), in
# which case getOrCreate() would hand back that session -- confirm that
# settings such as spark.driver.memory actually take effect here.
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)


Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.1.2
      /_/

Using Python version 3.7.4 (default, Aug 13 2019 15:17:50)
Spark context Web UI available at http://169.254.224.157:4041
Spark context available as 'sc' (master = local[*], app id = local-1662221572861).
SparkSession available as 'spark'.


In [2]:
# Load the merchant table from Parquet into a Spark DataFrame.
sdf = spark.read.parquet("../data/tables/tbl_merchants.parquet")
# sdf.printSchema
# Number of distinct rows in the merchant table (displayed as the cell output).
sdf.distinct().count()


4026

In [3]:
# Convert to a pandas DataFrame for convenience; toPandas() collects every
# row to the driver, which is fine for a table of a few thousand merchants.
df = sdf.toPandas()


In [4]:
import re


def extract_tags(arr, category="tags"):
    """Pull one component out of a raw merchant ``tags`` string.

    The raw string holds three bracketed components in order -- the tag
    description, the revenue level and the take rate -- wrapped in an outer
    bracket pair, e.g. ``"((shoe shops), (a), (take rate: 6.57))"``. Round and
    square brackets may be used interchangeably.

    Parameters
    ----------
    arr : str
        Raw tags string.
    category : str, default "tags"
        ``"take_rate"`` returns the numeric part of the third component as a
        string, ``"revenue_level"`` returns the lower-cased second component,
        and any other value returns the lower-cased first component.

    Returns
    -------
    str
        The requested component.

    Raises
    ------
    IndexError
        If the requested component (or a number inside the take-rate
        component) is not present in ``arr``.
    """
    # Drop the outermost bracket pair, then any bracket characters still
    # hugging either end, so that only the inner separators remain.
    inner = arr[1:-1].strip("[()]")

    # Components are separated by: closing bracket, ", ", opening bracket.
    # Either bracket style is accepted on either side of the comma.
    components = re.split(r"[)\]], [(\[]", inner)

    if category == "take_rate":
        # First number in the component: optional integer part, optional
        # decimal point, then at least one digit (a lone "." never matches).
        return re.findall(r"\d*\.?\d+", components[2])[0]

    if category == "revenue_level":
        return components[1].lower()

    return components[0].lower()


In [5]:
# Split the raw `tags` string into its three components, one column each.
# Only the `tags` column is needed, so map over that Series directly.
raw_tags = df["tags"]
df["tag"] = raw_tags.map(lambda raw: extract_tags(raw, "tags"))
df["revenue_lvl"] = raw_tags.map(lambda raw: extract_tags(raw, "revenue_level"))
df["take_rate"] = raw_tags.map(lambda raw: extract_tags(raw, "take_rate"))

# df.head(1)
# Independent copy of the tag descriptions for the text-cleaning steps.
tag_col = df["tag"].copy()

In [6]:
# preprocess the data, with the following pipeline
from nltk.metrics.distance import jaccard_distance
from nltk.metrics.distance import edit_distance
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
import nltk 
# Fetch every NLTK resource the Preprocessor class relies on, not just the
# word list: stopwords.words(), word_tokenize() (punkt) and
# WordNetLemmatizer (wordnet) raise LookupError on a machine without them.
# Resources that are already installed are reported as up to date.
for nltk_resource in ("words", "stopwords", "punkt", "wordnet"):
    nltk.download(nltk_resource)
from autocorrect import Speller

# use a class notion so only need to init corpus words and lemmatizer once
# API: preprocess, which takes a single entry of tag, and returns preprocessed tag
class Preprocessor:
    """Clean a merchant tag string into comma-separated, normalised items.

    The NLTK word list, stop-word set, lemmatizer and (for the ``"auto"``
    method) the spell checker are created once in the constructor, so
    ``preprocess`` can be called repeatedly without repeating that set-up.

    Parameters
    ----------
    correct_method : str, default "auto"
        Spelling-correction strategy:

        * ``"auto"`` -- ``autocorrect.Speller``;
        * ``"jaccard"`` -- closest corpus word by character-bigram Jaccard
          distance (better for minor typos);
        * ``"edit_distance"`` -- closest corpus word by edit distance.

    Raises
    ------
    ValueError
        If ``correct_method`` is not one of the strategies above.
    """

    CORRECT_METHODS = ("auto", "jaccard", "edit_distance")

    def __init__(self, correct_method="auto"):
        # Fail fast on a typo in the method name; otherwise the mistake only
        # shows up later as an AttributeError on the missing auto_corrector.
        if correct_method not in self.CORRECT_METHODS:
            raise ValueError(
                f"correct_method must be one of {self.CORRECT_METHODS}, "
                f"got {correct_method!r}"
            )

        self.correct_method = correct_method
        self.lemmatizer = WordNetLemmatizer()
        self.correct_words = words.words()
        self.stopwords = set(stopwords.words('english'))

        if correct_method == "auto":
            self.auto_corrector = Speller(lang='en')

    def __correct_spelling__(self, word):
        """Return the spelling-corrected form of a single token.

        For the corpus-based methods only corpus words sharing the token's
        first character are considered; the closest one wins and ties keep
        corpus order. The token is returned unchanged when it is empty or
        when no candidate exists.
        """
        if not word:
            return word

        if self.correct_method == "jaccard":
            # A one-character token has no bigrams; comparing it with an
            # equally short corpus word gives two empty sets, which makes
            # NLTK's jaccard_distance divide by zero.
            if len(word) < 2:
                return word
            word_bigrams = set(ngrams(word, 2))
            scored = [
                (jaccard_distance(word_bigrams, set(ngrams(candidate, 2))), candidate)
                for candidate in self.correct_words
                if candidate[:1] == word[0]
            ]
        elif self.correct_method == "edit_distance":
            scored = [
                (edit_distance(word, candidate), candidate)
                for candidate in self.correct_words
                if candidate[:1] == word[0]
            ]
        else:
            return self.auto_corrector(word)

        if not scored:
            return word

        # sorted() is stable, so equal distances keep corpus order. The
        # builtin min() is avoided on purpose: the notebook's
        # `from pyspark.sql.functions import *` shadows it.
        return sorted(scored, key=lambda pair: pair[0])[0][1]

    def preprocess(self, tag):
        """Return the cleaned version of one tag string.

        Pipeline: lower-case -> remove everything except letters, whitespace
        and commas -> split on commas -> per item: tokenize, correct
        spelling, drop stop words, lemmatize, drop tokens of two characters
        or fewer. Items are re-joined with ``","`` and the tokens inside an
        item with a single space. An item whose tokens are all removed is
        kept as an empty string, so the number of items is preserved.
        """
        # case standardization
        tag = tag.lower()
        # punctuation and number removal (commas survive as item separators)
        tag = re.sub(r'[^a-zA-Z\s,]', '', tag)

        cleaned_items = []
        for item in tag.split(','):
            tokens = word_tokenize(item.strip())

            # correct spelling
            tokens = [self.__correct_spelling__(token) for token in tokens]

            # stop word removal
            tokens = [token for token in tokens if token not in self.stopwords]

            # lemmatization
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

            # keep only tokens longer than two characters
            cleaned_items.append(" ".join(token for token in tokens if len(token) > 2))

        return ",".join(cleaned_items)


[nltk_data] Downloading package words to /Users/oliver/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
# p = Preprocessor()
# p.__correct_spelling__("friuts")
# # p.preprocess("appleee ,")


In [13]:
preprocessor = Preprocessor()
# Many merchants share the same tag text, so clean each distinct tag once
# and look the result up per row instead of spell-correcting every row.
cleaned_by_raw_tag = {
    raw_tag: preprocessor.preprocess(raw_tag) for raw_tag in df["tag"].unique()
}
# Derived from df["tag"] (which tag_col was copied from) rather than from
# tag_col itself, so re-running this cell always gives the same result.
tag_col = df["tag"].map(cleaned_by_raw_tag)
    

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Each comma-separated item of a cleaned tag is one token, so every distinct
# item becomes one count column.
vectorizer = CountVectorizer(tokenizer=lambda text: text.split(","))
X = vectorizer.fit_transform(tag_col)

In [15]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# toarray() gives a plain ndarray instead of the np.matrix that todense() returns.
count_vect_df = pd.DataFrame(X.toarray(), columns=feature_names)

In [16]:
# Put the per-category count columns next to the merchant columns
# (rows are matched on the index).
merchant_category = pd.concat(objs=[df, count_vect_df], axis="columns")

In [17]:
# Display the merchant table together with its category count columns.
merchant_category

Unnamed: 0,name,tags,merchant_abn,tag,revenue_lvl,take_rate,antique shop sale,appliance rent leasing,art dealer gallery,artist supply craft shop,...,shoe shop,silverware shop,software,souvenir shop,stationery,telecom,tent owning shop,tool,toy game shop,watch
0,Felis Limited,"((furniture, home furnishings and equipment sh...",10023283211,"furniture, home furnishings and equipment shop...",e,0.18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Arcu Ac Orci Corporation,"([cable, satellite, and otHer pay television a...",10142254217,"cable, satellite, and other pay television and...",b,4.22,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Nunc Sed Company,"([jewelry, watch, clock, and silverware shops]...",10165489824,"jewelry, watch, clock, and silverware shops",b,4.40,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,Ultricies Dignissim Lacus Foundation,"([wAtch, clock, and jewelry repair shops], [b]...",10187291046,"watch, clock, and jewelry repair shops",b,3.29,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Enim Condimentum PC,"([music shops - musical instruments, pianos, a...",10192359162,"music shops - musical instruments, pianos, and...",a,6.33,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4021,Elit Dictum Eu Ltd,"[(opticians, optical goods, and eyeglasses), (...",99938978285,"opticians, optical goods, and eyeglasses",b,4.50,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4022,Mollis LLP,"((books, periodicals, and newspapers), (b), (t...",99974311662,"books, periodicals, and newspapers",b,3.17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4023,Sociosqu Corp.,"((shoe shops), (a), (take rate: 6.57))",99976658299,shoe shops,a,6.57,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4024,Commodo Hendrerit LLC,"[[motor vehicle Supplies and new parts], [a], ...",99987905597,motor vehicle supplies and new parts,a,6.82,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Spot check: merchants whose tag contains the "watch" item.
merchant_category.loc[merchant_category["watch"].ne(0)]

Unnamed: 0,name,tags,merchant_abn,tag,revenue_lvl,take_rate,antique shop sale,appliance rent leasing,art dealer gallery,artist supply craft shop,...,shoe shop,silverware shop,software,souvenir shop,stationery,telecom,tent owning shop,tool,toy game shop,watch
2,Nunc Sed Company,"([jewelry, watch, clock, and silverware shops]...",10165489824,"jewelry, watch, clock, and silverware shops",b,4.40,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,Ultricies Dignissim Lacus Foundation,"([wAtch, clock, and jewelry repair shops], [b]...",10187291046,"watch, clock, and jewelry repair shops",b,3.29,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,Ipsum Primis Ltd,"[[watch, clock, and jewelry repair shops], [c]...",10264435225,"watch, clock, and jewelry repair shops",c,2.39,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
25,Egestas A Associates,"[(jewelry, watch, clock, and siLverware shops)...",10596295795,"jewelry, watch, clock, and silverware shops",a,6.84,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
36,Sed Facilisis Vitae Incorporated,"((jewelry, watch, clock, and silverware shops)...",10881038707,"jewelry, watch, clock, and silverware shops",a,6.24,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3973,At Company,"[[watch, clock, and jewelry repair shops], [b]...",98626476096,"watch, clock, and jewelry repair shops",b,4.17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3986,Auctor Vitae Aliquet Associates,"([jewelry, watcH, clock, and silverware shops]...",99022662131,"jewelry, watch, clock, and silverware shops",b,5.06,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3999,Senectus LLP,"((watch, clock, and jewelry repair shops), (a)...",99319455478,"watch, clock, and jewelry repair shops",a,6.99,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4006,Sed Diam Industries,"([jewelry, watch, clock, and silverware shops]...",99473039626,"jewelry, watch, clock, and silverware shops",b,4.15,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [22]:
# One column per distinct tag item produced by the vectorizer.
print(f"Number of categories: {count_vect_df.shape[1]}")

Number of categories: 57


In [None]:
# Persist the cleaned tag strings (index included) for later stages.
tag_col.to_csv("../data/curated/tag_col_preprocessed.csv")

In [None]:
# Distinct cleaned tag strings, in order of first appearance.
tag_category_array = pd.unique(tag_col)

In [None]:
# load retail category
import json

try:
    # `with` closes the file on every path, so no separate close() is needed.
    with open('retail_category.json') as f:
        # JSON object keys are always strings; convert them back to int so
        # the dictionary has the same key type whether it was loaded here or
        # freshly built below.
        category_dict = {int(key): category for key, category in json.load(f).items()}
    print("retail category loaded")
except (FileNotFoundError, json.JSONDecodeError):
    # Only a missing or unparsable file triggers a rebuild; any other error
    # (permissions, unexpected key format, ...) is left to surface.
    print("category file not loaded")
    # if retail dictionary doesn't exist, build one from the tags in the data
    category_dict = {i: category for i, category in enumerate(tag_category_array)}
    with open('retail_category.json', 'w') as f:
        json.dump(category_dict, f)
    print("retail category created")


In [None]:
# Inspect the cleaned tag strings.
tag_col

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
COS_THRESHHOLD = 0.5 # tune this 

# match the new tag to existing tag
existing_category = set(category_dict.values())

data_category = set(tag_category_array)

# categories present in the data but not yet in the stored dictionary
potential_new_category = data_category.difference(existing_category)
potential_new_category_list = list(potential_new_category)

# similar category are treated as distinct values
all_category = existing_category.union(data_category)

# create a tf-idf vectorizer
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english", max_df=0.5, min_df=1, ngram_range=(1, 1)
)
tfidf_vectorizer.fit(list(all_category))

# Next free integer key. sorted() is used instead of max() because the
# notebook's `from pyspark.sql.functions import *` shadows the builtin max.
existing_keys = sorted(int(key) for key in category_dict)
next_key = existing_keys[-1] + 1 if existing_keys else 0

for new_category in potential_new_category_list:
    # Categories accepted earlier in this loop are compared against too, so
    # the existing set is re-vectorised on every pass.
    if not existing_category:
        continue
    tfidf_vec_new = tfidf_vectorizer.transform([new_category])
    tfidf_vec_existing = tfidf_vectorizer.transform(list(existing_category))
    best_similarity = cosine_similarity(tfidf_vec_new, tfidf_vec_existing).max()

    # NOTE(review): a new category is registered only when it is similar to
    # an existing one; a category unlike every existing one is never added
    # and will later fail the lookup assertion in the standardisation cell.
    # Confirm whether this condition is intended or should be inverted.
    if best_similarity >= COS_THRESHHOLD:
        existing_category.add(new_category)
        category_dict[next_key] = new_category
        next_key += 1
    




In [None]:
# Number of distinct categories across the stored dictionary and the data.
len(all_category)

In [None]:
# Write the (possibly extended) category dictionary back to disk.
with open('retail_category.json', 'w') as category_file:
    category_file.write(json.dumps(category_dict))

In [None]:
# standardize the category tag (to number)
for i in range(tag_col.size):
    tag = tag_col[i]
    category_key = -1
    for key,v in category_dict.items():
        if tag == v:
            category_key = key
            break
    assert category_key != -1
    tag_col[i] = key

tag_col.to_csv("../data/curated/tag_col_standardized.csv")