In [83]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.shell import spark
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import matplotlib.pyplot as plt


# Build (or reuse) the notebook's Spark session with the project-wide settings.
# NOTE(review): `from pyspark.shell import spark` in the import list presumably
# starts a session already; if so, getOrCreate() returns that session and
# startup-only options such as spark.driver.memory may not take effect - confirm.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for option_name, option_value in spark_settings.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()


In [84]:
# Load the merchants table as a Spark DataFrame.
sdf = spark.read.parquet("../data/tables/tbl_merchants.parquet")
# sdf.printSchema
# Count the distinct rows; as the last expression of the cell it is shown as output.
sdf.distinct().count()


4026

In [85]:
# convert to pandas dataframe for convenience
# (toPandas() collects the whole table into local memory)
df = sdf.toPandas()


In [86]:
import re


def extract_tags(arr, category="tags"):
    """Extract one component from a raw merchant ``tags`` string.

    The raw string is expected to contain three bracketed groups wrapped in one
    outer pair of brackets, for example
    ``"((shoe shops), (b), (take rate: 4.22))"`` or the same layout written with
    square brackets. Round and square brackets may be mixed.

    Parameters
    ----------
    arr : str
        Raw tags string.
    category : str, default "tags"
        Which component to return:
        ``"take_rate"`` gives the first run of digits/dots in the third group,
        ``"revenue_level"`` gives the second group lower-cased, and any other
        value gives the first group (the descriptive tag) lower-cased.

    Returns
    -------
    str
        The requested component. The take rate is returned as text, not a number.

    Raises
    ------
    IndexError
        If the string has fewer groups than the requested component needs, or
        the third group contains no number when ``"take_rate"`` is requested.
    """
    # Drop the outer wrapper, then any bracket characters left at either end.
    inner = arr[1:-1].strip("[()]")

    # A group boundary is "closing bracket, comma, opening bracket". Either
    # bracket style is accepted on each side so mixed input such as "], (" is
    # split as well. Raw strings avoid invalid-escape-sequence warnings.
    split_arr = re.split(r"[)\]],\s*[(\[]", inner)

    if category == "take_rate":
        return re.findall(r"[\d.]+", split_arr[2])[0]

    if category == "revenue_level":
        return split_arr[1].lower()

    return split_arr[0].lower()


In [87]:
# Derive the three tag components directly from the raw `tags` column.
raw_tags = df["tags"]
df["tag"] = raw_tags.map(lambda raw: extract_tags(raw, "tags"))
df["revenue_lvl"] = raw_tags.map(lambda raw: extract_tags(raw, "revenue_level"))
df["take_rate"] = raw_tags.map(lambda raw: extract_tags(raw, "take_rate"))

# Work on an independent copy so the text cleaning below leaves df["tag"] untouched.
tag_col = df["tag"].copy()

In [88]:
# preprocess the data, with the following pipeline
from nltk.metrics.distance import jaccard_distance
from nltk.metrics.distance import edit_distance
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
import nltk 
nltk.download('words')
from autocorrect import Speller

# Use a class so the corpus word list and the lemmatizer are initialised only once.
# API: preprocess() takes a single tag entry and returns the preprocessed tag.
class Preprocessor:
    """Normalise free-text merchant tags.

    The NLTK word list, the English stop-word set, the WordNet lemmatizer and
    (for the "auto" method) the autocorrect speller are created once in the
    constructor so that ``preprocess`` can be called repeatedly.

    Parameters
    ----------
    correct_method : {"auto", "jaccard", "edit_distance"}, default "auto"
        Spelling-correction strategy: the autocorrect ``Speller``, the closest
        corpus word by bigram Jaccard distance, or the closest corpus word by
        edit distance.

    Raises
    ------
    ValueError
        If ``correct_method`` is not one of the supported values.
    """

    _CORRECT_METHODS = ("auto", "jaccard", "edit_distance")

    def __init__(self, correct_method="auto"):
        # Fail fast: an unknown method would otherwise only surface later as an
        # AttributeError on the missing auto_corrector.
        if correct_method not in self._CORRECT_METHODS:
            raise ValueError(
                f"correct_method must be one of {self._CORRECT_METHODS}, "
                f"got {correct_method!r}"
            )

        self.lemmatizer = WordNetLemmatizer()
        self.correct_words = words.words()
        self.correct_method = correct_method
        self.stopwords = set(stopwords.words('english'))

        if correct_method == "auto":
            self.auto_corrector = Speller(lang='en')

    # jaccard distance is better for minor typos
    def __correct_spelling__(self, word):
        """Return the spelling-corrected form of a single token.

        For "jaccard" and "edit_distance" only corpus words sharing the token's
        first character are considered; the closest one is returned, ties going
        to the word that appears first in the corpus. Any other method value
        delegates to ``self.auto_corrector``.

        The token is returned unchanged when it is empty, when no corpus word
        shares its first character, or (jaccard only) when it is a single
        character and therefore has no bigrams.
        """
        method = self.correct_method
        if method not in ("jaccard", "edit_distance"):
            return self.auto_corrector(word)

        # An empty token has no first character to match. A one-letter token has
        # no bigrams, so its Jaccard distance to another one-letter corpus entry
        # would be 0/0.
        if not word or (method == "jaccard" and len(word) < 2):
            return word

        # match the first character
        candidates = [w for w in self.correct_words if w[:1] == word[0]]
        if not candidates:
            return word

        if method == "jaccard":
            # The token's bigram set is the same for every candidate: build it once.
            word_bigrams = set(ngrams(word, 2))
            scored = [
                (jaccard_distance(word_bigrams, set(ngrams(w, 2))), w)
                for w in candidates
            ]
        else:
            scored = [(edit_distance(word, w), w) for w in candidates]

        # Stable sort on the distance only, so ties keep corpus order.
        return sorted(scored, key=lambda pair: pair[0])[0][1]

    # case standardization -> punctuation, number removal -> tokenize ->
    # spelling correction -> stop word removal -> lemmatization -> minimum length
    def preprocess(self, tag):
        """Clean one raw tag string and return it as space-separated words.

        Steps, in order: lower-case; drop every character that is not a letter
        or whitespace; tokenize; spell-correct each token; remove English stop
        words; lemmatize; keep only words longer than two characters.
        """
        # case standardization
        tag = tag.lower()
        # punctuation, number removal
        tag = re.sub(r'[^a-zA-Z\s]', '', tag)
        # tokenize
        tokens = word_tokenize(tag)

        # correct spelling
        tokens = [self.__correct_spelling__(word) for word in tokens]

        # stop word removal
        tokens = [word for word in tokens if word not in self.stopwords]

        # lemmatization
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]

        # keep words longer than 2 characters (i.e. at least 3)
        tokens = [word for word in tokens if len(word) > 2]

        # join the words
        return " ".join(tokens)


[nltk_data] Downloading package words to /Users/ke/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [89]:
# p = preprocessor()
# p.__correct_spelling__("friuts")
# # p.preprocess("appleee ,")


In [90]:
preprocessor = Preprocessor()
# Many rows share the same raw tag and spelling correction is costly, so each
# distinct tag is preprocessed once and the results are mapped back by value.
# Mapping by value also avoids relying on tag_col having a 0..n-1 integer index.
preprocessed_by_tag = {
    raw_tag: preprocessor.preprocess(raw_tag) for raw_tag in tag_col.unique()
}
tag_col = tag_col.map(preprocessed_by_tag)
    

In [91]:
# Save the preprocessed tags (index and values) to the curated data folder.
tag_col.to_csv("../data/curated/tag_col_preprocessed.csv")

In [92]:
# Distinct preprocessed tags; these are the category values found in this dataset.
tag_category_array = tag_col.unique()

In [93]:
# load retail category
import json

# Load the persisted id -> category mapping; if it cannot be read, build it from
# this dataset's distinct tags and persist it.
try:
    # `with` closes the handle even when json.load fails.
    with open('retail_category.json') as f:
        # try load retail category dictionary
        category_dict = json.load(f)
    print("retail category loaded")
except (OSError, ValueError):
    # OSError: file missing or unreadable. ValueError: invalid JSON or encoding
    # (json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses).
    print("category file not loadeds")
    # if retail dictionary doesn't exist, build one
    # Keys are strings so the in-memory mapping matches what json.load returns
    # on later runs (JSON object keys are always strings).
    category_dict = {
        str(i): category for i, category in enumerate(tag_category_array)
    }
    with open('retail_category.json', 'w') as f:
        json.dump(category_dict, f)
    print("retail category created")


retail category loaded


In [94]:
# Preview the preprocessed tags.
tag_col

0       furniture home furnishing equipment shop manuf...
1            cable satellite pay television radio service
2                     jewelry watch clock silverware shop
3                         watch clock jewelry repair shop
4         music shop musical instrument piano sheet music
                              ...                        
4021                       optician optical good eyeglass
4022                            book periodical newspaper
4023                                            shoe shop
4024                        motor vehicle supply new part
4025                        motor vehicle supply new part
Name: tag, Length: 4026, dtype: object

In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
COS_THRESHHOLD = 0.5 # tune this

# match the new tag to existing tag
existing_category = set(category_dict.values())

data_category = set(tag_category_array)

# Sorted so new ids are handed out in the same order on every run; plain set
# iteration order for strings can differ between interpreter sessions.
potential_new_category_list = sorted(data_category.difference(existing_category))

# similar category are treated as distinct values
all_category = existing_category.union(data_category)

# create a tf-idf vectorizer
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english", max_df=0.5, min_df=1, ngram_range=(1, 1)
)
tfidf_vectorizer.fit(list(all_category))

# Vectorise every known category once instead of once per comparison.
category_vectors = {
    category: tfidf_vectorizer.transform([category])
    for category in existing_category
}

# Next free id = highest existing id + 1 (0 for an empty mapping). Keys may be
# str (loaded from JSON) or int, hence int(). An explicit loop is used because
# `from pyspark.sql.functions import *` shadows the builtin max in this notebook.
next_key = 0
for existing_key in category_dict:
    if int(existing_key) >= next_key:
        next_key = int(existing_key) + 1

for new_category in potential_new_category_list:
    new_vector = tfidf_vectorizer.transform([new_category])
    add_flag = False
    for known_vector in category_vectors.values():
        # Two single-row inputs give a 1x1 similarity matrix.
        if cosine_similarity(new_vector, known_vector)[0, 0] >= COS_THRESHHOLD:
            add_flag = True
            break

    # NOTE(review): a tag with no sufficiently similar known category is NOT
    # added to category_dict, so the exact-match lookup used when standardising
    # tags will not find it - confirm whether such tags should be added as well.
    if add_flag:
        existing_category.add(new_category)
        # Categories added here also take part in later comparisons.
        category_vectors[new_category] = new_vector
        # String key, consistent with keys loaded from JSON. New tags can only
        # appear when the mapping was loaded from disk, since a freshly built
        # mapping already contains every tag of this dataset.
        category_dict[str(next_key)] = new_category
        next_key += 1
    




In [98]:
# Write the current mapping back to disk (JSON stores all object keys as strings).
with open('retail_category.json', 'w') as f:
    json.dump(category_dict, f)

TypeError: dump() missing 1 required positional argument: 'fp'

In [97]:
# standardize the category tag (to number)
# Invert the mapping once (tag -> key) instead of scanning it for every row.
# setdefault keeps the first key if a tag is stored under several keys, which
# is what a first-match scan of the mapping would pick.
category_key_by_tag = {}
for key, category in category_dict.items():
    category_key_by_tag.setdefault(category, key)

# Every tag must have a key; otherwise map() would silently produce NaN.
unmatched_tags = sorted(set(tag_col) - set(category_key_by_tag))
assert not unmatched_tags, (
    f"{len(unmatched_tags)} tag(s) missing from category_dict, "
    f"e.g. {unmatched_tags[:5]}"
)

# Map by value, so this does not rely on tag_col having a 0..n-1 integer index.
tag_col = tag_col.map(category_key_by_tag)

tag_col.to_csv("../data/curated/tag_col_standardized.csv")