# ETL - Tags Transformation

### Importing Libraries and Loading Merchant Dataset

In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.shell import spark
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import matplotlib.pyplot as plt


# Build (or reuse) the Spark session for this notebook, applying the options
# below to the builder one at a time, in the order listed.
# NOTE(review): the `from pyspark.shell import spark` import in this cell
# appears to have started a session already (this cell's output shows the
# shell banner). getOrCreate() hands back an existing session when there is
# one, so launch-time options such as spark.driver.memory may not take
# effect -- confirm, and consider dropping that import.
spark_builder = SparkSession.builder.appName("MAST30034 Project 2")
for option_name, option_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.driver.memory", "4g"),
):
    spark_builder = spark_builder.config(option_name, option_value)
spark = spark_builder.getOrCreate()


Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.3.0
      /_/

Using Python version 3.7.4 (default, Aug 13 2019 15:17:50)
Spark context Web UI available at http://10.13.152.101:4041
Spark context available as 'sc' (master = local[*], app id = local-1666164776339).
SparkSession available as 'spark'.


In [2]:
# Load the merchant table and display its row count as the cell output.
merchants_path = "../../data/tables/tbl_merchants.parquet"
sdf = spark.read.parquet(merchants_path)
sdf.count()


4026

In [3]:
# Convert the Spark DataFrame to a pandas DataFrame for convenience: the rest
# of the notebook works on `df` with pandas / NLTK / scikit-learn rather than
# with Spark.
df = sdf.toPandas()


### Extract Tags

In [4]:
import re


def extract_tags(arr, category="tags"):
    """Pull one component out of a merchant's raw ``tags`` string.

    The raw string holds three bracketed groups inside an outer bracket pair,
    e.g. ``"[(gift, card shops), (a), (take rate: 6.34)]"``. Round and square
    brackets are used interchangeably in the data, so any mix is accepted.

    Parameters
    ----------
    arr : str
        The raw ``tags`` value.
    category : str, default "tags"
        Which component to return:
        ``"tags"`` (or any other value) -> first group, lower-cased;
        ``"revenue_level"`` -> second group, lower-cased;
        ``"take_rate"`` -> first run of digits/dots in the third group.

    Returns
    -------
    str
        The requested component. The take rate is returned as text, not as a
        number.

    Raises
    ------
    IndexError
        If the string does not split into the expected groups, or the third
        group contains no digits.
    """
    # Drop the outermost bracket pair, then the remaining leading/trailing
    # brackets that belong to the first and last group.
    inner = arr[1:-1].strip("[()]")

    # Groups are separated by "<closing bracket>, <opening bracket>". Accept
    # either bracket type on each side and optional whitespace around the
    # comma, so "), (", "], [", "], (" and "),[" all split the same way.
    split_arr = re.split(r"[)\]]\s*,\s*[(\[]", inner)

    if category == "take_rate":
        return re.findall(r"[\d.]+", split_arr[2])[0]

    if category == "revenue_level":
        return split_arr[1].lower()

    return split_arr[0].lower()


In [5]:
# Derive one new column per component of the raw `tags` string:
# (new column name, component name understood by extract_tags).
for new_column, component in (
    ("tag", "tags"),
    ("revenue_level", "revenue_level"),
    ("take_rate", "take_rate"),
):
    df[new_column] = df["tags"].map(lambda raw_tags: extract_tags(raw_tags, component))

# Independent copy of the tag text; later cells clean this copy, not `df`.
tag_col = df["tag"].copy()

### Preprocess Texts in Tags

In [6]:
# preprocess the data, with the following pipeline
from nltk.metrics.distance import jaccard_distance
from nltk.metrics.distance import edit_distance
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch every NLTK resource this notebook relies on, not only the word list:
# `stopwords` backs stopwords.words(), `wordnet` backs WordNetLemmatizer and
# `punkt` backs word_tokenize. Without them a fresh environment fails with a
# LookupError the first time the Preprocessor is built or used.
# NOTE(review): recent NLTK releases may additionally need "punkt_tab" and/or
# "omw-1.4" -- confirm against the installed version.
for nltk_resource in ("words", "stopwords", "wordnet", "punkt"):
    nltk.download(nltk_resource)
from autocorrect import Speller

# use a class notion so only need to init corpus words and lemmatizer once
# API: preprocess, which takes a single entry of tag, and returns preprocessed tag
class Preprocessor:
    """Clean a raw merchant tag string into normalised, comma-separated tags.

    Build one instance and reuse it: the reference word list, the stop-word
    set, the lemmatizer and (for ``"auto"``) the speller are created once in
    ``__init__``.

    Parameters
    ----------
    correct_method : {"auto", "jaccard", "edit_distance"}, default "auto"
        Spelling-correction strategy. ``"auto"`` delegates to
        ``autocorrect.Speller``; the other two pick the closest entry of the
        NLTK word list that starts with the same character as the token.

    Raises
    ------
    ValueError
        If ``correct_method`` is not one of the supported strategies.
    """

    # Spelling-correction strategies accepted by ``correct_method``.
    _CORRECT_METHODS = ("auto", "jaccard", "edit_distance")

    def __init__(self, correct_method="auto"):
        # Fail fast: an unknown strategy would otherwise only surface later,
        # as an AttributeError on the never-created ``auto_corrector``.
        if correct_method not in self._CORRECT_METHODS:
            raise ValueError(
                "correct_method must be one of {}, got {!r}".format(
                    self._CORRECT_METHODS, correct_method
                )
            )

        self.lemmatizer = WordNetLemmatizer()
        self.correct_words = words.words()
        self.correct_method = correct_method
        self.stopwords = set(stopwords.words('english'))

        if correct_method == "auto":
            self.auto_corrector = Speller(lang='en')

    def _same_initial_candidates(self, word):
        """Return the reference words starting with the same character as ``word``."""
        return [w for w in self.correct_words if w[:1] == word[:1]]

    # jaccard distance is better for minor typos
    def __correct_spelling__(self, word):
        """Return the spelling-corrected form of a single token.

        For ``"jaccard"`` and ``"edit_distance"`` the closest reference word
        sharing the token's first character is returned (the earliest one in
        the word list on ties). If there is nothing to compare against, the
        token is returned unchanged. Any other strategy uses the speller.
        """
        if self.correct_method == "jaccard":
            # A token shorter than two characters has no bigrams, so there is
            # nothing meaningful to compare (and jaccard_distance divides by
            # zero when the candidate has no bigrams either).
            if len(word) < 2:
                return word
            candidates = self._same_initial_candidates(word)
            if not candidates:
                return word
            # Bigrams of the token are loop-invariant: build them once.
            word_bigrams = set(ngrams(word, 2))
            return min(
                candidates,
                key=lambda w: jaccard_distance(word_bigrams, set(ngrams(w, 2))),
            )

        if self.correct_method == "edit_distance":
            candidates = self._same_initial_candidates(word)
            if not candidates:
                return word
            return min(candidates, key=lambda w: edit_distance(word, w))

        return self.auto_corrector(word)

    # case standardization -> punctuation, number removal ->
    # tokenize -> spelling correction -> stop word removal ->
    # lemmatization -> minimum length
    def preprocess(self, tag_line):
        """Clean one raw tag string.

        Parameters
        ----------
        tag_line : str
            Comma-separated tags of a single merchant.

        Returns
        -------
        str
            The cleaned tags joined by ``","``. Words inside one tag are
            joined by single spaces. A tag whose words are all filtered out
            is kept as an empty string, so the number of comma-separated
            fields is unchanged.
        """
        # case standardization
        tag_line = tag_line.lower()

        # punctuation and number removal: everything except letters,
        # whitespace and commas becomes a space
        tag_line = re.sub(r'[^a-zA-Z\s,]', ' ', tag_line)

        new_tag_line = []
        # commas separate the individual tags; trim each one before tokenizing
        for tag in tag_line.split(','):
            tokens = word_tokenize(tag.strip())

            # correct spelling
            tokens = [self.__correct_spelling__(token) for token in tokens]

            # stop word removal
            tokens = [token for token in tokens if token not in self.stopwords]

            # lemmatization
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

            # keep only tokens longer than 2 characters
            new_tag_line.append(" ".join(token for token in tokens if len(token) > 2))

        return ",".join(new_tag_line)
    


[nltk_data] Downloading package words to /Users/oliver/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [7]:
preprocessor = Preprocessor()
# Clean every tag string in one pass. Mapping from the untouched `df["tag"]`
# (instead of overwriting `tag_col` element by element) keeps this cell
# idempotent on re-run and does not depend on the index being 0..n-1.
tag_col = df["tag"].map(preprocessor.preprocess)
    

In [None]:
# p = Preprocessor()
# p.preprocess(tag_col[0])
# tag_col.head(10)

### Use Bag-of-Words Method with Tags to Categorise Merchants

In [8]:
# Count and vectorize the tags: every comma-separated tag is one token, so the
# vocabulary is made of whole tags rather than of single words.
def _split_on_commas(text):
    """Split a cleaned tag string into its comma-separated tags."""
    return text.split(",")


vectorizer = CountVectorizer(tokenizer=_split_on_commas)
X = vectorizer.fit_transform(tag_col)


In [9]:
# Join the vectorizer output with the merchant data.
# `get_feature_names` is gone from newer scikit-learn releases, where
# `get_feature_names_out` replaces it; use whichever this version provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
# Reusing df's index makes the column-wise concat align row by row explicitly.
count_vect_df = pd.DataFrame(X.toarray(), columns=feature_names, index=df.index)
merchant_category = pd.concat([df, count_vect_df], axis=1)


In [58]:
# Check the number of categories: there are only 58 unique tags
# count_vect_df.shape[1]


In [None]:
# Save the preprocessed tag column (written with its index as the first CSV column).
# NOTE(review): this path is relative to "../data", while the merchant table
# is read from "../../data" earlier in the notebook -- confirm which directory
# level is intended before running this cell.
tag_col.to_csv("../data/curated/tag_col_preprocessed.csv")

In [None]:

# Remove the tags columns. Reassigning instead of using `inplace=True`, and
# ignoring columns that are already gone, keeps this cell re-runnable: the
# in-place version raised KeyError on the second run.
merchant_category = merchant_category.drop(columns=['tags', 'tag'], errors="ignore")
# save expanded tags dataframe
merchant_category.to_csv("../data/curated/merchant_tag.csv")

## An example of analysing the data by tag

In [10]:
# Map each category/tag to the distinct ABNs of the merchants carrying it,
# i.e. the rows whose count in that tag's column is non-zero.
merchants_by_categories = {
    tag_name: merchant_category[merchant_category[tag_name] != 0]['merchant_abn'].unique()
    for tag_name in count_vect_df.columns
}

In [11]:
# Display every merchant row whose ABN is listed under the 'card' tag.
merchant_category[merchant_category['merchant_abn'].isin(merchants_by_categories['card'])]

Unnamed: 0,name,tags,merchant_abn,tag,revenue_level,take_rate,antique shop sale,appliance rent leasing,art dealer gallery,artist supply craft shop,...,shoe shop,silverware shop,software,souvenir shop,stationery,telecom,tent owning shop,tool,toy game shop,watch
5,Fusce Company,"[(gift, card, novelty, and souvenir shops), (a...",10206519221,"gift, card, novelty, and souvenir shops",a,6.34,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
19,Ut Consulting,"([gift, card, novelty, and souvenir shops], [c...",10462560289,"gift, card, novelty, and souvenir shops",c,2.95,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
29,Fermentum Risus Foundation,"([gift, card, novelty, and souvenir shops], [a...",10702078694,"gift, card, novelty, and souvenir shops",a,5.95,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
37,Per Inceptos Company,"((gift, card, novelty, and souvenir shops), (a...",10901349044,"gift, card, novelty, and souvenir shops",a,5.96,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
39,Metus Vitae Velit LLP,"([gift, card, novelty, and souvenir shops], [a...",10930486968,"gift, card, novelty, and souvenir shops",a,5.73,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3862,Cras Sed Leo Corp.,"([gift, card, novelty, and souvenir shops], [a...",96244711717,"gift, card, novelty, and souvenir shops",a,6.44,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3870,Non Luctus Sit Incorporated,"[[gift, card, novelty, and souvenir shops], [c...",96458464738,"gift, card, novelty, and souvenir shops",c,2.26,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3889,At Lacus Quisque Institute,"([gifT, card, novelty, and souvenir shops], [a...",96880556465,"gift, card, novelty, and souvenir shops",a,5.70,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3957,Lobortis Augue Industries,"([gift, card, novelty, and souvenir shops], [a...",98314397036,"gift, card, novelty, and souvenir shops",a,6.47,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


***

### An alternative method of grouping tags (not used)

In [None]:
# # load retail category
# import json

# try:
#     f = open('retail_category.json')
#     # try load retail category dictionary
#     category_dict = json.load(f)
#     print("retail category loaded")
# except:
#     print("category file not loadeds")
#     # if retail dictionary doesn't exist, build one
#     category_dict = {}
#     for i in range(tag_category_array.size):
#          category_dict[i] = tag_category_array[i]
#     with open('retail_category.json', 'w') as f:
#         json.dump(category_dict, f)
#     print("retail category created")
# finally:
#     f.close()


In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# COS_THRESHHOLD = 0.5 # tune this 

# # match the new tag to existing tag
# existing_category = set(category_dict.values())

# data_category = set(tag_category_array)

# potential_new_category_list = list(data_category.difference(existing_category))

# # similar category are treated as distinct values
# all_category = existing_category.union(data_category)
# potential_new_category = data_category.difference

# # create a tf-idf vectorizer
# tfidf_vectorizer = TfidfVectorizer(
#     stop_words="english", max_df=0.5, min_df=1, ngram_range=(1, 1)
# )
# tfidf_vectorizer.fit(list(all_category))


# for new_category in potential_new_category_list:
#     add_flag = False
#     for old_category in existing_category:
#         tfidf_vec_new = tfidf_vectorizer.transform([new_category])
#         tfidf_vec_old = tfidf_vectorizer.transform([old_category])
#         # put the new category to the old one if they are similar
#         if cosine_similarity(tfidf_vec_new,tfidf_vec_old) >= COS_THRESHHOLD:
#             add_flag = True
#             break

#     if add_flag:
#         existing_category.add(new_category)
#         last_key = int(list(category_dict.keys())[-1])
#         category_dict[last_key+1] = new_category
    




In [None]:
# len(all_category)

In [None]:
# with open('retail_category.json', 'w') as f:
#     json.dump(category_dict, f)

In [None]:
# # standardize the category tag (to number)
# for i in range(tag_col.size):
#     tag = tag_col[i]
#     category_key = -1
#     for key,v in category_dict.items():
#         if tag == v:
#             category_key = key
#             break
#     assert category_key != -1
#     tag_col[i] = key

# tag_col.to_csv("../data/curated/tag_col_standardized.csv")