In [119]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [120]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook.
# Settings are kept in one table and applied to the builder in order.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [121]:
import re
from pyspark.sql.functions import regexp_extract

In [122]:
# Preprocessing for the column 'tags':
# load the merchant table and rename `name` to `merchant_name`.
merchant_df = (
    spark.read
    .parquet('../data/tables/tbl_merchants.parquet')
    .withColumnRenamed('name', 'merchant_name')
)

In [123]:
merchant_df.select("tags").show(truncate=False)

# Looking at the format of `tags`: each value holds three bracketed groups --
# a category description, a single letter (e.g. 'e', 'b', 'a') and a
# 'take rate: <number>' entry.
# We plan to remove the single-letter element and the 'take rate' element.

+-----------------------------------------------------------------------------------------------------------------+
|tags                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------+
|((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))|
|([cable, satellite, and otHer pay television and radio services], [b], [take rate: 4.22])                        |
|([jewelry, watch, clock, and silverware shops], [b], [take rate: 4.40])                                          |
|([wAtch, clock, and jewelry repair shops], [b], [take rate: 3.29])                                               |
|([music shops - musical instruments, pianos, and sheet music], [a], [take rate: 6.33])                           |
|[(gift, card, novelty, and souvenir shops), (a), (take rate: 6.34)]    

In [124]:
# In the following few steps we remove the punctuation [] ().
# This cell strips the square brackets, one character per step.
m1 = merchant_df.withColumn("tags", F.regexp_replace(F.col("tags"), "\\[", ""))
m2 = m1.withColumn("tags", F.regexp_replace(F.col("tags"), "\\]", ""))

In [125]:
# Strip the round brackets from `tags`, one character per step.
m3 = m2.withColumn("tags", F.regexp_replace(F.col("tags"), "\\(", ""))
m4 = m3.withColumn("tags", F.regexp_replace(F.col("tags"), "\\)", ""))

In [126]:
# Display the merchant table after the bracket/parenthesis characters
# have been removed from `tags`.
m4.show(truncate=False)

+------------------------------------+---------------------------------------------------------------------------------------------------------+------------+
|merchant_name                       |tags                                                                                                     |merchant_abn|
+------------------------------------+---------------------------------------------------------------------------------------------------------+------------+
|Felis Limited                       |furniture, home furnishings and equipment shops, and manufacturers, except appliances, e, take rate: 0.18|10023283211 |
|Arcu Ac Orci Corporation            |cable, satellite, and otHer pay television and radio services, b, take rate: 4.22                        |10142254217 |
|Nunc Sed Company                    |jewelry, watch, clock, and silverware shops, b, take rate: 4.40                                          |10165489824 |
|Ultricies Dignissim Lacus Foundation|wAtch, clock, 

In [127]:
from pyspark.sql.functions import split, col

In [128]:
# Keep only the `tags` column for further cleaning.
# NOTE(review): `.alias("tag")` is applied to the DataFrame returned by
# `select`, not to the column, so the column is still named `tags`
# (see the header in the output below) -- confirm whether a column rename
# was intended.
df1 = m4.select("tags").alias("tag")
df1.show(truncate=False)

+---------------------------------------------------------------------------------------------------------+
|tags                                                                                                     |
+---------------------------------------------------------------------------------------------------------+
|furniture, home furnishings and equipment shops, and manufacturers, except appliances, e, take rate: 0.18|
|cable, satellite, and otHer pay television and radio services, b, take rate: 4.22                        |
|jewelry, watch, clock, and silverware shops, b, take rate: 4.40                                          |
|wAtch, clock, and jewelry repair shops, b, take rate: 3.29                                               |
|music shops - musical instruments, pianos, and sheet music, a, take rate: 6.33                           |
|gift, card, novelty, and souvenir shops, a, take rate: 6.34                                              |
|computers, comPUter periphe

In [129]:
from pyspark.sql.functions import expr

In [130]:
# We are using pandas to process the rest
# In this stage, we remove the single letters and 'take rate'

import pandas as pd
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
pandasDF = df1.toPandas()

# Build one list of tag parts per cell value of the pandas frame.
# A part is dropped when it is:
#   * at most one character long (the single-letter element),
#   * a 'take rate' entry, or
#   * an "except ..." fragment (example: "except appliances" -> useless).
array = []
for _, frame_row in pandasDF.iterrows():
    for tag_string in frame_row.tolist():
        kept_parts = [
            part
            for part in tag_string.split(", ")
            if len(part) > 1
            and "take rate" not in part
            and "except" not in part
        ]
        array.append(kept_parts)


In [131]:
# Inspect the cleaned tag parts (one inner list per processed `tags` value).
array

[['furniture', 'home furnishings and equipment shops', 'and manufacturers'],
 ['cable', 'satellite', 'and otHer pay television and radio services'],
 ['jewelry', 'watch', 'clock', 'and silverware shops'],
 ['wAtch', 'clock', 'and jewelry repair shops'],
 ['music shops - musical instruments', 'pianos', 'and sheet music'],
 ['gift', 'card', 'novelty', 'and souvenir shops'],
 ['computers', 'comPUter peripheral equipment', 'and softwAre'],
 ['watch', 'clock', 'and jewelry repair shops'],
 ['computer programming ',
  'data processing',
  'and integrated systems design services'],
 ['furniture', 'home furnishings and equipment shopS', 'and manufacturers'],
 ['computers', 'computer peripheral equipment', 'and software'],
 ['equipment', 'tool', 'furniture', 'and appliance  rent al and leAsing'],
 ['music shops - musical instruments', 'pianos', 'and sheet music'],
 ['artist supply and craft  shops'],
 ['florists supplies', 'nursery stock', 'and flowers'],
 ['computers', 'computer peripheral  eq

In [132]:
# Make all the letters lower case
# Lower-case every tag part; each inner list is updated in place so that
# `array` keeps referring to the same list objects.
for merchant in array:
    merchant[:] = [category.lower() for category in merchant]

array

[['furniture', 'home furnishings and equipment shops', 'and manufacturers'],
 ['cable', 'satellite', 'and other pay television and radio services'],
 ['jewelry', 'watch', 'clock', 'and silverware shops'],
 ['watch', 'clock', 'and jewelry repair shops'],
 ['music shops - musical instruments', 'pianos', 'and sheet music'],
 ['gift', 'card', 'novelty', 'and souvenir shops'],
 ['computers', 'computer peripheral equipment', 'and software'],
 ['watch', 'clock', 'and jewelry repair shops'],
 ['computer programming ',
  'data processing',
  'and integrated systems design services'],
 ['furniture', 'home furnishings and equipment shops', 'and manufacturers'],
 ['computers', 'computer peripheral equipment', 'and software'],
 ['equipment', 'tool', 'furniture', 'and appliance  rent al and leasing'],
 ['music shops - musical instruments', 'pianos', 'and sheet music'],
 ['artist supply and craft  shops'],
 ['florists supplies', 'nursery stock', 'and flowers'],
 ['computers', 'computer peripheral  eq

In [133]:
# Feature engineering

# After research, we found a list of SIC codes with standard industry descriptions.
# We try to match the tags in our dataset against this standard dictionary of tags.

import pandas as pd

# Source of the SIC code list (CSV hosted on GitHub).
url = 'https://raw.githubusercontent.com/saintsjd/sic4-list/master/sic-codes.csv'

# The first CSV column is used as the index of the resulting frame.
tag_dic = pd.read_csv(url, index_col=0)

# Plain-text preview of the first five rows.
print(tag_dic.head())

          Major Group  Industry Group  SIC  \
Division                                     
A                   1              11  111   
A                   1              11  112   
A                   1              11  115   
A                   1              11  116   
A                   1              11  119   

                                    Description  
Division                                         
A                                         Wheat  
A                                          Rice  
A                                          Corn  
A                                      Soybeans  
A         Cash Grains, Not Elsewhere Classified  


In [134]:
# Create a plain Python list of the SIC descriptions.

tag_dictionary = tag_dic.loc[:, 'Description']
print(type(tag_dictionary))
tag_bank = [description for description in tag_dictionary]

<class 'pandas.core.series.Series'>


In [135]:
# Inspect the list of SIC descriptions built from the 'Description' column.
tag_bank

['Wheat',
 'Rice',
 'Corn',
 'Soybeans',
 'Cash Grains, Not Elsewhere Classified',
 'Cotton',
 'Tobacco',
 'Sugarcane and Sugar Beets',
 'Irish Potatoes',
 'Field Crops, Except Cash Grains, Not Elsewhere Classified',
 'Vegetables and Melons',
 'Berry Crops',
 'Grapes',
 'Tree Nuts',
 'Citrus Fruits',
 'Deciduous Tree Fruits',
 'Fruits and Tree Nuts, Not Elsewhere Classified',
 'Ornamental Floriculture and Nursery Products',
 'Food Crops Grown Under Cover',
 'General Farms, Primarily Crop',
 'Beef Cattle Feedlots',
 'Beef Cattle, Except Feedlots',
 'Hogs',
 'Sheep and Goats',
 'General Livestock, Except Dairy and Poultry',
 'Dairy Farms',
 'Broiler, Fryer, and Roaster Chickens',
 'Chicken Eggs',
 'Turkeys and Turkey Eggs',
 'Poultry Hatcheries',
 'Poultry and Eggs, Not Elsewhere Classified',
 'Fur-Bearing Animals and Rabbits',
 'Horses and Other Equines',
 'Animal Aquaculture',
 'Animal Specialties, Not Elsewhere Classified',
 'General Farms, Primarily Livestock and Animal Specialties',

In [136]:
# Levenshtein (edit) distance

def minDistance(word1: str, word2: str) -> int:
    """Return the Levenshtein (edit) distance between ``word1`` and ``word2``.

    The distance is the smallest number of single-character insertions,
    deletions and substitutions needed to turn ``word1`` into ``word2``.
    The comparison is case-sensitive.

    Args:
        word1: First string.
        word2: Second string.

    Returns:
        The edit distance as a non-negative integer (0 for equal strings).
    """
    row_count = len(word1) + 1
    col_count = len(word2) + 1

    # table[r][c] holds the distance between word1[:r] and word2[:c].
    table = [[0] * col_count for _ in range(row_count)]

    # Against an empty prefix the distance is simply the prefix length.
    for c in range(col_count):
        table[0][c] = c
    for r in range(row_count):
        table[r][0] = r

    for r in range(1, row_count):
        for c in range(1, col_count):
            diagonal = table[r - 1][c - 1]

            if word1[r - 1] == word2[c - 1]:
                # Matching characters cost nothing extra.
                table[r][c] = diagonal
                continue

            # Smallest of the three neighbouring cells, found with explicit
            # comparisons: the notebook's `from pyspark.sql.functions import *`
            # replaces the builtin `min`, so it is deliberately not used here.
            cheapest = table[r][c - 1]
            if table[r - 1][c] < cheapest:
                cheapest = table[r - 1][c]
            if diagonal < cheapest:
                cheapest = diagonal

            table[r][c] = cheapest + 1

    return table[-1][-1]

In [137]:
# Preview of the matching step: for every tag part of a merchant, print the
# part followed by the closest SIC description by edit distance.
#
# The tag parts were lower-cased earlier in the notebook while the SIC
# descriptions are capitalised, so each description is lower-cased for the
# comparison; the original spelling is what gets reported.
# Lower-casing is done once up front instead of once per tag part.
tag_bank_lowered = [(tag, tag.lower()) for tag in tag_bank]

for merchant in array:
    for title in merchant:
        print(title)
        # Start from infinity so the first candidate is always accepted,
        # however large its distance is.
        min_dis = float("inf")
        closer_word = 'empty'
        for tag, tag_lowered in tag_bank_lowered:
            # Compute the distance once per candidate and reuse it.
            distance = minDistance(tag_lowered, title)
            if distance < min_dis:
                min_dis = distance
                closer_word = tag
                if distance == 0:
                    # Exact match: nothing can beat it, stop searching.
                    break
        print(closer_word)
    # Only the first merchant is processed; this cell is a preview.
    break
        


furniture
Furniture
home furnishings and equipment shops
Farm Machinery and Equipment
and manufacturers
Manufactured Ice


In [161]:
# Convert the cleaned tags into a list of strings: each merchant's tag parts
# are joined with ',' into a single string.
tags_list = [','.join(merchant_tags) for merchant_tags in array]
tags_list


['furniture,home furnishings and equipment shops,and manufacturers',
 'cable,satellite,and other pay television and radio services',
 'jewelry,watch,clock,and silverware shops',
 'watch,clock,and jewelry repair shops',
 'music shops - musical instruments,pianos,and sheet music',
 'gift,card,novelty,and souvenir shops',
 'computers,computer peripheral equipment,and software',
 'watch,clock,and jewelry repair shops',
 'computer programming ,data processing,and integrated systems design services',
 'furniture,home furnishings and equipment shops,and manufacturers',
 'computers,computer peripheral equipment,and software',
 'equipment,tool,furniture,and appliance  rent al and leasing',
 'music shops - musical instruments,pianos,and sheet music',
 'artist supply and craft  shops',
 'florists supplies,nursery stock,and flowers',
 'computers,computer peripheral  equipment,and software',
 'antique shops - sales,repairs,and restoration services',
 'motor vehicle supplies and new parts',
 'motor 