In [2]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
from pyspark.sql import SparkSession

# Create a spark session
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [4]:
import re
from pyspark.sql.functions import regexp_extract

In [5]:
merchant_df = spark.read.parquet('../data/tables/tbl_merchants.parquet').withColumnRenamed('name','merchant_name')

In [6]:
merchant_df.select("tags").limit(5)

tags
"((furniture, home..."
"([cable, satellit..."
"([jewelry, watch,..."
"([wAtch, clock, a..."
([music shops - m...


In [7]:
m1 = merchant_df.withColumn("tags",regexp_replace(merchant_df['tags'],"\\[",""))
m2 = m1.withColumn("tags",regexp_replace(m1['tags'],"\\]",""))

In [8]:
m3 = m2.withColumn("tags",regexp_replace(m2['tags'],"\\(",""))
m4 = m3.withColumn("tags",regexp_replace(m3['tags'],"\\)",""))

In [9]:
m4.show(truncate=False)

+------------------------------------+---------------------------------------------------------------------------------------------------------+------------+
|merchant_name                       |tags                                                                                                     |merchant_abn|
+------------------------------------+---------------------------------------------------------------------------------------------------------+------------+
|Felis Limited                       |furniture, home furnishings and equipment shops, and manufacturers, except appliances, e, take rate: 0.18|10023283211 |
|Arcu Ac Orci Corporation            |cable, satellite, and otHer pay television and radio services, b, take rate: 4.22                        |10142254217 |
|Nunc Sed Company                    |jewelry, watch, clock, and silverware shops, b, take rate: 4.40                                          |10165489824 |
|Ultricies Dignissim Lacus Foundation|wAtch, clock, 

In [10]:
from pyspark.sql.functions import split, col

In [11]:
df1 = m4.select("tags").alias("tag")
df1.show(truncate=False)

+---------------------------------------------------------------------------------------------------------+
|tags                                                                                                     |
+---------------------------------------------------------------------------------------------------------+
|furniture, home furnishings and equipment shops, and manufacturers, except appliances, e, take rate: 0.18|
|cable, satellite, and otHer pay television and radio services, b, take rate: 4.22                        |
|jewelry, watch, clock, and silverware shops, b, take rate: 4.40                                          |
|wAtch, clock, and jewelry repair shops, b, take rate: 3.29                                               |
|music shops - musical instruments, pianos, and sheet music, a, take rate: 6.33                           |
|gift, card, novelty, and souvenir shops, a, take rate: 6.34                                              |
|computers, comPUter periphe

In [12]:
from pyspark.sql.functions import expr

In [30]:
import pandas as pd
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
pandasDF = df1.toPandas()
array = []
for index, row in pandasDF.iterrows():

    list_row = row.tolist()

    for title in list_row:
        new_title = title.split(", ")

        for i in new_title.copy():
            
            if len(i) <= 1:
                new_title.remove(i)
                continue

            if "take rate" in i:
                new_title.remove(i)
                continue

            if "except" in i:              ## Example "except appliances" -> useless
                new_title.remove(i)
                continue
            

        array.append(new_title)


In [31]:
array

[['furniture', 'home furnishings and equipment shops', 'and manufacturers'],
 ['cable', 'satellite', 'and otHer pay television and radio services'],
 ['jewelry', 'watch', 'clock', 'and silverware shops'],
 ['wAtch', 'clock', 'and jewelry repair shops'],
 ['music shops - musical instruments', 'pianos', 'and sheet music'],
 ['gift', 'card', 'novelty', 'and souvenir shops'],
 ['computers', 'comPUter peripheral equipment', 'and softwAre'],
 ['watch', 'clock', 'and jewelry repair shops'],
 ['computer programming ',
  'data processing',
  'and integrated systems design services'],
 ['furniture', 'home furnishings and equipment shopS', 'and manufacturers'],
 ['computers', 'computer peripheral equipment', 'and software'],
 ['equipment', 'tool', 'furniture', 'and appliance  rent al and leAsing'],
 ['music shops - musical instruments', 'pianos', 'and sheet music'],
 ['artist supply and craft  shops'],
 ['florists supplies', 'nursery stock', 'and flowers'],
 ['computers', 'computer peripheral  eq

In [39]:
for merchant in array:

    for i_cat in range(0,len(merchant)):
        merchant[i_cat] = merchant[i_cat].lower()

array

[['furniture', 'home furnishings and equipment shops', 'and manufacturers'],
 ['cable', 'satellite', 'and other pay television and radio services'],
 ['jewelry', 'watch', 'clock', 'and silverware shops'],
 ['watch', 'clock', 'and jewelry repair shops'],
 ['music shops - musical instruments', 'pianos', 'and sheet music'],
 ['gift', 'card', 'novelty', 'and souvenir shops'],
 ['computers', 'computer peripheral equipment', 'and software'],
 ['watch', 'clock', 'and jewelry repair shops'],
 ['computer programming ',
  'data processing',
  'and integrated systems design services'],
 ['furniture', 'home furnishings and equipment shops', 'and manufacturers'],
 ['computers', 'computer peripheral equipment', 'and software'],
 ['equipment', 'tool', 'furniture', 'and appliance  rent al and leasing'],
 ['music shops - musical instruments', 'pianos', 'and sheet music'],
 ['artist supply and craft  shops'],
 ['florists supplies', 'nursery stock', 'and flowers'],
 ['computers', 'computer peripheral  eq

In [16]:
import pandas as pd

url = 'https://raw.githubusercontent.com/saintsjd/sic4-list/master/sic-codes.csv'
tag_dic = pd.read_csv(url, index_col=0)
print(tag_dic.head(5))

          Major Group  Industry Group  SIC  \
Division                                     
A                   1              11  111   
A                   1              11  112   
A                   1              11  115   
A                   1              11  116   
A                   1              11  119   

                                    Description  
Division                                         
A                                         Wheat  
A                                          Rice  
A                                          Corn  
A                                      Soybeans  
A         Cash Grains, Not Elsewhere Classified  


In [25]:
# Creat a list for description

tag_dictionary = tag_dic['Description']
print(type(tag_dictionary))
tag_bank = tag_dictionary.tolist()

<class 'pandas.core.series.Series'>


In [104]:
tag_bank

['wheat',
 'rice',
 'corn',
 'soybeans',
 'cash grains, not elsewhere classified',
 'cotton',
 'tobacco',
 'sugarcane and sugar beets',
 'irish potatoes',
 'field crops, except cash grains, not elsewhere classified',
 'vegetables and melons',
 'berry crops',
 'grapes',
 'tree nuts',
 'citrus fruits',
 'deciduous tree fruits',
 'fruits and tree nuts, not elsewhere classified',
 'ornamental floriculture and nursery products',
 'food crops grown under cover',
 'general farms, primarily crop',
 'beef cattle feedlots',
 'beef cattle, except feedlots',
 'hogs',
 'sheep and goats',
 'general livestock, except dairy and poultry',
 'dairy farms',
 'broiler, fryer, and roaster chickens',
 'chicken eggs',
 'turkeys and turkey eggs',
 'poultry hatcheries',
 'poultry and eggs, not elsewhere classified',
 'fur-bearing animals and rabbits',
 'horses and other equines',
 'animal aquaculture',
 'animal specialties, not elsewhere classified',
 'general farms, primarily livestock and animal specialties',

In [98]:


def minDistance(word1: str, word2: str) -> int:
    n1 = len(word1)
    n2 = len(word2)
    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]

    for j in range(1, n2 + 1):
        dp[0][j] = dp[0][j-1] + 1

    for i in range(1, n1 + 1):
        dp[i][0] = dp[i-1][0] + 1
    for i in range(1, n1 + 1):
        for j in range(1, n2 + 1):
            if word1[i-1] == word2[j-1]:
                dp[i][j] = dp[i-1][j-1]
            else:
                if ((dp[i][j-1] <= dp[i-1][j]) and (dp[i][j-1]<=dp[i-1][j-1])):
                    dp[i][j] = dp[i][j-1] + 1
                elif ((dp[i-1][j] <= dp[i][j-1]) and (dp[i-1][j]<=dp[i-1][j-1])):
                    dp[i][j] = dp[i-1][j] + 1
                else:
                    dp[i][j] = dp[i-1][j-1] + 1

                #dp[i][j] = min([dp[i][j-1], dp[i-1][j], dp[i-1][j-1]]) + 1      
    return dp[-1][-1]

In [111]:
for merchant in array:
    for title in merchant:
        print(title)
        min_dis = 100
        closer_word = 'empty'
        for tag in tag_bank:
            if (minDistance(tag, title) == 0):
                closer_word = tag
                break
            if(minDistance(tag, title)<min_dis):
                min_dis = minDistance(tag, title)
                closer_word = tag
        print(closer_word)
        
    break

furniture
furniture
home furnishings and equipment shops
farm machinery and equipment
and manufacturers
manufactured ice
