In [1]:
# create modeling spark session
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName('Project 2 test')
    .config('spark.sql.repl.eagerEval.enabled', True) 
    .config('spark.sql.parquet.cacheMetadata', 'true')
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/30 18:49:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# read merchants data
merchants = spark.read.parquet('../data/tables/tbl_merchants.parquet')
merchants.limit(5)

                                                                                

name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162


In [3]:
from pyspark.sql import functions as F

In [4]:
# tags are seperated by ), ( or ], [, replace these by ### for splitting

merchants = merchants.withColumn("tags",F.regexp_replace(F.regexp_replace(
    F.col("tags"),"\){1},{1}\s?\(","###"),"\]{1},{1}\s?\[","###"))


In [5]:
# check if correctly replaced
merchants.select("tags").show(10, False)

+---------------------------------------------------------------------------------------------------------------+
|tags                                                                                                           |
+---------------------------------------------------------------------------------------------------------------+
|((furniture, home furnishings and equipment shops, and manufacturers, except appliances###e###take rate: 0.18))|
|([cable, satellite, and otHer pay television and radio services###b###take rate: 4.22])                        |
|([jewelry, watch, clock, and silverware shops###b###take rate: 4.40])                                          |
|([wAtch, clock, and jewelry repair shops###b###take rate: 3.29])                                               |
|([music shops - musical instruments, pianos, and sheet music###a###take rate: 6.33])                           |
|[(gift, card, novelty, and souvenir shops###a###take rate: 6.34)]                      

In [6]:
# split tags data to three columns, denoted as business_area, revenue_level and take_rate
split_merchants = merchants.withColumn("business_area", F.split(F.col("tags"), "###").getItem(0))\
                           .withColumn("revenue_level", F.split(F.col("tags"), "###").getItem(1))\
                           .withColumn("take_rate", F.split(F.col("tags"), "###").getItem(2))

In [7]:
split_merchants.limit(5)

name,tags,merchant_abn,business_area,revenue_level,take_rate
Felis Limited,"((furniture, home...",10023283211,"((furniture, home...",e,take rate: 0.18))
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217,"([cable, satellit...",b,take rate: 4.22])
Nunc Sed Company,"([jewelry, watch,...",10165489824,"([jewelry, watch,...",b,take rate: 4.40])
Ultricies Digniss...,"([wAtch, clock, a...",10187291046,"([wAtch, clock, a...",b,take rate: 3.29])
Enim Condimentum PC,([music shops - m...,10192359162,([music shops - m...,a,take rate: 6.33])


In [8]:
split_merchants = split_merchants\
                    .withColumn("business_area",F.regexp_replace(F.col("business_area"),r"\(\[\)\]",""))
                    


In [9]:
# remove remaining ()[] punctuation in column business_area and take_rate

# obtain take rate in numeric form from take_rate column, 
# in the form of take rate: d.dd, therefore item 1 is the numeric value

# convert string in business_area to lower form

# convert string in revenue_level to lower form, just in case if there's a typo

curated_merchant = split_merchants\
            .withColumn("business_area", F.regexp_replace("business_area", "[\[\]\(\)]", ""))\
            .withColumn("take_rate", F.regexp_replace("take_rate", "[\[\]\(\)]", ""))\
            .withColumn("take_rate", F.split(F.col("take_rate"), ":").getItem(1))\
            .withColumn("business_area", F.lower(F.col('business_area')))\
            .withColumn("revenue_level", F.lower(F.col('revenue_level')))


final_merchant = curated_merchant.drop('tags')

In [10]:
final_merchant.limit(5)

name,merchant_abn,business_area,revenue_level,take_rate
Felis Limited,10023283211,"furniture, home f...",e,0.18
Arcu Ac Orci Corp...,10142254217,"cable, satellite,...",b,4.22
Nunc Sed Company,10165489824,"jewelry, watch, c...",b,4.4
Ultricies Digniss...,10187291046,"watch, clock, and...",b,3.29
Enim Condimentum PC,10192359162,music shops - mus...,a,6.33


In [11]:
# check if revenue_level match levels (5 types)
final_merchant.groupby('revenue_level').count()


revenue_level,count
e,53
d,98
c,922
b,1351
a,1602


In [12]:
# check for different business area
final_merchant.groupby('business_area').count()


business_area,count
"stationery, offic...",2
health and beaut...,1
"opticians, optica...",145
"books, periodical...",5
"watch, clock, and...",159
computer programm...,182
"jewelry, watch, c...",2
tent and awning ...,1
"opticians, optica...",1
"books, periodical...",5


In [13]:
consumer = spark.read.parquet('/Users/aobo/Desktop/Project_2/data/tables/consumer_user_details.parquet')
consumer

AnalysisException: Path does not exist: file:/Users/aobo/Desktop/Project_2/data/tables/consumer_user_details.parquet

In [None]:
consumer.count()

499999

In [None]:
consumer_detail = spark.read.option('sep', "|").csv(directory + '/data/tables/tbl_consumer.csv', 
                                                    header = True)
consumer_detail

name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975
Karen Chapman,2706 Stewart Oval...,NSW,2033,Female,407340
Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
Stephen Williams,6804 Wright Crest...,WA,6056,Male,448088
Stephanie Reyes,5813 Denise Land ...,NSW,2482,Female,650435
Jillian Gonzales,461 Ryan Common S...,VIC,3220,Female,1058499


In [None]:
consumer_detail.count()

499999

In [None]:
curated_merchant.count()

4026