In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Session-level Spark settings, applied in the order listed.
SPARK_SETTINGS = {
    # render a DataFrame as a table when it is the last expression of a cell
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    # pin the session time zone so date/time functions do not depend on the host
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

# Build the session (or reuse an already-running one) for this notebook.
session_builder = SparkSession.builder.appName("ADS project 2")
for setting_name, setting_value in SPARK_SETTINGS.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/30 01:23:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Folder holding the raw tables, relative to this notebook.
data_loc = "../data/tables/"

# Parquet table with user_id and consumer_id columns.
consumer = spark.read.parquet(f"{data_loc}consumer_user_details.parquet")

# Consumer details: pipe-delimited CSV whose first row is the header.
tbl_consumer = (
    spark.read
    .option("sep", "|")
    .option("header", True)
    .csv(f"{data_loc}tbl_consumer.csv")
)

# Merchant table (name, tags, merchant_abn).
tbl_merchant = spark.read.parquet(f"{data_loc}tbl_merchants.parquet")

# Transaction snapshot: a directory of Parquet data.
transact = spark.read.parquet(f"{data_loc}transactions_20210228_20210827_snapshot/")

                                                                                

In [3]:
# Preview five rows of the user_id / consumer_id lookup table.
consumer.limit(5)

                                                                                

user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975


In [4]:
# Preview five rows of the consumer details table read from the CSV.
tbl_consumer.limit(5)

name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975


In [5]:
# Preview five merchants; the "tags" column is truncated in this view.
tbl_merchant.limit(5)

name,tags,merchant_abn
Felis Limited,"((furniture, home...",10023283211
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217
Nunc Sed Company,"([jewelry, watch,...",10165489824
Ultricies Digniss...,"([wAtch, clock, a...",10187291046
Enim Condimentum PC,([music shops - m...,10192359162


In [6]:
# Print "tags" untruncated to inspect its raw format: one string holding a
# description, a revenue level and a take rate, with inconsistent bracket
# styles and letter casing.
tbl_merchant.select("tags").show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------+
|tags                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------+
|((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))|
|([cable, satellite, and otHer pay television and radio services], [b], [take rate: 4.22])                        |
|([jewelry, watch, clock, and silverware shops], [b], [take rate: 4.40])                                          |
|([wAtch, clock, and jewelry repair shops], [b], [take rate: 3.29])                                               |
|([music shops - musical instruments, pianos, and sheet music], [a], [take rate: 6.33])                           |
|[(gift, card, novelty, and souvenir shops), (a), (take rate: 6.34)]    

<h2>1. Standardize the column formats of the tables</h2>

<h3>1.1 "tbl_merchants.parquet"</h3>

In [7]:
# separate "tags" into "items", "revenue_level", "take_rate"
#
# "tags" is split on commas. The last fragment holds the take rate, the
# second-to-last the revenue level, and every fragment before those belongs to
# the item description (which itself contains commas).
#
# "items" takes the first (size - 2) fragments directly, so descriptions of any
# length are kept whole instead of being capped at a fixed fragment count.
# greatest(..., 0) keeps the slice length non-negative, so a tag with fewer
# than two commas gives an empty array instead of an error.
preprocessed_tbl_merchant = tbl_merchant.withColumn("split_tags", split(col("tags"), ",")) \
            .withColumn("take_rate", element_at(col("split_tags"), -1)) \
            .withColumn("revenue_level", element_at(col("split_tags"), -2)) \
            .withColumn("items", expr("slice(split_tags, 1, greatest(size(split_tags) - 2, 0))"))


In [8]:
# Print the "items" arrays untruncated to check the split; the fragments still
# carry the original brackets and leading spaces at this stage.
preprocessed_tbl_merchant.select("items").show(truncate=False)

+---------------------------------------------------------------------------------------------+
|items                                                                                        |
+---------------------------------------------------------------------------------------------+
|[((furniture,  home furnishings and equipment shops,  and manufacturers,  except appliances)]|
|[([cable,  satellite,  and otHer pay television and radio services]]                         |
|[([jewelry,  watch,  clock,  and silverware shops]]                                          |
|[([wAtch,  clock,  and jewelry repair shops]]                                                |
|[([music shops - musical instruments,  pianos,  and sheet music]]                            |
|[[(gift,  card,  novelty,  and souvenir shops)]                                              |
|[[(computers,  comPUter peripheral equipment,  and softwAre)]                                |
|[[[watch,  clock,  and jewelry repair s

In [9]:
# Strip brackets, labels and whitespace from the two scalar fields:
#   revenue_level keeps only the letters a-e,
#   take_rate keeps only digits and the decimal point (it remains a string).
preprocessed_tbl_merchant = (
    preprocessed_tbl_merchant
    .withColumn(
        "revenue_level",
        regexp_replace(col("revenue_level"), "[^a-e]", ""),
    )
    .withColumn(
        "take_rate",
        regexp_replace(col("take_rate"), "[^0-9.]", ""),
    )
)

In [10]:
# convert "items" into a bag of words
#
# The regex patterns are raw strings so that "\s" reaches the regex engine
# unchanged and Python does not warn about an invalid escape sequence.
# NOTE(review): tokens keep their original letter casing (e.g. "wAtch");
# lowercase them if case-insensitive matching is intended.
preprocessed_tbl_merchant = (
    preprocessed_tbl_merchant
    # join the description fragments back into a single string
    .withColumn("items", concat_ws(",", col("items")))
    # replace every non-alphanumeric character (brackets, commas, dashes) with a space
    .withColumn("items", regexp_replace("items", r"[^A-Za-z0-9]", " "))
    # collapse runs of whitespace into one space
    .withColumn("items", regexp_replace("items", r"\s+", " "))
    # trim leading and trailing whitespace
    .withColumn("items", regexp_replace("items", r"(^\s+)|(\s+$)", ""))
    # tokenize on single spaces
    .withColumn("items", split(col("items"), " "))
)

In [11]:
# Display the cleaned merchant columns; the helper column "split_tags" is left
# out of this view.
preprocessed_tbl_merchant.select("name", "items", "revenue_level", "take_rate", "merchant_abn")

name,items,revenue_level,take_rate,merchant_abn
Felis Limited,"[furniture, home,...",e,0.18,10023283211
Arcu Ac Orci Corp...,"[cable, satellite...",b,4.22,10142254217
Nunc Sed Company,"[jewelry, watch, ...",b,4.4,10165489824
Ultricies Digniss...,"[wAtch, clock, an...",b,3.29,10187291046
Enim Condimentum PC,"[music, shops, mu...",a,6.33,10192359162
Fusce Company,"[gift, card, nove...",a,6.34,10206519221
Aliquam Enim Inco...,"[computers, comPU...",b,4.32,10255988167
Ipsum Primis Ltd,"[watch, clock, an...",c,2.39,10264435225
Pede Ultrices Ind...,"[computer, progra...",a,5.71,10279061213
Nunc Inc.,"[furniture, home,...",a,6.61,10323485998


<h3>1.2 "transactions_20210228_20210827_snapshot"</h3>

In [12]:
# Preview five rows of the transaction snapshot.
transact.limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20


In [13]:
# Derive calendar year, month and day-of-month columns from "order_datetime".
preprocessed_transact = (
    transact
    .withColumn('order_year', year(col('order_datetime')))
    .withColumn('order_month', month(col('order_datetime')))
    .withColumn('order_day', dayofmonth(col('order_datetime')))
)

In [14]:
# Display the transactions with the derived order_year / order_month /
# order_day columns (eager evaluation renders the DataFrame as a table).
preprocessed_transact

user_id,merchant_abn,dollar_value,order_id,order_datetime,order_year,order_month,order_day
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20,2021,8,20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20,2021,8,20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20,2021,8,20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20,2021,8,20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20,2021,8,20
3,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,2021-08-20,2021,8,20
18479,67609108741,86.4040605836911,d0e180f0-cb06-42a...,2021-08-20,2021,8,20
3,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,2021-08-20,2021,8,20
18482,70501974849,68.75486276223054,8505fb33-b69a-412...,2021-08-20,2021,8,20
4,49891706470,48.89796461900801,ed11e477-b09f-4ae...,2021-08-20,2021,8,20
