# Pre-Processing

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 

# Start (or reuse) the Spark session that every cell below relies on.
spark_settings = {
    # Lets a DataFrame render as a table when it is the last expression of a cell.
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "6g",
}

session_builder = SparkSession.builder.appName("preprocessing")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [35]:
# First transactions snapshot (a parquet directory; the date range is the one
# encoded in the directory name).
snapshot1_path = "../data/tables/transactions_20210228_20210827_snapshot/"
transactiondf1 = spark.read.format("parquet").load(snapshot1_path)
transactiondf1

                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20
3,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,2021-08-20
18479,67609108741,86.4040605836911,d0e180f0-cb06-42a...,2021-08-20
3,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,2021-08-20
18482,70501974849,68.75486276223054,8505fb33-b69a-412...,2021-08-20
4,49891706470,48.89796461900801,ed11e477-b09f-4ae...,2021-08-20


In [36]:
# Second transactions snapshot (a parquet directory; the date range is the one
# encoded in the directory name).
snapshot2_path = "../data/tables/transactions_20210828_20220227_snapshot/"
transactiondf2 = spark.read.format("parquet").load(snapshot2_path)
transactiondf2

                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime
14935,79417999332,136.06570809815838,23acbb7b-cf98-458...,2021-11-26
1,46451548968,72.61581642788431,76bab304-fa2d-400...,2021-11-26
14936,89518629617,3.0783487174439297,a2ae446a-2959-41c...,2021-11-26
1,49167531725,51.58228625503599,7080c274-17f7-4cc...,2021-11-26
14936,31101120643,25.2281149424178,8e301c0f-06ab-45c...,2021-11-26
2,67978471888,691.5028234458998,0380e9ad-b0e8-420...,2021-11-26
14936,60956456424,102.13952056640888,5ac3da9c-5147-452...,2021-11-26
2,47644196714,644.5220654863093,4e368e44-86f8-4de...,2021-11-26
14938,39649557865,209.12780951421405,4d78cd01-4bab-494...,2021-11-26
3,88402174457,141.0387993699113,c50c957d-ecfc-430...,2021-11-26


In [4]:
# Lookup table pairing each user_id with a consumer_id.
user_details_path = "../data/tables/consumer_user_details.parquet"
userdf = spark.read.format("parquet").load(user_details_path)
userdf.show(n=20)

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|      1|    1195503|
|      2|     179208|
|      3|    1194530|
|      4|     154128|
|      5|     712975|
|      6|     407340|
|      7|     511685|
|      8|     448088|
|      9|     650435|
|     10|    1058499|
|     11|     428325|
|     12|    1494640|
|     13|    1146717|
|     14|    1343547|
|     15|    1463076|
|     16|    1356405|
|     17|    1331093|
|     18|      80965|
|     19|    1226530|
|     20|    1390367|
+-------+-----------+
only showing top 20 rows



In [5]:
# Consumer details are stored as a pipe-delimited CSV with a header row.
# No schema is supplied or inferred here, so the reader's default column
# types apply.
consumerdf = (
    spark.read
    .csv("../data/tables/tbl_consumer.csv", sep="|", header=True)
    # Give the generic `name` column a table-specific name.
    .withColumnRenamed("name", "customer_name")
)
consumerdf

customer_name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975
Karen Chapman,2706 Stewart Oval...,NSW,2033,Female,407340
Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
Stephen Williams,6804 Wright Crest...,WA,6056,Male,448088
Stephanie Reyes,5813 Denise Land ...,NSW,2482,Female,650435
Jillian Gonzales,461 Ryan Common S...,VIC,3220,Female,1058499


In [33]:
merchantdf = (
    spark.read.parquet("../data/tables/tbl_merchants.parquet")
    .withColumnRenamed("name", "company_name")
)

# The raw `tags` string holds three bracketed groups, read here by position:
#   0 -> tag description, 1 -> revenue band, 2 -> "take rate: <value>".
# Square brackets are normalised to round ones first so that a single split
# pattern ("),") handles both bracket styles.
normalised_tags = regexp_replace(
    regexp_replace(col("tags"), "\\[", "("), "\\]", ")"
)
tag_groups = split(normalised_tags, "\\),")


def _strip_brackets(group):
    """Remove every round bracket from `group` and trim surrounding whitespace.

    Splitting on ")," leaves the space that followed the comma at the start of
    the second and third groups; without the trim, values come out as " 0.18"
    and " e" rather than "0.18" and "e".
    """
    return trim(regexp_replace(group, "[()]", ""))


merchantdf = (
    merchantdf
    # Extract the take rate into a separate column (still a string; cast it
    # where a numeric value is needed).
    .withColumn(
        "take_rate",
        _strip_brackets(
            regexp_replace(tag_groups.getItem(2), "take rate: ", "")
        ),
    )
    # Extract the revenue band.
    .withColumn("revenue_band", _strip_brackets(tag_groups.getItem(1)))
    # Keep only the tag description in `tags`. This must come last because the
    # two columns above are derived from the original `tags` value.
    .withColumn("tags", _strip_brackets(tag_groups.getItem(0)))
)

merchantdf.show(truncate=False)

+------------------------------------+-------------------------------------------------------------------------------------+------------+---------+------------+
|company_name                        |tags                                                                                 |merchant_abn|take_rate|revenue_band|
+------------------------------------+-------------------------------------------------------------------------------------+------------+---------+------------+
|Felis Limited                       |furniture, home furnishings and equipment shops, and manufacturers, except appliances|10023283211 | 0.18    | e          |
|Arcu Ac Orci Corporation            |cable, satellite, and otHer pay television and radio services                        |10142254217 | 4.22    | b          |
|Nunc Sed Company                    |jewelry, watch, clock, and silverware shops                                          |10165489824 | 4.40    | b          |
|Ultricies Dignissim Lacus Foundat

In [37]:
# Stack the two transaction snapshots. unionByName pairs columns by name,
# whereas union pairs them purely by position and would silently mix columns
# if the two snapshots ever stored them in a different order.
transactiondf = transactiondf1.unionByName(transactiondf2)

# Attach user, consumer and merchant details to every transaction.
# These are all inner joins (the default), so a transaction with no matching
# user, consumer or merchant row is dropped from the result.
mergedf = (
    transactiondf
    .join(userdf, "user_id")
    .join(consumerdf, "consumer_id")
    .join(merchantdf, "merchant_abn")
)
mergedf.limit(10)

                                                                                

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,postcode,gender,company_name,tags,take_rate,revenue_band
68216911708,551,3471,39.2325218595659,c921263e-489e-45a...,2021-08-21,Todd Long,883 Patty Mountai...,NSW,2021,Male,Placerat Eget Ven...,"computers, comput...",3.05,c
46451548968,551,3471,3.8405569898888423,7e8e7fca-04a1-4f2...,2021-08-19,Todd Long,883 Patty Mountai...,NSW,2021,Male,Tempus Eu Ligula ...,heaLth and beauty...,6.04,a
35556933338,551,3471,66.18325079406301,13b0be6a-20b7-439...,2021-08-22,Todd Long,883 Patty Mountai...,NSW,2021,Male,Semper Cursus Int...,digital goods: bo...,6.87,a
91772719127,551,3471,33.69274773234,3750984f-4313-47d...,2021-08-14,Todd Long,883 Patty Mountai...,NSW,2021,Male,Dictum Cursus Inc...,artist supply and...,5.68,a
29216160692,551,3471,171.76081688162495,f5917c53-caca-4b6...,2021-08-15,Todd Long,883 Patty Mountai...,NSW,2021,Male,Class Aptent LLC,lawn and garden s...,5.6,a
68216911708,551,3471,60.71366865869528,94fbe85c-434a-4e6...,2021-07-15,Todd Long,883 Patty Mountai...,NSW,2021,Male,Placerat Eget Ven...,"computers, comput...",3.05,c
75034515922,551,3471,5.4642431321898375,12e4b9c7-1738-481...,2021-07-15,Todd Long,883 Patty Mountai...,NSW,2021,Male,Ac Eleifend Corp.,digital goods: bo...,6.22,a
24852446429,551,3471,56.94751884265085,8e65c8ee-67f1-411...,2021-08-16,Todd Long,883 Patty Mountai...,NSW,2021,Male,Erat Vitae LLP,florists supplies...,2.94,c
60111071436,551,3471,65.18552675559465,5f7d4237-81ca-4e7...,2021-08-27,Todd Long,883 Patty Mountai...,NSW,2021,Male,Imperdiet Non LLC,"cable, satellite,...",6.84,a
22019664345,551,3471,41.38591300683678,d0a09e6d-27b9-445...,2021-07-23,Todd Long,883 Patty Mountai...,NSW,2021,Male,Molestie Sodales ...,moTor vehicle sUp...,6.63,a


In [38]:
# Persist the merged table. The default save mode raises an error if the target
# already exists, which breaks every re-run of this notebook; overwrite makes
# the cell repeatable.
mergedf.write.mode("overwrite").parquet('../data/curated/mergedf.parquet')

                                                                                

In [39]:
# Average transaction value per merchant.
mergedf.groupBy("merchant_abn").avg("dollar_value")

                                                                                

merchant_abn,avg(dollar_value)
38700038932,1333.0894941085276
83412691377,35.048115443301164
15613631617,300.1850319573497
19839532017,157.0
73256306726,284.96133358238615
35344855546,87.90619961634984
73841664453,89.0963665048099
78916025936,352.61969394690243
60654402457,84.44940983729136
92202115241,312.11904590114267


Main websites hosting Australian datasets:
- https://explore.data.abs.gov.au/ 
- https://data.gov.au/
- https://researchdata.edu.au/
- https://aurin.org.au/

Key features to join on: 
- timestamp
- location: state/postcode/SA2 

Key model features: 
- merchant abn (nominal)
- datetime
- user id: how many users shop at each merchant
- dollar value
- location
- gender
- tags

SA2 Shapefile: https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files