Start spark session.

In [58]:
from pyspark.sql import SparkSession

# Create a spark session
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

Merchants data:

In [59]:
merchants = spark.read.parquet("../data/tables/tbl_merchants.parquet")
merchants = merchants.withColumnRenamed("name", "merchant_name")
merchants.show(2, vertical=True)
merchants.count()

-RECORD 0-----------------------------
 merchant_name | Felis Limited        
 tags          | ((furniture, home... 
 merchant_abn  | 10023283211          
-RECORD 1-----------------------------
 merchant_name | Arcu Ac Orci Corp... 
 tags          | ([cable, satellit... 
 merchant_abn  | 10142254217          
only showing top 2 rows



4026

An example of an instance of the "tag" column.

In [60]:
merchants.collect()[0][1]

'((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))'

Extract revenue level and take rate from the "tags" column.

In [67]:
from pyspark.sql import functions as F

merchants = merchants.withColumn("tag", F.split("tags", pattern = "(\],\s\[)|(\),\s\()").getItem(0))
merchants = merchants.withColumn("revenue_level", F.split("tags", pattern = "(\],\s\[)|(\),\s\()").getItem(1))
merchants = merchants.withColumn("take_rate", F.split("tags", pattern = "(\],\s\[)|(\),\s\()").getItem(2))
merchants = merchants.withColumn("take_rate", F.substring("take_rate", 12, 4))
merchants.show(20, vertical=True)

-RECORD 0-----------------------------
 merchant_name | Felis Limited        
 tags          | ((furniture, home... 
 merchant_abn  | 10023283211          
 revenue_level | e                    
 take_rate     | 0.18                 
 tag           | ((furniture, home... 
-RECORD 1-----------------------------
 merchant_name | Arcu Ac Orci Corp... 
 tags          | ([cable, satellit... 
 merchant_abn  | 10142254217          
 revenue_level | b                    
 take_rate     | 4.22                 
 tag           | ([cable, satellit... 
-RECORD 2-----------------------------
 merchant_name | Nunc Sed Company     
 tags          | ([jewelry, watch,... 
 merchant_abn  | 10165489824          
 revenue_level | b                    
 take_rate     | 4.40                 
 tag           | ([jewelry, watch,... 
-RECORD 3-----------------------------
 merchant_name | Ultricies Digniss... 
 tags          | ([wAtch, clock, a... 
 merchant_abn  | 10187291046          
 revenue_level | b       

Consumer data:

In [62]:
consumer = spark.read.option("delimiter", "|").option("header", "true").csv("../data/tables/tbl_consumer.csv")
consumer = consumer.withColumnRenamed("name", "consumer_name")
consumer.show(3, vertical=True)
consumer.count()

-RECORD 0-----------------------------
 consumer_name | Yolanda Williams     
 address       | 413 Haney Gardens... 
 state         | WA                   
 postcode      | 6935                 
 gender        | Female               
 consumer_id   | 1195503              
-RECORD 1-----------------------------
 consumer_name | Mary Smith           
 address       | 3764 Amber Oval      
 state         | NSW                  
 postcode      | 2782                 
 gender        | Female               
 consumer_id   | 179208               
-RECORD 2-----------------------------
 consumer_name | Jill Jones MD        
 address       | 40693 Henry Greens   
 state         | NT                   
 postcode      | 862                  
 gender        | Female               
 consumer_id   | 1194530              
only showing top 3 rows



                                                                                

499999

Consumer user details:

In [63]:
userdetails = spark.read.parquet("../data/tables/consumer_user_details.parquet")
userdetails.show(1, vertical=True)

-RECORD 0--------------
 user_id     | 1       
 consumer_id | 1195503 
only showing top 1 row



Transactions:

In [64]:
transactions = spark.read.parquet("../data/tables/transactions_20210228_20210827_snapshot/")
transactions.show(2, vertical = True)
transactions.count()

                                                                                

-RECORD 0------------------------------
 user_id        | 18478                
 merchant_abn   | 62191208634          
 dollar_value   | 63.255848959735246   
 order_id       | 949a63c8-29f7-4ab... 
 order_datetime | 2021-08-20           
-RECORD 1------------------------------
 user_id        | 2                    
 merchant_abn   | 15549624934          
 dollar_value   | 130.3505283105634    
 order_id       | 6a84c3cf-612a-457... 
 order_datetime | 2021-08-20           
only showing top 2 rows



                                                                                

3643266

1. Use consumer user details to find the consumer_id for each transaction.
2. Use consumer_id to join transaction data and consumer data.
3. Use merchant_abn to join transaction data and merchant data.

In [65]:
transactions = transactions.join(userdetails, on="user_id", how="left")
transactions = transactions.join(consumer, on="consumer_id", how="left")
transactions = transactions.join(merchants, on="merchant_abn", how="left")
transactions.show(2, vertical = True)
transactions.count()

                                                                                

-RECORD 0------------------------------
 merchant_abn   | 45629217853          
 consumer_id    | 1398491              
 user_id        | 18480                
 dollar_value   | 54.49576549280132    
 order_id       | 09b58b8c-904c-454... 
 order_datetime | 2021-06-18           
 consumer_name  | John Fischer         
 address        | 4770 Mayer Passag... 
 state          | QLD                  
 postcode       | 4887                 
 gender         | Male                 
 merchant_name  | Lacus Consulting     
 tags           | [[gift, Card, nov... 
 revenue_level  | a                    
 take_rate      | 6.98                 
-RECORD 1------------------------------
 merchant_abn   | 41663117354          
 consumer_id    | 1398491              
 user_id        | 18480                
 dollar_value   | 57.271896789115075   
 order_id       | c9740db1-9cbb-4a5... 
 order_datetime | 2021-06-11           
 consumer_name  | John Fischer         
 address        | 4770 Mayer Passag... 


                                                                                

3643266

Group transactions by merchant name, and show merchants with highest and lowest total sales.

In [66]:
import pandas as pd

df = transactions.groupBy("merchant_name").sum("dollar_value").toPandas()
df = df.sort_values(by = "sum(dollar_value)", ascending = False)
df

                                                                                

Unnamed: 0,merchant_name,sum(dollar_value)
801,,5.162072e+07
2742,Arcu Morbi Institute,2.586773e+06
1076,Amet Risus Inc.,2.569946e+06
1811,Lacus Aliquam Corporation,2.506652e+06
2531,Placerat Orci Institute,2.505284e+06
...,...,...
336,Malesuada Fames Limited,4.010094e+02
822,Ac Orci Ut Foundation,3.840798e+02
3980,Magna Praesent Interdum Industries,3.396543e+02
3949,Fringilla Mi Lacinia Incorporated,3.296614e+02
