## Data Lookup

In [1]:
# Start Spark Session
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder.appName("MAST30034 Project 2 BNPL")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

22/09/01 12:51:21 WARN Utils: Your hostname, nadya-aurelia.local resolves to a loopback address: 127.0.0.1; using 10.12.34.36 instead (on interface en0)
22/09/01 12:51:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/01 12:51:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
consumer = spark.read.csv("../data/tables/tbl_consumer.csv", header=True, sep="|")

In [3]:
consumer.show(1, vertical=True)

-RECORD 0---------------------------
 name        | Yolanda Williams     
 address     | 413 Haney Gardens... 
 state       | WA                   
 postcode    | 6935                 
 gender      | Female               
 consumer_id | 1195503              
only showing top 1 row



In [4]:
details = spark.read.parquet("../data/tables/consumer_user_details.parquet")

In [5]:
details.show(10, vertical=True)

-RECORD 0--------------
 user_id     | 1       
 consumer_id | 1195503 
-RECORD 1--------------
 user_id     | 2       
 consumer_id | 179208  
-RECORD 2--------------
 user_id     | 3       
 consumer_id | 1194530 
-RECORD 3--------------
 user_id     | 4       
 consumer_id | 154128  
-RECORD 4--------------
 user_id     | 5       
 consumer_id | 712975  
-RECORD 5--------------
 user_id     | 6       
 consumer_id | 407340  
-RECORD 6--------------
 user_id     | 7       
 consumer_id | 511685  
-RECORD 7--------------
 user_id     | 8       
 consumer_id | 448088  
-RECORD 8--------------
 user_id     | 9       
 consumer_id | 650435  
-RECORD 9--------------
 user_id     | 10      
 consumer_id | 1058499 
only showing top 10 rows



In [6]:
merchants = spark.read.parquet("../data/tables/tbl_merchants.parquet")

In [7]:
merchants.show(3, vertical=True)

-RECORD 0----------------------------
 name         | Felis Limited        
 tags         | ((furniture, home... 
 merchant_abn | 10023283211          
-RECORD 1----------------------------
 name         | Arcu Ac Orci Corp... 
 tags         | ([cable, satellit... 
 merchant_abn | 10142254217          
-RECORD 2----------------------------
 name         | Nunc Sed Company     
 tags         | ([jewelry, watch,... 
 merchant_abn | 10165489824          
only showing top 3 rows



In [8]:
transactions = spark.read.parquet("../data/tables/transactions_20210228_20210827_snapshot/")

                                                                                

In [9]:
transactions.show(1, vertical=True)

-RECORD 0------------------------------
 user_id        | 18478                
 merchant_abn   | 62191208634          
 dollar_value   | 63.255848959735246   
 order_id       | 949a63c8-29f7-4ab... 
 order_datetime | 2021-08-20           
only showing top 1 row



In [10]:
# Join consumers with their respective details
consumer_user = consumer.join(details, on="consumer_id")

In [11]:
# Join consumers with their respective transactions
consumer_transactions = consumer_user.join(transactions, on="user_id")

In [12]:
consumer_transactions.show(5, vertical=True)



-RECORD 0------------------------------
 user_id        | 7                    
 consumer_id    | 511685               
 name           | Andrea Jones         
 address        | 122 Brandon Cliff    
 state          | QLD                  
 postcode       | 4606                 
 gender         | Female               
 merchant_abn   | 33064796871          
 dollar_value   | 373.0873675184212    
 order_id       | fe188788-b89f-4dd... 
 order_datetime | 2021-08-20           
-RECORD 1------------------------------
 user_id        | 7                    
 consumer_id    | 511685               
 name           | Andrea Jones         
 address        | 122 Brandon Cliff    
 state          | QLD                  
 postcode       | 4606                 
 gender         | Female               
 merchant_abn   | 68435002949          
 dollar_value   | 232.5364986739752    
 order_id       | b4a89891-a113-45e... 
 order_datetime | 2021-08-20           
-RECORD 2------------------------------


                                                                                

In [13]:
# Join transactions with the respective merchants
consumer_merchants = consumer_transactions.join(merchants, on="merchant_abn")

In [14]:
consumer_merchants.show(5, vertical=True)



-RECORD 0------------------------------
 merchant_abn   | 33064796871          
 user_id        | 7                    
 consumer_id    | 511685               
 name           | Andrea Jones         
 address        | 122 Brandon Cliff    
 state          | QLD                  
 postcode       | 4606                 
 gender         | Female               
 dollar_value   | 373.0873675184212    
 order_id       | fe188788-b89f-4dd... 
 order_datetime | 2021-08-20           
 name           | Curabitur Massa C... 
 tags           | ((computer progra... 
-RECORD 1------------------------------
 merchant_abn   | 68435002949          
 user_id        | 7                    
 consumer_id    | 511685               
 name           | Andrea Jones         
 address        | 122 Brandon Cliff    
 state          | QLD                  
 postcode       | 4606                 
 gender         | Female               
 dollar_value   | 232.5364986739752    
 order_id       | b4a89891-a113-45e... 


                                                                                

In [16]:
# Explore product tags for purchase frequency
consumer_merchants.groupby("tags").count().show(truncate=False)

                                                                                

+-----------------------------------------------------------------------------------------------------------------+-----+
|tags                                                                                                             |count|
+-----------------------------------------------------------------------------------------------------------------+-----+
|((lawn and garden suppLy outlets, including nurseries), (a), (take rate: 6.02))                                  |148  |
|[[watch, clock, and jewelry repair shops], [a], [take rate: 5.93]]                                               |2502 |
|[[florists supplies, nursery stock, and flowers], [a], [take rate: 5.58]]                                        |163  |
|([computer programming , data processing, and integrated systems design services], [a], [take rate: 6.79])       |513  |
|([artist supply and craft shops], [a], [take rate: 6.04])                                                        |452  |
|((tent and awning shops

                                                                                