In [9]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

In [10]:
# Create (or reuse) the Spark session that runs every Spark job in this notebook.
# NOTE(review): getOrCreate() returns an already-running session if one exists;
# confirm the memory settings below are actually in effect after a kernel restart.
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    # Render a DataFrame when it is the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Pin the session time zone to UTC.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    # Key was previously misspelled as "spark.executer.memory", which is not a
    # Spark property, so the executor memory setting was never applied.
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Observations on the merged full dataset

In [11]:
# Load the curated, merged dataset from parquet and show its total row count.
full_data_path = "../data/curated/full_data/"
sdf = spark.read.parquet(full_data_path)
sdf.count()

7817737

In [4]:
# Print each column's name, data type and nullability for the merged DataFrame.
sdf.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- SA2_code: integer (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- consumer_id: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- mean_total_income: integer (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: double (nullable = true)



In [5]:
# Preview a single record, one field per line and without truncating values.
sdf.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------
 user_id           | 2                                    
 SA2_code          | 124011455                            
 postcode          | 2782                                 
 consumer_id       | 179208                               
 state             | NSW                                  
 gender            | Female                               
 mean_total_income | 61938                                
 merchant_abn      | 33607911449                          
 dollar_value      | 19.75306182342976                    
 order_id          | bd855c1e-d3bf-4312-ad6e-4796a42880e7 
 name              | Nulla Semper LLC                     
 tags              | florists supplies                    
 revenue_level     | b                                    
 take_rate         | 3.54                                 
only showing top 1 row



In [6]:
# Count the missing (null) values of every column in ONE aggregation pass.
# The previous version ran a separate filter + count job per column, i.e. one
# full scan of the dataset for each column.
# F.when(...) without otherwise() yields null for non-matching rows, and
# F.count ignores nulls, so each aggregate is the number of null entries.
null_counts = sdf.select(
    [F.count(F.when(F.col(col).isNull(), 1)).alias(col) for col in sdf.columns]
).first()

for col in sdf.columns:
    print(f"Number of missing values in {col}:", null_counts[col])

Number of missiong values in user_id: 7
Number of missiong values in SA2_code: 5208
Number of missiong values in postcode: 7
Number of missiong values in consumer_id: 7
Number of missiong values in state: 7
Number of missiong values in gender: 7
Number of missiong values in mean_total_income: 13307
Number of missiong values in merchant_abn: 0
Number of missiong values in dollar_value: 7
Number of missiong values in order_id: 7
Number of missiong values in name: 0
Number of missiong values in tags: 0
Number of missiong values in revenue_level: 0
Number of missiong values in take_rate: 0


In [7]:
# Rows with no consumer_id: 7 merchants do not have any transactions
# (antique or jewelry shops).
# Open question: can these merchants be removed?
sdf.filter(sdf["consumer_id"].isNull())

user_id,SA2_code,postcode,consumer_id,state,gender,mean_total_income,merchant_abn,dollar_value,order_id,name,tags,revenue_level,take_rate
,,,,,,,46537010521,,,Tempor Augue Ac C...,jewelry,c,2.18
,,,,,,,93267734067,,,Curae Foundation,jewelry,c,2.57
,,,,,,,76866488151,,,Euismod Urna Company,antique shops - s...,b,4.08
,,,,,,,55555661470,,,Nullam Scelerisqu...,jewelry,c,1.83
,,,,,,,28311306642,,,Egestas Nunc Sed LLC,antique shops - s...,b,4.93
,,,,,,,47047735645,,,Aenean Gravida In...,antique shops - s...,c,1.54
,,,,,,,55403018592,,,Elit Limited,antique shops - s...,b,4.84


In [8]:
# Rows that have no SA2 code, counted per postcode.
# Two postcodes do not have a corresponding SA2 code. The result also contains
# a null-postcode group; its size matches the number of rows that have no
# consumer/transaction fields at all.
# A bare `missing.count()` used to sit between these two statements; its result
# was discarded (only the last expression of a cell is displayed) while still
# triggering a Spark job, so it has been removed.
missing = sdf.filter(F.col("SA2_code").isNull()).groupBy("postcode").count()
missing

postcode,count
6958.0,3242
3989.0,1959
,7
