In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

In [2]:
# Create (or reuse) the Spark session that runs every Spark job in this notebook.
#   - eagerEval renders a DataFrame as a table when it is the last expression of a cell
#   - the session time zone is pinned to UTC so date/time values do not depend on the host
#   - driver / executor memory limits are set explicitly
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    # The key must be spelled "executor": a misspelled key is not a recognised
    # Spark property, so the memory setting would never be applied.
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [3]:
# Read the curated parquet data into a Spark DataFrame, then report its row count.
sdf = spark.read.format("parquet").load("../data/curated/full_data/")
sdf.count()

13614675

In [4]:
# Print the name, data type and nullability of every column in the loaded DataFrame.
sdf.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- SA2_code: integer (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- consumer_id: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- mean_total_income: integer (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_datetime: date (nullable = true)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: double (nullable = true)



In [5]:
# Preview a single record, one field per line and without truncating long values.
sdf.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------
 user_id           | 44                                   
 SA2_code          | 401041015                            
 postcode          | 5074                                 
 consumer_id       | 564558                               
 state             | SA                                   
 gender            | Undisclosed                          
 mean_total_income | 53613                                
 merchant_abn      | 10648956813                          
 dollar_value      | 68.17405810943993                    
 order_id          | 4d42fd2c-0823-4af6-be6a-244fe712c50c 
 order_datetime    | 2021-04-09                           
 name              | Proin Nisl Institute                 
 tags              | computer                             
 revenue_level     | a                                    
 take_rate         | 6.66                                 
only showing top 1 row



In [6]:
# Count the null values of every column in ONE aggregation job instead of
# launching a separate filter + count job (a full scan) per column.
# `when` without `otherwise` yields NULL for non-matching rows and `count`
# skips NULLs, so each aggregate is exactly the number of null entries
# (and 0 when there are none).
null_counts = sdf.select(
    [
        F.count(F.when(F.col(name).isNull(), 1)).alias(name)
        for name in sdf.columns
    ]
).first()

for col in sdf.columns:
    print(f"Number of missing values in {col}:", null_counts[col])

Number of missiong values in user_id: 0
Number of missiong values in SA2_code: 9020
Number of missiong values in postcode: 0
Number of missiong values in consumer_id: 0
Number of missiong values in state: 0
Number of missiong values in gender: 0
Number of missiong values in mean_total_income: 0
Number of missiong values in merchant_abn: 0
Number of missiong values in dollar_value: 0
Number of missiong values in order_id: 0
Number of missiong values in order_datetime: 0
Number of missiong values in name: 0
Number of missiong values in tags: 0
Number of missiong values in revenue_level: 0
Number of missiong values in take_rate: 0


In [7]:
# Transactions whose SA2 code is null, grouped by postcode with a row count each.
# Every row of `missing` is one postcode that has no corresponding SA2 code; the
# recorded output of this cell shows two such postcodes.
# (No separate `missing.count()` call here: its result was never displayed, so it
# only triggered an extra Spark job.)
missing = sdf.filter(F.col("SA2_code").isNull()).groupBy("postcode").count()
missing

postcode,count
6958,5644
3989,3376
