In [21]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

In [22]:
# Create a Spark session (the entry point that runs all Spark jobs in this notebook).
# - eagerEval renders DataFrames directly when they are a cell's last expression.
# - The session time zone is pinned to UTC so timestamps do not depend on the host.
# - Driver/executor memory are set explicitly. The executor key must be spelled
#   "spark.executor.memory": a misspelled key is not recognised by Spark and
#   leaves the executor memory at its default value.
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [23]:
# Load the curated, fully joined dataset and report how many rows it contains.
sdf = spark.read.format("parquet").load("../data/curated/full_data/")
sdf.count()

7817737

In [24]:
# Print the column names, data types and nullability of the loaded dataset.
sdf.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- SA2_code: integer (nullable = true)
 |-- postcode: integer (nullable = true)
 |-- consumer_id: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- mean_total_income: integer (nullable = true)
 |-- merchant_abn: long (nullable = true)
 |-- dollar_value: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- take_rate: double (nullable = true)



In [25]:
# Show one record, one field per line (vertical) and without truncating values.
sdf.show(1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------
 user_id           | 1                                    
 SA2_code          | 504031066                            
 postcode          | 6935                                 
 consumer_id       | 1195503                              
 state             | WA                                   
 gender            | Female                               
 mean_total_income | 62144                                
 merchant_abn      | 12771097467                          
 dollar_value      | 1536.539158652427                    
 order_id          | 94bea086-73d4-43a3-bf19-8657ee652896 
 name              | At Pretium Corp.                     
 tags              | motor                                
 revenue_level     | a                                    
 take_rate         | 6.95                                 
only showing top 1 row



In [26]:
# Count the missing (null) values of every column in a single pass over the
# data: one aggregate job instead of one filter+count job per column.
# F.when(...) without an otherwise() yields null for non-matching rows, and
# F.count() only counts non-null values, so each aggregate is the null count.
null_counts = sdf.select(
    [F.count(F.when(F.col(col).isNull(), 1)).alias(col) for col in sdf.columns]
).first()

for col in sdf.columns:
    print(f"Number of missing values in {col}:", null_counts[col])

Number of missiong values in user_id: 7
Number of missiong values in SA2_code: 5208
Number of missiong values in postcode: 7
Number of missiong values in consumer_id: 7
Number of missiong values in state: 7
Number of missiong values in gender: 7
Number of missiong values in mean_total_income: 7
Number of missiong values in merchant_abn: 0
Number of missiong values in dollar_value: 7
Number of missiong values in order_id: 7
Number of missiong values in name: 0
Number of missiong values in tags: 0
Number of missiong values in revenue_level: 0
Number of missiong values in take_rate: 0


In [7]:
# Rows without a consumer: 7 merchants do not have any transactions
# (antique or jewelry shops).
# TODO: decide whether these merchants can be removed.
sdf.where(sdf["consumer_id"].isNull())

user_id,SA2_code,postcode,consumer_id,state,gender,mean_total_income,merchant_abn,dollar_value,order_id,name,tags,revenue_level,take_rate
,,,,,,,76866488151,,,Euismod Urna Company,antique shops - s...,b,4.08
,,,,,,,55403018592,,,Elit Limited,antique shops - s...,b,4.84
,,,,,,,46537010521,,,Tempor Augue Ac C...,jewelry,c,2.18
,,,,,,,93267734067,,,Curae Foundation,jewelry,c,2.57
,,,,,,,55555661470,,,Nullam Scelerisqu...,jewelry,c,1.83
,,,,,,,28311306642,,,Egestas Nunc Sed LLC,antique shops - s...,b,4.93
,,,,,,,47047735645,,,Aenean Gravida In...,antique shops - s...,c,1.54


In [8]:
# Preview the merchant tag values (show() prints the first 20 rows by default).
sdf.select('tags').show()

+--------------------+
|                tags|
+--------------------+
|motor vehicle sup...|
|           opticians|
|digital goods: books|
|           opticians|
|          stationery|
|   florists supplies|
|               books|
|computer programm...|
|bicycle shops - s...|
|                gift|
|          shoe shops|
|            computer|
|                gift|
|           opticians|
|tent and awning s...|
|           opticians|
|tent and awning s...|
|           opticians|
|tent and awning s...|
|digital goods: books|
+--------------------+
only showing top 20 rows



In [9]:
# Rows with no SA2 code, counted per postcode.
# Two postcodes do not have a corresponding SA2 code; any additional group with
# a null postcode comes from rows that have no postcode at all.
# (The former bare `missing.count()` call was removed: its result was discarded
# because it was not the cell's last expression, so it only cost a Spark job.)
missing = sdf.filter(F.col("SA2_code").isNull()).groupBy("postcode").count()
missing

postcode,count
6958.0,3242
3989.0,1959
,7


In [14]:
# Number of rows per postcode whose mean_total_income is null.
sdf.where(F.isnull("mean_total_income")).groupBy("postcode").count()

postcode,count
6958.0,3242
2899.0,3210
6799.0,2918
3989.0,1959
6798.0,1971
,7


In [15]:
# Number of rows per postcode whose mean_total_income is exactly 0.
sdf.where(sdf["mean_total_income"] == 0).groupBy("postcode").count()

postcode,count


In [18]:
# Locations of the curated, external and raw transaction data.
output_path = "../data/curated/"
external_output_path = '../data/external/'
transaction_path = "../data/tables/transactions_*/*"

# Source tables: parquet for consumers/transactions, CSV (header row, inferred
# column types) for the processed lookup tables.
consumer_sdf = spark.read.parquet(f"{output_path}consumer")
transaction_sdf = spark.read.parquet(transaction_path)
postcode_SA2_sdf = spark.read.csv(
    f"{output_path}processed_postcode.csv", header=True, inferSchema=True
)
income_sdf = spark.read.csv(
    f"{output_path}processed_income.csv", header=True, inferSchema=True
)
merchant_sdf = spark.read.csv(
    f"{output_path}merchant.csv", header=True, inferSchema=True
)

# Plain dict mapping each state to its mean total income.
state_income = (
    pd.read_csv(f"{output_path}state_mean_income.csv")
    .set_index("state")
    .to_dict()["mean_total_income"]
)

# Attach an SA2 code to every consumer via the postcode, then the mean total
# income via the SA2 code. Left joins keep consumers that have no match.
sdf = (
    consumer_sdf
    .join(postcode_SA2_sdf, on=["postcode"], how="left")
    .join(income_sdf, on=["SA2_code"], how="left")
)

# Number of consumers per postcode whose joined mean_total_income is exactly 0.
sdf.where(sdf["mean_total_income"] == 0).groupBy("postcode").count()

postcode,count
5110,165
7468,149
6017,155
2845,164
6084,157
6079,140
3880,155
2021,141


In [19]:
# Number of consumers per postcode whose joined mean_total_income is null.
sdf.where(F.isnull("mean_total_income")).groupBy("postcode").count()

postcode,count
6798,177
6958,166
2899,175
3989,151
6799,132


In [20]:

# Fill missing total income values with the state mean.
# A value counts as missing when it is null or 0. Rows whose state is not
# listed in `abbrv` keep their original value.
abbrv = ['NSW', 'VIC','QLD', 'SA', 'WA', 'TAS', 'NT', 'ACT']

income = F.col("mean_total_income")
needs_fill = (income == 0) | income.isNull()

# Build ONE chained CASE WHEN expression covering every state and apply it with
# a single withColumn call. Calling withColumn inside a loop adds a projection
# per iteration and grows the query plan; the states are mutually exclusive, so
# a single expression yields the same values.
first_state, *other_states = abbrv
state_fill = F.when(
    needs_fill & (F.col("state") == first_state), state_income[first_state]
)
for state in other_states:
    state_fill = state_fill.when(
        needs_fill & (F.col("state") == state), state_income[state]
    )

# Anything not matched above (value present, or state not in `abbrv`) is kept.
sdf = sdf.withColumn("mean_total_income", state_fill.otherwise(income))