# Pre-Processing

In [1]:
# Root data directory, relative to this notebook; every input and output path below is built from it
RELATIVE_DIR = "../data/"

## Given Data

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv

# Start (or reuse) the Spark session used throughout this notebook.
# Settings are kept in one mapping so they are easy to scan and tune.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "6g",
}

session_builder = SparkSession.builder.appName("preprocessing")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

22/10/05 13:02:01 WARN Utils: Your hostname, DESKTOP-JJJD94T resolves to a loopback address: 127.0.1.1; using 192.168.177.17 instead (on interface eth0)
22/10/05 13:02:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/05 13:02:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/05 13:02:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/05 13:02:04 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/10/05 13:02:04 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
# The transactions are delivered as three consecutive snapshot folders.
transaction_snapshots = [
    "transactions_20210228_20210827_snapshot",
    "transactions_20210828_20220227_snapshot",
    "transactions_20220228_20220828_snapshot",
]

# Stack the snapshots into a single dataframe. unionByName matches columns by
# name rather than by position, so a snapshot written with a different column
# order cannot silently put values into the wrong column.
transactiondf = None
for snapshot_name in transaction_snapshots:
    snapshot_df = spark.read.parquet(f"{RELATIVE_DIR}tables/{snapshot_name}/")
    if transactiondf is None:
        transactiondf = snapshot_df
    else:
        transactiondf = transactiondf.unionByName(snapshot_df)

transactiondf.limit(5)

                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20


In [4]:
# Persist the combined transactions under curated/, replacing any output from a previous run
transactiondf.write.mode("overwrite").parquet(f'{RELATIVE_DIR}curated/transactiondf.parquet')

                                                                                

In [5]:
# Lookup table linking each user_id to its consumer_id
user_details_path = f"{RELATIVE_DIR}tables/consumer_user_details.parquet"
userdf = spark.read.parquet(user_details_path)
userdf.limit(5)

user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975


In [6]:
# Consumer details are stored as a pipe-delimited CSV with a header row.
# "name" is renamed so it stays distinguishable from the merchant name after joining.
consumerdf = (
    spark.read
    .csv(f"{RELATIVE_DIR}tables/tbl_consumer.csv", sep="|", header=True)
    .withColumnRenamed("name", "customer_name")
)
consumerdf.limit(5)

customer_name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975


In [7]:
merchantdf = (
    spark.read.parquet(f"{RELATIVE_DIR}tables/tbl_merchants.parquet")
    .withColumnRenamed("name", "company_name")
)

# Replace all square brackets with round brackets so both bracket styles
# can be parsed the same way
merchantdf = merchantdf.withColumn('tags', regexp_replace('tags', '\\[', '\\('))
merchantdf = merchantdf.withColumn('tags', regexp_replace('tags', '\\]', '\\)'))


def _strip_brackets(column):
    """Remove every round bracket from a string column and trim the result.

    Splitting the raw tags on "),": leaves a leading space on the second and
    third pieces, so trimming is needed to get clean values such as "e"
    instead of " e".
    """
    return trim(regexp_replace(column, '[()]', ''))


# The raw tags hold three bracketed groups separated by "),":
#   0 -> free-text description, 1 -> revenue band, 2 -> "take rate: <number>"
tag_groups = split(col("tags"), "\\),")

merchantdf = (
    merchantdf
    # Extract take rate into a separate column (still a string; cast downstream if needed)
    .withColumn(
        "take_rate",
        _strip_brackets(regexp_replace(tag_groups.getItem(2), 'take rate: ', '')),
    )
    # Extract revenue band
    .withColumn("revenue_band", _strip_brackets(tag_groups.getItem(1)))
    # Keep only the description in tags: no brackets, single spaces, lower case.
    # This column is overwritten last because the expressions above read the raw tags.
    .withColumn(
        "tags",
        lower(regexp_replace(_strip_brackets(tag_groups.getItem(0)), ' +', ' ')),
    )
)

merchantdf.limit(5)

company_name,tags,merchant_abn,take_rate,revenue_band
Felis Limited,"furniture, home f...",10023283211,0.18,e
Arcu Ac Orci Corp...,"cable, satellite,...",10142254217,4.22,b
Nunc Sed Company,"jewelry, watch, c...",10165489824,4.4,b
Ultricies Digniss...,"watch, clock, and...",10187291046,3.29,b
Enim Condimentum PC,music shops - mus...,10192359162,6.33,a


In [8]:
# Persist the cleaned merchant table under curated/, replacing any output from a previous run
merchantdf.write.mode("overwrite").parquet(f'{RELATIVE_DIR}curated/merchantdf.parquet')

                                                                                

In [9]:
def _join_and_report(left, right, key, left_count):
    """Inner-join ``left`` with ``right`` on ``key`` and print the row counts.

    Prints ``left_count`` together with the row count of ``right``, then the
    row count of the joined dataframe, so any rows dropped by the join are
    visible.

    Returns the joined dataframe and its row count. The count is returned so
    the caller can reuse it instead of triggering another Spark job.
    """
    print(left_count, right.count())
    joined = left.join(right, key)
    joined_count = joined.count()
    print(joined_count)
    return joined, joined_count


# Check no rows dropped when combining transactions with user
mergedf, merged_count = _join_and_report(
    transactiondf, userdf, "user_id", transactiondf.count()
)
print("\n")

# Check no rows dropped when combining with consumer
mergedf, merged_count = _join_and_report(
    mergedf, consumerdf, "consumer_id", merged_count
)
print("\n")

# Check no rows dropped when combining with merchant
mergedf, merged_count = _join_and_report(
    mergedf, merchantdf, "merchant_abn", merged_count
)

                                                                                

14195505 499999


                                                                                

14195505




                                                                                

14195505 499999


                                                                                

14195505




                                                                                

14195505 4026




13614675


                                                                                

We can see that the number of rows drops from 14195505 to 13614675. Since this join was on merchant_abn, either merchantdf is missing those merchants or the merged dataframe contains invalid merchant_abn values.

In [10]:
# Persist the joined transaction/user/consumer/merchant table under curated/, replacing any previous output
mergedf.write.mode("overwrite").parquet(f'{RELATIVE_DIR}curated/mergedftemp.parquet')

                                                                                

## Population Data

In [11]:
# Spreadsheet rows to ignore when reading the population sheet
skip = [*range(7), 8, *range(2481, 2490), 2480]

AGE_FIELDS_COUNT = 18
AGE_UB = 85
AGE_RANGE = 4

# Numbered variants of the "no." header: "no..1" ... "no..18"
# (presumably how pandas labels the repeated "no." columns; confirm against the sheet)
numbered_fields = [f"no..{n}" for n in range(1, AGE_FIELDS_COUNT + 1)]

# Columns that get a readable name, and the full set of columns kept from the sheet
fields_2b_renamed = ['S/T name', 'no.', *numbered_fields]
field_names = ['S/T name', 'SA2 code', 'SA2 name', 'no.', *numbered_fields]

# One label per age band: "age 0-4", "age 5-9", ..., "age 80-84", "age 85+"
age_labels = [
    f"age {band_start}+" if band_start == AGE_UB
    else f"age {band_start}-{band_start + AGE_RANGE}"
    for band_start in range(0, AGE_UB + 1, AGE_RANGE + 1)
]
rename_to = ['State/Terr', *age_labels, 'Total']

# Original header -> readable name
rename_cols = dict(zip(fields_2b_renamed, rename_to))

In [12]:
import pandas as pd

# Read the population sheet, keep only the columns of interest and give the
# age-band columns readable names.
# Columns are selected with [] rather than .get(): .get() returns None when a
# column is missing, which would surface as an unrelated AttributeError on
# .rename, whereas [] raises a KeyError naming the missing columns.
pop_df = pd \
    .read_excel(
        f'{RELATIVE_DIR}tables/population.xlsx',
        sheet_name = 'Table 3',
        skiprows = skip,
    )[field_names] \
    .rename(columns = rename_cols)

# Preview only; avoid rendering the whole table
pop_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/tables/population.xlsx'

Create custom categories for better interpretation:
- old: 60+
- middle age: 35-59
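- young adult: 20-34 (nominally 18-34; the population data is only available in 5-year age bands)
- adolescent: 10-19 (nominally 10-17; rounded to the same 5-year bands)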
- under 10: 0-9

Note: the retirement age in Australia is 66

In [None]:
# Collect every age-band column so they can be dropped after aggregation
drop_cols = [column for column in pop_df.columns if column.startswith('age')]

In [None]:
# The population columns are 5-year bands ("age 0-4", ..., "age 80-84",
# "age 85+"), so every group must start on a multiple of 5. A start of 18
# would generate "age 18-22", which is not a column and raises KeyError.
# Adolescent therefore covers 10-19 and Young adult 20-34.
BAND_WIDTH = 5
OPEN_BAND_START = 85  # lower bound of the open-ended "age 85+" band

# Half-open [start, stop) age range for each custom group
groups = {
    'Under 10': [0, 10],
    'Adolescent': [10, 20],
    'Young adult': [20, 35],
    'Middle age': [35, 60],
    'Old': [60, 86]
}

for group, (start_age, stop_age) in groups.items():
    if start_age % BAND_WIDTH != 0:
        raise ValueError(
            f"Group '{group}' starts at {start_age}, which is not on a "
            f"{BAND_WIDTH}-year band boundary"
        )

    # Add up the band columns that fall inside this group
    age_sum = 0
    for band_start in range(start_age, stop_age, BAND_WIDTH):

        if band_start == OPEN_BAND_START:
            age_range_str = f"age {band_start}+"
        else:
            age_range_str = f"age {band_start}-{band_start + BAND_WIDTH - 1}"

        age_sum += pop_df[age_range_str]

    pop_df[group] = age_sum

# Drop all columns containing age
pop_df_mod = pop_df.drop(columns=drop_cols)

In [None]:
# Let pandas pick the best nullable dtype for each column
# (intended to turn the population counts into integers)
pop_df_mod = pop_df_mod.convert_dtypes()

# Report how many rows contain missing values before removing them.
# A bare expression in the middle of a cell is never displayed, so the
# number is printed explicitly.
na_rows = pop_df_mod[pop_df_mod.isnull().any(axis=1)]
print(f"Dropping {len(na_rows)} rows containing NA values")

pop_df_mod = pop_df_mod.dropna()

21 rows containing NA values were dropped here, since they were either totals or rows of entirely null values.

Convert population pandas dataframe to spark dataframe for later integration.

In [None]:
from pyspark.sql.types import *

# Spark schema for the population table: three text identifiers followed by
# integer head counts, in the same order as the pandas columns
text_fields = ["State/Terr", "SA2 code", "SA2 name"]
count_fields = ["Total", "Under 10", "Adolescent", "Young adult", "Middle age", "Old"]

mySchema = StructType(
    [StructField(field, StringType()) for field in text_fields]
    + [StructField(field, IntegerType()) for field in count_fields]
)

In [None]:
# Build the Spark dataframe from the cleaned pandas table using the explicit schema
pop_sdf = spark.createDataFrame(pop_df_mod, schema=mySchema)

In [None]:
# Preview the first rows of the population Spark dataframe
pop_sdf.limit(5)

                                                                                

State/Terr,SA2 code,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old
New South Wales,101021007,Braidwood,4330,473,403,495,1472,1487
New South Wales,101021008,Karabar,8546,1082,1075,1818,2858,1713
New South Wales,101021009,Queanbeyan,11370,1275,916,3129,3681,2369
New South Wales,101021010,Queanbeyan - East,5093,588,406,1460,1718,921
New South Wales,101021012,Queanbeyan West -...,12743,1796,1910,2266,4933,1838


In [None]:
# Persist the population table under curated/, replacing any output from a previous run
pop_sdf.write.mode("overwrite").parquet(f'{RELATIVE_DIR}curated/pop_sdf.parquet')

## Postcode Ratio Data

In [None]:
# Rows to ignore in the postcode/SA2 correspondence sheet. A dedicated name is
# used so the `skip` list of the population section is not overwritten, which
# would make re-running the population read use the wrong rows.
postcode_skip_rows = list(range(5)) + [6]

# POSTCODE and SA2_MAINCODE_2011 are read as strings so the codes keep their
# exact text (e.g. leading zeros in postcodes).
postcode_ratio_df = (
    pd.read_excel(
        f'{RELATIVE_DIR}tables/1270055006_CG_POSTCODE_2011_SA2_2011.xls',
        sheet_name='Table 3',
        skiprows=postcode_skip_rows,
        converters={'POSTCODE': str, 'SA2_MAINCODE_2011': str},
    )
    .drop(columns=['POSTCODE.1', 'PERCENTAGE'])
    .dropna(axis=0, how='any')  # removes footer only (no NA values present in dataset)
)


In [None]:
# Display the postcode-to-SA2 correspondence table (pandas shows a truncated view)
postcode_ratio_df

Unnamed: 0,POSTCODE,SA2_MAINCODE_2011,SA2_NAME_2011,RATIO
0,0800,701011002,Darwin City,1.000000
1,0810,701021010,Alawa,0.071997
2,0810,701021013,Brinkin - Nakara,0.096392
3,0810,701021016,Coconut Grove,0.096494
4,0810,701021018,Jingili,0.061562
...,...,...,...,...
5983,7466,604031097,West Coast (Tas.),1.000000
5984,7467,604031097,West Coast (Tas.),1.000000
5985,7468,604031097,West Coast (Tas.),1.000000
5986,7469,604031097,West Coast (Tas.),1.000000


In [None]:
# Convert to spark dataframe

# A dedicated schema name is used instead of reassigning `mySchema`, so the
# population schema stays intact and the population conversion cell can still
# be re-run after this one.
postcode_ratio_schema = StructType([
    StructField("postcode", StringType()),
    StructField("sa2_code", StringType()),
    StructField("sa2_name", StringType()),
    StructField("ratio", FloatType())
])

postcode_ratio_sdf = spark.createDataFrame(
    postcode_ratio_df,
    postcode_ratio_schema
)

In [None]:
# Preview the first rows of the postcode ratio Spark dataframe
postcode_ratio_sdf.limit(5)

postcode,sa2_code,sa2_name,ratio
800,701011002,Darwin City,1.0
810,701021010,Alawa,0.0719971
810,701021013,Brinkin - Nakara,0.0963918
810,701021016,Coconut Grove,0.0964936
810,701021018,Jingili,0.061562


In [None]:
# Persist the postcode ratio table under curated/, replacing any output from a previous run
postcode_ratio_sdf.write.mode("overwrite").parquet(f'{RELATIVE_DIR}curated/postcode_ratio_sdf.parquet')

22/09/26 21:22:56 WARN MemoryManager: Total allocation exceeds 95.00% (1,813,485,955 bytes) of heap memory
Scaling row group sizes to 96.51% for 14 writers


## External Dataset Sources

SA2 Shapefile: https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files <br>
SA2 + Postcode datasets: <br>
https://www.matthewproctor.com/australian_postcodes?simple=True <br>
https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/1270.0.55.006July%202011?OpenDocument <br>
External income dataset: https://data.aurin.org.au/dataset/au-govt-abs-abs-personal-income-total-income-sa2-2011-2018-sa2-2016