# Analysis

## Postcode Dataset

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import Row

# Build (or reuse) the Spark session that the rest of the notebook relies on.
# The settings live in one mapping and are applied to the builder in order.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "6g",
}

session_builder = SparkSession.builder.appName("analysis")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [18]:
# Raw postcode lookup table (first CSV row is the header); used for merging
postcodedf = spark.read.csv("../data/tables/australian_postcodes.csv", header=True)

# Subset used for the income analysis, with the SA2 code column renamed
testpostcodedf = (
    postcodedf
    .select("SA2_MAINCODE_2016", "locality", "state", "SA2_NAME_2016")
    .withColumnRenamed("SA2_MAINCODE_2016", "sa2_code")
)

postcodedf.limit(5)

id,postcode,locality,state,long,lat,dc,type,status,sa3,sa3name,sa4,sa4name,region,Lat_precise,Long_precise,SA1_MAINCODE_2011,SA1_MAINCODE_2016,SA2_MAINCODE_2016,SA2_NAME_2016,SA3_CODE_2016,SA3_NAME_2016,SA4_CODE_2016,SA4_NAME_2016,RA_2011,RA_2016,MMM_2015,MMM_2019,ced,altitude,chargezone,phn_code,phn_name,lgaregion,electorate,electoraterating
230,200,ANU,ACT,149.119,-35.2777,,,,,,,,R1,-35.2777,149.119,80105104901,80105104901,801051049,Acton,80105,North Canberra,801,Australian Capita...,1,1,1,1,,,N2,,,,Durack,
21820,200,Australian Nation...,ACT,149.1189,-35.2777,,,Added 19-Jan-2020,,,,,R1,-35.2776999,149.118527,80105104901,80105104901,801051049,Acton,80105,North Canberra,801,Australian Capita...,1,1,1,1,,,N2,,,,Durack,
232,800,DARWIN,NT,130.83668,-12.458684,,,Updated 6-Feb-2020,70101.0,Darwin City,701.0,Darwin,R1,-12.3932794,130.7766611,70101100203,70101100218,701011002,Darwin City,70101,Darwin City,701,Darwin,3,3,2,2,,,NT1,PHN701,Northern Territory,Darwin,Solomon,Inner Metropolitan
24049,800,DARWIN CITY,NT,130.83668,-12.458684,,,Updated 6-Feb-2020,70101.0,Darwin City,701.0,Darwin,R1,-12.3932794,130.7766611,70101100203,70101100218,701011002,Darwin City,70101,Darwin City,701,Darwin,3,3,2,2,,,NT1,PHN701,Northern Territory,Darwin,Solomon,Inner Metropolitan
233,801,DARWIN,NT,130.83668,-12.458684,,,Updated 25-Mar-20...,70101.0,Darwin City,701.0,Darwin,R1,-12.4634403,130.8456418,70101100208,70101100208,701011002,Darwin City,70101,Darwin City,701,Darwin,3,3,2,2,,,NT1,PHN701,,,Lingiari,Rural


In [19]:
# Store the income-analysis subset as parquet, replacing any existing copy
testpostcodedf.write.parquet('../data/curated/testpostcodedf.parquet', mode="overwrite")

                                                                                

In [20]:
# Narrow the table to the postcode -> SA2 mapping and rename the SA2 column.
# NOTE: postcodedf is reassigned here, so this cell can only be re-run after
# the cell that loads the CSV (SA2_MAINCODE_2016 no longer exists afterwards).
postcodedf = (
    postcodedf
    .select("postcode", "SA2_MAINCODE_2016")
    .withColumnRenamed("SA2_MAINCODE_2016", "sa2_code")
)
print(postcodedf.count())

# Drop every row that has a null in either column
postcodedf = postcodedf.dropna(how="any")
print(postcodedf.count())

# Remove duplicate (postcode, sa2_code) pairs, then count the unique values
# of each column
postcodedf = postcodedf.distinct()
print(postcodedf.count())
n_sa2_codes = postcodedf.select("sa2_code").distinct().count()
n_postcodes = postcodedf.select("postcode").distinct().count()
print(n_sa2_codes, n_postcodes)

postcodedf.limit(5)

18442
18265
5492
2221 3165


postcode,sa2_code
1008,117031337
1150,117031337
2100,122031429
2200,119011571
2338,110041201


The dataset contains 2221 SA2 codes, whereas 2310 actually exist, so 89 are missing <br>
The dataset contains 3165 postcodes, whereas 3333 actually exist, so 168 are missing <br>
No other available dataset covers every postcode and SA2 code, so the missing regions have to be excluded from the analysis

In [21]:
# Attach SA2 codes to the merged transaction data and print the row counts
# before and after the join so the effect of the join is visible.
sdf = spark.read.parquet("../data/curated/mergedf.parquet/")
print(sdf.count())

mergedf = sdf.join(postcodedf, on="postcode", how="inner")
print(mergedf.count())

# Unique SA2 codes and postcodes that survive the join
merged_sa2_total = mergedf.select("sa2_code").distinct().count()
merged_postcode_total = mergedf.select("postcode").distinct().count()
print(merged_sa2_total, merged_postcode_total)

mergedf.limit(10)

13613661


                                                                                

23251565


                                                                                

2154 3113


                                                                                

postcode,merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,sa2_code
3612,94472466107,30,13842,36.09451992152847,0dc80e20-901c-410...,2021-08-20,Tamara Stewart,352 Jessica Summit,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,216011410
3612,94472466107,30,13842,36.09451992152847,0dc80e20-901c-410...,2021-08-20,Tamara Stewart,352 Jessica Summit,VIC,Female,Eu Dolor Egestas PC,"cable, satellite,...",6.23,a,204011058
3612,21532935983,30,13842,71.1148505207073,5fbb2316-39b7-43b...,2021-08-21,Tamara Stewart,352 Jessica Summit,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,216011410
3612,21532935983,30,13842,71.1148505207073,5fbb2316-39b7-43b...,2021-08-21,Tamara Stewart,352 Jessica Summit,VIC,Female,Eleifend Nec Inco...,"cable, satellite,...",5.58,a,204011058
3612,60956456424,30,13842,56.52469841268393,60bc5068-e775-4c4...,2021-08-19,Tamara Stewart,352 Jessica Summit,VIC,Female,Ultricies Digniss...,"gift, card, novel...",4.69,b,216011410
3612,60956456424,30,13842,56.52469841268393,60bc5068-e775-4c4...,2021-08-19,Tamara Stewart,352 Jessica Summit,VIC,Female,Ultricies Digniss...,"gift, card, novel...",4.69,b,204011058
3612,39211701585,30,13842,105.80444352294496,810594a7-c21a-4dd...,2021-08-22,Tamara Stewart,352 Jessica Summit,VIC,Female,Diam Eu Dolor PC,shoe shops,4.76,b,216011410
3612,39211701585,30,13842,105.80444352294496,810594a7-c21a-4dd...,2021-08-22,Tamara Stewart,352 Jessica Summit,VIC,Female,Diam Eu Dolor PC,shoe shops,4.76,b,204011058
3612,27326652377,30,13842,1179.908032136875,7ef554a5-02a8-435...,2021-08-22,Tamara Stewart,352 Jessica Summit,VIC,Female,Tellus Aenean Cor...,music shops - mus...,6.33,a,216011410
3612,27326652377,30,13842,1179.908032136875,7ef554a5-02a8-435...,2021-08-22,Tamara Stewart,352 Jessica Summit,VIC,Female,Tellus Aenean Cor...,music shops - mus...,6.33,a,204011058


As we can see, rows were lost when joining on the "merchant abn" column of the merchant dataset <br>
This means that some of the merchant ABNs in the transactions were not present in the merchant data table <br>
These records can be removed, as we don't know their company tags/take_rate/revenue_bands, which are all useful features <br>
Removing these merchants also removed some postcodes/sa2_codes via the join <br>
Also, the transactions originally contained only 3167 of the 3333 postcodes <br>
<br>
The number of rows goes up when combining with postcodes because a postcode can belong to multiple SA2 regions <br>
For the time being, this duplication is left as is

In [22]:
# Store this first postcode dataset as parquet (replacing any existing copy)
# so it is available for later comparison
postcodedf.write.parquet('../data/curated/postcodedf.parquet', mode="overwrite")

## Postcode Ratio Dataset

In [23]:
# Load the curated postcode ratio table
postcode_ratio_sdf = spark.read.format("parquet").load("../data/curated/postcode_ratio_sdf.parquet")

In [24]:
# Preview five rows of the postcode ratio table
postcode_ratio_sdf.limit(5)

postcode,sa2_code,sa2_name,ratio
2478,112011242,Lennox Head - Ske...,0.2759902
2479,112011237,Ballina Region,0.2624605
2479,112011238,Bangalow,0.7372028
2479,112011240,Byron Bay,0.0003367
2480,112011237,Ballina Region,0.0006535


In [25]:
def count_unique(frame, column):
    """Return the number of distinct values found in `column` of `frame`.

    Args:
        frame: Spark DataFrame to inspect.
        column: Name of the column whose distinct values are counted.

    Returns:
        int: Number of distinct values in the column. A null value, if
        present, is counted once (the same as a null group in groupBy).
    """
    # De-duplicating on the single column is enough: a full-row distinct()
    # beforehand costs an extra shuffle without changing the answer.
    return frame.select(column).distinct().count()


sa2_count = count_unique(postcode_ratio_sdf, "sa2_code")
postcode_count = count_unique(postcode_ratio_sdf, "postcode")

print(sa2_count, postcode_count)

2162 2653


Obtained from the dataset:
- 2162 unique SA2 codes out of the 2310 that exist (148 missing)
- 2653 unique postcodes out of the 3333 that exist (680 missing)