In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv

# Spark session for this notebook: settings are collected in one place and
# applied to the builder in the order listed.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "6g",
}

session_builder = SparkSession.builder.appName("analysis")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if one exists, otherwise starts a new one
spark = session_builder.getOrCreate()

22/09/19 22:09:41 WARN Utils: Your hostname, AryansLaptop resolves to a loopback address: 127.0.1.1; using 172.28.41.58 instead (on interface eth0)
22/09/19 22:09:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/19 22:09:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/19 22:09:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/09/19 22:09:44 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Postcodes

In [2]:
# TODO: move this download step out of the notebook and into a script.
# Fetch the Australian postcode lookup table (postcode, locality, SA2 codes, ...)
# and store it alongside the other raw tables.
# NOTE(review): the URL tracks the `master` branch, so the file contents (and
# any row counts derived from it) can change between runs - consider pinning
# to a specific commit.
url = "https://raw.githubusercontent.com/matthewproctor/australianpostcodes/master/australian_postcodes.csv"
file_path = "../data/tables/australian_postcodes.csv"

# urlretrieve does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout (no-op if already present).
os.makedirs(os.path.dirname(file_path), exist_ok=True)
urlretrieve(url, file_path)

('../data/tables/australian_postcodes.csv',
 <http.client.HTTPMessage at 0x7f03e2c4e110>)

In [3]:
# Full postcode lookup table, read with the first CSV row as column names.
# `postcodedf` is the frame later used for merging.
postcodedf = spark.read.csv("../data/tables/australian_postcodes.csv", header="true")

# `testpostcodedf` is a narrower view used for the income analysis:
# SA2 code (renamed to `sa2_code`), locality, state and SA2 name.
testpostcodedf = (
    postcodedf
    .select("SA2_MAINCODE_2016", "locality", "state", "SA2_NAME_2016")
    .withColumnRenamed("SA2_MAINCODE_2016", "sa2_code")
)

postcodedf.limit(5)

22/09/19 22:09:50 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


id,postcode,locality,state,long,lat,dc,type,status,sa3,sa3name,sa4,sa4name,region,Lat_precise,Long_precise,SA1_MAINCODE_2011,SA1_MAINCODE_2016,SA2_MAINCODE_2016,SA2_NAME_2016,SA3_CODE_2016,SA3_NAME_2016,SA4_CODE_2016,SA4_NAME_2016,RA_2011,RA_2016,MMM_2015,MMM_2019,ced,altitude,chargezone,phn_code,phn_name,lgaregion,electorate,electoraterating
230,200,ANU,ACT,149.119,-35.2777,,,,,,,,R1,-35.2777,149.119,80105104901,80105104901,801051049,Acton,80105,North Canberra,801,Australian Capita...,1,1,1,1,,,N2,,,,Durack,
21820,200,Australian Nation...,ACT,149.1189,-35.2777,,,Added 19-Jan-2020,,,,,R1,-35.2776999,149.118527,80105104901,80105104901,801051049,Acton,80105,North Canberra,801,Australian Capita...,1,1,1,1,,,N2,,,,Durack,
232,800,DARWIN,NT,130.83668,-12.458684,,,Updated 6-Feb-2020,70101.0,Darwin City,701.0,Darwin,R1,-12.3932794,130.7766611,70101100203,70101100218,701011002,Darwin City,70101,Darwin City,701,Darwin,3,3,2,2,,,NT1,PHN701,Northern Territory,Darwin,Solomon,Inner Metropolitan
24049,800,DARWIN CITY,NT,130.83668,-12.458684,,,Updated 6-Feb-2020,70101.0,Darwin City,701.0,Darwin,R1,-12.3932794,130.7766611,70101100203,70101100218,701011002,Darwin City,70101,Darwin City,701,Darwin,3,3,2,2,,,NT1,PHN701,Northern Territory,Darwin,Solomon,Inner Metropolitan
233,801,DARWIN,NT,130.83668,-12.458684,,,Updated 25-Mar-20...,70101.0,Darwin City,701.0,Darwin,R1,-12.4634403,130.8456418,70101100208,70101100208,701011002,Darwin City,70101,Darwin City,701,Darwin,3,3,2,2,,,NT1,PHN701,,,Lingiari,Rural


In [4]:
# Persist the income-analysis postcode table, replacing any previous copy
(
    testpostcodedf.write
    .mode("overwrite")
    .parquet('../data/curated/testpostcodedf.parquet')
)

                                                                                

In [5]:
from pyspark.sql import Row

# Reduce the lookup table to the postcode -> SA2 mapping and report how many
# rows survive each cleaning step.
postcode_sa2_raw = (
    postcodedf
    .select("postcode", "SA2_MAINCODE_2016")
    .withColumnRenamed("SA2_MAINCODE_2016", "sa2_code")
)
print(postcode_sa2_raw.count())

# Drop rows where either the postcode or the SA2 code is missing
postcode_sa2_complete = postcode_sa2_raw.dropna(how="any")
print(postcode_sa2_complete.count())

# Collapse repeated (postcode, sa2_code) pairs
postcodedf = postcode_sa2_complete.distinct()
print(postcodedf.count())

# Number of distinct SA2 codes and distinct postcodes left in the mapping
n_sa2_codes = postcodedf.select("sa2_code").distinct().count()
n_postcodes = postcodedf.select("postcode").distinct().count()
print(n_sa2_codes, n_postcodes)

postcodedf.limit(5)

18442
18265
5492
2221 3165


postcode,sa2_code
1008,117031337
1150,117031337
2100,122031429
2200,119011571
2338,110041201


The dataset contains 2221 distinct SA2 codes; there are actually 2310, so 89 are missing. <br>
The dataset contains 3165 distinct postcodes; there are actually 3333, so 168 are missing. <br>
We could not find another dataset that covers every postcode and SA2 code, so these regions have to be excluded from the analysis.

In [6]:
# Load the curated merged table and report its size
sdf = spark.read.parquet("../data/curated/mergedf.parquet/")
print(sdf.count())

# Attach an sa2_code to every row by inner-joining on postcode.
# A postcode that maps to several SA2 codes yields one output row per mapping.
mergedf = sdf.join(postcodedf, on="postcode", how="inner")
print(mergedf.count())

# Distinct SA2 codes and postcodes remaining after the join
n_sa2_codes = mergedf.select("sa2_code").distinct().count()
n_postcodes = mergedf.select("postcode").distinct().count()
print(n_sa2_codes, n_postcodes)

mergedf.limit(10)

13614675


                                                                                

23253251


                                                                                

2154 3113


                                                                                

postcode,merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,sa2_code
5581,69666829657,226,5162,79.65159982605903,8765ef9f-dba6-407...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,Female,Sem Ut Institute,bicycle shops - s...,2.86,c,405041127
5581,50321300271,226,5162,594.2915496790856,79f2842d-f8b2-4fd...,2022-06-01,Courtney Torres,6945 Higgins Brooks,SA,Female,Augue Industries,bicycle shops - s...,4.24,b,405041127
5581,70610974780,226,5162,52.820359204536665,bdf345c8-4c5d-48e...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,Female,Elit Aliquam PC,tent and awning s...,6.93,a,405041127
5581,17324645993,226,5162,27.12729568273566,0a44d623-e325-4fc...,2022-05-12,Courtney Torres,6945 Higgins Brooks,SA,Female,Eget Metus In Cor...,tent and awning s...,5.73,a,405041127
5581,15582655078,226,5162,75.31904078962366,47ddf8e0-5f72-408...,2021-08-21,Courtney Torres,6945 Higgins Brooks,SA,Female,Eu Odio Phasellus...,"gift, card, novel...",6.77,a,405041127
5581,52160665475,226,5162,224.2530582904585,0558f853-fa82-4bf...,2022-03-26,Courtney Torres,6945 Higgins Brooks,SA,Female,Mauris Associates,digital goods: bo...,6.88,a,405041127
5581,21439773999,226,5162,22.910510985569918,5af069a2-594a-4a1...,2021-08-21,Courtney Torres,6945 Higgins Brooks,SA,Female,Mauris Non Institute,"cable, satellite,...",6.1,a,405041127
5581,19854089605,226,5162,92.7043931796329,0abb6acd-05ef-492...,2022-04-20,Courtney Torres,6945 Higgins Brooks,SA,Female,Aenean Massa Indu...,"gift, card, novel...",3.15,b,405041127
5581,66842618444,226,5162,97.39085253030449,356f3038-5bd9-45a...,2021-08-19,Courtney Torres,6945 Higgins Brooks,SA,Female,Ultrices Inc.,shoe shops,6.93,a,405041127
5581,46804135891,226,5162,11.22535503758106,e85a5baf-fc13-407...,2022-03-25,Courtney Torres,6945 Higgins Brooks,SA,Female,Suspendisse Dui C...,"opticians, optica...",2.93,c,405041127


As we can see, rows were lost when joining on the `merchant_abn` column of the merchant dataset. <br>
This means that some of the merchant ABNs in the transactions were not present in the merchant data table. <br>
These transactions can be removed, as we don't know the merchants' company tags, take rates or revenue bands, which are all useful features. <br>
Removing these merchants also removed some postcodes/SA2 codes from the joined data. <br>
Also, the transactions originally covered only 3167 of the 3333 postcodes. <br>
<br>
The number of rows goes up when joining with the postcodes because a postcode can belong to multiple SA2 regions. <br>
For the time being, we are leaving this as is.

In [7]:
# Persist the cleaned postcode -> sa2_code mapping, replacing any previous copy
(
    postcodedf.write
    .mode("overwrite")
    .parquet('../data/curated/postcodedf.parquet')
)