In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F

# Build (or reuse) the Spark session used by every cell below.
# NOTE(review): no master is configured here; if this runs in local mode the
# executor memory setting is probably unused and the 2g driver limit is the
# effective one -- confirm against the deployment.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "6g",
}

session_builder = SparkSession.builder.appName("analysis")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

22/09/21 05:52:55 WARN Utils: Your hostname, DESKTOP-JJJD94T resolves to a loopback address: 127.0.1.1; using 172.26.254.211 instead (on interface eth0)
22/09/21 05:52:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/21 05:52:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/21 05:52:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Load the curated postcode -> SA2 lookup, report its row count, then preview it.
postcode_path = "../data/curated/postcodedf.parquet/"
postcode_sdf = spark.read.parquet(postcode_path)
print(postcode_sdf.count())
postcode_sdf.limit(5)

                                                                                

5492


postcode,sa2_code
1008,117031337
1150,117031337
2100,122031429
2200,119011571
2338,110041201


## Join postcode data with population data

In [3]:
# Load the curated population table and report its row count.
pop_sdf = spark.read.parquet("../data/curated/pop_sdf.parquet/")
print(pop_sdf.count())

# Rename the key column so it matches the "sa2_code" column of the postcode lookup.
populationdf = pop_sdf.withColumnRenamed(existing="SA2 code", new="sa2_code")
populationdf.limit(5)

2450


State/Terr,sa2_code,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old
New South Wales,115011556,Castle Hill - South,10436,1256,1454,1704,3718,2304
New South Wales,115011557,Castle Hill - West,5198,568,849,810,1928,1043
New South Wales,115011558,Cherrybrook,19135,1982,3084,2522,6760,4787
New South Wales,115011621,Kellyville - East,17748,2300,3032,2748,6701,2967
New South Wales,115011622,Kellyville - West,11417,1702,1453,2162,4067,2033


In [4]:
# Attach postcodes to the SA2-level population figures. The inner join keeps
# only SA2 codes that appear in both tables.
merged_sdf1 = populationdf.join(
    other=postcode_sdf,
    on="sa2_code",
    how="inner",
)

In [5]:
# Number of distinct SA2 codes that survived the join with the postcode lookup.
merged_sdf1.select("sa2_code").distinct().count()

                                                                                

2083

Lost 367 of the 2,450 population records: their SA2 codes have no match in the postcode data, so the inner join drops them.

In [6]:
# Preview the joined population rows with the lowest postcodes.
merged_sdf1.sort("postcode").limit(5)

sa2_code,State/Terr,SA2 name,Total,Under 10,Adolescent,Young adult,Middle age,Old,postcode
801051049,Australian Capita...,Acton,2875,6,1528,1292,47,2,200
701011002,Northern Territory,Darwin City,7679,474,325,3322,2652,906,800
701011002,Northern Territory,Darwin City,7679,474,325,3322,2652,906,801
701011007,Northern Territory,Parap,2980,350,282,801,1047,500,804
701021010,Northern Territory,Alawa,2172,326,227,529,704,386,810


In [7]:
# Source population column -> name of its per-postcode average.
population_aliases = {
    "Total": "total_pop",
    "Under 10": "under10_pop",
    "Adolescent": "adolsc_pop",
    "Young adult": "yng_adult_pop",
    "Middle age": "mid_age_pop",
    "Old": "old_pop",
}

# One row per postcode: the mean of each population column over all joined
# rows for that postcode, rounded to two decimal places.
postcode_pop_sdf = merged_sdf1.groupBy("postcode").agg(
    *[
        F.round(F.mean(merged_sdf1[source_col]), 2).alias(alias)
        for source_col, alias in population_aliases.items()
    ]
)

postcode_pop_sdf.orderBy("postcode").show()

+--------+---------+-----------+----------+-------------+-----------+-------+
|postcode|total_pop|under10_pop|adolsc_pop|yng_adult_pop|mid_age_pop|old_pop|
+--------+---------+-----------+----------+-------------+-----------+-------+
|    0200|   2875.0|        6.0|    1528.0|       1292.0|       47.0|    2.0|
|    0800|   7679.0|      474.0|     325.0|       3322.0|     2652.0|  906.0|
|    0801|   7679.0|      474.0|     325.0|       3322.0|     2652.0|  906.0|
|    0804|   2980.0|      350.0|     282.0|        801.0|     1047.0|  500.0|
|    0810|   3018.0|      419.0|    346.08|       769.08|    1013.67| 470.17|
|    0811|   3615.0|      372.0|     409.0|       1140.0|     1086.0|  608.0|
|    0812|  3314.67|     483.33|     447.0|       636.83|    1143.67| 603.83|
|    0813|   2516.0|      468.0|     454.0|        729.0|      664.0|  201.0|
|    0814|   4112.0|      490.0|     393.0|       1204.0|     1371.0|  654.0|
|    0815|   3615.0|      372.0|     409.0|       1140.0|     10

## Join postcode data with income data

In [8]:
# Load the curated SA2-level income table, report its row count, then preview it.
income_path = "../data/curated/incomedf.parquet/"
incomedf = spark.read.parquet(income_path)
print(incomedf.count())
incomedf.limit(5)

2239


sa2_code,num_earners,median_age,median_income,mean_income
213011340,11238,46,51181,61177
213021341,7942,43,57585,69301
213021344,11216,41,66161,80219
213021345,3015,41,52078,60249
213021346,9996,47,63892,90668


In [9]:
# Attach postcodes to the SA2-level income figures. The inner join keeps only
# SA2 codes that appear in both tables.
merged_sdf2 = incomedf.join(
    other=postcode_sdf,
    on="sa2_code",
    how="inner",
)

In [10]:
# Number of distinct SA2 codes that survived the join with the postcode lookup.
merged_sdf2.select("sa2_code").distinct().count()

2186

Lost 53 of the 2,239 income records: their SA2 codes have no match in the postcode data, so the inner join drops them.

In [11]:
# Preview the joined income rows with the lowest postcodes.
merged_sdf2.sort("postcode").limit(5)

sa2_code,num_earners,median_age,median_income,mean_income,postcode
801051049,548,23,9306,16835,200
701011002,5909,33,60937,87791,800
701011002,5909,33,60937,87791,801
701011007,1873,40,75219,98872,804
701021010,1387,40,54188,61411,810


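As with the population data, aggregate the income data to postcode level by averaging over all joined rows for each postcode. This is a plain `F.mean`, so an SA2 that appears in several rows for the same postcode carries proportionally more weight.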

In [12]:
# Income columns to average per postcode; each keeps its original name.
income_columns = ["num_earners", "median_age", "median_income", "mean_income"]

# One row per postcode: the mean of each income column over all joined rows
# for that postcode, rounded to two decimal places.
postcode_income_sdf = merged_sdf2.groupBy("postcode").agg(
    *[
        F.round(F.mean(merged_sdf2[income_col]), 2).alias(income_col)
        for income_col in income_columns
    ]
)

postcode_income_sdf.orderBy("postcode").show()

+--------+-----------+----------+-------------+-----------+
|postcode|num_earners|median_age|median_income|mean_income|
+--------+-----------+----------+-------------+-----------+
|    0200|      548.0|      23.0|       9306.0|    16835.0|
|    0800|     5909.0|      33.0|      60937.0|    87791.0|
|    0801|     5909.0|      33.0|      60937.0|    87791.0|
|    0804|     1873.0|      40.0|      75219.0|    98872.0|
|    0810|    1823.33|     39.33|      59558.0|   69537.42|
|    0811|     2395.0|      39.0|      52335.0|    67299.0|
|    0812|     2330.2|      41.6|      62069.0|    69623.6|
|    0813|      296.0|      35.0|      35787.0|    44160.0|
|    0814|     2618.0|      39.0|      65946.0|    81123.0|
|    0815|     2395.0|      39.0|      52335.0|    67299.0|
|    0820|    2223.55|     41.09|     58518.91|   75577.27|
|    0821|     3585.0|      39.0|      51942.0|    69250.0|
|    0822|    1787.38|     39.06|     49168.06|   59432.69|
|    0828|      686.0|      43.0|      5

In [13]:
# Compare how many postcodes each aggregated table covers.
pop_postcode_count = postcode_pop_sdf.count()
income_postcode_count = postcode_income_sdf.count()
print(pop_postcode_count, income_postcode_count)

2793 3160


In [14]:
# Combine the per-postcode population and income aggregates. The full outer
# join keeps postcodes present in only one of the two tables, leaving the
# other table's columns null for them.
external_data_sdf = postcode_pop_sdf.join(
    other=postcode_income_sdf,
    on="postcode",
    how="full",
)
print(external_data_sdf.count())
external_data_sdf.limit(5)

3162


                                                                                

postcode,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
200,2875.0,6.0,1528.0,1292.0,47.0,2.0,548.0,23.0,9306.0,16835.0
800,7679.0,474.0,325.0,3322.0,2652.0,906.0,5909.0,33.0,60937.0,87791.0
801,7679.0,474.0,325.0,3322.0,2652.0,906.0,5909.0,33.0,60937.0,87791.0
804,2980.0,350.0,282.0,801.0,1047.0,500.0,1873.0,40.0,75219.0,98872.0
810,3018.0,419.0,346.08,769.08,1013.67,470.17,1823.33,39.33,59558.0,69537.42


## Join External data with existing data

In [15]:
# Load the curated transaction table, report its row count, then preview it.
transactions_path = "../data/curated/mergedf.parquet/"
sdf = spark.read.parquet(transactions_path)
print(sdf.count())
sdf.limit(5)

13613661


                                                                                

merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,postcode,gender,company_name,tags,take_rate,revenue_band
69666829657,226,5162,79.65159982605903,8765ef9f-dba6-407...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Sem Ut Institute,bicycle shops - s...,2.86,c
49891706470,226,5162,3.887089224741017,9ba8ebb2-6593-49f...,2022-07-13,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Non Vestibulum In...,tent and awning s...,5.8,a
70610974780,226,5162,52.820359204536665,bdf345c8-4c5d-48e...,2021-08-20,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Elit Aliquam PC,tent and awning s...,6.93,a
85276983280,226,5162,250.33729038347653,dcad871d-1b75-4a8...,2022-05-06,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Lacus Varius Corp.,florists supplies...,3.32,b
15582655078,226,5162,75.31904078962366,47ddf8e0-5f72-408...,2021-08-21,Courtney Torres,6945 Higgins Brooks,SA,5581,Female,Eu Odio Phasellus...,"gift, card, novel...",6.77,a


In [16]:
# Enrich every transaction with the external data for its postcode.
# Transactions whose postcode is absent from the external data are dropped
# by the inner join.
finaldf = sdf.join(
    other=external_data_sdf,
    on="postcode",
    how="inner",
)
print(finaldf.count())
finaldf.limit(5)

                                                                                

13394287


                                                                                

postcode,merchant_abn,consumer_id,user_id,dollar_value,order_id,order_datetime,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income
6731,49891706470,58911,22131,25.782245737474312,90498b80-984f-43c...,2022-07-13,Paul Lopez,3908 David Squares,WA,Male,Non Vestibulum In...,tent and awning s...,5.8,a,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0
6731,96680767841,58911,22131,415.9547984625,829843bc-c571-493...,2021-08-19,Paul Lopez,3908 David Squares,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0
6731,43186523025,58911,22131,33.082322196774484,c12d2520-12c8-40f...,2022-07-13,Paul Lopez,3908 David Squares,WA,Male,Lorem Ipsum Sodal...,florists supplies...,4.47,b,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0
6731,66370248931,58911,22131,82.54046571771035,cc8ac5c5-0dfc-4e1...,2021-08-19,Paul Lopez,3908 David Squares,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0
6731,89726005175,58911,22131,58.46693553052902,b1f179f8-2c15-4f5...,2022-05-06,Paul Lopez,3908 David Squares,WA,Male,Est Nunc Consulting,tent and awning s...,6.01,a,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0


Null values occur because some postcodes have population data but no income data, or vice versa; the two tables were combined with a full outer join.

In [17]:
# Discard every row that has a null in any column, then report how many remain.
finaldf = finaldf.na.drop(how="any")
finaldf.count()

22/09/21 06:11:33 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

11818811

Removing records containing null values resulted in the loss of 1,575,476 records.

## Join Fraud Data

In [18]:
# Load the consumer fraud table and give its probability column a
# consumer-specific name so it cannot clash with the merchant one after joining.
consumerfrauddf = (
    spark.read.parquet("../data/curated/consumerfrauddf.parquet/")
    .withColumnRenamed("fraud_probability", "consumer_fraud_%")
)
print(consumerfrauddf.count())
consumerfrauddf.limit(5)

34765


user_id,order_datetime,consumer_fraud_%
3753,2022-02-16,48.85325253622543
9646,2021-09-23,47.83931206340956
243,2021-09-02,50.88971939168309
3907,2021-10-07,38.58123424858352
14864,2021-11-29,27.072321329372105


In [19]:
# Load the merchant fraud table and give its probability column a
# merchant-specific name so it cannot clash with the consumer one after joining.
merchantfrauddf = (
    spark.read.parquet("../data/curated/merchantfrauddf.parquet/")
    .withColumnRenamed("fraud_probability", "merchant_fraud_%")
)
print(merchantfrauddf.count())
merchantfrauddf.limit(5)

114


merchant_abn,order_datetime,merchant_fraud_%
19492220327,2021-11-28,44.40365864749536
31334588839,2021-10-02,42.75530083865367
19492220327,2021-12-22,38.867790051131095
82999039227,2021-12-19,94.1347004808891
90918180829,2021-09-02,43.32551731714902


In [20]:
# Attach consumer- and merchant-level fraud probabilities to the transactions.
#
# Both joins are left outer joins, so transactions without a fraud record are
# kept with a null probability. The row count is printed before and after each
# join; it should stay constant unless a fraud table has duplicate join keys.

# Make the cell safe to re-run: this cell reassigns `finaldf`, so running it a
# second time would otherwise join the fraud tables again and leave duplicate
# fraud columns. drop() ignores columns that are not present, so this does
# nothing on a first run.
finaldf = finaldf.drop("consumer_fraud_%", "merchant_fraud_%")
print(finaldf.count())

# Consumer fraud is keyed by (order_datetime, user_id).
finaldf = finaldf.join(consumerfrauddf, ["order_datetime", "user_id"], "leftouter")
print(finaldf.count())

# Merchant fraud is keyed by (order_datetime, merchant_abn).
finaldf = finaldf.join(merchantfrauddf, ["order_datetime", "merchant_abn"], "leftouter")
print(finaldf.count())
finaldf.limit(5)

                                                                                

11818811


                                                                                

11818811


                                                                                

11818811


                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,customer_name,address,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2022-07-13,49891706470,22131,6731,58911,25.782245737474312,90498b80-984f-43c...,Paul Lopez,3908 David Squares,WA,Male,Non Vestibulum In...,tent and awning s...,5.8,a,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0,,
2021-08-19,96680767841,22131,6731,58911,415.9547984625,829843bc-c571-493...,Paul Lopez,3908 David Squares,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0,,
2022-07-13,43186523025,22131,6731,58911,33.082322196774484,c12d2520-12c8-40f...,Paul Lopez,3908 David Squares,WA,Male,Lorem Ipsum Sodal...,florists supplies...,4.47,b,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0,,
2021-08-19,66370248931,22131,6731,58911,82.54046571771035,cc8ac5c5-0dfc-4e1...,Paul Lopez,3908 David Squares,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0,,
2022-05-06,89726005175,22131,6731,58911,58.46693553052902,b1f179f8-2c15-4f5...,Paul Lopez,3908 David Squares,WA,Male,Est Nunc Consulting,tent and awning s...,6.01,a,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0,,


In [21]:
# Remove the customer's name and street address from the final table.
identifying_columns = ["customer_name", "address"]
finaldf = finaldf.drop(*identifying_columns)

# Missing values are left as null; the fill step (`finaldf.na.fill(0.1)`) is
# currently disabled.
finaldf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2022-07-13,49891706470,22131,6731,58911,25.782245737474312,90498b80-984f-43c...,WA,Male,Non Vestibulum In...,tent and awning s...,5.8,a,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0,,
2021-08-19,96680767841,22131,6731,58911,415.9547984625,829843bc-c571-493...,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0,,
2022-07-13,43186523025,22131,6731,58911,33.082322196774484,c12d2520-12c8-40f...,WA,Male,Lorem Ipsum Sodal...,florists supplies...,4.47,b,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0,,
2021-08-19,66370248931,22131,6731,58911,82.54046571771035,cc8ac5c5-0dfc-4e1...,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0,,
2022-05-06,89726005175,22131,6731,58911,58.46693553052902,b1f179f8-2c15-4f5...,WA,Male,Est Nunc Consulting,tent and awning s...,6.01,a,8374.0,1373.0,1185.0,2200.0,2600.0,1016.0,2907.0,39.0,48034.0,56306.0,,


**TODO:** Add code to save `finaldf` to the curated folder as a Parquet file.