In [1]:
import pandas as pd
import numpy as np
import os
import re

from pyspark.sql import SparkSession
from pyspark.shell import spark
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql import functions as F
import matplotlib.pyplot as plt

spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.1.2
      /_/

Using Python version 3.7.4 (default, Aug 13 2019 15:17:50)
Spark context Web UI available at http://172.16.29.12:4042
Spark context available as 'sc' (master = local[*], app id = local-1662578512765).
SparkSession available as 'spark'.


## Loading all datasets
- We will be performing basic analysis on each of the dataset
- Choose certain features that seem to be appropriate for geospatial visualisation

### Customer transaction details

In [2]:
customer_join_transaction = spark.read.parquet("../data/curated/customer_join_transaction.parquet/")

AnalysisException: Path does not exist: file:/Users/oliver/Documents/GitHub/generic-buy-now-pay-later-project-group-19/data/curated/customer_join_transaction.parquet

In [None]:
customer_join_transaction.count()

In [5]:
customer_join_transaction.show()

+-------+------------+------------------+--------------------+--------------+--------+-----+------+
|user_id|merchant_abn|      dollar_value|            order_id|order_datetime|postcode|state|gender|
+-------+------------+------------------+--------------------+--------------+--------+-----+------+
|  14409| 64403598239|116.41150553221357|5474e46e-7073-442...|    2021-08-21|    3874|  VIC|  Male|
|  14409| 45629217853|28.600613410021932|f539493d-175d-48b...|    2021-04-09|    3874|  VIC|  Male|
|  14409| 77505747217|440.68714072131945|99f8e136-0ca2-491...|    2021-08-22|    3874|  VIC|  Male|
|  14409| 68216911708|22.960774428094698|f4ea2269-3509-411...|    2021-04-10|    3874|  VIC|  Male|
|  14409| 24852446429|30.776665837082245|4b1413e3-3ee9-430...|    2021-08-16|    3874|  VIC|  Male|
|  14409| 98072939449| 1118.263275036643|e597537b-d4a9-446...|    2021-04-10|    3874|  VIC|  Male|
|  14409| 22718657980|307.02232595923795|82c8d263-9d95-424...|    2021-08-27|    3874|  VIC|  Male|


### Customer behaviour
- Only notable feature is no_orders of each customer at a particular date

In [6]:
customer_behaviour = spark.read.parquet("../data/curated/customer_purchase_behaviour.parquet/")

In [7]:
customer_behaviour

user_id,order_datetime,dollar_spent,no_orders
18585,2021-08-20,28.66078522665458,2
271,2021-08-20,1851.9709565996084,2
559,2021-08-20,519.619820268801,2
19156,2021-08-20,302.72316980059384,2
756,2021-08-20,169.25922159143067,2
785,2021-08-20,49.4051194612238,1
970,2021-08-20,12.28895749317062,1
997,2021-08-20,57.1290091592229,1
1069,2021-08-20,65.389116401826,1
1256,2021-08-20,337.0946186819374,2


In [3]:
merchant_sales = spark.read.parquet("../data/curated/merchant_sales.parquet/")

### Merchant Sales
- Sales revenue per day
- Number of orders

In [4]:
merchant_sales = spark.read.parquet("../data/curated/merchant_sales.parquet/")
merchant_sales = merchant_sales.select(merchant_sales['*'], month(col("order_datetime")).alias('month'))
merchant_monthly_sales = merchant_sales.groupby(['merchant_abn','month']).agg({'sales_revenue':'sum',
                                                                               'no_orders':'sum'})
merchant_monthly_deviation = merchant_monthly_sales.groupby('merchant_abn').agg(F.mean('sum(sales_revenue)'),
                                                                                F.stddev('sum(sales_revenue)'),
                                                                                F.mean('sum(no_orders)'),
                                                                                F.stddev('sum(no_orders)'))
coefficient_variation = (merchant_monthly_deviation.withColumn('std/mean',
                                                               F.col('stddev_samp(sum(sales_revenue))') / F.col('avg(sum(sales_revenue))'))
                                                   .withColumn('avgsale/avgorder',
                                                               F.col('avg(sum(sales_revenue))') / F.col('avg(sum(no_orders))'))
                                                   .toPandas())


In [5]:
merchant_sales.count()

398352

In [6]:
merchant_monthly_sales = merchant_sales.groupby(['merchant_abn','month']).agg({'sales_revenue':'sum',
                                                                               'no_orders':'sum'})


In [7]:
merchant_monthly_sales

merchant_abn,month,sum(sales_revenue),sum(no_orders)
69913521743,11,22849.38079537934,133
70268417986,12,3943.524915618402,14
53560474586,11,13386.404472435612,21
67713855705,11,35808.844133378734,67
72724420494,12,10858.95710077551,214
74353814311,10,1148.8788791009647,14
56490929085,10,1796.493872362924,13
87802246756,10,11562.759446146316,151
71946255432,9,34032.68342643956,198
64112855512,12,3439.8563858580706,18


In [8]:
merchant_monthly_deviation = merchant_monthly_sales.groupby('merchant_abn').agg(F.mean('sum(sales_revenue)'),
                                                                                F.stddev('sum(sales_revenue)'),
                                                                                F.mean('sum(no_orders)'),
                                                                                F.stddev('sum(no_orders)'))

In [9]:
coefficient_variation = (merchant_monthly_deviation.withColumn('std/mean',
                                                               F.col('stddev_samp(sum(sales_revenue))') / F.col('avg(sum(sales_revenue))'))
                                                   .withColumn('avgsale/avgorder',
                                                               F.col('avg(sum(sales_revenue))') / F.col('avg(sum(no_orders))'))
                                                   .toPandas())

In [14]:
top_10_most_unstable_merchant = coefficient_variation.sort_values(['avg(sum(sales_revenue))'], ascending=False)[:10]

In [15]:
top_10_most_unstable_merchant

Unnamed: 0,merchant_abn,avg(sum(sales_revenue)),stddev_samp(sum(sales_revenue)),avg(sum(no_orders)),stddev_samp(sum(no_orders)),std/mean,avgsale/avgorder
204,27093785141,446770.003131,220510.25431,1178.142857,576.239947,0.493565,379.215475
807,96680767841,444271.749573,211931.739037,1420.0,675.054566,0.477032,312.867429
3003,50315283629,441749.657164,210839.619311,1370.285714,656.560917,0.477283,322.377773
1967,32709545238,439766.522559,207116.199488,596.857143,286.004329,0.470969,736.803652
1042,35909341340,438266.701529,210533.642029,1743.714286,832.959926,0.480378,251.340891
1895,86578477987,432645.222931,202777.857303,12317.571429,5786.935022,0.468693,35.124231
1289,28057731482,431425.112851,193553.51071,578.857143,256.745937,0.448638,745.304983
3446,21439773999,429822.010033,202483.270143,5472.428571,2580.848495,0.471086,78.543192
3199,18158387243,429719.71932,206832.112158,760.285714,367.556669,0.481319,565.208199
1255,48534649627,429687.956089,203914.492305,2970.428571,1413.041266,0.474564,144.655206


In [13]:
top_10_most_rev_per_order = coefficient_variation.sort_values('avgsale/avgorder', ascending=False)[:10]
top_10_most_rev_per_order

Unnamed: 0,merchant_abn,avg(sum(sales_revenue)),stddev_samp(sum(sales_revenue)),avg(sum(no_orders)),stddev_samp(sum(no_orders)),std/mean,avgsale/avgorder
67,38049816588,2075.8759,,1.0,,,2075.8759
3894,66482985683,2072.334889,,1.0,,,2072.334889
2371,71762159356,2069.791694,,1.0,,,2069.791694
1682,65959377833,2055.390127,,1.0,,,2055.390127
1356,44345785419,2054.167327,,1.0,,,2054.167327
527,75265429612,2048.379909,,1.0,,,2048.379909
1480,31552582037,2046.743273,,1.0,,,2046.743273
609,53877856360,2045.32448,,1.0,,,2045.32448
824,57241240228,2034.47353,,1.0,,,2034.47353
1326,79800042168,3036.921031,1470.389244,1.5,0.707107,0.484171,2024.61402


### Sales by region
- Information regarding sales made by customers of specific locations in Australia

In [16]:
sales_by_region = pd.read_parquet("../data/curated/sales_by_region.parquet/")

In [17]:
sales_by_region.head()

Unnamed: 0,state,postcode,order_datetime,dollar_spent,no_orders
0,NT,841,2021-10-01,1190.514532,4
1,VIC,3123,2021-11-11,2088.477116,13
2,NSW,1360,2021-11-30,1572.944357,10
3,SA,5251,2021-11-18,1176.370464,11
4,VIC,3375,2022-02-04,3149.566721,12


In [18]:
sales_by_region.nunique()

state                  8
postcode            3165
order_datetime       184
dollar_spent      576539
no_orders             53
dtype: int64

In [19]:
sales_by_region

Unnamed: 0,state,postcode,order_datetime,dollar_spent,no_orders
0,NT,841,2021-10-01,1190.514532,4
1,VIC,3123,2021-11-11,2088.477116,13
2,NSW,1360,2021-11-30,1572.944357,10
3,SA,5251,2021-11-18,1176.370464,11
4,VIC,3375,2022-02-04,3149.566721,12
...,...,...,...,...,...
576584,NSW,2380,2021-08-30,38.844258,1
576585,NSW,1187,2021-08-28,73.372201,3
576586,NSW,2343,2022-01-19,76.887112,2
576587,VIC,3160,2021-09-20,31.002981,1


In [20]:
revenue_by_region = sales_by_region\
                    .groupby(['state', 'postcode'], as_index=False)\
                    .agg(
                        {
                            'dollar_spent': 'sum'
                        }
                    ).rename(columns={'dollar_spent': 'total_revenue_generated'})


In [21]:
revenue_by_region['postcode'] = revenue_by_region['postcode'].astype(int)

In [22]:
revenue_by_region

Unnamed: 0,state,postcode,total_revenue_generated
0,ACT,200,101102.253779
1,ACT,2600,128671.340198
2,ACT,2601,143713.207764
3,ACT,2602,165889.186214
4,ACT,2603,163251.853817
...,...,...,...
3160,WA,6989,249449.669191
3161,WA,6990,169500.042058
3162,WA,6991,200993.506739
3163,WA,6992,187094.500729


### Checking for unique values of the columns
Mainly just looking out for inconsistencies in state and gender

In [15]:
from pyspark.sql import functions as F

In [16]:
customer_join_transaction.select("gender").distinct().show()

+-----------+
|     gender|
+-----------+
|Undisclosed|
|     Female|
|       Male|
+-----------+



In [17]:
customer_join_transaction.select("state").distinct().show()

+-----+
|state|
+-----+
|   NT|
|  ACT|
|   SA|
|  TAS|
|   WA|
|  QLD|
|  VIC|
|  NSW|
+-----+



## External dataset
- External dataset allows linking respective postcodes to their SA2 level index
- External dataset contains geometry for SA2 level which allows for geospatial analysis


In [23]:
import pandas as pd
import geopandas as gpd
import io
import requests

url = "https://www.matthewproctor.com/Content/postcodes/australian_postcodes.csv"
s = requests.get(url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')))
df.to_parquet("../data/curated/postcode_database.parquet")

In [24]:
df = spark.read.parquet("../data/curated/postcode_database.parquet")

Dataset contains alot of useless features that are not required for our analysis, hence we will be retaining only some of the important features.
- For instance, in this dataset, we have long, lat, long_precise, and lat_precise, we will retain the precise longitude and latitde as they are generated from Google Maps API 

In [25]:
# postcode_sdf = spark.createDataFrame(df[['postcode', 'SA2_MAINCODE_2016', 'Long_precise', 'Lat_precise']])
postcode_df = df.select(['postcode', 'SA2_MAINCODE_2016', 'Long_precise', 'Lat_precise']).toPandas()

In [26]:
postcode_df = (postcode_df.groupby(['postcode','SA2_MAINCODE_2016'], as_index=False)['Lat_precise','Long_precise'].mean())

  """Entry point for launching an IPython kernel.


In [27]:
revenue_by_region

Unnamed: 0,state,postcode,total_revenue_generated
0,ACT,200,101102.253779
1,ACT,2600,128671.340198
2,ACT,2601,143713.207764
3,ACT,2602,165889.186214
4,ACT,2603,163251.853817
...,...,...,...
3160,WA,6989,249449.669191
3161,WA,6990,169500.042058
3162,WA,6991,200993.506739
3163,WA,6992,187094.500729


In [28]:
revenue_by_region_1 = postcode_df.merge(revenue_by_region, how='inner')
revenue_by_region_1.count()

postcode                   5488
SA2_MAINCODE_2016          5488
Lat_precise                5488
Long_precise               5488
state                      5488
total_revenue_generated    5488
dtype: int64

In [29]:
revenue_by_region_1

Unnamed: 0,postcode,SA2_MAINCODE_2016,Lat_precise,Long_precise,state,total_revenue_generated
0,200,801051049.0,-35.277700,149.118764,ACT,101102.253779
1,800,701011002.0,-12.393279,130.776661,NT,268024.595375
2,801,701011002.0,-12.463440,130.845642,NT,215211.742051
3,804,701011007.0,-12.432480,130.846254,NT,171869.178297
4,810,701021010.0,-12.380000,130.873000,NT,262455.328626
...,...,...,...,...,...,...
5483,9013,305011105.0,-27.469771,153.025124,QLD,220583.835445
5484,9015,305011105.0,-27.469771,153.025124,QLD,176184.518459
5485,9464,302031038.0,-27.390000,153.066000,QLD,230439.000896
5486,9726,309101268.0,-28.016700,153.400000,QLD,158105.539360


In [31]:
gdf = gpd.GeoDataFrame(
      revenue_by_region_1, geometry=gpd.points_from_xy(revenue_by_region_1.Long_precise, revenue_by_region_1.Lat_precise))

In [57]:
gdf = gdf.drop(['Long_precise', 'Lat_precise'], axis=1)

In [32]:
gdf

Unnamed: 0,postcode,SA2_MAINCODE_2016,Lat_precise,Long_precise,state,total_revenue_generated,geometry
0,200,801051049.0,-35.277700,149.118764,ACT,101102.253779,POINT (149.11876 -35.27770)
1,800,701011002.0,-12.393279,130.776661,NT,268024.595375,POINT (130.77666 -12.39328)
2,801,701011002.0,-12.463440,130.845642,NT,215211.742051,POINT (130.84564 -12.46344)
3,804,701011007.0,-12.432480,130.846254,NT,171869.178297,POINT (130.84625 -12.43248)
4,810,701021010.0,-12.380000,130.873000,NT,262455.328626,POINT (130.87300 -12.38000)
...,...,...,...,...,...,...,...
5483,9013,305011105.0,-27.469771,153.025124,QLD,220583.835445,POINT (153.02512 -27.46977)
5484,9015,305011105.0,-27.469771,153.025124,QLD,176184.518459,POINT (153.02512 -27.46977)
5485,9464,302031038.0,-27.390000,153.066000,QLD,230439.000896,POINT (153.06600 -27.39000)
5486,9726,309101268.0,-28.016700,153.400000,QLD,158105.539360,POINT (153.40000 -28.01670)


In [33]:
rev_by_sa2 = gdf.groupby('SA2_MAINCODE_2016', as_index=False).agg({'total_revenue_generated':'sum'})

In [34]:
rev_by_sa2

Unnamed: 0,SA2_MAINCODE_2016,total_revenue_generated
0,101021007.0,4.246364e+05
1,101021008.0,2.515289e+05
2,101021009.0,2.515289e+05
3,101021010.0,2.515289e+05
4,101021011.0,1.305980e+06
...,...,...
2216,801111141.0,4.002018e+05
2217,901011001.0,1.642408e+05
2218,901021002.0,2.294536e+05
2219,901031003.0,2.543896e+05


In [35]:
# gdf.groupby('SA2_MAINCODE_2016', as_index=False)['Lat_precise','Long_precise'].mean()
rev_by_sa2 = (gdf.groupby('SA2_MAINCODE_2016', as_index=False).agg({'total_revenue_generated':'sum'})
    .merge(gdf.groupby('SA2_MAINCODE_2016', as_index=False)['Long_precise', 'Lat_precise'].mean()))

  This is separate from the ipykernel package so we can avoid doing imports until


In [36]:
gdf = gpd.GeoDataFrame(
      rev_by_sa2, geometry=gpd.points_from_xy(rev_by_sa2.Long_precise, rev_by_sa2.Lat_precise))

In [49]:
rev_by_sa2['SA2_MAINCODE_2016'] = rev_by_sa2['SA2_MAINCODE_2016'].astype(int).astype(str)

In [53]:
rev_by_sa2

Unnamed: 0,SA2_MAINCODE_2016,total_revenue_generated,Long_precise,Lat_precise,geometry
0,101021007,4.246364e+05,149.768193,-35.332661,POINT (149.76819 -35.33266)
1,101021008,2.515289e+05,149.233333,-35.366667,POINT (149.23333 -35.36667)
2,101021009,2.515289e+05,149.226329,-35.350386,POINT (149.22633 -35.35039)
3,101021010,2.515289e+05,149.253464,-35.348220,POINT (149.25346 -35.34822)
4,101021011,1.305980e+06,149.445979,-35.363561,POINT (149.44598 -35.36356)
...,...,...,...,...,...
2216,801111141,4.002018e+05,148.982743,-35.501929,POINT (148.98274 -35.50193)
2217,901011001,1.642408e+05,105.690449,-10.447525,POINT (105.69045 -10.44753)
2218,901021002,2.294536e+05,96.850173,-12.176433,POINT (96.85017 -12.17643)
2219,901031003,2.543896e+05,150.703069,-35.131759,POINT (150.70307 -35.13176)


In [54]:
rev_by_sa2.drop('geometry', inplace=True, axis=1)

In [59]:
geoJSON = gdf[['SA2_MAINCODE_2016','geometry']].to_json()

In [44]:
gdf['SA2_MAINCODE_2016'] = gdf['SA2_MAINCODE_2016'].astype(int).astype(str)

In [45]:
gdf

Unnamed: 0,SA2_MAINCODE_2016,total_revenue_generated,Long_precise,Lat_precise,geometry
0,101021007,4.246364e+05,149.768193,-35.332661,POINT (149.76819 -35.33266)
1,101021008,2.515289e+05,149.233333,-35.366667,POINT (149.23333 -35.36667)
2,101021009,2.515289e+05,149.226329,-35.350386,POINT (149.22633 -35.35039)
3,101021010,2.515289e+05,149.253464,-35.348220,POINT (149.25346 -35.34822)
4,101021011,1.305980e+06,149.445979,-35.363561,POINT (149.44598 -35.36356)
...,...,...,...,...,...
2216,801111141,4.002018e+05,148.982743,-35.501929,POINT (148.98274 -35.50193)
2217,901011001,1.642408e+05,105.690449,-10.447525,POINT (105.69045 -10.44753)
2218,901021002,2.294536e+05,96.850173,-12.176433,POINT (96.85017 -12.17643)
2219,901031003,2.543896e+05,150.703069,-35.131759,POINT (150.70307 -35.13176)


## Statistical Areas Level 2 - 2021 - Shapefile

In [39]:
sf = gpd.read_file("../data/SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp")

In [40]:
sf = sf[['SA2_CODE21','geometry']]

In [55]:
gdf = sf.merge(rev_by_sa2, right_on='SA2_MAINCODE_2016', left_on='SA2_CODE21')

In [56]:
gdf

Unnamed: 0,SA2_CODE21,geometry,SA2_MAINCODE_2016,total_revenue_generated,Long_precise,Lat_precise
0,101021007,"POLYGON ((149.58424 -35.44426, 149.58444 -35.4...",101021007,424636.426495,149.768193,-35.332661
1,101021008,"POLYGON ((149.21899 -35.36738, 149.21800 -35.3...",101021008,251528.885714,149.233333,-35.366667
2,101021009,"POLYGON ((149.21326 -35.34325, 149.21619 -35.3...",101021009,251528.885714,149.226329,-35.350386
3,101021010,"POLYGON ((149.24034 -35.34781, 149.24024 -35.3...",101021010,251528.885714,149.253464,-35.348220
4,101021012,"POLYGON ((149.19572 -35.36126, 149.19970 -35.3...",101021012,468763.039302,149.203913,-35.370800
...,...,...,...,...,...,...
2082,801111141,"POLYGON ((148.80407 -35.37619, 148.80417 -35.3...",801111141,400201.789443,148.982743,-35.501929
2083,901011001,"POLYGON ((105.67393 -10.41566, 105.67399 -10.4...",901011001,164240.839186,105.690449,-10.447525
2084,901021002,"MULTIPOLYGON (((96.91512 -12.14044, 96.91513 -...",901021002,229453.592303,96.850173,-12.176433
2085,901031003,"MULTIPOLYGON (((150.69567 -35.18295, 150.69556...",901031003,254389.609736,150.703069,-35.131759


In [57]:
gdf.describe()

Unnamed: 0,total_revenue_generated,Long_precise,Lat_precise
count,2087.0,2087.0,2087.0
mean,469099.9,143.866746,-31.915911
std,493581.8,11.578704,6.294713
min,21847.46,37.196119,-45.901538
25%,186004.1,143.18952,-35.442
50%,282130.3,147.361875,-33.73986
75%,548717.3,151.222047,-27.682353
max,5911230.0,167.952586,-5.447754


In [None]:
import folium
m = folium.Map(location=[25, 133], 
               tiles="cartodb positron",
               zoom_start=3,
               zoom_control=False,
               width=475,
               height=500,
               html='<div style="font-size: 10pt"</div>')

title = '''
        <h3 align="left" style="font-size:16px"><b>
        Total Dollar Spent by SA2 Code</b></h3>'''.format('Corpus Christi')

m.get_root().html.add_child(folium.Element(title))

# refer to the folium documentations on more information on how to plot aggregated data.
c = folium.Choropleth(
    geo_data=geoJSON, # geoJSON 
    name='choropleth', # name of plot
    data=gdf, # data source
    columns=['SA2_MAINCODE_2016','total_revenue_generated'], # the columns required
    key_on='properties.SA2_MAINCODE_2016', # this is from the geoJSON's properties
    bins=4,
    fill_color='YlGn', # color scheme
    line_opacity=0.3,
    nan_fill_color='white',
    legend_name='Total Spending by SA2 from August 2021 to February 2022'
)

c.add_to(m)

m