In [1]:
import pandas as pd
import geopandas as gpd
import folium
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import numpy as np



## 1. Geovisualisation for the weekly income by postcode

In [2]:
# Load the 2021 income table; each row is keyed by postal area (POA_CODE_2021).
income_df = pd.read_csv("../data/income/2021_income.csv")
income_df

Unnamed: 0,POA_CODE_2021,Median_age_persons,Median_mortgage_repay_monthly,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Average_num_psns_per_bedroom,Median_tot_hhd_inc_weekly,Average_household_size
0,2000,32,2800,941,625,2367,1.3,2225,2.1
1,2007,30,2500,772,500,2197,1.2,1805,2.1
2,2008,28,2600,860,525,2453,1.2,1746,1.9
3,2009,37,2800,1297,580,3035,1.1,2422,2.1
4,2010,36,2900,1479,550,3709,1.1,2297,1.7
...,...,...,...,...,...,...,...,...,...
2638,2899,50,1300,736,240,1630,0.8,1184,2.1
2639,6798,38,1300,1233,150,2659,1.0,2109,2.5
2640,6799,40,1972,741,231,1678,1.1,2519,3.8
2641,9494,0,0,0,0,0,0.0,0,0.0


In [3]:
# Postal-area (POA) boundaries from the 2016 shapefile; "sf" is short for shape file.
sf = gpd.read_file("../data/shapefile/POA_2016_AUST.shp")

# Hold the postcode as an integer (e.g. "0800" -> 800) so it lines up with the
# integer postcodes used by the tables it is joined to.
sf = sf.astype({'POA_CODE16': int})
sf

Unnamed: 0,POA_CODE16,POA_NAME16,AREASQKM16,geometry
0,800,0800,3.1734,"POLYGON ((130.83454 -12.45798, 130.83395 -12.4..."
1,810,0810,23.7902,"POLYGON ((130.84711 -12.37754, 130.84726 -12.3..."
2,812,0812,35.8899,"POLYGON ((130.89192 -12.36879, 130.89221 -12.3..."
3,815,0815,0.6381,"POLYGON ((130.87240 -12.37655, 130.87235 -12.3..."
4,820,0820,39.0462,"POLYGON ((130.83499 -12.43006, 130.83508 -12.4..."
...,...,...,...,...
2665,7468,7468,309.9843,"MULTIPOLYGON (((145.19451 -42.19607, 145.19449..."
2666,7469,7469,5893.7056,"MULTIPOLYGON (((144.76361 -41.43594, 144.76359..."
2667,7470,7470,108.0510,"POLYGON ((145.52191 -41.77599, 145.52226 -41.7..."
2668,9797,Migratory - Offshore - Shipping (Aust.),0.0000,


In [4]:
# Attach each postal area's geometry to its income row. This is an inner join,
# so income rows whose postcode is absent from the shapefile are dropped.
income_df = pd.merge(
    income_df,
    sf[['POA_CODE16', 'geometry']],
    left_on='POA_CODE_2021',
    right_on='POA_CODE16',
).drop(columns='POA_CODE16')

income_df

Unnamed: 0,POA_CODE_2021,Median_age_persons,Median_mortgage_repay_monthly,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Average_num_psns_per_bedroom,Median_tot_hhd_inc_weekly,Average_household_size,geometry
0,2000,32,2800,941,625,2367,1.3,2225,2.1,"POLYGON ((151.20278 -33.85631, 151.20287 -33.8..."
1,2007,30,2500,772,500,2197,1.2,1805,2.1,"POLYGON ((151.19468 -33.88091, 151.19461 -33.8..."
2,2008,28,2600,860,525,2453,1.2,1746,1.9,"POLYGON ((151.19399 -33.88657, 151.19440 -33.8..."
3,2009,37,2800,1297,580,3035,1.1,2422,2.1,"POLYGON ((151.18869 -33.86636, 151.18907 -33.8..."
4,2010,36,2900,1479,550,3709,1.1,2297,1.7,"POLYGON ((151.21074 -33.87861, 151.21102 -33.8..."
...,...,...,...,...,...,...,...,...,...,...
2632,2899,50,1300,736,240,1630,0.8,1184,2.1,"MULTIPOLYGON (((167.99472 -29.04534, 167.99432..."
2633,6798,38,1300,1233,150,2659,1.0,2109,2.5,"POLYGON ((105.67392 -10.41567, 105.67398 -10.4..."
2634,6799,40,1972,741,231,1678,1.1,2519,3.8,"MULTIPOLYGON (((96.83047 -12.17636, 96.83045 -..."
2635,9494,0,0,0,0,0,0.0,0,0.0,


In [5]:
# Serialise one feature per postal area to GeoJSON for folium; each feature
# keeps POA_CODE16 in its properties, which the choropleths key on.
geoJSON = (
    sf[['POA_CODE16', 'geometry']]
    .drop_duplicates(subset='POA_CODE16')
    .to_json()
)

In [None]:
# Map of median weekly personal income by postcode

# folium only honours zoom_start when a location is supplied (otherwise it
# falls back to a whole-world view), so centre the map on Australia explicitly.
# NOTE(review): recent folium releases may no longer bundle the Stamen tile
# sets -- confirm "Stamen Terrain" still resolves with the installed version.
m = folium.Map(location=[-25.27, 133.78], tiles="Stamen Terrain", zoom_start=4)

# refer to the folium documentation for more information on how to plot aggregated data.
c = folium.Choropleth(
    geo_data=geoJSON,  # GeoJSON string with one feature per postal area
    name='choropleth',  # name of the layer
    data=income_df,  # data source
    columns=['POA_CODE_2021', 'Median_tot_prsnl_inc_weekly'],  # [key column, value column]
    key_on='properties.POA_CODE16',  # GeoJSON property matched against the key column
    fill_color='YlOrRd',  # colour scheme
    nan_fill_color='black',  # postal areas with no income value
    legend_name='Median Total Personal Income Weekly AUD$'
)

c.add_to(m)

# Persist the interactive map, then display it inline.
m.save('../plots/income_weekly.html')
m

## 2. Geovisualisation for the Distribution of Consumers

In [None]:
from pyspark.sql import SparkSession

# Create a spark session (getOrCreate reuses an already-running session if there is one)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    # presumably lets a bare DataFrame expression render as a table in the notebook -- confirm for the installed Spark version
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true") 
    # memory settings for the executor and driver processes
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [None]:
# Read the curated transaction records (parquet) and preview the first 10 rows.
transaction_df = spark.read.parquet('../data/curated/transactions_detail.parquet')
transaction_df.limit(10)

In [None]:
# Total number of transaction rows.
transaction_df.count()

In [None]:
# Number of transactions for every (merchant_name, postcode) pair.
transaction_count_df = (
    transaction_df
    .groupBy('merchant_name', 'postcode')
    .count()
    .withColumnRenamed("count", "transaction_count")
)

In [None]:
# Preview the first 10 (merchant_name, postcode) counts.
transaction_count_df.limit(10)

In [None]:
# Number of distinct (merchant_name, postcode) pairs.
transaction_count_df.count()

In [None]:
# (merchant_name, postcode) pairs with fewer than 20 transactions.
transaction_count_df.filter(transaction_count_df.transaction_count<20)

In [None]:
# Sanity check: re-grouping by the same keys and counting the groups.
# transaction_count_df is already one row per (merchant_name, postcode), so
# this should equal transaction_count_df.count().
transaction_count_df.groupBy('merchant_name', 'postcode').sum('transaction_count').count()

### Visualising the Distribution of a Merchant

Taking the merchant "Erat Vitae LLP" as an example for visualisation:

In [None]:
# Pull the example merchant's per-postcode counts into pandas for plotting.
merchant_dist = (
    transaction_count_df
    .where(F.col('merchant_name') == 'Erat Vitae LLP')
    .toPandas()
)
merchant_dist

In [None]:
# Join the merchant's per-postcode counts to the postal-area geometries.

# The shapefile's POA_CODE16 is an integer, so cast the postcode to match.
merchant_dist = merchant_dist.astype({'postcode': int})

merchant_dist_map = pd.merge(
    merchant_dist,
    sf[['POA_CODE16', 'geometry']],
    left_on='postcode',
    right_on='POA_CODE16',
).drop(columns='POA_CODE16')

merchant_dist_map

In [None]:
# Map of the number of transactions per postcode for the selected merchant

# folium only honours zoom_start when a location is supplied (otherwise it
# falls back to a whole-world view), so centre the map on Australia explicitly.
# NOTE(review): recent folium releases may no longer bundle the Stamen tile
# sets -- confirm "Stamen Terrain" still resolves with the installed version.
m = folium.Map(location=[-25.27, 133.78], tiles="Stamen Terrain", zoom_start=4)

# refer to the folium documentation for more information on how to plot aggregated data.
c = folium.Choropleth(
    geo_data=geoJSON,  # GeoJSON string with one feature per postal area
    name='choropleth',  # name of the layer
    data=merchant_dist_map,  # data source
    columns=['postcode', 'transaction_count'],  # [key column, value column]
    key_on='properties.POA_CODE16',  # GeoJSON property matched against the key column
    fill_color='YlOrRd',  # colour scheme
    nan_fill_color='black',  # postal areas where the merchant has no transactions
    legend_name='Number of Transactions'
)

c.add_to(m)

m

## Below is the conversion of SA2 to Postcode if needed (example)

In [None]:

# Load the 2018 SA2-level income table and normalise its column types.
sa_income_df = pd.read_csv("../data/income/SA2_income_2018.csv")
# Take the SA2 code from sa_income_df itself: income_df is the postcode-level
# table and has no 'SA2' column, so reading it from there raises a KeyError.
# Stored as a string so it can be merged with the string SA2 codes of the
# postcode lookup table.
sa_income_df['SA2'] = sa_income_df['SA2'].astype(str)
# Remove commas from every string cell (presumably thousands separators) so
# that 'Median' can be cast to int.
sa_income_df = sa_income_df.replace(',','', regex=True)
sa_income_df['Median'] = sa_income_df['Median'].astype(int)
sa_income_df

In [None]:
# Count of missing values in each column.
sa_income_df.isnull().sum()

In [None]:
# Rows whose Median exceeds 10000.
# NOTE(review): presumably a check for implausibly large values -- confirm intent.
sa_income_df[sa_income_df['Median']>10000]

In [None]:
# Build a postcode <-> SA2 lookup table from the Australian postcodes file.
postcode_df = pd.read_csv("../data/tables/australian_postcodes.csv")

# Missing SA2 codes become 0 so the column can be held as integers.
postcode_df['SA2_MAINCODE_2016'] = postcode_df['SA2_MAINCODE_2016'].fillna(0).astype(int)

# Keep only the two key columns, as strings, with one row per distinct pair.
convert_df = (
    postcode_df[['postcode', 'SA2_MAINCODE_2016']]
    .rename(columns={'SA2_MAINCODE_2016': 'SA2'})
    .astype(str)
    .drop_duplicates()
)
convert_df

In [None]:
# Attach the postcode(s) mapped to each SA2 (inner join on the string SA2 code).
sa_income_df = sa_income_df.merge(convert_df, how='inner', on='SA2')
sa_income_df

In [None]:
# Load the 2016 postal-area (POA) boundaries.
# NOTE(review): this repeats the shapefile load done near the top of the
# notebook; it is only needed if this section is run on its own.
sf = gpd.read_file("../data/shapefile/POA_2016_AUST.shp")

# Cast the POA code to int so it can be joined against integer postcodes.
sf['POA_CODE16'] = sf['POA_CODE16'].astype(int)
sf

In [None]:
# Join the SA2 income rows to the postal-area geometries via their postcode.

# The shapefile's POA_CODE16 is an integer, so cast the postcode to match.
sa_income_df = sa_income_df.astype({'postcode': int})

sa_income_df = pd.merge(
    sa_income_df,
    sf[['POA_CODE16', 'geometry']],
    left_on='postcode',
    right_on='POA_CODE16',
).drop(columns='POA_CODE16')

sa_income_df

In [None]:
# Serialise one feature per postal area to GeoJSON for folium; each feature
# keeps POA_CODE16 in its properties, which the choropleth keys on.
geoJSON = (
    sf[['POA_CODE16', 'geometry']]
    .drop_duplicates(subset='POA_CODE16')
    .to_json()
)


In [None]:
# Map of the SA2 'Median' income value, plotted by postcode

# folium only honours zoom_start when a location is supplied (otherwise it
# falls back to a whole-world view), so centre the map on Australia explicitly.
# NOTE(review): recent folium releases may no longer bundle the Stamen tile
# sets -- confirm "Stamen Terrain" still resolves with the installed version.
m = folium.Map(location=[-25.27, 133.78], tiles="Stamen Terrain", zoom_start=4)

# NOTE(review): sa_income_df may hold several rows per postcode (one per SA2
# mapped to it); presumably only one value per postcode ends up on the map --
# consider aggregating to one row per postcode before plotting.

# refer to the folium documentation for more information on how to plot aggregated data.
c = folium.Choropleth(
    geo_data=geoJSON,  # GeoJSON string with one feature per postal area
    name='choropleth',  # name of the layer
    data=sa_income_df,  # data source
    columns=['postcode', 'Median'],  # [key column, value column]
    key_on='properties.POA_CODE16',  # GeoJSON property matched against the key column
    fill_color='YlOrRd',  # colour scheme
    nan_fill_color='black',  # postal areas with no value
    legend_name='Salary'
)

c.add_to(m)

m