In [None]:
import pandas as pd
import geopandas as gpd
import folium
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import numpy as np

## 1. Geovisualisation for the weekly income by postcode

### Note before you start: This notebook may use a lot of memory because the shapefile for the whole of Australia is huge. If both visualisations cannot be run at the same time, try running them one by one.

This creates the geovisualisation for the median weekly personal income

In [None]:
# Load the 2021 income table from CSV and display it.
# NOTE(review): later cells rely on the columns 'POA_CODE_2021' and
# 'Median_tot_prsnl_inc_weekly' being present in this file.
income_df = pd.read_csv("../data/income/2021_income.csv")
income_df

In [None]:
# Keep only rows whose median weekly personal income is below 10000.
# Rows where the value is missing fail the comparison and are dropped too.
below_cap = income_df['Median_tot_prsnl_inc_weekly'].lt(10000)
income_df = income_df.loc[below_cap]

In [None]:
# "sf" is short for "shape file": read the POA_2016_AUST shapefile.
sf = gpd.read_file("../data/shapefile/POA_2016_AUST.shp")

# Store the POA codes as integers.
# presumably so they compare equal to integer postcodes in the other tables — TODO confirm
sf = sf.astype({'POA_CODE16': int})
sf

In [None]:
# Attach each shapefile geometry to the income rows with the same code
# (inner join: POA_CODE_2021 == POA_CODE16), then discard the redundant
# key column that came from the shapefile.
income_df = pd.merge(
    income_df,
    sf[['POA_CODE16', 'geometry']],
    left_on='POA_CODE_2021',
    right_on='POA_CODE16',
).drop(columns='POA_CODE16')

income_df

In [None]:
# Serialise one feature per POA code (first occurrence wins) to a GeoJSON
# string; each feature keeps POA_CODE16 as a property for the choropleth key.
geoJSON = (
    sf[['POA_CODE16', 'geometry']]
    .drop_duplicates(subset='POA_CODE16')
    .to_json()
)

In [None]:
# Choropleth of the median weekly personal income.

# Base map centred on the fixed coordinates below at zoom level 10.
# NOTE(review): recent folium releases may no longer ship the "Stamen Terrain"
# tile set — confirm against the installed folium version.
m = folium.Map(
    location=[-38.043995, 145.264296],
    tiles="Stamen Terrain",
    zoom_start=10,
)

# Colour each GeoJSON feature by the income value whose POA_CODE_2021 equals
# the feature's POA_CODE16 property; features without a value are filled black.
# See the folium documentation for more on plotting aggregated data.
c = folium.Choropleth(
    geo_data=geoJSON,
    data=income_df,
    columns=['POA_CODE_2021', 'Median_tot_prsnl_inc_weekly'],
    key_on='properties.POA_CODE16',
    fill_color='YlOrRd',
    nan_fill_color='black',
    legend_name='Median Total Personal Income Weekly AUD$',
    name='choropleth',
).add_to(m)

# Write the map to a standalone HTML file, then show it inline.
m.save('../plots/income_weekly.html')
m

In [None]:
# Keep only complete rows. The explicit copy makes the result an independent
# frame, so adding a column below cannot trigger pandas' SettingWithCopyWarning.
income_df = income_df.dropna().copy()

# One (y, x) centroid tuple per geometry, used later as the marker location.
# assumes the shapefile coordinates are longitude/latitude, so (y, x) is
# (latitude, longitude) — TODO confirm
# The centroid is computed once per geometry instead of once per coordinate.
income_df['centroid'] = [
    (point.y, point.x)
    for point in (geom.centroid for geom in income_df['geometry'])
]
income_df[['POA_CODE_2021', 'Median_tot_prsnl_inc_weekly', 'centroid']].head()

In [None]:
# Add a marker at the centroid of every area whose median weekly income is
# above 1500; the popup shows the income value.
high_income = income_df.loc[
    income_df['Median_tot_prsnl_inc_weekly'] > 1500,
    ['Median_tot_prsnl_inc_weekly', 'centroid'],
]
for income, coord in high_income.values:
    folium.Marker(location=coord, popup=income).add_to(m)

# Save the map (choropleth plus markers) under a separate file name.
m.save('../plots/income_weekly_ping.html')
m


## 2. Geovisualisation for the Distribution of Consumers

This visualises how a merchant's transaction counts are distributed across postcodes.

In [None]:
from pyspark.sql import SparkSession

# Settings applied to the session builder, in this order.
spark_settings = [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.executor.memory", "2g"),
    ("spark.driver.memory", "4g"),
]

# Create (or reuse) the Spark session for this notebook.
builder = SparkSession.builder.appName("MAST30034 Project 2")
for option, setting in spark_settings:
    builder = builder.config(option, setting)
spark = builder.getOrCreate()

In [None]:
# Load the curated transaction details from parquet and preview up to 10 rows.
# NOTE(review): later cells rely on the columns 'merchant_name' and 'postcode'.
transaction_df = spark.read.parquet('../data/curated/transactions_detail.parquet')
transaction_df.limit(10)

In [None]:
# Number of rows in the transactions table.
transaction_df.count()

In [None]:
# Count the transactions for every (merchant_name, postcode) pair and name
# the resulting column "transaction_count".
transaction_count_df = (
    transaction_df
    .groupBy('merchant_name', 'postcode')
    .count()
    .withColumnRenamed("count", "transaction_count")
)

In [None]:
# Preview up to 10 rows of the per-merchant, per-postcode counts.
transaction_count_df.limit(10)

In [None]:
# Number of distinct (merchant_name, postcode) pairs.
transaction_count_df.count()

In [None]:
# Show the (merchant_name, postcode) pairs with fewer than 20 transactions.
transaction_count_df.where(transaction_count_df['transaction_count'] < 20)

In [None]:
# Re-aggregate on the same (merchant_name, postcode) keys and count the rows;
# the result can be compared with the row count of transaction_count_df.
transaction_count_df.groupBy('merchant_name', 'postcode').sum('transaction_count').count()

### Visualising the Distribution of a Merchant

Taking the merchant "Erat Vitae LLP" as an example for visualisation:

In [None]:
# Pull the per-postcode counts of the example merchant into pandas.
is_example_merchant = transaction_count_df['merchant_name'] == 'Erat Vitae LLP'
merchant_dist = transaction_count_df.where(is_example_merchant).toPandas()
merchant_dist

In [None]:
# Join the merchant's counts with the shapefile geometries.

# Cast the postcodes to integers before joining on the shapefile's POA_CODE16.
merchant_dist = merchant_dist.astype({'postcode': int})

# Inner join on postcode == POA_CODE16, then drop the redundant key column.
merchant_dist_map = pd.merge(
    merchant_dist,
    sf[['POA_CODE16', 'geometry']],
    left_on='postcode',
    right_on='POA_CODE16',
).drop(columns='POA_CODE16')

merchant_dist_map

In [None]:
# Choropleth of the transaction counts of 'Erat Vitae LLP'.

# Fresh base map centred on the fixed coordinates below at zoom level 10.
# NOTE(review): recent folium releases may no longer ship the "Stamen Terrain"
# tile set — confirm against the installed folium version.
m = folium.Map(
    location=[-38.043995, 145.264296],
    tiles="Stamen Terrain",
    zoom_start=10,
)

# Colour each GeoJSON feature by the transaction count whose postcode equals
# the feature's POA_CODE16 property; features without a value are filled black.
# See the folium documentation for more on plotting aggregated data.
c = folium.Choropleth(
    geo_data=geoJSON,
    data=merchant_dist_map,
    columns=['postcode', 'transaction_count'],
    key_on='properties.POA_CODE16',
    fill_color='YlOrRd',
    nan_fill_color='black',
    legend_name='Number of Transactions',
    name='choropleth',
).add_to(m)

m

Now we add pings for the transaction counts over 150 to visualise which areas have higher transactions.

In [None]:
# One (y, x) centroid tuple per geometry, used later as the marker location.
# assumes the shapefile coordinates are longitude/latitude — TODO confirm
merchant_dist_map['centroid'] = [
    (geom.centroid.y, geom.centroid.x)
    for geom in merchant_dist_map['geometry']
]
merchant_dist_map[['postcode', 'transaction_count', 'centroid']].head()

In [None]:
# Inspect the rows with more than 150 transactions (the ones that get a marker).
merchant_dist_map.loc[merchant_dist_map['transaction_count'] > 150]

In [None]:
# Add a marker at the centroid of every postcode with more than 150
# transactions; the popup shows the transaction count.
busy_postcodes = merchant_dist_map.loc[
    merchant_dist_map['transaction_count'] > 150,
    ['transaction_count', 'centroid'],
]
for count, coord in busy_postcodes.values:
    folium.Marker(location=coord, popup=count).add_to(m)

# Save the map (choropleth plus markers) as standalone HTML.
m.save('../plots/consumer_distribution_ping.html')
m


## Conversion of SA2 to postcode, if needed (example)

### Do not need to run the following:
This is another income dataset, keyed by SA2. It was used to test whether it has less missing data than the postcode-level income set (which is missing some postcodes), but the difference turned out to be small, so it is not very useful.
It is kept in case future tests need it.


In [None]:
# Load the SA2-level income table and normalise its types:
#   1. SA2 codes become strings (they are a join key later on),
#   2. commas are stripped from every string value in the frame,
#   3. the 'Median' column is then parsed as an integer.
sa_income_df = (
    pd.read_csv("../data/income/SA2_income_2018.csv")
    .astype({'SA2': str})
    .replace(',', '', regex=True)
    .astype({'Median': int})
)
sa_income_df

In [None]:
# Count the missing values in each column.
sa_income_df.isnull().sum()

In [None]:
# Inspect rows whose median exceeds 10000 (the same threshold used to filter
# the postcode-level income data earlier in this notebook).
sa_income_df[sa_income_df['Median']>10000]

In [None]:
# Build a postcode <-> SA2 lookup table from the Australian postcodes file.
postcode_df = pd.read_csv("../data/tables/australian_postcodes.csv")

# Missing SA2 codes are replaced by 0 so the column can be held as integers.
postcode_df['SA2_MAINCODE_2016'] = (
    postcode_df['SA2_MAINCODE_2016'].fillna(0).astype(int)
)

# Keep only the two key columns, rename the SA2 column to match the income
# table, store both keys as strings and remove repeated pairs.
convert_df = (
    postcode_df[['postcode', 'SA2_MAINCODE_2016']]
    .rename(columns={'SA2_MAINCODE_2016': 'SA2'})
    .astype(str)
    .drop_duplicates()
)
convert_df

In [None]:
# Attach the matching postcodes to each SA2 income row (inner join on 'SA2').
sa_income_df = sa_income_df.merge(convert_df, on='SA2')
sa_income_df

In [None]:
# Read the POA_2016_AUST shapefile for this section.
sf = gpd.read_file("../data/shapefile/POA_2016_AUST.shp")

# Store the POA codes as integers so they can be joined with integer postcodes.
sf = sf.astype({'POA_CODE16': int})
sf

In [None]:
# Join the SA2 income rows with the shapefile geometries.

# The postcodes were stored as strings for the SA2 lookup; cast them to
# integers to match the shapefile's POA_CODE16 column.
sa_income_df = sa_income_df.astype({'postcode': int})

# Inner join on postcode == POA_CODE16, then drop the redundant key column.
sa_income_df = pd.merge(
    sa_income_df,
    sf[['POA_CODE16', 'geometry']],
    left_on='postcode',
    right_on='POA_CODE16',
).drop(columns='POA_CODE16')

sa_income_df

In [None]:
# Serialise one feature per POA code (first occurrence wins) to a GeoJSON
# string; each feature keeps POA_CODE16 as a property for the choropleth key.
geoJSON = (
    sf[['POA_CODE16', 'geometry']]
    .drop_duplicates(subset='POA_CODE16')
    .to_json()
)


In [None]:
# Choropleth of the SA2-derived median salary per postcode.

# Centre the map explicitly, on the same coordinates as the other maps in this
# notebook. Without a location, folium falls back to a whole-world view at
# [0, 0] and zoom_start=10 has no effect.
# NOTE(review): recent folium releases may no longer ship the "Stamen Terrain"
# tile set — confirm against the installed folium version.
m = folium.Map(
    location=[-38.043995, 145.264296],
    tiles="Stamen Terrain",
    zoom_start=10,
)

# Colour each GeoJSON feature by the 'Median' value whose postcode equals the
# feature's POA_CODE16 property; features without a value are filled black.
# NOTE(review): a postcode may appear on several rows after the SA2 -> postcode
# merge; confirm which row's value should be shown for it.
# See the folium documentation for more on plotting aggregated data.
c = folium.Choropleth(
    geo_data=geoJSON, # geoJSON
    name='choropleth', # name of plot
    data=sa_income_df, # data source
    columns=['postcode','Median'], # the columns required
    key_on='properties.POA_CODE16', # this is from the geoJSON's properties
    fill_color='YlOrRd', # color scheme
    nan_fill_color='black',
    legend_name='Salary'
)

c.add_to(m)

m