## Geospatial Analysis by SA2
1. By Population Number
2. By Median Age
3. By Income
4. By Transaction Frequency

#### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import geopandas as gpd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

import geopandas as gpd
import folium
from pyspark.sql import SparkSession, Window, functions as F
from pyspark.sql.functions import countDistinct, col, date_format
import numpy as np
import pyspark.sql.functions as func
from pyspark.sql.types import (
    StringType,
    LongType,
    DoubleType,
    StructField,
    StructType,
    FloatType
)

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Build (or reuse) the Spark session used by every Spark DataFrame below.
from pyspark.sql import SparkSession

# Session settings, applied to the builder in this order.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
    "spark.executor.memory": "8g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 2 BNPL")
for setting, value in spark_settings.items():
    session_builder = session_builder.config(setting, value)
spark = session_builder.getOrCreate()

#### Load Datasets

In [None]:
# Read the raw BNPL tables into Spark: the consumer table is a
# pipe-delimited CSV with a header row, the other two are Parquet.
merchants = spark.read.parquet("../data/tables/tbl_merchants.parquet")
details = spark.read.parquet("../data/tables/consumer_user_details.parquet")
consumer = spark.read.csv(
    "../data/tables/tbl_consumer.csv",
    sep="|",
    header=True,
)

In [None]:
# Read the ABS population GML, then keep the SA2 identifier columns together
# with every column whose name mentions 2021 or the 2020_21 period.
pop_df = gpd.read_file('../data/abs/2021_population_census.gml')

area_id = [
    'gml_id',
    'primaryindex',
    'state_code_2016',
    'sa2_maincode_2016',
    'sa2_name_2016',
]
year_tags = ('2021', '2020_21')
col_2021 = [name for name in pop_df.columns if any(tag in name for tag in year_tags)]
pop_21 = pop_df[area_id + col_2021]

In [None]:
# Read the SA2-level income GML with geopandas.
sa2_income_path = '../data/abs/sa2_income.gml'
sa2_income = gpd.read_file(sa2_income_path)

In [None]:
# Read every transaction snapshot and stack them into one Spark DataFrame.
paths = [
    '../data/tables/transactions_20210228_20210827_snapshot',
    '../data/tables/transactions_20210828_20220227_snapshot',
]

transactions = None
for path in paths:
    snapshot = spark.read.parquet(path)
    # The first snapshot seeds the result; later ones are appended with union().
    transactions = snapshot if transactions is None else transactions.union(snapshot)
    print(f'added {path.split("/")[3]}')

In [None]:
# Read the curated postcode (POA) to SA2 lookup into Spark, using the first
# row of the CSV as the header.
poa_to_sa2 = spark.read.option("header", True).csv("../data/curated/poa_w_sa2.csv")

In [None]:
# Preview the first five lookup rows, one field per line.
poa_to_sa2.show(n=5, vertical=True)

In [None]:
# Both tables carry a 'name' column; give each a table-specific name so the
# two stay distinguishable once the tables are joined.
consumer = consumer.withColumnRenamed('name', 'consumer_name')
merchants = merchants.withColumnRenamed('name', 'merchant_name')

#### Merge SA2 to transaction by postcode

In [None]:
# Build the full transaction table through three inner joins:
#   consumer -> details (consumer_id) -> transactions (user_id) -> merchants (merchant_abn)
consumer_detail = consumer.join(details, on=["consumer_id"], how="inner")

# Attach every transaction made by each consumer.
consumer_trx = consumer_detail.join(transactions, on=["user_id"], how="inner")

# Attach the merchant record for each transaction.
df_trx = consumer_trx.join(merchants, on=["merchant_abn"], how="inner")

In [None]:
# Preview the first five joined transaction rows, one field per line.
df_trx.show(n=5, vertical=True)

In [None]:
# Attach SA2 code, name and geometry to each transaction by matching the
# transaction postcode against the POA name in the lookup table.
sa2_cols = ['poa_name_2016', 'sa2_maincode_2016', 'sa2_name_2016', 'geometry']
sa2_lookup = poa_to_sa2[sa2_cols]
postcode_matches_poa = df_trx['postcode'] == poa_to_sa2['poa_name_2016']

df_trx_sa2 = df_trx.join(sa2_lookup, on=postcode_matches_poa, how='inner')
# The POA column duplicates 'postcode' after the match, so drop it.
df_trx_sa2 = df_trx_sa2.drop('poa_name_2016')

In [None]:
# Print the column names and types of the SA2-annotated transaction table.
df_trx_sa2.printSchema()

---
### Geospatial Analysis

In [None]:
# Load the same postcode-to-SA2 CSV with pandas (it was read into Spark above)
# so its geometry column can be handled with shapely/geopandas.
poa_to_sa2_pd = pd.read_csv("../data/curated/poa_w_sa2.csv")

In [None]:
# Discard every lookup row that has at least one missing value.
poa_to_sa2_pd = poa_to_sa2_pd.dropna(axis=0, how='any')

In [None]:
from shapely import wkt

# Parse the WKT text in 'geometry' into shapely objects, then wrap the frame
# as a GeoDataFrame tagged with EPSG:4326.
geometry_wkt = poa_to_sa2_pd['geometry'].astype('str')
poa_to_sa2_pd['geometry'] = geometry_wkt.apply(wkt.loads)
gdf = gpd.GeoDataFrame(poa_to_sa2_pd, crs='epsg:4326')

In [None]:
# Reproject the geometry column to WGS84 longitude/latitude (PROJ string form).
wgs84_proj = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
gdf['geometry'] = gdf['geometry'].to_crs(wgs84_proj)

In [None]:
# Serialise one geometry per SA2 code to a GeoJSON string for the folium maps.
sa2_shapes = gdf[['sa2_maincode_2016', 'geometry']]
geoJSON = sa2_shapes.drop_duplicates(subset='sa2_maincode_2016').to_json()

#### 1. Geospatial Analysis by Population Number

In [None]:
# Choropleth of the 2021 estimated resident population ('erp_2021') per SA2.
# NOTE(review): recent folium releases appear to no longer bundle the Stamen
# tile sets — confirm "Stamen Terrain" still resolves in the target environment.
m = folium.Map(
    location=[-38.043995, 145.264296],
    tiles="Stamen Terrain",
    zoom_start=8,
)
population_layer = folium.Choropleth(
    geo_data=geoJSON,
    data=pop_21,
    # [key column, value column] in `data`; the key is matched against `key_on`.
    columns=['sa2_maincode_2016', 'erp_2021'],
    key_on='properties.sa2_maincode_2016',
    fill_color='YlGnBu',
    name='choropleth',
    legend_name='Estimated Population Number per Area',
)
m.add_child(population_layer)
m

#### 2. Geospatial Analysis by Median Age of Earners

#### 3. Geospatial Analysis by Income

In [None]:
# Choropleth of the median age of earners per SA2, from the ABS income table.
# NOTE(review): this cell sits under the "Income" heading but plots
# 'median_age_of_earners_years' — confirm which measure is intended here.
# NOTE(review): recent folium releases appear to no longer bundle the Stamen
# tile sets — confirm "Stamen Terrain" still resolves in the target environment.
m = folium.Map(location=[-38.043995, 145.264296], tiles="Stamen Terrain", zoom_start=8)
m.add_child(folium.Choropleth(
    geo_data=geoJSON,
    data=sa2_income,
    # [key column, value column] in `data`; the key is matched against `key_on`.
    columns=['sa2_code', 'median_age_of_earners_years'],
    key_on='properties.sa2_maincode_2016',
    fill_color='YlGnBu',
    name='choropleth',
    # The legend used to be the population map's text, which mislabelled
    # this layer; it now describes the column that is actually plotted.
    legend_name='Median Age of Earners (years) per Area'))
m

#### 4. Geospatial Analysis by Transaction Frequency

In [None]:
# Choropleth of transaction frequency (number of transactions) per SA2.
#
# folium.Choropleth needs a pandas DataFrame/Series, so the Spark DataFrame is
# first reduced to one row per SA2 and collected to pandas.
# NOTE(review): if a postcode maps to several SA2 areas in poa_to_sa2, the
# postcode join repeats that transaction once per SA2 — confirm this is the
# intended way to attribute transactions.
trx_per_sa2 = (
    df_trx_sa2
    .filter(F.col('sa2_maincode_2016').isNotNull())  # null keys cannot be mapped
    .groupBy('sa2_maincode_2016')
    .count()
    .withColumnRenamed('count', 'transaction_count')
    .toPandas()
)
# The Spark CSV reader was used without schema inference, so the SA2 codes are
# strings here; cast them to the dtype of the GeoDataFrame that produced the
# GeoJSON so the keys can match the map features.
trx_per_sa2['sa2_maincode_2016'] = (
    trx_per_sa2['sa2_maincode_2016'].astype(gdf['sa2_maincode_2016'].dtype)
)

# NOTE(review): recent folium releases appear to no longer bundle the Stamen
# tile sets — confirm "Stamen Terrain" still resolves in the target environment.
m = folium.Map(location=[-38.043995, 145.264296], tiles="Stamen Terrain", zoom_start=8)
m.add_child(folium.Choropleth(
    geo_data=geoJSON,
    data=trx_per_sa2,
    # [key column, value column] in `data`; the key is matched against `key_on`.
    columns=['sa2_maincode_2016', 'transaction_count'],
    key_on='properties.sa2_maincode_2016',
    fill_color='YlGnBu',
    name='choropleth',
    legend_name='Number of Transactions per Area'))
m