In [74]:
import os

import folium
import geopandas as gpd
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from shapely.wkt import loads
# Build (or reuse) the Spark session that runs every Spark job in this notebook.
# The options are collected in one mapping and applied to the builder in order.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
}
session_builder = SparkSession.builder.appName("MAST30034 Tutorial 1")
for option_key, option_value in spark_options.items():
    session_builder = session_builder.config(option_key, option_value)
spark = session_builder.getOrCreate()

In [97]:
# Load the curated merchant table and the SA2 geometry table from parquet.
final = spark.read.parquet('../data/curated/final_merchant_info.parquet')
geospark = spark.read.parquet('../data/curated/SA2_2021_VIC_shapefile.parquet/')

# Left join from the SA2 table so every SA2 row is kept, then drop the
# merchant-side copy of the join key.
sa2_key_matches = geospark.SA2_CODE21 == final.SA2_CODE_2021
vic_data = (
    geospark
    .join(final, sa2_key_matches, "left")
    .drop(final.SA2_CODE_2021)
)

vic_data.dtypes

[('SA2_CODE21', 'string'),
 ('SA2_NAME21', 'string'),
 ('geometry', 'string'),
 ('income_percentage', 'double'),
 ('age_percentage', 'double'),
 ('SA2_NAME_2021', 'string'),
 ('RATIO_FROM_TO', 'string'),
 ('postcode', 'string'),
 ('avg_lat', 'double'),
 ('avg_long', 'double'),
 ('count_of_bigorder', 'bigint'),
 ('Avg_amount_monthly', 'double'),
 ('Avg_count_monthly', 'double'),
 ('Order_avg_value', 'double'),
 ('avg_prob_fraud_cus', 'double'),
 ('num_of_fraud', 'double'),
 ('count_cus_per_mon', 'double'),
 ('fix_cus_prob', 'double'),
 ('merchant_abn', 'bigint')]

In [95]:
def visualisation(col_name, vic_data, geospark, title, verbose=True):
    """Plot the per-SA2 average of ``col_name`` as a folium choropleth and save it.

    Args:
        col_name: Name of the column in ``vic_data`` that is averaged per
            ``SA2_CODE21``.
        vic_data: Spark DataFrame holding ``SA2_CODE21`` and ``col_name``.
        geospark: Spark DataFrame holding ``SA2_CODE21`` and a ``geometry``
            column of WKT strings (parsed with ``GeoSeries.from_wkt``).
        title: Human-readable caption used as the legend label. Falls back to
            ``col_name`` when empty or ``None``.
        verbose: When True (the default) print the boundary frame's columns
            and the first 300 characters of the generated GeoJSON.

    Returns:
        folium.Map: the map with the choropleth layer attached. The same map
        is written to ``../plots/foliumChoropleth<col_name>.html``.
    """
    # One averaged value per SA2 region.
    data = vic_data.groupBy("SA2_CODE21").agg(F.avg(col_name).alias(col_name))

    # Bring the boundaries to pandas and parse the WKT strings into geometries.
    pdf = geospark.select("SA2_CODE21", "geometry").toPandas()
    pdf['geometry'] = gpd.GeoSeries.from_wkt(pdf['geometry'])
    gdf = gpd.GeoDataFrame(pdf, geometry='geometry')
    if verbose:
        print(pdf.columns)

    # Keep a single feature per SA2 code so each region is drawn once.
    geoJSON = gdf[['SA2_CODE21', 'geometry']].drop_duplicates('SA2_CODE21').to_json()

    # NOTE(review): newer folium releases appear to have dropped the built-in
    # Stamen tilesets -- confirm the installed version still accepts this name.
    m = folium.Map(location=[-37.84, 144.95], tiles="Stamen Terrain", zoom_start=10)
    if verbose:
        print(geoJSON[:300])

    # refer to the folium documentations on how to plot aggregated data.
    folium.Choropleth(
        geo_data=geoJSON,  # geoJSON
        name='choropleth',  # name of plot
        data=data.toPandas(),  # data source
        columns=['SA2_CODE21', col_name],  # the columns required
        key_on='properties.SA2_CODE21',  # this is from the geoJSON's properties
        fill_color='YlOrRd',  # color scheme
        nan_fill_color='black',
        legend_name=title or col_name,  # previously `title` was silently ignored
    ).add_to(m)

    # Create the output folder if needed so save() cannot fail on a fresh checkout.
    os.makedirs('../plots', exist_ok=True)
    m.save('../plots/foliumChoropleth' + col_name + '.html')

    # Return the map so a notebook cell ending in this call renders it; the old
    # bare `map` expression inside the function body had no effect.
    return m

In [96]:
# Choropleth of the average income percentage for each SA2 region.
visualisation(
    col_name="income_percentage",
    vic_data=vic_data,
    geospark=geospark,
    title="average income percentage by SA2",
)

                                                                                

Index(['SA2_CODE21', 'geometry'], dtype='object')
{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {"SA2_CODE21": "216021413"}, "geometry": {"type": "Polygon", "coordinates": [[[144.9840516499428, -36.05297636768364], [144.9840130799277, -36.05318303768235], [144.9839141899028, -36.05357800767813], [144.983900
