In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
import pandas as pd
import geopandas as gpd
import folium
from shapely.wkt import loads

In [None]:
# Start (or reuse) the Spark session that runs every Spark job in this notebook.
_builder = SparkSession.builder.appName("MAST30034 Tutorial 1")
for _option, _setting in (
    # Let the notebook render DataFrames as tables without an explicit .show().
    ("spark.sql.repl.eagerEval.enabled", True),
    # Parquet metadata caching flag.
    ("spark.sql.parquet.cacheMetadata", "true"),
    # Pin the session time zone to UTC.
    ("spark.sql.session.timeZone", "Etc/UTC"),
):
    # Builder.config returns the builder, so rebinding keeps the chain semantics.
    _builder = _builder.config(_option, _setting)
spark = _builder.getOrCreate()

In [None]:
# SA2 (2021) boundary geometries for Victoria, and the curated census table.
geospark = spark.read.parquet('../data/external/SA2_2021_VIC_shapefile.parquet/')
census = spark.read.parquet('../data/curated/final_census.parquet')

# Left join keeps every SA2 boundary row even when no census row matches; the
# census-side key is dropped afterwards so only SA2_CODE21 remains as the key.
vic_data = (
    geospark
    .join(
        census,
        on=geospark.SA2_CODE21 == census.SA2_CODE_2021,
        how="left",
    )
    .drop(census.SA2_CODE_2021)
)

In [None]:
# Use SA2 regions to do the visualisation
def visualisation(col_name, vic_data, geospark, title):
    """Draw, save and return a folium choropleth of ``col_name`` averaged per SA2.

    Args:
        col_name: Column of ``vic_data`` to average per ``SA2_CODE21`` and use
            to shade the map. Also used in the output file name.
        vic_data: Spark DataFrame containing ``SA2_CODE21`` and ``col_name``.
        geospark: Spark DataFrame containing ``SA2_CODE21`` and a ``geometry``
            column holding WKT strings.
        title: Caption for the colour legend. Falls back to ``col_name`` when
            empty or ``None``.

    Returns:
        The ``folium.Map`` with the choropleth layer attached, so a notebook
        cell ending in a call to this function renders the map inline.

    Side effects:
        Writes ``../plots/foliumChoropleth_<col_name>.html``.
    """
    # Average on the Spark side so only one row per SA2 is collected to pandas.
    sa2_means = vic_data.groupBy("SA2_CODE21").agg(F.avg(col_name).alias(col_name))

    # Collect the boundaries and parse the WKT strings into geometries.
    boundaries = geospark.select("SA2_CODE21", "geometry").toPandas()
    boundaries["geometry"] = gpd.GeoSeries.from_wkt(boundaries["geometry"])
    sa2_shapes = gpd.GeoDataFrame(boundaries, geometry="geometry")

    # One feature per SA2 code; folium matches rows to features via this property.
    geojson = (
        sa2_shapes[["SA2_CODE21", "geometry"]]
        .drop_duplicates("SA2_CODE21")
        .to_json()
    )

    # Map view starts around Melbourne.
    # NOTE(review): newer folium releases reportedly no longer ship the Stamen
    # tile sets - confirm "Stamen Terrain" works with the installed version.
    sa2_map = folium.Map(location=[-37.84, 144.95], tiles="Stamen Terrain", zoom_start=10)

    choropleth = folium.Choropleth(
        geo_data=geojson,  # SA2 boundaries as GeoJSON
        name='choropleth',  # layer name
        data=sa2_means.toPandas(),  # per-SA2 averages
        columns=['SA2_CODE21', col_name],  # [key column, value column]
        key_on='properties.SA2_CODE21',  # GeoJSON property holding the key
        fill_color='YlOrRd',  # colour scheme
        nan_fill_color='black',  # regions with no value
        legend_name=title or col_name,
    )
    # add_to() returns the layer, not the map, so keep working with sa2_map.
    choropleth.add_to(sa2_map)

    sa2_map.save('../plots/foliumChoropleth_' + col_name + '.html')
    return sa2_map


In [None]:
# Produce one choropleth per census metric. Arguments: the column to average
# per SA2, the joined boundary/census data, the SA2 boundaries, and a title.
# Each call writes its map to ../plots/foliumChoropleth_<column>.html.
visualisation("income_percentage", vic_data, geospark,"average income percentage by SA2")
visualisation("age_percentage", vic_data, geospark,"age_percentage")