# Visualization

---

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import* 
import pandas as pd
import folium
import geopandas as gpd
from shapely import wkt
import matplotlib.pyplot as plt

In [2]:
spark = (
    SparkSession.builder.appName("Visualisation")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.driver.memory","4G")
    .config("spark.executor.memory","4G")
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

24/09/07 02:42:40 WARN Utils: Your hostname, Cocos-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.16.33.67 instead (on interface en0)
24/09/07 02:42:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/07 02:42:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/07 02:42:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/07 02:42:42 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


# Read datasets

In [3]:
# read datasets
consumer  = spark.read.parquet('../data/curated/consumer')
income  = spark.read.parquet('../data/curated/income')
merchant  = spark.read.parquet('../data/curated/merchant')
population  = spark.read.parquet('../data/curated/population')
unemployment  = spark.read.parquet('../data/curated/unemployment')
merged_external = spark.read.parquet('../data/curated/merged_external')
merged_external_df = pd.read_csv('../data/curated/merged_datasets.csv')

                                                                                

In [4]:
# check the data type
print('merged_external_df:')
print(merged_external_df.dtypes)

merged_external_df:
SA2_name               object
average_population    float64
earners                 int64
income                  int64
unemployment_rate     float64
geometry               object
postcode                int64
state                  object
dtype: object


In [5]:
# convert the 'geometry' column from WKT strings to Shapely geometries
merged_external_df['geometry'] = merged_external_df['geometry'].apply(wkt.loads)

# create a GeoDataFrame
merged_gdf = gpd.GeoDataFrame(merged_external_df, geometry='geometry')

# set the original CRS (assuming Web Mercator for example)
merged_gdf.set_crs(epsg=4326, inplace=True)

Unnamed: 0,SA2_name,average_population,earners,income,unemployment_rate,geometry,postcode,state
0,Kiama Downs - Minnamurra,6002.0,3633,255198601,3.2,"POLYGON ((150.84893 -34.63408, 150.84936 -34.6...",2533,NSW
1,Lismore,15079.5,7696,400449143,6.7,"POLYGON ((153.25828 -28.80043, 153.25836 -28.7...",2480,NSW
2,Raymond Terrace,14802.0,7118,390229618,7.2,"POLYGON ((151.74096 -32.75945, 151.74258 -32.7...",2322,NSW
3,Raymond Terrace,14802.0,7118,390229618,7.2,"POLYGON ((151.74096 -32.75945, 151.74258 -32.7...",2324,NSW
4,Niagara Park - Lisarow,8215.5,4877,315596744,3.9,"POLYGON ((151.34439 -33.38197, 151.34385 -33.3...",2250,NSW
...,...,...,...,...,...,...,...,...
4926,Hume,394.0,17,1269271,3.4,"POLYGON ((149.15243 -35.39743, 149.1522 -35.39...",2620,ACT
4927,Hume,394.0,17,1269271,12.5,"POLYGON ((149.15243 -35.39743, 149.1522 -35.39...",2620,ACT
4928,Macarthur,1382.0,972,79841775,2.9,"POLYGON ((149.1251 -35.40572, 149.12402 -35.40...",2904,ACT
4929,Hawker,2991.0,2074,170782353,3.9,"POLYGON ((149.02894 -35.24157, 149.02907 -35.2...",2614,ACT


In [6]:
print(merged_gdf.dtypes)

SA2_name                object
average_population     float64
earners                  int64
income                   int64
unemployment_rate      float64
geometry              geometry
postcode                 int64
state                   object
dtype: object


In [7]:
merged_gdf.shape

(4931, 8)

# Maps

In [8]:
geoJSON = merged_gdf[['SA2_name', 'geometry']].drop_duplicates('SA2_name').to_json()

## Population distribution by SA2 name

In [9]:
# select the top 5 SA2 name based on 'average_population'
top5_population = merged_gdf.nlargest(5, 'average_population')
top5_population

Unnamed: 0,SA2_name,average_population,earners,income,unemployment_rate,geometry,postcode,state
2229,Tarneit - Central,28645.5,15501,912506820,7.2,"POLYGON ((144.65232 -37.83945, 144.65237 -37.8...",3029,VIC
1016,Schofields - East,27520.5,13319,1041581844,4.7,"POLYGON ((150.87205 -33.6896, 150.87232 -33.68...",2762,NSW
3815,Baldivis - South,27385.5,15173,1146932303,5.0,"POLYGON ((115.79037 -32.3306, 115.79105 -32.32...",6171,WA
2116,Pakenham - South West,27302.5,14622,850335823,5.2,"POLYGON ((145.43721 -38.0642, 145.43739 -38.06...",3810,VIC
1444,Wonthaggi - Inverloch,27137.0,14035,764219024,4.0,"POLYGON ((145.4229 -38.41215, 145.42268 -38.41...",3925,VIC


In [21]:
# create the map
population_map = folium.Map(location=[-25.2744, 133.7751], zoom_start=4)

folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=merged_gdf[['SA2_name', 'average_population', 'geometry']],
    columns=['SA2_name', 'average_population'],
    key_on='feature.properties.SA2_name',
    bins=7,
    fill_color='RdPu',
    nan_fill_color='gray',
    legend_name='Population Distribution'
).add_to(population_map)

# mark the area of top 5 population
top5_population['centroid'] = top5_population['geometry'].centroid

for _, row in top5_population.iterrows():
    lat, lon = row['centroid'].y, row['centroid'].x
    
    folium.Marker(
        location=[lat, lon],
        popup=f"{row['SA2_name']} - Population: {row['average_population']}",
        tooltip=f"{row['SA2_name']}"
    ).add_to(population_map)

# save the map
population_map.save('../plots/population_distribution.html')


  top5_population['centroid'] = top5_population['geometry'].centroid


## Income distribution by SA2 name

In [12]:
# select the top 5 SA2 name based on income
top5_income = merged_gdf.nlargest(5, 'income')
top5_income

Unnamed: 0,SA2_name,average_population,earners,income,unemployment_rate,geometry,postcode,state
1349,Brighton (Vic.),22975.0,15882,2281369435,3.7,"POLYGON ((144.98392 -37.90034, 144.98387 -37.9...",3186,VIC
1023,Manly - Fairlight,22756.0,16374,2140501569,3.4,"POLYGON ((151.26947 -33.79355, 151.26963 -33.7...",2094,NSW
1024,Manly - Fairlight,22756.0,16374,2140501569,3.4,"POLYGON ((151.26947 -33.79355, 151.26963 -33.7...",2095,NSW
1054,Lindfield - Roseville,24711.0,16338,2057087501,4.1,"POLYGON ((151.17806 -33.7681, 151.17818 -33.76...",2069,NSW
1055,Lindfield - Roseville,24711.0,16338,2057087501,4.1,"POLYGON ((151.17806 -33.7681, 151.17818 -33.76...",2070,NSW


In [24]:
# create the map
income_map = folium.Map(location=[-25.2744, 133.7751], zoom_start=4)

folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=merged_gdf[['SA2_name', 'income', 'geometry']],
    columns=['SA2_name', 'income'],
    key_on='feature.properties.SA2_name',
    bins=5,
    fill_color='RdPu',
    nan_fill_color='gray',
    legend_name='Income Distribution'
).add_to(income_map)

# mark the area of top 5 income
top5_income['centroid'] = top5_income['geometry'].centroid

for _, row in top5_income.iterrows():
    lat, lon = row['centroid'].y, row['centroid'].x
    
    folium.Marker(
        location=[lat, lon],
        popup=f"{row['SA2_name']} - Income: {row['income']}",
        tooltip=f"{row['SA2_name']}"
    ).add_to(income_map)

# save the map
income_map.save('../plots/income_distribution.html')


  top5_income['centroid'] = top5_income['geometry'].centroid


## Unemployment rate distribution by SA2 name

In [15]:
# select the top 5 SA2 name based on unemployment rate
top5_unemp = merged_gdf.nlargest(5, 'unemployment_rate')
top5_unemp

Unnamed: 0,SA2_name,average_population,earners,income,unemployment_rate,geometry,postcode,state
4855,Whitlam,30.5,15,989500,66.7,"POLYGON ((149.05114 -35.26921, 149.05092 -35.2...",2611,ACT
2610,Carole Park,7.0,17,1105340,50.0,"POLYGON ((152.91544 -27.61917, 152.91593 -27.6...",4300,QLD
3280,Parafield,18.0,7,416932,50.0,"POLYGON ((138.61917 -34.79599, 138.61976 -34.7...",5106,SA
3635,Kowanyama - Pormpuraaw,1756.5,541,21734438,42.0,"MULTIPOLYGON (((141.591 -15.19652, 141.59143 -...",4892,QLD
2754,Yarrabah,2608.0,601,25354037,37.2,"MULTIPOLYGON (((145.90107 -16.99399, 145.90083...",4871,QLD


In [22]:
# create the map
unemp_map = folium.Map(location=[-25.2744, 133.7751], zoom_start=4)

folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=merged_gdf,
    columns=['SA2_name', 'unemployment_rate'],
    key_on='feature.properties.SA2_name',
    bins=9,
    fill_color='RdPu',
    nan_fill_color='gray',
    legend_name='Unemployment Rate Distribution'
).add_to(unemp_map)

# mark the area of top 5 unemployment rate
top5_unemp['centroid'] = top5_unemp['geometry'].centroid

for _, row in top5_unemp.iterrows():
    lat, lon = row['centroid'].y, row['centroid'].x
    
    folium.Marker(
        location=[lat, lon],
        popup=f"{row['SA2_name']} - Unemployment Rate: {row['unemployment_rate']}",
        tooltip=f"{row['SA2_name']}"
    ).add_to(unemp_map)

# save the plot
unemp_map.save('../plots/unemployment_rate_distribution.html')


  top5_unemp['centroid'] = top5_unemp['geometry'].centroid
