In [110]:
import requests
from os.path import exists
import zipfile
import folium
import pandas as pd
import geopandas as gpd
import math
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [111]:
# Income use SA2 from 2016
# Population use SA2 from 2021

if not exists('../data/raw/1270055001_sa2_2016_aust_shape.zip'):
    r = requests.get('https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&1270055001_sa2_2016_aust_shape.zip&1270.0.55.001&Data%20Cubes&A09309ACB3FA50B8CA257FED0013D420&0&July%202016&12.07.2016&Latest', allow_redirects=True)
    open('../data/raw/1270055001_sa2_2016_aust_shape.zip', 'wb').write(r.content)
if not exists('../data/raw/1270055001_sa2_2016_aust_shape.shp'):
    with zipfile.ZipFile("../data/raw/1270055001_sa2_2016_aust_shape.zip", "r") as zip_ref:
        zip_ref.extractall(path='../data/raw')
if not exists('../data/raw/sa.zip'):
    r = requests.get('https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip', allow_redirects=True)
    open('../data/raw/sa.zip', 'wb').write(r.content)
if not exists('../data/raw/SA2_2021_AUST_GDA2020.shp'):
    with zipfile.ZipFile("../data/raw/sa.zip", "r") as zip_ref:
        zip_ref.extractall(path='../data/raw')

In [112]:
# Preoprocess SA2 geometry

sa22016 = gpd.read_file("../data/raw/SA2_2016_AUST.shp")
sa22016 = sa22016.rename(columns={'SA2_MAIN16': 'SA2 code'})
sa22016 = sa22016[sa22016['STE_NAME16'] == 'Victoria']
sa22016 = sa22016[['SA2 code', 'geometry']]
sa22016 = sa22016[sa22016['geometry'] != None]
sa22016['geometry'] = sa22016['geometry'].\
    to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
sa22016['SA2area'] = sa22016['geometry'].area
sa22016['SA2 code'] = sa22016['SA2 code'].astype(int)

sa22021 = gpd.read_file("../data/raw/SA2_2021_AUST_GDA2020.shp")
sa22021 = sa22021.rename(columns={'SA2_CODE21': 'SA2 code'})
sa22021 = sa22021[sa22021['STE_NAME21'] == 'Victoria']
sa22021 = sa22021[['SA2 code', 'geometry']]
sa22021 = sa22021[sa22021['geometry'] != None]
sa22021['geometry'] = sa22021['geometry'].\
    to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
sa22021['SA2area'] = sa22021['geometry'].area
sa22021['SA2 code'] = sa22021['SA2 code'].astype(int)



Geometry is in a geographic CRS. Results from 'area' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.



Geometry is in a geographic CRS. Results from 'area' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.




In [114]:
# Preoprocess suburb geometry

gdf = gpd.read_file('../data/raw/gda94_victoriagrid/esrishape/whole_of_dataset/victoria/VMADMIN/POSTCODE_POLYGON.shp')
gdf = gdf[['POSTCODE', 'geometry']]
gdf['geometry'] = gdf['geometry'].\
    to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
gdf['subarea'] = gdf['geometry'].area
gdf['POSTCODE'] = gdf['POSTCODE'].astype(int)


Geometry is in a geographic CRS. Results from 'area' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.




In [115]:
# Gain intersection of suburb and SA2

inter2016 = gpd.overlay(sa22016, gdf, how="intersection")
inter2016['geometry'] = inter2016['geometry'].\
    to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
inter2016['intersectionarea'] = inter2016['geometry'].area

inter2021 = gpd.overlay(sa22021, gdf, how="intersection")
inter2021['geometry'] = inter2021['geometry'].\
    to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
inter2021['intersectionarea'] = inter2021['geometry'].area



Geometry is in a geographic CRS. Results from 'area' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.



Geometry is in a geographic CRS. Results from 'area' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.




In [116]:
def SA2_to_suburb(file, inter, methodology, name):
    """
    :param file: file to precess
    :param inter: overlayed dataframe by SA2 and Suburb
    :param methodology: calculation based on SA2 or Suburb
    :param name: column name for SA2 or Suburb
    :returns: data by suburb
    """

    df = pd.read_csv(file)
    intergdf = gpd.GeoDataFrame(
        pd.merge(df, inter, on='SA2 code', how='inner')
    )

    years = list(set(df.columns) - set(['SA2 code']))
    years.sort()

    # Percentage of the area of this district compared to its SA2 district
    intergdf['ratio'] = intergdf['intersectionarea'] / intergdf[methodology]

    for year in years:
        # Calculate number in that district
        intergdf[year] = intergdf[year] * intergdf['ratio']

    # sum the number in one suburb
    resultdf = intergdf.groupby(['POSTCODE']).\
        sum()[years].reset_index()

    # Make sure the whole ratio is 1 overall and check for outlier
    a = intergdf.groupby(name).sum('ratio').reset_index()
    pd.options.plotting.backend = "plotly"
    v = a.plot(kind='scatter', x=name, y='ratio').show()

    return resultdf


There is a simple example on how it is calculated. Population is proportional to SA2 area, for it is reasonable to assumn that people are equally distributed within SA2 districts. Income is proportional to suburb area, as it is not proportional to the SA2 area, hence we sum up the income in each suburb by its percentage area within suburb.

<img src="../plots/SA2toSub.jpg" width=800 height=600 />



In [117]:
# Calculation of intersection area income
# is based on its percentage on suburb.

filepath = Path('../data/curated/subincome.csv')
resultdf = SA2_to_suburb('../data/curated/predictincome.csv',
                         inter2016, 'subarea', 'POSTCODE')
resultdf.to_csv(filepath, index=False)

In [118]:
# Calculation of intersection area population
# is based on its percentage on SA2 area.

filepath = Path('../data/curated/subpopu.csv')
resultdf = SA2_to_suburb('../data/curated/predictpopu.csv',
                         inter2021, 'SA2area', 'SA2 code')
resultdf.to_csv(filepath, index=False)