final outputs:

Digital boundary:
- ../data/raw/ABS/digitalBoundary/SA2_2021_AUST_GDA2020.shp

SA2 code to district names
- ../data/raw/ABS/SA2_TO_Name.csv
- code(int), name(String)


Estimated Resident Population (ERP) (2010 to 2021) (By SA2)
- ../data/raw/ABS/ERP/ERP.csv
- SA2 (int), year (int), population (int)

Household income (weekly) (exclude visitor/non-classifiable) (2021) (By SA2)
- ../data/raw/ABS/Household_income/Household_income.csv
- SA2 (int), year (int), household_type (String), income_level (String), population (int)

Population projection (2017 - 2066) (VIC overall)
- ../data/raw/ABS/Population/Population.csv
- year (int), population (int)

School location
- ../data/raw/ABS/School_location/School_location.csv
- School Name (String), SA2 (int), Latitude (float), Longitude (float), School Type (String)



In [9]:
from urllib.request import urlretrieve
import sys
import pandas as pd
import geopandas as gpd 
import folium
import requests
import math
import zipfile
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


# Spark session shared by the transform cells; the options are applied in the
# order listed.
_spark_options = [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
]
_builder = SparkSession.builder.appName("Assignment_2")
for _option, _value in _spark_options:
    _builder = _builder.config(_option, _value)
spark = _builder.getOrCreate()

# Render pandas floats with thousands separators and four decimal places.
pd.set_option("display.float_format", '{:,.4f}'.format)

# Root folder for everything this notebook downloads or writes
# (the trailing slash matters: paths are built by plain concatenation).
OUTPUT_DIR = "../data/raw/ABS/"

# HTTP request headers asking for a CSV response body.
headers = {"accept": "text/csv"}

In [3]:
def pull_direct(url, output_dir, file_name):
    """Download ``url`` and save it as ``output_dir`` + ``file_name``.

    The destination path is built by plain string concatenation, so
    ``output_dir`` must end with a path separator.

    The download is written to a temporary ``<target>.part`` file and only
    moved onto the final path once it has completed, so a failed request
    never leaves a truncated file (or clobbers a previous good download).

    Args:
        url: Address of the resource to fetch.
        output_dir: Destination folder; created if it does not exist.
        file_name: Name of the file to create inside ``output_dir``.

    Returns:
        bool: True if the file was downloaded, False if the request failed.
        Failures are reported on stdout rather than raised.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    target = f"{output_dir}{file_name}"
    partial = f"{target}.part"

    try:
        urlretrieve(url, partial)
        os.replace(partial, target)
    except Exception as e:
        # Best effort by design: report the problem and let the caller decide.
        if os.path.exists(partial):
            os.remove(partial)
        print("********\nRequest failure: ")
        print(e)
        print("********")
        return False

    print(f"Request succeed: pulling from {url}\nFile saved in: {target}")
    return True

def write_file(output_dir, file_name, content, mod = "w"):
    """Write ``content`` to ``output_dir`` + ``file_name``.

    The destination path is built by plain string concatenation, so
    ``output_dir`` must end with a path separator.

    Args:
        output_dir: Destination folder; created if it does not exist.
        file_name: Name of the file to write inside ``output_dir``.
        content: ``str`` for text modes, ``bytes`` for binary modes.
        mod: Mode string passed to ``open`` (default ``"w"``).

    Returns:
        bool: True when the write succeeded, False otherwise. Write errors
        are printed, not raised.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Text is always stored as UTF-8 instead of the platform default encoding,
    # so the same notebook produces the same files on every machine.
    # Binary modes must not receive an encoding.
    encoding = None if "b" in mod else "utf-8"

    try:
        with open(f"{output_dir}{file_name}", mod, encoding=encoding) as f:
            f.write(content)
    except Exception as e:
        print(f"****** Writing file failure: {output_dir}{file_name}")
        print(e)
        print("******")
        return False
    return True

def get_match_list(url, output_dir, file_name, xpath = ".//structure:Code",
    name_space = {"structure": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure"}):
    """Download an XML document and flatten the selected elements to CSV.

    The response is saved as ``<output_dir><file_name>.xml``; the elements
    matched by ``xpath`` are parsed with ``pandas.read_xml`` and written to
    ``<output_dir><file_name>.csv`` (the DataFrame index is included, as
    ``to_csv`` does by default).

    Args:
        url: Address of the XML document.
        output_dir: Destination folder (must end with a path separator);
            created if it does not exist.
        file_name: Base name, without extension, for the .xml and .csv files.
        xpath: XPath selecting the elements to tabulate.
        name_space: Prefix-to-URI mapping used to resolve ``xpath``
            (read only, never modified here).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, allow_redirects=True)
    # Stop here on an HTTP error instead of parsing an error page as XML.
    response.raise_for_status()

    os.makedirs(output_dir, exist_ok=True)

    xml_path = f"{output_dir}{file_name}.xml"
    # Store the raw bytes so the file keeps the server's encoding rather than
    # being re-encoded with the platform default.
    with open(xml_path, "wb") as f:
        f.write(response.content)

    print(xml_path)

    df = pd.read_xml(xml_path,
        xpath=xpath,
        namespaces=name_space
    )
    # Progress marker: the XML was parsed successfully.
    print(True)

    df.to_csv(f"{output_dir}{file_name}.csv")




SA2 match table

In [53]:
### Pull SA2 match table
file_name = "SA2_TO_Name.xlsx"
url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/allocation-files/SA2_2021_AUST.xlsx"
pull_direct(url, OUTPUT_DIR, file_name)

# Keep the Victorian rows only and reduce them to (code, name).
vic_df = (
    pd.read_excel(f"{OUTPUT_DIR}{file_name}")
    .loc[
        lambda sheet: sheet["STATE_NAME_2021"] == "Victoria",
        ["SA2_CODE_2021", "SA2_NAME_2021"],
    ]
    .rename(columns={"SA2_CODE_2021": "code", "SA2_NAME_2021": "name"})
)

# NOTE(review): to_csv also writes the DataFrame index, so the file starts
# with an extra unnamed column in front of code/name.
vic_df.to_csv(f"{OUTPUT_DIR}SA2_TO_Name.csv")

Request succeed: pulling fromhttps://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/allocation-files/SA2_2021_AUST.xlsx
File saved in: ../data/raw/ABS/SA2_TO_Name.xlsx


SA2 shape file

In [4]:
# Fetch the SA2 digital boundary archive, then unpack it into digitalBoundary/.
file_name = "SA2_2021_AUST_SHP_GDA2020.zip"
url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip"
pull_direct(url, OUTPUT_DIR, file_name)

boundary_zip = f"{OUTPUT_DIR}{file_name}"
boundary_dir = f"{OUTPUT_DIR}digitalBoundary/"
with zipfile.ZipFile(boundary_zip, "r") as archive:
    archive.extractall(boundary_dir)

Request succeed: pulling fromhttps://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip
File saved in: ../data/raw/ABS/SA2_2021_AUST_SHP_GDA2020.zip


In [5]:
# Unpack the boundary archive into digitalBoundary/ again; this repeats the
# extraction done right after the download and relies on file_name set there.
boundary_zip = f"{OUTPUT_DIR}{file_name}"
with zipfile.ZipFile(boundary_zip, "r") as archive:
    archive.extractall(f"{OUTPUT_DIR}digitalBoundary/")

ABS Data

List of all Victoria SA2 codes

Estimated Resident Population (ERP) (2010 to 2021)

In [23]:
# Annual ERP observations requested as CSV (startPeriod=2010, endPeriod=2021).
url = "https://api.data.abs.gov.au/data/ABS,ABS_ANNUAL_ERP_ASGS2021,1.2.0/.SA2..A?startPeriod=2010&endPeriod=2021&dimensionAtObservation=AllDimensions"

response = requests.get(url, headers=headers)
# Fail here on an HTTP error instead of saving the error body as ERP_raw.csv.
response.raise_for_status()
write_file(f"{OUTPUT_DIR}ERP/", "ERP_raw.csv", response.text)

# Data structure definition of the same dataflow, flattened to ERP_match.csv.
match_url = "https://api.data.abs.gov.au/datastructure/ABS/ABS_ANNUAL_ERP_ASGS2021/1.2.0?references=all"
match_output_dir = f"{OUTPUT_DIR}ERP/"
match_file_name = "ERP_match"
get_match_list(match_url, match_output_dir, match_file_name)

../data/raw/ABS/ERP/ERP_match.xml
True


In [54]:
# feature selection / rename / reset datatype
# One select does the column pick, the rename and the int cast together.
erp_sdf = spark.read.csv(f"{OUTPUT_DIR}ERP/ERP_raw.csv", header=True).select(
    F.col("ASGS_2021").cast("int").alias("SA2"),
    F.col("TIME_PERIOD").cast("int").alias("year"),
    F.col("OBS_VALUE").cast("int").alias("population"),
)

# Filter: victoria SA2
# Keep only rows whose SA2 code is listed in the Victorian match table.
vic_df = pd.read_csv(f"{OUTPUT_DIR}SA2_TO_Name.csv")
vic_sa2 = vic_df["code"].tolist()
erp_sdf = erp_sdf.where(F.col("SA2").isin(vic_sa2))

erp_sdf.show(2, vertical=True, truncate=100)
erp_sdf.write.csv(f"{OUTPUT_DIR}ERP/ERP.csv", mode="overwrite")

-RECORD 0---------------
 SA2        | 201011481 
 year       | 2010      
 population | 8664      
-RECORD 1---------------
 SA2        | 201011481 
 year       | 2011      
 population | 8814      
only showing top 2 rows



Household income (weekly) (exclude visitor/non-classifiable) (2021)

In [24]:
# Household income observations requested as CSV (startPeriod=2021).
url = "https://api.data.abs.gov.au/data/ABS,C21_G33_SA2,1.0.0/...SA2.?startPeriod=2021&dimensionAtObservation=AllDimensions"


response = requests.get(url, headers=headers)
# Fail here on an HTTP error instead of saving the error body as
# Household_income_raw.csv.
response.raise_for_status()
write_file(f"{OUTPUT_DIR}Household_income/", "Household_income_raw.csv", response.text)


# Data structure definition of the same dataflow, flattened to
# Household_income_match.csv.
match_url = "https://api.data.abs.gov.au/datastructure/ABS/C21_G33_SA2/1.0.0?references=all"
match_output_dir = f"{OUTPUT_DIR}Household_income/"
match_file_name = "Household_income_match"
get_match_list(match_url, match_output_dir, match_file_name)


../data/raw/ABS/Household_income/Household_income_match.xml
True


In [46]:
# feature selection / rename / reset datatype
# The column name spelling "popultaion" is kept as-is; the SQL below uses it.
income_sdf = (
    spark.read.csv(f"{OUTPUT_DIR}Household_income/Household_income_raw.csv", header=True)
    .select(
        F.col("REGION").cast("int").alias("SA2"),
        F.col("TIME_PERIOD").cast("int").alias("year"),
        F.col("HHCD").alias("household_type"),
        F.col("HIND").alias("income_level"),
        F.col("OBS_VALUE").cast("int").alias("popultaion"),
        F.col("STATE").cast("int").alias("state"),
    )
    # Filter: victoria (rows whose state code is 2), then drop the helper column.
    .where(F.col("state") == 2)
    .drop("state")
)

# Lookup table translating codes to their display names.
match_sdf = spark.read.csv(
    f"{OUTPUT_DIR}Household_income/Household_income_match.csv", header=True
).select(F.col("id").alias("code"), "Name")

# inner join
# Both coded columns are replaced by the matching Name, one join per column.
# NOTE(review): match holds every code found in the structure file; if an id
# occurs more than once there, these joins will multiply rows. Confirm.
match_sdf.createOrReplaceTempView("match")
income_sdf.createOrReplaceTempView("income_sdf")
matched_sdf = spark.sql("""
    SELECT SA2, year, Name AS household_type, income_level, popultaion
    FROM(
        SELECT SA2, year, household_type, Name AS income_level, popultaion
        FROM income_sdf
        INNER JOIN match ON income_sdf.income_level = match.code) AS half
    INNER JOIN match ON half.household_type = match.code
""")

matched_sdf.show(1, vertical=True, truncate=100)
matched_sdf.write.csv(f"{OUTPUT_DIR}Household_income/Household_income.csv", mode="overwrite")

-RECORD 0---------------------------
 SA2            | 201011002         
 year           | 2021              
 household_type | Family households 
 income_level   | Total             
 popultaion     | 3032              
only showing top 1 row



Population projection (2017 - 2066)

In [4]:
# Population projection observations requested as CSV (startPeriod=2017).
url = "https://api.data.abs.gov.au/data/ABS,POP_PROJ_REGION_2012_2061,1.0.0/2.3.TT.1.1.1.1.A?startPeriod=2017&dimensionAtObservation=AllDimensions"

response = requests.get(url, headers=headers)
# Fail here on an HTTP error instead of saving the error body as
# Population_raw.csv.
response.raise_for_status()
write_file(f"{OUTPUT_DIR}Population/", "Population_raw.csv", response.text)

# Data structure definition of the same dataflow, flattened to
# Population_match.csv.
match_url = "https://api.data.abs.gov.au/datastructure/ABS/POP_PROJ_REGION_2012_2061/1.0.0?references=all"
match_output_dir = f"{OUTPUT_DIR}Population/"
match_file_name = "Population_match"
get_match_list(match_url, match_output_dir, match_file_name)

../data/raw/ABS/Population/Population_match.xml
True


In [45]:

# feature selection / rename / reset datatype
# One select does the column pick, the rename and the int cast together.
# The column name spelling "popultaion" is kept as-is.
population_sdf = spark.read.csv(
    f"{OUTPUT_DIR}Population/Population_raw.csv", header=True
).select(
    F.col("TIME_PERIOD").cast("int").alias("year"),
    F.col("OBS_VALUE").cast("int").alias("popultaion"),
)

population_sdf.show(1, vertical=True, truncate=100)
population_sdf.write.csv(f"{OUTPUT_DIR}Population/Population.csv", mode="overwrite")

-RECORD 0-------------
 year       | 2017    
 popultaion | 6321648 
only showing top 1 row



School location

In [35]:
# School location workbook; saved byte-for-byte, hence the binary write mode.
url = "https://www.acara.edu.au/docs/default-source/default-document-library/school-location-2021e23a2f404c94637ead88ff00003e0139.xlsx?sfvrsn=51ae4c07_0"


response = requests.get(url)
# Fail here on an HTTP error instead of saving an error page as an .xlsx file.
response.raise_for_status()

write_file(f"{OUTPUT_DIR}School_location/", "School_location_raw.xlsx", response.content, "wb")

True

In [44]:
# feature selection / VIC filtering
# Keep Victorian schools only; "State" is used for the filter and not carried
# into the result.
school_df = (
    pd.read_excel(
        f"{OUTPUT_DIR}School_location/School_location_raw.xlsx",
        sheet_name="SchoolLocations 2021",
    )
    .loc[
        lambda sheet: sheet["State"] == "VIC",
        ["School Name", "Statistical Area 2", "Latitude", "Longitude", "School Type"],
    ]
    .rename(columns={"Statistical Area 2": "SA2"})
)

print(school_df.head())
print(school_df.shape)

# NOTE(review): to_csv also writes the DataFrame index, so the file starts
# with an extra unnamed column in front of the five listed above.
school_df.to_csv(f"{OUTPUT_DIR}School_location/School_location.csv")

                             School Name        SA2  Latitude  Longitude  \
241   Kurnai College - University Campus  205041094  -38.3094   146.4249   
295         Learning Co-Operative School  209031212  -37.6294   145.2133   
297                        Andale School  207011153  -37.8055   145.0360   
298                 The Currajong School  208041195  -37.8760   145.0599   
299  Mansfield Autism Statewide Services  204011057  -37.0608   146.0859   

    School Type  
241   Secondary  
295     Primary  
297    Combined  
298    Combined  
299    Combined  
(2729, 5)


Geopandas

In [None]:
# Load the SA2 boundaries, keep Victoria and reproject to lon/lat (WGS84).
# The whole frame is reprojected with GeoDataFrame.to_crs instead of assigning
# a reprojected GeoSeries into a filtered slice, which avoids writing into a
# possible copy and keeps the frame's CRS metadata in step with its geometry.
shape_gdf = gpd.read_file(f"{OUTPUT_DIR}digitalBoundary/SA2_2021_AUST_GDA2020.shp")
shape_gdf = shape_gdf.loc[shape_gdf["STE_NAME21"] == "Victoria"].to_crs(
    "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)

# One feature per SA2 code, serialised as GeoJSON for folium.
geoJSON = shape_gdf[["SA2_CODE21", "geometry"]].drop_duplicates('SA2_CODE21').to_json()

# NOTE(review): "Stamen Terrain" is looked up by folium as a named tileset;
# confirm the installed folium version still provides it.
base_map = folium.Map(location=[-37.79, 144.96], tiles="Stamen Terrain", zoom_start=7.3)

base_map.add_child(folium.Choropleth(geo_data=geoJSON, name="choropleth"))
base_map


In [8]:
# Preview the first rows of shape_gdf (rich display as the cell's last expression).
shape_gdf.head()

Unnamed: 0,SA2_CODE21,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,GCC_CODE21,GCC_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,geometry
644,201011001,Alfredton,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,52.7109,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5..."
645,201011002,Ballarat,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,12.3787,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.81896 -37.55582, 143.81644 -37.5..."
646,201011005,Buninyong,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,51.5855,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.84171 -37.61596, 143.84176 -37.6..."
647,201011006,Delacombe,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,34.1607,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.75050 -37.59119, 143.75044 -37.5..."
648,201011007,Smythes Creek,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,104.7274,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.73296 -37.62333, 143.73263 -37.6..."
