final outputs:

Digital boundary (AUS range, manually select VIC if needed)
- ../data/raw/ABS/digitalBoundary/SA2_2021_AUST_GDA2020.shp

SA2 code to district names
- ../data/raw/ABS/SA2_TO_Name.csv
- code(int), name(String)


Estimated Resident Population (ERP) (2010 to 2021) (By SA2)
- ../data/raw/ABS/ERP/ERP.csv
- SA2 (int), year (int), population (int)
- SA2: integer code marking each district, e.g. 206041117 for Carlton

Household income (weekly) (exclude visitor/non-classifiable) (2021) (By SA2)
- ../data/raw/ABS/Household_income/Household_income.csv
- SA2 (int), year (int),  household_type (String), income_level (String), popultaion (int)

Population projection (2017 - 2066) (VIC overall)
- ../data/raw/ABS/Population/Population.csv
- year (int), popultaion (int)

School location
- ../data/raw/ABS/School_location/School_location.csv
- School Name (String), SA2 (int), Latitude (float), Longitude (float), School Type (String)



In [2]:
from urllib.request import urlretrieve
import sys
import pandas as pd
import geopandas as gpd
import folium
import requests
import math
import zipfile
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


# Build the Spark session used by every cell below. The options are applied
# one at a time so the list stays easy to scan and extend.
_builder = SparkSession.builder.appName("Assignment_2")
for _option, _setting in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
):
    _builder = _builder.config(_option, _setting)
spark = _builder.getOrCreate()

# Show pandas floats with thousands separators and four decimals.
pd.set_option("display.float_format", "{:,.4f}".format)

# Root folder for the downloaded files. Paths below are built by plain string
# concatenation, so the trailing slash matters.
OUTPUT_DIR = "../data/raw/ABS/"

# Sent with the data API requests so the response body is CSV.
headers = {"accept": "text/csv"}

22/09/05 20:32:55 WARN Utils: Your hostname, Bruce-PC resolves to a loopback address: 127.0.1.1; using 172.21.194.51 instead (on interface eth0)
22/09/05 20:32:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/05 20:32:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
def pull_direct(url, output_dir, file_name):
    """
    Download {url} with urlretrieve and save it to {output_dir}{file_name}.

    url: the String url which needs to be pulled from
    output_dir: the String output folder directory (it is concatenated with
        file_name as-is, so it must end with a path separator),
        automatically created if it does not exist
    file_name: the String file name used to save the download
    return: True when the download succeeded, False when it failed
        (the failure is printed, not raised)
    """
    # exist_ok avoids the check-then-create race; an empty output_dir means
    # "current directory", which os.makedirs cannot create.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    target = f"{output_dir}{file_name}"
    try:
        urlretrieve(url, target)
    except Exception as e:
        # Best effort by design: report the problem and let the caller decide.
        print("********\nRequest failure: ")
        print(e)
        print("********")
        return False

    print(f"Request succeed: pulling from {url}\nFile saved in: {target}")
    return True


def write_file(output_dir, file_name, content, mod="w"):
    """
    Write given content to local file at: {output_dir}{file_name} with mode: {mod}

    output_dir: the String output folder directory (concatenated with
        file_name as-is), automatically created if it does not exist
    file_name: the String file name used to save file
    content: str for text modes, bytes for binary modes
    mod: String of writing mode code passed to open()
    return: True when the content was written, False when writing failed
        (the failure is printed, not raised)
    """
    # exist_ok avoids the check-then-create race; an empty output_dir means
    # "current directory", which os.makedirs cannot create.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Text is always written as UTF-8 so the result does not depend on the
    # platform's default encoding; binary modes must not receive an encoding.
    encoding = None if "b" in mod else "utf-8"
    try:
        with open(f"{output_dir}{file_name}", mod, encoding=encoding) as f:
            f.write(content)
    except Exception as e:
        print(f"****** Writing file failure: {output_dir}{file_name}")
        print(e)
        print("******")
        return False
    return True


def get_match_list(
    url,
    output_dir,
    file_name,
    xpath=".//structure:Code",
    name_space={
        "structure": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure"
    },
):
    """
    Download a match list from {url}, keep the raw XML at
    {output_dir}{file_name}.xml and save the rows selected by {xpath} to
    {output_dir}{file_name}.csv

    url: the String url which needs to be pulled from
    output_dir: the String output folder directory (concatenated with
        file_name as-is), automatically created if it does not exist
    file_name: the String file name (without extension) used to save files
    xpath: A String XPath selecting the elements that become rows
    name_space: A dict mapping the prefixes used in xpath to namespace URIs
        (read only, never modified here)
    raises: requests.HTTPError when the server answers with an error status
    """
    response = requests.get(url, allow_redirects=True)
    # Stop here on an HTTP error instead of saving the error page and letting
    # the XML parser fail on it with a less helpful message.
    response.raise_for_status()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    xml_path = f"{output_dir}{file_name}.xml"
    # Save the bytes exactly as received: writing response.text in text mode
    # would re-encode the document with the platform default encoding.
    with open(xml_path, "wb") as f:
        f.write(response.content)

    print(f"data saved: {xml_path}")

    # select data from xml
    df = pd.read_xml(xml_path, xpath=xpath, namespaces=name_space)
    df.to_csv(f"{output_dir}{file_name}.csv")


SA2 to district name match table

In [4]:
### Pull SA2 match table
url = (
    "https://www.abs.gov.au/statistics/standards/australian-statistical"
    "-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads"
    "/allocation-files/SA2_2021_AUST.xlsx"
)
file_name = "SA2_TO_Name.xlsx"
pull_direct(url, OUTPUT_DIR, file_name)

# Select Victoria rows and keep only the SA2 code and name columns
vic_df = pd.read_excel(f"{OUTPUT_DIR}{file_name}")
vic_df = vic_df.loc[vic_df["STATE_NAME_2021"] == "Victoria"]
vic_df = vic_df[["SA2_CODE_2021", "SA2_NAME_2021"]].rename(
    columns={"SA2_CODE_2021": "code", "SA2_NAME_2021": "name"}
)
# index=False keeps the file to the two columns (code, name); otherwise the
# pandas row index is written as an extra unnamed first column.
vic_df.to_csv(f"{OUTPUT_DIR}SA2_TO_Name.csv", index=False)


Request succeed: pulling fromhttps://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/allocation-files/SA2_2021_AUST.xlsx
File saved in: ../data/raw/ABS/SA2_TO_Name.xlsx


SA2 shape file

In [9]:
### Download the SA2 digital boundary archive and unpack it
file_name = "SA2_2021_AUST_SHP_GDA2020.zip"
url = (
    "https://www.abs.gov.au/statistics/standards/australian-statistical"
    "-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads"
    "/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip"
)
pull_direct(url, OUTPUT_DIR, file_name)

# Extract every member of the archive into the digitalBoundary folder
with zipfile.ZipFile(OUTPUT_DIR + file_name, mode="r") as zip_ref:
    zip_ref.extractall(path=OUTPUT_DIR + "digitalBoundary/")


Request succeed: pulling fromhttps://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip
File saved in: ../data/raw/ABS/SA2_2021_AUST_SHP_GDA2020.zip


ABS Data

Estimated Resident Population (ERP) (2010 to 2021)

In [5]:
### Pull ERP data (annual, SA2 level, 2010 to 2021)
url = (
    "https://api.data.abs.gov.au/data/ABS,ABS_ANNUAL_ERP_ASGS2021,1.2.0/."
    "SA2..A?startPeriod=2010&endPeriod=2021&dimensionAtObservation=AllDimensions"
)
response = requests.get(url, headers=headers)
# Stop on an HTTP error instead of saving the error body as if it were CSV
response.raise_for_status()
write_file(f"{OUTPUT_DIR}ERP/", "ERP_raw.csv", response.text)

### pull ERP match table
match_url = (
    "https://api.data.abs.gov.au/datastructure/ABS/ABS_ANNUAL_ERP"
    "_ASGS2021/1.2.0?references=all"
)
match_output_dir = f"{OUTPUT_DIR}ERP/"
match_file_name = "ERP_match"
get_match_list(match_url, match_output_dir, match_file_name)

data saved: ../data/raw/ABS/ERP/ERP_match.xml


In [11]:
# Load the raw ERP download (header row only, no schema inference requested)
erp_sdf = spark.read.csv(f"{OUTPUT_DIR}ERP/ERP_raw.csv", header=True)
print("Before selection:")
erp_sdf.show(2, vertical=True, truncate=100)

# Keep the three columns of interest, renaming and casting each to int in
# a single projection
erp_sdf = erp_sdf.select(
    F.col("ASGS_2021").cast("int").alias("SA2"),
    F.col("TIME_PERIOD").cast("int").alias("year"),
    F.col("OBS_VALUE").cast("int").alias("population"),
)

# Restrict to the SA2 codes listed in the Victorian match table
vic_df = pd.read_csv(f"{OUTPUT_DIR}SA2_TO_Name.csv")
vic_sa2 = vic_df["code"].tolist()
erp_sdf = erp_sdf.filter(F.col("SA2").isin(vic_sa2))

# Preview, then overwrite the curated output
print("After selection:")
erp_sdf.show(2, vertical=True, truncate=100)
(
    erp_sdf.write.option("header", True)
    .mode("overwrite")
    .csv(f"{OUTPUT_DIR}ERP/ERP.csv")
)


Before selection:
-RECORD 0------------------------------------------
 DATAFLOW     | ABS:ABS_ANNUAL_ERP_ASGS2021(1.2.0) 
 MEASURE      | ERP                                
 REGION_TYPE  | SA2                                
 ASGS_2021    | 101021010                          
 FREQ         | A                                  
 TIME_PERIOD  | 2010                               
 OBS_VALUE    | 4813                               
 UNIT_MEASURE | PSNS                               
 OBS_STATUS   | null                               
 OBS_COMMENT  | null                               
-RECORD 1------------------------------------------
 DATAFLOW     | ABS:ABS_ANNUAL_ERP_ASGS2021(1.2.0) 
 MEASURE      | ERP                                
 REGION_TYPE  | SA2                                
 ASGS_2021    | 101021010                          
 FREQ         | A                                  
 TIME_PERIOD  | 2011                               
 OBS_VALUE    | 4951                          

Household income (weekly) (exclude visitor/non-classifiable) (2021)

In [6]:
### pull household income data
url = (
    "https://api.data.abs.gov.au/data/ABS,C21_G33_SA2,1.0.0/...SA2.?"
    "startPeriod=2021&dimensionAtObservation=AllDimensions"
)
response = requests.get(url, headers=headers)
# Stop on an HTTP error instead of saving the error body as if it were CSV
response.raise_for_status()
write_file(
    f"{OUTPUT_DIR}Household_income/", "Household_income_raw.csv", response.text
)

### pull match data
match_url = (
    "https://api.data.abs.gov.au/datastructure/ABS/C21_G33_SA2/1."
    "0.0?references=all"
)
match_output_dir = f"{OUTPUT_DIR}Household_income/"
match_file_name = "Household_income_match"
get_match_list(match_url, match_output_dir, match_file_name)


data saved: ../data/raw/ABS/Household_income/Household_income_match.xml


In [12]:
# Load the raw household income download
income_sdf = spark.read.csv(
    f"{OUTPUT_DIR}Household_income/Household_income_raw.csv", header=True
)
print("Before selection:")
income_sdf.show(2, vertical=True, truncate=100)

# Project the needed columns, renaming and casting in one pass
income_sdf = income_sdf.select(
    F.col("REGION").cast("int").alias("SA2"),
    F.col("HIND").alias("income_level"),
    F.col("HHCD").alias("household_type"),
    F.col("OBS_VALUE").cast("int").alias("popultaion"),
    F.col("STATE").cast("int").alias("state"),
    F.col("TIME_PERIOD").cast("int").alias("year"),
)

# Code -> name lookup produced by get_match_list
match_sdf = (
    spark.read.csv(
        f"{OUTPUT_DIR}Household_income/Household_income_match.csv", header=True
    )
    .select("id", "Name")
    .withColumnRenamed("id", "code")
)

# Filter: victoria (state code 2), then drop the helper state column
income_sdf = income_sdf.filter(F.col("state") == 2).select(
    "SA2", "year", "household_type", "income_level", "popultaion"
)

# Replace the income_level and household_type codes with their names by
# joining the lookup twice.
# NOTE(review): the lookup is built from every structure:Code element in the
# XML; if an id occurs in more than one code list these inner joins would
# duplicate rows - confirm the ids used by HIND/HHCD are unique.
income_sdf.createOrReplaceTempView("income_sdf")
match_sdf.createOrReplaceTempView("match")
matched_sdf = spark.sql(
    """
    SELECT SA2, year, Name AS household_type, income_level, popultaion
    FROM(
        SELECT SA2, year, household_type, Name AS income_level, popultaion
        FROM income_sdf
        INNER JOIN match ON income_sdf.income_level = match.code) AS half
    INNER JOIN match ON half.household_type = match.code
"""
)

# Preview, then overwrite the curated output
print("After selection:")
matched_sdf.show(2, vertical=True, truncate=100)
(
    matched_sdf.write.option("header", True)
    .mode("overwrite")
    .csv(f"{OUTPUT_DIR}Household_income/Household_income.csv")
)


Before selection:
-RECORD 0-----------------------------
 DATAFLOW    | ABS:C21_G33_SA2(1.0.0) 
 HIND        | 7                      
 HHCD        | 1_2                    
 REGION      | 101021008              
 REGION_TYPE | SA2                    
 STATE       | 1                      
 TIME_PERIOD | 2021                   
 OBS_VALUE   | 94                     
-RECORD 1-----------------------------
 DATAFLOW    | ABS:C21_G33_SA2(1.0.0) 
 HIND        | 7                      
 HHCD        | _T                     
 REGION      | 101021611              
 REGION_TYPE | SA2                    
 STATE       | 1                      
 TIME_PERIOD | 2021                   
 OBS_VALUE   | 124                    
only showing top 2 rows

After selection:
-RECORD 0---------------------------
 SA2            | 201011002         
 year           | 2021              
 household_type | Family households 
 income_level   | Total             
 popultaion     | 3032              
-RECORD 1-------

Population projection (2017 - 2066)

In [9]:
### pull population projection data
url = (
    "https://api.data.abs.gov.au/data/ABS,POP_PROJ_REGION_2012_2061,"
    "1.0.0/2.3.TT.1.1.1.1.A?startPeriod=2017&dimensionAtObservation=AllDimensions"
)
response = requests.get(url, headers=headers)
# Stop on an HTTP error instead of saving the error body as if it were CSV
response.raise_for_status()
write_file(f"{OUTPUT_DIR}Population/", "Population_raw.csv", response.text)

### pull match table
match_url = (
    "https://api.data.abs.gov.au/datastructure/ABS/POP_PROJ_REGION_"
    "2012_2061/1.0.0?references=all"
)
match_output_dir = f"{OUTPUT_DIR}Population/"
match_file_name = "Population_match"
get_match_list(match_url, match_output_dir, match_file_name)


data saved: ../data/raw/ABS/Population/Population_match.xml


In [13]:
# Load the raw population projection download
population_sdf = spark.read.csv(
    f"{OUTPUT_DIR}Population/Population_raw.csv", header=True
)
print("Before selection:")
population_sdf.show(2, vertical=True, truncate=100)

# Keep year and value only, renamed and cast to int in a single projection
population_sdf = population_sdf.select(
    F.col("TIME_PERIOD").cast("int").alias("year"),
    F.col("OBS_VALUE").cast("int").alias("popultaion"),
)

# Preview, then overwrite the curated output
print("After selection:")
population_sdf.show(2, vertical=True, truncate=100)
(
    population_sdf.write.option("header", True)
    .mode("overwrite")
    .csv(f"{OUTPUT_DIR}Population/Population.csv")
)


Before selection:
-RECORD 0--------------------------------------------
 DATAFLOW     | ABS:POP_PROJ_REGION_2012_2061(1.0.0) 
 REGION       | 2                                    
 SEX_ABS      | 3                                    
 AGE          | TT                                   
 FERTILITY    | 1                                    
 MORTALITY    | 1                                    
 NOM          | 1                                    
 NIM          | 1                                    
 FREQUENCY    | A                                    
 TIME_PERIOD  | 2017                                 
 OBS_VALUE    | 6321648                              
 UNIT_MEASURE | PSNS                                 
 OBS_STATUS   | null                                 
 OBS_COMMENT  | null                                 
-RECORD 1--------------------------------------------
 DATAFLOW     | ABS:POP_PROJ_REGION_2012_2061(1.0.0) 
 REGION       | 2                                    
 SEX_ABS  

School location

In [17]:
# pull school location file
url = (
    "https://www.acara.edu.au/docs/default-source/default-document-"
    "library/school-location-2021e23a2f404c94637ead88ff00003e0139.xlsx"
    "?sfvrsn=51ae4c07_0"
)
response = requests.get(url)
# Stop on an HTTP error instead of saving an error page as an .xlsx file
response.raise_for_status()
# The workbook is binary, so write the raw bytes
write_file(
    f"{OUTPUT_DIR}School_location/",
    "School_location_raw.xlsx",
    response.content,
    "wb",
)

True

In [18]:
# feature selection / VIC filtering
school_df = pd.read_excel(
    f"{OUTPUT_DIR}School_location/School_location_raw.xlsx",
    sheet_name="SchoolLocations 2021",
)[["School Name", "Statistical Area 2", "State", "Latitude",
    "Longitude", "School Type"]]
school_df = school_df.loc[school_df["State"] == "VIC"]
school_df = school_df.rename(columns={"Statistical Area 2": "SA2"})
# State is constant after the filter, so it is dropped from the output
school_df = school_df.drop(columns="State")

print(school_df.head())
print(school_df.shape)

# save filtered data; index=False keeps the file to the five selected columns,
# otherwise the leftover row index is written as an extra unnamed column
school_df.to_csv(f"{OUTPUT_DIR}School_location/School_location.csv", index=False)


                             School Name        SA2  Latitude  Longitude  \
241   Kurnai College - University Campus  205041094  -38.3094   146.4249   
295         Learning Co-Operative School  209031212  -37.6294   145.2133   
297                        Andale School  207011153  -37.8055   145.0360   
298                 The Currajong School  208041195  -37.8760   145.0599   
299  Mansfield Autism Statewide Services  204011057  -37.0608   146.0859   

    School Type  
241   Secondary  
295     Primary  
297    Combined  
298    Combined  
299    Combined  
(2729, 5)


Geopandas

In [19]:
plot_dir = "../plots/geo/draft/"

# Read the national SA2 shape file extracted earlier and keep Victoria only
shape_gdf = gpd.read_file(
    f"{OUTPUT_DIR}digitalBoundary/SA2_2021_AUST_GDA2020.shp"
)
shape_gdf = shape_gdf.loc[shape_gdf["STE_NAME21"] == "Victoria"]
# Reproject the whole frame to WGS84 longitude/latitude. Reprojecting the
# frame (rather than assigning a reprojected column onto a filtered slice)
# keeps the frame's CRS metadata in sync and avoids a copy warning; the
# EPSG code replaces a PROJ string that had lost its "+" prefixes.
shape_gdf = shape_gdf.to_crs("EPSG:4326")

# plot Choropleth map: one GeoJSON feature per SA2 code
geoJSON = (
    shape_gdf[["SA2_CODE21", "geometry"]]
    .drop_duplicates("SA2_CODE21")
    .to_json()
)
# NOTE(review): newer folium releases may no longer ship the "Stamen Terrain"
# tile set - confirm against the folium version pinned for this project.
base_map = folium.Map(
    location=[-37.79, 144.96], tiles="Stamen Terrain", zoom_start=7.3
)
base_map.add_child(folium.Choropleth(geo_data=geoJSON, name="choropleth"))

# save plot
os.makedirs(plot_dir, exist_ok=True)
base_map.save(f"{plot_dir}base_map.html")


In [None]:
# uncomment the following line and run to view the map
# base_map