## TO USE: run all

final outputs:
### ABS
#### SA2 code to district names
- ../data/raw/ABS/SA2_TO_Name.csv
- code(int), name(String)

#### Digital boundary (AUS range, manually select VIC if needed)
- ../data/raw/ABS/digitalBoundary/SA2_2021_AUST_GDA2020.shp

#### Estimated Resident Population (ERP) (2010 to 2021) (By SA2)
- ../data/raw/ABS/ERP/ERP.csv
- SA2 (int), year (int), population (int)
- SA2: integer code identifying each district, e.g. 206041117 for Carlton

#### Median household income (weekly) (2021) (By SA2)
- ../data/raw/ABS/Household_income/Household_income.csv
- SA2 (int), median_income (int)

#### Population projection (2017 - 2066) (VIC overall)
- ../data/raw/ABS/Population/Population.csv
- ../data/curated/population_projection.csv
- year (int), popultaion (int)

#### Residential Property Price Index (Q4 of each year, 2011 - 2021) (Greater Melbourne, REGION code 2GMEL)
- ../data/raw/ABS/Price_index/Price_index.csv
- year (int), price_index (int)

### ACARA
#### School location
- ../data/raw/ACARA/School_location.csv
- School Name (String), SA2 (int), Latitude (float), Longitude (float), School Type (String)

### RBA
#### Interest rate (2013 - 2021) (AUS)
- ../data/raw/rba/interest_rate/interest_rate.csv
- year (int), bond (float, risk-free interest rate)

In [1]:
from urllib.request import urlretrieve
import sys
import pandas as pd
import geopandas as gpd
import folium
import requests
import math
import zipfile
import os
import re
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


# Root folder for the ABS downloads. The trailing slash matters: the cells
# below build paths by plain string concatenation.
OUTPUT_DIR = "../data/raw/ABS/"

# Sent with the HTTP requests below to ask the servers for CSV responses.
headers = {"accept": "text/csv"}

# Render pandas floats with thousands separators and four decimals.
pd.set_option("display.float_format", "{:,.4f}".format)

# Build (or reuse) the Spark session shared by the processing cells.
spark_builder = SparkSession.builder.appName("Assignment_2")
for conf_key, conf_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
):
    spark_builder = spark_builder.config(conf_key, conf_value)
spark = spark_builder.getOrCreate()

22/09/15 10:22:24 WARN Utils: Your hostname, Bruce-PC resolves to a loopback address: 127.0.1.1; using 172.31.117.118 instead (on interface eth0)
22/09/15 10:22:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/15 10:22:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/15 10:22:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/09/15 10:22:26 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
def pull_direct(url, output_dir, file_name):
    """
    use urlretrieve function to directly pull data from given url and save it
        as {file_name} inside {output_dir}
    url: the String url which needs to be pulled from
    output_dir: the String output folder directory, automatically create if not exist
    file_name: the String file name used to save the pulled file
    return: True if the download succeeded, False if it failed (the failure
        is printed, not raised)
    """
    # exist_ok removes the check-then-create race; an empty directory string is
    # skipped because os.makedirs("") raises.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # os.path.join gives the same path as plain concatenation when output_dir
    # ends with a separator, and still lands inside the folder when it does not.
    target = os.path.join(output_dir, file_name)

    try:
        urlretrieve(url, target)
    except Exception as e:
        print("********\nRequest failure: ")
        print(e)
        print("********")
        return False

    print(f"Request succeed: pulling from {url}\nFile saved in: {target}")
    return True


def write_file(output_dir, file_name, content, mod="w"):
    """
    write given content to local file {file_name} inside {output_dir} with mode: {mod}
    output_dir: the String output folder directory, automatically create if not exist
    file_name: the String file name used to save file
    content: expecting objects which can be written to file with open function
    mod: String of writing mode code used in writing file
    return: True if the content was written, False if writing failed (the
        failure is printed, not raised)
    """
    # exist_ok removes the check-then-create race; an empty directory string is
    # skipped because os.makedirs("") raises.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Same path as plain concatenation when output_dir ends with a separator,
    # and still inside the folder when it does not.
    target = os.path.join(output_dir, file_name)

    # Text modes get an explicit UTF-8 encoding so the saved file does not
    # depend on the platform default; binary modes must not receive one.
    encoding = None if "b" in mod else "utf-8"

    try:
        with open(target, mod, encoding=encoding) as f:
            f.write(content)
    except Exception as e:
        print(f"****** Writing file failure: {target}")
        print(e)
        print("******")
        return False
    return True



SA2 code to district names
- ../data/raw/ABS/SA2_TO_Name.csv
- code(int), name(String)

In [4]:
### Pull SA2 match table
url = (
    "https://www.abs.gov.au/statistics/standards/australian-statistical"
    "-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads"
    "/allocation-files/SA2_2021_AUST.xlsx"
)
file_name = "SA2_TO_Name.xlsx"
pull_direct(url, OUTPUT_DIR, file_name)

# Keep only the Victorian rows and the two columns of the lookup table.
vic_df = (
    pd.read_excel(f"{OUTPUT_DIR}{file_name}")
    .loc[
        lambda df: df["STATE_NAME_2021"] == "Victoria",
        ["SA2_CODE_2021", "SA2_NAME_2021"],
    ]
    .rename(columns={"SA2_CODE_2021": "code", "SA2_NAME_2021": "name"})
)

# index=False keeps the file to the two lookup columns (code, name); without it
# pandas also writes its row index as an extra unnamed first column.
vic_df.to_csv(f"{OUTPUT_DIR}SA2_TO_Name.csv", index=False)

Request succeed: pulling fromhttps://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/allocation-files/SA2_2021_AUST.xlsx
File saved in: ../data/raw/ABS/SA2_TO_Name.xlsx


Digital boundary (AUS range, manually select VIC if needed)
- ../data/raw/ABS/digitalBoundary/SA2_2021_AUST_GDA2020.shp

In [5]:
### pull shape file
file_name = "SA2_2021_AUST_SHP_GDA2020.zip"
url = (
    "https://www.abs.gov.au/statistics/standards/australian-statistical"
    "-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads"
    "/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip"
)
pull_direct(url, OUTPUT_DIR, file_name)

# Unpack every member of the downloaded archive into the digitalBoundary folder.
with zipfile.ZipFile(f"{OUTPUT_DIR}{file_name}", mode="r") as archive:
    archive.extractall(path=f"{OUTPUT_DIR}digitalBoundary/")


Request succeed: pulling fromhttps://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip
File saved in: ../data/raw/ABS/SA2_2021_AUST_SHP_GDA2020.zip


ABS Data

Estimated Resident Population (ERP) (2010 to 2021) (By SA2)
- ../data/raw/ABS/ERP/ERP.csv
- SA2 (int), year (int), population (int)
- SA2: integer code identifying each district, e.g. 206041117 for Carlton

In [6]:
### Pull ERP data
url = (
    "https://api.data.abs.gov.au/data/ABS,ABS_ANNUAL_ERP_ASGS2021,1.2.0/."
    "SA2..A?startPeriod=2010&endPeriod=2021&dimensionAtObservation=AllDimensions"
)
response = requests.get(url, headers=headers)
# Stop here on an HTTP error instead of saving the error body as if it were CSV.
response.raise_for_status()
write_file(f"{OUTPUT_DIR}ERP/", "ERP_raw.csv", response.text)

True

In [7]:
# Load the raw ERP download; every column is read as a string.
erp_sdf = spark.read.csv(f"{OUTPUT_DIR}ERP/ERP_raw.csv", header=True)
print("Before selection:")
erp_sdf.show(2, vertical=True, truncate=100)

# Victorian SA2 codes, taken from the lookup table saved by the SA2 cell above.
vic_df = pd.read_csv(f"{OUTPUT_DIR}SA2_TO_Name.csv")
vic_sa2 = vic_df["code"].tolist()

# Keep the three columns of interest, rename them, cast each to int,
# then keep only the rows whose SA2 code is Victorian.
erp_sdf = erp_sdf.select(
    F.col("ASGS_2021").cast("int").alias("SA2"),
    F.col("TIME_PERIOD").cast("int").alias("year"),
    F.col("OBS_VALUE").cast("int").alias("population"),
).filter(F.col("SA2").isin(vic_sa2))

# Preview and save, replacing any previous output.
print("After selection:")
erp_sdf.show(2, vertical=True, truncate=100)
erp_sdf.write.csv(f"{OUTPUT_DIR}ERP/ERP.csv", mode="overwrite", header=True)


Before selection:
-RECORD 0------------------------------------------
 DATAFLOW     | ABS:ABS_ANNUAL_ERP_ASGS2021(1.2.0) 
 MEASURE      | ERP                                
 REGION_TYPE  | SA2                                
 ASGS_2021    | 101021010                          
 FREQ         | A                                  
 TIME_PERIOD  | 2010                               
 OBS_VALUE    | 4813                               
 UNIT_MEASURE | PSNS                               
 OBS_STATUS   | null                               
 OBS_COMMENT  | null                               
-RECORD 1------------------------------------------
 DATAFLOW     | ABS:ABS_ANNUAL_ERP_ASGS2021(1.2.0) 
 MEASURE      | ERP                                
 REGION_TYPE  | SA2                                
 ASGS_2021    | 101021010                          
 FREQ         | A                                  
 TIME_PERIOD  | 2011                               
 OBS_VALUE    | 4951                          

Median household income (weekly) (2021) (By SA2)
- ../data/raw/ABS/Household_income/Household_income.csv
- SA2 (int), median_income (int)

In [8]:
### pull median household income data
url = (
    "https://api.data.abs.gov.au/data/ABS,C21_G02_SA2,1.0.0/4..SA2.2"
    "?startPeriod=2021&dimensionAtObservation=AllDimensions"
)
response = requests.get(url, headers=headers)
# Stop here on an HTTP error instead of saving the error body as if it were CSV.
response.raise_for_status()
write_file(
    f"{OUTPUT_DIR}Household_income/", "Household_income_raw.csv", response.text
)

True

In [9]:
# Load the raw income download; every column is read as a string.
income_sdf = spark.read.csv(
    f"{OUTPUT_DIR}Household_income/Household_income_raw.csv", header=True
)
print("Before selection:")
income_sdf.show(2, vertical=True, truncate=100)

# Keep the region code and the observed value, rename them, and cast both to int.
income_sdf = income_sdf.select(
    F.col("REGION").cast("int").alias("SA2"),
    F.col("OBS_VALUE").cast("int").alias("median_income"),
)

# Preview and save, replacing any previous output.
# The label used to read "Before after:", which mislabelled the processed preview.
print("After selection:")
income_sdf.show(2, vertical=True, truncate=100)
income_sdf.write.option("header", True).mode("overwrite").csv(
    f"{OUTPUT_DIR}Household_income/Household_income.csv"
)

Before selection:
-RECORD 0-----------------------------
 DATAFLOW    | ABS:C21_G02_SA2(1.0.0) 
 MEDAVG      | 4                      
 REGION      | 213051589              
 REGION_TYPE | SA2                    
 STATE       | 2                      
 TIME_PERIOD | 2021                   
 OBS_VALUE   | 1862                   
-RECORD 1-----------------------------
 DATAFLOW    | ABS:C21_G02_SA2(1.0.0) 
 MEDAVG      | 4                      
 REGION      | 209041437              
 REGION_TYPE | SA2                    
 STATE       | 2                      
 TIME_PERIOD | 2021                   
 OBS_VALUE   | 1979                   
only showing top 2 rows

Before after:
-RECORD 0------------------
 SA2           | 213051589 
 median_income | 1862      
-RECORD 1------------------
 SA2           | 209041437 
 median_income | 1979      
only showing top 2 rows



Population projection (2017 - 2066) (VIC overall)
- ../data/raw/ABS/Population/Population.csv
- year (int), popultaion (int)

In [10]:
### pull population projection data
url = (
    "https://api.data.abs.gov.au/data/ABS,POP_PROJ_REGION_2012_2061,"
    "1.0.0/2.3.TT.1.1.1.1.A?startPeriod=2017&dimensionAtObservation=AllDimensions"
)
response = requests.get(url, headers=headers)
# Stop here on an HTTP error instead of saving the error body as if it were CSV.
response.raise_for_status()
write_file(f"{OUTPUT_DIR}Population/", "Population_raw.csv", response.text)


True

In [3]:
# Load the raw projection download; every column is read as a string.
population_sdf = spark.read.csv(
    f"{OUTPUT_DIR}Population/Population_raw.csv", header=True
)
print("Before selection:")
population_sdf.show(2, vertical=True, truncate=100)

# Keep period and value, rename them, and cast both to int.
# NOTE: the column name "popultaion" is written to the output files as spelled.
population_sdf = population_sdf.select(
    F.col("TIME_PERIOD").cast("int").alias("year"),
    F.col("OBS_VALUE").cast("int").alias("popultaion"),
)

# Preview, then write the same table to the raw and the curated locations.
print("After selection:")
population_sdf.show(2, vertical=True, truncate=100)
for target_path in (
    f"{OUTPUT_DIR}Population/Population.csv",
    "../data/curated/population_projection.csv",
):
    population_sdf.write.csv(target_path, mode="overwrite", header=True)

Before selection:
-RECORD 0--------------------------------------------
 DATAFLOW     | ABS:POP_PROJ_REGION_2012_2061(1.0.0) 
 REGION       | 2                                    
 SEX_ABS      | 3                                    
 AGE          | TT                                   
 FERTILITY    | 1                                    
 MORTALITY    | 1                                    
 NOM          | 1                                    
 NIM          | 1                                    
 FREQUENCY    | A                                    
 TIME_PERIOD  | 2017                                 
 OBS_VALUE    | 6321648                              
 UNIT_MEASURE | PSNS                                 
 OBS_STATUS   | null                                 
 OBS_COMMENT  | null                                 
-RECORD 1--------------------------------------------
 DATAFLOW     | ABS:POP_PROJ_REGION_2012_2061(1.0.0) 
 REGION       | 2                                    
 SEX_ABS  

Residential Property Price Index (Q4 of each year, 2011 - 2021) (Greater Melbourne, REGION code 2GMEL)
- ../data/raw/ABS/Price_index/Price_index.csv
- year (int), price_index (int)

In [12]:
output_dir = "../data/raw/ABS/"

### Pull Price_index data
url = (
    "https://api.data.abs.gov.au/data/ABS,RPPI,1.0.0/1.3.2GMEL.Q"
    "?startPeriod=2011-Q1&endPeriod=2021-Q4&dimensionAtObservation=AllDimensions"
)
response = requests.get(url, headers=headers)
# Stop here on an HTTP error instead of saving the error body as if it were CSV.
response.raise_for_status()
write_file(f"{output_dir}Price_index/", "Price_index_raw.csv", response.text)


True

In [13]:
# Load the raw quarterly price index; every column is read as a string.
price_sdf = spark.read.csv(
    f"{output_dir}Price_index/Price_index_raw.csv", header=True
)
print("Before selection:")
price_sdf.show(2, vertical=True, truncate=100)

# Keep one observation per year: the rows whose period contains "Q4".
# The year is the first four characters of the period (e.g. "2011-Q4" -> 2011).
# NOTE(review): the raw OBS_VALUE is decimal (104.7 in the preview above), so
# the int cast drops the fractional part - confirm this is intended.
price_sdf = price_sdf.filter(F.col("TIME_PERIOD").contains("Q4")).select(
    F.substring("TIME_PERIOD", 1, 4).cast("int").alias("year"),
    F.col("OBS_VALUE").cast("int").alias("price_index"),
)

# Preview and save, replacing any previous output.
print("After selection:")
price_sdf.show(2, vertical=True, truncate=100)
price_sdf.write.csv(
    f"{output_dir}Price_index/Price_index.csv", mode="overwrite", header=True
)


Before selection:
-RECORD 0------------------------
 DATAFLOW      | ABS:RPPI(1.0.0) 
 MEASURE       | 1               
 PROPERTY_TYPE | 3               
 REGION        | 2GMEL           
 FREQ          | Q               
 TIME_PERIOD   | 2011-Q1         
 OBS_VALUE     | 104.7           
 UNIT_MEASURE  | IN              
 OBS_STATUS    | null            
 OBS_COMMENT   | null            
-RECORD 1------------------------
 DATAFLOW      | ABS:RPPI(1.0.0) 
 MEASURE       | 1               
 PROPERTY_TYPE | 3               
 REGION        | 2GMEL           
 FREQ          | Q               
 TIME_PERIOD   | 2011-Q2         
 OBS_VALUE     | 103.5           
 UNIT_MEASURE  | IN              
 OBS_STATUS    | null            
 OBS_COMMENT   | null            
only showing top 2 rows

After selection:
-RECORD 0-----------
 year        | 2011 
 price_index | 100  
-RECORD 1-----------
 year        | 2012 
 price_index | 100  
only showing top 2 rows



## ACARA

#### School location
- ../data/raw/ACARA/School_location.csv
- School Name (String), SA2 (int), Latitude (float), Longitude (float), School Type (String)

In [5]:
output_dir = "../data/raw/ACARA/"

# pull school location file
url = (
    "https://www.acara.edu.au/docs/default-source/default-document-"
    "library/school-location-2021e23a2f404c94637ead88ff00003e0139.xlsx"
    "?sfvrsn=51ae4c07_0"
)
response = requests.get(url)
# Stop here on an HTTP error instead of saving the error body as a workbook.
response.raise_for_status()
# The workbook is binary, so write the raw bytes in "wb" mode.
write_file(output_dir, "School_location_raw.xlsx", response.content, "wb")

True

In [10]:
output_dir = "../data/raw/ACARA/"

# SA2 codes (as numbers) of every Victorian region in the ABS boundary file.
boundary_df = (
    pd.DataFrame(
        gpd.read_file("../data/raw/ABS/digitalBoundary/SA2_2021_AUST_GDA2020.shp")
    )
    .rename(columns={"SA2_CODE21": "SA2"})
    .loc[lambda df: df["STE_NAME21"] == "Victoria", ["SA2"]]
    .assign(SA2=lambda df: pd.to_numeric(df["SA2"]))
)

# Victorian, non-"Special" schools with the columns of interest and no missing values.
school_columns = [
    "School Name",
    "Statistical Area 2",
    "State",
    "Latitude",
    "Longitude",
    "School Type",
]
school_df = (
    pd.read_excel(
        f"{output_dir}School_location_raw.xlsx",
        sheet_name="SchoolLocations 2021",
    )[school_columns]
    .loc[lambda df: (df["School Type"] != "Special") & (df["State"] == "VIC")]
    .drop(columns="State")
    .rename(columns={"Statistical Area 2": "SA2"})
    .dropna()
)

# Right join on the boundary codes: schools whose SA2 is not in the boundary
# list are dropped, and boundary SA2s without any school appear as rows of NaN.
# school_id numbers the joined rows and becomes the index.
school_df = (
    pd.merge(school_df, boundary_df, on="SA2", how="right")
    .assign(school_id=lambda df: range(df.shape[0]))
    .set_index("school_id")
)

print(school_df.head())
print(school_df.shape)

# Remove the school-less rows left by the right join and any duplicate rows,
# then save. school_id values are kept as assigned, so they may have gaps.
school_df = school_df.dropna().drop_duplicates()
school_df.to_csv(f"{output_dir}School_location.csv")


Interest rate (2013 - 2021) (AUS)

In [16]:
output_dir = "../data/raw/rba/"

### Pull interest rate data (RBA statistical table F2.1)
url = "https://www.rba.gov.au/statistics/tables/csv/f2.1-data.csv?v=2022-09-08-09-00-18"
response = requests.get(url, headers=headers)
# Stop here on an HTTP error instead of saving the error body as if it were CSV.
response.raise_for_status()
write_file(f"{output_dir}interest_rate/", "interest_rate_raw.csv", response.text)


True

In [24]:
# Load the raw RBA table.
interest_df = pd.read_csv(f"{output_dir}interest_rate/interest_rate_raw.csv")

# The first data row supplies the column names (first column becomes "time").
# Rows before position 11 are dropped - presumably table metadata; TODO confirm.
col_names = ["time"] + interest_df.iloc[0].tolist()[1:]
interest_df = interest_df.iloc[11:]
interest_df.columns = col_names

# Keep the 3-year bond series and only the rows whose time label contains "Dec".
bond_column = "Commonwealth Government 3 year bond"
interest_df = (
    interest_df[["time", bond_column]]
    .rename(columns={bond_column: "bond"})
    .loc[lambda df: df["time"].str.contains("Dec")]
)

# The first run of digits in each time label is taken as the year;
# the bond values are converted to float.
digit_re = re.compile(r"(\d+)")
interest_df = interest_df.assign(
    time=[int(digit_re.findall(label)[0]) for label in interest_df["time"]],
    bond=interest_df["bond"].astype("float"),
).rename(columns={"time": "year"})

# Preview and save.
# NOTE(review): to_csv also writes the DataFrame index (the raw row positions
# visible in the preview) as an unnamed first column - confirm consumers expect it.
print(interest_df.head())
interest_df.to_csv(f"{output_dir}interest_rate/interest_rate.csv")


    year   bond
16  2013 2.9600
28  2014 2.2800
40  2015 2.1000
52  2016 1.9700
64  2017 2.0300
