In [1]:
from urllib.request import urlretrieve
import requests
import sys
import pandas as pd
import geopandas as gpd
import folium
import math
import zipfile
import os
import re
from itertools import compress
from pyspark.sql import SparkSession
from pyspark.sql import functions as F



spark = (
    SparkSession.builder.appName("Assignment_2")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)
pd.options.display.float_format = "{:,.4f}".format

OUTPUT_DIR = "../data/raw/ABS/"

headers = {"accept": "text/csv"}

22/09/08 22:29:56 WARN Utils: Your hostname, Bruce-PC resolves to a loopback address: 127.0.1.1; using 172.21.207.100 instead (on interface eth0)
22/09/08 22:29:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/08 22:29:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
def pull_direct(url, output_dir, file_name):
    """
    use urlretrieve function to directly pull data from given url and save it 
        to path: {output_dir}{file_name}
    url: the String url which needs to be pulled from
    output_dir: the String output folder directory, automatically create if not exist
    file_name: the String file name of the file needed to be pulled from
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    try:
        urlretrieve(url, f"{output_dir}{file_name}")
        print(
            f"Request succeed: pulling from{url}\nFile saved in: {output_dir}{file_name}"
        )
    except Exception as e:
        print(f"********\nRequest failure: ")
        print(e)
        print("********")


def write_file(output_dir, file_name, content, mod="w"):
    """
    write given content to local file at: {output_dir}{file_name} with mode: {mod}
    output_dir: the String output folder directory, automatically create if not exist
    file_name: the String file name used to save file
    content: expecting objects which can be written to file with open function
    mod: String of writing mode code used in writing file
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    try:
        with open(f"{output_dir}{file_name}", mod) as f:
            f.write(content)
    except Exception as e:
        print(f"****** Writing file failure: {output_dir}{file_name}")
        print(e)
        print("******")
        return False
    return True


def get_match_list(
    url,
    output_dir,
    file_name,
    xpath=".//structure:Code",
    name_space={
        "structure": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure"
    },
):
    """
    Used to download match list to {output_dir}{file_name} using given {url}
    Using {xpath} and {name_space} when reading the pulled xml
    url: the String url which needs to be pulled from
    output_dir: the String output folder directory, 
        automatically create if not exist
    file_name: the String file name used to save file
    xpath: A String used to select data in xml
    name_space: A String used to select data in xml
    """

    # pull_direct(url, output_dir, f"{file_name}.xml")
    response = requests.get(url, allow_redirects=True)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(f"{output_dir}{file_name}.xml", "w") as f:
        f.write(response.text)

    print(f"data saved: {output_dir}{file_name}.xml")

    # select data from xml
    df = pd.read_xml(f"{output_dir}{file_name}.xml", xpath=xpath, namespaces=name_space)
    df.to_csv(f"{output_dir}{file_name}.csv")


Greater Capital City Statistical Areas (GCCSA)

In [65]:
### Pull SA2 match table
url = f"https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/allocation-files/GCCSA_2021_AUST.xlsx"
file_name = "GCCSA_TO_Name.xlsx"
pull_direct(url, OUTPUT_DIR, file_name)

Request succeed: pulling fromhttps://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/allocation-files/GCCSA_2021_AUST.xlsx
File saved in: ../data/raw/Prediction/GCCSA_TO_Name.xlsx


In [66]:
# read data
vic_df = pd.read_excel(f"{OUTPUT_DIR}GCCSA_TO_Name.xlsx")

# Select greater melbourne areas
gmelb_code = vic_df.loc[vic_df["GCCSA_NAME_2021"] == "Greater Melbourne"]["GCCSA_NAME_2021"].values.tolist()[0]

print(gmelb_code)

Greater Melbourne


Residential Property Price Index (1/2011 - 12/2021)

In [67]:
### Pull Price_index data
url = "https://api.data.abs.gov.au/data/ABS,RPPI,1.0.0/1.3.2GMEL.Q?startPeriod=2011-Q1&endPeriod=2021-Q4&dimensionAtObservation=AllDimensions"
response = requests.get(url, headers=headers)
write_file(f"{OUTPUT_DIR}Price_index/", "Price_index_raw.csv", response.text)



True

In [68]:
# read file
price_sdf = spark.read.csv(f"{OUTPUT_DIR}Price_index/Price_index_raw.csv", header=True)
print("Before selection:")
price_sdf.show(2, vertical=True, truncate=100)

# feature selection / rename / reset datatype
price_sdf = price_sdf[["TIME_PERIOD", "OBS_VALUE"]]
price_sdf = (
    price_sdf.withColumnRenamed("TIME_PERIOD", "year")
    .withColumnRenamed("OBS_VALUE", "price_index")
    .withColumn("price_index", F.col("price_index").cast("int"))
)
price_sdf = price_sdf.filter(F.col("year").contains("Q4"))
price_sdf = price_sdf.withColumn("year", F.substring("year", 1, 4).cast("int"))

# save file
print("After selection:")
price_sdf.show(2, vertical=True, truncate=100)
price_sdf.write.option("header", True).mode("overwrite").csv(f"{OUTPUT_DIR}Price_index/Price_index.csv")


Before selection:
-RECORD 0------------------------
 DATAFLOW      | ABS:RPPI(1.0.0) 
 MEASURE       | 1               
 PROPERTY_TYPE | 3               
 REGION        | 2GMEL           
 FREQ          | Q               
 TIME_PERIOD   | 2011-Q1         
 OBS_VALUE     | 104.7           
 UNIT_MEASURE  | IN              
 OBS_STATUS    | null            
 OBS_COMMENT   | null            
-RECORD 1------------------------
 DATAFLOW      | ABS:RPPI(1.0.0) 
 MEASURE       | 1               
 PROPERTY_TYPE | 3               
 REGION        | 2GMEL           
 FREQ          | Q               
 TIME_PERIOD   | 2011-Q2         
 OBS_VALUE     | 103.5           
 UNIT_MEASURE  | IN              
 OBS_STATUS    | null            
 OBS_COMMENT   | null            
only showing top 2 rows

After selection:
-RECORD 0-----------
 year        | 2011 
 price_index | 100  
-RECORD 1-----------
 year        | 2012 
 price_index | 100  
only showing top 2 rows



Interest rate

In [3]:

### Pull Price_index data
url = "https://www.rba.gov.au/statistics/tables/csv/f2.1-data.csv?v=2022-09-08-09-00-18"
response = requests.get(url, headers=headers)
write_file(f"{OUTPUT_DIR}interest_rate/", "interest_rate_raw.csv", response.text)


True

In [13]:
# read file
interest_df = pd.read_csv(f"{OUTPUT_DIR}interest_rate/interest_rate_raw.csv")

# data cleaning and renaming
col_names = ["time"] + list(interest_df.iloc[[0]].values[0][1:])
interest_df = interest_df.iloc[11:]
interest_df.columns = col_names
interest_df = interest_df[["time", "Commonwealth Government 3 year bond"]]

# data selection
interest_df.columns = ["time", "bond"]
interest_df = interest_df.loc[interest_df["time"].str.contains("Dec")]
digit_re = re.compile(r"(\d+)")
print(digit_re.findall("Dec-2013"))

interest_df["time"] = interest_df["time"].apply(lambda x: int(digit_re.findall(x)[0]))

print(interest_df.head())


# interest_sdf = spark.read.csv(f"{OUTPUT_DIR}interest_rate/interest_rate_raw.csv")
# print("Before selection:")
# interest_sdf.show(2, vertical=True, truncate=100)

# # feature selection / rename / reset datatype
# interest_sdf = interest_sdf[["TIME_PERIOD", "OBS_VALUE"]]
# interest_sdf = (
#     interest_sdf.withColumnRenamed("TIME_PERIOD", "year")
#     .withColumnRenamed("OBS_VALUE", "price_index")
#     .withColumn("price_index", F.col("price_index").cast("int"))
# )
# interest_sdf = interest_sdf.filter(F.col("year").contains("Q4"))
# interest_sdf = interest_sdf.withColumn("year", F.substring("year", 1, 4).cast("int"))

# # # Filter: victoria SA2
# # vic_df = pd.read_csv(f"{OUTPUT_DIR}SA2_TO_Name.csv")
# # vic_sa2 = vic_df["code"].tolist()
# # interest_sdf = interest_sdf.filter(F.col("SA2").isin(vic_sa2))

# # save file
# print("After selection:")
# interest_sdf.show(2, vertical=True, truncate=100)
# interest_sdf.write.option("header", True).mode("overwrite").csv(f"{OUTPUT_DIR}interest_rate/interest_rate.csv")


['2013']
    time  bond
16  2013  2.96
28  2014  2.28
40  2015  2.10
52  2016  1.97
64  2017  2.03


Household income (weekly) (exclude visitor/non-classifiable) (2021)

In [69]:

### pull household income data
url = f"https://api.data.abs.gov.au/data/ABS,C21_G33_SA2,1.0.0/...SA2.?\
startPeriod=2021&dimensionAtObservation=AllDimensions"
response = requests.get(url, headers=headers)
write_file(f"{OUTPUT_DIR}Household_income_group/", "Household_income_raw.csv",
            response.text)

### pull match data
match_url = "https://api.data.abs.gov.au/datastructure/ABS/C21_G33_SA2/1.\
0.0?references=all"
match_output_dir = f"{OUTPUT_DIR}Household_income_group/"
match_file_name = "Household_income_match"
get_match_list(match_url, match_output_dir, match_file_name)




data saved: ../data/raw/Prediction/Household_income_group/Household_income_match.xml


In [70]:
# read data
income_sdf = spark.read.csv(
    f"{OUTPUT_DIR}Household_income_group/Household_income_raw.csv", header=True
)
print("Before selection:")
income_sdf.show(2, vertical=True, truncate=100)

# feature selection / rename / reset datatype
income_sdf = income_sdf[["REGION", "HIND", "HHCD", "OBS_VALUE", "STATE", "TIME_PERIOD"]]
income_sdf = (
    income_sdf.withColumnRenamed("REGION", "SA2")
    .withColumn("SA2", F.col("SA2").cast("int"))
    .withColumnRenamed("TIME_PERIOD", "year")
    .withColumn("year", F.col("year").cast("int"))
    .withColumnRenamed("OBS_VALUE", "popultaion")
    .withColumn("popultaion", F.col("popultaion").cast("int"))
    .withColumnRenamed("STATE", "state")
    .withColumn("state", F.col("state").cast("int"))
    .withColumnRenamed("HIND", "income_level")
    .withColumnRenamed("HHCD", "household_type")
)
match_sdf = spark.read.csv(
    f"{OUTPUT_DIR}Household_income_group/Household_income_match.csv", header=True
)[["id", "Name"]].withColumnRenamed("id", "code")

# Filter: victoria
income_sdf = income_sdf.filter(F.col("state") == 2)
income_sdf = income_sdf[["SA2", "year", "household_type", "income_level",\
                        "popultaion"]]


# inner join
match_sdf.createOrReplaceTempView("match")
income_sdf.createOrReplaceTempView("income_sdf")
matched_sdf = spark.sql(
    """
    SELECT SA2, year, Name AS household_type, income_level, popultaion
    FROM(
        SELECT SA2, year, household_type, Name AS income_level, popultaion
        FROM income_sdf
        INNER JOIN match ON income_sdf.income_level = match.code) AS half
    INNER JOIN match ON half.household_type = match.code
"""
)

# write filtered file
print("After selection:")
matched_sdf.show(2, vertical=True, truncate=100)
matched_sdf.write.option("header", True).mode("overwrite").csv(
    f"{OUTPUT_DIR}Household_income_group/Household_income.csv"
)

Before selection:
-RECORD 0-----------------------------
 DATAFLOW    | ABS:C21_G33_SA2(1.0.0) 
 HIND        | 7                      
 HHCD        | 1_2                    
 REGION      | 101021008              
 REGION_TYPE | SA2                    
 STATE       | 1                      
 TIME_PERIOD | 2021                   
 OBS_VALUE   | 94                     
-RECORD 1-----------------------------
 DATAFLOW    | ABS:C21_G33_SA2(1.0.0) 
 HIND        | 7                      
 HHCD        | _T                     
 REGION      | 101021611              
 REGION_TYPE | SA2                    
 STATE       | 1                      
 TIME_PERIOD | 2021                   
 OBS_VALUE   | 124                    
only showing top 2 rows

After selection:
-RECORD 0---------------------------
 SA2            | 201011002         
 year           | 2021              
 household_type | Family households 
 income_level   | Total             
 popultaion     | 3032              
-RECORD 1-------

In [71]:
# read data
household_sdf = spark.read.csv(f"{OUTPUT_DIR}Household_income_group/Household_income.csv", header=True)
print("before processing:")
household_sdf.show(2, vertical = True, truncate=100)

# filte household_type
household_sdf = household_sdf.filter(F.col("household_type") == "Total")

# filte income_level and find lower bound of each income_level
income_values = household_sdf.select(F.col('income_level'))\
                            .distinct().select("income_level")\
                            .rdd.flatMap(lambda x: x).collect()
digital_re = re.compile(r'[0-9\,]+')
int_re = re.compile(r'\d+')
income_list = []
income_lb = []
for income_value in income_values:
    result = digital_re.findall(income_value)
    if result:
        income_list.append(income_value)
        income_lb.append((income_value, "$"+''.join(int_re.findall(result[0]))))
household_sdf = household_sdf.filter(F.col("income_level").isin(income_list))


# change full boundary to lower boundary
match_sdf = spark.createDataFrame(income_lb, ["income_level", "lower_bound"])
print("\nmatch_table: ")
match_sdf.show(2, vertical = True, truncate=100)
household_sdf.createOrReplaceTempView("household")
match_sdf.createOrReplaceTempView("match")
household_sdf = spark.sql("""
    SELECT SA2, year, lower_bound, popultaion
    FROM household
    INNER JOIN match ON household.income_level = match.income_level
""")

print("\nafter processing:")
household_sdf.show(2, vertical = True, truncate=100)

before processing:
-RECORD 0---------------------------
 SA2            | 201011002         
 year           | 2021              
 household_type | Family households 
 income_level   | Total             
 popultaion     | 3032              
-RECORD 1---------------------------
 SA2            | 201011002         
 year           | 2021              
 household_type | Family households 
 income_level   | Total             
 popultaion     | 3032              
only showing top 2 rows


match_table: 
-RECORD 0-----------------
 income_level | $1-$149   
 lower_bound  | $1        
-RECORD 1-----------------
 income_level | $500-$649 
 lower_bound  | $500      
only showing top 2 rows


after processing:
-RECORD 0----------------
 SA2         | 204011056 
 year        | 2021      
 lower_bound | $1        
 popultaion  | 31        
-RECORD 1----------------
 SA2         | 204011056 
 year        | 2021      
 lower_bound | $1        
 popultaion  | 31        
only showing top 2 rows

