In [1]:
from urllib.request import urlretrieve
import sys
import pandas as pd
import requests
import math
import zipfile
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


# Session-level Spark SQL settings applied to the builder below.
_spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
}

# Build (or reuse) the notebook's Spark session, one setting at a time.
_spark_builder = SparkSession.builder.appName("Assignment_2")
for _setting, _setting_value in _spark_settings.items():
    _spark_builder = _spark_builder.config(_setting, _setting_value)
spark = _spark_builder.getOrCreate()

# Show pandas floats with thousands separators and four decimals.
pd.options.display.float_format = '{:,.4f}'.format

# Root folder for every raw ABS download made by this notebook.
OUTPUT_DIR = "../data/raw/ABS/"

22/09/01 20:46:09 WARN Utils: Your hostname, Bruce-PC resolves to a loopback address: 127.0.1.1; using 172.21.207.231 instead (on interface eth0)
22/09/01 20:46:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/01 20:46:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [19]:
def pull_direct(url, output_dir, file_name):
    """Download ``url`` and save it as ``file_name`` inside ``output_dir``.

    The directory is created when missing. Download errors are not raised:
    they are printed between ``********`` markers and the function returns
    normally, so callers must not assume the file exists afterwards.

    Args:
        url: Address to fetch (anything ``urllib.request.urlretrieve`` accepts).
        output_dir: Destination directory, with or without a trailing
            separator. An empty string means the current directory.
        file_name: Name of the file to create inside ``output_dir``.

    Returns:
        None.
    """
    # exist_ok removes the check-then-create race; an empty directory name
    # is skipped because os.makedirs("") raises.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Same result as plain concatenation when output_dir ends with a
    # separator, but the file still lands inside the directory when it
    # does not.
    target = os.path.join(output_dir, file_name)

    try:
        urlretrieve(url, target)
        print(f"Request succeed: pulling from {url}\nFile saved in: {target}")
    except Exception as e:
        print("********\nRequest failure: ")
        print(e)
        print("********")

def write_file(output_dir, file_name, content, mod = "w"):
    """Write ``content`` to ``file_name`` inside ``output_dir``.

    The directory is created when missing. Errors raised while opening or
    writing the file are printed and reported through the return value
    instead of being raised.

    Args:
        output_dir: Destination directory, with or without a trailing
            separator. An empty string means the current directory.
        file_name: Name of the file to create inside ``output_dir``.
        content: ``str`` for text modes, ``bytes`` for binary modes.
        mod: Mode string passed to ``open`` (for example ``"w"`` or ``"wb"``).

    Returns:
        True when the content was written, False when opening or writing
        failed.
    """
    # exist_ok removes the check-then-create race; an empty directory name
    # is skipped because os.makedirs("") raises.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Works whether or not output_dir ends with a path separator.
    target = os.path.join(output_dir, file_name)

    # Text modes get an explicit UTF-8 encoding so the result does not depend
    # on the platform default; open() rejects an encoding in binary mode.
    encoding = None if "b" in mod else "utf-8"

    try:
        with open(target, mod, encoding=encoding) as f:
            f.write(content)
    except Exception as e:
        print(f"****** Writing file failure: {target}")
        print(e)
        print("******")
        return False
    return True

def get_match_list(url, output_dir, file_name, xpath = ".//structure:Code",
    name_space = {"structure": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure"}):
    """Download an XML document and export the nodes matched by ``xpath`` as CSV.

    The response body is saved as ``<file_name>.xml`` in ``output_dir``,
    parsed with ``pandas.read_xml``, and the resulting frame is written to
    ``<file_name>.csv`` in the same directory.

    Args:
        url: Address of the XML document.
        output_dir: Destination directory, with or without a trailing
            separator; created when missing.
        file_name: Base name (no extension) for the ``.xml`` and ``.csv`` files.
        xpath: XPath selecting the nodes that become rows.
        name_space: Prefix-to-URI mapping used to resolve ``xpath``. The
            default dict is only passed through to ``pandas.read_xml``.

    Returns:
        None.

    Raises:
        requests.HTTPError: When the server answers with an error status.
    """
    response = requests.get(url, allow_redirects=True)
    # Fail here with a clear HTTP error rather than saving an error page as
    # .xml and failing later inside the XML parser.
    response.raise_for_status()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    xml_path = os.path.join(output_dir, f"{file_name}.xml")
    csv_path = os.path.join(output_dir, f"{file_name}.csv")

    # Save the bytes exactly as received so the document is not re-encoded
    # with the platform's default text encoding.
    with open(xml_path, "wb") as f:
        f.write(response.content)

    print(xml_path)

    df = pd.read_xml(xml_path,
        xpath=xpath,
        namespaces=name_space
    )
    # Progress marker: reaching this line means the XML parsed successfully.
    print(True)

    df.to_csv(csv_path)




SA2 match table

In [3]:
### Pull SA2 match table
# Name the local copy first, then point at the ABS allocation file to download.
file_name = "SA2_2021_AUST.xlsx"
url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/allocation-files/SA2_2021_AUST.xlsx"
pull_direct(url=url, output_dir=OUTPUT_DIR, file_name=file_name)

Request succeed: pulling fromhttps://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/allocation-files/SA2_2021_AUST.xlsx
File saved in: ../data/raw/ABS/SA2_2021_AUST.xlsx


SA2 shape file

In [4]:
# Download the SA2 digital boundary archive, then unpack it under digitalBoundary/.
file_name = "SA2_2021_AUST_SHP_GDA2020.zip"
url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip"
pull_direct(url=url, output_dir=OUTPUT_DIR, file_name=file_name)

archive_path = f"{OUTPUT_DIR}{file_name}"
extract_dir = f"{OUTPUT_DIR}digitalBoundary/"
with zipfile.ZipFile(archive_path, mode="r") as archive:
    archive.extractall(path=extract_dir)

Request succeed: pulling fromhttps://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip
File saved in: ../data/raw/ABS/SA2_2021_AUST_SHP_GDA2020.zip


In [5]:
# Unpack the archive currently named by `file_name` into digitalBoundary/.
# NOTE(review): the shapefile download cell already performs this extraction,
# and this cell depends on `file_name` still holding the zip name — consider
# removing it.
archive_path = f"{OUTPUT_DIR}{file_name}"
extract_dir = f"{OUTPUT_DIR}digitalBoundary/"
with zipfile.ZipFile(archive_path, mode="r") as archive:
    archive.extractall(path=extract_dir)

ABS Data

In [6]:
# Request headers sent with the ABS Data API calls: ask for a CSV response body.
headers = dict(accept="text/csv")

Estimated Resident Population (ERP) (2001 to 2021)

In [23]:
# ERP observations requested from the ABS Data API as CSV (startPeriod 2010, endPeriod 2021).
url = "https://api.data.abs.gov.au/data/ABS,ABS_ANNUAL_ERP_ASGS2021,1.2.0/.SA2..A?startPeriod=2010&endPeriod=2021&dimensionAtObservation=AllDimensions"

response = requests.get(url, headers=headers)
# Stop on an HTTP error instead of saving the error body as ERP_raw.csv.
response.raise_for_status()
write_file(f"{OUTPUT_DIR}ERP/", "ERP_raw.csv", response.text)

# Save the matching data-structure XML and export its Code elements to ERP_match.csv.
match_url = "https://api.data.abs.gov.au/datastructure/ABS/ABS_ANNUAL_ERP_ASGS2021/1.2.0?references=all"
match_output_dir = f"{OUTPUT_DIR}ERP/"
match_file_name = "ERP_match"
get_match_list(match_url, match_output_dir, match_file_name)



../data/raw/ABS/ERP/ERP_match.xml
True


In [20]:
# Preview the first two ERP rows, one field per line.
# NOTE(review): this reads ERP.csv, while the download cell saves ERP_raw.csv —
# presumably another step produces ERP.csv; confirm.
# NOTE(review): no header option is passed, so the CSV header line is read as
# the first data record.
erp_csv_path = f"{OUTPUT_DIR}ERP/ERP.csv"
rsdf = spark.read.csv(erp_csv_path)
rsdf.show(n=2, truncate=100, vertical=True)

-RECORD 0---------------------------------
 _c0 | DATAFLOW                           
 _c1 | MEASURE                            
 _c2 | REGION_TYPE                        
 _c3 | ASGS_2021                          
 _c4 | FREQ                               
 _c5 | TIME_PERIOD                        
 _c6 | OBS_VALUE                          
 _c7 | UNIT_MEASURE                       
 _c8 | OBS_STATUS                         
 _c9 | OBS_COMMENT                        
-RECORD 1---------------------------------
 _c0 | ABS:ABS_ANNUAL_ERP_ASGS2021(1.2.0) 
 _c1 | ERP                                
 _c2 | SA2                                
 _c3 | 101021010                          
 _c4 | A                                  
 _c5 | 2010                               
 _c6 | 4813                               
 _c7 | PSNS                               
 _c8 | null                               
 _c9 | null                               
only showing top 2 rows



Household income (weekly) (exclude visitor/non-classifiable) (2021)

In [24]:
# Household income observations (dataflow C21_G33_SA2) requested as CSV from 2021 onwards.
url = "https://api.data.abs.gov.au/data/ABS,C21_G33_SA2,1.0.0/...SA2.?startPeriod=2021&dimensionAtObservation=AllDimensions"

response = requests.get(url, headers=headers)
# Stop on an HTTP error instead of saving the error body as Household_income_raw.csv.
response.raise_for_status()
write_file(f"{OUTPUT_DIR}Household_income/", "Household_income_raw.csv", response.text)

# Save the matching data-structure XML and export its Code elements to
# Household_income_match.csv.
match_url = "https://api.data.abs.gov.au/datastructure/ABS/C21_G33_SA2/1.0.0?references=all"
match_output_dir = f"{OUTPUT_DIR}Household_income/"
match_file_name = "Household_income_match"
get_match_list(match_url, match_output_dir, match_file_name)


../data/raw/ABS/Household_income/Household_income_match.xml
True


In [7]:
# Preview the first two household-income rows, one field per line.
# NOTE(review): this reads Household_income.csv, while the download cell saves
# Household_income_raw.csv — presumably another step produces it; confirm.
# NOTE(review): no header option is passed, so the CSV header line is read as
# the first data record.
household_income_csv_path = f"{OUTPUT_DIR}Household_income/Household_income.csv"
rsdf = spark.read.csv(household_income_csv_path)
rsdf.show(n=2, truncate=100, vertical=True)

-RECORD 0---------------------
 _c0 | DATAFLOW               
 _c1 | HIND                   
 _c2 | HHCD                   
 _c3 | REGION                 
 _c4 | REGION_TYPE            
 _c5 | STATE                  
 _c6 | TIME_PERIOD            
 _c7 | OBS_VALUE              
-RECORD 1---------------------
 _c0 | ABS:C21_G33_SA2(1.0.0) 
 _c1 | 7                      
 _c2 | 1_2                    
 _c3 | 101021008              
 _c4 | SA2                    
 _c5 | 1                      
 _c6 | 2021                   
 _c7 | 94                     
only showing top 2 rows



Population projection (2017 - 2066)

In [25]:
# Population projection observations requested as CSV from 2017 onwards.
url = "https://api.data.abs.gov.au/data/ABS,POP_PROJ_REGION_2012_2061,1.0.0/2+21+22.3.TT.1.1.1.1.A?startPeriod=2017&dimensionAtObservation=AllDimensions"

response = requests.get(url, headers=headers)
# Stop on an HTTP error instead of saving the error body as Population_raw.csv.
response.raise_for_status()
write_file(f"{OUTPUT_DIR}Population/", "Population_raw.csv", response.text)

# Save the matching data-structure XML and export its Code elements to
# Population_match.csv.
match_url = "https://api.data.abs.gov.au/datastructure/ABS/POP_PROJ_REGION_2012_2061/1.0.0?references=all"
match_output_dir = f"{OUTPUT_DIR}Population/"
match_file_name = "Population_match"
get_match_list(match_url, match_output_dir, match_file_name)

../data/raw/ABS/Population/Population_match.xml
True


In [21]:
# Preview the first two population-projection rows, one field per line.
# NOTE(review): this reads Population.csv, while the download cell saves
# Population_raw.csv — presumably another step produces it; confirm.
# NOTE(review): no header option is passed, so the CSV header line is read as
# the first data record.
population_csv_path = f"{OUTPUT_DIR}Population/Population.csv"
rsdf = spark.read.csv(population_csv_path)
rsdf.show(n=2, truncate=100, vertical=True)

-RECORD 0------------------------------------
 _c0  | DATAFLOW                             
 _c1  | REGION                               
 _c2  | SEX_ABS                              
 _c3  | AGE                                  
 _c4  | FERTILITY                            
 _c5  | MORTALITY                            
 _c6  | NOM                                  
 _c7  | NIM                                  
 _c8  | FREQUENCY                            
 _c9  | TIME_PERIOD                          
 _c10 | OBS_VALUE                            
 _c11 | UNIT_MEASURE                         
 _c12 | OBS_STATUS                           
 _c13 | OBS_COMMENT                          
-RECORD 1------------------------------------
 _c0  | ABS:POP_PROJ_REGION_2012_2061(1.0.0) 
 _c1  | 4                                    
 _c2  | 3                                    
 _c3  | TT                                   
 _c4  | 1                                    
 _c5  | 1                         

School location

In [26]:
# Download the ACARA 2021 school-location workbook (an .xlsx file).
url = "https://www.acara.edu.au/docs/default-source/default-document-library/school-location-2021e23a2f404c94637ead88ff00003e0139.xlsx?sfvrsn=51ae4c07_0"

response = requests.get(url)
# Stop on an HTTP error instead of saving the error body as a workbook.
response.raise_for_status()

# The payload is an Excel workbook, so keep the .xlsx extension and save it at
# f"{OUTPUT_DIR}School_location.xlsx" — the path the loading cell passes to
# pd.read_excel. It was previously written to School_location/School_location.csv,
# which that cell never reads.
write_file(OUTPUT_DIR, "School_location.xlsx", response.content, "wb")

True

In [15]:
# Load the "SchoolLocations 2021" sheet of the school-location workbook and
# print its first rows as plain text.
school_xlsx_path = f"{OUTPUT_DIR}School_location.xlsx"
s_df = pd.read_excel(school_xlsx_path, sheet_name="SchoolLocations 2021")
school_preview = s_df.head()
print(school_preview)

   Calendar Year  ACARA SML ID  Location AGE ID  School AGE ID  \
0           2021         40000      40,003.0000         3.0000   
1           2021         40001      40,004.0000         4.0000   
2           2021         40002      40,005.0000         5.0000   
3           2021         40003      40,007.0000         7.0000   
4           2021         40004      40,009.0000         9.0000   

                                School Name        Suburb State  Postcode  \
0            Corpus Christi Catholic School     BELLERIVE   TAS      7018   
1                              Fahan School     SANDY BAY   TAS      7005   
2                  Geneva Christian College       LATROBE   TAS      7307   
3               Holy Rosary Catholic School     CLAREMONT   TAS      7011   
4  Immaculate Heart of Mary Catholic School  LENAH VALLEY   TAS      7008   

  School Sector School Type  ... Statistical Area 3 Statistical Area 3 Name  \
0      Catholic     Primary  ...              60102     Hobar

In [12]:
# Leftover debugging check: shows the type of the most recent `response` body
# (relies on `response` from whichever download cell ran last).
type(response.content)

bytes

API test (draft)

In [6]:
# Draft check: load the "Summary of ASGS resources" CSV and show its first rows.
asgs_summary_path = "../data/raw/ABS/Summary of ASGS resources.csv"
boundary_df = pd.read_csv(asgs_summary_path)
boundary_df.head()

Unnamed: 0,Summary of ASGS resources
Action,ASGS Resource
View and compare boundaries,ABS Maps
View a list of ASGS regions,Allocation files
Convert data from one region to another,Correspondences
View and analyse boundaries in desktop Geospatial Information Systems,Digital boundary files
