In [23]:
# Download the ABS Data API Swagger YAML file from GitHub

import requests

# URL of the ABS Data API Swagger YAML file, provided by the Australian Bureau of Statistics
url = "https://raw.githubusercontent.com/apigovau/api-descriptions/gh-pages/abs/DataAPI.openapi.yaml"

# Fetch the YAML description and store it under ../data/raw.
# A timeout is passed because requests otherwise waits indefinitely for a
# response, which would leave this cell hanging if the host stops answering.
try:
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Turn 4xx/5xx responses into HTTPError

    # Write the raw bytes so the file on disk is exactly what was served.
    with open("../data/raw/swagger.yaml", "wb") as file:
        file.write(response.content)
    print("Swagger YAML downloaded successfully!")

except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
except Exception as err:
    # Best-effort cell: anything else (connection errors, timeouts, file-system
    # errors) is reported rather than raised.
    print(f"An error occurred: {err}")

Swagger YAML downloaded successfully!


In [24]:
from dotenv import load_dotenv
import os
import requests

load_dotenv()  # Load environment variables from .env file
api_key = os.getenv('API_KEY')

# Dataflow listing endpoint; 'ABS' is the agency ID segment of the path.
url = "https://api.data.abs.gov.au/dataflow/ABS"

# Request headers. This dict is also read by get_data_csv later in the notebook.
headers = {
    "x-api-key": api_key,
    "Accept": "application/xml",  # Specify that we want XML format
}

# Single source of truth for the output location, so the success message
# below cannot drift from the path that is actually written.
dataflows_path = "../data/raw/dataflows.xml"

# Make the API call. The timeout keeps the cell from hanging forever if the
# server stops responding.
response = requests.get(url, headers=headers, timeout=60)

# Check the response status
if response.status_code == 200:
    xml_data = response.text  # Decoded copy of the XML, kept for interactive inspection
    print("Dataflows retrieved successfully in XML format!")
    # Write the undecoded bytes: the file is parsed later with ET.parse, and
    # re-encoding the text with the platform default encoding could produce
    # a file that no longer matches what the server sent.
    with open(dataflows_path, "wb") as file:
        file.write(response.content)
    print(f"Dataflows saved successfully to {dataflows_path}")
else:
    print(f"Failed to retrieve dataflows. Status code: {response.status_code}")
    print(response.text)


Dataflows retrieved successfully in XML format!
Dataflows saved successfully to ../temp/dataflows.xml


In [25]:
import xml.etree.ElementTree as ET

# Load and parse the XML file
# Parse the dataflow listing from disk.
xml_file_path = '../data/raw/dataflows.xml'
tree = ET.parse(xml_file_path)
root = tree.getroot()

# Prefix -> URI map used to resolve the prefixed tag names in the queries below.
namespace = dict(
    sdmx='http://www.sdmx.org/resources/sdmxml/sdmx-2.1',
    structure='http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure',
    common='http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common',
)

# Stays None when no Dataflow element with the wanted id exists.
dataflow_info_poa = None

# Walk every Dataflow element and stop at the first one whose id is C21_G02_POA.
for dataflow in root.findall('.//structure:Dataflow', namespace):
    if dataflow.get('id') != 'C21_G02_POA':
        continue

    # Name and Description children may be absent; fall back to None for those.
    name_node = dataflow.find('common:Name', namespace)
    description_node = dataflow.find('common:Description', namespace)

    dataflow_info_poa = {
        'id': dataflow.get('id'),
        'agencyID': dataflow.get('agencyID'),
        'version': dataflow.get('version'),
        'isFinal': dataflow.get('isFinal'),
        'name': None if name_node is None else name_node.text,
        'description': None if description_node is None else description_node.text,
    }
    break

dataflow_info_poa

{'id': 'C21_G02_POA',
 'agencyID': 'ABS',
 'version': '1.0.0',
 'isFinal': 'true',
 'name': 'Census 2021, G02 Selected medians and averages, Postal Areas (POA)',
 'description': "Selected medians and averages data for Postal Areas (POA), 2021 Census.\n\nMedian age of persons excludes overseas visitors.\n\nMedian total personal income is applicable to persons aged 15 years and over.\n\nMedian total family income is applicable to families in family households. It excludes families where at least one member aged 15 years and over did not state an income and families where at \n\nleast one member aged 15 years and over was temporarily absent on Census Night. \n\nMedian total household income is applicable to occupied private dwellings. It excludes households where at least one member aged 15 years and over did not state an income and \n\nhouseholds where at least one member aged 15 years and over was temporarily absent on Census Night. It excludes 'Visitors only' and 'Other non-classifiabl

In [26]:
import requests

# API base URL
base_url = "https://api.data.abs.gov.au/data"


def get_data_csv(dataflow_id, data_key, params=None, timeout=60):
    """Request data for one dataflow and return the CSV body as text.

    The request goes to ``{base_url}/{dataflow_id}/{data_key}?format=csv`` and
    is sent with the notebook-level ``headers`` dict, which must already be
    defined when this function is called.

    Args:
        dataflow_id: Dataflow identifier placed in the URL path.
        data_key: Data key placed in the URL path after the dataflow id.
        params: Optional extra query parameters passed through to
            ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        The response body as a string when the status code is 200; otherwise
        ``None``, after printing the dataflow id and status code.

    Raises:
        Exceptions raised by ``requests.get`` itself (connection failures,
        timeouts) are not caught here.
    """
    url = f"{base_url}/{dataflow_id}/{data_key}?format=csv"  # Specify CSV format in the URL
    response = requests.get(url, headers=headers, params=params, timeout=timeout)

    if response.status_code == 200:
        return response.text  # Return the CSV response as text

    print(f"Failed to retrieve data for {dataflow_id}. Status code: {response.status_code}")
    return None

# Retrieve all data for C21_G02_POA
dataflow_id = "C21_G02_POA"  # Dataflow ID for C21_G02_POA
data_key = "all"  # Use "all" to retrieve all data

# Get the data (None when the request did not return status 200)
c21_g02_poa_csv_result = get_data_csv(dataflow_id, data_key)

# Save the data to a CSV file; skipped when nothing was returned.
if c21_g02_poa_csv_result:
    # encoding is pinned so the file does not depend on the machine's locale,
    # and newline="" writes the payload's own line endings untouched instead
    # of letting text mode translate them.
    with open("../data/raw/C21_G02_POA_data.csv", "w", encoding="utf-8", newline="") as f:
        f.write(c21_g02_poa_csv_result)
    print("C21_G02_POA data retrieved and saved successfully to 'C21_G02_POA_data.csv'.")


C21_G02_POA data retrieved and saved successfully to 'C21_G02_POA_data.csv'.


In [27]:
import pandas as pd

# Read the saved raw extract back from disk as a DataFrame.
raw_csv_path = "../data/raw/C21_G02_POA_data.csv"
df_c21_g02_poa = pd.read_csv(raw_csv_path)

# Preview the top of the table.
df_c21_g02_poa.head(n=5)

Unnamed: 0,DATAFLOW,MEDAVG,REGION,REGION_TYPE,STATE,TIME_PERIOD,OBS_VALUE
0,ABS:C21_G02_POA(1.0.0),1,2076,POA,1,2021,43.0
1,ABS:C21_G02_POA(1.0.0),1,2222,POA,1,2021,38.0
2,ABS:C21_G02_POA(1.0.0),1,2370,POA,1,2021,48.0
3,ABS:C21_G02_POA(1.0.0),1,2406,POA,1,2021,41.0
4,ABS:C21_G02_POA(1.0.0),1,2565,POA,1,2021,34.0


In [28]:
# Order rows by REGION first, then by MEDAVG inside each region.
sort_keys = ['REGION', 'MEDAVG']
df_c21_g02_poa_sorted = df_c21_g02_poa.sort_values(by=sort_keys)

# Preview the first ten sorted rows.
df_c21_g02_poa_sorted.head(n=10)

Unnamed: 0,DATAFLOW,MEDAVG,REGION,REGION_TYPE,STATE,TIME_PERIOD,OBS_VALUE
1169,ABS:C21_G02_POA(1.0.0),1,800,POA,7,2021,33.0
14242,ABS:C21_G02_POA(1.0.0),2,800,POA,7,2021,1236.0
19685,ABS:C21_G02_POA(1.0.0),3,800,POA,7,2021,2403.0
16255,ABS:C21_G02_POA(1.0.0),4,800,POA,7,2021,2151.0
7196,ABS:C21_G02_POA(1.0.0),5,800,POA,7,2021,1781.0
2914,ABS:C21_G02_POA(1.0.0),6,800,POA,7,2021,450.0
13045,ABS:C21_G02_POA(1.0.0),7,800,POA,7,2021,1.0
2634,ABS:C21_G02_POA(1.0.0),8,800,POA,7,2021,2.0
20312,ABS:C21_G02_POA(1.0.0),1,810,POA,7,2021,34.0
10957,ABS:C21_G02_POA(1.0.0),2,810,POA,7,2021,1058.0


In [29]:
# Copy of the sorted frame in which the REGION column is called POA.
df_new = df_c21_g02_poa_sorted.rename(columns={'REGION': 'POA'})

# Reshape long -> wide: one row per POA, one column per MEDAVG value, cells
# filled from OBS_VALUE (aggfunc='first' picks a single value per cell).
poa_by_medavg = df_new.pivot_table(
    index='POA',
    columns='MEDAVG',
    values='OBS_VALUE',
    aggfunc='first',
)

# Move POA out of the index and back into an ordinary column.
df_pivot = poa_by_medavg.reset_index()

# Preview the pivoted table.
df_pivot.head()


MEDAVG,POA,1,2,3,4,5,6,7,8
0,800,33.0,1236.0,2403.0,2151.0,1781.0,450.0,1.0,2.0
1,810,34.0,1058.0,2455.0,2199.0,2058.0,345.0,0.9,2.7
2,812,37.0,979.0,2299.0,2046.0,1998.0,360.0,0.9,2.8
3,820,35.0,1271.0,2732.0,2308.0,2037.0,400.0,0.9,2.4
4,822,30.0,298.0,833.0,1347.0,2000.0,80.0,1.4,3.8


In [30]:
# Blank out every column label (each one becomes None); the number of
# columns is unchanged.
df_pivot.columns = [None for _ in df_pivot.columns]

# Preview the frame with its labels cleared.
df_pivot.head()

Unnamed: 0,None,None.1,None.2,None.3,None.4,None.5,None.6,None.7,None.8
0,800,33.0,1236.0,2403.0,2151.0,1781.0,450.0,1.0,2.0
1,810,34.0,1058.0,2455.0,2199.0,2058.0,345.0,0.9,2.7
2,812,37.0,979.0,2299.0,2046.0,1998.0,360.0,0.9,2.8
3,820,35.0,1271.0,2732.0,2308.0,2037.0,400.0,0.9,2.4
4,822,30.0,298.0,833.0,1347.0,2000.0,80.0,1.4,3.8


In [31]:
# New labels, assigned strictly by position: the first entry names the first
# column of df_pivot, and so on. The list length must equal the column count.
curated_column_names = [
    'POA_CODE_2021',
    'Median_age_persons',
    'Median_tot_prsnl_inc_weekly',
    'Median_tot_fam_inc_weekly',
    'Median_tot_hhd_inc_weekly',
    'Median_mortgage_repay_monthly',
    'Median_rent_weekly',
    'Average_num_psns_per_bedroom',
    'Average_household_size',
]
df_pivot.columns = curated_column_names

# Preview the frame under its new labels.
df_pivot.head()

Unnamed: 0,POA_CODE_2021,Median_age_persons,Median_tot_prsnl_inc_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,Median_mortgage_repay_monthly,Median_rent_weekly,Average_num_psns_per_bedroom,Average_household_size
0,800,33.0,1236.0,2403.0,2151.0,1781.0,450.0,1.0,2.0
1,810,34.0,1058.0,2455.0,2199.0,2058.0,345.0,0.9,2.7
2,812,37.0,979.0,2299.0,2046.0,1998.0,360.0,0.9,2.8
3,820,35.0,1271.0,2732.0,2308.0,2037.0,400.0,0.9,2.4
4,822,30.0,298.0,833.0,1347.0,2000.0,80.0,1.4,3.8


In [32]:
# Columns that are not carried into the curated output.
columns_to_remove = ['Median_tot_fam_inc_weekly', 'Average_num_psns_per_bedroom', 'Median_tot_prsnl_inc_weekly', 'Average_household_size']

# Rebind df_pivot to the trimmed frame instead of mutating it in place, and
# skip labels that are already gone, so running this cell a second time does
# not raise KeyError.
df_pivot = df_pivot.drop(columns=columns_to_remove, errors="ignore")

df_pivot.head()

Unnamed: 0,POA_CODE_2021,Median_age_persons,Median_tot_hhd_inc_weekly,Median_mortgage_repay_monthly,Median_rent_weekly
0,800,33.0,2151.0,1781.0,450.0
1,810,34.0,2199.0,2058.0,345.0
2,812,37.0,2046.0,1998.0,360.0
3,820,35.0,2308.0,2037.0,400.0
4,822,30.0,1347.0,2000.0,80.0


In [34]:
# Write the curated table to disk; index=False leaves the row index out of the file.
output_path = '../data/curated/2021Census_G02_AUST_POA_curated.csv'
df_pivot.to_csv(path_or_buf=output_path, index=False)