## 4. External dataset

We will look at the 2021 Census dataset from the ABS (Australian Bureau of Statistics).

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import folium
import geopandas as gpd
from shapely import wkt
import requests
import os

The shapefile was downloaded from "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip".

In [None]:
# Load the raw SA2 boundary shapefile and report its (rows, columns)
sa2_shapefile_path = "../data/tables/sa2_boundary/SA2_2021_AUST_GDA2020.shp"
sa2_boundary_gdf = gpd.read_file(sa2_shapefile_path)
sa2_boundary_gdf.shape

In [None]:
# Preview the first five rows (head() defaults to n=5)
sa2_boundary_gdf.head()

It includes SA2 geometries along with additional attributes, such as the corresponding SA3 and SA4 areas to which each region belongs.

In [None]:
# Check missing values: list the SA2 rows that have no geometry.
# isna() already returns a boolean mask, so it can index the frame directly.
sa2_boundary_gdf[sa2_boundary_gdf.geometry.isna()]

We will just remove them because they are special purpose codes for Indigenous Regions, Indigenous Areas and Indigenous Locations, Migratory – Offshore – Shipping, and outside Australia.

In [7]:
# Use seaborn's "darkgrid" look for every figure drawn after this cell
sns.set_style(style="darkgrid")

In [None]:
# Visualize the raw SA2 boundaries
fig, ax = plt.subplots(figsize=(12, 10))

# A thin outline keeps the many small metropolitan SA2 polygons distinguishable
sa2_boundary_gdf.plot(ax=ax, linewidth=0.2)

# Label the figure so it stands on its own when the notebook is skimmed.
# NOTE(review): axis labels assume the shapefile uses geographic (lon/lat) GDA2020 coordinates — confirm via sa2_boundary_gdf.crs
ax.set_title("SA2 boundaries (ABS 2021, raw shapefile)")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude");  # trailing ';' suppresses the text repr in the cell output

We would like to retrieve the Census data from the ABS API.
- **Dictionary**: https://www.abs.gov.au/census/guide-census-data/census-dictionary/2021/variables-index

- API Guide: https://www.abs.gov.au/about/data-services/application-programming-interfaces-apis/data-api-user-guide/using-api

- All available dataflow_id for API url: https://api.data.abs.gov.au/dataflow/all?detail=allstubs

- Information on how to get available dataflow_id csv file: https://www.abs.gov.au/about/data-services/application-programming-interfaces-apis/data-api-user-guide/tutorials-helpful-files-links

- This file gives us information on **which dataflow_id to request**: https://www.abs.gov.au/census/guide-census-data/about-census-tools/download-product-templates/2021_General%20Community%20Profile_R1_R2.xlsx

- Link template for DSD: https://api.data.abs.gov.au/dataflow/ABS/{dataflow_id}?references=codelist



- Download all data - Census DataPacks: https://www.abs.gov.au/census/find-census-data/datapacks (we don't use it)

We decided to use weekly household income (G33), median monthly mortgage repayments (G02), monthly mortgage repayments (G38) and weekly rent (G40).

In [9]:
# Ensure the folder that will hold the downloaded SA2 census tables exists
# (exist_ok=True makes this cell safe to re-run)
directory_path = '../data/tables/sa2_dataset/main/'
os.makedirs(name=directory_path, exist_ok=True)

In [None]:
# Download each Census dataflow from the ABS Data API as CSV, keep a raw copy,
# and write a second copy restricted to SA2-level rows.
dataflow_ids = ['C21_G33_SA2', 'C21_G38_SA2', 'C21_G40_SA2', 'C21_G02_SA2']

base_path = '../data/tables/sa2_dataset/main/'
# Create the target folder here as well so this cell does not depend on another cell having run
os.makedirs(base_path, exist_ok=True)

# Ask the API for CSV instead of its default response format
headers = {'accept': 'text/csv'}

for dataflow_id in dataflow_ids:
    url = f'https://api.data.abs.gov.au/data/{dataflow_id}/all'

    # Define paths
    file_path = os.path.join(base_path, f'{dataflow_id}.csv')
    filtered_file_path = os.path.join(base_path, f'{dataflow_id}_filtered.csv')

    try:
        # Download file. The context manager releases the streamed connection even
        # on early exit, and the (connect, read) timeout stops a stalled server
        # from hanging the notebook forever.
        with requests.get(url, headers=headers, stream=True, timeout=(10, 300)) as response:
            response.raise_for_status()

            if 'text/csv' not in response.headers.get('Content-Type', ''):
                print(f'Unexpected content type for {dataflow_id}:', response.headers.get('Content-Type'))
                continue

            # Stream to disk in chunks so the whole payload is never held in memory
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
    except requests.RequestException as e:
        print(f'An error occurred for {dataflow_id}: {e}')
        continue

    # Filter location: keep only rows reported at SA2 level
    data = pd.read_csv(file_path)
    keep_rows = data['REGION_TYPE'] == 'SA2'
    if dataflow_id == 'C21_G02_SA2':
        # Combine both conditions in one mask. Previously the MEDAVG filter was
        # overwritten by the REGION_TYPE filter and therefore had no effect.
        # NOTE(review): MEDAVG code 5 is presumably the median monthly mortgage
        # repayment measure this notebook wants from G02 — confirm against the codelist.
        keep_rows = keep_rows & (data['MEDAVG'] == 5)

    filtered_data = data[keep_rows]
    filtered_data.to_csv(filtered_file_path, index=False)


Check data

In [None]:
# Load the filtered C21_G02_SA2 extract and report its (rows, columns)
g02_filtered_csv = "../data/tables/sa2_dataset/main/C21_G02_SA2_filtered.csv"
median_mortgage = pd.read_csv(g02_filtered_csv)
median_mortgage.shape

In [None]:
# Preview the first five rows of the filtered G02 table
median_mortgage.head(5)

In [None]:
# Load the filtered C21_G33_SA2 extract and report its (rows, columns)
g33_filtered_csv = "../data/tables/sa2_dataset/main/C21_G33_SA2_filtered.csv"
household_income_weekly = pd.read_csv(g33_filtered_csv)
household_income_weekly.shape

In [None]:
# Preview the first five rows of the filtered G33 table
household_income_weekly.head(5)

### Consideration
- Rename columns
- Remove the time_period and dataflow columns
- Instead of replacing each code with its label, we will explain what the codes mean when we perform the data analysis.

# Create Cleaning Function

In [15]:
import sys

# Make the project's helper modules under ../scripts importable.
# The membership check keeps sys.path free of duplicate entries when this cell is re-run.
scripts_dir = '../scripts'
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from etl import clean_external_df, clean_shapefile_sa2

In [16]:
# Clean and save data in curated folder
# NOTE(review): clean_external_df() is called only for its side effects (its return value is discarded);
# presumably it writes the *_clean.csv files under ../data/curated/sa2_dataset/ — confirm in scripts/etl.py
clean_external_df()

# Reload the raw SA2 shapefile (same path as read at the start of this section) and pass it to the cleaner
sa2_boundary_gdf = gpd.read_file("../data/tables/sa2_boundary/SA2_2021_AUST_GDA2020.shp")
# clean_sa2 holds whatever clean_shapefile_sa2 returns.
# NOTE(review): presumably the helper also saves the cleaned shapefile to ../data/curated/sa2_boundary/ — TODO confirm
clean_sa2 = clean_shapefile_sa2(sa2_boundary_gdf)

Check dataset in curated/sa2_dataset/

In [None]:
# For every curated census table: print its shape, then show the first rows
dataflow_ids = ['C21_G33_SA2', 'C21_G38_SA2', 'C21_G40_SA2', 'C21_G02_SA2']
base_path = '../data/curated/sa2_dataset/'

for dataflow_id in dataflow_ids:
    # Curated files follow the "<dataflow_id>_clean.csv" naming scheme
    clean_file_path = os.path.join(base_path, dataflow_id + '_clean.csv')

    print(f"Shape of {dataflow_id}:")
    data = pd.read_csv(clean_file_path)
    n_rows_cols = data.shape
    print(n_rows_cols)

    # display() gives the rich table rendering even from inside a loop
    display(data.head())

In [None]:
# Load the curated SA2 boundary shapefile and display it
sa2_clean_shapefile_path = "../data/curated/sa2_boundary/SA2_2021_AUST_GDA2020_clean.shp"
sa2_boundary_gdf = gpd.read_file(sa2_clean_shapefile_path)
sa2_boundary_gdf

In [None]:
# Visualize the curated SA2 boundaries
fig, ax = plt.subplots(figsize=(12, 10))

# A thin outline keeps the many small metropolitan SA2 polygons distinguishable
sa2_boundary_gdf.plot(ax=ax, linewidth=0.2)

# Title the figure so it can be told apart from the raw-boundary plot when skimming
ax.set_title("SA2 boundaries (curated shapefile)");  # trailing ';' suppresses the text repr in the cell output