In [2]:
# Remove the leading '#' from the next line to install the pinned dependencies
# !pip3 install owslib==0.25.0 fiona==1.8.21 geopandas==0.10.2 requests==2.28.0 folium==0.12.1

Defaulting to user installation because normal site-packages is not writeable
Collecting owslib==0.25.0
  Using cached OWSLib-0.25.0-py2.py3-none-any.whl (216 kB)
Collecting geopandas==0.10.2
  Using cached geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
Collecting requests==2.28.0
  Using cached requests-2.28.0-py3-none-any.whl (62 kB)
Collecting folium==0.12.1
  Using cached folium-0.12.1-py2.py3-none-any.whl (94 kB)
Collecting pyyaml
  Downloading PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl (192 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.2/192.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting charset-normalizer~=2.0.0
  Downloading charset_normalizer-2.0.12-py3-none-any.whl (39 kB)
Installing collected packages: pyyaml, charset-normalizer, requests, owslib, geopandas, folium
  Attempting uninstall: charset-normalizer
    Found existing installation: charset-normalizer 2.1.0
    Uninstalling charset-normalizer-2.1.0:
      Success

In [3]:
from owslib.wfs import WebFeatureService
import geopandas
import folium
import io
import zipfile
import pandas as pd
import os
from urllib.request import urlretrieve

## Download external data from AURIN

In [4]:
# Credentials for the AURIN API are read from the environment so that secrets
# are never stored in (or committed with) the notebook. Set AURIN_WFS_USERNAME
# and AURIN_WFS_PASSWORD before starting the kernel.
# NOTE(review): a password was previously hardcoded in this cell and should be
# treated as exposed; rotate it.
WFS_USERNAME = os.environ.get('AURIN_WFS_USERNAME')
WFS_PASSWORD = os.environ.get('AURIN_WFS_PASSWORD')
WFS_URL = 'https://adp.aurin.org.au/geoserver/wfs'

# Surface a missing credential here rather than as an opaque failure later on
if not (WFS_USERNAME and WFS_PASSWORD):
    print('AURIN credentials not found: set AURIN_WFS_USERNAME and AURIN_WFS_PASSWORD.')

In [5]:
# Create the WFS 2.0.0 client for the AURIN endpoint using the configured credentials
adp_client = WebFeatureService(
    url=WFS_URL,
    username=WFS_USERNAME,
    password=WFS_PASSWORD,
    version='2.0.0',
)

In [6]:
def download_aurin_df(type_name, file_name, output_dir='../data/abs'):
    """
        Download a dataset from the AURIN API, save it as GML and load it.

        The raw response is written to ``<output_dir>/<file_name>.gml`` and
        that file is then read back with ``geopandas.read_file``.

        type_name: dataset identifier from the website
        file_name: output file name (without the .gml extension)
        output_dir: directory the .gml file is written to; created if missing

        Returns the result of ``geopandas.read_file`` for the saved file.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_dir, exist_ok=True)
    gml_path = os.path.join(output_dir, f'{file_name}.gml')

    response = adp_client.getfeature(typename=type_name)
    # The context manager flushes and closes the file even if the write fails
    with open(gml_path, 'wb') as out:
        out.write(response.read())

    return geopandas.read_file(gml_path)



In [13]:
# Fetch each AURIN layer used in this notebook; every call also writes a .gml copy to disk

pop_df = download_aurin_df(
    type_name='datasource-AU_Govt_ABS-UoM_AURIN_DB_3:abs_regional_population_sa2_2001_2021',
    file_name='2021_population_census',
)

sa2_bound = download_aurin_df(
    type_name='datasource-AU_Govt_ABS-UoM_AURIN_DB_GeoLevel:sa2_2016_aust',
    file_name='sa2_boundaries',
)

poa_bound = download_aurin_df(
    type_name='datasource-AU_Govt_ABS-UoM_AURIN_DB_GeoLevel:poa_2016_aust',
    file_name='poa_boundaries',
)

sa2_income = download_aurin_df(
    type_name='datasource-AU_Govt_ABS-UoM_AURIN_DB_3:abs_personal_income_total_income_distribution_sa2_2017_18',
    file_name='sa2_income',
)

In [7]:
# Keep the region identifier columns plus every column whose name mentions 2021 or 2020_21

area_id = [
    'gml_id',
    'primaryindex',
    'state_code_2016',
    'sa2_maincode_2016',
    'sa2_name_2016',
]
col_2021 = [
    name
    for name in pop_df.columns
    if any(tag in name for tag in ('2021', '2020_21'))
]
pop_21 = pop_df[[*area_id, *col_2021]]

In [9]:
# (rows, columns) of the 2021 subset
pop_21.shape

(2292, 18)

There should be 2,310 SA2 regions, but the population data contains only 2,292.

In [10]:
# Column dtypes and non-null counts for the 2021 subset.
# ERP refers to estimated resident population
pop_21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2292 entries, 0 to 2291
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   gml_id                           2292 non-null   object 
 1   primaryindex                     2292 non-null   int64  
 2   state_code_2016                  2292 non-null   int64  
 3   sa2_maincode_2016                2292 non-null   int64  
 4   sa2_name_2016                    2292 non-null   object 
 5   erp_2021                         2292 non-null   int64  
 6   erp_change_number_2020_21        2292 non-null   int64  
 7   erp_change_per_cent_2020_21      2292 non-null   float64
 8   pop_density_2021_people_per_km2  2292 non-null   float64
 9   births_2020_21                   2288 non-null   float64
 10  deaths_2020_21                   2288 non-null   float64
 11  natural_increase_2020_21         2288 non-null   float64
 12  internal_arrivals_20

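Four regions have no values for several 2020-21 columns (e.g. births, deaths and natural increase: 2,288 non-null out of 2,292).

**TODO:** Do we want to include these regions in the data?
If so, will we include the attributes with missing data?
If yes, how?
If no, why?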

In [11]:
# Rows where the 2020-21 births figure is missing
pop_21[pop_21["births_2020_21"].isna()]

Unnamed: 0,gml_id,primaryindex,state_code_2016,sa2_maincode_2016,sa2_name_2016,erp_2021,erp_change_number_2020_21,erp_change_per_cent_2020_21,pop_density_2021_people_per_km2,births_2020_21,deaths_2020_21,natural_increase_2020_21,internal_arrivals_2020_21,internal_departures_2020_21,net_internal_migration_2020_21,overseas_arrivals_2020_21,overseas_departures_2020_21,net_overseas_migration_2020_21
2231,abs_regional_population_sa2_2001_2021.2292,2292,9,901041004,Norfolk Island,1749,14,0.8069,45.251202,,,,,,,,,
2289,abs_regional_population_sa2_2001_2021.2289,2289,9,901011001,Christmas Island,1979,15,0.7637,14.537,,,,,,,,,
2290,abs_regional_population_sa2_2001_2021.2290,2290,9,901021002,Cocos (Keeling) Islands,579,6,1.0471,42.212601,,,,,,,,,
2291,abs_regional_population_sa2_2001_2021.2291,2291,9,901031003,Jervis Bay,397,2,0.5063,5.8543,,,,,,,,,


---
## Download Postcode to SA2 table

In [16]:
def download_url(url, filename, output_dir='../data/abs'):
    '''
        This function downloads data from the specified url and saves it
        as <output_dir>/<filename>.zip.

        url: url of specified website
        filename: output file name (without the .zip extension)
        output_dir: directory the file is saved in; created if missing

        Nothing is returned.
    '''
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_dir, exist_ok=True)

    # Interpolate the actual name; the old message printed the literal "file_name"
    print(f"Begin downloading {filename} data")
    # Use a dedicated name for the target path instead of overwriting output_dir
    zip_path = f"{output_dir}/{filename}.zip"
    urlretrieve(url, zip_path)
    print("Completed")

In [17]:
# Fetch the postcode (2011) to SA2 (2011) correspondence archive from abs.gov.au
download_url(
    url='https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&1270055006_CG_POSTCODE_2011_SA2_2011.zip&1270.0.55.006&Data%20Cubes&70A3CE8A2E6F9A6BCA257A29001979B2&0&July%202011&27.06.2012&Latest',
    filename='poa_sa2_lookup',
)

Begin downloading file_name data
Completed


In [18]:
# Open the downloaded lookup archive in read mode
unzip_poa_sa2 = zipfile.ZipFile(file='../data/abs/poa_sa2_lookup.zip', mode='r')

In [19]:
# Read the 'Table 3' sheet of the lookup workbook directly from the archive.
# The context manager closes the archive member once pandas has parsed it;
# previously the handle returned by ZipFile.open() was never closed.
# skiprows=5 skips the first five rows of the sheet — presumably preamble above
# the header row; TODO confirm against the workbook.
with unzip_poa_sa2.open('1270055006_CG_POSTCODE_2011_SA2_2011.xls') as lookup_xls:
    poa_to_sa2 = pd.read_excel(lookup_xls, sheet_name='Table 3', skiprows=5)

In [20]:
# Discard every row that has at least one missing value
poa_to_sa2 = poa_to_sa2.dropna(axis=0, how='any')

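There are 2,162 unique SA2 codes in the postcode-to-SA2 lookup, but the population data has 2,292 SA2 regions. Note that the lookup uses 2011 SA2 codes (`SA2_MAINCODE_2011`) while the population data uses 2016 codes (`sa2_maincode_2016`). **TODO:** try to find newer data for `poa_to_sa2`.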

In [22]:
# Number of distinct 2011 SA2 codes present in the lookup
len(poa_to_sa2["SA2_MAINCODE_2011"].unique())

2162