In [18]:
import geopandas
import folium
import io
import os
import requests
import zipfile
import pandas as pd

from pyspark.sql import SparkSession, functions as F
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService

In [12]:
# Create a spark session
spark = (
    SparkSession.builder.appName("BNPL Get external data")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

22/09/19 02:03:45 WARN Utils: Your hostname, DESKTOP-1ML24G5 resolves to a loopback address: 127.0.1.1; using 172.30.25.153 instead (on interface eth0)
22/09/19 02:03:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/19 02:03:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Location of root directory.

root_dir = '../data/tables/'

In [3]:
# Create "external_datasets" folder under the root directory where all the external data will be stored.

external_data_dir = 'test_external_datasets'
os.makedirs(root_dir + external_data_dir)

FileExistsError: [Errno 17] File exists: '../data/tables/test_external_datasets'

In [4]:
path = root_dir + external_data_dir + '/'

**Getting external datasets**<br>
1) URL retrieve:<br>
a) Postcode and SA2 data csv file.<br>
b) Total income 2014-2019 excel file.<br>
c) Shapefile for states.<br>
d) Shapefile for post-codes.<br>
2) API call:<br>
Population data 2001-2021 csv file.

In [7]:
# 1a) Postcode and SA2 data:

url = "https://www.matthewproctor.com/Content/postcodes/australian_postcodes.csv"
r = requests.get(url)
target_dir = path + 'postcode_SA2_data.csv'

with open(target_dir, 'wb') as outfile:
    outfile.write(r.content)
    outfile.close()

In [27]:
# 1b) Total income 2014-2019 excel file:

url = 'https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions/personal-income-australia/2014-15-2018-19/6524055002_DO001.xlsx'
r = requests.get(url)
target_dir = path + 'income_data.xlsx'

with open(target_dir, 'wb') as outfile:
    outfile.write(r.content)
    outfile.close()

# GO THROUGH WITH NOAH ***********************************************************************
# Convert needed sheet from excel file to csv format, and then delete the excel file.
# Is this in the correct format you want it Noah for your pre-processing functions to run?
read_file = pd.read_excel(target_dir, sheet_name='Table 1.4')
os.remove(target_dir)

target_dir = path + 'income_data_raw.csv'
read_file.to_csv(target_dir, index = None)
# df = pd.read_excel('data.xlsx', sheet_name=None)  

In [8]:
# 1c) Australian state shapefiles:

url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/STE_2021_AUST_SHP_GDA2020.zip"
target_dir = path + 'state_data.zip'
urlretrieve(url, target_dir)

# unzip state_data.zip
with zipfile.ZipFile(target_dir,"r") as zip_ref:
    zip_ref.extractall(path + "state_data")

In [9]:
# 1d) Australian post-code shapefiles:

url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/POA_2021_AUST_GDA94_SHP.zip"
target_dir = path + 'postcode_data.zip'
urlretrieve(url, target_dir)

# unzip state_data.zip
with zipfile.ZipFile(target_dir,"r") as zip_ref:
    zip_ref.extractall(path + "postcode_data")

In [10]:
# 2) API call:

# Set up API connection.

WFS_USERNAME = 'xrjps'
WFS_PASSWORD= 'Jmf16l4TcswU3Or7'
WFS_URL='https://adp.aurin.org.au/geoserver/wfs'

adp_client = WebFeatureService(url=WFS_URL,username=WFS_USERNAME, password=WFS_PASSWORD, version='2.0.0')

In [16]:
# Extract files and store into external dataset folder directory.

response = adp_client.getfeature(typename='datasource-AU_Govt_ABS-UoM_AURIN_DB_3:abs_regional_population_sa2_2001_2021', outputFormat='csv')
target_dir = path + 'population_data.csv'

out = open(target_dir, 'wb')
out.write(response.read())
out.close

<function BufferedWriter.close>