In [1]:
import geopandas as gpd
import pandas as pd
import requests
from lxml import html
import os
import csv



In [2]:
# Reference postcodes: the POA_CODE21 column of the postal-area shapefile.
postcode = list(
    gpd.read_file("../data/ABS/post_shapefile/POA_2021_AUST_GDA2020.shp")["POA_CODE21"]
)

# Distinct postcodes from the consumer table, rendered as strings and
# left-padded with zeros to a width of four characters.
consumer = [
    f"{str(raw_code):0>4}"
    for raw_code in pd.read_csv("../data/tables/tbl_consumer.csv", delimiter="|")["postcode"].unique()
]

In [3]:
# Consumer postcodes that do not appear among the shapefile postcodes.
# Membership is checked against a set so each lookup is a hash probe rather
# than a linear scan of the whole postcode list; the result and its order
# (that of `consumer`) are unchanged.
known_postcodes = set(postcode)
to_do = [code for code in consumer if code not in known_postcodes]
print(f"Number of mismatched postcodes (PO Box): {len(to_do)}")

Number of mismatched postcodes (PO Box): 527


In [4]:
merchant_consumer_info = pd.read_parquet("../data/curated/merchant_consumer_info")

# Rows whose consumer postcode is one of the unmatched codes collected in `to_do`.
is_unmatched = merchant_consumer_info["consumer_postcode"].isin(to_do)
missing_transactions = len(merchant_consumer_info.loc[is_unmatched])

print(f"Total of transactions: {len(merchant_consumer_info)}")
print(f"Number of transactions about PO Box: {missing_transactions}")

Total of transactions: 13306364
Number of transactions about PO Box: 2132779


In [5]:
def get_suburb(pobox: str, timeout: float = 30) -> str:
    """Return the first suburb the Australia Post site lists for a postcode.

    Fetches ``https://auspost.com.au/postcode/<pobox>`` and returns the text of
    the first ``<a class="result_1">`` link found inside a ``<td class="second">``
    cell.

    Args:
        pobox: Postcode to look up.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled connection
            would hang the whole scrape. The default of 30 is an arbitrary
            but generous limit.

    Returns:
        The link text. Presumably of the form "<suburb>, <state>" -- the
        calling loop splits it on ", " -- but that format is not checked here.

    Raises:
        IndexError: The page contains no matching link.
        requests.exceptions.RequestException: The request failed or timed out.
    """
    r = requests.get(f"https://auspost.com.au/postcode/{pobox}", timeout=timeout)
    tree = html.fromstring(r.content)
    suburb = tree.xpath('//td[@class="second"]//a[@class="result_1"]/text()')
    if not suburb:
        # Same exception type as the bare suburb[0] lookup would raise, but
        # with enough context to tell which postcode had no result.
        raise IndexError(f"no suburb listed for postcode {pobox!r}")
    return suburb[0]

def get_post(pobox: str,
             sub: str,
             timeout: float = 30) -> list:
    """Find the non-PO-box postcodes that the Australia Post site lists for a suburb.

    Searches the site for the part of ``sub`` before the first ", " (lowercased)
    and reads three columns from the result page: postcode links from
    ``td.first``, suburb links from ``td.second`` and the text of ``td.third``.
    The columns are paired by position.

    Args:
        pobox: Postcode to exclude from the result (the PO box's own postcode).
        sub: Suburb text to match exactly against the second column.
        timeout: Seconds to wait for the server before giving up; without it
            ``requests`` waits indefinitely. The default of 30 is arbitrary.

    Returns:
        Postcodes, in page order, of the rows whose suburb text equals ``sub``,
        whose postcode differs from ``pobox`` and whose third-column text is
        not "Post Office Boxes". Empty when nothing qualifies.

    Raises:
        IndexError: The suburb or third column has fewer entries than the
            postcode column at a position that needs to be read.
        requests.exceptions.RequestException: The request failed or timed out.
    """
    query = sub.split(', ')[0].lower()
    r = requests.get(f"https://auspost.com.au/postcode/{query}", timeout=timeout)
    tree = html.fromstring(r.content)
    # Local names avoid shadowing the builtin `type` and the notebook-level
    # `postcode` list.
    postcodes = tree.xpath('//td[@class="first"]//a/text()')
    suburbs = tree.xpath('//td[@class="second"]//a/text()')
    categories = tree.xpath('//td[@class="third"]/text()')
    matches = []
    # Index-based pairing is deliberate: a column that is too short raises
    # IndexError instead of silently pairing a postcode with the wrong row.
    for idx, code in enumerate(postcodes):
        if suburbs[idx] == sub and code != pobox and categories[idx] != "Post Office Boxes":
            matches.append(code)
    return matches

# Lookup table rows: a header, then one row per resolved PO box postcode made of
# that postcode followed by every matching postcode found for its suburb, so a
# row may hold more values than the two header columns.
out = [['pobox', 'postcode']]
# PO box postcode -> description of the error that stopped its lookup. Kept so
# that failures (including network errors) are visible rather than silently
# dropped.
skipped = {}
for pobox in to_do:
    try:
        sub = get_suburb(pobox)
        matches = get_post(pobox, sub)
        # use closest suburb by name if no other postcodes are found
        # e.g. perth bc (business centre), wa -> perth, wa
        # e.g. kent town dc, sa -> kent town, sa
        if not matches and ", " in sub:
            words = sub.split()
            # Drop the last word of the suburb name (and the state token),
            # then re-attach the state.
            base_name = ' '.join(words[:-2])
            # A one-word suburb ("brisbane, qld") leaves no name to search for;
            # skip the retry instead of requesting the bare /postcode/ URL.
            # NOTE(review): the original notes also listed
            # "brisbane, qld -> brisbane city, qld", but nothing here produces
            # that mapping -- confirm whether it is still wanted.
            if base_name:
                cur_sub = f"{base_name}, {words[-1]}"
                matches = get_post(pobox, cur_sub)
        # the rest are mostly university or shopping centre po boxes and will be discarded
        if matches:
            out.append([pobox] + matches)
    except Exception as e:
        # large volume receivers have been discontinued so these will be treated as outliers and skipped
        # Still best-effort: the loop keeps going, but the reason is recorded
        # so a network outage is distinguishable from a missing suburb.
        skipped[pobox] = f"{type(e).__name__}: {e}"

print(f"Resolved {len(out) - 1} of {len(to_do)} PO box postcodes; "
      f"{len(skipped)} lookups raised an error (see `skipped`)")

In [None]:
os.makedirs("../data/meta/", exist_ok=True)
# newline="" leaves line endings to the csv module. Without it, platforms that
# translate "\n" on write (e.g. Windows) turn the writer's "\r\n" into
# "\r\r\n", which shows up as a blank line after every row.
with open("../data/meta/pobox.csv", "w", newline="") as f:
    csv.writer(f).writerows(out)