In [1]:
# download and preprocess data 
import requests
import pandas as pd
import csv
from pathlib import Path  

# population data
# Download the ABS regional-population workbook, save it under ../data/raw and
# read back the very file that was just written. (Previously the workbook was
# saved as 'population16-21.xlsx' but read from 'popu.xlsx', a file this code
# never creates.)
population_path = Path('../data/raw/population16-21.xlsx')
population_path.parent.mkdir(parents=True, exist_ok=True)  # fresh checkouts may lack the raw-data folder
r = requests.get('https://www.abs.gov.au/statistics/people/population/regional-population/2021/32180DS0001_2001-21.xlsx',allow_redirects=True)
r.raise_for_status()  # fail loudly rather than saving an HTTP error page as .xlsx
population_path.write_bytes(r.content)  # write_bytes opens and closes the file itself
df = pd.read_excel(population_path, sheet_name='Table 1', header=7)
# select Victoria data and modify column name
# Keep the SA2 code plus the six 'no..15'..'no..20' columns, which are then
# relabelled as the integer years 2016..2021.
df = df.loc[df['S/T name'] == 'Victoria', ['SA2 code','no..15','no..16','no..17','no..18','no..19','no..20']]
df = df.rename(columns={'no..15': 2016, 'no..16': 2017,'no..17': 2018,'no..18': 2019,'no..19': 2020,'no..20': 2021})

# income data
# Same pattern: download, save, then read the saved file (previously saved as
# 'income14-19.xlsx' but read from 'income.xlsx').
income_path = Path('../data/raw/income14-19.xlsx')
income_path.parent.mkdir(parents=True, exist_ok=True)
r = requests.get('https://www.abs.gov.au/statistics/labour/earnings-and-working-conditions/personal-income-australia/2014-15-2018-19/6524055002_DO001.xlsx',allow_redirects=True)
r.raise_for_status()
income_path.write_bytes(r.content)
dff = pd.read_excel(income_path, sheet_name='Table 1.4', header=6)
# select Victoria data and modify column name
# NOTE(review): rows 579..1040 are selected by position; presumably these are
# the Victorian SA2 rows of 'Table 1.4' - confirm against the workbook, since a
# layout change upstream would silently shift this window.
dff = dff.iloc[579:1041][['SA2','2014-15.4','2015-16.4','2016-17.4','2017-18.4','2018-19.4']]
dff = dff.rename(columns={'2014-15.4':'2014-15', '2015-16.4': '2015-16','2016-17.4': '2016-17','2017-18.4': '2017-18','2018-19.4': '2018-19'})





In [2]:


# natural growth rate without migration in Australia
# Used below as a one-year multiplicative factor (population * mor_fer_rate).
mor_fer_rate = 1.0072


def _covid_affected(row):
    """Return whether the row matches the 'influenced by Covid' pattern.

    True when the 2018 population exceeds the 2016 population compounded by
    two years of `mor_fer_rate`, and the 2021 population is below the 2019
    population compounded the same way.
    """
    grew_before = row[2016] * mor_fer_rate * mor_fer_rate < row[2018]
    slowed_after = row[2019] * mor_fer_rate * mor_fer_rate > row[2021]
    return grew_before and slowed_after


def predict22(row):
    """Project the 2022 population for one row, never below 0.

    `row` must be indexable by the integer years 2016, 2018, 2019 and 2021.
    """
    if _covid_affected(row):
        # have a high prob that this district is influenced by Covid:
        # grow the 2021 figure by one year of natural growth only
        return max(row[2021] * mor_fer_rate, 0)
    # have a high prob that this district is not very influenced by Covid:
    # extend the average yearly change over 2016-2021 by one year
    return max((row[2021] - row[2016]) / 5 + row[2021], 0)


def predict23(row):
    """Project the 2023 population for one row, never below 0.

    Uses the same two regimes as `predict22`, projected two years past 2021.
    The Covid branch starts from the 2021 figure; it previously compounded
    two years from 2020, which gives a 2022-level value rather than 2023.
    """
    if _covid_affected(row):
        return max(row[2021] * mor_fer_rate * mor_fer_rate, 0)
    return max((row[2021] - row[2016]) / 5 * 2 + row[2021], 0)

# Add one projected column per target year, computed row by row.
for target_year, predictor in ((2022, predict22), (2023, predict23)):
    df[target_year] = df.apply(predictor, axis=1)

# Cast the SA2 code column to int before exporting.
df = df.astype({'SA2 code': 'int'})

# Export only the SA2 code and the two projected years.
output_columns = ['SA2 code', 2022, 2023]
filepath = Path('../data/curated/predictpopu.csv')
df[output_columns].to_csv(filepath, index=False)





In [3]:
def rate(row):
    """Estimate the average yearly income growth rate for one row.

    The rate is the change from a base year to '2018-19', divided by the
    number of yearly steps between them, expressed as a fraction of the
    '2018-19' value. '2014-15' is the preferred base; '2016-17' is the
    fallback when '2014-15' is not an int.

    Returns 0 when no rate can be computed: '2018-19' is not an int (the
    original comments call such entries 'np'), '2018-19' is 0, or neither
    base year holds an int.
    """
    latest = row['2018-19']
    # Same int-only validity test as predict2019 uses, so both agree on
    # which rows have data. A latest value of 0 would divide by zero.
    if not isinstance(latest, int) or latest == 0:
        return 0

    # (base column, yearly steps from that column to '2018-19')
    base_candidates = (('2014-15', 4), ('2016-17', 2))
    for base_column, elapsed_years in base_candidates:
        base = row[base_column]
        if isinstance(base, int):
            return (latest - base) / elapsed_years / latest

    # Neither base year is usable (previously this raised TypeError).
    return 0

def predict2019(row):
    """Project the '2019' value from '2018-19' and the row's 'growthrate'.

    Returns 0.0 when '2018-19' is not an int (entry is np); otherwise the
    '2018-19' value grown by one times the growth rate, never below 0.
    """
    base_income = row['2018-19']
    if isinstance(base_income, int):
        projected = float(base_income) * (1 + 1 * row['growthrate'])
        return max(projected, 0)
    # entry is np
    return 0.0
def predict2020(row):
    """Project the '2020' value: '2019' grown by twice the growth rate, never below 0."""
    growth_factor = 1 + 2 * row['growthrate']
    projected = row['2019'] * growth_factor
    return max(projected, 0)
def predict2021(row):
    """Project the '2021' value: '2020' grown by 0.3 times the growth rate, never below 0."""
    growth_factor = 1 + 0.3 * row['growthrate']
    projected = row['2020'] * growth_factor
    return max(projected, 0)
def predict2022(row):
    """Project the '2022' value: '2021' grown by twice the growth rate, never below 0."""
    growth_factor = 1 + 2 * row['growthrate']
    projected = row['2021'] * growth_factor
    return max(projected, 0)
def predict2023(row):
    """Project the '2023' value: '2022' grown by twice the growth rate, never below 0."""
    growth_factor = 1 + 2 * row['growthrate']
    projected = row['2022'] * growth_factor
    return max(projected, 0)

# The growth rate comes first: every projection below reads it.
dff['growthrate'] = dff.apply(rate, axis=1)

# Each projection reads the column produced by the previous step, so the
# order of this list matters.
projection_steps = [
    ('2019', predict2019),
    ('2020', predict2020),
    ('2021', predict2021),
    ('2022', predict2022),
    ('2023', predict2023),
]
for year_label, predictor in projection_steps:
    dff[year_label] = dff.apply(predictor, axis=1)

# Export the SA2 identifier together with the five projected years.
output_columns = ['SA2', '2019', '2020', '2021', '2022', '2023']
filepath = Path('../data/curated/predictincome.csv')
dff[output_columns].to_csv(filepath, index=False)

