In [191]:
# import required libraries
import os
import numpy as np
import spicy as sp
import pandas as pd

# Data directories, relative to this notebook. Both keep their trailing
# slash because file names are appended to them by string concatenation.
curated_data = '../data/curated/'  # curated CSVs are read from / written to here
raw_data = '../data/raw/'          # raw input CSVs are read from here

In [192]:
# Load the raw population table; the CSV's first column becomes the index.
population_csv = raw_data + 'population.csv'
popu = pd.read_csv(population_csv, index_col=0)

In [193]:
# Keep only the SA2 identifier and the 2021 column for the SA2-level extract.
required = ['SA2_code', '2021']
popu_sa2 = popu.loc[:, required]

In [194]:
# Keep only Victorian rows: SA2 codes in the nine-digit 2xxxxxxxx range.
# The codes are cast to int before comparing, so the filter works whether
# the column was parsed as text or as numbers.
VIC_SA2_MIN, VIC_SA2_MAX = 200000000, 299999999

popu_codes = popu['SA2_code'].astype(int)
popu = popu.loc[popu_codes.between(VIC_SA2_MIN, VIC_SA2_MAX)]

popu_sa2_codes = popu_sa2['SA2_code'].astype(int)
popu_sa2 = popu_sa2.loc[popu_sa2_codes.between(VIC_SA2_MIN, VIC_SA2_MAX)]

In [195]:
# Persist the filtered SA2-level extract. The index is written as well
# (index=False is not passed here).
curated_population_csv = curated_data + 'population.csv'
popu_sa2.to_csv(curated_population_csv)

In [196]:
# Load the suburb table (its 'Suburb' and 'SA2_code' columns are used below);
# the CSV's first column becomes the index.
suburb_csv = raw_data + 'suburb.csv'
suburb = pd.read_csv(suburb_csv, index_col=0)

In [197]:
# Keep the first row seen for each suburb name, then discard every row that
# still has a missing value in any column.
# drop_duplicates(subset=...) filters rows directly, so the result does not
# depend on the index labels being unique.
suburb = suburb.drop_duplicates(subset='Suburb').dropna()

In [198]:
# Load the curated property listings.
properties_csv = curated_data + 'properties.csv'
properties = pd.read_csv(properties_csv)

In [199]:
# Count the properties listed in each suburb (non-null 'name' values per
# 'suburb' group) and expose the result as columns ['suburb', 'count'].
suburb_count = (
    properties.groupby('suburb')['name']
    .count()
    .rename('count')
    .reset_index()
)

In [200]:
# Upper-case the suburb names so they can be matched against
# suburb['Suburb'] in the join that follows
# (presumably those names are upper-case — TODO confirm).
suburb_count = suburb_count.assign(suburb=suburb_count['suburb'].str.upper())

In [201]:
# Attach the per-suburb property count to each (Suburb, SA2_code) pair.
# Suburbs with no matching count come out of the join as NaN and are
# dropped; the index is then renumbered from 0.
counts_by_suburb = suburb_count.set_index('suburb')
suburb_to_sa2 = suburb[['Suburb', 'SA2_code']]
suburb_count = (
    suburb_to_sa2.join(counts_by_suburb, on='Suburb')
    .dropna()
    .reset_index(drop=True)
)

In [202]:
# Total number of properties in each SA2 area, as columns ['SA2_code', 'sum'].
# Only the numeric 'count' column is aggregated: summing the whole frame
# would also try to "sum" the string 'Suburb' column, which pandas >= 2.0 no
# longer drops silently, so the result would not have the two expected columns.
sa2_count = (
    suburb_count.groupby('SA2_code', as_index=False)['count']
    .sum()
    .rename(columns={'count': 'sum'})
)

In [203]:
# Attach each suburb's SA2-wide property total (column 'sum') by matching
# on SA2_code.
sa2_totals = sa2_count.set_index('SA2_code')
suburb_count = suburb_count.join(sa2_totals, on='SA2_code')

In [204]:
# Share of the SA2's properties that fall in each suburb; used below as the
# weight for splitting the SA2 population between its suburbs.
suburb_count = suburb_count.assign(
    popu_ratio=suburb_count['count'].div(suburb_count['sum'])
)

In [205]:
# Bring the SA2-level population columns onto every suburb row (left join
# on SA2_code).
# NOTE(review): the join only matches when SA2_code has the same dtype in
# both frames — confirm, since popu's codes are cast with astype(int) when
# the Victorian rows are selected.
popu_by_sa2 = popu.set_index('SA2_code')
suburb_popu = suburb_count.join(popu_by_sa2, on='SA2_code')

In [206]:
# Scale the SA2-level figures down to suburb level using the property share.
# Columns 0-4 are Suburb, SA2_code, count, sum and popu_ratio; column 5 is
# the first column contributed by popu (presumably SA2_name — TODO confirm),
# so everything from position 6 onward is treated as numeric data.
# astype(int) truncates the fractional part and raises if any value is NaN.
first_value_col = 6
weights = suburb_popu['popu_ratio']
scaled_values = suburb_popu.iloc[:, first_value_col:].multiply(weights, axis="index")
suburb_popu.iloc[:, first_value_col:] = scaled_values.astype(int)

In [207]:
# Remove the helper and SA2-level columns that are not needed in the output.
unneeded_columns = [
    'SA2_code',
    'SA2_name',
    'count',
    'sum',
    'popu_ratio',
    'Area',
    'Population_density_2021',
]
suburb_popu = suburb_popu.drop(columns=unneeded_columns)

In [208]:
# Suburb name plus the 2021 figure only.
popu_2021 = suburb_popu.loc[:, ['Suburb', '2021']]

In [209]:
# Label the 2021 column as a population figure; 'Suburb' keeps its name.
popu_2021 = popu_2021.rename(columns={'2021': '2021_population'})

In [210]:
# Append '_population' to every column after the first one ('Suburb').
# idea from https://stackoverflow.com/questions/39772896/add-prefix-to-specific-columns-of-dataframe
# Columns that already carry the suffix are skipped, so re-running this cell
# does not produce names such as '2021_population_population'.
population_suffix = '_population'
new_names = [
    (column, column + population_suffix)
    for column in suburb_popu.columns[1:]
    if not column.endswith(population_suffix)
]
suburb_popu = suburb_popu.rename(columns=dict(new_names))

In [211]:
# Export the 2021 suburb populations; the row index is not written.
popu_2021_csv = curated_data + 'suburb_population2021.csv'
popu_2021.to_csv(popu_2021_csv, index=False)

In [213]:
# Export the full suburb population table; the row index is not written.
suburb_popu_csv = curated_data + 'suburb_population2001-2021.csv'
suburb_popu.to_csv(suburb_popu_csv, index=False)