In [1]:
# import required libraries
import os
import numpy as np
import spicy as sp
import pandas as pd

# Relative directories used by every cell below:
#   raw_data     - where the input CSVs are read from
#   curated_data - where the processed CSVs are read from / written to
raw_data, curated_data = '../data/raw/', '../data/curated/'

In [2]:
# Load the raw population table from the raw-data directory.
population_src = raw_data + 'population.csv'
popu = pd.read_csv(population_src)

In [3]:
# Keep only the SA2 identifier and the year-labelled columns used later on.
year_columns = ['2014', '2015', '2016', '2017', '2018', '2021']
required = ['SA2_code', *year_columns]
popu = popu.loc[:, required]

In [4]:
# Keep the Victorian rows only: those whose SA2 code, read as an integer,
# falls in 200000000..299999999 (both ends included).
sa2_as_int = popu['SA2_code'].astype(int)
popu = popu.loc[sa2_as_int.between(200000000, 299999999)]

In [5]:
# Persist the filtered population table to the curated directory.
# The DataFrame index is written too (pandas default), so the file gets an
# unnamed leading column.
population_dst = curated_data + 'population.csv'
popu.to_csv(population_dst)

In [6]:
# Load the suburb lookup and discard the 'Unnamed: 0' column
# (presumably a saved index from an earlier export - TODO confirm).
suburb_src = raw_data + 'suburb.csv'
suburb = pd.read_csv(suburb_src).drop(columns=['Unnamed: 0'])

In [7]:
# Keep the first row for every suburb name, then discard any row that still
# has a missing value in any column.
suburb = suburb.drop_duplicates(subset=['Suburb']).dropna()

In [8]:
# Load the curated property listings.
properties_src = curated_data + 'properties.csv'
properties = pd.read_csv(properties_src)

In [9]:
# Count the properties (rows with a non-null 'name') in each suburb.
# The suburb name is upper-cased *before* grouping so that spellings that
# differ only by case ("Carlton" / "CARLTON") are counted together. Grouping
# first and upper-casing afterwards would leave duplicate suburb keys, which
# duplicate rows in the later join and split that suburb's ratio.
suburb_key = properties['suburb'].str.upper()
suburb_count = (
    properties.groupby(suburb_key)['name']
    .count()
    .rename('count')
    .reset_index()
)
# Resulting columns: ['suburb', 'count']

In [10]:
# Upper-case the suburb names so they can be matched against the 'Suburb'
# column of the lookup table in the join that follows.
suburb_count = suburb_count.assign(suburb=suburb_count['suburb'].str.upper())

In [11]:
# Attach the per-suburb property counts to the suburb -> SA2 lookup.
# Rows with any missing value after the join (e.g. suburbs without a count)
# are dropped and the index is renumbered from zero.
suburb_to_sa2 = suburb[['Suburb', 'SA2_code']]
counts_by_suburb = suburb_count.set_index('suburb')
suburb_count = suburb_to_sa2.join(counts_by_suburb, on='Suburb')
suburb_count = suburb_count.dropna().reset_index(drop=True)

In [12]:
# Total number of properties in each SA2 area.
# Only the numeric 'count' column is aggregated. Summing the whole frame would
# also sum the text 'Suburb' column: pandas >= 2.0 no longer drops it silently
# (the strings are concatenated), which leaves three columns and makes a
# two-name column assignment raise ValueError.
sa2_count = (
    suburb_count.groupby('SA2_code', as_index=False)[['count']]
    .sum()
    .rename(columns={'count': 'sum'})
)
# Resulting columns: ['SA2_code', 'sum']

In [13]:
# Bring the SA2-level total ('sum') onto every suburb row of that SA2 area.
sa2_totals = sa2_count.set_index('SA2_code')
suburb_count = suburb_count.join(sa2_totals, on='SA2_code')

In [14]:
# Share of the SA2 area's properties that fall in each suburb:
# suburb count divided by the SA2 total.
suburb_count['popu_ratio'] = suburb_count['count'].div(suburb_count['sum'])

In [15]:
# Add the SA2-level population columns to every suburb row, matched on SA2_code.
popu_by_sa2 = popu.set_index('SA2_code')
suburb_popu = suburb_count.join(popu_by_sa2, on='SA2_code')

In [16]:
# Apportion each SA2 population figure to its suburbs using the property ratio,
# row by row, and store the result as whole numbers (fraction truncated).
# NOTE(review): astype(int) fails if a row has no population value after the
# join - confirm every SA2_code here is present in the population table.
years = ['2014', '2015', '2016', '2017', '2018', '2021']
scaled_population = suburb_popu[years].mul(suburb_popu['popu_ratio'], axis=0)
suburb_popu[years] = scaled_population.astype(int)

In [17]:
# Remove the helper columns; only 'Suburb' and the year columns remain.
helper_columns = ['SA2_code', 'count', 'sum', 'popu_ratio']
suburb_popu = suburb_popu.drop(columns=helper_columns)

In [19]:
# Separate table holding only the suburb name and its 2021 figure.
popu_2021 = suburb_popu.loc[:, ['Suburb', '2021']]

In [21]:
# Give the 2021 column a descriptive name for the exported file.
popu_2021 = popu_2021.rename(columns={'2021': '2021_population'})

In [22]:
# The 2021 figures are exported separately, so remove them from the main table.
suburb_popu = suburb_popu.drop(columns=['2021'])

In [23]:
# Give each remaining year column a descriptive '<year>_population' name.
suburb_popu = suburb_popu.rename(columns={
    '2014': '2014_population',
    '2015': '2015_population',
    '2016': '2016_population',
    '2017': '2017_population',
    '2018': '2018_population',
})

In [27]:
# Export the 2021 suburb populations (index column omitted).
popu_2021_dst = curated_data + 'suburb_population2021.csv'
popu_2021.to_csv(popu_2021_dst, index=False)

In [28]:
# Export the 2014-2018 suburb populations (index column omitted).
suburb_popu_dst = curated_data + 'suburb_population2014-2018.csv'
suburb_popu.to_csv(suburb_popu_dst, index=False)