In [34]:
# import required libraries
import os
import numpy as np
import spicy as sp
import pandas as pd

# Directory prefixes used by every read/write below. Both are relative
# paths and keep their trailing slash so a file name can be appended
# directly with `+`.
raw_data = "../../data/raw/"
curated_data = "../../data/curated/"

In [35]:
# Load the raw population table; the first CSV column becomes the index.
population_csv = raw_data + 'population.csv'
popu = pd.read_csv(population_csv, index_col=0)

In [36]:
# Keep only the SA2 identifier and the 2021 population column.
required = ['SA2_code', '2021']
popu_sa2 = popu.loc[:, required]

In [37]:
# Select data in Victoria
def _victoria_rows(frame):
    """Return the rows of `frame` whose SA2_code, cast to int, lies
    strictly between 199999999 and 300000000."""
    codes = frame['SA2_code'].astype(int)
    in_range = (codes > 199999999) & (codes < 300000000)
    return frame.loc[in_range]


# The same filter is applied to the full table and to the 2021 subset.
popu = _victoria_rows(popu)
popu_sa2 = _victoria_rows(popu_sa2)

In [38]:
# Persist the filtered SA2_code / 2021 subset (the index is written too,
# since `index` is left at its default).
curated_population_csv = curated_data + 'population.csv'
popu_sa2.to_csv(curated_population_csv)

In [39]:
# Load the raw suburb table; the first CSV column becomes the index.
suburb_csv = raw_data + 'suburb.csv'
suburb = pd.read_csv(suburb_csv, index_col=0)

In [40]:
# drop duplicate suburbs
# Keep the first row for each Suburb value, then discard any row that
# still has a missing value in any column. De-duplicating directly on the
# frame avoids writing NaN into the repeated names through an
# index-aligned column assignment, which depends on the row index being
# unique.
# NOTE(review): because de-duplication happens before dropna, a suburb
# whose first row is incomplete is removed entirely even if a later row
# for it is complete. This order is kept as-is -- confirm it is intended.
suburb = suburb.drop_duplicates(subset='Suburb').dropna()

In [41]:
# Load the curated properties table (default integer index).
properties_csv = curated_data + 'properties.csv'
properties = pd.read_csv(properties_csv)

In [42]:
# calculate the number of properties listed in each suburb
# (non-null `name` values per `suburb` group)
name_counts = properties.groupby('suburb', as_index=False)['name'].count()
suburb_count = name_counts.rename(columns={'name': 'count'})

In [43]:
# prepare for joining: upper-case the suburb names used as the join key
upper_suburbs = suburb_count['suburb'].str.upper()
suburb_count['suburb'] = upper_suburbs

In [44]:
# join the suburb -> SA2 lookup with the per-suburb property counts
suburb_to_sa2 = suburb[['Suburb', 'SA2_code']]
counts_by_suburb = suburb_count.set_index('suburb')
matched = suburb_to_sa2.join(counts_by_suburb, on='Suburb')
# rows with any missing value (e.g. suburbs without a count) are dropped
# and the index is renumbered from zero
suburb_count = matched.dropna().reset_index(drop=True)

In [47]:
# calculate the number of properties in each sa2 area
# Sum only the numeric 'count' column. Summing the whole frame also
# touches the string 'Suburb' column: older pandas drops it with a
# FutureWarning, while newer pandas keeps it (concatenating the names),
# which yields a third column and breaks the two-name relabel below.
sa2_count = suburb_count.groupby('SA2_code', as_index=False)['count'].sum()
sa2_count.columns = ['SA2_code', 'sum']

  sa2_count = suburb_count.groupby(['SA2_code'], as_index=False).sum()


In [48]:
# attach each SA2's property total to every suburb row in that SA2
sa2_totals = sa2_count.set_index('SA2_code')
suburb_count = suburb_count.join(sa2_totals, on='SA2_code')

In [49]:
# calculate each suburb's share of the properties in its SA2 area
properties_in_suburb = suburb_count['count']
properties_in_sa2 = suburb_count['sum']
suburb_count['popu_ratio'] = properties_in_suburb / properties_in_sa2

In [50]:
# attach the SA2-level population columns to every suburb row
popu_by_sa2 = popu.set_index('SA2_code')
suburb_popu = suburb_count.join(popu_by_sa2, on='SA2_code')

In [51]:
# preview the joined suburb / SA2 population table
suburb_popu

Unnamed: 0,Suburb,SA2_code,count,sum,popu_ratio,SA2_name,2001,2002,2003,2004,...,2014,2015,2016,2017,2018,2019,2020,2021,Area,Population_density_2021
0,MELBOURNE,206041122.0,482.0,482.0,1.000000,Melbourne,7644.0,9592.0,11400.0,12727.0,...,33626.0,37162.0,40181.0,44599.0,47615.0,49743.0,50425.0,43823.0,2.4,18498.5
1,EAST MELBOURNE,206041119.0,32.0,32.0,1.000000,East Melbourne,3731.0,3859.0,4243.0,4460.0,...,5374.0,5411.0,5475.0,5495.0,5409.0,5413.0,5378.0,4962.0,2.9,1711.2
2,WEST MELBOURNE,206041127.0,43.0,43.0,1.000000,West Melbourne,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,3.0,3.0,3.0,0.0,0.0,6.2,0.0
3,SOUTHBANK,206051132.0,149.0,216.0,0.689815,South Melbourne,8529.0,8771.0,9021.0,9375.0,...,11972.0,12253.0,12492.0,12700.0,12956.0,13153.0,13144.0,12330.0,2.5,4942.3
4,DOCKLANDS,206041118.0,111.0,111.0,1.000000,Docklands,154.0,926.0,1913.0,3089.0,...,9170.0,10444.0,11832.0,13923.0,15439.0,16375.0,16678.0,15942.0,2.4,6522.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639,DALYSTON,205031093.0,1.0,16.0,0.062500,Wonthaggi - Inverloch,17647.0,17801.0,17956.0,18117.0,...,21611.0,22039.0,22577.0,23270.0,23966.0,24739.0,25631.0,26723.0,728.8,36.7
640,CAPE PATERSON,205031093.0,1.0,16.0,0.062500,Wonthaggi - Inverloch,17647.0,17801.0,17956.0,18117.0,...,21611.0,22039.0,22577.0,23270.0,23966.0,24739.0,25631.0,26723.0,728.8,36.7
641,NORTH WONTHAGGI,205031093.0,1.0,16.0,0.062500,Wonthaggi - Inverloch,17647.0,17801.0,17956.0,18117.0,...,21611.0,22039.0,22577.0,23270.0,23966.0,24739.0,25631.0,26723.0,728.8,36.7
642,WONTHAGGI,205031093.0,5.0,16.0,0.312500,Wonthaggi - Inverloch,17647.0,17801.0,17956.0,18117.0,...,21611.0,22039.0,22577.0,23270.0,23966.0,24739.0,25631.0,26723.0,728.8,36.7


In [52]:
# calculate population of suburbs according to the properties ratio
# Every column from position 6 onward (the columns that follow SA2_name
# in the joined frame) is multiplied row-wise by the suburb's popu_ratio
# and truncated to an integer.
# The result is assigned by column label rather than through
# `.iloc[:, 6:] = ...`: iloc assignment writes into the existing float
# columns on newer pandas, which would silently discard the int cast.
# NOTE: astype(int) raises if any scaled value is NaN.
scaled_cols = list(suburb_popu.columns[6:])
suburb_popu[scaled_cols] = (
    suburb_popu[scaled_cols]
    .multiply(suburb_popu['popu_ratio'], axis="index")
    .astype(int)
)

  suburb_popu.iloc[:, 6:] = suburb_popu.iloc[:, 6:].multiply(suburb_popu['popu_ratio'], axis="index").astype(int)


In [53]:
# discard columns that are not required in the output
not_required = [
    'SA2_code',
    'SA2_name',
    'count',
    'sum',
    'popu_ratio',
    'Area',
    'Population_density_2021',
]
suburb_popu = suburb_popu.drop(columns=not_required)

In [54]:
# suburb name plus its 2021 population estimate only
popu_2021 = suburb_popu.loc[:, ['Suburb', '2021']]

In [55]:
# relabel the two columns for the exported file
popu_2021_labels = ['Suburb', '2021_population']
popu_2021.columns = popu_2021_labels

In [56]:
# idea from https://stackoverflow.com/questions/39772896/add-prefix-to-specific-columns-of-dataframe
# Append '_population' to every column label except the first one.
# Labels that already carry the suffix are skipped, so running this cell
# a second time does not produce '..._population_population'.
new_names = [
    (col, col + '_population')
    for col in suburb_popu.columns[1:]
    if not col.endswith('_population')
]
suburb_popu = suburb_popu.rename(columns=dict(new_names))

In [57]:
# export the 2021 suburb populations without the row index
suburb_2021_csv = curated_data + 'suburb_population2021.csv'
popu_2021.to_csv(suburb_2021_csv, index=False)

In [58]:
# export the full suburb population table without the row index
suburb_all_years_csv = curated_data + 'suburb_population2001-2021.csv'
suburb_popu.to_csv(suburb_all_years_csv, index=False)