## Notebook to preprocess all external data for each SA2 region

In [1]:
import pandas as pd
import geopandas as gpd

In [2]:
## Population Data

In [3]:
# Read the GeoPackage
population_gdf = gpd.read_file("../data/population/population_extracted/32180_ERP_2023_SA2_GDA2020.gpkg")


In [4]:
# extract SA2 name and ERP (estimated residential population) for victoria only

population_gdf = population_gdf[population_gdf['State_name_2021'] == 'Victoria']

population_gdf = population_gdf[[ 'SA2_name_2021', 
 'ERP_2001',
 'ERP_2002',
 'ERP_2003',
 'ERP_2004',
 'ERP_2005',
 'ERP_2006',
 'ERP_2007',
 'ERP_2008',
 'ERP_2009',
 'ERP_2010',
 'ERP_2011',
 'ERP_2012',
 'ERP_2013',
 'ERP_2014',
 'ERP_2015',
 'ERP_2016',
 'ERP_2017',
 'ERP_2018',
 'ERP_2019',
 'ERP_2020',
 'ERP_2021',
 'ERP_2022',
 'ERP_2023']]

In [5]:
population_gdf

Unnamed: 0,SA2_name_2021,ERP_2001,ERP_2002,ERP_2003,ERP_2004,ERP_2005,ERP_2006,ERP_2007,ERP_2008,ERP_2009,...,ERP_2014,ERP_2015,ERP_2016,ERP_2017,ERP_2018,ERP_2019,ERP_2020,ERP_2021,ERP_2022,ERP_2023
642,Alfredton,5756.0,6092.0,6293.0,6480.0,6648.0,6761.0,7034.0,7272.0,7614.0,...,10338.0,11039.0,11852,12649,13537,14434,15507,16841,18002,18997
643,Ballarat,11497.0,11708.0,12015.0,12189.0,12269.0,12356.0,12408.0,12480.0,12476.0,...,12327.0,12300.0,12301,12266,12244,12320,12196,12071,11938,11809
644,Buninyong,5320.0,5399.0,5557.0,5620.0,5857.0,6037.0,6131.0,6252.0,6431.0,...,7082.0,7191.0,7311,7409,7418,7458,7377,7229,7247,7323
645,Delacombe,4154.0,4225.0,4371.0,4465.0,4704.0,5041.0,5206.0,5349.0,5557.0,...,6583.0,6846.0,7195,7622,8183,8890,9755,10648,11798,12869
646,Smythes Creek,3317.0,3378.0,3411.0,3473.0,3508.0,3542.0,3594.0,3658.0,3714.0,...,3945.0,3966.0,3990,4004,4042,4112,4152,4211,4223,4268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159,Otway,3452.0,3479.0,3511.0,3511.0,3492.0,3459.0,3489.0,3501.0,3490.0,...,3519.0,3538.0,3556,3635,3710,3802,3911,3979,3974,3983
1160,Moyne - East,6718.0,6704.0,6676.0,6643.0,6638.0,6652.0,6606.0,6631.0,6703.0,...,6734.0,6716.0,6709,6717,6746,6798,6883,6990,7046,7132
1161,Moyne - West,8317.0,8387.0,8450.0,8487.0,8517.0,8601.0,8694.0,8792.0,8878.0,...,9383.0,9467.0,9603,9686,9783,9845,9859,9967,10098,10148
1162,Warrnambool - North,17053.0,17449.0,17726.0,17937.0,18172.0,18528.0,18877.0,19107.0,19369.0,...,20930.0,21217.0,21442,21688,21954,22184,22416,22470,22586,22762


In [6]:
population_df = pd.DataFrame(population_gdf)

In [7]:
## Homelessness

In [8]:
# must conda/pip install openpyxl

homelessness_df_21 = pd.read_excel('../data/homelessness/homelessness21.xlsx', sheet_name='Table_5.3')

In [9]:
homelessness_df_21 = homelessness_df_21.loc[775:1375]
homelessness_df_21 = homelessness_df_21[pd.notna(homelessness_df_21['Unnamed: 3'])]
homelessness_df_21 = homelessness_df_21.iloc[:, -2:]
homelessness_df_21.columns = ['SA2_name_2021', 'all_homeless_persons_2021']

In [10]:
homelessness_df_21

Unnamed: 0,SA2_name_2021,all_homeless_persons_2021
775,Buninyong,43
776,Delacombe,43
777,Smythes Creek,5
778,Wendouree - Miners Rest,93
779,Ballarat East - Warrenheip,148
...,...,...
1370,Otway,27
1372,Moyne - East,25
1373,Moyne - West,6
1374,Warrnambool - North,95


In [11]:
# must conda/pip install openpyxl

homelessness_df_16 = pd.read_excel('../data/homelessness/homelessness16.xlsx', sheet_name='Table_5.3')

In [12]:
homelessness_df_16 = homelessness_df_16.loc[707:1249]
homelessness_df_16 = homelessness_df_16[pd.notna(homelessness_df_16['Unnamed: 3'])]
homelessness_df_16 = homelessness_df_16.iloc[:, -2:]
homelessness_df_16.columns = ['SA2_name_2021', 'all_homeless_persons_2016']

In [13]:
homelessness_df_16

Unnamed: 0,SA2_name_2021,all_homeless_persons_2016
707,Alfredton,3
708,Ballarat,123
709,Ballarat - North,92
710,Ballarat - South,74
711,Buninyong,0
...,...,...
1244,Otway,3
1246,Moyne - East,6
1247,Moyne - West,16
1248,Warrnambool - North,87


In [14]:
# must conda/pip install openpyxl

homelessness_df_11 = pd.read_excel('../data/homelessness/homelessness11.xlsx', sheet_name='Table_1')

In [15]:
homelessness_df_11 = homelessness_df_11.loc[668:1180]
homelessness_df_11 = homelessness_df_11[pd.notna(homelessness_df_11['Unnamed: 3'])]
homelessness_df_11 = homelessness_df_11.iloc[:, -2:]
homelessness_df_11.columns = ['SA2_name_2021', 'all_homeless_persons_2011']

In [16]:
homelessness_df_11

Unnamed: 0,SA2_name_2021,all_homeless_persons_2011
668,Alfredton,10
669,Ballarat,97
670,Ballarat - North,84
671,Ballarat - South,128
672,Buninyong,3
...,...,...
1176,Moyne - East,0
1177,Moyne - West,3
1178,Otway,9
1179,Warrnambool - North,74


In [17]:
homelessness_df = homelessness_df_21.merge(homelessness_df_16, on='SA2_name_2021').merge(homelessness_df_11, on='SA2_name_2021')

In [18]:
# socioeconomic

In [19]:
# must conda/pip install openpyxl

socioeconomic_df_21 = pd.read_excel('../data/socioeconomic/socioeconomic21.xlsx', sheet_name='Table 1')

In [20]:
socioeconomic_df_21 = socioeconomic_df_21.loc[634:1149]
socioeconomic_df_21 = socioeconomic_df_21.iloc[:, [1, 4]]
socioeconomic_df_21.columns = ['SA2_name_2021', 'Index of Relative Socio-economic Advantage and Disadvantage 2021']

In [21]:
socioeconomic_df_21

Unnamed: 0,SA2_name_2021,Index of Relative Socio-economic Advantage and Disadvantage 2021
634,Alfredton,1011
635,Ballarat,1040
636,Buninyong,1040
637,Delacombe,947
638,Smythes Creek,1005
...,...,...
1145,Otway,977
1146,Moyne - East,986
1147,Moyne - West,1005
1148,Warrnambool - North,956


In [22]:
# must conda/pip install xlrd

socioeconomic_df_16 = pd.read_excel('../data/socioeconomic/socioeconomic16.xlsx', sheet_name='Table 1')

In [23]:
socioeconomic_df_16 = socioeconomic_df_16.loc[565:1019]
socioeconomic_df_16 = socioeconomic_df_16.iloc[:, [1, 4]]
socioeconomic_df_16.columns = ['SA2_name_2021', 'Index of Relative Socio-economic Advantage and Disadvantage 2016']

In [24]:
socioeconomic_df_16

Unnamed: 0,SA2_name_2021,Index of Relative Socio-economic Advantage and Disadvantage 2016
565,Alfredton,1025
566,Ballarat,1037
567,Ballarat - North,983
568,Ballarat - South,912
569,Buninyong,1051
...,...,...
1015,Otway,976
1016,Moyne - East,995
1017,Moyne - West,998
1018,Warrnambool - North,958


In [25]:
# must conda/pip install xlrd

socioeconomic_df_11 = pd.read_excel('../data/socioeconomic/socioeconomic11.xlsx', sheet_name='Table 2')

In [26]:
socioeconomic_df_11 = socioeconomic_df_11.loc[1459:1882]
socioeconomic_df_11 = socioeconomic_df_11.iloc[:, [1, 3]]
socioeconomic_df_11.columns = ['SA2_name_2021', 'Index of Relative Socio-economic Advantage and Disadvantage 2011']

In [27]:
socioeconomic_df_11

Unnamed: 0,SA2_name_2021,Index of Relative Socio-economic Advantage and Disadvantage 2011
1459,Alfredton,1036.118125
1460,Ballarat,1023.752255
1461,Ballarat - North,982.705859
1462,Ballarat - South,924.379674
1463,Buninyong,1064.12227
...,...,...
1878,Moyne - East,999.233
1879,Moyne - West,1007.253286
1880,Otway,976.754857
1881,Warrnambool - North,965.073742


In [28]:
socioeconomic_df = socioeconomic_df_21.merge(socioeconomic_df_16, on='SA2_name_2021').merge(socioeconomic_df_11, on='SA2_name_2021')

In [29]:
socioeconomic_df

Unnamed: 0,SA2_name_2021,Index of Relative Socio-economic Advantage and Disadvantage 2021,Index of Relative Socio-economic Advantage and Disadvantage 2016,Index of Relative Socio-economic Advantage and Disadvantage 2011
0,Alfredton,1011,1025,1036.118125
1,Ballarat,1040,1037,1023.752255
2,Buninyong,1040,1051,1064.12227
3,Delacombe,947,940,947.363463
4,Smythes Creek,1005,1012,1022.078563
...,...,...,...,...
343,Otway,977,976,976.754857
344,Moyne - East,986,995,999.233
345,Moyne - West,1005,998,1007.253286
346,Warrnambool - North,956,958,965.073742


In [30]:
## inflation

In [31]:
# lets extract housing CPI index, as well as overall CPI with housing removed as a measure of all other inflation
# this will be best for analysis to keep both variables as independent as possible

In [32]:
housing_cpi_df = pd.read_excel('../data/inflation/inflation.xlsx', sheet_name='Data1')

In [33]:
excluding_housing_cpi_df = pd.read_excel('../data/inflation/inflation.xlsx', sheet_name='Data2')

In [34]:
housing_cpi_df = housing_cpi_df[['Unnamed: 0','Index Numbers ;  Housing ;  Melbourne ;']]

In [35]:
excluding_housing_cpi_df = excluding_housing_cpi_df[['Unnamed: 0','Index Numbers ;  All groups CPI excluding Housing ;  Melbourne ;']]

In [36]:
housing_cpi_df = housing_cpi_df.rename(columns={'Unnamed: 0': "quarter"})

In [37]:
inflation_df = pd.concat([housing_cpi_df,excluding_housing_cpi_df],axis=1).drop('Unnamed: 0', axis=1).dropna().iloc[9:].rename(columns={'Index Numbers ;  Housing ;  Melbourne ;': "housing_index", 'Index Numbers ;  All groups CPI excluding Housing ;  Melbourne ;': 'CPI_without_housing'})

In [38]:
inflation_df

Unnamed: 0,quarter,housing_index,CPI_without_housing
105,1972-09-01 00:00:00,11.2,11.3
106,1972-12-01 00:00:00,11.3,11.5
107,1973-03-01 00:00:00,11.5,11.7
108,1973-06-01 00:00:00,11.7,12.1
109,1973-09-01 00:00:00,12,12.6
...,...,...,...
308,2023-06-01 00:00:00,144,130.6
309,2023-09-01 00:00:00,147.5,131.8
310,2023-12-01 00:00:00,148.2,132.7
311,2024-03-01 00:00:00,149.3,134.1


In [39]:
# Now let's combine our external datasets

In [40]:
external_df = population_df.merge(socioeconomic_df, on='SA2_name_2021').merge(homelessness_df, on='SA2_name_2021')

In [41]:
external_df

Unnamed: 0,SA2_name_2021,ERP_2001,ERP_2002,ERP_2003,ERP_2004,ERP_2005,ERP_2006,ERP_2007,ERP_2008,ERP_2009,...,ERP_2020,ERP_2021,ERP_2022,ERP_2023,Index of Relative Socio-economic Advantage and Disadvantage 2021,Index of Relative Socio-economic Advantage and Disadvantage 2016,Index of Relative Socio-economic Advantage and Disadvantage 2011,all_homeless_persons_2021,all_homeless_persons_2016,all_homeless_persons_2011
0,Buninyong,5320.0,5399.0,5557.0,5620.0,5857.0,6037.0,6131.0,6252.0,6431.0,...,7377,7229,7247,7323,1040,1051,1064.12227,43,0,3
1,Delacombe,4154.0,4225.0,4371.0,4465.0,4704.0,5041.0,5206.0,5349.0,5557.0,...,9755,10648,11798,12869,947,940,947.363463,43,38,33
2,Smythes Creek,3317.0,3378.0,3411.0,3473.0,3508.0,3542.0,3594.0,3658.0,3714.0,...,4152,4211,4223,4268,1005,1012,1022.078563,5,0,0
3,Wendouree - Miners Rest,13355.0,13255.0,13161.0,13095.0,13028.0,12963.0,13033.0,13334.0,13718.0,...,15261,15312,15388,15431,886,888,898.897977,93,74,51
4,Creswick - Clunes,7467.0,7423.0,7402.0,7395.0,7332.0,7316.0,7351.0,7388.0,7421.0,...,8031,8041,7969,7987,955,961,955.024591,15,4,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,Otway,3452.0,3479.0,3511.0,3511.0,3492.0,3459.0,3489.0,3501.0,3490.0,...,3911,3979,3974,3983,977,976,976.754857,27,3,9
342,Moyne - East,6718.0,6704.0,6676.0,6643.0,6638.0,6652.0,6606.0,6631.0,6703.0,...,6883,6990,7046,7132,986,995,999.233,25,6,0
343,Moyne - West,8317.0,8387.0,8450.0,8487.0,8517.0,8601.0,8694.0,8792.0,8878.0,...,9859,9967,10098,10148,1005,998,1007.253286,6,16,3
344,Warrnambool - North,17053.0,17449.0,17726.0,17937.0,18172.0,18528.0,18877.0,19107.0,19369.0,...,22416,22470,22586,22762,956,958,965.073742,95,87,74


In [42]:
for i in range(105,313):
    external_df[f'{inflation_df.loc[i][f'quarter']}_housing_index'] = inflation_df.loc[i]['housing_index']
    external_df[f'{inflation_df.loc[i][f'quarter']}_CPI_without_housing'] = inflation_df.loc[i]['CPI_without_housing']

  external_df[f'{inflation_df.loc[i][f'quarter']}_housing_index'] = inflation_df.loc[i]['housing_index']
  external_df[f'{inflation_df.loc[i][f'quarter']}_CPI_without_housing'] = inflation_df.loc[i]['CPI_without_housing']
  external_df[f'{inflation_df.loc[i][f'quarter']}_housing_index'] = inflation_df.loc[i]['housing_index']
  external_df[f'{inflation_df.loc[i][f'quarter']}_CPI_without_housing'] = inflation_df.loc[i]['CPI_without_housing']
  external_df[f'{inflation_df.loc[i][f'quarter']}_housing_index'] = inflation_df.loc[i]['housing_index']
  external_df[f'{inflation_df.loc[i][f'quarter']}_CPI_without_housing'] = inflation_df.loc[i]['CPI_without_housing']
  external_df[f'{inflation_df.loc[i][f'quarter']}_housing_index'] = inflation_df.loc[i]['housing_index']
  external_df[f'{inflation_df.loc[i][f'quarter']}_CPI_without_housing'] = inflation_df.loc[i]['CPI_without_housing']
  external_df[f'{inflation_df.loc[i][f'quarter']}_housing_index'] = inflation_df.loc[i]['housing_index']
  exter

In [43]:
external_df

Unnamed: 0,SA2_name_2021,ERP_2001,ERP_2002,ERP_2003,ERP_2004,ERP_2005,ERP_2006,ERP_2007,ERP_2008,ERP_2009,...,2023-06-01 00:00:00_housing_index,2023-06-01 00:00:00_CPI_without_housing,2023-09-01 00:00:00_housing_index,2023-09-01 00:00:00_CPI_without_housing,2023-12-01 00:00:00_housing_index,2023-12-01 00:00:00_CPI_without_housing,2024-03-01 00:00:00_housing_index,2024-03-01 00:00:00_CPI_without_housing,2024-06-01 00:00:00_housing_index,2024-06-01 00:00:00_CPI_without_housing
0,Buninyong,5320.0,5399.0,5557.0,5620.0,5857.0,6037.0,6131.0,6252.0,6431.0,...,144,130.6,147.5,131.8,148.2,132.7,149.3,134.1,150.2,135
1,Delacombe,4154.0,4225.0,4371.0,4465.0,4704.0,5041.0,5206.0,5349.0,5557.0,...,144,130.6,147.5,131.8,148.2,132.7,149.3,134.1,150.2,135
2,Smythes Creek,3317.0,3378.0,3411.0,3473.0,3508.0,3542.0,3594.0,3658.0,3714.0,...,144,130.6,147.5,131.8,148.2,132.7,149.3,134.1,150.2,135
3,Wendouree - Miners Rest,13355.0,13255.0,13161.0,13095.0,13028.0,12963.0,13033.0,13334.0,13718.0,...,144,130.6,147.5,131.8,148.2,132.7,149.3,134.1,150.2,135
4,Creswick - Clunes,7467.0,7423.0,7402.0,7395.0,7332.0,7316.0,7351.0,7388.0,7421.0,...,144,130.6,147.5,131.8,148.2,132.7,149.3,134.1,150.2,135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,Otway,3452.0,3479.0,3511.0,3511.0,3492.0,3459.0,3489.0,3501.0,3490.0,...,144,130.6,147.5,131.8,148.2,132.7,149.3,134.1,150.2,135
342,Moyne - East,6718.0,6704.0,6676.0,6643.0,6638.0,6652.0,6606.0,6631.0,6703.0,...,144,130.6,147.5,131.8,148.2,132.7,149.3,134.1,150.2,135
343,Moyne - West,8317.0,8387.0,8450.0,8487.0,8517.0,8601.0,8694.0,8792.0,8878.0,...,144,130.6,147.5,131.8,148.2,132.7,149.3,134.1,150.2,135
344,Warrnambool - North,17053.0,17449.0,17726.0,17937.0,18172.0,18528.0,18877.0,19107.0,19369.0,...,144,130.6,147.5,131.8,148.2,132.7,149.3,134.1,150.2,135
