# Data Merging
This notebook looks to merges all of the data gathered in preprocessing.

Here we split the notebook into 3 sections:
1. Simple Merges
2. Complex Merges
3. Rental Data Merges 

#### Datasets we will look to merge:
Simple Merges:
- Crime records
- Proximity data 1 (train stations & CBD)
- Proximity data 2 (to the beach)
- Open Space data

Complex Merges
- Business data
- ABS data (Australian Bureau of Statistics)
- Population demographics

Rental Merges
- Historical Rental Data (2000-2023)
- Inflation
- Currental Rental Data Scraped from Domain (2024)

In [42]:
import pandas as pd
import json

### 1 Simple Merges

In [43]:
# read in the crime data
crime_df = pd.read_csv('../data/curated/crime_2015-2028.csv')
crime_df = crime_df.rename(columns={"SAL_CODE_2021": "SAL_CODE"})
crime_df

Unnamed: 0,SAL_CODE,2015_A Crimes against the person,2015_B Property and deception offences,2015_C Drug offences,2015_D Public order and security offences,2015_E Justice procedures offences,2015_F Other offences,2015_total,2016_A Crimes against the person,2016_B Property and deception offences,...,2027_D Public order and security offences,2028_D Public order and security offences,2025_E Justice procedures offences,2026_E Justice procedures offences,2027_E Justice procedures offences,2028_E Justice procedures offences,2025_F Other offences,2026_F Other offences,2027_F Other offences,2028_F Other offences
0,20002,65.0,688.0,52.0,62.0,35.0,0.0,902.0,70.0,790.0,...,55.178,55.344,69.714,66.251,68.037,67.116,13.375,14.950,15.150,15.176
1,20003,14.0,89.0,6.0,0.0,1.0,0.0,110.0,14.0,102.0,...,2.796,2.796,10.809,10.918,10.924,10.924,0.616,0.602,0.603,0.603
2,20011,0.0,8.0,0.0,0.0,0.0,0.0,8.0,2.0,13.0,...,10.438,10.423,18.313,15.757,14.608,14.091,1.130,1.104,1.105,1.105
3,20015,60.0,527.0,33.0,45.0,26.0,1.0,692.0,77.0,503.0,...,10.753,9.338,57.265,65.000,57.265,64.999,5.304,6.016,6.111,6.124
4,20017,41.0,198.0,30.0,10.0,9.0,0.0,288.0,47.0,252.0,...,4.629,4.631,26.400,26.616,26.538,26.566,2.550,3.132,3.265,3.295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,22916,5.0,30.0,2.0,4.0,2.0,0.0,43.0,4.0,39.0,...,1.724,1.744,3.737,4.140,4.234,4.256,0.755,0.910,0.942,0.949
563,22917,60.0,497.0,14.0,27.0,17.0,2.0,617.0,61.0,561.0,...,30.192,18.000,81.321,80.001,81.320,80.002,4.358,4.551,4.562,4.563
564,22925,2.0,13.0,2.0,2.0,2.0,0.0,21.0,5.0,25.0,...,0.000,0.000,2.486,1.939,2.059,2.033,0.455,0.403,0.409,0.408
565,22930,0.0,4.0,0.0,0.0,0.0,0.0,4.0,2.0,3.0,...,0.399,0.399,0.319,0.373,0.382,0.384,0.248,0.354,0.400,0.420


In [44]:
# read in the proximity data
proximity1_df = pd.read_csv('../data/landing/suburb_proximity_data.csv')
proximity1_df = proximity1_df.drop(columns="Unnamed: 0")
proximity1_df = proximity1_df.rename(columns={"SAL_CODE_2021": "SAL_CODE"})
proximity1_df

Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,closest_station_1_LONGITUDE,closest_station_2_STOP_NAME,closest_station_2_LATITUDE,closest_station_2_LONGITUDE,suburb_centre_latitude,suburb_centre_longitude,distance_to_CBD,time_to_CBD,distance_to_station,time_to_station
0,Inner Melbourne,Albert Park,20018,Albert Park-Middle Park-West St Kilda,2.8392,POLYGON ((144.9544458560001 -37.83793759699995...,0.0,"[{'STOP_NAME': 'Flinders Street', 'LATITUDE': ...",Flinders Street,-37.818305,144.966964,Southern Cross,-37.817936,144.951411,-37.845178,144.961053,4640.7,590.0,3801.0,499.7
1,Inner Melbourne,Middle Park,21677,Albert Park-Middle Park-West St Kilda,0.8475,POLYGON ((144.9613463640001 -37.84558110699993...,0.0,"[{'STOP_NAME': 'Prahran', 'LATITUDE': -37.8495...",Prahran,-37.849518,144.989860,Windsor,-37.856053,144.992035,-37.851145,144.962020,5451.7,681.5,3704.9,492.8
2,Inner Melbourne,St Kilda West,22345,Albert Park-Middle Park-West St Kilda,0.5290,POLYGON ((144.9701782210001 -37.85371962799996...,0.0,"[{'STOP_NAME': 'Prahran', 'LATITUDE': -37.8495...",Prahran,-37.849518,144.989860,Windsor,-37.856053,144.992035,-37.857961,144.971254,6547.7,772.4,2494.7,406.4
3,Inner Melbourne,Armadale,20066,Armadale,2.1430,"POLYGON ((145.024852482 -37.85092855199997, 14...",2.0,"[{'STOP_NAME': 'Armadale', 'LATITUDE': -37.856...",Armadale,-37.856452,145.019326,Toorak,-37.850774,145.013909,-37.856664,145.020235,9160.9,963.8,520.5,106.9
4,Inner Melbourne,Carlton North,20496,Carlton North,1.8709,"POLYGON ((144.972592307 -37.79269848799998, 14...",0.0,"[{'STOP_NAME': 'Jewell', 'LATITUDE': -37.77498...",Jewell,-37.774987,144.958717,Royal Park,-37.781193,144.952301,-37.786593,144.968494,3661.4,431.2,2360.6,305.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,Other Regional Centres,Traralgon,22569,Traralgon,56.2821,"POLYGON ((146.561066146 -38.18788998399998, 14...",1.0,"[{'STOP_NAME': 'Traralgon', 'LATITUDE': -38.19...",Traralgon,-38.198885,146.537882,Morwell,-38.236719,146.396753,-38.203343,146.519781,161750.0,7073.1,1707.7,136.0
564,Other Regional Centres,Wangaratta,22680,Wanagaratta,48.7989,POLYGON ((146.3101672580001 -36.38755634399996...,1.0,"[{'STOP_NAME': 'Wangaratta', 'LATITUDE': -36.3...",Wangaratta,-36.355101,146.317038,Springhurst,-36.185893,146.470417,-36.369384,146.316005,240991.9,10137.1,2716.7,340.3
565,Other Regional Centres,Warragul,22698,Warragul,55.3164,POLYGON ((145.9445994800001 -38.12758481799995...,1.0,"[{'STOP_NAME': 'Warragul', 'LATITUDE': -38.165...",Warragul,-38.165229,145.932674,Drouin,-38.136452,145.855947,-38.153720,145.929971,104026.8,5013.4,1879.0,228.4
566,Other Regional Centres,Warrnambool,22710,Warrnambool,65.5160,"POLYGON ((142.460390336 -38.33167768199996, 14...",2.0,"[{'STOP_NAME': 'Warrnambool', 'LATITUDE': -38....",Warrnambool,-38.385014,142.475545,Warrnambool,-38.386392,142.538871,-38.368860,142.498179,255755.7,11370.1,3084.6,390.3


In [45]:
# read in the open space data
open_spaces_df = pd.read_csv('../data/curated/open_space.csv')
open_spaces_df

Unnamed: 0,SAL_CODE,Cemeteries,Civic squares and promenades,Conservation reserves,Government schools,Natural and semi-natural open space,Non-government schools,Parks and gardens,Public housing reserves,Recreation corridor,Services and utilities reserves,Sportsfields and organised recreation,Tertiary institutions,Transport reservations
0,20002,0,0,0,1,15,2,31,0,1,0,1,0,0
1,20003,0,0,0,1,2,5,3,0,2,0,2,0,1
2,20006,0,0,0,0,2,0,0,0,0,0,0,0,0
3,20011,0,0,0,0,6,0,0,0,0,0,0,0,0
4,20015,0,0,0,1,3,1,16,0,8,0,2,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,22911,1,0,2,3,24,3,3,0,6,31,6,0,0
565,22916,0,0,0,1,23,1,0,0,0,0,7,0,3
566,22917,1,0,0,6,21,1,33,0,0,0,15,0,2
567,22925,0,0,64,1,22,0,0,0,0,15,3,0,0


In [46]:
# Start by merging the first DataFrame (proximity1_df) with the crime data
combined_df = proximity1_df
combined_df = combined_df.merge(crime_df, on='SAL_CODE', how='right')
combined_df


Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,2027_D Public order and security offences,2028_D Public order and security offences,2025_E Justice procedures offences,2026_E Justice procedures offences,2027_E Justice procedures offences,2028_E Justice procedures offences,2025_F Other offences,2026_F Other offences,2027_F Other offences,2028_F Other offences
0,Inner Melbourne,Abbotsford,20002,Collingwood-Abbotsford,1.7405,POLYGON ((145.0019511820001 -37.79664577999995...,2.0,"[{'STOP_NAME': 'Collingwood', 'LATITUDE': -37....",Collingwood,-37.804526,...,55.178,55.344,69.714,66.251,68.037,67.116,13.375,14.950,15.150,15.176
1,North Western Melbourne,Aberfeldie,20003,Moonee Ponds-Ascot Vale,1.5515,POLYGON ((144.8957592740001 -37.76513729299995...,0.0,"[{'STOP_NAME': 'Essendon', 'LATITUDE': -37.756...",Essendon,-37.756012,...,2.796,2.796,10.809,10.918,10.924,10.924,0.616,0.602,0.603,0.603
2,Outer Western Melbourne,Aintree,20011,Sydenham,6.7302,"POLYGON ((144.671793757 -37.72627663699996, 14...",0.0,"[{'STOP_NAME': 'Rockbank', 'LATITUDE': -37.729...",Rockbank,-37.729261,...,10.438,10.423,18.313,15.757,14.608,14.091,1.130,1.104,1.105,1.105
3,North Western Melbourne,Airport West,20015,Essendon,3.6748,"POLYGON ((144.879789112 -37.71565333099994, 14...",0.0,"[{'STOP_NAME': 'Oak Park', 'LATITUDE': -37.717...",Oak Park,-37.717950,...,10.753,9.338,57.265,65.000,57.265,64.999,5.304,6.016,6.111,6.124
4,Outer Western Melbourne,Albanvale,20017,St Albans-Deer Park,1.8634,POLYGON ((144.7692623780001 -37.74081983399998...,0.0,"[{'STOP_NAME': 'Keilor Plains', 'LATITUDE': -3...",Keilor Plains,-37.729279,...,4.629,4.631,26.400,26.616,26.538,26.566,2.550,3.132,3.265,3.295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,North Eastern Melbourne,Yarrambat,22916,Bundoora-Greensborough-Hurstbridge,15.3453,POLYGON ((145.1665694620001 -37.62473551999994...,0.0,"[{'STOP_NAME': 'Hawkstowe', 'LATITUDE': -37.62...",Hawkstowe,-37.622992,...,1.724,1.744,3.737,4.140,4.234,4.256,0.755,0.910,0.942,0.949
564,Outer Western Melbourne,Yarraville,22917,Yarraville-Seddon,5.6587,"POLYGON ((144.88986405 -37.80976599199994, 144...",1.0,"[{'STOP_NAME': 'Yarraville', 'LATITUDE': -37.8...",Yarraville,-37.815850,...,30.192,18.000,81.321,80.001,81.320,80.002,4.358,4.551,4.562,4.563
565,Outer Eastern Melbourne,Yellingbo,22925,Yarra Ranges,42.4012,"POLYGON ((145.505065853 -37.81357686499996, 14...",0.0,"[{'STOP_NAME': 'Belgrave', 'LATITUDE': -37.909...",Belgrave,-37.909102,...,0.000,0.000,2.486,1.939,2.059,2.033,0.455,0.403,0.409,0.408
566,Outer Eastern Melbourne,Yering,22930,Yarra Ranges,24.6363,"POLYGON ((145.369123872 -37.66830801499998, 14...",0.0,"[{'STOP_NAME': 'Lilydale', 'LATITUDE': -37.755...",Lilydale,-37.755518,...,0.399,0.399,0.319,0.373,0.382,0.384,0.248,0.354,0.400,0.420


In [47]:
combined_df = combined_df.drop(92) # this suburb is a dupilcate, remove it
combined_df

Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,2027_D Public order and security offences,2028_D Public order and security offences,2025_E Justice procedures offences,2026_E Justice procedures offences,2027_E Justice procedures offences,2028_E Justice procedures offences,2025_F Other offences,2026_F Other offences,2027_F Other offences,2028_F Other offences
0,Inner Melbourne,Abbotsford,20002,Collingwood-Abbotsford,1.7405,POLYGON ((145.0019511820001 -37.79664577999995...,2.0,"[{'STOP_NAME': 'Collingwood', 'LATITUDE': -37....",Collingwood,-37.804526,...,55.178,55.344,69.714,66.251,68.037,67.116,13.375,14.950,15.150,15.176
1,North Western Melbourne,Aberfeldie,20003,Moonee Ponds-Ascot Vale,1.5515,POLYGON ((144.8957592740001 -37.76513729299995...,0.0,"[{'STOP_NAME': 'Essendon', 'LATITUDE': -37.756...",Essendon,-37.756012,...,2.796,2.796,10.809,10.918,10.924,10.924,0.616,0.602,0.603,0.603
2,Outer Western Melbourne,Aintree,20011,Sydenham,6.7302,"POLYGON ((144.671793757 -37.72627663699996, 14...",0.0,"[{'STOP_NAME': 'Rockbank', 'LATITUDE': -37.729...",Rockbank,-37.729261,...,10.438,10.423,18.313,15.757,14.608,14.091,1.130,1.104,1.105,1.105
3,North Western Melbourne,Airport West,20015,Essendon,3.6748,"POLYGON ((144.879789112 -37.71565333099994, 14...",0.0,"[{'STOP_NAME': 'Oak Park', 'LATITUDE': -37.717...",Oak Park,-37.717950,...,10.753,9.338,57.265,65.000,57.265,64.999,5.304,6.016,6.111,6.124
4,Outer Western Melbourne,Albanvale,20017,St Albans-Deer Park,1.8634,POLYGON ((144.7692623780001 -37.74081983399998...,0.0,"[{'STOP_NAME': 'Keilor Plains', 'LATITUDE': -3...",Keilor Plains,-37.729279,...,4.629,4.631,26.400,26.616,26.538,26.566,2.550,3.132,3.265,3.295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,North Eastern Melbourne,Yarrambat,22916,Bundoora-Greensborough-Hurstbridge,15.3453,POLYGON ((145.1665694620001 -37.62473551999994...,0.0,"[{'STOP_NAME': 'Hawkstowe', 'LATITUDE': -37.62...",Hawkstowe,-37.622992,...,1.724,1.744,3.737,4.140,4.234,4.256,0.755,0.910,0.942,0.949
564,Outer Western Melbourne,Yarraville,22917,Yarraville-Seddon,5.6587,"POLYGON ((144.88986405 -37.80976599199994, 144...",1.0,"[{'STOP_NAME': 'Yarraville', 'LATITUDE': -37.8...",Yarraville,-37.815850,...,30.192,18.000,81.321,80.001,81.320,80.002,4.358,4.551,4.562,4.563
565,Outer Eastern Melbourne,Yellingbo,22925,Yarra Ranges,42.4012,"POLYGON ((145.505065853 -37.81357686499996, 14...",0.0,"[{'STOP_NAME': 'Belgrave', 'LATITUDE': -37.909...",Belgrave,-37.909102,...,0.000,0.000,2.486,1.939,2.059,2.033,0.455,0.403,0.409,0.408
566,Outer Eastern Melbourne,Yering,22930,Yarra Ranges,24.6363,"POLYGON ((145.369123872 -37.66830801499998, 14...",0.0,"[{'STOP_NAME': 'Lilydale', 'LATITUDE': -37.755...",Lilydale,-37.755518,...,0.399,0.399,0.319,0.373,0.382,0.384,0.248,0.354,0.400,0.420


In [48]:
# merge open space data in
combined_df = combined_df.merge(open_spaces_df, on='SAL_CODE', how='left')

# open space data did not have records for regional suburbs e.g. Ballarat
# we will impute these values as the average amount of that column
for col in open_spaces_df.columns:
    combined_df[col] = combined_df[col].fillna(combined_df[col].mean())
combined_df

Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,Government schools,Natural and semi-natural open space,Non-government schools,Parks and gardens,Public housing reserves,Recreation corridor,Services and utilities reserves,Sportsfields and organised recreation,Tertiary institutions,Transport reservations
0,Inner Melbourne,Abbotsford,20002,Collingwood-Abbotsford,1.7405,POLYGON ((145.0019511820001 -37.79664577999995...,2.0,"[{'STOP_NAME': 'Collingwood', 'LATITUDE': -37....",Collingwood,-37.804526,...,1.000000,15.000000,2.000000,31.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000
1,North Western Melbourne,Aberfeldie,20003,Moonee Ponds-Ascot Vale,1.5515,POLYGON ((144.8957592740001 -37.76513729299995...,0.0,"[{'STOP_NAME': 'Essendon', 'LATITUDE': -37.756...",Essendon,-37.756012,...,1.000000,2.000000,5.000000,3.000000,0.000000,2.000000,0.000000,2.000000,0.000000,1.000000
2,Outer Western Melbourne,Aintree,20011,Sydenham,6.7302,"POLYGON ((144.671793757 -37.72627663699996, 14...",0.0,"[{'STOP_NAME': 'Rockbank', 'LATITUDE': -37.729...",Rockbank,-37.729261,...,0.000000,6.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,North Western Melbourne,Airport West,20015,Essendon,3.6748,"POLYGON ((144.879789112 -37.71565333099994, 14...",0.0,"[{'STOP_NAME': 'Oak Park', 'LATITUDE': -37.717...",Oak Park,-37.717950,...,1.000000,3.000000,1.000000,16.000000,0.000000,8.000000,0.000000,2.000000,0.000000,4.000000
4,Outer Western Melbourne,Albanvale,20017,St Albans-Deer Park,1.8634,POLYGON ((144.7692623780001 -37.74081983399998...,0.0,"[{'STOP_NAME': 'Keilor Plains', 'LATITUDE': -3...",Keilor Plains,-37.729279,...,1.000000,11.000000,0.000000,3.000000,0.000000,7.000000,2.000000,1.000000,0.000000,6.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,North Eastern Melbourne,Yarrambat,22916,Bundoora-Greensborough-Hurstbridge,15.3453,POLYGON ((145.1665694620001 -37.62473551999994...,0.0,"[{'STOP_NAME': 'Hawkstowe', 'LATITUDE': -37.62...",Hawkstowe,-37.622992,...,1.000000,23.000000,1.000000,0.000000,0.000000,0.000000,0.000000,7.000000,0.000000,3.000000
563,Outer Western Melbourne,Yarraville,22917,Yarraville-Seddon,5.6587,"POLYGON ((144.88986405 -37.80976599199994, 144...",1.0,"[{'STOP_NAME': 'Yarraville', 'LATITUDE': -37.8...",Yarraville,-37.815850,...,6.000000,21.000000,1.000000,33.000000,0.000000,0.000000,0.000000,15.000000,0.000000,2.000000
564,Outer Eastern Melbourne,Yellingbo,22925,Yarra Ranges,42.4012,"POLYGON ((145.505065853 -37.81357686499996, 14...",0.0,"[{'STOP_NAME': 'Belgrave', 'LATITUDE': -37.909...",Belgrave,-37.909102,...,1.000000,22.000000,0.000000,0.000000,0.000000,0.000000,15.000000,3.000000,0.000000,0.000000
565,Outer Eastern Melbourne,Yering,22930,Yarra Ranges,24.6363,"POLYGON ((145.369123872 -37.66830801499998, 14...",0.0,"[{'STOP_NAME': 'Lilydale', 'LATITUDE': -37.755...",Lilydale,-37.755518,...,1.000000,7.000000,0.000000,0.000000,0.000000,15.000000,0.000000,4.000000,0.000000,0.000000


In [49]:
# read in the proximity to the beach data
coast_df = pd.read_csv('../data/curated/vic_beach_proximity.csv')
coast_df

Unnamed: 0,SAL_CODE,proximity_to_beach
0,20001,1.332311
1,20002,0.062930
2,20003,0.080923
3,20004,0.973612
4,20005,0.939350
...,...,...
2939,22940,1.793233
2940,22941,0.263413
2941,22942,0.024391
2942,22943,1.634950


In [50]:
# merge the proximity to the beach data
combined_df = combined_df.merge(coast_df, on='SAL_CODE', how='left')
combined_df

Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,Natural and semi-natural open space,Non-government schools,Parks and gardens,Public housing reserves,Recreation corridor,Services and utilities reserves,Sportsfields and organised recreation,Tertiary institutions,Transport reservations,proximity_to_beach
0,Inner Melbourne,Abbotsford,20002,Collingwood-Abbotsford,1.7405,POLYGON ((145.0019511820001 -37.79664577999995...,2.0,"[{'STOP_NAME': 'Collingwood', 'LATITUDE': -37....",Collingwood,-37.804526,...,15.000000,2.000000,31.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0.062930
1,North Western Melbourne,Aberfeldie,20003,Moonee Ponds-Ascot Vale,1.5515,POLYGON ((144.8957592740001 -37.76513729299995...,0.0,"[{'STOP_NAME': 'Essendon', 'LATITUDE': -37.756...",Essendon,-37.756012,...,2.000000,5.000000,3.000000,0.000000,2.000000,0.000000,2.000000,0.000000,1.000000,0.080923
2,Outer Western Melbourne,Aintree,20011,Sydenham,6.7302,"POLYGON ((144.671793757 -37.72627663699996, 14...",0.0,"[{'STOP_NAME': 'Rockbank', 'LATITUDE': -37.729...",Rockbank,-37.729261,...,6.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.218195
3,North Western Melbourne,Airport West,20015,Essendon,3.6748,"POLYGON ((144.879789112 -37.71565333099994, 14...",0.0,"[{'STOP_NAME': 'Oak Park', 'LATITUDE': -37.717...",Oak Park,-37.717950,...,3.000000,1.000000,16.000000,0.000000,8.000000,0.000000,2.000000,0.000000,4.000000,0.118830
4,Outer Western Melbourne,Albanvale,20017,St Albans-Deer Park,1.8634,POLYGON ((144.7692623780001 -37.74081983399998...,0.0,"[{'STOP_NAME': 'Keilor Plains', 'LATITUDE': -3...",Keilor Plains,-37.729279,...,11.000000,0.000000,3.000000,0.000000,7.000000,2.000000,1.000000,0.000000,6.000000,0.136645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,North Eastern Melbourne,Yarrambat,22916,Bundoora-Greensborough-Hurstbridge,15.3453,POLYGON ((145.1665694620001 -37.62473551999994...,0.0,"[{'STOP_NAME': 'Hawkstowe', 'LATITUDE': -37.62...",Hawkstowe,-37.622992,...,23.000000,1.000000,0.000000,0.000000,0.000000,0.000000,7.000000,0.000000,3.000000,0.277664
563,Outer Western Melbourne,Yarraville,22917,Yarraville-Seddon,5.6587,"POLYGON ((144.88986405 -37.80976599199994, 144...",1.0,"[{'STOP_NAME': 'Yarraville', 'LATITUDE': -37.8...",Yarraville,-37.815850,...,21.000000,1.000000,33.000000,0.000000,0.000000,0.000000,15.000000,0.000000,2.000000,0.030599
564,Outer Eastern Melbourne,Yellingbo,22925,Yarra Ranges,42.4012,"POLYGON ((145.505065853 -37.81357686499996, 14...",0.0,"[{'STOP_NAME': 'Belgrave', 'LATITUDE': -37.909...",Belgrave,-37.909102,...,22.000000,0.000000,0.000000,0.000000,0.000000,15.000000,3.000000,0.000000,0.000000,0.381006
565,Outer Eastern Melbourne,Yering,22930,Yarra Ranges,24.6363,"POLYGON ((145.369123872 -37.66830801499998, 14...",0.0,"[{'STOP_NAME': 'Lilydale', 'LATITUDE': -37.755...",Lilydale,-37.755518,...,7.000000,0.000000,0.000000,0.000000,15.000000,0.000000,4.000000,0.000000,0.000000,0.429037


### 2 Complex Merges

In [51]:
# merge in all business data (2016-2028)
for year in range(2016,2029):
    
    # merge in the data
    df = pd.read_csv(f'../data/curated/osm_data_{year}.csv')
    df = df.rename(columns=lambda x: f"{year}_{x}")
    df = df.rename(columns={f"{year}_SAL_CODE": "SAL_CODE"})
    combined_df = combined_df.merge(df, on='SAL_CODE', how='left')

    # impute missing business data as the average amount for that business in a suburb
    for col in df.columns:
        if col != "SAL_CODE":
            combined_df[col] = combined_df[col].fillna(combined_df[col].mean())

combined_df

Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,2028_Unnamed: 0,2028_commercial,2028_education,2028_food_establishments,2028_healthcare,2028_industrial,2028_public_transport,2028_recreation,2028_residential,2028_shopping
0,Inner Melbourne,Abbotsford,20002,Collingwood-Abbotsford,1.7405,POLYGON ((145.0019511820001 -37.79664577999995...,2.0,"[{'STOP_NAME': 'Collingwood', 'LATITUDE': -37....",Collingwood,-37.804526,...,11995,21.847,4.000,69.301,2.000,5.00,0.0,24.974,18.687,27.580
1,North Western Melbourne,Aberfeldie,20003,Moonee Ponds-Ascot Vale,1.5515,POLYGON ((144.8957592740001 -37.76513729299995...,0.0,"[{'STOP_NAME': 'Essendon', 'LATITUDE': -37.756...",Essendon,-37.756012,...,12000,3.000,5.000,6.000,0.000,0.00,0.0,11.000,2.000,2.000
2,Outer Western Melbourne,Aintree,20011,Sydenham,6.7302,"POLYGON ((144.671793757 -37.72627663699996, 14...",0.0,"[{'STOP_NAME': 'Rockbank', 'LATITUDE': -37.729...",Rockbank,-37.729261,...,12005,1.000,3.019,1.000,0.000,0.00,0.0,61.262,2.000,1.000
3,North Western Melbourne,Airport West,20015,Essendon,3.6748,"POLYGON ((144.879789112 -37.71565333099994, 14...",0.0,"[{'STOP_NAME': 'Oak Park', 'LATITUDE': -37.717...",Oak Park,-37.717950,...,12010,8.000,8.000,10.439,2.392,8.00,0.0,26.000,9.000,11.334
4,Outer Western Melbourne,Albanvale,20017,St Albans-Deer Park,1.8634,POLYGON ((144.7692623780001 -37.74081983399998...,0.0,"[{'STOP_NAME': 'Keilor Plains', 'LATITUDE': -3...",Keilor Plains,-37.729279,...,12015,0.000,1.000,0.000,0.000,0.00,0.0,9.021,1.000,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,North Eastern Melbourne,Yarrambat,22916,Bundoora-Greensborough-Hurstbridge,15.3453,POLYGON ((145.1665694620001 -37.62473551999994...,0.0,"[{'STOP_NAME': 'Hawkstowe', 'LATITUDE': -37.62...",Hawkstowe,-37.622992,...,14780,3.000,1.000,2.000,0.000,0.00,0.0,5.000,0.000,1.000
563,Outer Western Melbourne,Yarraville,22917,Yarraville-Seddon,5.6587,"POLYGON ((144.88986405 -37.80976599199994, 144...",1.0,"[{'STOP_NAME': 'Yarraville', 'LATITUDE': -37.8...",Yarraville,-37.815850,...,14785,27.732,7.000,46.218,6.801,9.71,1.0,53.976,12.886,15.363
564,Outer Eastern Melbourne,Yellingbo,22925,Yarra Ranges,42.4012,"POLYGON ((145.505065853 -37.81357686499996, 14...",0.0,"[{'STOP_NAME': 'Belgrave', 'LATITUDE': -37.909...",Belgrave,-37.909102,...,14790,0.000,0.000,1.000,0.000,0.00,0.0,0.000,0.000,0.000
565,Outer Eastern Melbourne,Yering,22930,Yarra Ranges,24.6363,"POLYGON ((145.369123872 -37.66830801499998, 14...",0.0,"[{'STOP_NAME': 'Lilydale', 'LATITUDE': -37.755...",Lilydale,-37.755518,...,14795,0.000,1.000,0.000,0.000,0.00,0.0,12.000,1.000,0.000


In [52]:
# merge in all ABS data (2016-2028)
# note most of this data is artifical made through ARIMA
for year in range(2016,2029):
    df = pd.read_csv(f'../data/curated/ABS_extrapolated/ABS_data_{year}.csv')
    df = df.rename(columns=lambda x: f"{year}_{x}")
    df = df.rename(columns={f"{year}_SAL_CODE": "SAL_CODE"})
    df = df.drop(columns=f"{year}_Median_rent_weekly")
    combined_df = combined_df.merge(df, on='SAL_CODE', how='left')
    print(combined_df.shape)
combined_df


(567, 258)
(567, 265)
(567, 272)
(567, 279)
(567, 286)
(567, 293)
(567, 300)
(567, 307)
(567, 314)
(567, 321)
(567, 328)
(567, 335)
(567, 342)


Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,2027_Average_num_psns_per_bedroom,2027_Median_tot_hhd_inc_weekly,2027_Average_household_size,2028_Median_age_persons,2028_Median_mortgage_repay_monthly,2028_Median_tot_prsnl_inc_weekly,2028_Median_tot_fam_inc_weekly,2028_Average_num_psns_per_bedroom,2028_Median_tot_hhd_inc_weekly,2028_Average_household_size
0,Inner Melbourne,Abbotsford,20002,Collingwood-Abbotsford,1.7405,POLYGON ((145.0019511820001 -37.79664577999995...,2.0,"[{'STOP_NAME': 'Collingwood', 'LATITUDE': -37....",Collingwood,-37.804526,...,0.88,2432.2,1.66,34.4,2202.0,1754.4,3815.0,0.86,2471.4,1.62
1,North Western Melbourne,Aberfeldie,20003,Moonee Ponds-Ascot Vale,1.5515,POLYGON ((144.8957592740001 -37.76513729299995...,0.0,"[{'STOP_NAME': 'Essendon', 'LATITUDE': -37.756...",Essendon,-37.756012,...,0.68,3016.2,2.68,42.4,3160.0,1299.4,3970.0,0.66,3090.4,2.66
2,Outer Western Melbourne,Aintree,20011,Sydenham,6.7302,"POLYGON ((144.671793757 -37.72627663699996, 14...",0.0,"[{'STOP_NAME': 'Rockbank', 'LATITUDE': -37.729...",Rockbank,-37.729261,...,0.90,3834.8,4.68,16.0,3380.8,1614.2,3681.2,0.90,4082.6,4.86
3,North Western Melbourne,Airport West,20015,Essendon,3.6748,"POLYGON ((144.879789112 -37.71565333099994, 14...",0.0,"[{'STOP_NAME': 'Oak Park', 'LATITUDE': -37.717...",Oak Park,-37.717950,...,0.80,2224.2,2.30,37.6,2344.0,1163.0,3126.4,0.80,2301.4,2.30
4,Outer Western Melbourne,Albanvale,20017,St Albans-Deer Park,1.8634,POLYGON ((144.7692623780001 -37.74081983399998...,0.0,"[{'STOP_NAME': 'Keilor Plains', 'LATITUDE': -3...",Keilor Plains,-37.729279,...,0.90,1483.0,2.78,38.4,1708.6,558.0,1603.6,0.90,1517.0,2.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,North Eastern Melbourne,Yarrambat,22916,Bundoora-Greensborough-Hurstbridge,15.3453,POLYGON ((145.1665694620001 -37.62473551999994...,0.0,"[{'STOP_NAME': 'Hawkstowe', 'LATITUDE': -37.62...",Hawkstowe,-37.622992,...,0.80,3143.6,2.98,52.6,3160.0,1058.8,3067.4,0.80,3234.2,2.96
563,Outer Western Melbourne,Yarraville,22917,Yarraville-Seddon,5.6587,"POLYGON ((144.88986405 -37.80976599199994, 144...",1.0,"[{'STOP_NAME': 'Yarraville', 'LATITUDE': -37.8...",Yarraville,-37.815850,...,0.90,3010.6,2.50,38.4,2966.2,1696.8,4196.0,0.90,3098.2,2.50
564,Outer Eastern Melbourne,Yellingbo,22925,Yarra Ranges,42.4012,"POLYGON ((145.505065853 -37.81357686499996, 14...",0.0,"[{'STOP_NAME': 'Belgrave', 'LATITUDE': -37.909...",Belgrave,-37.909102,...,0.80,2363.0,3.02,47.4,2827.4,922.8,2418.2,0.80,2425.0,3.04
565,Outer Eastern Melbourne,Yering,22930,Yarra Ranges,24.6363,"POLYGON ((145.369123872 -37.66830801499998, 14...",0.0,"[{'STOP_NAME': 'Lilydale', 'LATITUDE': -37.755...",Lilydale,-37.755518,...,0.70,4095.4,2.78,42.6,4166.2,1221.4,4305.8,0.70,4305.8,2.76


In [53]:
# read in all population data (2016-2028)
for year in range(2016,2029):
    df = pd.read_csv(f'../data/raw/pop_projections/pop_{year}.csv')
    df = df.rename(columns=lambda x: f"{year}_{x}")
    df = df.rename(columns={f"{year}_SAL_CODE": "SAL_CODE"})
    combined_df = combined_df.merge(df, on='SAL_CODE', how='left')
    print(combined_df.shape)
combined_df

(567, 344)
(567, 346)
(567, 348)
(567, 350)
(567, 352)
(567, 354)
(567, 356)
(567, 358)
(567, 360)
(567, 362)
(567, 364)
(567, 366)
(567, 368)


Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,2024_Total_dwelings,2024_Tot_P_P,2025_Total_dwelings,2025_Tot_P_P,2026_Total_dwelings,2026_Tot_P_P,2027_Total_dwelings,2027_Tot_P_P,2028_Total_dwelings,2028_Tot_P_P
0,Inner Melbourne,Abbotsford,20002,Collingwood-Abbotsford,1.7405,POLYGON ((145.0019511820001 -37.79664577999995...,2.0,"[{'STOP_NAME': 'Collingwood', 'LATITUDE': -37....",Collingwood,-37.804526,...,34841.0,72006.185753,36985.0,74368.914337,39129.0,76731.642921,40028.0,78400.637444,40927.0,80069.631966
1,North Western Melbourne,Aberfeldie,20003,Moonee Ponds-Ascot Vale,1.5515,POLYGON ((144.8957592740001 -37.76513729299995...,0.0,"[{'STOP_NAME': 'Essendon', 'LATITUDE': -37.756...",Essendon,-37.756012,...,30218.8,68015.435726,31638.4,69190.580968,33058.0,70365.726210,33820.0,71824.519384,34582.0,73283.312559
2,Outer Western Melbourne,Aintree,20011,Sydenham,6.7302,"POLYGON ((144.671793757 -37.72627663699996, 14...",0.0,"[{'STOP_NAME': 'Rockbank', 'LATITUDE': -37.729...",Rockbank,-37.729261,...,16409.2,51659.672507,19330.6,59662.563342,22252.0,67665.454178,24732.0,74699.331506,27212.0,81733.208834
3,North Western Melbourne,Airport West,20015,Essendon,3.6748,"POLYGON ((144.879789112 -37.71565333099994, 14...",0.0,"[{'STOP_NAME': 'Oak Park', 'LATITUDE': -37.717...",Oak Park,-37.717950,...,20064.0,50768.358416,20665.0,51258.477888,21266.0,51748.597360,21509.0,52234.454823,21752.0,52720.312285
4,Outer Western Melbourne,Albanvale,20017,St Albans-Deer Park,1.8634,POLYGON ((144.7692623780001 -37.74081983399998...,0.0,"[{'STOP_NAME': 'Keilor Plains', 'LATITUDE': -3...",Keilor Plains,-37.729279,...,22434.0,66790.511534,23127.0,67294.348711,23820.0,67798.185889,24033.0,68234.079682,24246.0,68669.973476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,North Eastern Melbourne,Yarrambat,22916,Bundoora-Greensborough-Hurstbridge,15.3453,POLYGON ((145.1665694620001 -37.62473551999994...,0.0,"[{'STOP_NAME': 'Hawkstowe', 'LATITUDE': -37.62...",Hawkstowe,-37.622992,...,24026.8,70128.775255,24631.4,70847.033673,25236.0,71565.292091,25549.0,72270.893362,25862.0,72976.494633
563,Outer Western Melbourne,Yarraville,22917,Yarraville-Seddon,5.6587,"POLYGON ((144.88986405 -37.80976599199994, 144...",1.0,"[{'STOP_NAME': 'Yarraville', 'LATITUDE': -37.8...",Yarraville,-37.815850,...,45671.8,104555.241660,49236.4,109684.655547,52801.0,114814.069434,55266.0,119591.755997,57731.0,124369.442561
564,Outer Eastern Melbourne,Yellingbo,22925,Yarra Ranges,42.4012,"POLYGON ((145.505065853 -37.81357686499996, 14...",0.0,"[{'STOP_NAME': 'Belgrave', 'LATITUDE': -37.909...",Belgrave,-37.909102,...,18515.8,49988.427228,19069.4,50267.902971,19623.0,50547.378714,19732.0,50741.591568,19841.0,50935.804421
565,Outer Eastern Melbourne,Yering,22930,Yarra Ranges,24.6363,"POLYGON ((145.369123872 -37.66830801499998, 14...",0.0,"[{'STOP_NAME': 'Lilydale', 'LATITUDE': -37.755...",Lilydale,-37.755518,...,15808.4,40480.208069,16341.2,41055.277425,16874.0,41630.346781,17151.8,42232.203423,17429.6,42834.060066


### 3 Rental Data Merges

#### 3.1 Historical Rent Merging

In [54]:
# read in the historical rent data
historical_rent_df = pd.read_csv("../data/curated/historical_rent_cleaned.csv")
historical_rent_df

Unnamed: 0,SAL_CODE,2000_average_weekly_rent,2000_average_quarterly_count,2001_average_weekly_rent,2001_average_quarterly_count,2002_average_weekly_rent,2002_average_quarterly_count,2003_average_weekly_rent,2003_average_quarterly_count,2004_average_weekly_rent,...,2020_average_weekly_rent,2020_average_quarterly_count,2021_average_weekly_rent,2021_average_quarterly_count,2022_average_weekly_rent,2022_average_quarterly_count,2023_average_weekly_rent,2023_average_quarterly_count,2024_average_weekly_rent,2024_average_quarterly_count
0,20111,137.00,979.50,141.25,857.00,151.25,896.00,161.25,948.25,171.25,...,317.50,1743.75,331.25,1632.25,355.00,1546.50,371.25,1473.25,380.0,1345.0
1,20198,190.00,505.75,207.75,502.50,216.25,529.00,221.25,525.75,226.25,...,466.25,720.00,457.00,643.25,432.50,872.25,495.00,764.75,550.0,729.0
2,21193,200.00,608.25,207.50,765.50,210.00,1059.50,210.00,1334.00,217.50,...,400.00,2389.25,400.00,2591.50,411.25,3047.50,437.50,3557.00,470.0,3777.0
3,21640,320.00,2278.75,320.00,2752.50,320.00,3382.75,305.00,3972.75,300.00,...,483.75,10206.25,366.25,16559.25,426.25,14627.75,587.50,13547.25,640.0,13582.0
4,21938,142.50,443.50,151.25,434.75,161.25,429.50,171.25,473.00,178.75,...,390.75,434.50,407.50,428.50,432.50,365.25,465.00,312.50,475.0,356.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,22916,185.00,1511.00,196.25,1579.75,208.75,1565.75,216.25,1703.50,222.50,...,402.50,2991.00,407.50,2879.00,423.75,3105.00,462.50,2728.75,500.0,2440.0
563,22917,178.75,947.25,187.50,939.00,200.00,947.00,210.00,1037.25,218.75,...,491.25,1268.25,477.50,1321.25,475.00,1391.75,521.25,1280.00,570.0,1070.0
564,22925,152.50,1285.00,162.50,1280.50,170.00,1258.50,180.00,1315.00,192.50,...,400.00,873.50,415.00,721.75,450.75,733.00,486.25,767.75,500.0,729.0
565,22930,152.50,1285.00,162.50,1280.50,170.00,1258.50,180.00,1315.00,192.50,...,400.00,873.50,415.00,721.75,450.75,733.00,486.25,767.75,500.0,729.0


In [55]:
print(combined_df['SAL_CODE'].duplicated().sum())  # To check duplicates in combined_df
print(historical_rent_df['SAL_CODE'].duplicated().sum())  # To check duplicates in df for each year

1
0


In [56]:
# merge all the historical data
# IMPORTANT SIDENOTE: this data has data for 2024 even though it is not in specified data range
# this explained in the domain data merging (subsection below as to why this is done)
print("combined shape before:", combined_df.shape)
combined_df = combined_df.merge(historical_rent_df, on='SAL_CODE', how='left')
print("combined shape after:", combined_df.shape)

# ensure no entry is null
df = combined_df
df[df.isna().any(axis=1)]

combined shape before: (567, 368)
combined shape after: (567, 418)


Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,2020_average_weekly_rent,2020_average_quarterly_count,2021_average_weekly_rent,2021_average_quarterly_count,2022_average_weekly_rent,2022_average_quarterly_count,2023_average_weekly_rent,2023_average_quarterly_count,2024_average_weekly_rent,2024_average_quarterly_count


In [57]:
l = list(combined_df["SAL_CODE"])
for s in list(historical_rent_df["SAL_CODE"]):
    if s not in l:
        print(s)

20442


#### 3.2 Combined Data Restructuring

In [58]:
# spilt the columns in of the combined df into two categories
# those column with entries of a particular and those that do not change with time (annually)
df = combined_df

year_columns = [col for col in df.columns if any(str(year) in col for year in range(2000, 2029))]
print(year_columns)
not_year_columns = []
for col in combined_df.columns:
    if str(col) not in year_columns:
        not_year_columns.append(col)
print(not_year_columns)

['2015_A Crimes against the person', '2015_B Property and deception offences', '2015_C Drug offences', '2015_D Public order and security offences', '2015_E Justice procedures offences', '2015_F Other offences', '2015_total', '2016_A Crimes against the person', '2016_B Property and deception offences', '2016_C Drug offences', '2016_D Public order and security offences', '2016_E Justice procedures offences', '2016_F Other offences', '2016_total', '2017_A Crimes against the person', '2017_B Property and deception offences', '2017_C Drug offences', '2017_D Public order and security offences', '2017_E Justice procedures offences', '2017_F Other offences', '2017_total', '2018_A Crimes against the person', '2018_B Property and deception offences', '2018_C Drug offences', '2018_D Public order and security offences', '2018_E Justice procedures offences', '2018_F Other offences', '2018_total', '2019_A Crimes against the person', '2019_B Property and deception offences', '2019_C Drug offences', '

In [59]:
# This sections melts the df columns so that the columns that contain a year in their title are reduced into one column.
# The year is now an extra column.
# This dramatrically decreases the number of columns and gives as for rows (observations for modelling)

# Step 1: Melt the DataFrame from wide to long format
melted_df = pd.melt(df, 
                    id_vars=not_year_columns, 
                    value_vars=year_columns, 
                    var_name='variable_year', 
                    value_name='value')

# Step 2: Extract the 'year' and 'variable' from the 'variable_year' column
melted_df['year'] = melted_df['variable_year'].str.extract(r'(\d{4})')
melted_df['variable'] = melted_df['variable_year'].str.replace(r'\d{4}_', '', regex=True)

# Step 3: Drop the original 'variable_year' column as it's no longer needed
melted_df = melted_df.drop(columns=['variable_year'])

not_year_columns.append('year')
# Step 4: Pivot the table to separate 'average_weekly_rent', 'average_quarterly_count' back into columns
df_final = melted_df.pivot_table(index=not_year_columns, 
                                 columns='variable', 
                                 values='value').reset_index()


# Step 5: View the final DataFrame
df_final

variable,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,commercial,education,food_establishments,healthcare,industrial,public_transport,recreation,residential,shopping,total
0,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,,,,,,,,,,
1,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,,,,,,,,,,
2,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,,,,,,,,,,
3,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,,,,,,,,,,
4,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16409,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,1.0,0.0,0.0,0.0,11.0,5.0,0.0,66.0
16410,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,1.0,0.0,0.0,0.0,11.0,5.0,0.0,
16411,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,1.0,0.0,0.0,0.0,11.0,5.0,0.0,
16412,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,1.0,0.0,0.0,0.0,11.0,5.0,0.0,


In [60]:
# Only have all data on 2016-2024 so filter out data that is not between these years
df_filtered = df_final[df_final['year'].astype(int).between(2016, 2028)]
df_filtered = df_filtered.drop(columns=["total", 'all_crimes_2015-2024'])
df_filtered = df_filtered.rename(columns={"tests":"internet_speed_test_count"})
print(df_filtered.columns)
len(set(df_filtered["SAL_suburb"]))

Index(['Suburb Cluster', 'SAL_suburb', 'SAL_CODE', 'Suburb Group',
       'AREASQKM21', 'geometry', 'station_count_in_suburb', 'closest_stations',
       'closest_station_1_STOP_NAME', 'closest_station_1_LATITUDE',
       'closest_station_1_LONGITUDE', 'closest_station_2_STOP_NAME',
       'closest_station_2_LATITUDE', 'closest_station_2_LONGITUDE',
       'suburb_centre_latitude', 'suburb_centre_longitude', 'distance_to_CBD',
       'time_to_CBD', 'distance_to_station', 'time_to_station', 'Cemeteries',
       'Civic squares and promenades', 'Conservation reserves',
       'Government schools', 'Natural and semi-natural open space',
       'Non-government schools', 'Parks and gardens',
       'Public housing reserves', 'Recreation corridor',
       'Services and utilities reserves',
       'Sportsfields and organised recreation', 'Tertiary institutions',
       'Transport reservations', 'proximity_to_beach', 'year',
       'A Crimes against the person', 'Average_household_size',
      

566

#### 3.3 Inflation
Inflation was gather to be these values from the ABS which can be found at this link ['https://www.abs.gov.au/statistics/economy/price-indexes-and-inflation/consumer-price-index-australia/jun-quarter-2024'].
- Mar Qtr 2015 to Mar Qtr 2016: 1.3
- Mar Qtr 2016 to Mar Qtr 2017: 2.1
- Mar Qtr 2017 to Mar Qtr 2018: 1.9
- Mar Qtr 2018 to Mar Qtr 2019: 1.3
- Mar Qtr 2019 to Mar Qtr 2020: 2.2
- Mar Qtr 2020 to Mar Qtr 2021: 1.1
- Mar Qtr 2021 to Mar Qtr 2022: 5.1
- Mar Qtr 2022 to Mar Qtr 2023: 7.0
- Mar Qtr 2023 to Mar Qtr 2024: 3.6
Forecasted inflation is from statista which can be found at this link ['https://www.statista.com/statistics/271845/inflation-rate-in-australia/']
- 2025: 3.0
- 2026: 2.7
- 2027: 2.7
- 2028: 2.6
The end data of each recording will be taken as the inflation for that year. For example: 2015-2016 inflation is taken as the inflation for 2016 data

In [61]:
# inflation data
inflation_dict = {
"2016": 1.3,
"2017": 2.1,
"2018": 1.9,
"2019": 1.3,
"2020": 2.2,
"2021": 1.1,
"2022": 5.1,
"2023": 7.0,
"2024": 3.6,
"2025": 3.0,
"2026": 2.7,
"2027": 2.7,
"2028": 2.6,
}

# store separately as back up
with open('../data/raw/inflation_data.json', 'w') as file:
    json.dump(inflation_dict, file)

# merge the inflation data
df_filtered['inflation'] = df_filtered['year'].map(inflation_dict)
df_filtered

variable,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,commercial,education,food_establishments,healthcare,industrial,public_transport,recreation,residential,shopping,inflation
16,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,4.0,2.0,0.0,0.0,9.0,0.0,35.0,88.0,0.0,1.3
17,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,5.0,2.0,0.0,0.0,9.0,0.0,43.0,88.0,2.0,2.1
18,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,4.0,3.0,3.0,1.0,9.0,0.0,46.0,91.0,3.0,1.9
19,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,4.0,3.0,3.0,1.0,8.0,0.0,46.0,91.0,3.0,1.3
20,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,4.0,3.0,3.0,1.0,8.0,0.0,49.0,91.0,3.0,2.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16409,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,1.0,0.0,0.0,0.0,11.0,5.0,0.0,3.6
16410,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,1.0,0.0,0.0,0.0,11.0,5.0,0.0,3.0
16411,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,1.0,0.0,0.0,0.0,11.0,5.0,0.0,2.7
16412,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,1.0,0.0,0.0,0.0,11.0,5.0,0.0,2.7


In [62]:
df_filtered.to_csv("../data/curated/NO_DOMAIN_DATASET.csv", index=False)

#### 3.4 Domain Scraped Data Merging

In [63]:
# read in domain scraped data
rental_data2024_df = pd.read_csv("../data/curated/part-00000-09645e5e-30ca-4d69-9cd8-d63fa18084a4-c000.csv")
rental_data2024_df[rental_data2024_df["Standard_Suburb"]=="Melbourne"]["Standard_Suburb"]
rental_data2024_df[rental_data2024_df["Standard_Suburb"]=="Albert Park"]


Unnamed: 0,Standard_Suburb,average_beds,average_baths,average_parking,average_cost,counts
366,Albert Park,2.125,1.5,0.375,698.625,16


In [64]:
# fix suburb naming inconsistencies
rental_data2024_df.loc[379, "Standard_Suburb"] = "Melbourne Cbd"
rental_data2024_df

Unnamed: 0,Standard_Suburb,average_beds,average_baths,average_parking,average_cost,counts
0,Ivanhoe East,2.000000,1.000000,1.000000,545.000000,2
1,Ashwood,2.818182,1.727273,2.000000,719.454545,11
2,Albanvale,2.333333,1.000000,2.666667,398.333333,3
3,Brighton,3.216667,1.966667,1.866667,1004.433333,60
4,Sandringham,2.090909,1.363636,1.818182,587.727273,11
...,...,...,...,...,...,...
440,South Morang,3.000000,1.894737,1.578947,534.210526,19
441,Gruyere,3.000000,2.000000,5.000000,900.000000,1
442,Maribyrnong,2.145833,1.520833,1.270833,568.229167,48
443,South Yarra,1.775510,1.377551,0.801020,650.193878,196


We see here that domain does not have over 100 suburbs that we have historical rental data on, luckily we have rental data value on 2024 march quarter, suburbs that we do not have domain scraped data on,  we will impute the 2024 march quarterly value instead.

In [65]:
# list out missing suburbs
not_found = []
l = list(combined_df["SAL_suburb"])
for s in l:
    if s not in list(rental_data2024_df["Standard_Suburb"]):
        not_found.append(s)
        print(s)
print(len(not_found))

Arthurs Creek
Arthurs Seat
Attwood
Avonsleigh
Badger Creek
Ballarat
Balnarring
Balnarring Beach
Bangholme
Bayles
Beaconsfield Upper
Beenak
Belgrave Heights
Bend Of Islands
Blind Bight
Braeside
Bunyip North
Caldermeade
Cannons Creek
Cape Schanck
Castlemaine
Catani
Christmas Hills
Chum Creek
Cockatoo
Cocoroc
Cora Lynn
Cottles Bridge
Dalmore
Dewhurst
Dixons Creek
Don Valley
Eden Park
Essendon Fields
Exford
Garfield
Garfield North
Gilderoy
Gladysdale
Grangefields
Guys Hill
Heath Hill
Hmas Cerberus
Hoddles Creek
Humevale
Iona
Kallista
Kangaroo Ground
Keilor North
Koo Wee Rup North
Lang Lang East
Langwarrin South
Laverton North
Lysterfield
Lysterfield South
Macclesfield
Main Ridge
Maryknoll
McCrae
McKinnon
Melbourne Airport
Menzies Creek
Merricks
Merricks North
Millgrove
Monomeith
Moorabbin Airport
Mount Burnett
Mount Dandenong
Mount Toolebewong
Nangana
Narre Warren East
Nutfield
Oaklands Junction
Pakenham South
Pakenham Upper
Panton Hill
Plenty
Plumpton
Point Leo
Powelltown
Quandong
Ravenha

In [66]:
reduced_df = combined_df[combined_df['SAL_suburb'].isin(not_found)]
reduced_df

Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,2020_average_weekly_rent,2020_average_quarterly_count,2021_average_weekly_rent,2021_average_quarterly_count,2022_average_weekly_rent,2022_average_quarterly_count,2023_average_weekly_rent,2023_average_quarterly_count,2024_average_weekly_rent,2024_average_quarterly_count
14,North Eastern Melbourne,Arthurs Creek,20071,Bundoora-Greensborough-Hurstbridge,39.1753,"POLYGON ((145.238933919 -37.53735484499998, 14...",0.0,"[{'STOP_NAME': 'Hurstbridge', 'LATITUDE': -37....",Hurstbridge,-37.639398,...,402.5,2991.0,407.5,2879.0,423.75,3105.0,462.5,2728.75,500.0,2440.0
15,Mornington Peninsula,Arthurs Seat,20072,Dromana-Portsea,8.8149,POLYGON ((144.9586627890001 -38.34853594999998...,0.0,"[{'STOP_NAME': 'Bittern', 'LATITUDE': -38.3373...",Bittern,-38.33739,...,392.5,1883.0,445.0,1395.75,497.5,1390.0,526.25,1653.25,545.0,1712.0
21,North Western Melbourne,Attwood,20082,Gladstone Park-Tullamarine,5.0587,POLYGON ((144.8961075330001 -37.67522380199995...,0.0,"[{'STOP_NAME': 'Broadmeadows', 'LATITUDE': -37...",Broadmeadows,-37.683049,...,391.25,652.75,392.5,595.75,405.0,631.0,436.25,548.25,480.0,505.0
23,South Eastern Melbourne,Avonsleigh,20091,Pakenham,7.562,POLYGON ((145.4709217070001 -37.92167011299995...,0.0,"[{'STOP_NAME': 'Belgrave', 'LATITUDE': -37.909...",Belgrave,-37.909102,...,353.75,2278.75,367.5,1930.5,395.0,1932.25,443.75,1772.0,480.0,1693.0
24,Outer Eastern Melbourne,Badger Creek,20097,Yarra Ranges,8.9749,POLYGON ((145.5477898380001 -37.69964939699997...,0.0,"[{'STOP_NAME': 'Lilydale', 'LATITUDE': -37.755...",Lilydale,-37.755518,...,400.0,873.5,415.0,721.75,450.75,733.0,486.25,767.75,500.0,729.0
27,Ballarat,Ballarat,20111,Ballarat,3.7814,"POLYGON ((143.850550147 -37.54589975199997, 14...",0.0,"[{'STOP_NAME': 'Ballarat Central', 'LATITUDE':...",Ballarat Central,-37.558791,...,317.5,1743.75,331.25,1632.25,355.0,1546.5,371.25,1473.25,380.0,1345.0
28,Mornington Peninsula,Balnarring,20120,Hastings-Flinders,27.4606,POLYGON ((145.1251874760001 -38.38386054199998...,0.0,"[{'STOP_NAME': 'Bittern', 'LATITUDE': -38.3373...",Bittern,-38.33739,...,405.0,793.25,427.5,661.0,475.0,640.75,502.5,710.25,520.0,665.0
29,Mornington Peninsula,Balnarring Beach,20121,Hastings-Flinders,0.8013,"POLYGON ((145.127147044 -38.39164329199997, 14...",0.0,"[{'STOP_NAME': 'Morradoo', 'LATITUDE': -38.354...",Morradoo,-38.354033,...,405.0,793.25,427.5,661.0,475.0,640.75,502.5,710.25,520.0,665.0
32,South Eastern Melbourne,Bangholme,20131,Dandenong,23.5104,POLYGON ((145.1458324480001 -38.05420847099998...,0.0,"[{'STOP_NAME': 'Bonbeach', 'LATITUDE': -38.062...",Bonbeach,-38.062945,...,350.0,2095.75,350.0,1929.25,359.25,1970.25,393.75,1794.0,440.0,1563.0
35,South Eastern Melbourne,Bayles,20171,Pakenham,23.7205,POLYGON ((145.6011151440001 -38.18290164399997...,0.0,"[{'STOP_NAME': 'Nar Nar Goon', 'LATITUDE': -38...",Nar Nar Goon,-38.081592,...,353.75,2278.75,367.5,1930.5,395.0,1932.25,443.75,1772.0,480.0,1693.0


In [67]:
# rename domain data so that has a clearer title for the combined data
rental_data2024_df = rental_data2024_df.rename(columns={"Standard_Suburb": "SAL_suburb",
                                                        "average_beds":	"2024_average_beds",
                                                        "average_baths": "2024_average_baths",	
                                                        "average_parking": "2024_average_parking",
                                                        "average_cost":	"average_weekly_rent",
                                                        "counts": "2024_property_count"
                                                        })
rental_data2024_df['year'] = "2024"
rental_data2024_df

Unnamed: 0,SAL_suburb,2024_average_beds,2024_average_baths,2024_average_parking,average_weekly_rent,2024_property_count,year
0,Ivanhoe East,2.000000,1.000000,1.000000,545.000000,2,2024
1,Ashwood,2.818182,1.727273,2.000000,719.454545,11,2024
2,Albanvale,2.333333,1.000000,2.666667,398.333333,3,2024
3,Brighton,3.216667,1.966667,1.866667,1004.433333,60,2024
4,Sandringham,2.090909,1.363636,1.818182,587.727273,11,2024
...,...,...,...,...,...,...,...
440,South Morang,3.000000,1.894737,1.578947,534.210526,19,2024
441,Gruyere,3.000000,2.000000,5.000000,900.000000,1,2024
442,Maribyrnong,2.145833,1.520833,1.270833,568.229167,48,2024
443,South Yarra,1.775510,1.377551,0.801020,650.193878,196,2024


In [68]:
# merge the domain data
df_filtered  = df_filtered.merge(rental_data2024_df, on=['SAL_suburb', 'year'], how='left', suffixes=('', '_new'))

# replace the 2024 march quarterly data if we have scraped data on that suburb
df_filtered['average_weekly_rent'] = df_filtered['average_weekly_rent_new'].combine_first(df_filtered['average_weekly_rent'])
df_filtered['Average_household_size'] = df_filtered['2024_average_beds'].combine_first(df_filtered['Average_household_size'])
df_filtered.head(20)["Average_household_size"]

0     2.800000
1     2.800000
2     2.800000
3     2.800000
4     2.800000
5     2.800000
6     2.800000
7     2.800000
8     3.571429
9     2.800000
10    2.800000
11    2.800000
12    2.800000
13    2.100000
14    2.100000
15    2.100000
16    2.100000
17    2.100000
18    2.100000
19    2.100000
Name: Average_household_size, dtype: float64

In [69]:
# drop now unnecessary columns
df_filtered = df_filtered.drop(columns=["average_weekly_rent_new", "2024_average_beds"])
df_filtered[df_filtered["SAL_suburb"]=="Portsea"]

Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,healthcare,industrial,public_transport,recreation,residential,shopping,inflation,2024_average_baths,2024_average_parking,2024_property_count
1846,Mornington Peninsula,Portsea,22113,Dromana-Portsea,11.3277,POLYGON ((144.7159739350001 -38.31873080399998...,0.0,"[{'STOP_NAME': 'Marshall', 'LATITUDE': -38.198...",Marshall,-38.198549,...,0.0,0.0,0.0,5.0,0.0,0.0,1.3,,,
1847,Mornington Peninsula,Portsea,22113,Dromana-Portsea,11.3277,POLYGON ((144.7159739350001 -38.31873080399998...,0.0,"[{'STOP_NAME': 'Marshall', 'LATITUDE': -38.198...",Marshall,-38.198549,...,0.0,0.0,0.0,5.0,0.0,0.0,2.1,,,
1848,Mornington Peninsula,Portsea,22113,Dromana-Portsea,11.3277,POLYGON ((144.7159739350001 -38.31873080399998...,0.0,"[{'STOP_NAME': 'Marshall', 'LATITUDE': -38.198...",Marshall,-38.198549,...,0.0,0.0,0.0,8.0,0.0,0.0,1.9,,,
1849,Mornington Peninsula,Portsea,22113,Dromana-Portsea,11.3277,POLYGON ((144.7159739350001 -38.31873080399998...,0.0,"[{'STOP_NAME': 'Marshall', 'LATITUDE': -38.198...",Marshall,-38.198549,...,0.0,0.0,0.0,12.0,7.0,0.0,1.3,,,
1850,Mornington Peninsula,Portsea,22113,Dromana-Portsea,11.3277,POLYGON ((144.7159739350001 -38.31873080399998...,0.0,"[{'STOP_NAME': 'Marshall', 'LATITUDE': -38.198...",Marshall,-38.198549,...,0.0,0.0,0.0,14.0,7.0,0.0,2.2,,,
1851,Mornington Peninsula,Portsea,22113,Dromana-Portsea,11.3277,POLYGON ((144.7159739350001 -38.31873080399998...,0.0,"[{'STOP_NAME': 'Marshall', 'LATITUDE': -38.198...",Marshall,-38.198549,...,0.0,0.0,0.0,15.0,7.0,0.0,1.1,,,
1852,Mornington Peninsula,Portsea,22113,Dromana-Portsea,11.3277,POLYGON ((144.7159739350001 -38.31873080399998...,0.0,"[{'STOP_NAME': 'Marshall', 'LATITUDE': -38.198...",Marshall,-38.198549,...,0.0,0.0,0.0,15.0,7.0,0.0,5.1,,,
1853,Mornington Peninsula,Portsea,22113,Dromana-Portsea,11.3277,POLYGON ((144.7159739350001 -38.31873080399998...,0.0,"[{'STOP_NAME': 'Marshall', 'LATITUDE': -38.198...",Marshall,-38.198549,...,0.0,0.0,0.0,15.0,7.0,0.0,7.0,,,
1854,Mornington Peninsula,Portsea,22113,Dromana-Portsea,11.3277,POLYGON ((144.7159739350001 -38.31873080399998...,0.0,"[{'STOP_NAME': 'Marshall', 'LATITUDE': -38.198...",Marshall,-38.198549,...,0.0,0.0,0.0,15.58,7.0,0.0,3.6,2.0,1.833333,6.0
1855,Mornington Peninsula,Portsea,22113,Dromana-Portsea,11.3277,POLYGON ((144.7159739350001 -38.31873080399998...,0.0,"[{'STOP_NAME': 'Marshall', 'LATITUDE': -38.198...",Marshall,-38.198549,...,0.0,0.0,0.0,15.829,7.0,0.0,3.0,,,


In [70]:
# list out columns with missing values
df_filtered = df_filtered.drop(columns="Unnamed: 0")
missing_values = df_filtered.isnull().sum()
pd.set_option('display.max_rows', 180)
missing_values

Suburb Cluster                              0
SAL_suburb                                  0
SAL_CODE                                    0
Suburb Group                                0
AREASQKM21                                  0
geometry                                    0
station_count_in_suburb                     0
closest_stations                            0
closest_station_1_STOP_NAME                 0
closest_station_1_LATITUDE                  0
closest_station_1_LONGITUDE                 0
closest_station_2_STOP_NAME                 0
closest_station_2_LATITUDE                  0
closest_station_2_LONGITUDE                 0
suburb_centre_latitude                      0
suburb_centre_longitude                     0
distance_to_CBD                             0
time_to_CBD                                 0
distance_to_station                         0
time_to_station                             0
Cemeteries                                  0
Civic squares and promenades      

In [71]:
df_filtered

Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,geometry,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,...,healthcare,industrial,public_transport,recreation,residential,shopping,inflation,2024_average_baths,2024_average_parking,2024_property_count
0,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,0.0,9.0,0.0,35.0,88.0,0.0,1.3,,,
1,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,0.0,9.0,0.0,43.0,88.0,2.0,2.1,,,
2,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,1.0,9.0,0.0,46.0,91.0,3.0,1.9,,,
3,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,1.0,8.0,0.0,46.0,91.0,3.0,1.3,,,
4,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,POLYGON ((143.8181875020001 -37.55899234999998...,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,...,1.0,8.0,0.0,49.0,91.0,3.0,2.2,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7353,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,0.0,11.0,5.0,0.0,3.6,2.0,2.0,1.0
7354,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,0.0,11.0,5.0,0.0,3.0,,,
7355,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,0.0,11.0,5.0,0.0,2.7,,,
7356,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,POLYGON ((145.1400128940001 -38.01510086099995...,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,...,0.0,0.0,0.0,11.0,5.0,0.0,2.7,,,


The domain data columns do not fit indexing structure of year and suburb, as all this data is on 2024. Hence why so many values are missing in these columns as these will be rows will a year entry of not 2024. These columns will have to be preprocessed later on when we are finished analysising them.

In [72]:
# output our combined dataset
df_filtered.to_csv('../data/curated/FINAL_combined.csv', index=False)
df_filtered = df_filtered.drop(columns=["geometry"])
df_filtered.to_csv('../data/curated/FINAL_combined_no_poly.csv', index=False)
df_filtered

Unnamed: 0,Suburb Cluster,SAL_suburb,SAL_CODE,Suburb Group,AREASQKM21,station_count_in_suburb,closest_stations,closest_station_1_STOP_NAME,closest_station_1_LATITUDE,closest_station_1_LONGITUDE,...,healthcare,industrial,public_transport,recreation,residential,shopping,inflation,2024_average_baths,2024_average_parking,2024_property_count
0,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,143.820068,...,0.0,9.0,0.0,35.0,88.0,0.0,1.3,,,
1,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,143.820068,...,0.0,9.0,0.0,43.0,88.0,2.0,2.1,,,
2,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,143.820068,...,1.0,9.0,0.0,46.0,91.0,3.0,1.9,,,
3,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,143.820068,...,1.0,8.0,0.0,46.0,91.0,3.0,1.3,,,
4,Ballarat,Alfredton,20023,Wendouree-Alfredton,8.0095,0.0,"[{'STOP_NAME': 'Wendouree', 'LATITUDE': -37.53...",Wendouree,-37.539716,143.820068,...,1.0,8.0,0.0,49.0,91.0,3.0,2.2,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7353,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,145.102150,...,0.0,0.0,0.0,11.0,5.0,0.0,3.6,2.0,2.0,1.0
7354,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,145.102150,...,0.0,0.0,0.0,11.0,5.0,0.0,3.0,,,
7355,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,145.102150,...,0.0,0.0,0.0,11.0,5.0,0.0,2.7,,,
7356,Southern Melbourne,Waterways,22720,Mentone-Parkdale-Mordialloc,1.6949,0.0,"[{'STOP_NAME': 'Aspendale', 'LATITUDE': -38.02...",Aspendale,-38.027220,145.102150,...,0.0,0.0,0.0,11.0,5.0,0.0,2.7,,,
