Liveability score is defined to be the weighted average of below features

1. Housing affordability
2. Access to public transport
3. Access to city center


# 0.Setup

In [1]:
import sys
sys.path.append('../')
from scripts.proximity import proximity_sjoin

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
# rental data used as the main data for analysis
rental_data = pd.read_csv('../data/curated/rental-17-24.csv')
suburb_shape = gpd.read_file('../data/curated/shapefiles/postcode/POSTCODE_POLYGON.shp')[['POSTCODE', 'geometry']]
ptv_time = pd.read_csv('../data/raw/time_ptv.csv')
crime_data = pd.read_excel('../data/landing/crime2024.xlsx', sheet_name='Table 03', header=0)

In [3]:
rental_gdf = gpd.GeoDataFrame(rental_data, geometry=gpd.points_from_xy(rental_data.lng, rental_data.lat), crs='EPSG:7844')
rental_gdf = gpd.sjoin(rental_gdf, suburb_shape, how='left', predicate='within')\
    .drop(columns=['index_right'])
assert rental_gdf.shape[0] == rental_data.shape[0]  # check if all rows are joined
assert rental_gdf[rental_gdf['POSTCODE'].isnull()].shape[0] == 0 # check if there are any rows that are not joined
rental_gdf.rename(columns={'POSTCODE': 'postcode'}, inplace=True)

In [4]:
rental_gdf = rental_gdf[rental_gdf['year']== 2023]

## postcodes with at least 100 instances


In [5]:
threshold = 100
count_ins = rental_gdf.groupby('postcode').size().reset_index(name='count')
valid_postcode_list = count_ins[count_ins['count'] > threshold]['postcode'].tolist()

In [6]:
def postcode_count_filter(df, valid_postcode_list):
    return df[df['postcode'].isin(valid_postcode_list)]

# 1.Housing affordability

Housing affordability can be expressed as the ratio of housing costs to gross household income (ABS 2022a).

In [7]:
affordability = pd.DataFrame(rental_gdf) # making a copy
affordability = affordability[affordability['year']==2023]
# get rent as a percentage of income (just to make it meaningful)
affordability['price/income'] = affordability['rented_price']/7*365 / affordability['median_income']
affordability = affordability.groupby('postcode').agg({'price/income': 'mean'}).reset_index()

In [8]:
affordability.sort_values('price/income', ascending=False, inplace=True)

In [9]:
affordability_rank = postcode_count_filter(affordability,valid_postcode_list).reset_index(drop=True)
affordability_rank['rank'] = affordability_rank.index + 1
affordability_rank = affordability_rank[['postcode', 'rank']]
affordability_rank

Unnamed: 0,postcode,rank
0,3000,1
1,3151,2
2,3108,3
3,3150,4
4,3109,5
...,...,...
206,3825,207
207,3400,208
208,3184,209
209,3300,210


In [10]:
affordability.to_csv('../data/curated/affordability.csv', index=False)

# 2.Access to Public Transport

defined as the average time to public transport for properties in that suburb

In [11]:
public_transport = proximity_sjoin(pd.DataFrame(rental_gdf).drop(columns=['geometry']), ptv_time,'ptv_time')
assert public_transport[public_transport['ptv_time'].isna()].shape[0] == 0
public_transport = public_transport[public_transport['ptv_time'] != -1] # filter out error entries




In [12]:
# get average time to ptv stops
public_transport_res = public_transport.groupby('postcode').agg({'ptv_time': 'mean'}).reset_index()
# get count
count_by_suburb = public_transport.groupby('postcode').size().reset_index(name='count')
public_transport_res = pd.merge(public_transport_res, count_by_suburb, on='postcode')
# sort
public_transport_res.sort_values('ptv_time', ascending=True, inplace=True)
# join suburb names
public_transport_res

Unnamed: 0,postcode,ptv_time,count
215,3287,5.800000,1
414,3795,13.500000,2
37,3045,18.550000,2
48,3057,55.750000,6
302,3512,57.700000,4
...,...,...,...
224,3323,16397.500000,1
494,3956,18385.797826,46
262,3401,18924.857143,7
454,3874,19883.700000,1


In [13]:
ptv_rank = postcode_count_filter(public_transport_res, valid_postcode_list).reset_index(drop=True)
# get postcode and index
ptv_rank['rank'] = ptv_rank.index +1
ptv_rank = ptv_rank[['postcode', 'rank']]
ptv_rank

Unnamed: 0,postcode,rank
0,3000,1
1,3003,2
2,3055,3
3,3008,4
4,3143,5
...,...,...
206,3936,207
207,3631,208
208,3875,209
209,3753,210


# 3.Crime

https://www.crimestatistics.vic.gov.au/crime-statistics/latest-crime-data-by-area

In [14]:
crime_rank = crime_data[['Postcode', 'Incidents Recorded']].groupby('Postcode').size().reset_index(name='count').sort_values('count').reset_index(drop=True).rename(columns={'Postcode': 'postcode'})
crime_rank['postcode'] = crime_rank['postcode'].astype(str)
crime_rank = postcode_count_filter(crime_rank, valid_postcode_list).reset_index(drop=True)

In [15]:
# population
population = rental_gdf.groupby('postcode').agg({'population': 'mean'}).reset_index()
population = postcode_count_filter(population, valid_postcode_list).reset_index(drop=True)
crime_rank = pd.merge(crime_rank, population, on='postcode', how='left')

In [16]:
crime_rank['crime_rate'] = crime_rank['count'] / crime_rank['population']
crime_rank.sort_values('crime_rate', ascending=True, inplace=True)
crime_rank.reset_index(drop=True, inplace=True)
crime_rank['rank'] = crime_rank.index + 1
crime_rank = crime_rank[['postcode', 'rank']]
crime_rank

Unnamed: 0,postcode,rank
0,3753,1
1,3161,2
2,3059,3
3,3940,4
4,3104,5
...,...,...
206,3551,207
207,3352,208
208,3337,209
209,3550,210


# 4.Proximity to CBDs

In [17]:
city_rank = rental_gdf[rental_gdf['time_city'] != -1] # filter out error entries
city_rank = city_rank.groupby('postcode').agg({'time_city': 'mean'}).reset_index()


In [18]:
city_rank.sort_values('time_city', ascending=True, inplace=True)
city_rank = postcode_count_filter(city_rank, valid_postcode_list).reset_index(drop=True)
city_rank['rank'] = city_rank.index + 1
city_rank = city_rank[['postcode', 'rank']]
city_rank

Unnamed: 0,postcode,rank
0,3220,1
1,3000,2
2,3053,3
3,3006,4
4,3003,5
...,...,...
206,3690,207
207,3875,208
208,3880,209
209,3691,210


# 5.Proximity Parks

In [19]:
park = pd.read_csv('../data/raw/time_park.csv')
# park = gpd.GeoDataFrame(park, geometry=gpd.points_from_xy(park.lng, park.lat), crs='EPSG:7844')
park_rental = proximity_sjoin(pd.DataFrame(rental_gdf).drop(columns=['geometry']), park, 'time_park')
park_rental = park_rental[park_rental['time_park'] != -1] # filter out error entries
park_rank = park_rental.groupby('postcode').agg({'time_park': 'mean'}).reset_index()




In [20]:
park_rank = park_rank.sort_values('time_park', ascending=True).reset_index(drop=True)
park_rank = postcode_count_filter(park_rank, valid_postcode_list).reset_index(drop=True)

In [21]:
park_rank['rank'] = park_rank.index + 1
park_rank = park_rank[['postcode', 'rank']]
park_rank

Unnamed: 0,postcode,rank
0,3975,1
1,3976,2
2,3806,3
3,3803,4
4,3805,5
...,...,...
206,3690,207
207,3691,208
208,3585,209
209,3305,210


# inf.Combine

In [22]:

liveability_weight = 4
affordability_weight = 1

## liveability

In [23]:
liveability_list = [ptv_rank, crime_rank, city_rank, park_rank]
suffixes = ['_ptv', '_crime', '_city', '_park']
weight = [1,1,1,1]
for i, df in enumerate(liveability_list):
    liveability_list[i] = df.rename(columns={'rank': 'rank' + suffixes[i]})

In [24]:
joined_rank = liveability_list[0].rename(columns={'rank': 'rank' + suffixes[0]})
for i in range(1, len(liveability_list)):
    joined_rank = pd.merge(joined_rank, liveability_list[i], on='postcode')

In [25]:
joined_rank['liveability_rank'] = joined_rank.apply(lambda x: sum([x['rank' + suffixes[i]] * weight[i] / sum(weight) for i in range(len(suffixes))]), axis=1)

In [26]:
joined_rank.sort_values('liveability_rank', ascending=True, inplace=True)
joined_rank

Unnamed: 0,postcode,rank_ptv,rank_crime,rank_city,rank_park,liveability_rank
5,3053,6,21,3,91,30.25
20,3161,21,2,53,54,32.50
37,3124,38,16,54,38,36.50
30,3142,31,46,24,50,37.75
4,3143,5,66,35,48,38.50
...,...,...,...,...,...,...
179,3644,180,191,202,202,193.75
207,3631,208,192,191,194,196.25
201,3305,202,175,206,210,198.25
210,3691,211,179,210,208,202.00


## affordability joined

In [27]:
joined_rank = pd.merge(joined_rank, affordability_rank.rename(columns={'rank': 'rank_affordability'}), on='postcode')
joined_rank

Unnamed: 0,postcode,rank_ptv,rank_crime,rank_city,rank_park,liveability_rank,rank_affordability
0,3053,6,21,3,91,30.25,19
1,3161,21,2,53,54,32.50,134
2,3124,38,16,54,38,36.50,96
3,3142,31,46,24,50,37.75,195
4,3143,5,66,35,48,38.50,194
...,...,...,...,...,...,...,...
206,3644,180,191,202,202,193.75,142
207,3631,208,192,191,194,196.25,51
208,3305,202,175,206,210,198.25,113
209,3691,211,179,210,208,202.00,70


In [28]:
joined_rank['final_rank'] = joined_rank.apply(lambda x: (x['liveability_rank']*liveability_weight + x['rank_affordability']*affordability_weight) / (liveability_weight+affordability_weight), axis=1)
joined_rank.sort_values('final_rank', ascending=True, inplace=True)

In [29]:
joined_rank.reset_index(drop=True).head(10)

Unnamed: 0,postcode,rank_ptv,rank_crime,rank_city,rank_park,liveability_rank,rank_affordability,final_rank
0,3053,6,21,3,91,30.25,19,28.0
1,3000,1,83,2,82,42.0,1,33.8
2,3108,65,9,74,53,50.25,3,40.8
3,3008,4,55,9,88,39.0,53,41.8
4,3104,81,5,56,64,51.5,14,44.0
5,3003,2,111,5,87,51.25,25,46.0
6,3129,35,44,75,49,50.75,31,46.8
7,3128,18,51,89,52,52.5,27,47.4
8,3103,48,47,65,66,56.5,11,47.4
9,3124,38,16,54,38,36.5,96,48.4


### save

In [30]:
joined_rank.to_csv('../data/curated/liveability_final.csv', index=False)

In [31]:
joined_rank

Unnamed: 0,postcode,rank_ptv,rank_crime,rank_city,rank_park,liveability_rank,rank_affordability,final_rank
0,3053,6,21,3,91,30.25,19,28.0
7,3000,1,83,2,82,42.00,1,33.8
13,3108,65,9,74,53,50.25,3,40.8
5,3008,4,55,9,88,39.00,53,41.8
17,3104,81,5,56,64,51.50,14,44.0
...,...,...,...,...,...,...,...,...
208,3305,202,175,206,210,198.25,113,181.2
206,3644,180,191,202,202,193.75,142,183.4
204,3377,165,189,183,195,183.00,197,185.8
210,3875,209,204,208,197,204.50,120,187.6


In [34]:
joined_rank.sort_values('final_rank', ascending=True, inplace=True)
joined_rank.reset_index(drop=True, inplace=True)
joined_rank['rank'] = joined_rank.index + 1

In [37]:
joined_rank[['postcode', 'rank']].to_csv('../data/curated/liveability_rank.csv', index=False)