The liveability score is defined as the weighted average of each postcode's rank on the features below:

1. Housing affordability
2. Access to public transport
3. Crime
4. Access to city center
5. Proximity to parks


# 0.Setup

In [38]:
import sys
sys.path.append('../')
from scripts.proximity import proximity_sjoin

In [39]:
import pandas as pd
import numpy as np
import geopandas as gpd
# Rental listings: the main table that every metric in this notebook is derived from.
rental_data = pd.read_csv('../data/curated/rental-17-24.csv')
# Postcode boundary polygons; only the postcode label and its geometry are kept.
suburb_shape = gpd.read_file('../data/curated/shapefiles/postcode/POSTCODE_POLYGON.shp')
suburb_shape = suburb_shape[['POSTCODE', 'geometry']]
# Table handed to proximity_sjoin for the public-transport metric.
ptv_time = pd.read_csv('../data/raw/time_ptv.csv')
# Crime workbook: sheet 'Table 03', first row used as the header.
crime_data = pd.read_excel('../data/landing/crime2024.xlsx', header=0, sheet_name='Table 03')

In [40]:
# Turn each listing into a point (lng/lat, EPSG:7844) and attach the postcode
# polygon that contains it.
rental_gdf = gpd.GeoDataFrame(
    rental_data,
    geometry=gpd.points_from_xy(rental_data.lng, rental_data.lat),
    crs='EPSG:7844',
)
rental_gdf = gpd.sjoin(rental_gdf, suburb_shape, how='left', predicate='within')
rental_gdf = rental_gdf.drop(columns=['index_right'])
# The join must leave the row count unchanged ...
assert len(rental_gdf) == len(rental_data)
# ... and every listing must have been matched to a postcode.
assert not rental_gdf['POSTCODE'].isnull().any()
rental_gdf = rental_gdf.rename(columns={'POSTCODE': 'postcode'})

In [41]:
# Restrict the analysis to listings from 2023.
rental_gdf = rental_gdf.loc[rental_gdf['year'] == 2023]

## Postcodes with more than `threshold` rental listings (currently 0, i.e. every postcode with at least one listing)


In [42]:
# Minimum-listing cut-off: a postcode is kept only if it has MORE than
# `threshold` listings. With 0, any postcode with at least one listing passes.
threshold = 0
count_ins = (
    rental_gdf.groupby('postcode')
    .size()
    .reset_index(name='count')
)
valid_postcode_list = count_ins.loc[count_ins['count'] > threshold, 'postcode'].tolist()

In [43]:
def postcode_count_filter(df, valid_postcode_list):
    """Return the rows of ``df`` whose ``postcode`` is in ``valid_postcode_list``.

    The original index labels and row order of ``df`` are preserved.
    """
    is_valid = df['postcode'].isin(valid_postcode_list)
    return df.loc[is_valid]

# 1.Housing affordability

Housing affordability can be expressed as the ratio of housing costs to gross household income (ABS 2022a).

In [44]:
# Plain-DataFrame view of the listings, then an explicit copy of the 2023 rows,
# so that adding a column below cannot write into rental_gdf or trigger a
# chained-assignment warning.
affordability = pd.DataFrame(rental_gdf)
affordability = affordability[affordability['year'] == 2023].copy()
# Rent as a share of income: the /7*365 scaling annualises rented_price as if
# it were a weekly figure.
# NOTE(review): presumably median_income is an annual amount — confirm against the data dictionary.
affordability['price/income'] = (
    (affordability['rented_price'] / 7 * 365) / affordability['median_income']
)
# Average ratio per postcode.
affordability = affordability.groupby('postcode').agg({'price/income': 'mean'}).reset_index()

In [45]:
# Lowest rent-to-income ratio first.
affordability = affordability.sort_values('price/income', ascending=True)

In [None]:
# Rank the retained postcodes by their position in the sorted affordability table (1 = first row).
affordability_rank = (
    postcode_count_filter(affordability, valid_postcode_list)
    .reset_index(drop=True)
    .assign(rank=lambda ranked: ranked.index + 1)
    .loc[:, ['postcode', 'rank']]
)
affordability_rank

In [47]:
# Persist the per-postcode affordability table without the index column.
affordability.to_csv(path_or_buf='../data/curated/affordability.csv', index=False)

# 2.Access to Public Transport

Defined as the average time to public transport across the properties in each postcode.

In [None]:
# Attach the 'ptv_time' value to each listing (geometry column dropped first).
public_transport = proximity_sjoin(
    pd.DataFrame(rental_gdf).drop(columns=['geometry']),
    ptv_time,
    'ptv_time',
)
# Every listing must have received a value.
assert not public_transport['ptv_time'].isna().any()
# -1 marks an error entry; discard those rows.
public_transport = public_transport.loc[public_transport['ptv_time'] != -1]

In [None]:
# get average time to ptv stops
public_transport_res = public_transport.groupby('postcode').agg({'ptv_time': 'mean'}).reset_index()
# get count
count_by_suburb = public_transport.groupby('postcode').size().reset_index(name='count')
public_transport_res = pd.merge(public_transport_res, count_by_suburb, on='postcode')
# sort
public_transport_res.sort_values('ptv_time', ascending=True, inplace=True)
# join suburb names
public_transport_res

In [None]:
# Rank the retained postcodes by their position in the sorted table (1 = first row).
ptv_rank = (
    postcode_count_filter(public_transport_res, valid_postcode_list)
    .reset_index(drop=True)
    .assign(rank=lambda ranked: ranked.index + 1)
    .loc[:, ['postcode', 'rank']]
)
ptv_rank

# 3.Crime

https://www.crimestatistics.vic.gov.au/crime-statistics/latest-crime-data-by-area

In [51]:
# Total recorded incidents per postcode. The incidents must be summed: counting
# rows with groupby().size() only measures how many table rows mention a
# postcode, not how many incidents were recorded there.
# NOTE(review): every row of the sheet is included; if 'Table 03' spans several
# years, consider restricting it to the year being analysed — confirm against the workbook.
crime_rank = (
    crime_data.groupby('Postcode')['Incidents Recorded']
    .sum()
    .reset_index(name='count')
    .sort_values('count')
    .reset_index(drop=True)
    .rename(columns={'Postcode': 'postcode'})
)
# Cast to str so the values can be matched against valid_postcode_list.
crime_rank['postcode'] = crime_rank['postcode'].astype(str)
crime_rank = postcode_count_filter(crime_rank, valid_postcode_list).reset_index(drop=True)

In [52]:
# population
# Mean of the listings' population value for each postcode, limited to the retained postcodes.
population = rental_gdf.groupby('postcode')['population'].mean().reset_index()
population = postcode_count_filter(population, valid_postcode_list).reset_index(drop=True)
# Left join: keep every crime row even when no population figure is found.
crime_rank = crime_rank.merge(population, how='left', on='postcode')

In [None]:
# Crime count per head of population, ranked from lowest rate (rank 1) upwards.
crime_rank = (
    crime_rank
    .assign(crime_rate=lambda frame: frame['count'] / frame['population'])
    .sort_values('crime_rate', ascending=True)
    .reset_index(drop=True)
    .assign(rank=lambda ranked: ranked.index + 1)
    .loc[:, ['postcode', 'rank']]
)
crime_rank

# 4.Proximity to CBDs

In [54]:
# Drop error entries (time_city == -1), then average time_city per postcode.
city_rank = rental_gdf.loc[rental_gdf['time_city'] != -1]
city_rank = city_rank.groupby('postcode')['time_city'].mean().reset_index()


In [None]:
# Shortest average time_city first; rank = position among the retained postcodes.
city_rank = (
    postcode_count_filter(
        city_rank.sort_values('time_city', ascending=True),
        valid_postcode_list,
    )
    .reset_index(drop=True)
    .assign(rank=lambda ranked: ranked.index + 1)
    .loc[:, ['postcode', 'rank']]
)
city_rank

# 5. Proximity to Parks

In [None]:
park = pd.read_csv('../data/raw/time_park.csv')
# Attach the 'time_park' value to each listing (geometry column dropped first).
park_rental = proximity_sjoin(
    pd.DataFrame(rental_gdf).drop(columns=['geometry']),
    park,
    'time_park',
)
# -1 marks an error entry; discard those rows.
park_rental = park_rental.loc[park_rental['time_park'] != -1]
# Mean time_park per postcode.
park_rank = park_rental.groupby('postcode')['time_park'].mean().reset_index()

In [57]:
# Shortest average time_park first, then keep only the retained postcodes.
park_rank = postcode_count_filter(
    park_rank.sort_values('time_park', ascending=True).reset_index(drop=True),
    valid_postcode_list,
).reset_index(drop=True)

In [None]:
# Rank = 1-based position in the current index.
park_rank = (
    park_rank
    .assign(rank=lambda ranked: ranked.index + 1)
    .loc[:, ['postcode', 'rank']]
)
park_rank

# inf.Combine

In [59]:

# Relative weights of the combined liveability rank and the affordability rank
# in the final score.
liveability_weight, affordability_weight = 4, 1

## liveability

In [60]:
# One suffix and one weight per liveability metric, in matching order.
suffixes = ['_ptv', '_crime', '_city', '_park']
weight = [1, 1, 1, 1]
# Give each metric's 'rank' column a unique name so the tables can be merged.
liveability_list = [
    ranked.rename(columns={'rank': 'rank' + suffix})
    for ranked, suffix in zip([ptv_rank, crime_rank, city_rank, park_rank], suffixes)
]

In [61]:
# Start from the first metric; the rename leaves the frame unchanged if its
# 'rank' column has already been suffixed.
joined_rank = liveability_list[0].rename(columns={'rank': 'rank' + suffixes[0]})
# Join the remaining metrics on postcode (default inner join).
for ranked in liveability_list[1:]:
    joined_rank = joined_rank.merge(ranked, on='postcode')

In [62]:
# Weighted mean of the per-metric ranks, computed column-wise instead of with a
# Python lambda per row. Each term is rank * weight / sum(weight), summed in
# the order given by `suffixes`.
joined_rank['liveability_rank'] = sum(
    joined_rank['rank' + suffix] * metric_weight / sum(weight)
    for suffix, metric_weight in zip(suffixes, weight)
)

In [None]:
# Lowest liveability_rank first, then display.
joined_rank = joined_rank.sort_values('liveability_rank', ascending=True)
joined_rank

## affordability joined

In [None]:
# Bring in the affordability rank under its own column name (default inner join on postcode).
joined_rank = joined_rank.merge(
    affordability_rank.rename(columns={'rank': 'rank_affordability'}),
    on='postcode',
)
joined_rank

In [65]:
# Final score: weighted mean of the liveability rank and the affordability
# rank, computed column-wise instead of with a Python lambda per row.
joined_rank['final_rank'] = (
    joined_rank['liveability_rank'] * liveability_weight
    + joined_rank['rank_affordability'] * affordability_weight
) / (liveability_weight + affordability_weight)
# Lowest final_rank first.
joined_rank.sort_values('final_rank', ascending=True, inplace=True)

In [None]:
# First ten rows, shown with a fresh 0-based index.
joined_rank.head(10).reset_index(drop=True)

### save

In [67]:
# Persist the combined table (all rank columns) without the index column.
joined_rank.to_csv(path_or_buf='../data/curated/liveability_final.csv', index=False)

In [None]:
# Display the first ten rows.
joined_rank.iloc[:10]

In [69]:
# Order by final score and turn the resulting row position into a 1-based rank.
joined_rank = joined_rank.sort_values('final_rank', ascending=True).reset_index(drop=True)
joined_rank['rank'] = joined_rank.index + 1

In [70]:
# Persist only the postcode and its final rank, without the index column.
joined_rank.loc[:, ['postcode', 'rank']].to_csv(
    path_or_buf='../data/curated/liveability_rank.csv',
    index=False,
)