## Livability 

### **Livability Metric**:
- **Safety Index**: Weight based on crime rates. Lower crime rates get higher scores. 
- **Transport Accessibility**: Proximity to public transport stations, with higher frequency of services. 
- **CBD Accessibility**: Proximity to CBD.
- **School Accessibility**: Based on the number of local schools. 
- **Healthcare Access**: Number of hospitals or clinics nearby. 
- **Community Amenities**: Number of parks and reservations, shopping centres, and entertainment facilities. 

### Import Libraries

In [666]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import functions as F  #filtering
import pandas as pd
import matplotlib.pyplot as plt

### Start a Spark session

In [667]:
spark = (
    SparkSession.builder.appName('Livability Analysis')
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

### Inspect datasets

In [668]:
parkres_data = pd.read_csv("../data/curated/parkres/parkres.csv", index_col=0)
property_lost_data = pd.read_csv('../data/curated/property_data/property_lost.csv', index_col=0)
train_hospital_data = pd.read_parquet('../data/curated/final_train_hospital_cbd_dist_data')
recreation_data = pd.read_csv('../data/curated/recreation_cleaned.csv')
shopping_data = pd.read_csv('../data/curated/agg_shopping_centre.csv')
population_data = pd.read_csv('../data/curated/dwellings_household.csv')

In [669]:
feature = pd.read_parquet('../data/curated/features_domain.parquet')

feature.head()

Unnamed: 0,url,price,address,property_type,latitude,longitude,Beds,Baths,Parking,bond,...,P20/P50,P10/P50,Gini coefficient,Top 1%,Top 5%,Top 10%,Lowest Quartile,Second Quartile,Third Quartile,Highest Quartile
0,https://www.domain.com.au/10-allara-court-donv...,"$1,400.00","10 Allara Court, Donvale VIC 3111",Townhouse,-37.77,145.18,4.0,3.0,2.0,9125.0,...,0.32,0.11,0.56,13.1,27.9,39.7,28.5,22.1,19.0,30.4
1,https://www.domain.com.au/7-pine-ridge-donvale...,$750 per week,"7 Pine Ridge, Donvale VIC 3111",House,-37.79,145.18,4.0,2.0,0.0,3259.0,...,0.32,0.11,0.56,13.1,27.9,39.7,28.5,22.1,19.0,30.4
2,https://www.domain.com.au/20-mulsanne-way-donv...,$1300 per week,"20 Mulsanne Way, Donvale VIC 3111",House,-37.8,145.18,5.0,2.0,2.0,5649.0,...,0.32,0.11,0.56,13.1,27.9,39.7,28.5,22.1,19.0,30.4
3,https://www.domain.com.au/3-monterey-crescent-...,$825pw / $3585pcm,"3 Monterey Crescent, Donvale VIC 3111",House,-37.79,145.17,3.0,1.0,5.0,3585.0,...,0.32,0.11,0.56,13.1,27.9,39.7,28.5,22.1,19.0,30.4
4,https://www.domain.com.au/3-49-leslie-street-d...,$680.00,"3/49 Leslie Street, Donvale VIC 3111",Townhouse,-37.78,145.18,3.0,2.0,2.0,2955.0,...,0.32,0.11,0.56,13.1,27.9,39.7,28.5,22.1,19.0,30.4


In [670]:
'''# Display all column names as a list (with no truncation)
columns_list = feature.columns.tolist()
for col in columns_list:
    print(col)'''

'# Display all column names as a list (with no truncation)\ncolumns_list = feature.columns.tolist()\nfor col in columns_list:\n    print(col)'

### Liveable Index Metrics

#### 1. Parks and Reservations Count

In [671]:
# Group by 'sa2_name' to count the number of parks and reservations
parkres_count = parkres_data.groupby('sa2_name').size().reset_index(name='parkres_count')

parkres_count.head()

Unnamed: 0,sa2_name,parkres_count
0,Abbotsford,11
1,Alphington - Fairfield,1
2,Altona,2
3,Altona Meadows,2
4,Altona North,1


#### 2. Property Lost Rate Per Capita

In [672]:
# Disable scientific notation in pandas
pd.set_option('display.float_format', '{:.2f}'.format)

In [673]:
# Grouping by 'sa2_name' and aggregating the required metrics
property_lost_stats = property_lost_data.groupby('sa2_name').agg(
    crime_frequency=('sa2_name', 'size'),  
    total_value_lost=('value of items ($)', 'sum'),  
    total_items_lost=('number of items', 'sum') 
).reset_index()

# Sort by crime_frequency in descending order
property_lost_stats = property_lost_stats.sort_values(by='crime_frequency', ascending=False)

property_lost_stats.head()

Unnamed: 0,sa2_name,crime_frequency,total_value_lost,total_items_lost
187,West Footscray - Tottenham,3287,29285221.68,40017
126,North Melbourne,2848,117641963.71,201522
7,Ashwood - Chadstone,2703,28860046.34,44787
148,Roxburgh Park - North,2604,56605655.19,73377
176,Truganina - South West,2559,73155166.38,95074


In [674]:
# Only keep specific columns
population_data = population_data[['Region', 'ERP_2021']]

# Rename 'Region' to 'sa2_name'
population_data.rename(columns={'Region': 'sa2_name'}, inplace=True)

population_data.head()

Unnamed: 0,sa2_name,ERP_2021
0,Alfredton,16841.0
1,Ballarat,12071.0
2,Buninyong,7229.0
3,Delacombe,10648.0
4,Smythes Creek,4211.0


In [675]:
# Merge population data with property lost stats based on the SA2 name
merged_df = pd.merge(property_lost_stats, population_data, how='outer', on='sa2_name')

# Calculate crime rate per 1,000 residents
merged_df['crime_rate_per_1000'] = (merged_df['crime_frequency'] / merged_df['ERP_2021']) * 1000

# Replace missing population values or zero population with a small positive number to avoid division by zero
merged_df['ERP_2021'] = merged_df['ERP_2021'].fillna(1)
merged_df['ERP_2021'] = merged_df['ERP_2021'].replace(0, 1)

# Recalculate the crime rate after filling missing or zero population values
merged_df['crime_rate_per_1000'] = (merged_df['crime_frequency'] / merged_df['ERP_2021']) * 1000

# Sort by crime rate to see regions with the highest rates
merged_df = merged_df.sort_values(by='crime_rate_per_1000', ascending=False)

merged_df[['sa2_name', 'crime_rate_per_1000']].head()

Unnamed: 0,sa2_name,crime_rate_per_1000
56,Braeside,13964.29
312,Moorabbin Airport,11307.69
287,Melbourne Airport,3500.0
136,Croydon South,481.36
72,Bundoora - West,278.18


#### 3. Hospital Count

In [676]:
# Count unique hospital IDs in each SA2
hospital_count = train_hospital_data.groupby('sa2_name')['nearest_hospital_id'].nunique().reset_index(name='hospital_count')

# Sort the result by 'hospital_count' in descending order
hospital_count = hospital_count.sort_values(by='hospital_count', ascending=False)

hospital_count.head()

Unnamed: 0,sa2_name,hospital_count
22,Blackburn,6
183,Oakleigh - Huntingdale,6
202,Richmond (South) - Cremorne,5
236,Surrey Hills (West) - Canterbury,5
25,Box Hill,5


#### 4. Transportation Accessibility

In [677]:
# Count unique transport station IDs in each SA2
transport_count = train_hospital_data.groupby('sa2_name')['nearest_station_id'].nunique().reset_index(name='station_count')

# Sort the result by 'station_count' in descending order
transport_count = transport_count.sort_values(by='station_count', ascending=False)

transport_count.head()

Unnamed: 0,sa2_name,station_count
44,Camberwell,8
34,Brunswick West,8
152,Malvern East,8
51,Caulfield - North,7
15,Balwyn North,7


#### 5. CBD Accessibility

In [678]:
# Calculate the average distance to the CBD for each SA2
cbd_accessibility = train_hospital_data.groupby('sa2_name')['cbd_distance_km'].mean().reset_index(name='avg_cbd_distance_km')

# Sort the result by 'avg_cbd_distance_km' in ascending order
cbd_accessibility = cbd_accessibility.sort_values(by='avg_cbd_distance_km', ascending=True)

cbd_accessibility.head()

Unnamed: 0,sa2_name,avg_cbd_distance_km
157,Melbourne CBD - North,0.86
158,Melbourne CBD - West,1.12
156,Melbourne CBD - East,1.16
46,Carlton,1.86
268,West Melbourne - Residential,1.88


#### 6. Recreation / Entertainment Facilities Count

In [679]:
recreation_count = recreation_data.copy()

# Move 'sa2_name' to the first position
cols = ['sa2_name'] + [col for col in recreation_count.columns if col != 'sa2_name']
recreation_count = recreation_count[cols]

# Rename the column 'Facility Count' to 'recreation_facility_count'
recreation_count.rename(columns={'Facility Count': 'recreation_facilities_count'}, inplace=True)

recreation_count.drop(columns=['sa2_code'], inplace=True)

# Sort the result by 'recreation_facility_count' in descending order
recreation_count = recreation_count.sort_values(by='recreation_facilities_count', ascending=False)

recreation_count.head()

Unnamed: 0,sa2_name,recreation_facilities_count
91,Seaford (Vic.),480
119,Yarra Valley,467
79,Point Cook - North West,462
83,Reservoir - South West,439
18,Caroline Springs,385


#### 7. Shopping Centres Count

In [680]:
# Make a copy of the shopping data
shopping_count = shopping_data.copy()

# Drop unnecessary columns 'mean_stores' and 'total_stores'
shopping_count.drop(columns=['mean_stores', 'total_stores'], inplace=True)

# Rename the column 'count_shopping_centres' to 'shopping_centres_count'
shopping_count.rename(columns={'count_shopping_centres': 'shopping_centres_count'}, inplace=True)

# Sort the result by 'shopping_centres_count' in descending order
shopping_count = shopping_count.sort_values(by='shopping_centres_count', ascending=False)

shopping_count.head()

Unnamed: 0,sa2_name,shopping_centres_count
84,Melbourne CBD - East,10
61,Glen Waverley - West,3
85,Melbourne CBD - North,3
137,Warrnambool - North,3
40,Docklands,3


#### 8. School Count

In [681]:
# Count unique closest schools in each SA2
school_count= feature.groupby('sa2_name')['closest_school'].nunique().reset_index(name='school_count')

# Sort the result by 'avg_cbd_distance_km' in descending order
school_count = school_count.sort_values(by='school_count', ascending=False)

school_count.head()

Unnamed: 0,sa2_name,school_count
265,Werribee - South,13
195,Preston - East,12
230,St Kilda East,11
14,Balwyn,11
178,North Melbourne,10


### 9. Rent Count

In [682]:
# Calculate the average rent (price) by SA2
rent_count = feature.groupby('sa2_name').agg({'extracted_price': 'mean'}).reset_index()

# Sort the result by the average rent (price) in ascending order
rent_count = rent_count.sort_values(by='extracted_price', ascending=False)

rent_count.head()

Unnamed: 0,sa2_name,extracted_price
261,Warrandyte - Wonga Park,1700.0
18,Beaumaris,1104.69
29,Brighton East,1060.0
236,Surrey Hills (West) - Canterbury,1034.52
12,Aspendale Gardens - Waterways,1000.0


### Calculate Liveablity Index

In [683]:
# Merge all relevant data on 'sa2_name'
liveable_count = school_count.merge(transport_count, on='sa2_name', how='outer')\
    .merge(hospital_count, on='sa2_name', how='outer')\
    .merge(parkres_count, on='sa2_name', how='outer')\
    .merge(recreation_count, on='sa2_name', how='outer')\
    .merge(shopping_count, on='sa2_name', how='outer')\
    .merge(rent_count, on='sa2_name', how='outer')\
    .merge(merged_df, on='sa2_name', how='outer')\
    .merge(cbd_accessibility, on='sa2_name', how='outer')

# Fill any NaN values with 0 for counting purposes
liveable_count.fillna(0, inplace=True)

In [684]:
liveable_count.head(1)

Unnamed: 0,sa2_name,school_count,station_count,hospital_count,parkres_count,recreation_facilities_count,shopping_centres_count,extracted_price,crime_frequency,total_value_lost,total_items_lost,ERP_2021,crime_rate_per_1000,avg_cbd_distance_km
0,Abbotsford,5.0,4.0,4.0,11.0,0.0,0.0,699.9,677.0,10978545.92,18992.0,9258.0,73.13,4.64


In [685]:
# Define the weights for each factor (you can adjust these weights)
school_weight = 0.1
station_weight = 0.2
hospital_weight = 0.15
park_weight = 0.05
recreation_weight = 0.05
shopping_weight = 0.05
crime_weight = 0.1
cbd_weight = 0.1
rent_weight = 0.2

In [686]:
# Z-score normalization for each element
liveable_count['school_count'] = (liveable_count['school_count'] - liveable_count['school_count'].min()) / (liveable_count['school_count'].max() - liveable_count['school_count'].min())
liveable_count['station_count'] = (liveable_count['station_count'] - liveable_count['station_count'].min()) / (liveable_count['station_count'].max() - liveable_count['station_count'].min())
liveable_count['hospital_count'] = (liveable_count['hospital_count'] - liveable_count['hospital_count'].min()) / (liveable_count['hospital_count'].max() - liveable_count['hospital_count'].min())
liveable_count['parkres_count'] = (liveable_count['parkres_count'] - liveable_count['parkres_count'].min()) / (liveable_count['parkres_count'].max() - liveable_count['parkres_count'].min())
liveable_count['recreation_facilities_count'] = (liveable_count['recreation_facilities_count'] - liveable_count['recreation_facilities_count'].min()) / (liveable_count['recreation_facilities_count'].max() - liveable_count['recreation_facilities_count'].min())
liveable_count['shopping_centres_count'] = (liveable_count['shopping_centres_count'] - liveable_count['shopping_centres_count'].min()) / (liveable_count['shopping_centres_count'].max() - liveable_count['shopping_centres_count'].min())
liveable_count['avg_cbd_distance_km'] = (liveable_count['avg_cbd_distance_km'] - liveable_count['avg_cbd_distance_km'].min()) / (liveable_count['avg_cbd_distance_km'].max() - liveable_count['avg_cbd_distance_km'].min())
liveable_count['crime_rate_per_1000'] = (liveable_count['crime_rate_per_1000'] - liveable_count['crime_rate_per_1000'].min()) / (liveable_count['crime_rate_per_1000'].max() - liveable_count['crime_rate_per_1000'].min())
liveable_count['extracted_price'] = (liveable_count['extracted_price'] - liveable_count['extracted_price'].min()) / (liveable_count['extracted_price'].max() - liveable_count['extracted_price'].min())

In [687]:
# Calculate the livability index based on the weighted sum of the normalized factors
liveable_count['liveability_index'] = (
    liveable_count['school_count'] * school_weight +
    liveable_count['station_count'] * station_weight +
    liveable_count['hospital_count'] * hospital_weight +
    liveable_count['parkres_count'] * park_weight +
    liveable_count['recreation_facilities_count'] * recreation_weight +
    liveable_count['avg_cbd_distance_km'] * cbd_weight -
    liveable_count['crime_rate_per_1000'] * crime_weight -  # Subtract crime rate
    liveable_count['extracted_price'] * rent_weight  # Rent is considered negative
)

In [688]:
# Sort by liveability index in descending order (best liveability first)
liveable_count = liveable_count.sort_values(by='liveability_index', ascending=False)

# Reset the index, drop the old index, and set the new index to start from 1
liveable_count = liveable_count.reset_index(drop=True)
liveable_count.index = liveable_count.index + 1

# Get the liveability index of the 10th position
tenth_value = liveable_count.iloc[9]['liveability_index']

# Filter the dataframe to include all rows with liveability index greater than or equal to the 10th position
top_liveable = liveable_count[liveable_count['liveability_index'] >= tenth_value]

top_liveable[['sa2_name', 'liveability_index']]

Unnamed: 0,sa2_name,liveability_index
1,Frankston,0.33
2,Camberwell,0.31
3,Caulfield - North,0.29
4,Richmond - North,0.28
5,Malvern East,0.27
6,Brunswick West,0.27
7,North Melbourne,0.27
8,Oakleigh - Huntingdale,0.26
9,Hawthorn East,0.26
10,Surrey Hills (West) - Canterbury,0.26
