## Calculate average distance to nearest public transport

### Note:
Before running this notebook, make sure you have the rental history datasets.

Make sure to run `scrape_rent_history.py` and `preprocess_rental_history.ipynb` before running this notebook

Make sure you have the Public transport data called `Public Transport Victoria` from the [Google Drive](https://drive.google.com/drive/folders/1JzqWIVPAHOvMeD0X1u3RefYBSj1PehZ0?usp=sharing) and save it in the `data/map/` directory.

Also download the `VIC Localities Shape File` from the [Google Drive](https://drive.google.com/drive/folders/1JzqWIVPAHOvMeD0X1u3RefYBSj1PehZ0?usp=sharing) and save that in the `data/map/` directory.

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import openrouteservice
from shapely.geometry import Point, Polygon, MultiPolygon
import time
import os
from dotenv import load_dotenv
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Location of the environment file that holds the OpenRouteService API key
env_path = Path('..') / 'api.env'

# Load the .env file from the specified path
load_dotenv(dotenv_path=env_path)

# Access the API key from the environment variable
api_key = os.getenv("API_KEY")

# Fail here, with a message that names the file, instead of passing a missing
# key on to the routing client and failing later with a less specific error.
if not api_key:
    raise RuntimeError(
        f"API_KEY is not set: add it to {env_path} or export it as an environment variable."
    )

In [3]:
# Create the OpenRouteService (ORS) client with your API key.
# `client` is read as a global by the distance functions defined further down.
client = openrouteservice.Client(key=api_key)

### Load the rental history

This is done to only calculate the distances for the suburbs we have data on, so that we don't run into significant API limitations.

In [4]:
# Rental history data: one curated CSV per dwelling type, plus the combined file.
# The order of the paths must match the order of the names unpacked below.
rental_history_csvs = (
    '../../data/curated/rental_history/one_bed_flat.csv',
    '../../data/curated/rental_history/two_bed_flat.csv',
    '../../data/curated/rental_history/three_bed_flat.csv',
    '../../data/curated/rental_history/two_bed_house.csv',
    '../../data/curated/rental_history/three_bed_house.csv',
    '../../data/curated/rental_history/four_bed_house.csv',
    '../../data/curated/rental_history/all_properties.csv',
)

(
    one_bed_flat,
    two_bed_flat,
    three_bed_flat,
    two_bed_house,
    three_bed_house,
    four_bed_house,
    all_properties,
) = (pd.read_csv(csv_path) for csv_path in rental_history_csvs)

In [5]:
# Stack every rental-history table into a single frame with a fresh index
rental_frames = [
    one_bed_flat,
    two_bed_flat,
    three_bed_flat,
    two_bed_house,
    three_bed_house,
    four_bed_house,
    all_properties,
]
property_data = pd.concat(rental_frames, ignore_index=True)

# One row per distinct suburb name; these are the suburbs distances are computed for
unique_suburbs_df = pd.DataFrame({'suburb': property_data['suburb'].unique()})

### Load Public Transport & Suburb shapefiles

In [6]:
# Load both public transport shapefiles and stack them into one layer.
# Train and tram stops are selected from this layer by `fclass` in a later cell.
pt_shapefiles = (
    '../../data/map/Public Transport Victoria/gis_osm_transport_a_free_1.shp',
    '../../data/map/Public Transport Victoria/gis_osm_transport_free_1.shp',
)
pt_area_gdf, pt_line_gdf = (gpd.read_file(shp_path) for shp_path in pt_shapefiles)
pt_gdf = pd.concat([pt_area_gdf, pt_line_gdf], ignore_index=True)

In [7]:
# Victorian locality polygons, converted to EPSG:4326, with a lower-cased
# `suburb` column derived from GAZLOC to use as the join key
locality_shapefile = '../../data/map/VIC Localities Shape File/gda2020_vicgrid/esrishape/whole_of_dataset/victoria/VMADMIN/LOCALITY_POLYGON.shp'
vic_suburbs_gdf = gpd.read_file(locality_shapefile).to_crs(epsg=4326)
vic_suburbs_gdf['suburb'] = vic_suburbs_gdf['GAZLOC'].str.lower()

# Attach a polygon to each rental suburb (left join), then discard suburbs
# for which no locality geometry was found
suburb_shapes = vic_suburbs_gdf[['suburb', 'geometry']]
unique_suburbs_gdf = (
    unique_suburbs_df
    .merge(suburb_shapes, on='suburb', how='left')
    .dropna(subset=['geometry'])
)

# Keep only the public transport features that intersect a Victorian locality
pt_wgs84_gdf = pt_gdf.to_crs(epsg=4326)
filtered_pt_gdf = gpd.sjoin(pt_wgs84_gdf, vic_suburbs_gdf, how='inner', predicate='intersects')


In [8]:
def _stops_as_points(stops_gdf):
    """Return an independent copy of `stops_gdf` with one point-like row per stop.

    - The inner spatial join that produced `filtered_pt_gdf` emits one row per
      (stop, locality) match and keeps the stop's original index, so a stop
      that intersects two localities appears twice. Only the first row per
      index is kept, so the same stop cannot be counted more than once when
      picking the nearest stops.
    - Polygon / MultiPolygon geometries are replaced by their centroid; any
      other geometry is left unchanged.
    - Working on a copy avoids assigning into a slice of `filtered_pt_gdf`.
    """
    stops_gdf = stops_gdf[~stops_gdf.index.duplicated(keep='first')].copy()
    stops_gdf['geometry'] = stops_gdf['geometry'].apply(
        lambda geom: geom.centroid if isinstance(geom, (Polygon, MultiPolygon)) else geom
    )
    return stops_gdf


# Filter train stations and tram stops, converting area features to centroids
train_stops_gdf = _stops_as_points(
    filtered_pt_gdf[filtered_pt_gdf['fclass'].isin(['railway_station', 'railway_halt'])]
)
tram_stops_gdf = _stops_as_points(
    filtered_pt_gdf[filtered_pt_gdf['fclass'] == 'tram_stop']
)

In [9]:
# Label both stop layers as EPSG:4326 in place. `set_crs` only assigns the CRS
# (overriding any existing one); it does not transform the coordinates.
for stops_gdf in (train_stops_gdf, tram_stops_gdf):
    stops_gdf.set_crs(epsg=4326, inplace=True, allow_override=True)

# Display the tram stops as this cell's output
tram_stops_gdf

Unnamed: 0,osm_id,code,fclass,name,geometry,index_right,UFI,PFI,LOCALITY,GAZLOC,VICNAMESID,TASK_ID,PFI_CR,UFI_OLD,UFI_CR,LABEL_USE_,suburb
689,217625262,5603,tram_stop,Stop 124A: Casino/MCEC,POINT (144.95619 -37.82335),1921,743233293,205410296,SOUTHBANK,SOUTHBANK,102786,,2005-09-27,729544619,2023-04-17,5,southbank
690,217625264,5603,tram_stop,Stop 124A: Casino/MCEC,POINT (144.95609 -37.82337),1925,743233294,210768089,SOUTH WHARF,SOUTH WHARF,103517,,2008-03-17,729544621,2023-04-17,5,south wharf
3998,122817630,5603,tram_stop,Stop D1: Docklands Stadium,POINT (144.9464 -37.81461),2897,812633154,205410229,DOCKLANDS,DOCKLANDS,100986,,2005-09-27,743233291,2023-11-24,5,docklands
4189,252587671,5603,tram_stop,Stop 7D: AAMI Park,POINT (144.98679 -37.82411),2057,777758969,205410007,MELBOURNE,MELBOURNE,102000,,2005-09-27,743233290,2023-07-03,5,melbourne
4190,252588293,5603,tram_stop,Stop 7A: William Barak Bridge,POINT (144.97598 -37.81823),2057,777758969,205410007,MELBOURNE,MELBOURNE,102000,,2005-09-27,743233290,2023-07-03,5,melbourne
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70903,11234800056,5603,tram_stop,Stop 58: Box Hill Central,POINT (145.12208 -37.81783),2260,746966702,205410187,BOX HILL,BOX HILL,100468,,2005-09-27,468966733,2023-05-01,5,box hill
70904,11234800057,5603,tram_stop,Stop 58: Box Hill Central,POINT (145.12207 -37.8179),2260,746966702,205410187,BOX HILL,BOX HILL,100468,,2005-09-27,468966733,2023-05-01,5,box hill
77050,11852055220,5603,tram_stop,Stop 6: Melbourne Central and State Library St...,POINT (144.96427 -37.80945),2057,777758969,205410007,MELBOURNE,MELBOURNE,102000,,2005-09-27,743233290,2023-07-03,5,melbourne
77586,11903142479,5603,tram_stop,Stop 30: St Kilda Junction,POINT (144.98252 -37.85535),2057,777758969,205410007,MELBOURNE,MELBOURNE,102000,,2005-09-27,743233290,2023-07-03,5,melbourne


#### Function to calculate distance to public transport

In [10]:
# Function to calculate the average road distance to the nearest transport stops
def calculate_nearest_transports(suburb_geometry, transport_stops, num_stops=3, profile='driving-car'):
    """Average routed distance from a suburb's centroid to its nearest stops.

    Parameters
    ----------
    suburb_geometry : geometry exposing `.centroid`
        Shape of the suburb; its centroid is the origin of every route.
    transport_stops : GeoDataFrame
        Candidate stops. Their geometries must expose `.x` / `.y`, because the
        coordinates are sent to the routing API. The frame is not modified.
    num_stops : int, default 3
        Maximum number of nearest stops to average over.
    profile : str, default 'driving-car'
        Routing profile passed to the ORS matrix endpoint.

    Returns
    -------
    float or None
        Mean of the routed distances (requested with units='km') over the
        stops that could be routed to. None if there are no candidate stops,
        the API call fails, or no selected stop is routable.
    """
    # Suburb centroid to use as the reference point
    suburb_centroid = suburb_geometry.centroid

    # Rank candidates by straight-line distance on a copy of the input.
    # NOTE(review): this distance is in the units of the stops' CRS, so it is
    # only a pre-filter for choosing which stops to route to.
    nearby_stops = transport_stops.copy()
    nearby_stops['distance'] = nearby_stops.geometry.distance(suburb_centroid)
    nearest_stops = nearby_stops.nsmallest(num_stops, 'distance')

    if nearest_stops.empty:
        return None

    # Location 0 is the suburb centroid; locations 1..k are the selected stops
    locations = [(suburb_centroid.x, suburb_centroid.y)]
    locations.extend((stop_point.x, stop_point.y) for stop_point in nearest_stops.geometry)

    try:
        # One matrix request per suburb (centroid -> every selected stop)
        # instead of one request per stop, to stay within API quotas.
        matrix = client.distance_matrix(
            locations=locations,
            profile=profile,
            sources=[0],
            destinations=list(range(1, len(locations))),
            metrics=['distance'],
            units='km'
        )
    except openrouteservice.exceptions.ApiError as e:
        print(f"ORS API Error: {e}")
        return None

    # Throttle successive suburbs
    time.sleep(1.0)

    # Unroutable pairs come back as None; leave them out of the average
    road_distances = [dist for dist in matrix['distances'][0] if dist is not None]
    if not road_distances:
        return None

    # Average over the stops actually used, which may be fewer than `num_stops`
    return sum(road_distances) / len(road_distances)

#### Function to calculate the distance to CBD

In [11]:
# Function to calculate the routed distance from a suburb to the Melbourne CBD
def calculate_distance_to_cbd(suburb_geometry, cbd_point, profile='driving-car'):
    """Routed distance from the centroid of `suburb_geometry` to `cbd_point`.

    The distance is requested from the ORS matrix endpoint with units='km'
    using the given routing `profile`. Returns the distance from the first
    location (suburb centroid) to the second (CBD), or None after printing the
    error if the API call raises an ApiError.
    """
    origin = suburb_geometry.centroid
    route_endpoints = [(origin.x, origin.y), (cbd_point.x, cbd_point.y)]

    try:
        matrix = client.distance_matrix(
            locations=route_endpoints,
            profile=profile,
            metrics=['distance'],
            units='km'
        )
    except openrouteservice.exceptions.ApiError as e:
        print(f"ORS API Error: {e}")
        return None

    # Row 0 = from the suburb centroid, column 1 = to the CBD point
    return matrix['distances'][0][1]

In [12]:
# Melbourne CBD as a fixed point (Flinders Street Station), given as (longitude, latitude)
melbourne_cbd_point = Point(144.9671, -37.8183)

# Combine train and tram stops once, outside the per-suburb lambda.
# `DataFrame.append` was removed in pandas 2.0, so `pd.concat` is used instead.
transport_stops_gdf = pd.concat([train_stops_gdf, tram_stops_gdf])

# Calculate the average distance to the nearest public transport stops
unique_suburbs_gdf['nearest_transport_avg_distance'] = unique_suburbs_gdf['geometry'].apply(
    lambda geom: calculate_nearest_transports(geom, transport_stops_gdf, num_stops=3)
)

# Calculate the distance to Melbourne CBD
unique_suburbs_gdf['distance_to_cbd'] = unique_suburbs_gdf['geometry'].apply(
    lambda geom: calculate_distance_to_cbd(geom, melbourne_cbd_point)
)


In [13]:
# Drop the extra 'newtown' row treated as an outlier: any row named 'newtown'
# whose average distance to the nearest stops is greater than 5
is_newtown = unique_suburbs_gdf['suburb'] == 'newtown'
is_far_from_transport = unique_suburbs_gdf['nearest_transport_avg_distance'] > 5
unique_suburbs_gdf = unique_suburbs_gdf[~(is_newtown & is_far_from_transport)]

# Save the suburb name and the two distance columns, without the index
output_columns = ['suburb', 'nearest_transport_avg_distance', 'distance_to_cbd']
unique_suburbs_gdf[output_columns].to_csv('../../data/curated/suburb_transport_distances.csv', index=False)