This notebook performs proximity analysis using a locally hosted OpenRouteService (ORS) server and Australian map data. It loads datasets of rental properties and public transport (PTV) stops, finds the nearest PTV stops for each property, and queries the ORS server for the walking travel time to those stops. The results are then merged with the original rental dataset and saved for further analysis.

In [None]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
from scipy.spatial import cKDTree
import numpy as np
import requests
import json
from tqdm.auto import tqdm
tqdm.pandas()  # "tqdm>=4.9.0"
# Input file locations (relative paths).
# NOTE(review): school_path and ptv_path are not referenced in the visible
# cells of this notebook — confirm they are still needed.
school_path = "../data/raw/school_zones/school_zones.csv"
ptv_path = "../data/raw/ptv_zones/ptv_zones.csv"
# Rental listings read by the "Rental" loading cell.
property_path = "../data/curated/rental-17-24.csv"
# ORS directions endpoint (foot-walking profile) that every travel-time
# request is posted to.
ors_url = "http://localhost:8080/ors/v2/directions/foot-walking"
# Outputs: the rental data merged with PTV travel times, and the
# deduplicated (ptv_time, lat, lng) mapping derived from it.
out_rental_path = "../data/raw/property/rental_data_cleaned_ptv.csv"
out_mapping_path = "../data/raw/property/rental_data_cleaned_ptv_mapping.csv"

# Load Datasets

## Rental

In [27]:
# Read the rental listings. The handle is opened in a `with` block so it is
# closed as soon as pandas has consumed it; the original passed a bare
# open() result that was never closed.
with open(property_path, "r") as property_file:
    property_df = pd.read_csv(property_file)

In [10]:
def _lng_lat_to_point(row):
    """Build a shapely Point from a row's 'lng' (x) and 'lat' (y) values."""
    return Point(row['lng'], row['lat'])


# Row-wise conversion; the resulting Points are stored in a new 'geometry'
# column of property_df.
property_df['geometry'] = property_df.apply(_lng_lat_to_point, axis=1)

## PTV

In [29]:
# One stops file per PTV mode (regional/metropolitan train, tram, bus, coach).
paths = [
    "../data/raw/PTV/1 - Regional Train/stops_1_sa2.csv",
    "../data/raw/PTV/2 - Metropolitan Train/stops_2_sa2.csv",
    "../data/raw/PTV/3 - Metropolitan Tram/stops_3_sa2.csv",
    "../data/raw/PTV/4 - Metropolitan Bus/stops_4_sa2.csv",
    "../data/raw/PTV/5 - Regional Coach/stops_5_sa2.csv",
    "../data/raw/PTV/6 - Regional Bus/stops_6_sa2.csv",
]

# Gather the (stop_lon, stop_lat) array of every file, then stack them
# row-wise into a single coordinate array covering all modes.
total_ptv_coords = []
for path in paths:
    ptv = pd.read_csv(path)
    total_ptv_coords.append(ptv[['stop_lon', 'stop_lat']].values)
total_ptv_coords = np.concatenate(total_ptv_coords, axis=0)

# Proximity

In [31]:
# Number of nearest PTV stops to look up for each property.
num_to_find = 3

# KD-tree over all stop coordinates. Neighbours are ranked by straight-line
# distance in raw (lon, lat) space, not by walking distance.
tree = cKDTree(total_ptv_coords)
property_coords = property_df[['lng', 'lat']].values

# Use the declared parameter instead of a second hard-coded 3 so the two
# cannot drift apart.
distances, indices = tree.query(property_coords, k=num_to_find)
# With k == 1 the query drops the neighbour axis; restore it so every
# property always gets an iterable of stop indices.
indices = indices.reshape(-1, num_to_find)

# One array of stop indices (rows of total_ptv_coords) per property.
property_df["ptv_index"] = list(indices)

In [32]:
def get_time_proximity(coordinates):
    """Return the ORS route duration for the given coordinates.

    Posts ``coordinates`` to the directions endpoint at ``ors_url`` and reads
    the ``duration`` field of the first route's summary.

    Args:
        coordinates: List of ``[lng, lat]`` pairs, sent unchanged as the
            ``coordinates`` field of the request body.

    Returns:
        The ``duration`` value of the first route's summary (presumably
        seconds — confirm against the ORS API), ``0`` when the summary has no
        ``duration`` key, or ``-1`` when ORS answers with error code 2004
        (distance too long) or 2010 (no routable point near a coordinate).

    Raises:
        ValueError: If the reply body is not valid JSON.
        KeyError, IndexError, TypeError: If a 200 reply lacks the expected
            ``routes[0]["summary"]`` structure.
        Exception: For any other error reply.
    """
    body = {"coordinates": coordinates}
    response = requests.post(ors_url, json=body)

    # Decode the body exactly once. A non-JSON reply is reported with the
    # raw text; re-calling response.json() inside an error handler would
    # raise again and hide the original problem.
    try:
        payload = response.json()
    except ValueError:
        print(body)
        print(response.text)
        raise

    if response.status_code == 200:
        try:
            summary = payload["routes"][0]["summary"]
        except (KeyError, IndexError, TypeError):
            print(payload)
            raise
        # A summary without "duration" is treated as a zero-length trip.
        return summary.get("duration", 0)

    # Error replies: tolerate payloads whose "error" entry is missing or is
    # not a dict instead of failing with an unrelated KeyError/TypeError.
    error = payload.get("error") if isinstance(payload, dict) else None
    error_code = error.get("code") if isinstance(error, dict) else None
    # 2004: distance is too long (> 100000.0 m).
    # 2010: could not find a routable point within 400.0 m of a coordinate.
    if error_code in (2004, 2010):
        return -1

    print(body)
    print(response.text)
    raise Exception("Error in request")

# # Example usage
# directions = get_time_proximity([[144.96332, -37.8140], [144.96332, -37.8120]])
# print(directions)

In [49]:
def get_time_proximity_from_property(row, ptv_stop_coords,tree):
    """Return the shortest travel time from a property to its candidate stops.

    Calls ``get_time_proximity`` once per index in ``row['ptv_index']`` and
    returns the smallest duration among the stops that could be routed.

    Args:
        row: Mapping/Series with ``'lng'``, ``'lat'`` and ``'ptv_index'``
            (an iterable of row indices into ``ptv_stop_coords``).
        ptv_stop_coords: Array of stop coordinates; each selected row is
            converted with ``.tolist()`` and used as the destination.
        tree: Unused; kept so existing callers keep working.

    Returns:
        The minimum duration over all routable candidate stops, or ``-1``
        when there are no candidates or none of them is routable.
    """
    origin = [row['lng'], row['lat']]
    times = [
        get_time_proximity([origin, ptv_stop_coords[index].tolist()])
        for index in row['ptv_index']
    ]
    # -1 is the "not routable" sentinel, not a duration. It must be excluded
    # before taking the minimum: the previous running-min logic let a late
    # -1 override valid times and reset the minimum after an early one.
    reachable = [time for time in times if time != -1]
    return min(reachable) if reachable else -1

In [None]:
# Batch number of the listings file processed by this run of the cell.
# The join section further down reads the outputs of batches 1 to 5.
i=5

path = f"../data/raw/oldlistings_rent_{i}.json"
# Close the file as soon as pandas has parsed it (the original left the
# open() handle dangling). This replaces the property_df loaded earlier.
with open(path, "r") as listings_file:
    property_df = pd.read_json(listings_file)

# Indices of the nearest PTV stops for every property, using the shared
# num_to_find setting rather than a separate hard-coded 3.
property_coords = property_df[['lng', 'lat']].values
distances, indices = tree.query(property_coords, k=num_to_find)
# Keep one row of stop indices per property even when num_to_find == 1.
indices = indices.reshape(-1, num_to_find)
property_df["ptv_index"] = list(indices)

# Ask ORS for the travel time to each candidate stop and keep the best one;
# this issues one request per candidate stop per property.
property_df["time_to_ptv_1"] = property_df.progress_apply(lambda row: get_time_proximity_from_property(row, total_ptv_coords, tree), axis=1)
property_df.to_csv(f"../data/raw/property/rent_{i}_ptv.csv", index=False)

# Join to original dataset

## read all

In [3]:
# Cleaned rental dataset that the PTV travel times are merged onto below.
cleaned_df = pd.read_csv("../data/raw/property/rental_data_cleaned.csv")

In [None]:
# Inspection only: distinct values of the 'date' column in sorted order
# (the result is displayed, not stored).
cleaned_df['date'].drop_duplicates().sort_values()

In [5]:
# read all the ptv data
all_ptv = pd.concat([pd.read_csv(f"../data/raw/property/rent_{i}_ptv.csv") for i in range(1,6)])

In [None]:
# Inspection only: display the cleaned rental dataset.
cleaned_df

In [17]:
# Expose the computed travel time under the name used after the merge.
all_ptv['ptv_time'] = all_ptv['time_to_ptv_1']
# Keep only the merge keys (lat, lng, address) and the travel time.
all_ptv = all_ptv[['lat','lng','address','ptv_time']]
# Display the trimmed frame. The original line was a bare `all`, which only
# showed Python's builtin function rather than the data.
all_ptv

In [37]:
# Drop rows that are identical in every column (lat, lng, address, ptv_time).
# NOTE(review): rows sharing lat/lng/address but differing in ptv_time are
# kept, and each such row would multiply matches in the left merge on those
# three keys — confirm none exist (see the duplicated_rows check below).
all_ptv = all_ptv.drop_duplicates()

In [38]:
# Left join: every cleaned_df row is kept, and ptv_time is attached where
# lat, lng and address all match a row of all_ptv (NaN otherwise).
merged_df = pd.merge(
    cleaned_df,
    all_ptv,
    how='left',
    on=['lat', 'lng', 'address'],
)

In [None]:
# Inspection only: (rows, columns) of the merged frame. A row count above
# cleaned_df.shape[0] would mean the left merge matched some keys more than once.
merged_df.shape

In [None]:
# Inspection only: (rows, columns) of the pre-merge frame, for comparison
# with merged_df.shape.
cleaned_df.shape

In [24]:
# Rows of the merged frame whose ptv_time is missing (NaN).
missing_ptv_time = merged_df['ptv_time'].isna()
unmatched_rows = merged_df.loc[missing_ptv_time]

In [33]:
# Every all_ptv row whose (lat, lng, address) key occurs more than once;
# keep=False flags all members of each repeated group, not just the extras.
merge_keys = ['lat', 'lng', 'address']
has_repeated_key = all_ptv.duplicated(subset=merge_keys, keep=False)
duplicated_rows = all_ptv[has_repeated_key]

In [None]:
# Inspection only: shape of all_ptv after dropping fully identical rows
# (the result is not assigned, so all_ptv itself is unchanged).
all_ptv.drop_duplicates().shape

In [41]:
# Write the merged rental + ptv_time table to out_rental_path, without the
# DataFrame index column.
merged_df.to_csv(out_rental_path, index=False)

# Store data in the form of a mapping

In [2]:
# Reload the merged table from out_rental_path.
data = pd.read_csv(out_rental_path)

In [6]:
# Reduce the merged table to the distinct (ptv_time, lat, lng) combinations
# and write them to out_mapping_path without the index column.
ptv_time_mapping = data[['ptv_time', 'lat', 'lng']].drop_duplicates()
ptv_time_mapping.to_csv(out_mapping_path, index=False)