Note:



In [38]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
from scipy.spatial import cKDTree
import numpy as np
import requests
import json
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
tqdm.pandas()  # "tqdm>=4.9.0"
ptv_path = "../data/raw/ptv_zones/ptv_zones.csv"

# property data

In [4]:
path = "../data/raw/property/rental_data_cleaned.csv"
property_df = pd.read_csv(path)

In [13]:
coords = property_df[['lat', 'lng']].drop_duplicates()

# City
according to wikipedia, the top 4 city in victoria is
1.	Melbourne	
2.	Geelong	
3.	Ballarat
4.	Bendigo

In [20]:
city = pd.DataFrame({'locations': ["-37.814819636878, 144.96639135042003",
                                   "-38.14928665137738, 144.35990084446854",
                                   "-37.56151637411765, 143.85484215682467",
                                   "-36.75960898491938, 144.2786689721445"],
                     'city': ["Melbourne", "Geelong", "Ballarat", "Bendigo"]})
city['lat'] = city['locations'].apply(lambda x: float(x.split(",")[0]))
city['lng'] = city['locations'].apply(lambda x: float(x.split(",")[1]))
city['Point'] = city.apply(lambda x: Point(x['lng'], x['lat']), axis=1)

In [42]:
num_to_find = 3
tree = cKDTree(city[['lng', 'lat']].values)
property_coords = coords[['lng', 'lat']].values
distances, indices = tree.query(property_coords, k=1)
coords["city_index"] = list(indices)

In [44]:
coords["city_index"].value_counts()

city_index
0    369522
1     30938
3     26071
2     20615
Name: count, dtype: int64

In [59]:
def get_time_proximity(coordinates):
    url = "http://localhost:8080/ors/v2/directions/driving-car"
    body = {"coordinates": coordinates}
    # print(body)
    response = requests.post(url, json=body)
    try:
        if response.status_code == 200:
            summary = response.json()["routes"][0]["summary"]
            if "duration" in summary:
                return summary["duration"]
            return 0
        else:
            if(response.json()["error"]["code"] == 2004):
                # distance is too long (> 100000.0 m)
                return -1
            if(response.json()["error"]["code"] == 2010):
                # "error":{"code":2010,"message":"Could not find routable point within a radius of 400.0 meters of specified coordinate
                return -1
            print(body)
            print(response.text)
            raise Exception("Error in request")
    except Exception as e:
        print(response.json())
        raise e

# # Example usage
# directions = get_time_proximity([[144.96332, -37.8140], [144.96332, -37.8120]])
# print(directions)

In [53]:
city.iloc[0]

locations           -37.814819636878, 144.96639135042003
city                                           Melbourne
lat                                            -37.81482
lng                                           144.966391
Point        POINT (144.96639135042003 -37.814819636878)
Name: 0, dtype: object

In [62]:
def get_time_proximity_from_property(row, cities):
    city = cities.iloc[int(row['city_index'])]
    coordinates = [[row['lng'], row['lat']], [city['lng'], city['lat']]]
    time = get_time_proximity(coordinates)
    # print(time)
    return time

In [58]:
coords

Unnamed: 0,lat,lng,city_index
0,-37.813730,144.955580,0
6,-37.810280,144.956670,0
12,-37.813590,144.955720,0
16,-37.813700,144.953930,0
18,-37.808826,144.959549,0
...,...,...,...
1531532,-37.896543,144.723878,0
1531537,-37.913624,144.760232,0
1531541,-37.915965,144.780830,0
1531543,-37.896490,144.741029,0


In [63]:
# dist time matching
coords["time_city"] = coords.progress_apply(lambda row: get_time_proximity_from_property(row, city), axis=1)
# property_df.to_csv(f"../data/raw/property/rent_{i}_ptv.csv", index=False)

100%|██████████| 447146/447146 [19:06<00:00, 390.17it/s]


In [66]:
coords.to_csv(f"../data/raw/time_city.csv", index=False)

# join to orginal dataset

## read all

In [68]:
cleaned_df = pd.read_csv("../data/raw/property/rental_data_cleaned.csv")

In [17]:
all_ptv['ptv_time'] = all_ptv['time_to_ptv_1']
all_ptv = all_ptv[['lat','lng','address','ptv_time']]
all

In [37]:
all_ptv = all_ptv.drop_duplicates()

In [73]:
city_coords = pd.read_csv(f"../data/raw/time_city.csv", index=False)
merged_df = cleaned_df.merge(city_coords[['lat', 'lng','time_city']], on=['lat','lng'], how='left')

In [85]:
gdf_city_coords[['geometry','time_city']]

Unnamed: 0,geometry,time_city
0,POINT (144.95558 -37.81373),263.3
1,POINT (144.95667 -37.81028),210.9
2,POINT (144.95572 -37.81359),232.5
3,POINT (144.95393 -37.8137),228.1
4,POINT (144.95955 -37.80883),225.9
...,...,...
447141,POINT (144.72388 -37.89654),1962.8
447142,POINT (144.76023 -37.91362),2271.4
447143,POINT (144.78083 -37.91597),2484.2
447144,POINT (144.74103 -37.89649),1963.3


In [82]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
city_coords = pd.read_csv(f"../data/raw/time_city.csv")
gdf_city_coords = gpd.GeoDataFrame(city_coords, geometry=gpd.points_from_xy(city_coords.lng, city_coords.lat))
gpd_cleaned_df = gpd.GeoDataFrame(cleaned_df, geometry=gpd.points_from_xy(cleaned_df.lng, cleaned_df.lat))

# Ensure both GeoDataFrames have the same CRS (Coordinate Reference System)
gdf_city_coords = gdf_city_coords.set_crs("EPSG:4326")
gpd_cleaned_df = gpd_cleaned_df.set_crs("EPSG:4326")

# Perform the nearest spatial join
joined_gdf = gpd.sjoin_nearest(gpd_cleaned_df,gdf_city_coords[['geometry','time_city']], how="left",rsuffix='city_coords')
joined_gdf.drop(columns=['index_city_coords'],inplace=True)





ValueError: 'index_right' cannot be a column name in the frames being joined




In [24]:
unmatched_rows = merged_df[merged_df['ptv_time'].isna()]

In [33]:
duplicated_rows = all_ptv[all_ptv.duplicated(subset=['lat', 'lng', 'address'], keep=False)]

In [36]:
all_ptv.drop_duplicates().shape

(1860717, 4)

In [41]:
merged_df.to_csv("../data/raw/property/rental_data_cleaned_ptv.csv", index=False)