# Summary of the Notebook

This notebook performs proximity analysis using a locally installed OpenRouteService (ORS) server with a map of Australia. The main steps include:

1. **Loading Data**:
    - Property data and city data are loaded from CSV files.

2. **Data Preparation**:
    - Coordinates are extracted and processed.
    - City information is prepared with latitude and longitude.

3. **Proximity Calculation**:
    - A KD-Tree is used to find the nearest city for each property.
    - Travel time proximity is calculated using the ORS server.

4. **Data Joining**:
    - The calculated proximity data is joined back to the original dataset.

5. **Analysis and Export**:
    - The results are analyzed and exported to CSV files.


In [None]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
from scipy.spatial import cKDTree
import numpy as np
import requests
import json
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
tqdm.pandas()  # "tqdm>=4.9.0"

# Load data

## property data

In [27]:
# Input CSV with the property records (presumably 2024 rental listings, judging by
# the file name — confirm). Later cells read its 'lat' and 'lng' columns.
property_data_path = "../data/raw/property/rental_data_2024.csv"
# Output CSV that receives the per-coordinate travel-time results.
city_output = "../data/raw/time_city_2024.csv"
property_df = pd.read_csv(property_data_path)


In [14]:
# Unique property coordinates: every distinct (lat, lng) pair is processed once.
# Rows with a missing coordinate are dropped up front because they can neither be
# matched to a nearest city nor routed. The index is reset so that row labels and
# row positions agree when columns are added to this table later.
coords = (
    property_df[['lat', 'lng']]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

## City
According to Wikipedia, the top four cities in Victoria are:
1.	Melbourne	
2.	Geelong	
3.	Ballarat
4.	Bendigo

In [15]:
# Reference cities: one "lat, lng" string and one display name per city.
city = pd.DataFrame({
    'locations': [
        "-37.814819636878, 144.96639135042003",
        "-38.14928665137738, 144.35990084446854",
        "-37.56151637411765, 143.85484215682467",
        "-36.75960898491938, 144.2786689721445",
    ],
    'city': ["Melbourne", "Geelong", "Ballarat", "Bendigo"],
})

# Parse each "lat, lng" string once, then derive the numeric and geometry columns.
city_lat_lng = [
    (float(location.split(",")[0]), float(location.split(",")[1]))
    for location in city['locations']
]
city['lat'] = [lat for lat, _ in city_lat_lng]
city['lng'] = [lng for _, lng in city_lat_lng]
# shapely points take (x, y), i.e. (lng, lat).
city['Point'] = [Point(lng, lat) for lat, lng in city_lat_lng]

# Proximity

In [16]:
num_to_find = 3  # not used by this cell (only the single nearest city is queried)

# Nearest reference city for every unique property coordinate.
#
# A degree of longitude covers less ground than a degree of latitude (by a factor
# of cos(latitude)). Longitudes are therefore rescaled before the KD-tree is built;
# querying raw (lng, lat) degrees overweights east-west separation and can pick
# the wrong city for a property lying between two cities.
lng_scale = np.cos(np.radians(city['lat'].mean()))
city_xy = np.column_stack([city['lng'].to_numpy() * lng_scale,
                           city['lat'].to_numpy()])

property_coords = coords[['lng', 'lat']].values
property_xy = property_coords * np.array([lng_scale, 1.0])

tree = cKDTree(city_xy)
# `distances` are in rescaled degrees and only rank the candidates;
# `indices` are row positions in `city`.
distances, indices = tree.query(property_xy, k=1)
coords["city_index"] = indices

In [None]:
# Number of unique property coordinates assigned to each city (keyed by row position in `city`).
coords["city_index"].value_counts()

In [18]:
from collections import defaultdict

# Tally of ORS error codes that were downgraded to a "no route" result (-1).
err_count = defaultdict(int)

# ORS error codes that mean "this pair cannot be routed" rather than "the request failed":
#   2004 - distance is too long (> 100000.0 m)
#   2010 - could not find routable point within a radius of 400.0 meters of specified coordinate
_NO_ROUTE_ERROR_CODES = (2004, 2010)


def get_time_proximity(coordinates,
                       url="http://localhost:8080/ors/v2/directions/driving-car",
                       timeout=60):
    """Ask the routing server for the driving duration along `coordinates`.

    Parameters
    ----------
    coordinates : list of [lng, lat] pairs
        Sent unchanged as the "coordinates" field of the request body.
    url : str, optional
        Directions endpoint; defaults to the local ORS driving-car endpoint.
    timeout : float, optional
        Seconds to wait for the server before `requests` gives up, so that a
        hung server cannot stall the whole run indefinitely.

    Returns
    -------
    The "duration" value of the first route's summary (presumably seconds —
    confirm against the ORS documentation), 0 when the summary carries no
    duration, or -1 when the server answers with error code 2004 or 2010
    (each such case is counted in `err_count`).

    Raises
    ------
    Exception
        "Error in request" for any other non-success answer, including answers
        whose body is not JSON. The raw response text and the request body are
        printed first.
    """
    body = {"coordinates": coordinates}
    response = requests.post(url, json=body, timeout=timeout)

    # Decode the body once. A non-JSON body (e.g. a proxy error page) must not
    # mask the real failure with a JSON decoding error.
    try:
        payload = response.json()
    except ValueError:
        payload = None

    if response.status_code == 200 and payload is not None:
        try:
            summary = payload["routes"][0]["summary"]
        except (KeyError, IndexError, TypeError):
            # Unexpected success payload: show it, then let the lookup error propagate.
            print(response.text)
            raise
        return summary["duration"] if "duration" in summary else 0

    if response.status_code != 200:
        error = payload.get("error") if isinstance(payload, dict) else None
        code = error.get("code") if isinstance(error, dict) else None
        if code in _NO_ROUTE_ERROR_CODES:
            err_count[code] += 1
            return -1

    print(response.text)
    print(body)
    raise Exception("Error in request")

# # Example usage
# directions = get_time_proximity([[144.96332, -37.8140], [144.96332, -37.8120]])
# print(directions)

In [20]:
def get_time_proximity_from_property(row, cities):
    """Travel time from one property to the city it was assigned to.

    `row` must provide 'lng', 'lat' and 'city_index'; 'city_index' is used as a
    row position into `cities`, whose rows must provide 'lng' and 'lat'.
    Returns whatever `get_time_proximity` returns for the [property, city] pair.
    """
    target = cities.iloc[int(row['city_index'])]
    origin = [row['lng'], row['lat']]
    destination = [target['lng'], target['lat']]
    return get_time_proximity([origin, destination])

In [None]:
# (rows, columns) of the unique-coordinate table; the row count is the number of
# routing requests the next cell will issue.
coords.shape

In [None]:
# Travel time from every unique property coordinate to its assigned city.
# Issues one routing request per row of `coords` (with a progress bar); a value
# of -1 marks pairs the routing server could not route (see `get_time_proximity`).
coords["time_city"] = coords.progress_apply(lambda row: get_time_proximity_from_property(row, city), axis=1)
# NOTE(review): the line below looks like a stale leftover — `i` is not defined in the cells shown.
# property_df.to_csv(f"../data/raw/property/rent_{i}_ptv.csv", index=False)

In [None]:
# How many requests were downgraded to -1, keyed by ORS error code.
err_count

In [25]:
# Persist the per-coordinate results (lat, lng, city_index, time_city) without the index column.
coords.to_csv(city_output, index=False)

# Join to original dataset

## read all

In [28]:
import sys, os
import pandas as pd
import geopandas as gpd
sys.path.append("../")
from scripts.proximity import proximity_hard_join, proximity_sjoin

In [33]:
# Reload the property table and the per-coordinate travel times from disk.
cleaned_df = pd.read_csv(property_data_path)
city_df = pd.read_csv(city_output)

In [None]:
# Inspect the reloaded property table.
cleaned_df

In [None]:
# Shape of the result of joining the travel times onto the property table.
# NOTE(review): proximity_sjoin comes from scripts/proximity.py, which is not shown
# here — confirm its join semantics (row count should presumably match cleaned_df).
proximity_sjoin(cleaned_df, city_df).shape

In [None]:
# Distribution of travel-time values; -1 is the value `get_time_proximity` returns
# for pairs the routing server could not route.
city_df['time_city'].value_counts()

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
# Travel times per unique coordinate, read from the file this notebook writes
# (`city_output`) rather than a separately named CSV, so the join cannot silently
# pick up a stale file.
city_coords = pd.read_csv(city_output)
gdf_city_coords = gpd.GeoDataFrame(
    city_coords,
    geometry=gpd.points_from_xy(city_coords.lng, city_coords.lat),
)
gpd_cleaned_df = gpd.GeoDataFrame(
    cleaned_df,
    geometry=gpd.points_from_xy(cleaned_df.lng, cleaned_df.lat),
)

# Ensure both GeoDataFrames have the same CRS (Coordinate Reference System)
gdf_city_coords = gdf_city_coords.set_crs("EPSG:4326")
gpd_cleaned_df = gpd_cleaned_df.set_crs("EPSG:4326")

# Perform the nearest spatial join: each property row receives the `time_city` of
# the closest coordinate in `city_coords`.
# NOTE(review): distances are measured in CRS units (degrees here); this is only
# safe if every property coordinate is itself present in `city_coords` — confirm.
joined_gdf = gpd.sjoin_nearest(
    gpd_cleaned_df,
    gdf_city_coords[['geometry', 'time_city']],
    how="left",
    rsuffix='city_coords',
)
# Drop the helper column holding the matched row's index from the right-hand table.
joined_gdf = joined_gdf.drop(columns=['index_city_coords'])


In [None]:
# Inspect the property GeoDataFrame that was fed into the join.
# NOTE(review): the join result lives in `joined_gdf` — confirm which table was meant to be shown.
gpd_cleaned_df


In [None]:
# Column dtypes of the property table.
cleaned_df.dtypes