**PROCESSING PROXIMITY**
-----------

In this notebook, we will calculate the proximity of some features that may affect housing prices.

Features include proximity to:
- CBD
- Nearest Train Station
- Nearest Shopping Precinct
- Nearest Park (or reserve, national parks etc.)
- Nearest Hospital
- Nearest Primary/Secondary School
  - And the type of school

All distances calculated are in km.

Due to the large amount of data that is processed, we seperately save the features throughout this notebook as we add route distance through iterating and saving every 100 rows. If it runs fine, just save the bottom one as csv. If you had to restart the notebook, then load all the dfs to combine them together.

In [None]:
import pandas as pd
import geopandas as gpd
import googlemaps
from geopy.geocoders import Nominatim
import time
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import sys
sys.path.append('../')
import scripts
import importlib
importlib.reload(scripts)
from scripts.preprocess_proximity import *

# Initialize the Google Maps API client with  API key
google_apikey = 'your_key'
gmaps = googlemaps.Client(key = google_apikey)

# Directories
data_dir = '../data/'
landing_dir = data_dir + 'landing/'
raw_dir = data_dir + 'raw/'
curated_dir = data_dir + 'curated/'


**Adds coordinates for each rental property to the rental scrape.**

In [14]:
# Adding coordinates for each address
rental_df = pd.read_csv(f"{landing_dir}rental_scrape.csv")

# Step 1: Initialize Nominatim geocoder with retry logic
geolocator = Nominatim(user_agent="rental_geocoder", timeout=10)

# Create a session with retry settings to handle temporary errors
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Step 2: Function to geocode addresses with retries
def geocode_address(address):
    try:
        location = geolocator.geocode(address, timeout=10)
        if location:
            return (location.latitude, location.longitude)
        else:
            return (None, None)
    except Exception as e:
        print(f"Error geocoding {address}: {e}")
        return (None, None)

# Step 3: Apply geocoding function to the 'Address' column with retry logic
rental_coord = rental_df['Address'].apply(lambda address: geocode_address(address))

# Step 4: Split coordinates into 'latitude' and 'longitude' columns
rental_df['latitude'] = rental_coord.apply(lambda x: x[0])
rental_df['longitude'] = rental_coord.apply(lambda x: x[1])

# Step 5: Filter out rows where latitude or longitude is NaN
cleaned_rental_df = rental_df.dropna(subset=['latitude', 'longitude']).copy()
cleaned_rental_df = cleaned_rental_df.reset_index(drop=True)
cleaned_rental_df = cleaned_rental_df.drop(columns= ["URL", "Name"])

# Step 6: Save the DataFrame with geocoded coordinates to a new CSV
cleaned_rental_df.to_csv(f'{raw_dir}rental_with_coordinates.csv', index=False)

# Add a small delay between requests to avoid being blocked by Nominatim API
time.sleep(1)

**Adding proximity of the rental property to CDB**

In [None]:
cleaned_rental_df = pd.read_csv(f"{raw_dir}rental_with_coordinates.csv")
cbd_feature = "CBD"

# Coordinates for Melbourne CBD (latitude, longitude for Google Maps)
melbourne_cbd_coords = [-37.8136, 144.9631]
feature_data = melbourne_cbd_coords

# Get straight line distance between Melbourne CBD and the rental
cbd_haversine_df = rental_haversine_closest(cleaned_rental_df, melbourne_cbd_coords, cbd_feature)
# Save incrementally in a file, just in case. Combine later. 
cbd_rental_df = route_dist_and_save_csv(cbd_haversine_df, cbd_feature, curated_dir, gmaps, melbourne_cbd_coords)

**Adding proximity of the rental property to their closest train station**

In [None]:
ptv_sf = gpd.read_file(f"{landing_dir}PTV/VIC_RAILWAY_STATIONS.shp")
train_station_feature = "train_station"

# Processing the shapefile for information we need
train_station_sf = feat_sf(ptv_sf, train_station_feature)
# Get closest station, and the straight line distance between Melbourne CBD and the rental
train_station_haversine_df = rental_haversine_closest(cleaned_rental_df, train_station_sf, train_station_feature)
train_station_rental_df = route_dist_and_save_csv(train_station_haversine_df, train_station_feature, curated_dir, gmaps)


# route_dist_and_save_csv(rental_df, feature_name, save_to_dir, gmaps_client, single_dest_coord = None)

**Adding proximity of the rental property to their closest shopping precinct, park/reserve and hospital**

In [None]:
foi_sf = gpd.read_file(f"{landing_dir}FOI/GEOMARK_POLYGON.shp")
cleaned_rental_df = pd.read_csv(f"{raw_dir}rental_with_coordinates.csv")
cleaned_rental_df = cleaned_rental_df.dropna(subset=['latitude', 'longitude']).copy()

shopping_type = "commercial facility"
shopping_feature = "shopping"
shopping_labels = ["shopping precinct", "shopping centre"]

shopping_sf = feat_sf(foi_sf, shopping_feature, shopping_type, shopping_labels)
shopping_haversine_df = rental_haversine_closest(cleaned_rental_df, shopping_sf, shopping_feature)
shopping_rental_df = route_dist_and_save_csv(shopping_haversine_df, shopping_feature, curated_dir, gmaps)

parks_type = "reserve"
parks_feature = "parks"
parks_labels = ["park", "conservation park", "gardens", "national park", "city square"]

parks_sf = feat_sf(foi_sf, parks_feature, parks_type, parks_labels)
parks_haversine_df = rental_haversine_closest(cleaned_rental_df, parks_sf, parks_feature)
parks_rental_df = route_dist_and_save_csv(parks_haversine_df, parks_feature, curated_dir, gmaps)

hospital_type = "hospital"
hospital_feature = "hospital"
hospital_labels = ["hospital complex"]

hospital_sf = feat_sf(foi_sf, hospital_feature, hospital_type, hospital_labels)
hospital_haversine_df = rental_haversine_closest(cleaned_rental_df, hospital_sf, hospital_feature)
hospital_rental_df = route_dist_and_save_csv(hospital_haversine_df, hospital_feature, curated_dir, gmaps)

**Adding proximity of the rental property to their closest primary and secondary school**

In [26]:
school_df = pd.read_csv(f"{landing_dir}dv346-schoollocations2023.csv", encoding = "ISO-8859-1")

primary_school_type = ["Primary", "Pri/Sec"]
primary_school_feautre = "primary_school"

primary_school_df = feat_sf(school_df, primary_school_feautre, primary_school_type)
primary_school_haversine_df = rental_haversine_closest(cleaned_rental_df, primary_school_df, primary_school_feautre)
primary_school_rental_df = route_dist_and_save_csv(primary_school_haversine_df, primary_school_feautre, curated_dir, gmaps)

secondary_school_type = ["secondary", "Pri/Sec"]
secondary_school_feautre = "secondary_school"

secondary_school_df = feat_sf(school_df, secondary_school_feautre, secondary_school_type)
secondary_school_df = feat_sf(school_df, secondary_school_feautre, secondary_school_type)
secondary_school_haversine_df = rental_haversine_closest(cleaned_rental_df, secondary_school_df, secondary_school_feautre)
secondary_school_rental_df = route_dist_and_save_csv(secondary_school_haversine_df, secondary_school_feautre, curated_dir, gmaps)


**Combining and saving all features and their distances into one single dataframe**

In [30]:
"""
If restart notebook
"""
# Redownloading dfs just in case
# cleaned_rental_df = pd.read_csv(f"{raw_dir}rental_with_coordinates.csv")
# cbd_rental_df = pd.read_csv(f"{curated_dir}rental_with_CBD.csv")
# train_station_rental_df = pd.read_csv(f"{curated_dir}rental_with_train_station.csv")
# shopping_rental_df = pd.read_csv(f"{curated_dir}rental_with_shopping.csv")
# parks_rental_df = pd.read_csv(f"{curated_dir}rental_with_parks.csv")
# hospital_rental_df = pd.read_csv(f"{curated_dir}rental_with_hospital.csv")
# primary_school_rental_df = pd.read_csv(f"{curated_dir}rental_with_primary_school.csv")
# secondary_school_rental_df = pd.read_csv(f"{curated_dir}rental_with_secondary_school.csv")

# columns_to_remove = cleaned_rental_df.columns

# # Dropping common columns
# train_station_rental_df = train_station_rental_df.drop(columns=columns_to_remove)
# shopping_rental_df = shopping_rental_df.drop(columns=columns_to_remove)
# parks_rental_df = parks_rental_df.drop(columns=columns_to_remove)
# hospital_rental_df = hospital_rental_df.drop(columns=columns_to_remove)
# primary_school_rental_df = primary_school_rental_df.drop(columns=columns_to_remove)
# secondary_school_rental_df = secondary_school_rental_df.drop(columns=columns_to_remove)

# Combining all columns
# combined_df = pd.concat([cbd_rental_df, train_station_rental_df, shopping_rental_df, parks_rental_df, 
#                          hospital_rental_df, primary_school_rental_df, secondary_school_rental_df])

# Save as csv
# combined_df.to_csv(f"{curated_dir}rental_with_all_features.csv", index=False)

"""
If no need to restart notebook
"""
# Save as csv
cleaned_rental_df.to_csv(f"{curated_dir}rental_with_all_features.csv", index=False)