**PROCESSING PROXIMITY**
-----------

In this notebook, we calculate the proximity of each rental property to features that may affect housing prices.

Features include proximity to:
- CBD
- Nearest Train Station
- Nearest Shopping Precinct
- Nearest Park (or reserve, national parks etc.)
- Nearest Primary/Secondary School
  - And the type of school

All distances calculated are in km.

Due to the large amount of data being processed, each feature is saved separately throughout this notebook: route distances are added by iterating over the rows, and progress is saved every 100 rows. If the notebook runs without interruption, just save the final dataframe (bottom cell) as a CSV. If you had to restart the notebook, load all the saved dataframes and combine them.

In [25]:
import os
import time

import geopandas as gpd
import googlemaps
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from sklearn.metrics.pairwise import haversine_distances

def feat_sf(shapefile, feature_name, feat_type = None, feat_subtypes = None):
    """
    Filter a feature dataset down to the rows we care about and expose a
    uniform set of columns ('NAME', 'latitude', 'longitude') for later steps.

    Args:
        shapefile (gpd.GeoDataFrame or pd.DataFrame): the file with information on neighbourhood features.
            The input frame itself is never modified.
        feature_name (str): one of "shopping", "parks", "train_station",
            "primary_school" or "secondary_school"
        feat_type (str or list, optional): for "shopping"/"parks", the single 'FTYPE' value to keep;
            for schools, the 'School_Type' value(s) to keep (a bare string is treated as a
            one-element list). Defaults to None.
        feat_subtypes (list, optional): 'FEATSUBTYP' values to keep for "shopping"/"parks".
            Defaults to None.

    Raises:
        ValueError: feature_name is not recognised, or an argument required
            for that feature_name was not supplied

    Returns:
        gpd.GeoDataFrame or pd.DataFrame: the filtered data with a fresh 0..n-1 index
    """

    if feature_name in ("shopping", "parks"):
        if feat_type is None or feat_subtypes is None:
            raise ValueError(
                f"feat_type and feat_subtypes are required when feature_name is '{feature_name}'."
            )
        # We only want features in VIC, of the requested type and subtypes
        keep_rows = (
            (shapefile['STATE'] == "VIC")
            & (shapefile['FTYPE'] == feat_type)
            & shapefile['FEATSUBTYP'].isin(feat_subtypes)
        )
        # Explicit copy: columns are assigned below, and that must not happen
        # on a slice of the caller's frame
        filtered_sf = shapefile[keep_rows].copy()

    elif feature_name == "train_station":
        # rename() returns a new frame, so the assignments below are safe
        filtered_sf = shapefile[shapefile['STATUS'] == "Active"].rename(columns={'STATION': 'NAME'})

    elif feature_name in ("primary_school", "secondary_school"):
        if feat_type is None:
            raise ValueError(
                f"feat_type is required when feature_name is '{feature_name}'."
            )
        # isin() needs a list-like; accept a single school type given as a string
        school_types = [feat_type] if isinstance(feat_type, str) else feat_type

        filtered_sf = shapefile[shapefile['School_Type'].isin(school_types)]
        # Renaming columns for ease of use for future functions
        filtered_sf = filtered_sf.rename(
            columns={'School_Name': 'NAME', 'Y': 'latitude', 'X': 'longitude'}
        )

        # The school data is a plain dataframe that already carries coordinates,
        # so there are no polygons to convert
        return filtered_sf.reset_index(drop=True)

    else:
        # Handle cases where feature_name does not match any known types
        raise ValueError("Invalid feature_name provided.")

    # Setting shapefile format (longitude/latitude on the WGS84 datum)
    filtered_sf['geometry'] = filtered_sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

    # Represent each geometry by its centroid, stored as (latitude, longitude)
    filtered_sf['centroid'] = filtered_sf['geometry'].centroid.apply(lambda geom: (geom.y, geom.x))
    filtered_sf['latitude'] = filtered_sf['centroid'].apply(lambda coord: coord[0])
    filtered_sf['longitude'] = filtered_sf['centroid'].apply(lambda coord: coord[1])

    return filtered_sf.reset_index(drop=True)

def coord_radian_array(input_data):
    """
    Creates a numpy array of radians to use for Haversine distance calculations

    Args:
        input_data (pd.DataFrame, gpd.GeoDataFrame, list or tuple): either a frame with
            'latitude' and 'longitude' columns (in degrees), or a single
            [latitude, longitude] coordinate (in degrees)

    Raises:
        ValueError: wrong type of input, missing columns, or not a single coordinate

    Returns:
        np.array: array of radians with one [latitude, longitude] row per point;
                  a single coordinate yields shape (1, 2)
    """
    # isinstance (not an exact type comparison) so that any DataFrame subclass,
    # GeoDataFrame included, is accepted
    if isinstance(input_data, pd.DataFrame) and \
       ("latitude" in input_data.columns and "longitude" in input_data.columns):
        # Convert the latitude and longitude to separate numpy arrays
        latitudes = np.radians(input_data['latitude'].to_numpy())
        longitudes = np.radians(input_data['longitude'].to_numpy())

        # Combine latitudes and longitudes into a 2D array of radians
        return np.column_stack((latitudes, longitudes))

    if isinstance(input_data, (list, tuple)) and len(input_data) == 2:
        # Turn the one coordinate into a 2D array, and turn it into a radian value
        return np.radians(np.array([input_data]))

    # Error if unexpected type of data or does not have the appropriate columns.
    # Adjacent literals are used so no indentation whitespace ends up in the message.
    raise ValueError(
        "Input must be a DataFrame with 'latitude' and 'longitude' columns or a "
        "list/tuple of coordinates [latitude, longitude]."
    )

def rental_haversine_closest(rental_df, feature_data, feature_name):
    """
    Calculate the straight-line (Haversine) distance in km from every rental to a feature.
    For features with multiple coordinates, picks the closest one and records its details.

    Note: the new columns are written onto ``rental_df`` itself, and that same
    object is returned.

    Args:
        rental_df (pd.DataFrame): Rental data with 'latitude' and 'longitude' columns
        feature_data (pd.DataFrame, gpd.GeoDataFrame, list or tuple): either a frame of candidate
            features with 'NAME', 'latitude' and 'longitude' columns (plus 'Education_Sector'
            for school features), or a single [latitude, longitude] coordinate
        feature_name (str): Name of feature, used to build the new column names

    Raises:
        ValueError: propagated from coord_radian_array when an input is not a supported shape/type

    Returns:
        pd.DataFrame: rental dataframe with info on distance and additional data
    """
    # Radius used to turn the unit-sphere Haversine result into kilometres
    earth_radius_km = 6371

    # Turning info into radian arrays
    feat_radians = coord_radian_array(feature_data)
    rental_radians = coord_radian_array(rental_df)

    # Used Haversine distance as the earth's curve may affect distance.
    # Rows correspond to features, columns to rentals.
    distances_km = haversine_distances(feat_radians, rental_radians) * earth_radius_km

    if isinstance(feature_data, pd.DataFrame):
        # Position (not label) of the nearest feature for each rental
        nearest_point_id = np.argmin(distances_km, axis=0)
        # Distance between each rental and its nearest feature
        nearest_distance = np.min(distances_km, axis=0)

        # argmin yields positions, so select rows positionally; this stays correct
        # even if feature_data does not carry a 0..n-1 index
        nearest_features = feature_data.iloc[nearest_point_id]

        # Add the name, latitude, longitude and distance of the closest feature from the rental
        rental_df[f'nearest_{feature_name}_name'] = nearest_features['NAME'].values
        rental_df[f'nearest_{feature_name}_name'] = rental_df[f'nearest_{feature_name}_name'].str.title()
        # If feature is schools, add school type
        if feature_name in ("primary_school", "secondary_school"):
            rental_df[f'nearest_{feature_name}_type'] = nearest_features['Education_Sector'].values
        rental_df[f'nearest_{feature_name}_latitude'] = nearest_features['latitude'].values
        rental_df[f'nearest_{feature_name}_longitude'] = nearest_features['longitude'].values
    else:
        # Single coordinate (coord_radian_array has already validated it):
        # the one feature row holds the distance to every rental
        nearest_distance = distances_km[0, :]

    # Add distance
    rental_df[f'straight_line_distance_{feature_name}'] = nearest_distance
    return rental_df

# Driving-route distance lookup through the Google Maps client
def calculate_route_distance(property_coords, destination_coords, gmaps_client):
    """
    Ask the supplied maps client for the driving distance between two points.

    Args:
        property_coords (tuple): (latitude, longitude) of the rental property
        destination_coords (tuple): (latitude, longitude) of the feature/destination
        gmaps_client (API): object exposing ``distance_matrix(origins=, destinations=, mode=)``

    Returns:
        float or None: the reported distance value divided by 1000 (metres to km),
        or None when the element status is not 'OK' or any exception is raised
        (a message is printed in both of those cases)
    """
    try:
        response = gmaps_client.distance_matrix(
            origins=[property_coords],
            destinations=[destination_coords],
            mode="driving",
        )

        # One origin and one destination were sent, so only the first element is read
        element = response['rows'][0]['elements'][0]
        status = element['status']

        if status != 'OK':
            print(f"No valid route distance found for {property_coords} to {destination_coords}: {status}")
            return None

        metres = element['distance']['value']
        return metres / 1000
    except Exception as e:
        # Best effort: report the failure and let the caller carry on with the next row
        print(f"Error calculating route distance for {property_coords}: {e}")
        return None

def route_dist_and_save_csv(rental_df, feature_name, single_dest_coord = None):
    """
    Iterate through rows of a rental_df, then apply route (driving distance) calculation from
    each rental property to each feature in km.

    Progress is written to ``{curated_dir}rental_with_{feature_name}.csv`` after every
    100 processed rows and once more at the end. The route distance column is added to
    ``rental_df`` itself, and that same object is returned. Relies on the notebook-level
    ``gmaps`` client and ``curated_dir`` path.

    Args:
        rental_df (pd.dataframe): cleaned rental df with 'latitude'/'longitude' columns and, unless
            single_dest_coord is given, the 'nearest_{feature_name}_latitude'/'_longitude' columns
        feature_name (str): name of feature, used for the column name and the CSV file name
        single_dest_coord (list or tuple, optional): one (latitude, longitude) destination shared by
            every rental (e.g. the CBD). When None, each row's own nearest-feature coordinates are
            used. Defaults to None.

    Returns:
        pd.dataframe: final rental df with route distance calculated
    """
    route_column = f'route_distance_{feature_name}'
    output_path = f"{curated_dir}rental_with_{feature_name}.csv"

    # Create null column
    rental_df[route_column] = np.nan

    # A shared destination is the same for every row, so build it once.
    # `is not None` (rather than `!= None`) stays a plain boolean for array-like input.
    shared_dest = tuple(single_dest_coord) if single_dest_coord is not None else None

    # Count rows explicitly so the save cadence does not depend on the index labels
    for rows_done, (index, rental) in enumerate(rental_df.iterrows(), start=1):
        property_coord = (rental['latitude'], rental['longitude'])
        if shared_dest is not None:
            feat_coord = shared_dest
        else:
            feat_coord = (rental[f'nearest_{feature_name}_latitude'], rental[f'nearest_{feature_name}_longitude'])

        # Calculate route distance (None is stored when the lookup fails)
        route_distance = calculate_route_distance(property_coord, feat_coord, gmaps)
        rental_df.at[index, route_column] = route_distance

        if rows_done % 100 == 0:
            print(f"Processed {rows_done} rows, saving progress...")
            rental_df.to_csv(output_path, index=False)

    # Final save after processing all data
    rental_df.to_csv(output_path, index=False)
    return rental_df


# Initialize the Google Maps API client.
# The key is read from the GOOGLE_MAPS_API_KEY environment variable so that a real key
# never has to be written into the notebook; the placeholder is only used when the
# variable is not set.
google_apikey = os.environ.get('GOOGLE_MAPS_API_KEY', 'your key')
gmaps = googlemaps.Client(key = google_apikey)

# Directories (relative to the notebook's working directory)
data_dir = '../data/'
landing_dir = data_dir + 'landing/'
raw_dir = data_dir + 'raw/'
curated_dir = data_dir + 'curated/'

# Show every column when displaying wide dataframes
pd.set_option('display.max_columns', None)

**Adds coordinates for each rental property to the rental scrape.**

In [14]:
# Adding coordinates for each address
rental_df = pd.read_csv(f"{landing_dir}rental_scrape.csv")

# Step 1: Initialize Nominatim geocoder
geolocator = Nominatim(user_agent="rental_geocoder", timeout=10)

# Create a session with retry settings to handle temporary errors
# NOTE(review): this session is never handed to the geolocator in this cell, so as
# written these retry settings do not apply to the geocoding calls below - confirm intent.
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Step 2: Function to geocode a single address
def geocode_address(address):
    """
    Look up one address with the Nominatim geolocator.

    Args:
        address (str): address text to geocode

    Returns:
        tuple: (latitude, longitude), or (None, None) when nothing was found
               or the lookup raised an error (the error is printed)
    """
    try:
        location = geolocator.geocode(address, timeout=10)
        if location:
            return (location.latitude, location.longitude)
        return (None, None)
    except Exception as e:
        print(f"Error geocoding {address}: {e}")
        return (None, None)
    finally:
        # Pause after every request (successful or not) to avoid being blocked by the
        # Nominatim API; a single sleep after the whole loop would not throttle anything
        time.sleep(1)

# Step 3: Apply geocoding function to the 'Address' column
rental_coord = rental_df['Address'].apply(geocode_address)

# Step 4: Split coordinates into 'latitude' and 'longitude' columns
rental_df['latitude'] = rental_coord.apply(lambda x: x[0])
rental_df['longitude'] = rental_coord.apply(lambda x: x[1])

# Step 5: Filter out rows where latitude or longitude is NaN
cleaned_rental_df = rental_df.dropna(subset=['latitude', 'longitude']).copy()
cleaned_rental_df = cleaned_rental_df.reset_index(drop=True)
cleaned_rental_df = cleaned_rental_df.drop(columns= ["URL", "Name"])

# Step 6: Save the DataFrame with geocoded coordinates to a new CSV
cleaned_rental_df.to_csv(f'{raw_dir}rental_with_coordinates.csv', index=False)

**Adding proximity of the rental property to the CBD**

In [None]:
# Load the geocoded rentals from the CSV written by the geocoding cell, so this
# cell can run without repeating the geocoding
cleaned_rental_df = pd.read_csv(f"{raw_dir}rental_with_coordinates.csv")
cbd_feature = "CBD"

# Coordinates for Melbourne CBD (latitude, longitude for Google Maps)
melbourne_cbd_coords = [-37.8136, 144.9631]
# NOTE(review): feature_data is not read again anywhere in this cell - confirm whether it is still needed
feature_data = melbourne_cbd_coords

# Get straight line distance between Melbourne CBD and the rental.
# rental_haversine_closest adds its column to the frame it is given and returns
# that same frame, so cbd_haversine_df and cleaned_rental_df are the same object.
cbd_haversine_df = rental_haversine_closest(cleaned_rental_df, melbourne_cbd_coords, cbd_feature)
# Add the driving distance to the CBD; progress is saved to
# {curated_dir}rental_with_CBD.csv along the way, just in case. Combine later.
cbd_rental_df = route_dist_and_save_csv(cbd_haversine_df, cbd_feature, melbourne_cbd_coords)

**Adding proximity of the rental property to their closest train station**

In [None]:
# Railway station shapefile from the landing directory
ptv_sf = gpd.read_file(f"{landing_dir}PTV/VIC_RAILWAY_STATIONS.shp")
train_station_feature = "train_station"

# Processing the shapefile for information we need (active stations only, with
# centroid latitude/longitude columns added by feat_sf)
train_station_sf = feat_sf(ptv_sf, train_station_feature)
# Get the closest station to each rental, and the straight line distance between that station and the rental
train_station_haversine_df = rental_haversine_closest(cleaned_rental_df, train_station_sf, train_station_feature)
# Add the driving distance to that station, saving progress to CSV along the way
train_station_rental_df = route_dist_and_save_csv(train_station_haversine_df, train_station_feature)

**Adding proximity of the rental property to their closest shopping precinct and park**

In [None]:
# One shapefile is the source for both the shopping and the park features;
# feat_sf filters it by 'FTYPE' and 'FEATSUBTYP' for each feature
shopping_parks_foi_sf = gpd.read_file(f"{landing_dir}FOI/GEOMARK_POLYGON.shp")

# Shopping: 'FTYPE' to keep, column/file label, and accepted 'FEATSUBTYP' values
shopping_type = "commercial facility"
shopping_feature = "shopping"
shopping_labels = ["shopping precinct", "shopping centre"]

# Filter, find the closest shopping feature per rental, then add the driving distance
shopping_sf = feat_sf(shopping_parks_foi_sf, shopping_feature, shopping_type, shopping_labels)
shopping_haversine_df = rental_haversine_closest(cleaned_rental_df, shopping_sf, shopping_feature)
shopping_rental_df = route_dist_and_save_csv(shopping_haversine_df, shopping_feature)

# Parks: 'FTYPE' to keep, column/file label, and accepted 'FEATSUBTYP' values
parks_type = "reserve"
parks_feature = "parks"
parks_labels = ["park", "conservation park", "gardens", "national park", "city square"]

# Same three steps for parks
parks_sf = feat_sf(shopping_parks_foi_sf, parks_feature, parks_type, parks_labels)
parks_haversine_df = rental_haversine_closest(cleaned_rental_df, parks_sf, parks_feature)
parks_rental_df = route_dist_and_save_csv(parks_haversine_df, parks_feature)


**Adding proximity of the rental property to their closest primary and secondary school**

In [26]:
# School locations (plain CSV that already carries X/Y coordinates)
school_df = pd.read_csv(f"{landing_dir}dv346-schoollocations2023.csv", encoding = "ISO-8859-1")

# 'School_Type' values that count as offering primary education
primary_school_type = ["Primary", "Pri/Sec"]
primary_school_feautre = "primary_school"

# Filter, find the closest primary school per rental, then add the driving distance
primary_school_df = feat_sf(school_df, primary_school_feautre, primary_school_type)
primary_school_haversine_df = rental_haversine_closest(cleaned_rental_df, primary_school_df, primary_school_feautre)
primary_school_rental_df = route_dist_and_save_csv(primary_school_haversine_df, primary_school_feautre)

# 'School_Type' values that count as offering secondary education.
# feat_sf matches these with an exact, case-sensitive isin(), so the label is
# capitalised in the same way as "Primary" above.
# NOTE(review): confirm the spelling against the values in the 'School_Type' column.
secondary_school_type = ["Secondary", "Pri/Sec"]
secondary_school_feautre = "secondary_school"

# Same three steps for secondary schools
secondary_school_df = feat_sf(school_df, secondary_school_feautre, secondary_school_type)
secondary_school_haversine_df = rental_haversine_closest(cleaned_rental_df, secondary_school_df, secondary_school_feautre)
secondary_school_rental_df = route_dist_and_save_csv(secondary_school_haversine_df, secondary_school_feautre)


**Combining and saving all features and their distances into one single dataframe**

In [30]:
"""
If restart notebook
"""
# Reloading the per-feature dataframes saved by the cells above (restart path).
# Uncomment this section only if the notebook was restarted part-way through.
# cleaned_rental_df = pd.read_csv(f"{raw_dir}rental_with_coordinates.csv")
# cbd_rental_df = pd.read_csv(f"{curated_dir}rental_with_CBD.csv")
# train_station_rental_df = pd.read_csv(f"{curated_dir}rental_with_train_station.csv")
# shopping_rental_df = pd.read_csv(f"{curated_dir}rental_with_shopping.csv")
# parks_rental_df = pd.read_csv(f"{curated_dir}rental_with_parks.csv")
# primary_school_rental_df = pd.read_csv(f"{curated_dir}rental_with_primary_school.csv")
# secondary_school_rental_df = pd.read_csv(f"{curated_dir}rental_with_secondary_school.csv")

# columns_to_remove = cleaned_rental_df.columns

# # Dropping common columns (the CBD frame keeps them, so they appear once in the result)
# train_station_rental_df = train_station_rental_df.drop(columns=columns_to_remove)
# shopping_rental_df = shopping_rental_df.drop(columns=columns_to_remove)
# parks_rental_df = parks_rental_df.drop(columns=columns_to_remove)
# primary_school_rental_df = primary_school_rental_df.drop(columns=columns_to_remove)
# secondary_school_rental_df = secondary_school_rental_df.drop(columns=columns_to_remove)

# # Combining all columns
# NOTE(review): pd.concat stacks rows (axis=0) by default; placing the per-feature
# columns side by side would need axis=1 - confirm before re-enabling this path.
# NOTE(review): the feature cells above all pass the same cleaned_rental_df, and the
# helper functions add columns to the frame they are given, so a CSV saved later in
# one session may already contain earlier features' columns - check for duplicates.
# combined_df = pd.concat([cbd_rental_df, train_station_rental_df, shopping_rental_df, parks_rental_df, 
#                          primary_school_rental_df, secondary_school_rental_df])

# Save as csv
# combined_df.to_csv(f"{curated_dir}rental_with_all_features.csv", index=False)

"""
If no need to restart notebook
"""
# Save as csv.
# This relies on the feature cells above having added their columns to
# cleaned_rental_df in place during the current session.
cleaned_rental_df.to_csv(f"{curated_dir}rental_with_all_features.csv", index=False)