In [None]:
import os

import geopandas as gpd
import googlemaps
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import haversine_distances

def feat_sf(shapefile, feat_type, feat_subtypes):
    """Select Victorian features of one type and attach their centroid coordinates.

    Args:
        shapefile: Geo-enabled frame (the visible callers pass the result of
            ``gpd.read_file``) with 'STATE', 'FTYPE', 'FEATSUBTYP' and
            'geometry' columns.
        feat_type: Value that the 'FTYPE' column must equal.
        feat_subtypes: Collection of accepted 'FEATSUBTYP' values.

    Returns:
        The matching rows, re-indexed from 0, with 'geometry' converted to
        WGS84 longitude/latitude and three added columns: 'centroid'
        (a ``(latitude, longitude)`` tuple), 'latitude' and 'longitude'.
    """
    # Keep only rows in VIC that match the requested type and subtypes
    keep_rows = (
        (shapefile['STATE'] == "VIC")
        & (shapefile['FTYPE'] == feat_type)
        & shapefile['FEATSUBTYP'].isin(feat_subtypes)
    )
    filtered_sf = shapefile[keep_rows].reset_index(drop=True)

    # Express the geometries as WGS84 longitude/latitude
    wgs84 = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
    filtered_sf['geometry'] = filtered_sf['geometry'].to_crs(wgs84)

    # Centroid of every polygon, stored as (latitude, longitude)
    polygon_centres = filtered_sf['geometry'].centroid
    filtered_sf['centroid'] = polygon_centres.apply(lambda point: (point.y, point.x))

    # Split the tuple into separate latitude / longitude columns
    filtered_sf['latitude'] = filtered_sf['centroid'].apply(lambda pair: pair[0])
    filtered_sf['longitude'] = filtered_sf['centroid'].apply(lambda pair: pair[1])

    return filtered_sf

def rental_feat_haversine_closest(rental_df, filtered_sf, feature_name):
    """Attach the nearest feature (by great-circle distance) to every rental.

    Args:
        rental_df: Frame with 'latitude' and 'longitude' columns in degrees.
            It is modified in place (four columns are added) and also returned.
        filtered_sf: Frame of candidate features with 'NAME', 'latitude' and
            'longitude' columns in degrees. Any index is accepted.
        feature_name: Label used to build the new column names.

    Returns:
        ``rental_df`` with these columns added:
        ``nearest_{feature_name}_name``, ``nearest_{feature_name}_latitude``,
        ``nearest_{feature_name}_longitude`` and
        ``straight_line_distance_{feature_name}`` (kilometres).

    Raises:
        ValueError: If ``filtered_sf`` has no rows, so there is nothing to match.
    """
    if len(filtered_sf) == 0:
        raise ValueError(
            f"Cannot find the nearest '{feature_name}': the feature set is empty"
        )

    # Mean Earth radius, used to turn the angular haversine result into kilometres
    earth_radius_km = 6371

    # Convert the latitude and longitude to separate numpy arrays of radians
    feat_latitudes = np.radians(filtered_sf['latitude'].to_numpy())
    feat_longitudes = np.radians(filtered_sf['longitude'].to_numpy())
    rental_latitudes = np.radians(rental_df['latitude'].to_numpy())
    rental_longitudes = np.radians(rental_df['longitude'].to_numpy())

    # Combine latitudes and longitudes into 2D (n, 2) arrays of radians
    feat_centroid_radians = np.column_stack((feat_latitudes, feat_longitudes))
    rental_centroid_radians = np.column_stack((rental_latitudes, rental_longitudes))

    # Haversine distance accounts for the earth's curvature.
    # Result has one row per feature and one column per rental.
    distances_radians = haversine_distances(feat_centroid_radians, rental_centroid_radians)
    distances_km = distances_radians * earth_radius_km

    # Position (not index label) of the nearest feature for each rental
    nearest_position = np.argmin(distances_km, axis=0)
    # Distance between each rental and its nearest feature
    nearest_distance = np.min(distances_km, axis=0)

    # argmin yields positions, so look the features up positionally; label-based
    # .loc would only be correct when filtered_sf has a default 0..n-1 index.
    rental_df[f'nearest_{feature_name}_name'] = filtered_sf['NAME'].to_numpy()[nearest_position]
    rental_df[f'nearest_{feature_name}_latitude'] = filtered_sf['latitude'].to_numpy()[nearest_position]
    rental_df[f'nearest_{feature_name}_longitude'] = filtered_sf['longitude'].to_numpy()[nearest_position]
    rental_df[f'straight_line_distance_{feature_name}'] = nearest_distance
    return rental_df

# Driving route distance between two points via the Google Maps Distance Matrix API
def calculate_route_distance(property_coords, destination_coords, gmaps_client):
    """Return the driving distance in kilometres, or None when it cannot be obtained.

    Args:
        property_coords: Origin, passed to the client as a one-element list.
        destination_coords: Destination, passed to the client as a one-element list.
        gmaps_client: Object exposing ``distance_matrix(origins=, destinations=, mode=)``.

    Returns:
        The reported distance value divided by 1000 (metres to kilometres), or
        None when the element status is not 'OK' or any exception is raised.
        In both failure cases a message is printed instead of raising.
    """
    try:
        response = gmaps_client.distance_matrix(
            origins=[property_coords],
            destinations=[destination_coords],
            mode="driving",
        )

        # Only one origin/destination pair was requested, so read the first element
        element = response['rows'][0]['elements'][0]
        status = element['status']

        if status != 'OK':
            print(f"No valid route distance found for {property_coords} to {destination_coords}: {status}")
            return None

        metres = element['distance']['value']
        return metres / 1000
    except Exception as e:
        # Best effort: report the failure and let the caller carry on
        print(f"Error calculating route distance for {property_coords}: {e}")
        return None

def route_dist_and_save_csv(rental_df, feature_name):
    """Add the driving distance to each rental's nearest feature and save the result.

    Reads the module-level ``gmaps`` client and ``curated_dir`` path prefix.

    Args:
        rental_df: Frame with 'latitude', 'longitude',
            ``nearest_{feature_name}_latitude`` and
            ``nearest_{feature_name}_longitude`` columns. It is modified in
            place (one column is added) and also returned.
        feature_name: Label used in the new column name and the output file name.

    Returns:
        ``rental_df`` with a ``route_distance_{feature_name}`` column holding the
        value from ``calculate_route_distance`` (NaN where that returned None).

    Side effects:
        Writes ``{curated_dir}rental_with_{feature_name}.csv`` after every 100
        processed rows and once more when the loop finishes, so the file always
        exists and contains the final rows.
    """
    distance_col = f'route_distance_{feature_name}'
    output_path = f"{curated_dir}rental_with_{feature_name}.csv"
    rental_df[distance_col] = np.nan

    # Count processed rows positionally so checkpointing does not depend on the
    # index holding consecutive integers
    for processed, (index, rental) in enumerate(rental_df.iterrows(), start=1):
        property_coord = (rental['latitude'], rental['longitude'])
        feat_coord = (rental[f'nearest_{feature_name}_latitude'], rental[f'nearest_{feature_name}_longitude'])
        route_distance = calculate_route_distance(property_coord, feat_coord, gmaps)
        # A failed lookup returns None; keep the column numeric by storing NaN
        rental_df.at[index, distance_col] = np.nan if route_distance is None else route_distance

        if processed % 100 == 0:
            print(f"Processed {processed} rows, saving progress...")
            rental_df.to_csv(output_path, index=False)

    # Final save: without it the rows after the last multiple of 100 are never written
    rental_df.to_csv(output_path, index=False)
    return rental_df


# Initialise the Google Maps API client.
# The key is read from the GOOGLE_MAPS_API_KEY environment variable so a real
# credential never has to be written into the notebook; the placeholder is only
# used when the variable is not set.
google_apikey = os.environ.get('GOOGLE_MAPS_API_KEY', 'your_key')
gmaps = googlemaps.Client(key = google_apikey)

# Directories
data_dir = '../data/'
landing_dir = data_dir + 'landing/'
raw_dir = data_dir + 'raw/'
curated_dir = data_dir + 'curated/'

# Load the feature-of-interest polygons and the rentals from disk
shopping_parks_foi_sf = gpd.read_file(f"{landing_dir}FOI/GEOMARK_POLYGON.shp")
rental_df = pd.read_csv(f"{raw_dir}rental_with_coordinates.csv")

pd.set_option('display.max_columns', None)

# Drop rentals without coordinates and renumber the rows from 0;
# rental_df itself is left untouched
cleaned_rental_df = (
    rental_df
    .dropna(subset=['latitude', 'longitude'])
    .reset_index(drop=True)
)

In [None]:
# Initialise the distance column only when it is absent, so re-running this cell
# keeps distances that were already fetched and retries only the missing ones
if 'distance_to_cbd_km' not in cleaned_rental_df.columns:
    cleaned_rental_df['distance_to_cbd_km'] = None

# Coordinates for Melbourne CBD (latitude, longitude for Google Maps)
melbourne_cbd_coords = [-37.8136, 144.9631]

# Pair each rental's latitude and longitude into one (latitude, longitude) tuple
cleaned_rental_df['coordinates'] = cleaned_rental_df.apply(lambda row: (row['latitude'], row['longitude']), axis=1)
feature_name = "cbd_distances"
cbd_output_path = f"{curated_dir}rental_with_{feature_name}.csv"

# Process each property and save progress incrementally.
# `processed` counts rows positionally, so checkpointing does not rely on the
# index labels being consecutive integers.
for processed, (idx, row) in enumerate(cleaned_rental_df.iterrows(), start=1):
    if pd.isnull(row['distance_to_cbd_km']):  # Only process rows with null distances
        coords = row['coordinates']
        if coords:  # Ensure the coordinates are valid
            distance = calculate_route_distance(coords, melbourne_cbd_coords, gmaps)
            cleaned_rental_df.loc[idx, 'distance_to_cbd_km'] = distance

    # Save progress every 100 rows
    if processed % 100 == 0:
        print(f"Processed {processed} rows, saving progress...")
        cleaned_rental_df.to_csv(cbd_output_path, index=False)

# Final save after processing all data
cleaned_rental_df.to_csv(cbd_output_path, index=False)

In [3]:


# Feature-of-interest type and subtypes that count as "shopping"
shopping_type = "commercial facility"
shopping_feature = "shopping"
shopping_labels = ["shopping precinct", "shopping centre"]

shopping_sf = feat_sf(shopping_parks_foi_sf, shopping_type, shopping_labels)

# rental_feat_haversine_closest and route_dist_and_save_csv both add columns to
# the frame they receive and return that same object. Pass a copy so
# cleaned_rental_df stays free of shopping columns and other feature cells
# start from the same clean input.
shopping_haversine_df = rental_feat_haversine_closest(cleaned_rental_df.copy(), shopping_sf, shopping_feature)
shopping_rental_df = route_dist_and_save_csv(shopping_haversine_df, shopping_feature)

In [None]:
# Feature-of-interest type and subtypes that count as "parks"
parks_type = "reserve"
parks_feature = "parks"
parks_labels = ["park", "conservation park", "gardens", "national park", "city square"]

parks_sf = feat_sf(shopping_parks_foi_sf, parks_type, parks_labels)

# rental_feat_haversine_closest and route_dist_and_save_csv both add columns to
# the frame they receive and return that same object. Pass a copy so
# cleaned_rental_df stays free of parks columns and frames produced by other
# feature cells are not altered by this one.
parks_haversine_df = rental_feat_haversine_closest(cleaned_rental_df.copy(), parks_sf, parks_feature)
parks_rental_df = route_dist_and_save_csv(parks_haversine_df, parks_feature)


In [None]:
# Load the curated per-feature outputs
school_cbd_train_df = pd.read_csv(f"{curated_dir}rental_distances_cbd_trains_schools.csv")
parks_df = pd.read_csv(f"{curated_dir}rental_with_parks.csv")
shopping_df = pd.read_csv(f"{curated_dir}rental_with_shopping.csv")

# Join parks and shopping on every column of the raw rental frame
similar_cols = list(rental_df.columns)
combined_parks_shopping = parks_df.merge(shopping_df, on=similar_cols)

# Keep one school/CBD/train row per location, then inner-join it on the location keys
location_keys = ['coordinates', 'latitude', 'longitude']
school_cbd_train_df = school_cbd_train_df.drop_duplicates(subset=location_keys)
combined_all = combined_parks_shopping.merge(school_cbd_train_df, how='inner', on=location_keys)

# Drop the listed columns and strip the merge suffix from the two that are kept
unwanted_cols = ['Unnamed: 0', 'Cost_y', 'Beds', 'Baths', 'Cars', 'Address_y', 'Property Type']
cleaned_combined_all = (
    combined_all
    .drop(columns=unwanted_cols)
    .rename(columns={'Cost_x': 'Cost', 'Address_x': 'Address'})
)

# Title-case the nearest-feature names
for name_col in ('nearest_parks_name', 'nearest_shopping_name'):
    cleaned_combined_all[name_col] = cleaned_combined_all[name_col].str.title()

cleaned_combined_all.to_csv(f"{curated_dir}combined_rental_features.csv", index=False)