In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.metrics.pairwise import haversine_distances


# Directories
data_dir = '../data/'
landing_dir = data_dir + 'landing/'
raw_dir = data_dir + 'raw/'
curated_dir = data_dir + 'curated/'

# Download files
foi_sf = gpd.read_file(f"{landing_dir}FOI/GEOMARK_POLYGON.shp")
rental_df = pd.read_csv(f"{raw_dir}rental_with_coordinates.csv")


In [2]:
# Setting shapefile format
foi_sf['geometry'] = foi_sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

# List of subfeature types: shopping precincts and shopping centres only
shopping_labels = ["shopping precinct", "shopping centre"]

# Removing irrelevant features, as we only want shopping centers in VIC
shopping_sf = foi_sf[foi_sf['STATE'] == "VIC"]
shopping_sf = shopping_sf[shopping_sf['FTYPE'] == "commercial facility"]
shopping_sf = shopping_sf[shopping_sf['FEATSUBTYP'].isin(shopping_labels)]
shopping_sf = shopping_sf.reset_index(drop=True)

# Cleaning the rental dataframe to remove null entries for coordinates
rental_df = rental_df.dropna(subset=['latitude', 'longitude'])
rental_df = rental_df.reset_index(drop=True)

In [3]:

# Creating an array of centroids of polygons in the shopping feature shapefiles
shopping_sf['centroid'] = shopping_sf.centroid
shopping_centroid_list = shopping_sf['centroid'].apply(lambda geom: (geom.y, geom.x))
shopping_centroid_list = list(shopping_centroid_list)
shopping_centroid_array = np.array(shopping_centroid_list)

# Turn rental coordinates into a numpy array
rental_centroid_array = rental_df[['latitude', 'longitude']].to_numpy()

# Turn the arrays into radians for distance calculation
shopping_centroid_radians = np.radians(shopping_centroid_array)
rental_centroid_radians = np.radians(rental_centroid_array)

# Used Haversine distance as the earth's curve may affect distance 
distances_radians = haversine_distances(shopping_centroid_radians, rental_centroid_radians)
distances_km = distances_radians * 6371

# Grabbing the id of the nearest shopping feature
nearest_point_id = np.argmin(distances_km, axis=0)
# Grabbing distance between the rental and the nearest shopping feature
nearest_distance = np.min(distances_km, axis=0)

# Add to dataframe
rental_df['nearest_shopping_id'] = shopping_sf.index[nearest_point_id]
rental_df['distance_to_nearest_shopping'] = nearest_distance



  shopping_sf['centroid'] = shopping_sf.centroid


array([[-38.0045428, 145.0884301],
       [-37.7863384, 145.1237982],
       [-37.9523213, 145.1736   ],
       ...,
       [-37.8091737, 145.0833678],
       [-37.7930145, 145.1318194],
       [-37.4304853, 143.8912389]])

In [5]:
# Save dataframe to csv
rental_df.to_csv(f"{curated_dir}rental_with_shopping_features.csv")