In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from geopy.geocoders import Nominatim
import re

In [2]:
property_df = pd.read_parquet('../../data/raw/property_details.parquet')

In [3]:
# Drop rows where price = None (Sometimes not given on website)
property_df = property_df[property_df['price'].notnull()]

# Drop nbn_details, property_features columns (often left empty on website)
property_df = property_df.drop(columns=['nbn_details', 'property_features'])

In [4]:
# View rows with None values
property_df[property_df.isnull().any(axis=1)]

Unnamed: 0,title,description,street_address,suburb,postcode,price,bedrooms,bathrooms,parking,primary_property_type,structured_features,video_count,photo_count,date_listed,days_listed,floor_plans_count,virtual_tour,nearby_schools


In [5]:
# Function to clean up the street address by removing the unit number
def remove_unit_number(address):
    # Regular expression to match unit numbers (e.g., 2657/181) and remove the first number (unit number)
    return re.sub(r'^\d+/', '', address)

# Apply the function to the 'street_address' column
property_df['street_address'] = property_df['street_address'].apply(remove_unit_number)

In [6]:
# Initialize the geolocator
geolocator = Nominatim(user_agent="property_geocoder")

# Function to get longitude and latitude
def get_coordinates(address):
    try:
        location = geolocator.geocode(address)
        if location:
            return (location.latitude, location.longitude)
        else:
            return (None, None)
    except Exception as e:
        print(f"Error getting coordinates for {address}: {e}")
        return (None, None)

# Apply the function to the 'street_address' column and store the results in new columns
property_df[['latitude', 'longitude']] = property_df['street_address'].apply(lambda x: pd.Series(get_coordinates(x)))

# Show the updated dataframe with coordinates
property_df[['street_address', 'latitude', 'longitude']]

property_df.dropna(inplace=True)
len(property_df)

property_df.to_csv("property_details_with_longlat.csv", index=False)

Error getting coordinates for 2 Sunrise Place, Wyndham Vale VIC 3024: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=2+Sunrise+Place%2C+Wyndham+Vale+VIC+3024&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))
Error getting coordinates for 3-11 High, North Melbourne VIC 3051: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=3-11+High%2C+North+Melbourne+VIC+3051&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))
Error getting coordinates for 9 High Street, North Melbourne VIC 3051: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=9+High+Street%2C+North+Melbourne+VIC+3051&format=json&limit=1 (Caused by ReadTimeoutE

In [None]:
# Load the train station shapefiles
regional_trains_gdf = gpd.read_file("../../data/landing/PTV/PTV_METRO_TRAIN_STATION.shp")
metro_trains_gdf = gpd.read_file("../../data/landing/PTV/PTV_REGIONAL_TRAIN_STATION.shp")

regional_bus_gdf = gpd.read_file("../../data/landing/PTV/PTV_REGIONAL_BUS_STOP.shp")
metro_bus_gdf = gpd.read_file("../../data/landing/PTV/PTV_METRO_BUS_STOP.shp")

metro_trams_gdf = gpd.read_file("../../data/landing/PTV/PTV_METRO_TRAM_STOP.shp")


# Convert the property dataframe into a GeoDataFrame
property_gdf = gpd.GeoDataFrame(
    property_df, 
    geometry=gpd.points_from_xy(property_df.longitude, property_df.latitude), 
    crs="EPSG:4326"
)

property_gdf.to_crs(epsg=3857, inplace=True)

# Combine both regional and metro train stations into one GeoDataFrame
all_stations_gdf = pd.concat([regional_trains_gdf, metro_trains_gdf, regional_bus_gdf, metro_bus_gdf, metro_trams_gdf])

all_stations_gdf.to_crs(epsg=3857, inplace=True)

# Step 4: Create a function to calculate the distance to the closest train station
def closest_station_distance(property_point, stations_gdf):
    # Calculate the distance between a property and all train stations
    distances = stations_gdf.geometry.distance(property_point)
    # Return the minimum distance
    return distances.min()

# Apply the distance (meters) calculation to each property 
property_gdf['distance_to_closest_ptv'] = property_gdf.geometry.apply(
    lambda x: closest_station_distance(x, all_stations_gdf)
)

# Inspect the updated GeoDataFrame with the new column
property_gdf[['street_address', 'distance_to_closest_ptv']]

Unnamed: 0,street_address,distance_to_closest_ptv
13797,Brighton VIC 3186,235.230486
4948,"767 Glenferrie Road, Hawthorn VIC 3122",93.041715
3199,"406 La Trobe St, Melbourne VIC 3000",24.221090
1739,"47 Robinson Street, Armstrong Creek VIC 3217",1278.507187
10672,"260 Spencer Street, Melbourne VIC 3000",33.875718
...,...,...
13841,"8 Pinnock Street, Bairnsdale VIC 3875",231.786727
6183,"162 Rosslyn Street, West Melbourne VIC 3003",85.534025
3963,"18 Arthur Street, South Yarra VIC 3141",160.384414
1918,"6 Leicester Street, Carlton VIC 3053",138.987626


In [None]:
property_gdf.to_csv("property_details_with_distance_to_closest_ptv.csv", index=False)