In [1]:
import re

import geopandas as gpd
import numpy as np
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from scipy.stats import zscore
from shapely.geometry import Point

In [2]:
# Load the raw property listings
raw_listings_path = '../../data/raw/property_details.parquet'
property_df = pd.read_parquet(raw_listings_path)

In [3]:
# Listings with no price (sometimes not given on the website) are dropped, as
# are the nbn_details / property_features columns (often left empty there).
property_df = (
    property_df
    .dropna(subset=['price'])
    .drop(columns=['nbn_details', 'property_features'])
)

In [4]:
# Show every row that still has at least one missing value
rows_with_missing = property_df.isna().any(axis=1)
property_df.loc[rows_with_missing]

Unnamed: 0,title,description,street_address,suburb,postcode,price,bedrooms,bathrooms,parking,primary_property_type,structured_features,video_count,photo_count,date_listed,days_listed,floor_plans_count,virtual_tour,nearby_schools


In [5]:
def remove_unit_number(address):
    """Strip a leading unit number from a street address.

    A prefix made of digits followed by a slash (e.g. the ``2657/`` in
    ``2657/181``) is removed; any other address is returned unchanged.
    """
    leading_unit = re.compile(r'^\d+/')
    return leading_unit.sub('', address)

# Strip the unit prefix from every listing's street address
cleaned_addresses = property_df['street_address'].map(remove_unit_number)
property_df['street_address'] = cleaned_addresses

In [6]:
def extract_dollar_amount(text):
    """Extract the first dollar amount from a price string as a float.

    Parameters
    ----------
    text : str, number or None
        Free-text price such as ``"$1,250.50 per week"``. Values that are
        already numeric are passed through as ``float`` so the function can be
        re-applied safely to a column that has been parsed before.

    Returns
    -------
    float or None
        The amount following the first ``$`` sign with thousands separators
        removed, or ``None`` when no amount can be parsed.
    """
    import numbers  # local import: only needed for the numeric pass-through

    # Already-parsed prices: return them instead of failing inside re.search
    if isinstance(text, numbers.Real):
        return float(text)
    # None and any other non-text value carry no price
    if not isinstance(text, str):
        return None

    # Regular expression to match the dollar amount (handles commas)
    match = re.search(r'\$([\d,]+(\.\d+)?)', text)
    if match is None:
        return None
    try:
        # Remove commas before converting to float
        return float(match.group(1).replace(',', ''))
    except ValueError:
        # The pattern can capture commas with no digits at all (e.g. "$,")
        return None

# Parse each price string, then discard listings with no parsable amount
parsed_prices = property_df['price'].map(extract_dollar_amount)
property_df['price'] = parsed_prices
property_df.dropna(subset=['price'], inplace=True)

In [None]:
# Initialize the geolocator
geolocator = Nominatim(user_agent="property_geocoder")

# The public Nominatim service allows at most one request per second, so every
# lookup goes through a rate limiter instead of hitting the API back-to-back.
# swallow_exceptions=False keeps failures visible to get_coordinates below.
rate_limited_geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1,
    swallow_exceptions=False,
)


def get_coordinates(address):
    """Geocode an address and return its ``(latitude, longitude)``.

    Parameters
    ----------
    address : str
        Free-text address passed to Nominatim.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match returned by Nominatim, or
        ``(None, None)`` when nothing is found or the lookup raises.
    """
    try:
        # Restrict matches to Australia so an ambiguous street address is not
        # resolved to another country.
        # NOTE(review): only the street address is queried here; if it does not
        # already contain the suburb/postcode, consider appending them.
        location = rate_limited_geocode(address, country_codes="au")
    except Exception as e:
        # Best effort: one failed lookup must not abort the whole batch
        print(f"Error getting coordinates for {address}: {e}")
        return (None, None)

    if location is None:
        return (None, None)
    return (location.latitude, location.longitude)

# Geocode every street address (one network request per row, so this is slow)
coordinates = property_df['street_address'].map(get_coordinates)

# Expand the (latitude, longitude) tuples into two columns in a single step
# instead of building one pd.Series per row.
property_df[['latitude', 'longitude']] = pd.DataFrame(
    coordinates.tolist(),
    index=property_df.index,
    columns=['latitude', 'longitude'],
)

# Drop incomplete rows, which includes addresses that could not be geocoded
property_df = property_df.dropna()
print(f"{len(property_df)} properties remain after geocoding")

property_df.to_parquet("property_details_with_longlat.parquet", index=False)

# Preview the coordinates; kept as the last expression so the notebook shows it
property_df[['street_address', 'latitude', 'longitude']].head()

In [19]:
# Load the PTV public transport shapefiles.
# Each variable reads the file that matches its name (the regional and metro
# train station files were previously assigned to each other's variables).
regional_trains_gdf = gpd.read_file("../../data/landing/PTV/PTV_REGIONAL_TRAIN_STATION.shp")
metro_trains_gdf = gpd.read_file("../../data/landing/PTV/PTV_METRO_TRAIN_STATION.shp")

regional_bus_gdf = gpd.read_file("../../data/landing/PTV/PTV_REGIONAL_BUS_STOP.shp")
metro_bus_gdf = gpd.read_file("../../data/landing/PTV/PTV_METRO_BUS_STOP.shp")

trams_gdf = gpd.read_file("../../data/landing/PTV/PTV_METRO_TRAM_STOP.shp")


# Web Mercator (EPSG:3857) stretches distances by roughly 1 / cos(latitude),
# about 27% at Melbourne's latitude, so lengths measured in it are not metres.
# GDA94 / Vicgrid (EPSG:3111) is a projected CRS in metres designed for
# Victoria, so planar distances computed in it are close to ground distances.
METRIC_CRS = "EPSG:3111"

# Convert the property dataframe into a GeoDataFrame and project it
property_gdf = gpd.GeoDataFrame(
    property_df,
    geometry=gpd.points_from_xy(property_df.longitude, property_df.latitude),
    crs="EPSG:4326",
).to_crs(METRIC_CRS)

supermarkets_df = pd.read_csv("../../data/curated/preprocessed_supermarkets.csv")

supermarkets_gdf = gpd.GeoDataFrame(
    supermarkets_df,
    geometry=gpd.points_from_xy(supermarkets_df.Longitude, supermarkets_df.Latitude),
    crs="EPSG:4326",
).to_crs(METRIC_CRS)

# Combine regional and metro stops; every layer must share the metric CRS
# before distances between layers are computed.
trains_gdf = pd.concat([regional_trains_gdf, metro_trains_gdf]).to_crs(METRIC_CRS)
buses_gdf = pd.concat([regional_bus_gdf, metro_bus_gdf]).to_crs(METRIC_CRS)
trams_gdf = trams_gdf.to_crs(METRIC_CRS)


def closest_station_distance(property_point, stations_gdf):
    """Return the distance from a point to the nearest geometry in a layer.

    Parameters
    ----------
    property_point : shapely geometry
        Location of the property, in the same CRS as ``stations_gdf``.
    stations_gdf : GeoDataFrame
        Layer of candidate locations (stations, stops, supermarkets, ...).

    Returns
    -------
    float
        Smallest distance to any geometry of the layer, in CRS units.
    """
    return stations_gdf.geometry.distance(property_point).min()


melbourne_cbd = Point(144.9628, -37.8102)  # (longitude, latitude) in EPSG:4326
melbourne_cbd_gdf = gpd.GeoDataFrame(geometry=[melbourne_cbd], crs="EPSG:4326")
melbourne_cbd_gdf = melbourne_cbd_gdf.to_crs(METRIC_CRS)
melbourne_cbd_point = melbourne_cbd_gdf.geometry.iloc[0]


def distance_to_melbourne_cbd(property_point):
    """Return the distance from ``property_point`` to the projected Melbourne CBD point."""
    return property_point.distance(melbourne_cbd_point)


# One (column, distance function) pair per feature, listed in the order the
# columns are added to the frame.
distance_features = [
    ('distance_to_closest_train', lambda point: closest_station_distance(point, trains_gdf)),
    ('distance_to_closest_bus', lambda point: closest_station_distance(point, buses_gdf)),
    ('distance_to_closest_tram', lambda point: closest_station_distance(point, trams_gdf)),
    ('distance_to_melbourne_cbd', distance_to_melbourne_cbd),
    ('distance_to_closest_supermarket', lambda point: closest_station_distance(point, supermarkets_gdf)),
]

# Apply each distance calculation (metres in METRIC_CRS) to every property
for column_name, distance_fn in distance_features:
    property_gdf[column_name] = property_gdf.geometry.apply(distance_fn)


In [20]:
# Keep only properties whose CBD distance is below 600000 (units of the
# projected CRS); this eliminates NSW properties.
within_range = property_gdf['distance_to_melbourne_cbd'].lt(600000)
property_gdf = property_gdf[within_range]

In [21]:
def remove_z_score_outliers(data, column):
    """Drop the rows of ``data`` whose value in ``column`` is a Z-score outlier.

    The cut-off is 3 for up to 100 records and ``sqrt(2 * ln(N))`` for larger
    samples. Rows whose Z-score is undefined (a missing value, or a column
    with zero variance) are not outliers and are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the numeric column the Z-scores are computed on.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the outlier rows.
    """
    N = len(data)
    values = data[column].to_numpy(dtype=float, na_value=np.nan)

    # Compute threshold based on the number of records
    if N <= 100:
        threshold = 3
    else:
        threshold = np.sqrt(2 * np.log(N))

    # nan_policy='omit' keeps one missing value from turning every Z-score
    # into NaN; errstate silences the 0/0 of a zero-variance column.
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = np.asarray(zscore(values, nan_policy='omit'), dtype=float)
        # NaN compares False, so undefined Z-scores are never flagged
        is_outlier = np.abs(z_scores) > threshold

    # Use the same mask for the report and the filter so they always agree
    print(f"Removing {np.sum(is_outlier)} outliers from {column}")
    data_cleaned = data[~is_outlier]
    return data_cleaned

In [25]:
# 'price' may already have been converted to float earlier in the notebook;
# running the regex parser on a float raises TypeError, so only values that
# are still text are parsed and everything else is left untouched.
property_gdf['price'] = property_gdf['price'].apply(
    lambda value: extract_dollar_amount(value) if isinstance(value, str) else value
)
property_gdf.dropna(subset=['price'], inplace=True)

In [27]:
# Drop the geometry column and go back to a plain DataFrame with a fresh index
attributes_only = property_gdf.drop(columns=['geometry'])
property_df = pd.DataFrame(attributes_only).reset_index(drop=True)

In [28]:
def get_closest_school(nearby_schools):
    """Return the ``'distance'`` of the first school listed for a property.

    Parameters
    ----------
    nearby_schools : sequence of dict or None
        School records, each holding a ``'distance'`` key.

    Returns
    -------
    object or None
        The ``'distance'`` value of the first record, or ``None`` when the
        property has no school records.
    """
    # len() rather than truthiness so array-like values are handled as well
    if nearby_schools is None or len(nearby_schools) == 0:
        return None
    # NOTE(review): presumably the records are ordered nearest-first, which is
    # what makes the first entry the closest school — confirm with the scraper.
    return nearby_schools[0]['distance']

In [29]:
# Discard incomplete rows first, then replace each schools entry with the
# distance taken from its first record.
property_df.dropna(inplace=True)
closest_school_distance = property_df['nearby_schools'].map(get_closest_school)
property_df['nearby_schools'] = closest_school_distance

In [32]:
# Final sweep for missing values, then persist the enriched listings
property_df.dropna(inplace=True)
enriched_listings = property_df.reset_index(drop=True)
enriched_listings.to_parquet("../../data/raw/property_details_w_distances.parquet", index=False)