# Coordinates of properties, closest distance to schools and CBD

This notebook geocodes each property from its name (the street address) and then calculates its driving distance to the CBD and to the closest school.

It uses the Google Maps APIs (geocoding, nearby places search and distance matrix) together with user-defined functions.

Getting Property coordinates in batches

In [3]:
import os
import googlemaps
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import time

# Initialize the Spark session
spark = SparkSession.builder.appName("Geocode Properties in Batches with Google Maps").getOrCreate()

# Load the Parquet file
input_file_path = '../data/raw/property_data.parquet/.'
df_spark = spark.read.parquet(input_file_path)

# Initialize the Google Maps client.
# The key is read from the environment so that it never ends up in the
# notebook (or in version control); fail early with a clear message if unset.
google_maps_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not google_maps_api_key:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this notebook."
    )
gmaps = googlemaps.Client(key=google_maps_api_key)

# Define a function to geocode an address using Google Maps API with retry logic
def geocode_with_retry(address, retries=3, delay=1):
    """Geocode ``address`` with the module-level ``gmaps`` client.

    Args:
        address: Free-text address to look up. ``None`` or a blank value is
            not sent to the API.
        retries: Maximum number of attempts when the API call raises.
        delay: Initial wait in seconds after an ``OVER_QUERY_LIMIT`` error;
            doubled after each such error.

    Returns:
        ``(latitude, longitude)`` of the first geocoding result, or
        ``(None, None)`` if the address is blank, nothing was found, or
        every attempt failed.
    """
    # A missing/blank address can never be resolved; do not spend API calls on it.
    if address is None or not str(address).strip():
        return None, None

    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            response = gmaps.geocode(address)
        except googlemaps.exceptions.ApiError as e:
            print(f"Error geocoding {address}: {e}")
            # Only wait when another attempt will actually follow.
            if "OVER_QUERY_LIMIT" in str(e) and not is_last_attempt:  # Rate limit exceeded
                print(f"Rate limit exceeded, waiting for {delay} seconds before retrying...")
                time.sleep(delay)
                delay *= 2  # Exponential backoff
            continue
        except Exception as e:
            print(f"Error geocoding {address}: {e}")
            continue

        if response:
            location = response[0]['geometry']['location']
            return location['lat'], location['lng']  # Return as (latitude, longitude)

        # The call succeeded but found nothing: asking again for the same
        # address would only repeat the request, so stop here.
        return None, None

    return None, None

# Spark UDF that yields only the latitude component of a geocoded address
@udf(returnType=DoubleType())
def get_latitude(address):
    """Geocode ``address`` via geocode_with_retry and return its latitude (or None)."""
    coordinates = geocode_with_retry(address)
    return coordinates[0]

@udf(returnType=DoubleType())
def get_longitude(address):
    """Geocode ``address`` via geocode_with_retry and return its longitude (or None)."""
    coordinates = geocode_with_retry(address)
    return coordinates[1]

# Imports needed only by this batching step
from pyspark.sql import Window
from pyspark.sql.functions import floor, monotonically_increasing_id, row_number

batch_size = 1000

# Tag every row with a batch id derived from one fixed row numbering.
# Slicing with limit(end).subtract(limit(start)) is not safe: limit() gives no
# ordering guarantee, so slices can overlap or miss rows, and subtract() is a
# set difference that also drops duplicate rows.
row_order = Window.orderBy(monotonically_increasing_id())
df_indexed = df_spark.withColumn(
    '_batch_id',
    floor((row_number().over(row_order) - 1) / batch_size)
).cache()  # keep the numbering fixed and avoid recomputing it for every batch

# Get the total number of properties
total_properties = df_indexed.count()
# Ceiling division: no empty trailing batch when the total is an exact multiple
num_batches = (total_properties + batch_size - 1) // batch_size

# Directory to save Parquet files
output_dir = os.path.abspath('../data/raw/batch_output')

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Process properties in batches
for batch_number in range(num_batches):
    # Get the current batch of properties (helper column removed again)
    df_batch = df_indexed.filter(df_indexed['_batch_id'] == batch_number).drop('_batch_id')

    # Count before the geocoding columns are attached, so reporting the size
    # never involves the API-backed UDFs
    batch_rows = df_batch.count()

    # Apply the UDFs to get latitude and longitude for the current batch
    df_batch = df_batch.withColumn('latitude', get_latitude(df_batch['name']))
    df_batch = df_batch.withColumn('longitude', get_longitude(df_batch['name']))

    # Define the output file path for the current batch
    output_file_path = os.path.join(output_dir, f'properties_batch_{batch_number+1}.parquet')

    # Save the updated DataFrame with latitude and longitude to a new Parquet file
    df_batch.write.mode("overwrite").parquet(output_file_path)

    # Print progress
    print(f"Batch {batch_number+1} with {batch_rows} properties saved to {output_file_path}")

    time.sleep(5)  # Adjust this based on rate limit requirements

df_indexed.unpersist()

print("Geocoding and saving all properties complete.")

In [9]:
# Shut down the session used for geocoding; the later cells request a session again via getOrCreate().
spark.stop()  # Stops the existing Spark session

Merging all batch parquets

In [23]:
import os
import pandas as pd

# Folder holding one sub-folder per geocoded batch; each sub-folder contains
# the Parquet part files written for that batch
base_dir = '../data/raw/batch_output'

# Collect one DataFrame per Parquet part file
df_list = []

# Walk the batch folders in a fixed (sorted) order so the merged row order is reproducible
for folder in sorted(os.listdir(base_dir)):
    folder_path = os.path.join(base_dir, folder)
    if not os.path.isdir(folder_path):
        continue
    for file in sorted(os.listdir(folder_path)):
        # Only the data files end in '.parquet'; other files in the folder are skipped
        if file.endswith('.parquet'):
            df_list.append(pd.read_parquet(os.path.join(folder_path, file)))

# pd.concat raises an unhelpful error on an empty list; report the real problem instead
if not df_list:
    raise FileNotFoundError(f"No batch Parquet files found under {base_dir}")

# Concatenate all DataFrames into one
merged_df = pd.concat(df_list, ignore_index=True)

# Save next to the other raw data: the later cells load
# '../data/raw/property_coords.parquet', not a file in the working directory
merged_df.to_parquet('../data/raw/property_coords.parquet')


In [2]:
#pip3 install googlemaps
#pip3 install osmnx geopy

Distance to CBD

In [17]:
import os
import googlemaps
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
import time

# Initialize the Spark session
spark = SparkSession.builder.appName("Distance to Melbourne Central Station").getOrCreate()

# Load the Parquet file with properties and their coordinates
input_file_path = '../data/raw/property_coords.parquet'
df_properties = spark.read.parquet(input_file_path)
df_properties = df_properties.select('name', 'postcode', 'latitude', 'longitude')

# Initialize the Google Maps client.
# The key is read from the environment so that it never ends up in the
# notebook (or in version control); fail early with a clear message if unset.
google_maps_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not google_maps_api_key:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this notebook."
    )
gmaps = googlemaps.Client(key=google_maps_api_key)

# Define a function to calculate the distance between a property and Melbourne Central Station using Google Maps Distance Matrix API
def calculate_distance_to_central_station(property_lat, property_lng):
    """Return the driving distance from a property to Melbourne Central Station.

    Args:
        property_lat: Latitude of the property, or None if it is unknown.
        property_lng: Longitude of the property, or None if it is unknown.

    Returns:
        The distance reported by ``gmaps.distance_matrix`` divided by 1000
        (kilometres), or None when a coordinate is missing, the element
        status is not 'OK', or the request fails.
    """
    # Without both coordinates the origin would be sent as the text
    # "None,None"; skip the request entirely.
    if property_lat is None or property_lng is None:
        return None

    try:
        origins = f"{property_lat},{property_lng}"
        # Melbourne Central Station coordinates
        destination = "-37.810592,144.962999"
        response = gmaps.distance_matrix(origins, destination, mode="driving")
        # One origin and one destination -> a single result element
        element = response['rows'][0]['elements'][0]
        if element['status'] != 'OK':
            return None
        return element['distance']['value'] / 1000  # Distance in kilometers
    except Exception as e:
        print(f"Error calculating distance to Melbourne Central Station: {e}")
        return None

# Spark UDF wrapper around calculate_distance_to_central_station
@udf(returnType=DoubleType())
def get_distance_to_central_station(property_lat, property_lng):
    """Column-level entry point: distance in km to Melbourne Central Station, or None."""
    distance_km = calculate_distance_to_central_station(property_lat, property_lng)
    return distance_km

# Build the distance column from the coordinate columns, then attach it
cbd_distance_column = get_distance_to_central_station(col('latitude'), col('longitude'))
df_properties_with_distance = df_properties.withColumn('distance_to_cbd', cbd_distance_column)

# Preview the first rows as a sanity check
df_properties_with_distance.show(truncate=False)

# Destination of the enriched table
output_file_path = '../data/raw/cbd_dist.parquet'

# Persist the result, replacing any previous run's output
(
    df_properties_with_distance
    .write
    .mode("overwrite")
    .parquet(output_file_path)
)

print(f"All properties with distance to Melbourne Central Station saved to {output_file_path}")


24/10/01 19:59:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+-------------------------------------------------+--------+------------------+-----------+---------------+
|name                                             |postcode|latitude          |longitude  |distance_to_cbd|
+-------------------------------------------------+--------+------------------+-----------+---------------+
|201/1-5 Donald Street, Prahran VIC 3181          |3181    |-37.8503323       |144.9861498|6.517          |
|201/1273 Burke Road, Kew VIC 3101                |3101    |-37.8074455       |145.0610387|13.715         |
|201/14 David Street, Richmond VIC 3121           |3121    |-37.8154413       |145.0111408|5.667          |
|201/1728 Dandenong Road, Clayton VIC 3168        |3168    |-37.9130175       |145.1242551|22.91          |
|201/175 Balaclava Road, Caulfield North VIC 3161 |3161    |-37.8724194       |145.0214855|11.156         |
|201/2a Montrose Place, Hawthorn East VIC 3123    |3123    |-37.8228565       |145.0458156|12.382         |
|201/324 Pascoe Vale Road, E

[Stage 45:>                                                         (0 + 1) / 1]

All properties with distance to Melbourne Central Station saved to ../data/raw/features_property.parquet


                                                                                

Schools in 5 km radius

In [24]:
import os
import googlemaps
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
import time

# Initialize the Spark session
spark = SparkSession.builder.appName("Find Schools within 5km").getOrCreate()

# Load the Parquet file with property coordinates
input_file_path = '../data/raw/property_coords.parquet'
df_properties = spark.read.parquet(input_file_path)
df_properties = df_properties.select('name', 'postcode', 'latitude', 'longitude')

# Initialize the Google Maps client.
# The key is read from the environment so that it never ends up in the
# notebook (or in version control); fail early with a clear message if unset.
google_maps_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not google_maps_api_key:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this notebook."
    )
gmaps = googlemaps.Client(key=google_maps_api_key)

# Count the schools the Google Places nearby search returns around a point
def count_schools_nearby(latitude, longitude, radius=2000):
    """Return how many 'school' places the nearby search reports around a location.

    Args:
        latitude: Latitude of the search centre.
        longitude: Longitude of the search centre.
        radius: Search radius passed to the API; defaults to 2000.

    Returns:
        The length of the 'results' list of the response, or 0 if the
        lookup raises for any reason (the error is printed).
    """
    # NOTE(review): only the first response's 'results' are counted; the sample
    # output shows 20 for every property, so the count is presumably capped by
    # the API page size - confirm.
    search_args = {
        'location': (latitude, longitude),
        'radius': radius,
        'type': 'school',
    }
    try:
        places_result = gmaps.places_nearby(**search_args)
        school_count = len(places_result['results'])
    except Exception as e:
        print(f"Error fetching schools for location ({latitude}, {longitude}): {e}")
        school_count = 0
    return school_count

# Register UDF to count the number of schools for each property
@udf(returnType=IntegerType())
def get_number_of_schools(latitude, longitude):
    """Return the number of schools found within 5 km of the given coordinates.

    The result is stored in the column 'number_of_schools_within_5km', so the
    radius is passed explicitly: count_schools_nearby's own default is 2000,
    which would not match the column's meaning.
    """
    return count_schools_nearby(latitude, longitude, radius=5000)

# Derive the school-count column from the coordinate columns and attach it
school_count_column = get_number_of_schools(
    df_properties['latitude'],
    df_properties['longitude'],
)
df_properties = df_properties.withColumn('number_of_schools_within_5km', school_count_column)

# Preview the first rows as a sanity check
df_properties.show(truncate=False)




[Stage 65:>                                                         (0 + 1) / 1]

+-------------------------------------------------+--------+------------------+-----------+----------------------------+
|name                                             |postcode|latitude          |longitude  |number_of_schools_within_5km|
+-------------------------------------------------+--------+------------------+-----------+----------------------------+
|201/1-5 Donald Street, Prahran VIC 3181          |3181    |-37.8503323       |144.9861498|20                          |
|201/1273 Burke Road, Kew VIC 3101                |3101    |-37.8074455       |145.0610387|20                          |
|201/14 David Street, Richmond VIC 3121           |3121    |-37.8154413       |145.0111408|20                          |
|201/1728 Dandenong Road, Clayton VIC 3168        |3168    |-37.9130175       |145.1242551|20                          |
|201/175 Balaclava Road, Caulfield North VIC 3161 |3161    |-37.8724194       |145.0214855|20                          |
|201/2a Montrose Place, Hawthorn

                                                                                

In [27]:
import os
import googlemaps
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
import time

# Initialize the Spark session
spark = SparkSession.builder.appName("Closest School Distance").getOrCreate()

# Load the Parquet file with properties and their coordinates
input_file_path = '../data/raw/property_coords.parquet'
df_properties = spark.read.parquet(input_file_path)
df_properties = df_properties.select('name', 'postcode', 'latitude', 'longitude')

# Initialize the Google Maps client.
# The key is read from the environment so that it never ends up in the
# notebook (or in version control); fail early with a clear message if unset.
google_maps_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not google_maps_api_key:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this notebook."
    )
gmaps = googlemaps.Client(key=google_maps_api_key)

# Define a function to find the closest school and its distance
def find_closest_school(latitude, longitude):
    """Return the driving distance in km to the nearest school found within 5 km.

    Schools are looked up with ``gmaps.places_nearby`` (radius 5000, type
    'school'); the driving distance to each of them is then requested with
    ``gmaps.distance_matrix`` and the smallest one is returned.

    Args:
        latitude: Latitude of the property, or None if it is unknown.
        longitude: Longitude of the property, or None if it is unknown.

    Returns:
        The smallest reported distance divided by 1000 (kilometres), or None
        when a coordinate is missing, no school is found, no distance element
        has status 'OK', or a request fails.
    """
    # Properties without coordinates cannot be searched around
    if latitude is None or longitude is None:
        return None

    try:
        # Search for schools within a 5km radius
        nearby_schools = gmaps.places_nearby(
            location=(latitude, longitude),
            radius=5000,
            type='school'
        )

        schools = nearby_schools.get('results') or []
        if not schools:
            return None  # No schools found within the radius

        # One "lat,lng" destination string per school
        destinations = [
            f"{school['geometry']['location']['lat']},{school['geometry']['location']['lng']}"
            for school in schools
        ]
        origins = f"{latitude},{longitude}"

        # Driving distance from the property to every school in a single request
        # NOTE(review): assumes the number of schools stays within the
        # per-request destination limit of the Distance Matrix API - confirm.
        distance_result = gmaps.distance_matrix(origins, destinations, mode="driving")

        # Keep only the elements for which a route was found
        distances = [
            element['distance']['value']
            for element in distance_result['rows'][0]['elements']
            if element['status'] == 'OK'
        ]

        if not distances:
            return None  # No valid distances found

        return min(distances) / 1000  # Return the closest distance in kilometers

    except Exception as e:
        print(f"Error finding closest school for ({latitude}, {longitude}): {e}")
        return None

# Spark UDF wrapper around find_closest_school
@udf(returnType=DoubleType())
def get_closest_school_distance(latitude, longitude):
    """Column-level entry point: distance in km to the closest school, or None."""
    closest_km = find_closest_school(latitude, longitude)
    return closest_km


# Build the closest-school distance column from the coordinates and attach it
closest_school_column = get_closest_school_distance(col('latitude'), col('longitude'))
df_properties = df_properties.withColumn('distance_to_closest_school_km', closest_school_column)

# Preview the first rows as a sanity check
df_properties.show(truncate=False)

# Resolve the destination to an absolute path
output_file_path = os.path.abspath('../data/raw/school_distances.parquet')

# Persist the result, replacing any previous run's output
(
    df_properties
    .write
    .mode("overwrite")
    .parquet(output_file_path)
)


                                                                                

+-------------------------------------------------+--------+------------------+-----------+-----------------------------+
|name                                             |postcode|latitude          |longitude  |distance_to_closest_school_km|
+-------------------------------------------------+--------+------------------+-----------+-----------------------------+
|201/1-5 Donald Street, Prahran VIC 3181          |3181    |-37.8503323       |144.9861498|0.499                        |
|201/1273 Burke Road, Kew VIC 3101                |3101    |-37.8074455       |145.0610387|0.798                        |
|201/14 David Street, Richmond VIC 3121           |3121    |-37.8154413       |145.0111408|1.054                        |
|201/1728 Dandenong Road, Clayton VIC 3168        |3168    |-37.9130175       |145.1242551|1.3                          |
|201/175 Balaclava Road, Caulfield North VIC 3161 |3161    |-37.8724194       |145.0214855|1.592                        |
|201/2a Montrose Place, 

                                                                                

In [17]:
from pyspark.sql import SparkSession


# Inputs: closest-school distances and CBD distances
parquet_file_path1 = '../data/raw/school_distances.parquet'
parquet_file_path2 = '../data/raw/cbd_dist.parquet'

# Read both tables with the active Spark session
df1, df2 = (
    spark.read.parquet(path)
    for path in (parquet_file_path1, parquet_file_path2)
)

# Columns shared by both tables; they form the join key
common_columns = ['name', 'postcode', 'latitude', 'longitude']

# Outer join keeps properties that appear in only one of the two tables
# NOTE(review): rows whose key columns are null will not match across the two
# tables and would show up once per side - confirm whether such rows exist.
merged_df = df1.join(df2, on=common_columns, how='outer')
merged_df.show()

# Store the combined distances in the curated layer
output_file_path = '../data/curated/distances.parquet'
merged_df.write.mode("overwrite").parquet(output_file_path)

+--------------------+--------+------------------+-----------+-----------------------------+---------------+
|                name|postcode|          latitude|  longitude|distance_to_closest_school_km|distance_to_cbd|
+--------------------+--------+------------------+-----------+-----------------------------+---------------+
|(Leased) 3 Yarra ...|    3141|        -37.838131|144.9927343|                         1.54|          5.569|
|0 Cnr Watson Rd &...|    3953|       -38.4689049|145.9566144|                        0.413|        136.184|
|004B/12 Albert St...|    3123|       -37.8236023|145.0480671|                        1.237|          9.083|
|04/390 Burwood Hi...|    3125|       -37.8521232|145.1309439|                        1.409|         17.834|
|04/949 Dandenong ...|    3145|       -37.8785804|145.0483283|                        2.226|         14.936|
|07/390 Burwood Hi...|    3125|       -37.8521232|145.1309439|                        1.409|         17.834|
|1 & 2/23 Koonawar.

Getting average distances to the CBD and the closest school for each postcode

In [21]:
# Import needed only by this aggregation step
from pyspark.sql import functions as F

# Load the merged distances explicitly instead of relying on whatever
# `df_spark` currently points to: in a top-to-bottom run it is the raw
# property table, which has no distance columns.
data = spark.read.parquet('../data/curated/distances.parquet')

# Placeholder for postcodes that have no usable distance at all
epsilon = 0.00001
distance_columns = ['distance_to_closest_school_km', 'distance_to_cbd']

# Calculate the mean distance, grouping by postcode.
# The mean is taken over the known values only (avg skips nulls); epsilon is
# applied afterwards, and only where a postcode has no value at all. Filling
# nulls with epsilon before averaging would drag the means towards zero.
mean_distances = data.groupBy('postcode').agg(*[
    F.coalesce(F.avg(column), F.lit(epsilon)).alias(f'avg({column})')
    for column in distance_columns
])
mean_distances.show()

# Overwrite so that re-running the cell does not fail on an existing output
mean_distances.write.mode("overwrite").parquet('../data/curated/avg_dist.parquet')



+--------+----------------------------------+--------------------+
|postcode|avg(distance_to_closest_school_km)|avg(distance_to_cbd)|
+--------+----------------------------------+--------------------+
|    3414|                             0.395|             336.669|
|    3015|                0.7306595744680852|  12.165212765957447|
|    3281|                            1.9805|             254.691|
|    3858|                            0.8935|  199.78300000000002|
|    3200|                1.1157857142857142|    51.6767142857143|
|    3121|                1.6963562500000005|   5.077168749999999|
|    3249|                            1.0E-5|             163.624|
|    3266|                0.6285000000000001|             200.784|
|    3167|                0.9129473684210526|  21.651526315789475|
|    3898|                             1.158|  393.83450000000005|
|    3179|                0.9099999999999999|             31.4574|
|    3875|                0.9314285714285715|   281.5841428571