In [21]:
from pyspark.sql import SparkSession

# Build the Spark session for this notebook; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

# Read the property listings (with their latitude/longitude columns) into Spark
parquet_file_path = '../data/raw/property_coords.parquet'
df_spark = spark.read.parquet(parquet_file_path)

# Preview the first rows as a quick sanity check of the schema and values
df_spark.show()

# One row per property, so the row count is the number of properties
num_properties = df_spark.count()
print(f"There are {num_properties} properties in the Parquet file.")


+--------------------+--------+------------------+--------------------+-------------+----+-----+-------+--------------------+------------------+-----------+
|                 url|postcode|            suburb|                name|    cost_text|beds|baths|parking|       property_type|          latitude|  longitude|
+--------------------+--------+------------------+--------------------+-------------+----+-----+-------+--------------------+------------------+-----------+
|https://www.domai...|    3181|           prahran|201/1-5 Donald St...|$340 per week|   1|    1|      1|              Studio|       -37.8503323|144.9861498|
|https://www.domai...|    3101|               kew|201/1273 Burke Ro...|$675 per week|   2|    2|      1|Apartment / Unit ...|       -37.8074455|145.0610387|
|https://www.domai...|    3121|          richmond|201/14 David Stre...|  $650 weekly|   2|    1|      1|Apartment / Unit ...|       -37.8154413|145.0111408|
|https://www.domai...|    3168|           clayton|201/1728

In [None]:
import os

import googlemaps

# Read the Google Maps API key from the environment instead of embedding it in
# the notebook, so the credential is never saved or committed with the file.
# SECURITY: a real key used to be hard-coded in this cell; treat that key as
# leaked and rotate it in the Google Cloud console.
gmaps_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not gmaps_api_key:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this cell."
    )

# Initialize the Google Maps client
gmaps = googlemaps.Client(key=gmaps_api_key)

# Geocode a well-known address as a smoke test of the key and client
geocode_result = gmaps.geocode('1600 Amphitheatre Parkway, Mountain View, CA')

print(geocode_result)


In [3]:
import os
import googlemaps
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import time

# Initialize the Spark session
spark = SparkSession.builder.appName("Geocode Properties in Batches with Google Maps").getOrCreate()

# Load the Parquet file
input_file_path = '../data/raw/property_data.parquet/.'
df_spark = spark.read.parquet(input_file_path)

# Read the Google Maps API key from the environment rather than editing it into
# the notebook: the literal placeholder 'key' that used to be passed here is not
# a usable credential, and pasting a real key in its place would leak it with
# the saved notebook.
gmaps_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not gmaps_api_key:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this cell."
    )

# Initialize the Google Maps client used by the geocoding helpers below
gmaps = googlemaps.Client(key=gmaps_api_key)

# Define a function to geocode an address using Google Maps API with retry logic
def geocode_with_retry(address, retries=3, delay=1):
    """Geocode ``address`` with the module-level Google Maps client ``gmaps``.

    Args:
        address: Free-text address to look up. ``None`` or a blank string is
            treated as "no address" and is never sent to the API.
        retries: Maximum number of geocoding attempts.
        delay: Seconds to wait after the first rate-limit error; the wait is
            doubled after every further rate-limit error.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the first result, or
        ``(None, None)`` when there is no address, the API returns no match,
        or every attempt raised an error.
    """
    # Null/blank addresses (e.g. null cells in the source column) cannot be
    # geocoded; skip the billed API call entirely.
    if address is None or not str(address).strip():
        return None, None

    for attempt in range(retries):
        try:
            response = gmaps.geocode(address)
            if not response:
                # An empty result list is the API's "no match" answer, so
                # asking again would only spend quota.
                return None, None
            location = response[0]['geometry']['location']
            return location['lat'], location['lng']  # Return as (latitude, longitude)
        except googlemaps.exceptions.ApiError as e:
            print(f"Error geocoding {address}: {e}")
            # Back off only when another attempt will follow; sleeping after
            # the final attempt would just delay the (None, None) result.
            if "OVER_QUERY_LIMIT" in str(e) and attempt < retries - 1:  # Rate limit exceeded
                print(f"Rate limit exceeded, waiting for {delay} seconds before retrying...")
                time.sleep(delay)
                delay *= 2  # Exponential backoff
        except Exception as e:
            # Best effort: log and try again (covers transport errors and
            # unexpectedly shaped responses).
            print(f"Error geocoding {address}: {e}")
    return None, None

# Spark UDFs that expose each geocoded coordinate as its own DoubleType column.
# NOTE: each UDF performs its own geocode_with_retry lookup, so applying both to
# the same address column geocodes every address twice.
@udf(returnType=DoubleType())
def get_latitude(address):
    """Return the latitude geocoded for ``address`` (``None`` if the lookup found nothing)."""
    coords = geocode_with_retry(address)
    return coords[0]

@udf(returnType=DoubleType())
def get_longitude(address):
    """Return the longitude geocoded for ``address`` (``None`` if the lookup found nothing)."""
    coords = geocode_with_retry(address)
    return coords[1]

# Batching helpers (imported here because only this section needs them)
from pyspark.sql import Window
from pyspark.sql.functions import col, monotonically_increasing_id, row_number

batch_size = 1000

# Give every row a fixed zero-based position and slice batches by that position.
# The previous `limit(end).subtract(limit(start))` slicing was unsafe: `limit`
# without an ordering does not guarantee which rows come back on each
# evaluation, and `subtract` is EXCEPT DISTINCT, so batches could overlap, miss
# rows, or silently drop duplicate rows.
# The un-partitioned window pulls all rows into one partition, which is fine at
# this data size. persist() pins the index assignment across batches.
df_indexed = df_spark.withColumn(
    '_row_idx',
    row_number().over(Window.orderBy(monotonically_increasing_id())) - 1,
).persist()

# Get the total number of properties (this also materialises the cached index)
total_properties = df_indexed.count()

# Ceiling division: `// batch_size + 1` produced an extra, empty batch whenever
# the total was an exact multiple of batch_size.
num_batches = (total_properties + batch_size - 1) // batch_size

# Directory to save Parquet files
output_dir = os.path.abspath('../data/raw/batch_output')

# Create the directory if it doesn't exist (no separate exists() check needed)
os.makedirs(output_dir, exist_ok=True)

# Process properties in batches
for batch_number in range(num_batches):
    # Half-open row range [start, end) covered by this batch
    start = batch_number * batch_size
    end = start + batch_size

    # Get the current batch of properties
    df_batch = (
        df_indexed
        .filter((col('_row_idx') >= start) & (col('_row_idx') < end))
        .drop('_row_idx')
    )

    # Apply the UDFs to get latitude and longitude for the current batch.
    # NOTE(review): each UDF geocodes the address on its own, so every row
    # costs two API lookups; a single struct-returning UDF would halve that.
    df_batch = df_batch.withColumn('latitude', get_latitude(df_batch['name']))
    df_batch = df_batch.withColumn('longitude', get_longitude(df_batch['name']))

    # Define the output file path for the current batch
    output_file_path = os.path.join(output_dir, f'properties_batch_{batch_number+1}.parquet')

    # Save the updated DataFrame with latitude and longitude to a new Parquet file
    df_batch.write.mode("overwrite").parquet(output_file_path)

    # Count what was actually written; df_batch.count() would re-run the
    # batch's query plan after the write.
    saved_count = spark.read.parquet(output_file_path).count()
    print(f"Batch {batch_number+1} with {saved_count} properties saved to {output_file_path}")

    # Pause between batches to stay under API rate limits; nothing follows the
    # last batch, so skip the wait there.
    if batch_number < num_batches - 1:
        time.sleep(5)  # Adjust this based on rate limit requirements

# Release the cached, indexed copy now that every batch is on disk
df_indexed.unpersist()

print("Geocoding and saving all properties complete.")

24/09/25 22:09:36 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

Batch 1 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_1.parquet


                                                                                

Batch 2 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_2.parquet


                                                                                

Batch 3 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_3.parquet


                                                                                

Batch 4 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_4.parquet


                                                                                

Batch 5 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_5.parquet


                                                                                

Batch 6 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_6.parquet


                                                                                

Batch 7 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_7.parquet


                                                                                

Batch 8 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_8.parquet


                                                                                

Batch 9 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_9.parquet


                                                                                

Batch 10 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_10.parquet


                                                                                

Batch 11 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_11.parquet


                                                                                

Batch 12 with 1000 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_12.parquet


                                                                                

Batch 13 with 245 properties saved to /home/fafda/project-2-group-real-estate-industry-project-22/data/raw/batch_output/properties_batch_13.parquet
Geocoding and saving all properties complete.


In [9]:
# Shut down the active Spark session; the merge cell that follows reads the
# batch files with pandas and does not use Spark.
spark.stop()  # Stops the existing Spark session

In [23]:
import os
import pandas as pd

# Folder holding one sub-folder per geocoded batch; each sub-folder is a Spark
# Parquet output directory containing one or more part files.
base_dir = '../data/raw/batch_output'

# Collect one pandas DataFrame per Parquet part file
df_list = []

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order of the merged file differ between runs. (The order is lexicographic,
# so batch_10 sorts before batch_2 - stable, though not numeric.)
for folder in sorted(os.listdir(base_dir)):
    folder_path = os.path.join(base_dir, folder)
    if os.path.isdir(folder_path):
        # Only the *.parquet part files hold data; skip any other files
        for file in sorted(os.listdir(folder_path)):
            if file.endswith('.parquet'):
                file_path = os.path.join(folder_path, file)
                df_list.append(pd.read_parquet(file_path))

# Fail with an actionable message instead of pd.concat's generic
# "No objects to concatenate" when the batch output is missing.
if not df_list:
    raise FileNotFoundError(f"No .parquet part files found under {base_dir!r}")

# Concatenate all DataFrames into one
merged_df = pd.concat(df_list, ignore_index=True)

# Save next to the other raw data: the Spark cells in this notebook read the
# merged file from '../data/raw/property_coords.parquet', not from the current
# working directory where it was previously written.
merged_df.to_parquet('../data/raw/property_coords.parquet')


In [2]:
# Install the dependencies once, from a terminal (not needed on every run):
# pip3 install googlemaps
# pip3 install osmnx geopy





In [2]:
import os
import googlemaps
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
import time


# Reuse the running Spark session if there is one, otherwise start a new one
spark = SparkSession.builder.appName("Test Distance to Melbourne CBD").getOrCreate()

# Property listings with the latitude/longitude columns used by the distance lookup
input_file_path = '../data/raw/property_coords.parquet'
df_properties = spark.read.parquet(input_file_path)

# Read the Google Maps API key from the environment instead of embedding it in
# the notebook, so the credential is never saved or committed with the file.
# SECURITY: a real key used to be hard-coded in this cell; treat that key as
# leaked and rotate it in the Google Cloud console.
gmaps_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not gmaps_api_key:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this cell."
    )

gmaps = googlemaps.Client(key=gmaps_api_key)


def calculate_distance_to_cbd(property_lat, property_lng, destination="-37.817209,144.964478"):
    """Return the driving distance in kilometres from a property to ``destination``.

    Uses the module-level Google Maps client ``gmaps`` (Distance Matrix request
    with ``mode="driving"``).

    Args:
        property_lat: Latitude of the property, or ``None`` if it is unknown.
        property_lng: Longitude of the property, or ``None`` if it is unknown.
        destination: ``"lat,lng"`` string of the target point. Defaults to the
            Melbourne CBD coordinates (Flinders Street Station).

    Returns:
        The distance in kilometres as a float, or ``None`` when a coordinate is
        missing, the API reports a non-``OK`` status for the route, or the
        request fails for any reason (the error is printed, not raised).
    """
    # Rows whose geocoding failed carry null coordinates; without this guard
    # they would be sent to the API as the origin string "None,None".
    if property_lat is None or property_lng is None:
        return None

    try:
        origins = f"{property_lat},{property_lng}"
        response = gmaps.distance_matrix(origins, destination, mode="driving")
        # One origin and one destination -> a single element in the matrix
        element = response['rows'][0]['elements'][0]
        if element['status'] == 'OK':
            return element['distance']['value'] / 1000  # Distance in kilometers
        return None
    except Exception as e:
        # Deliberate best effort: one failed lookup must not abort the Spark job
        print(f"Error calculating distance to CBD: {e}")
        return None

# Spark UDF wrapper so the distance lookup can be applied to DataFrame columns
@udf(returnType=DoubleType())
def get_distance_to_cbd(property_lat, property_lng):
    """Return the distance to the Melbourne CBD for one row, as computed by ``calculate_distance_to_cbd``."""
    distance_km = calculate_distance_to_cbd(property_lat, property_lng)
    return distance_km

# Test run on a subset of the properties, with the distance to Melbourne CBD
# added as a new column.
# NOTE(review): the variable name (and the commented-out output path below) say
# 100, but the limit actually applied is 1000 rows - confirm which is intended.
df_properties_100 = (
    df_properties
    .limit(1000)
    .withColumn(
        'distance_to_cbd_km',
        get_distance_to_cbd(col('latitude'), col('longitude')),
    )
)

# Display the first rows, untruncated, to verify the results
df_properties_100.show(truncate=False)

# Output path for the test results (currently disabled)
#output_file_path = '../data/raw/test_100_properties_with_cbd_distances.parquet'

# TODO: saving the DataFrame with the CBD distance to Parquet is not implemented yet


[Stage 3:>                                                          (0 + 1) / 1]

+-----------------------------------------------------------------------------------+--------+------------------+-------------------------------------------------+-------------+----+-----+-------+-----------------------------+------------------+-----------+------------------+
|url                                                                                |postcode|suburb            |name                                             |cost_text    |beds|baths|parking|property_type                |latitude          |longitude  |distance_to_cbd_km|
+-----------------------------------------------------------------------------------+--------+------------------+-------------------------------------------------+-------------+----+-----+-------+-----------------------------+------------------+-----------+------------------+
|https://www.domain.com.au/201-1-5-donald-street-prahran-vic-3181-8638809           |3181    |prahran           |201/1-5 Donald Street, Prahran VIC 3181          |$340 p

                                                                                