#### Written by Franklin (Koquiun) Li Lin

## Parks/Reservations & Domain Merge

In this notebook, we merge two datasets, the parkres dataset and the domain dataset, and calculate the Euclidean distance between each park/reservation and each property.

#### Import Libraries

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
import os
import sys
from pyspark.sql import functions as F  #filtering
import pandas as pd
from pyspark.sql.window import Window
from pyspark.sql.types import DoubleType, StringType
from shapely import wkt
from pyspark.sql.functions import udf
from math import radians, sin, cos, sqrt, atan2

### Inspect data

In [4]:
# Create (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName('Parkres Further Analysis')
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Load the curated parks/reservations CSV (header row, inferred column types)
# and discard the leftover index column `_c0`; drop() ignores a missing column.
parkres = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('../data/curated/parkres/parkres.csv')
    .drop('_c0')
)

# Preview the first rows to confirm the load
parkres.show(5)

+--------------------+--------------------+--------+--------------------+
|                name|            sa2_name|postcode|            geometry|
+--------------------+--------------------+--------+--------------------+
|Lilydale-Warburto...|        Yarra Valley|    3139|POLYGON ((1034153...|
|Nangana Bushland ...|        Yarra Valley|    3139|POLYGON ((1022203...|
|Nillumbik G139 Bu...|Wattle Glen - Dia...|    3089|POLYGON ((989912....|
|Lilydale-Warburto...|Lilydale - Coldst...|    3140|POLYGON ((1005216...|
|Plenty Gorge Park...|  Plenty - Yarrambat|    3088|POLYGON ((983018....|
+--------------------+--------------------+--------+--------------------+
only showing top 5 rows



In [5]:
# Load the curated Domain listings. The file is a CSV (not parquet): it is read
# with a header row and inferred column types, and the leftover index column
# `_c0` is discarded.
domain = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('../data/curated/domain_data.csv')
    .drop('_c0')
)
domain.limit(5)

24/10/08 20:31:35 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


url,price,address,property_type,latitude,longitude,Beds,Baths,Parking,bond,extracted_price,geometry,index_right,sa2_code,sa2_name,chg_flag,chg_lbl,sa3_code,sa3_name,sa4_code,sa4_name,gcc_code,gcc_name,ste_code,ste_name,aus_code,aus_name,areasqkm,loci_uri,geometry_proj
https://www.domai...,"$1,400.00","10 Allara Court, ...",Townhouse,-37.77427300000001,145.1811258,4.0,3.0,2.0,9125.0,1400.0,POINT (145.181125...,310.0,211021261,Donvale - Park Or...,0,No change,21102,Manningham - East,211,Melbourne - Outer...,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.dat...,POINT (989259.417...
https://www.domai...,$750 per week,"7 Pine Ridge, Don...",House,-37.7912513,145.1756489,4.0,2.0,0.0,3259.0,750.0,POINT (145.175648...,310.0,211021261,Donvale - Park Or...,0,No change,21102,Manningham - East,211,Melbourne - Outer...,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.dat...,POINT (988607.823...
https://www.domai...,$1300 per week,"20 Mulsanne Way, ...",House,-37.7972323,145.1812636,5.0,2.0,2.0,5649.0,1300.0,POINT (145.181263...,310.0,211021261,Donvale - Park Or...,0,No change,21102,Manningham - East,211,Melbourne - Outer...,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.dat...,POINT (989043.874...
https://www.domai...,$825pw / $3585pcm,3 Monterey Cresce...,House,-37.792402,145.1743233,3.0,1.0,5.0,3585.0,825.0,POINT (145.174323...,310.0,211021261,Donvale - Park Or...,0,No change,21102,Manningham - East,211,Melbourne - Outer...,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.dat...,POINT (988479.463...
https://www.domai...,$680.00,3/49 Leslie Stree...,Townhouse,-37.7810117,145.180705,3.0,2.0,2.0,2955.0,680.0,POINT (145.180705...,310.0,211021261,Donvale - Park Or...,0,No change,21102,Manningham - East,211,Melbourne - Outer...,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.dat...,POINT (989155.458...


In [6]:
# Count the null values in every column of the domain data (one output column
# per input column, holding that column's number of nulls)
null_count_exprs = [
    F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in domain.columns
]
domain.select(null_count_exprs).show()

+---+-----+-------+-------------+--------+---------+----+-----+-------+----+---------------+--------+-----------+--------+--------+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------------+
|url|price|address|property_type|latitude|longitude|Beds|Baths|Parking|bond|extracted_price|geometry|index_right|sa2_code|sa2_name|chg_flag|chg_lbl|sa3_code|sa3_name|sa4_code|sa4_name|gcc_code|gcc_name|ste_code|ste_name|aus_code|aus_name|areasqkm|loci_uri|geometry_proj|
+---+-----+-------+-------------+--------+---------+----+-----+-------+----+---------------+--------+-----------+--------+--------+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------------+
|  0|    0|      0|            0|       0|        0| 136|   67|      9|1199|              0|       0|          0|       0|       0|       0|      0|       0|       0|       0|       0|   

### Calculate the distance between parks/reservations and properties

In [7]:
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in km between two points (Haversine formula).

    Args:
        lat1, lon1: latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: latitude and longitude of the second point, in decimal degrees.

    Returns:
        The distance in kilometres, computed on a sphere of radius 6371 km.
    """
    earth_radius_km = 6371

    # The trigonometric functions work in radians, so convert all four angles first.
    phi1, lam1, phi2, lam2 = (radians(angle) for angle in (lat1, lon1, lat2, lon2))
    half_dlat = (phi2 - phi1) / 2
    half_dlon = (lam2 - lam1) / 2

    haversine = sin(half_dlat) ** 2 + cos(phi1) * cos(phi2) * sin(half_dlon) ** 2
    central_angle = 2 * atan2(sqrt(haversine), sqrt(1 - haversine))
    return earth_radius_km * central_angle

def calculate_euclidean_distance(lat1, lon1, lat2, lon2):
    """Return the straight-line (Euclidean) distance between two coordinate pairs.

    The arguments are treated as plain Cartesian coordinates: no unit conversion
    is applied, so the result is expressed in the same units as the inputs.

    Args:
        lat1, lon1: coordinates of the first point.
        lat2, lon2: coordinates of the second point.

    Returns:
        The square root of the summed squared coordinate differences.
    """
    delta_lat = lat1 - lat2
    delta_lon = lon1 - lon2
    return sqrt(delta_lat ** 2 + delta_lon ** 2)

# UDF to convert WKT geometry to centroid coordinates
def get_centroid(geometry):
    """Return the centroid of a WKT geometry as a "y,x" string.

    Args:
        geometry: a WKT string (e.g. "POINT (...)" or "POLYGON ((...))"), or None.

    Returns:
        The centroid formatted as "<y>,<x>", or None when the input is None or
        cannot be parsed. Returning None instead of raising keeps a single bad
        row from failing the whole Spark job.
    """
    if geometry is None:
        return None
    try:
        centroid = wkt.loads(geometry).centroid
        return f"{centroid.y},{centroid.x}"
    except Exception:
        # Best-effort parsing: any parsing/geometry error yields None. Catching
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None

### Merge datasets

In [8]:
# Register UDFs
distance_udf = udf(calculate_euclidean_distance, DoubleType())
centroid_udf = udf(get_centroid, StringType())

# Prepare park data: get_centroid returns a "y,x" string (or None when the WKT
# cannot be parsed), which is split into two numeric columns.
parkres = parkres.withColumnRenamed("geometry", "park_geometry")
parkres = parkres.withColumn("park_centroid", centroid_udf(F.col("park_geometry")))
parkres = parkres.withColumn("park_centroid_lat", F.split(F.col("park_centroid"), ",")[0].cast(DoubleType()))
parkres = parkres.withColumn("park_centroid_lon", F.split(F.col("park_centroid"), ",")[1].cast(DoubleType()))

# Prepare domain data: the property location is taken from the projected point
# geometry (geometry_proj), not from the latitude/longitude columns.
domain = domain.withColumn("property_location", centroid_udf(F.col("geometry_proj")))
domain = domain.withColumn("prop_lat", F.split(F.col("property_location"), ",")[0].cast(DoubleType()))
domain = domain.withColumn("prop_lon", F.split(F.col("property_location"), ",")[1].cast(DoubleType()))
domain = domain.withColumnRenamed("sa2_name", "property_sa2_name")

# Rows whose geometry could not be parsed have null coordinates. The distance
# UDF subtracts its arguments directly and would raise a TypeError on None,
# failing the whole job, so those rows are excluded from the cross join.
# `parkres` and `domain` themselves are left unfiltered.
parks_with_centroid = parkres.filter(
    F.col("park_centroid_lat").isNotNull() & F.col("park_centroid_lon").isNotNull()
)
properties_with_location = domain.filter(
    F.col("prop_lat").isNotNull() & F.col("prop_lon").isNotNull()
)

# Cross join to calculate the distance between each property and every park
result = properties_with_location.crossJoin(F.broadcast(parks_with_centroid))

# Calculate the Euclidean distance between each property and each park centroid.
# NOTE: both inputs come from projected geometries (sample values are around
# 1e6), so the "*_lat"/"*_lon" columns hold projected y/x and the distance is
# in the units of that projection (presumably metres; TODO confirm the CRS),
# not kilometres.
result = result.withColumn(
    "distance",
    distance_udf(
        F.col("prop_lat"), F.col("prop_lon"),
        F.col("park_centroid_lat"), F.col("park_centroid_lon"),
    ),
)
result.orderBy("distance").show(5)



+--------------------+-------------------+--------------------+--------------------+-----------+-----------+----+-----+-------+------+---------------+--------------------+-----------+---------+--------------------+--------+---------+--------+--------+--------+-----------------+--------+-----------------+--------+--------+--------+---------+--------+--------------------+--------------------+--------------------+------------------+-----------------+--------------------+--------------------+--------+--------------------+--------------------+------------------+-----------------+------------------+
|                 url|              price|             address|       property_type|   latitude|  longitude|Beds|Baths|Parking|  bond|extracted_price|            geometry|index_right| sa2_code|   property_sa2_name|chg_flag|  chg_lbl|sa3_code|sa3_name|sa4_code|         sa4_name|gcc_code|         gcc_name|ste_code|ste_name|aus_code| aus_name|areasqkm|            loci_uri|       geometry_proj|   pro

                                                                                

### Find the nearest park for each property

In [9]:
# Rank the parks of every property (identified by its listing URL) from nearest
# to farthest. The park name and SA2 name act as tie-breakers so that
# row_number() selects the same park on every run when distances are equal.
# Null distances are sorted last, because Spark's default ascending order would
# otherwise rank them first.
window_spec = Window.partitionBy("url").orderBy(
    F.col("distance").asc_nulls_last(),
    F.col("name").asc(),
    F.col("sa2_name").asc(),
)
nearest_park = (
    result.withColumn("row", F.row_number().over(window_spec))
    .filter(F.col("row") == 1)  # keep only the closest park per property
    .select(
        "url",
        F.round(F.col("distance"), 3).alias("nearest_park_distance"),  # Round to 3 decimal places
        F.col("name").alias("nearest_park_name"),
        F.col("sa2_name").alias("park_sa2_name"),
    )
)

# Join back to the original domain data (inner join on the listing URL)
final_result = domain.join(nearest_park, on="url")

# Show the result for validation
final_result.select(
    "address",
    "url",
    "sa2_code",
    "property_sa2_name",
    "extracted_price",
    "nearest_park_distance",
    "nearest_park_name",
    "park_sa2_name"
) \
    .orderBy("nearest_park_distance") \
    .show(10, truncate=False)



+---------------------------------------------+-------------------------------------------------------------------------------+---------+----------------------+---------------+---------------------+-------------------------------------------+----------------------+
|address                                      |url                                                                            |sa2_code |property_sa2_name     |extracted_price|nearest_park_distance|nearest_park_name                          |park_sa2_name         |
+---------------------------------------------+-------------------------------------------------------------------------------+---------+----------------------+---------------+---------------------+-------------------------------------------+----------------------+
|208/20 Shamrock Street, Abbotsford VIC 3067  |https://www.domain.com.au/208-20-shamrock-street-abbotsford-vic-3067-17132635  |206071139|Abbotsford            |475.0          |48.085               |Lowe

                                                                                

### Save

In [10]:
# Make sure the output directory exists. exist_ok=True makes this a single call
# that is safe to re-run and avoids the check-then-create race of
# os.path.exists() followed by os.makedirs().
directory_path = "../data/curated/parkres"
os.makedirs(directory_path, exist_ok=True)

# Persist the property-level result, renaming the "park" columns to "parkres"
# for the output. Any previous output at this path is overwritten.
final_result.select(
    "address",
    "sa2_code",
    "url",
    "property_sa2_name",
    "extracted_price",
    F.col("nearest_park_distance").alias("nearest_parkres_distance"),
    F.col("nearest_park_name").alias("nearest_parkres_name"),
    F.col("park_sa2_name").alias("parkres_sa2_name")
).write.parquet(f"{directory_path}/parkres_domain.parquet", mode="overwrite")

                                                                                

Please turn to `parkres_domain_analysis` for further analysis on the impact of parks/reservations on rental price.