# Calculating Distances from Properties to CBD, Train Stations & Nearest Hospitals - Anushka


#### Starting the Spark Session

In [1]:
from pyspark.sql import SparkSession

# Start (or attach to) the notebook's Spark session, giving both the executor
# and the driver 4 GB of memory.
spark = (
    SparkSession.builder
    .appName("App with More Memory")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/05 13:53:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/05 13:53:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
import geopandas as gpd

# Metro train station points from the PTV shapefile in the landing folder.
train_stations_shp = "../data/landing/Order_G6Z8LX (1)/ll_gda94/esrishape/whole_of_dataset/victoria/PTV/PTV_METRO_TRAIN_STATION.shp"
gdf_train_stations = gpd.read_file(train_stations_shp)

In [3]:
# Preview the first station records to check which columns were loaded.
gdf_train_stations.head()

Unnamed: 0,STOP_ID,LATITUDE,STOP_NAME,LONGITUDE,TICKETZONE,ROUTEUSSP,geometry
0,19970,-37.781193,Royal Park Railway Station (Parkville),144.952301,1,Upfield,POINT (144.9523 -37.78119)
1,19971,-37.78814,Flemington Bridge Railway Station (North Melbo...,144.939323,1,Upfield,POINT (144.93932 -37.78814)
2,19972,-37.794267,Macaulay Railway Station (North Melbourne),144.936166,1,Upfield,POINT (144.93617 -37.79427)
3,19973,-37.807419,North Melbourne Railway Station (West Melbourne),144.94257,1,"Flemington,Sunbury,Upfield,Werribee,Williamsto...",POINT (144.94257 -37.80742)
4,19974,-37.788657,Clifton Hill Railway Station (Clifton Hill),144.995417,1,"Mernda,Hurstbridge",POINT (144.99542 -37.78866)


#### Extracting suburb names from train data

In [4]:
import re
# Function to extract suburb from STOP_NAME
def extract_suburb(stop_name):
    """Return the suburb written in parentheses inside a station name.

    Example: "Royal Park Railway Station (Parkville)" -> "Parkville".

    Args:
        stop_name: The station's STOP_NAME value.

    Returns:
        The text inside the first pair of parentheses, or None when the
        value is not a string (e.g. None/NaN) or contains no parentheses.
    """
    # Missing values arrive as None or float NaN; re.search would raise on them.
    if not isinstance(stop_name, str):
        return None
    # Non-greedy so only the first "(...)" group is captured.
    match = re.search(r'\((.*?)\)', stop_name)
    return match.group(1) if match else None

In [5]:

# Add a 'suburb' column by running extract_suburb over each station's STOP_NAME.
gdf_train_stations['suburb'] = gdf_train_stations['STOP_NAME'].apply(extract_suburb)

#### Extracting suburb names from properties data

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re

# Request a Spark session for the address processing step.
# NOTE: a session already exists at this point, so getOrCreate() returns it;
# the log output of this cell warns that only runtime SQL configurations
# take effect.
spark = (
    SparkSession.builder
    .appName("Address Processing")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.memory.fraction", "0.9")
    .getOrCreate()
)


# Define a user-defined function (UDF) to extract the suburb
def extract_suburb(address):
    """Return the suburb from a listing address such as
    "10 Allara Court, Donvale VIC 3111" -> "Donvale".

    NOTE: this redefines (shadows) the station-name parser of the same name
    declared earlier in the notebook.

    Args:
        address: Full address string; Spark passes None for null cells.

    Returns:
        The text between the last comma before " VIC" and " VIC" itself, or
        None when the address is missing or does not match that pattern.
    """
    # Null addresses reach the UDF as None; re.search would raise TypeError.
    if not isinstance(address, str):
        return None
    # Using regex to find the pattern between ', ' and ' VIC'
    match = re.search(r',\s*([^,]+)\s+VIC', address)
    return match.group(1) if match else None

# Wrap the address parser as a Spark UDF that yields a string column.
extract_suburb_udf = udf(extract_suburb, StringType())

# Read the curated Domain listings, taking column names from the header row
# and letting Spark infer the column types.
properties_df = spark.read.csv(
    "../data/curated/domain_data.csv",
    header=True,
    inferSchema=True,
)

# '_c0' is the unnamed index column carried over from the CSV export.
properties_df = properties_df.drop('_c0')

# Preview the raw listings.
properties_df.show()

# Derive the suburb of every listing from its address.
suburb_column = extract_suburb_udf(properties_df['address'])
properties_df = properties_df.withColumn('suburb', suburb_column)

# Preview again to verify the suburb extraction.
properties_df.show()


24/10/05 13:53:28 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
24/10/05 13:53:32 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------------+-----------------+--------------------+--------------------+------------------+-----------+----+-----+-------+------+---------------+--------------------+-----------+---------+--------------------+--------+---------+--------+-----------------+--------+--------------------+--------+-----------------+--------+--------+--------+---------+--------+--------------------+--------------------+
|                 url|            price|             address|       property_type|          latitude|  longitude|Beds|Baths|Parking|  bond|extracted_price|            geometry|index_right| sa2_code|            sa2_name|chg_flag|  chg_lbl|sa3_code|         sa3_name|sa4_code|            sa4_name|gcc_code|         gcc_name|ste_code|ste_name|aus_code| aus_name|areasqkm|            loci_uri|       geometry_proj|
+--------------------+-----------------+--------------------+--------------------+------------------+-----------+----+-----+-------+------+---------------+--------------------+--

#### Assigning a Property_ID to each property

In [7]:
from pyspark.sql import functions as F

# Tag every listing with a `property_id` that later cells partition and join on.
# Spark documents these IDs as unique and increasing but not consecutive.
properties_df = properties_df.withColumn("property_id", F.monotonically_increasing_id())


In [8]:
# Preview the listings to confirm the new suburb and property_id columns.
properties_df.show()

+--------------------+-----------------+--------------------+--------------------+------------------+-----------+----+-----+-------+------+---------------+--------------------+-----------+---------+--------------------+--------+---------+--------+-----------------+--------+--------------------+--------+-----------------+--------+--------+--------+---------+--------+--------------------+--------------------+-------------+-----------+
|                 url|            price|             address|       property_type|          latitude|  longitude|Beds|Baths|Parking|  bond|extracted_price|            geometry|index_right| sa2_code|            sa2_name|chg_flag|  chg_lbl|sa3_code|         sa3_name|sa4_code|            sa4_name|gcc_code|         gcc_name|ste_code|ste_name|aus_code| aus_name|areasqkm|            loci_uri|       geometry_proj|       suburb|property_id|
+--------------------+-----------------+--------------------+--------------------+------------------+-----------+----+-----+--

In [9]:
# Drop the geometry column for the distance calculation, as the LATITUDE and
# LONGITUDE columns already hold the coordinates. errors='ignore' keeps the
# cell safe to re-run once the column is already gone.
gdf_train_stations = gdf_train_stations.drop(columns=['geometry'], errors='ignore')

#### UDF to calculate the haversine distance between 2 points

In [10]:
from pyspark.sql.types import DoubleType
# Define the Haversine function
def haversine(lon1, lat1, lon2, lat2):
    """Great-circle distance in kilometres between two points.

    Args:
        lon1, lat1: Longitude and latitude of the first point, in degrees.
        lon2, lat2: Longitude and latitude of the second point, in degrees.

    Returns:
        The distance in kilometres (Earth radius taken as 6371 km), or None
        when any coordinate is None (Spark passes None for null cells).
    """
    # Imported locally so the function is self-contained when shipped to
    # Spark executors as a UDF.
    import math

    if lon1 is None or lat1 is None or lon2 is None or lat2 is None:
        return None

    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # Rounding can push `a` a hair above 1 for near-antipodal points, which
    # would make asin() raise. A conditional is used instead of min() because
    # the notebook later imports pyspark's `min` over the builtin.
    if a > 1.0:
        a = 1.0
    c = 2 * math.asin(math.sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r

# Expose haversine as a Spark UDF that returns a double column.
haversine_udf = udf(haversine, DoubleType())

#### Calculating the nearest train station per property based on haversine distance

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, min, row_number
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window
import geopandas as gpd

# Spark cannot serialise shapely geometries, so make sure the 'geometry'
# column is gone before converting the station table.
if 'geometry' in gdf_train_stations.columns:
    gdf_train_stations = gdf_train_stations.drop(columns=['geometry'])

# Convert the station table to Spark and rename its coordinate columns so they
# do not clash with the listings' own longitude/latitude. The station columns
# are spelled in upper case ('LONGITUDE'/'LATITUDE'); using the exact spelling
# means the rename does not depend on Spark's case-insensitive resolution.
train_stations_df = (
    spark.createDataFrame(gdf_train_stations)
    .withColumnRenamed('LONGITUDE', 'station_longitude')
    .withColumnRenamed('LATITUDE', 'station_latitude')
)

# Pair every property with every station and compute the haversine distance
# (km) for each pair.
distances_df = properties_df.crossJoin(train_stations_df).withColumn(
    "distance",
    haversine_udf(
        col("longitude"), col("latitude"),
        col("station_longitude"), col("station_latitude"),
    ),
)

# Rank the stations of each property from nearest to farthest.
windowSpec = Window.partitionBy("property_id").orderBy("distance")

# Keep only the closest station per property (rank 1).
nearest_stations_df = (
    distances_df
    .withColumn("rank", row_number().over(windowSpec))
    .filter(col("rank") == 1)
    .select("property_id", "STOP_ID", "distance")
)

# Attach the nearest station's ID and distance to the full listing rows.
result_df = properties_df.join(nearest_stations_df, "property_id").select(
    properties_df["*"],
    nearest_stations_df["STOP_ID"].alias("nearest_station_id"),
    nearest_stations_df["distance"].alias("nearest_station_distance"),
)

# Show the result
result_df.show()


                                                                                

+--------------------+--------------------+--------------------+--------------------+------------------+-----------+----+-----+-------+------+---------------+--------------------+-----------+---------+--------------------+--------+---------+--------+------------------+--------+--------------------+--------+-----------------+--------+--------+--------+---------+--------+--------------------+--------------------+--------------+-----------+------------------+------------------------+
|                 url|               price|             address|       property_type|          latitude|  longitude|Beds|Baths|Parking|  bond|extracted_price|            geometry|index_right| sa2_code|            sa2_name|chg_flag|  chg_lbl|sa3_code|          sa3_name|sa4_code|            sa4_name|gcc_code|         gcc_name|ste_code|ste_name|aus_code| aus_name|areasqkm|            loci_uri|       geometry_proj|        suburb|property_id|nearest_station_id|nearest_station_distance|
+--------------------+------

In [12]:
# Persist the listings with their nearest station (haversine), replacing any previous output.
result_df.write.parquet("../data/curated/with_train_distance_haversine", mode="overwrite")

                                                                                

In [13]:
import pandas as pd
# Reload the Spark-written parquet output as a pandas DataFrame for the routing step.
properties_updated = pd.read_parquet("../data/curated/with_train_distance_haversine")

In [14]:
import os

import openrouteservice

# The openrouteservice API key is read from the ORS_API_KEY environment
# variable so the secret never lives in the notebook or its git history.
# The key that was previously committed in this cell should be treated as
# compromised and revoked.
ors_api_key = os.environ.get("ORS_API_KEY")
if not ors_api_key:
    raise RuntimeError(
        "Set the ORS_API_KEY environment variable to your openrouteservice API key."
    )
client = openrouteservice.Client(key=ors_api_key)

#### Function to calculate the car route distance from each property to its nearest train station

In [15]:
import time
import openrouteservice
import numpy as np
import pandas as pd
import os
import builtins

def calculate_route_distance(house, school, id_col_name, key, output_file='calculated_distances_1.csv', resume_file='last_index.txt', api_limit=1940, batch_size=40):
    """Compute the driving distance (km) from each row of `house` to its
    assigned destination in `school`, using the module-level ORS `client`.

    The work is done in batches with a 60 second pause between them. Progress
    is checkpointed after every batch: distances go to `output_file`, and the
    index of the next row to process goes to `resume_file`, so an interrupted
    or quota-limited run can be resumed by calling the function again.

    Args:
        house: pandas DataFrame (or an object with `toPandas()`) holding
            'longitude', 'latitude' and the `id_col_name` column.
        school: pandas DataFrame of destinations holding the `key` column plus
            'LONGITUDE' and 'LATITUDE'.
        id_col_name: Column of `house` holding each row's destination ID.
        key: Column of `school` that the destination ID is matched against.
        output_file: CSV with one 'distance' column, row-aligned with `house`.
        resume_file: Text file holding the next index, or "completed".
        api_limit: Number of requests issued before the extra "reset" wait.
        batch_size: Number of requests issued between 60 second pauses.

    Returns:
        pandas DataFrame with a single 'distance' column (km), row-aligned
        with `house`. Rows that were not processed, or whose request failed
        for a non-quota reason, hold None/NaN; failed rows are not retried.

    Raises:
        ValueError: If `api_limit` or `batch_size` is not positive (the batch
            loop could otherwise never make progress).
    """
    if batch_size <= 0 or api_limit <= 0:
        raise ValueError("api_limit and batch_size must be positive")

    if not isinstance(house, pd.DataFrame):
        house = house.toPandas()

    total_houses = len(house)
    distances = [None] * total_houses

    # Load any previously calculated distances if available
    restored_rows = 0
    if os.path.exists(output_file):
        existing_data = pd.read_csv(output_file)
        restored_rows = builtins.min(len(existing_data), total_houses)
        for idx in range(restored_rows):
            distances[idx] = existing_data.at[idx, 'distance']

    # Work out where to resume. The marker is either the next index to process
    # or the literal "completed" written by a finished run.
    start_index = 0
    if os.path.exists(resume_file):
        with open(resume_file, 'r') as f:
            marker = f.read().strip()
        if marker == "completed":
            # Nothing left to do for rows that already have saved distances.
            start_index = restored_rows
            print(f"Previous run completed; resuming from index {start_index}")
        else:
            try:
                start_index = int(marker)
            except ValueError:
                print(f"Ignoring unreadable resume marker {marker!r}; starting from index 0")
            else:
                print(f"Resuming from index {start_index}")
    start_index = builtins.max(0, builtins.min(start_index, total_houses))

    def _save_distances():
        """Write the distances collected so far to `output_file`."""
        pd.DataFrame({'distance': distances}).to_csv(output_file, index=False)

    def _save_resume_marker(marker):
        """Record where the next run should pick up."""
        with open(resume_file, 'w') as f:
            f.write(str(marker))

    processed_houses = start_index
    remaining_requests = api_limit

    while processed_houses < total_houses:
        batch_houses = builtins.min(batch_size, total_houses - processed_houses, remaining_requests)
        for i in range(batch_houses):
            house_idx = processed_houses + i
            row = house.iloc[house_idx]

            try:
                house_coords = (row['longitude'], row['latitude'])
                closest_school_id = row[id_col_name]
                # Inside the try so that one unknown destination ID is logged
                # and skipped instead of aborting the whole run.
                school_coords = school[school[key] == closest_school_id][['LONGITUDE', 'LATITUDE']].values[0]
                route = client.directions(
                    coordinates=[house_coords, tuple(school_coords)],
                    profile='driving-car',
                    format='geojson'
                )
                distance_meters = route['features'][0]['properties']['segments'][0]['distance']
                distances[house_idx] = distance_meters / 1000
            except Exception as e:
                if "quota" in str(e).lower():  # Check if the error message is related to API quota
                    print(f"API quota exceeded at index {house_idx}. Saving progress and pausing.")
                    _save_resume_marker(house_idx)
                    _save_distances()
                    return pd.DataFrame({'distance': distances})
                print(f"Error calculating distance for house {house_idx}: {e}")

        processed_houses += batch_houses
        remaining_requests -= batch_houses

        # Save progress after each batch
        _save_distances()

        if processed_houses >= total_houses:
            _save_resume_marker("completed")
            print("All distances calculated and saved.")
            break

        _save_resume_marker(processed_houses)
        print(f"Progress saved at index {processed_houses}")

        # Sleep for 60 seconds after each batch to avoid hitting API rate limits
        print("Sleeping for 60 seconds to manage API call frequency...")
        time.sleep(60)

        if remaining_requests <= 0:
            print("Waiting to reset API limit...")
            time.sleep(60)  # Sleep time could be adjusted based on your API's reset time
            remaining_requests = api_limit

    return pd.DataFrame({'distance': distances})


In [16]:
## Intentionally commented out: the routing run is slow and rate-limited, and its results are already saved to CSV (loaded in the next cell), so it should not be re-run by accident.
# calculated_distances = calculate_route_distance(result_df, gdf_train_stations, 'nearest_station_id', "STOP_ID")

In [17]:
# Load the saved driving distances (one 'distance' value per row, in km).
# NOTE(review): calculate_route_distance writes to 'calculated_distances_1.csv' by
# default, whereas this reads 'calculated_distances.csv' — presumably an earlier run; confirm.
train_distance_df = pd.read_csv('calculated_distances.csv')

In [18]:
# Preview the first ten saved distances.
train_distance_df.head(10)

Unnamed: 0,distance
0,5.6596
1,4.6923
2,1.4377
3,2.2956
4,1.7662
5,1.6396
6,0.8211
7,0.4065
8,1.308
9,0.8074


In [19]:
import pandas as pd

# properties_updated is normally already a pandas DataFrame; convert it if a
# Spark DataFrame was passed along instead.
if not isinstance(properties_updated, pd.DataFrame):
    properties_updated = properties_updated.toPandas()

# The distances are matched to the listings purely by row position, so both
# frames get a fresh 0..n-1 index.
# NOTE(review): this assumes the CSV rows are in the same order as the rows of
# properties_updated — confirm against how the CSV was produced.
train_distance_df = train_distance_df.reset_index(drop=True)
properties_updated = properties_updated.reset_index(drop=True)

# A length mismatch would otherwise be silently padded with NaN or truncated.
if len(train_distance_df) != len(properties_updated):
    raise ValueError(
        f"Row count mismatch: {len(train_distance_df)} distances for "
        f"{len(properties_updated)} properties"
    )

# Add the distance column to properties_updated with the new name
properties_updated['train_station_distance_km'] = train_distance_df['distance']


In [20]:
# Preview the listings with both the haversine and the driving distance to the station.
properties_updated.head(5)

Unnamed: 0,url,price,address,property_type,latitude,longitude,Beds,Baths,Parking,bond,...,aus_code,aus_name,areasqkm,loci_uri,geometry_proj,suburb,property_id,nearest_station_id,nearest_station_distance,train_station_distance_km
0,https://www.domain.com.au/10-allara-court-donv...,"$1,400.00","10 Allara Court, Donvale VIC 3111",Townhouse,-37.774273,145.181126,4.0,3.0,2.0,9125.0,...,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (989259.4173701588 -4291898.086618988),Donvale,0,19900,4.980381,5.6596
1,https://www.domain.com.au/10-51-55-leslie-stre...,$500 Per Week,"10/51-55 Leslie Street, Donvale VIC 3111",Apartment / Unit / Flat,-37.781431,145.181474,2.0,1.0,1.0,2173.0,...,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (989219.1533940694 -4292696.593780912),Donvale,7,19900,4.197604,4.6923
2,https://www.domain.com.au/293-richardson-stree...,$630.00,"293 Richardson Street, Carlton North VIC 3054",House,-37.784081,144.965615,2.0,1.0,0.0,2738.0,...,AUS,Australia,2.3042,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (970145.4328510358 -4291308.466741183),Carlton North,19,19969,1.179007,1.4377
3,https://www.domain.com.au/786-drummond-street-...,$720.00,"786 Drummond Street, Carlton North VIC 3054",House,-37.786146,144.971258,3.0,1.0,0.0,3129.0,...,AUS,Australia,2.3042,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (970623.2592136518 -4291581.655997339),Carlton North,22,19969,1.659614,2.2956
4,https://www.domain.com.au/8-1068-lygon-street-...,$750 pw,"8/1068 Lygon Street, Carlton North VIC 3054",Apartment / Unit / Flat,-37.780942,144.970534,3.0,2.0,2.0,3260.0,...,AUS,Australia,2.3042,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (970610.0843960904 -4290997.402562675),Carlton North,26,19969,1.231711,1.7662


In [21]:
# The straight-line (haversine) station distance is superseded by the driving
# distance; remove it when present and leave the frame unchanged otherwise.
properties_updated = properties_updated.drop(columns=['nearest_station_distance'], errors='ignore')


In [22]:
# Save the listings with driving distances to the curated folder as parquet
# (pyarrow engine), without writing the pandas index.
properties_updated.to_parquet('../data/curated/train_car_route_final', engine='pyarrow', index=False)



In [23]:
import pandas as pd
# Read the saved parquet back and preview it to confirm the write succeeded.
properties_train = pd.read_parquet('../data/curated/train_car_route_final', engine='pyarrow')
properties_train.head()

Unnamed: 0,url,price,address,property_type,latitude,longitude,Beds,Baths,Parking,bond,...,ste_name,aus_code,aus_name,areasqkm,loci_uri,geometry_proj,suburb,property_id,nearest_station_id,train_station_distance_km
0,https://www.domain.com.au/10-allara-court-donv...,"$1,400.00","10 Allara Court, Donvale VIC 3111",Townhouse,-37.774273,145.181126,4.0,3.0,2.0,9125.0,...,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (989259.4173701588 -4291898.086618988),Donvale,0,19900,5.6596
1,https://www.domain.com.au/10-51-55-leslie-stre...,$500 Per Week,"10/51-55 Leslie Street, Donvale VIC 3111",Apartment / Unit / Flat,-37.781431,145.181474,2.0,1.0,1.0,2173.0,...,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (989219.1533940694 -4292696.593780912),Donvale,7,19900,4.6923
2,https://www.domain.com.au/293-richardson-stree...,$630.00,"293 Richardson Street, Carlton North VIC 3054",House,-37.784081,144.965615,2.0,1.0,0.0,2738.0,...,Victoria,AUS,Australia,2.3042,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (970145.4328510358 -4291308.466741183),Carlton North,19,19969,1.4377
3,https://www.domain.com.au/786-drummond-street-...,$720.00,"786 Drummond Street, Carlton North VIC 3054",House,-37.786146,144.971258,3.0,1.0,0.0,3129.0,...,Victoria,AUS,Australia,2.3042,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (970623.2592136518 -4291581.655997339),Carlton North,22,19969,2.2956
4,https://www.domain.com.au/8-1068-lygon-street-...,$750 pw,"8/1068 Lygon Street, Carlton North VIC 3054",Apartment / Unit / Flat,-37.780942,144.970534,3.0,2.0,2.0,3260.0,...,Victoria,AUS,Australia,2.3042,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (970610.0843960904 -4290997.402562675),Carlton North,26,19969,1.7662


In [24]:
# Convert the pandas listings back into a Spark DataFrame and preview it.
properties_train_sdf = spark.createDataFrame(properties_train)
properties_train_sdf.show()

+--------------------+--------------------+--------------------+--------------------+------------------+-----------+----+-----+-------+------+---------------+--------------------+-----------+---------+--------------------+--------+---------+--------+------------------+--------+--------------------+--------+-----------------+--------+--------+--------+---------+--------+--------------------+--------------------+--------------+-----------+------------------+-------------------------+
|                 url|               price|             address|       property_type|          latitude|  longitude|Beds|Baths|Parking|  bond|extracted_price|            geometry|index_right| sa2_code|            sa2_name|chg_flag|  chg_lbl|sa3_code|          sa3_name|sa4_code|            sa4_name|gcc_code|         gcc_name|ste_code|ste_name|aus_code| aus_name|areasqkm|            loci_uri|       geometry_proj|        suburb|property_id|nearest_station_id|train_station_distance_km|
+--------------------+----

#### Calculating the car route distance to the CBD for each property

In [25]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import monotonically_increasing_id
import pandas as pd

# Re-attach to the running Spark session (the log output of this cell shows
# that the existing session is reused).
spark = SparkSession.builder.appName("Property Distance Calculation").getOrCreate()

# Melbourne CBD as [longitude, latitude]. Uses the ORS `client` created
# earlier in the notebook.
melbourne_cbd_coords = [144.9631, -37.8136]

# Bring only the coordinates to the driver and pair them up as (lon, lat).
pandas_df = properties_train_sdf.select("longitude", "latitude").toPandas()
property_coords = list(zip(pandas_df['longitude'], pandas_df['latitude']))

# Upper bound on the number of destinations sent in one matrix request.
max_destinations_per_batch = 3500

# Consecutive slices of at most `max_destinations_per_batch` coordinates.
batches = [
    property_coords[start:start + max_destinations_per_batch]
    for start in range(0, len(property_coords), max_destinations_per_batch)
]

# One matrix request per batch: location 0 is the CBD (the only source) and
# locations 1..len(batch) are the properties (the destinations).
all_distances = []
for batch in batches:
    try:
        response = client.distance_matrix(
            locations=[melbourne_cbd_coords] + batch,
            profile='driving-car',
            metrics=['distance'],
            sources=[0],
            destinations=list(range(1, len(batch) + 1))
        )
        distances = response['distances'][0]
        all_distances.extend(distances)
    except Exception as e:
        # Keep row alignment by filling the failed batch with placeholders.
        print(f"Error making batch request: {e}")
        all_distances.extend([None] * len(batch))

# One row per property: 'ID' is the 0-based position, 'cbd_distance_m' the
# distance returned by the matrix request.
distances_pandas_df = (
    pd.DataFrame(all_distances, columns=['cbd_distance_m'])
    .reset_index()
    .rename(columns={'index': 'ID'})
)

# Add a row_index column on the Spark side, then collect the listings.
properties_train_sdf = properties_train_sdf.withColumn("row_index", monotonically_increasing_id())
properties_df_pandas = properties_train_sdf.toPandas()

# Overwrite row_index with plain 0..n-1 positions on both frames so the join
# below matches rows by position.
properties_df_pandas = properties_df_pandas.assign(row_index=range(len(properties_df_pandas)))
distances_pandas_df = distances_pandas_df.assign(row_index=range(len(distances_pandas_df)))

# Join the DataFrames on 'row_index'
properties_train_cbd = pd.merge(properties_df_pandas, distances_pandas_df, on='row_index')





24/10/05 13:54:05 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

#### Ensuring that we have not lost any data

In [26]:
# Write the listings with CBD distances to CSV, then read the file back.
cbd_csv_path = '../data/curated/pandas_dataset_with_property_distance_to_cbd.csv'
properties_train_cbd.to_csv(cbd_csv_path, index=False, header=True)
df_check = pd.read_csv(cbd_csv_path)

# Print the number of entries in the read-back DataFrame
print("Count of entries in the read-back DataFrame:")
print(df_check.shape[0])  # df.shape[0] gives the number of rows

# Actually verify that the round trip kept every row, rather than relying on
# someone comparing the printed count by eye.
assert df_check.shape[0] == properties_train_cbd.shape[0], (
    f"Row count changed on CSV round trip: wrote {properties_train_cbd.shape[0]}, "
    f"read back {df_check.shape[0]}"
)

# Showing a larger number of rows
print(properties_train_cbd[['address', 'cbd_distance_m']].head(100))


Count of entries in the read-back DataFrame:
9565
                                          address  cbd_distance_m
0               10 Allara Court, Donvale VIC 3111        25934.87
1        10/51-55 Leslie Street, Donvale VIC 3111        24889.39
2   293 Richardson Street, Carlton North VIC 3054         4140.98
3     786 Drummond Street, Carlton North VIC 3054         4030.91
4     8/1068 Lygon Street, Carlton North VIC 3054         4225.06
..                                            ...             ...
95   11/305 Canterbury Road, Forest Hill VIC 3131        26492.85
96          20 Taronga Court, Nunawading VIC 3131        23015.63
97         1/54 Efron Street, Nunawading VIC 3131        23884.42
98            3 Menck Street, Nunawading VIC 3131        24524.55
99         36 Worrell Street, Nunawading VIC 3131        23245.19

[100 rows x 2 columns]


In [27]:
# Remove the helper columns that were only needed to align rows in the merge;
# report any that are already absent from df_check.
for helper_col in ("ID", "row_index"):
    if helper_col not in df_check.columns:
        print(f"Column '{helper_col}' not found in the DataFrame.")
        continue
    df_check = df_check.drop(helper_col, axis=1)

# Hand the cleaned pandas frame back to Spark and preview it.
result_cbd_train = spark.createDataFrame(df_check)
result_cbd_train.show()

                                                                                

+--------------------+--------------------+--------------------+--------------------+------------------+-----------+----+-----+-------+------+---------------+--------------------+-----------+---------+--------------------+--------+---------+--------+------------------+--------+--------------------+--------+-----------------+--------+--------+--------+---------+--------+--------------------+--------------------+--------------+-----------+------------------+-------------------------+--------------+
|                 url|               price|             address|       property_type|          latitude|  longitude|Beds|Baths|Parking|  bond|extracted_price|            geometry|index_right| sa2_code|            sa2_name|chg_flag|  chg_lbl|sa3_code|          sa3_name|sa4_code|            sa4_name|gcc_code|         gcc_name|ste_code|ste_name|aus_code| aus_name|areasqkm|            loci_uri|       geometry_proj|        suburb|property_id|nearest_station_id|train_station_distance_km|cbd_distance

#### Converting CBD distance to KM for consistency

In [28]:
from pyspark.sql.functions import col

# Express the CBD distance in kilometres (metres / 1000) to match the other
# distance columns, and drop the original metres column.
cbd_distance_km = col('cbd_distance_m') / 1000
result_cbd_train = (
    result_cbd_train
    .withColumn('cbd_distance_km', cbd_distance_km)
    .drop('cbd_distance_m')
)

# Preview the frame to verify the conversion.
result_cbd_train.show()


+--------------------+--------------------+--------------------+--------------------+------------------+-----------+----+-----+-------+------+---------------+--------------------+-----------+---------+--------------------+--------+---------+--------+------------------+--------+--------------------+--------+-----------------+--------+--------+--------+---------+--------+--------------------+--------------------+--------------+-----------+------------------+-------------------------+------------------+
|                 url|               price|             address|       property_type|          latitude|  longitude|Beds|Baths|Parking|  bond|extracted_price|            geometry|index_right| sa2_code|            sa2_name|chg_flag|  chg_lbl|sa3_code|          sa3_name|sa4_code|            sa4_name|gcc_code|         gcc_name|ste_code|ste_name|aus_code| aus_name|areasqkm|            loci_uri|       geometry_proj|        suburb|property_id|nearest_station_id|train_station_distance_km|   cbd_d

In [29]:
# Persist the listings with station and CBD distances, replacing any previous output.
result_cbd_train.write.parquet('../data/curated/train_cbd_distance_final',  mode='overwrite')

                                                                                

In [30]:
# Reload the saved station/CBD dataset as the input for the hospital step.
sdf_cbd_train = spark.read.parquet('../data/curated/train_cbd_distance_final')

#### Reading in the Hospital Data

In [31]:
# Load the raw Victorian hospitals table.
victoria_hospitals = pd.read_csv('../data/raw/vic_hospitals.csv')

In [32]:
# The dataset lists "closed" hospitals with null latitude and longitude; keep
# only the rows where both coordinates are present.
has_coordinates = victoria_hospitals[['Latitude', 'Longitude']].notna().all(axis=1)
victoria_hospitals = victoria_hospitals[has_coordinates]

#### Calculating the haversine distance to the nearest hospital for each property

In [33]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number, monotonically_increasing_id
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

# Attach to the running Spark session.
spark = SparkSession.builder.getOrCreate()

# Accept the hospitals table either as pandas (converted here) or as an
# existing Spark DataFrame. Requires haversine_udf from earlier in the notebook.
hospital_df_1 = (
    spark.createDataFrame(victoria_hospitals)
    if isinstance(victoria_hospitals, pd.DataFrame)
    else victoria_hospitals
)

# Rename the hospital coordinates so they do not clash with the listings'
# own longitude/latitude columns.
hospital_df_1 = (
    hospital_df_1
    .withColumnRenamed('Longitude', 'hospital_longitude')
    .withColumnRenamed('Latitude', 'hospital_latitude')
)

# Cast all four coordinate columns to double before computing distances.
sdf_cbd_train = (
    sdf_cbd_train
    .withColumn("longitude", col("longitude").cast(DoubleType()))
    .withColumn("latitude", col("latitude").cast(DoubleType()))
)
hospital_df_1 = (
    hospital_df_1
    .withColumn("hospital_longitude", col("hospital_longitude").cast(DoubleType()))
    .withColumn("hospital_latitude", col("hospital_latitude").cast(DoubleType()))
)

# Pair every property with every hospital and compute the haversine distance
# (km) for each pair.
pair_distance = haversine_udf(
    col("longitude"), col("latitude"),
    col("hospital_longitude"), col("hospital_latitude"),
)
distances_df = sdf_cbd_train.crossJoin(hospital_df_1).withColumn("distance", pair_distance)

# Rank the hospitals of each property from nearest to farthest.
windowSpec = Window.partitionBy("property_id").orderBy("distance")

# Keep only the closest hospital per property (rank 1).
nearest_hospitals_df = (
    distances_df
    .withColumn("rank", row_number().over(windowSpec))
    .filter(col("rank") == 1)
    .select("property_id", "Code", "distance")
)

# Attach the nearest hospital's code and distance to the full listing rows.
final_df = sdf_cbd_train.join(nearest_hospitals_df, "property_id").select(
    sdf_cbd_train["*"],
    nearest_hospitals_df["Code"].alias("nearest_hospital_id"),
    nearest_hospitals_df["distance"].alias("nearest_hospital_distance"),
)

# Show the result
final_df.show()




+--------------------+--------------------+--------------------+--------------------+------------------+-----------+----+-----+-------+------+---------------+--------------------+-----------+---------+--------------------+--------+---------+--------+------------------+--------+--------------------+--------+-----------------+--------+--------+--------+---------+--------+--------------------+--------------------+--------------+-----------+------------------+-------------------------+------------------+-------------------+-------------------------+
|                 url|               price|             address|       property_type|          latitude|  longitude|Beds|Baths|Parking|  bond|extracted_price|            geometry|index_right| sa2_code|            sa2_name|chg_flag|  chg_lbl|sa3_code|          sa3_name|sa4_code|            sa4_name|gcc_code|         gcc_name|ste_code|ste_name|aus_code| aus_name|areasqkm|            loci_uri|       geometry_proj|        suburb|property_id|nearest

                                                                                

#### Log-transforming the distance features and writing the final dataframe with train station, CBD and hospital distances

In [34]:
from pyspark.sql import functions as F

# New log-scaled column -> source distance column. log1p (log(1 + x)) is used
# so that a distance of 0 stays finite.
log_features = {
    'log_nearest_station_distance': 'train_station_distance_km',
    'log_nearest_hospital_distance': 'nearest_hospital_distance',
    'log_cbd_distance': 'cbd_distance_km',
}
for log_column, source_column in log_features.items():
    final_df = final_df.withColumn(log_column, F.log1p(source_column))


In [36]:
# Preview the final frame, including the three log-transformed distance columns.
final_df.show()

                                                                                

+--------------------+--------------------+--------------------+--------------------+------------------+-----------+----+-----+-------+------+---------------+--------------------+-----------+---------+--------------------+--------+---------+--------+------------------+--------+--------------------+--------+-----------------+--------+--------+--------+---------+--------+--------------------+--------------------+--------------+-----------+------------------+-------------------------+------------------+-------------------+-------------------------+----------------------------+-----------------------------+------------------+
|                 url|               price|             address|       property_type|          latitude|  longitude|Beds|Baths|Parking|  bond|extracted_price|            geometry|index_right| sa2_code|            sa2_name|chg_flag|  chg_lbl|sa3_code|          sa3_name|sa4_code|            sa4_name|gcc_code|         gcc_name|ste_code|ste_name|aus_code| aus_name|areasqk

In [35]:
# Persist the final dataset (station, CBD and hospital distances), replacing any previous output.
final_df.write.parquet('../data/curated/final_train_hospital_cbd_dist_data',mode='overwrite')

                                                                                