In [19]:
import pandas as pd

# Load the distances table. Per the preview below it holds each address
# ('name'), its postcode and coordinates, and two precomputed distance columns.
file_path = "../data/raw/distances.parquet"
df = pd.read_parquet(file_path)

# End the cell with the expression instead of print(): Jupyter then renders
# the preview as a rich table rather than a wrapped plain-text dump.
df.head()


                                                name postcode   latitude  \
0      (Leased) 3 Yarra Street, South Yarra VIC 3141     3141 -37.838131   
1  0 Cnr Watson Rd & McPherson Rd, Leongatha VIC ...     3953 -38.468905   
2      004B/12 Albert Street, Hawthorn East VIC 3123     3123 -37.823602   
3           04/390 Burwood Highway, Burwood VIC 3125     3125 -37.852123   
4       04/949 Dandenong Road, Malvern East VIC 3145     3145 -37.878580   

    longitude  distance_to_closest_school_km  distance_to_cbd  
0  144.992734                          1.540            5.569  
1  145.956614                          0.413          136.184  
2  145.048067                          1.237            9.083  
3  145.130944                          1.409           17.834  
4  145.048328                          2.226           14.936  


In [22]:
from pyspark.sql import SparkSession


spark = SparkSession.builder.appName("Add Column from One Parquet to Another").getOrCreate()

# Inputs: the distances table receives the extra column; the property
# listings table supplies it.
file1_path = "../data/raw/distances.parquet"
file2_path = "../data/raw/property_data.parquet"

df1 = spark.read.parquet(file1_path)
df2 = spark.read.parquet(file2_path)

# Print both schemas so the shared key ('name') and the column being added
# ('cost_text') can be checked by eye.
df1.printSchema()
df2.printSchema()

# Keep only the join key and the column to add, reduced to one row per name.
# A left join emits one output row per matching right-hand row, so repeated
# names in the listings would otherwise multiply the distances rows.
# NOTE: when a name is repeated with different cost_text values, which one
# survives dropDuplicates is arbitrary.
df2_selected = df2.select("name", "cost_text").dropDuplicates(["name"])

# Left join on 'name': every distances row is kept, and rows without a
# matching listing get a null cost_text.
df_merged = df1.join(df2_selected, on="name", how="left")

# Preview the merged rows (show() prints the first 20 by default).
df_merged.show()

# Save the merged DataFrame, replacing any previous output at this path.
output_file_path = "../data/curated/map_features.parquet"
df_merged.write.mode("overwrite").parquet(output_file_path)

print(f"Merged Parquet file with additional column saved to {output_file_path}")


root
 |-- name: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- distance_to_closest_school_km: double (nullable = true)
 |-- distance_to_cbd: double (nullable = true)

root
 |-- url: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- suburb: string (nullable = true)
 |-- name: string (nullable = true)
 |-- cost_text: string (nullable = true)
 |-- beds: string (nullable = true)
 |-- baths: string (nullable = true)
 |-- parking: string (nullable = true)
 |-- property_type: string (nullable = true)

+--------------------+--------+------------------+-----------+-----------------------------+---------------+--------------------+
|                name|postcode|          latitude|  longitude|distance_to_closest_school_km|distance_to_cbd|           cost_text|
+--------------------+--------+------------------+-----------+-----------------------------+---------------+--------

In [15]:
from pathlib import Path

import folium

# Coordinates used as the initial map centre (Melbourne CBD).
melbourne_cbd_coords = [-37.8136, 144.9631]

# Load the marker table explicitly instead of relying on whatever `df`
# happens to hold in the kernel: the distances frame loaded earlier has no
# 'cost_text' or 'suburb' column, so the popup lookups below would raise
# KeyError on a fresh Restart & Run All.
map_features_path = "../data/curated/map_features.parquet"
property_data_path = "../data/raw/property_data.parquet"

# `pd` comes from the notebook's pandas import in the first cell.
map_df = pd.read_parquet(map_features_path)

# The curated file carries 'cost_text' but not necessarily 'suburb'; attach
# it from the raw listings (one row per name so the merge cannot add rows).
if "suburb" not in map_df.columns:
    suburbs = (
        pd.read_parquet(property_data_path, columns=["name", "suburb"])
        .drop_duplicates(subset="name")
    )
    map_df = map_df.merge(suburbs, on="name", how="left")

# folium rejects marker locations containing NaN, so skip rows that have no
# usable coordinates rather than aborting the whole map.
map_df = map_df.dropna(subset=["latitude", "longitude"])

rental_map = folium.Map(location=melbourne_cbd_coords, zoom_start=7)

# One marker per row; itertuples avoids building a Series for every row.
for listing in map_df.itertuples(index=False):
    folium.Marker(
        location=[listing.latitude, listing.longitude],
        popup=f"Price: ${listing.cost_text}<br>Suburb: {listing.suburb}",
        icon=folium.Icon(color='blue', icon='home'),
    ).add_to(rental_map)

# Save map to an HTML file, creating the output directory if it is missing.
output_map_path = "../data/plots/rental_properties_map.html"
Path(output_map_path).parent.mkdir(parents=True, exist_ok=True)
rental_map.save(output_map_path)

print(f"Map saved to {output_map_path}")


Map saved to ../data/plots/rental_properties_map.html
