In [None]:
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType, IntegerType
from pyspark.sql.functions import col, regexp_extract, expr, lower

# Obtain the Spark session used by every later cell. getOrCreate() hands back
# an already-running session when there is one, so re-running this cell is safe.
spark = (
    SparkSession.builder
    .appName("LoadJSON")
    .getOrCreate()
)

# Load the raw property metadata from disk into a plain Python object.
# The file must contain valid JSON; json.load raises json.JSONDecodeError otherwise.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the result does
# not depend on the platform's locale default, which can mis-decode or fail on
# non-ASCII characters in the text fields.
with open('../data/raw/property_metadata.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten the {url: details} mapping into a list of row dictionaries, one per
# entry, with the mapping key stored under 'url'.
def _to_float_or_none(value):
    """Return ``value`` converted to float, or None when it cannot be converted.

    Covers a missing key (None), a JSON null, and any non-numeric placeholder
    string such as 'Not found', instead of raising TypeError/ValueError.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


rows = []
for url, details in data.items():
    row = {'url': url}
    row.update(details)
    # Coordinates must be float-or-None to fit the DoubleType schema columns;
    # .get() yields None when the key is absent from this entry.
    row['latitude'] = _to_float_or_none(row.get('latitude'))
    row['longitude'] = _to_float_or_none(row.get('longitude'))

    rows.append(row)

# Explicit schema for the property DataFrame. The order of this table is the
# column order of the resulting DataFrame; every column is nullable.
# NOTE(review): row dicts may carry keys that are not listed here; presumably
# Spark ignores them when an explicit schema is supplied - confirm.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('url', StringType()),
        ('name', StringType()),
        ('suburb', StringType()),
        ('cost_text', StringType()),
        ('rooms', ArrayType(StringType())),
        ('parking', ArrayType(StringType())),
        ('desc', StringType()),
        ('latitude', DoubleType()),
        ('longitude', DoubleType()),
    )
])

# Materialize the row dictionaries as a Spark DataFrame using that schema.
df = spark.createDataFrame(rows, schema=schema)

# Preview the first 20 rows without truncating long cell values.
df.show(n=20, truncate=False)

In [4]:

# Derive integer 'beds' and 'baths' columns from the 'rooms' string array.
#
# For each target column the chain below:
#   1. keeps the first 'rooms' entry matching the keyword pattern
#      (temporary '<name>_str' column),
#   2. pulls the first run of digits out of that entry with regexp_extract,
#   3. casts the digits to IntegerType,
# and finally drops the two temporary columns again.
#
# NOTE(review): SQL `like` is case-sensitive, so only entries containing 'Bed' /
# 'Bath' with that exact capitalisation match - confirm against the raw data.
# NOTE(review): rows with no matching entry (or no digits) presumably end up
# null; under ANSI SQL mode the `[0]` index or the cast may raise instead - verify.
df = (
    df
    .withColumn('beds_str', expr("filter(rooms, x -> x like '%Bed%')[0]"))
    .withColumn('beds', regexp_extract(col('beds_str'), '(\\d+)', 1).cast(IntegerType()))
    .withColumn('baths_str', expr("filter(rooms, x -> x like '%Bath%')[0]"))
    .withColumn('baths', regexp_extract(col('baths_str'), '(\\d+)', 1).cast(IntegerType()))
    .drop('beds_str', 'baths_str')
)

# Optionally, drop the 'rooms' column if no longer needed
# df = df.drop('rooms')

In [5]:
# Remove listings whose description mentions "storage" (case-insensitive).
# Rows with a null 'desc' are kept explicitly: for them the negated contains()
# evaluates to NULL, and filter() discards rows whose condition is NULL, which
# would silently drop every listing that simply has no description.
df = df.filter(col('desc').isNull() | ~lower(col('desc')).contains('storage'))

In [None]:
# Preview the first 20 rows of the filtered DataFrame with full-width cell values.
df.show(n=20, truncate=False)

In [None]:
import folium
import pandas as pd

# Collect the Spark DataFrame to the driver as a pandas DataFrame for plotting.
pandas_df = df.toPandas()

# Keep only properties that have both coordinates; the rest cannot be plotted.
pandas_df = pandas_df.dropna(subset=['latitude', 'longitude'])

# Fail early with a clear message: with no rows the mean below would be NaN and
# folium would reject the map location with a less helpful ValueError.
if pandas_df.empty:
    raise ValueError("No properties with valid latitude/longitude to plot")

# Centre the map on the mean coordinate of the remaining points.
# zoom_start is only a fallback (used when all points coincide); the previous
# value of 50 is far beyond the zoom levels web map tiles provide (roughly 18-19),
# which opened the map fully zoomed in on the mean point rather than on the data.
map_center = [pandas_df['latitude'].mean(), pandas_df['longitude'].mean()]
my_map = folium.Map(location=map_center, zoom_start=12)

# Add one marker per property, with the property name as the popup text.
for name, latitude, longitude in zip(
    pandas_df['name'], pandas_df['latitude'], pandas_df['longitude']
):
    folium.Marker(
        location=[latitude, longitude],
        popup=name,
        icon=folium.Icon(color="blue", icon="info-sign")
    ).add_to(my_map)

# Fit the viewport to the bounding box of all markers so every property is
# visible on load. Skipped when the box is a single point (nothing to fit).
south_west = [float(pandas_df['latitude'].min()), float(pandas_df['longitude'].min())]
north_east = [float(pandas_df['latitude'].max()), float(pandas_df['longitude'].max())]
if south_west != north_east:
    my_map.fit_bounds([south_west, north_east])

# Display the map (left disabled, as in the original cell; uncomment to render)
# my_map