In [31]:
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType, IntegerType
from pyspark.sql.functions import col, regexp_extract, expr, lower, split, explode, trim, col, regexp_replace, initcap, avg, when

# Initialize SparkSession
spark = SparkSession.builder.appName("Preprocess").getOrCreate()

# Suppose your JSON data is stored in a file named 'data.json'
# Ensure that your JSON data is properly formatted and valid
with open('../data/raw/property_metadata.json', 'r') as file:
    data = json.load(file)

# Transform the data into a list of dictionaries
rows = []
for url, details in data.items():
    row = {'url': url}
    row.update(details)
    # Convert 'latitude' and 'longitude' to float or None if not found or 'Not found'
    row['latitude'] = (
        float(row['latitude']) if 'latitude' in row and row['latitude'] != 'Not found' else None
    )
    row['longitude'] = (
        float(row['longitude']) if 'longitude' in row and row['longitude'] != 'Not found' else None
    )
    
    rows.append(row)

# Define the schema with columns in the desired order, including latitude and longitude
schema = StructType([
    StructField('url', StringType(), True),
    StructField('name', StringType(), True),
    StructField('suburb', StringType(), True),
    StructField('cost_text', StringType(), True),
    StructField('rooms', ArrayType(StringType()), True),
    StructField('parking', ArrayType(StringType()), True),
    StructField('desc', StringType(), True),
    StructField('latitude', DoubleType(), True),
    StructField('longitude', DoubleType(), True),
])

# Create the DataFrame with the defined schema
df = spark.createDataFrame(rows, schema=schema)

# Show the DataFrame
df.show(truncate=False)

24/09/22 15:40:24 WARN TaskSetManager: Stage 55 contains a task of very large size (2307 KiB). The maximum recommended task size is 1000 KiB.


+-------------------------------------------------------------------------------------+---------------------------------------------------+------------+-------------------------+-----------------+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+
|url                 

In [41]:
# Extract 'beds'
df = df.withColumn('beds_str', expr("filter(rooms, x -> x like '%Bed%')[0]"))
df = df.withColumn('beds', regexp_extract(col('beds_str'), '(\\d+)', 1).cast(IntegerType()))

# Extract 'baths'
df = df.withColumn('baths_str', expr("filter(rooms, x -> x like '%Bath%')[0]"))
df = df.withColumn('baths', regexp_extract(col('baths_str'), '(\\d+)', 1).cast(IntegerType()))

# Drop temporary columns
df = df.drop('beds_str', 'baths_str')

# Destructure the parking array to just a number
df = df.withColumn(
    'parking_space',
    when(col('parking').getItem(0) == '-', 0)  # Check for '-'
     .otherwise(regexp_extract(col('parking').getItem(0), r'(\d+)', 1).cast('int'))  # Extract the number
)

df = df.fillna({'parking_space': 0})

df = df.filter(~lower(col('desc')).contains('storage'))

# Filter out rows where both 'beds' and 'baths' are NULL
df = df.filter((col('beds').isNotNull()) | (col('baths').isNotNull()))

In [33]:
import folium
import pandas as pd

# Convert the PySpark DataFrame to Pandas
pandas_df = df.toPandas()

# Drop rows with NaN values in latitude or longitude
pandas_df = pandas_df.dropna(subset=['latitude', 'longitude'])

# Create a map centered around the mean latitude and longitude of the remaining points
map_center = [pandas_df['latitude'].mean(), pandas_df['longitude'].mean()]
my_map = folium.Map(location=map_center, zoom_start=50)

# Add markers for each property
for _, row in pandas_df.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=row['name'],
        icon=folium.Icon(color="blue", icon="info-sign")
    ).add_to(my_map)

# Display the map
# my_map

24/09/22 15:40:24 WARN TaskSetManager: Stage 57 contains a task of very large size (2307 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [99]:
# Define regex pattern to match weekly rent
# weekly_rent_pattern = r"\b\d{1,3}(?:,\d{3})*(?:\.\d{2})?\b\s*(?:(?i)per week|pw|weekly|p\.w|wk)\b"
# weekly_rent_pattern = r"(?:\b\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s*(?:(?i)per week|(?i)pw|(?i)weekly|(?i)p\.w|(?i)wk|(?i)p\.w\.)\b|^\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?)$"

weekly_rent_pattern = r"(?:\b\$?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{2})?\b\s*(?:/)?\s*?(?:(?i)(per week|pw|weekly|p\.w|wk|p\.w\.|- Fully Furnished))|^\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?)$"

# Use regexp_extract to extract only rows with weekly rent information
df_filtered = df.withColumn("cost", regexp_extract(col("cost_text"), weekly_rent_pattern, 0))

# Remove non-numeric characters like '$' and ',' to prepare for casting
df_cleaned = df_filtered.withColumn("cost", regexp_replace(col("cost"), "[$,]", ""))

# Extract only the numeric part and cast to float
df_final = df_cleaned.withColumn("cost", regexp_extract(col("cost"), r"\d+(\.\d+)?", 0).cast("float"))

# Filter out rows where the cost is not empty (valid matches)
df_valid_rent = df_final.filter(col("cost").isNotNull())

# Create a DataFrame for filtered out rows
df_filtered_out = df_final.filter(col("cost").isNull())

# Show the filtered out rows
df_filtered_out[['cost_text']].show(1000, truncate=False)

# print(df_valid_rent.show(truncate=False))
print(df_valid_rent.count())


24/09/22 16:28:55 WARN TaskSetManager: Stage 242 contains a task of very large size (2307 KiB). The maximum recommended task size is 1000 KiB.
24/09/22 16:28:55 WARN TaskSetManager: Stage 243 contains a task of very large size (2922 KiB). The maximum recommended task size is 1000 KiB.


+------------------------------------------------------------+
|cost_text                                                   |
+------------------------------------------------------------+
|$45,000 for winter season                                   |
|$45,000 for winter season                                   |
|$49,500 for the season                                      |
|$49,500 for the season                                      |
|Contact Agent                                               |
|$300 pw Furnished                                           |
|Under Application pw                                        |
|For Pricing & Bookings - accommdinnerplain.com.au           |
|No Deposit Required or Rent2own                             |
|$385.00 p/w                                                 |
|$380 P/W                                                    |
|$475.00 p/w                                                 |
|**APPLICATIONS NOW CLOSED**                           

24/09/22 16:28:56 WARN TaskSetManager: Stage 244 contains a task of very large size (2307 KiB). The maximum recommended task size is 1000 KiB.


11982


In [100]:
df_valid_rent[["cost_text"]].show(1000)

+------------------+
|         cost_text|
+------------------+
|     $360 per week|
|     $495 per week|
|              $750|
|     $620 per week|
|              $450|
|     $480 per week|
|              $800|
|              $550|
|              $400|
|     $580 per week|
|     $370 per week|
|     $360 per week|
|       $510 weekly|
|              $625|
|     $675 per week|
|           $420 pw|
|              $300|
|              $220|
|           $550 wk|
|           $360 pw|
|           $430 wk|
|  $470.00 per week|
|     $420 per week|
|           $480 wk|
|           $420 wk|
|              $350|
|  $380.00 per week|
|     $470 per week|
|     $500 per week|
|     $450 per week|
|     $300 per week|
|  $370.00 per week|
|     $290 per week|
|  $450.00 per week|
|     $560 per week|
|     $420 per week|
|           $500 wk|
|     $495 per week|
|  $560.00 per week|
|     $550 per week|
|        $380.00 pw|
|     $540 Per Week|
|              $450|
|  $420.00 per week|
|     $550 pe

24/09/22 16:28:59 WARN TaskSetManager: Stage 247 contains a task of very large size (2307 KiB). The maximum recommended task size is 1000 KiB.


In [35]:
mapping_sal = pd.read_csv('../data/raw/SAL_mapping.csv')

mapping_sal = mapping_sal.dropna()

print(mapping_sal)

mapping_sal[['Rental suburbs', 'SAL suburbs (gazetted localities)']].head(5)

# Normalize the delimiters in the Suburb_Variants column
mapping_sal['SAL suburbs (gazetted localities)'] = mapping_sal['SAL suburbs (gazetted localities)'].str.replace(r'[-–—]', ' - ', regex=True)

# Function to capitalize and flatten the suburb variants
def flatten_suburbs(suburb_variants):
    # Capitalize each suburb variant and split them into a list
    return [suburb.title().strip() for suburb in suburb_variants.split(' - ')]

# Apply the function to create a new column
mapping_sal['Standardized_Suburb'] = mapping_sal['SAL suburbs (gazetted localities)'].apply(flatten_suburbs)

# Explode the list into separate rows, keeping the Main_Suburb column
mapping_sal = mapping_sal.explode('Standardized_Suburb')

# Keep only necessary columns
mapping_sal = mapping_sal[['Rental suburbs', 'Standardized_Suburb']]

# Display the final DataFrame
print(mapping_sal)

     Unnamed: 0           Suburb Cluster  \
0             0          Inner Melbourne   
22           23  Inner Eastern Melbourne   
39           41       Southern Melbourne   
52           55  Outer Western Melbourne   
64           68  North Western Melbourne   
77           82  North Eastern Melbourne   
89           95  Outer Eastern Melbourne   
97          104  South Eastern Melbourne   
105         113     Mornington Peninsula   
110         119                  Geelong   
117         127                 Ballarat   
121         132                  Bendigo   
125         137   Other Regional Centres   

                            Rental suburbs  \
0    Albert Park-Middle Park-West St Kilda   
22                                  Balwyn   
39                Aspendale-Chelsea-Carrum   
52                                  Altona   
64              Broadmeadows-Roxburgh Park   
77      Bundoora-Greensborough-Hurstbridge   
89                               Bayswater   
97             

In [101]:
# Load the CSV file into a DataFrame
mapping_sal = spark.read.csv('../data/raw/SAL_mapping.csv', header=True, inferSchema=True)

# Drop rows with null values
mapping_sal = mapping_sal.na.drop()

# Show the first 5 rows of relevant columns
# mapping_sal.select('Rental Suburbs', 'SAL suburbs (gazetted localities)').show(5)

# Normalize delimiters in the 'SAL suburbs (gazetted localities)' column
mapping_sal = mapping_sal.withColumn('SAL suburbs (gazetted localities)', regexp_replace(col('SAL suburbs (gazetted localities)'), r'[-–—]', ' - '))

# Function to capitalize and flatten the suburb variants
def flatten_suburbs(suburb_variants):
    return [suburb.title().strip() for suburb in suburb_variants.split(' - ')]

# Register the function as a UDF (User Defined Function)
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

flatten_suburbs_udf = udf(flatten_suburbs, ArrayType(StringType()))

# Apply the UDF to create a new column
mapping_sal = mapping_sal.withColumn('Standard_Suburb', flatten_suburbs_udf(col('SAL suburbs (gazetted localities)')))


# mapping_sal.show(truncate=False)
# Explode the list into separate rows
mapping_sal = mapping_sal.withColumn('Standard_Suburb', explode(col('Standard_Suburb')))


# mapping_sal.show(truncate=False)

# Keep only necessary columns
mapping_sal = mapping_sal.select('Rental Suburbs', 'Standard_Suburb')

mapping_sal.show(truncate=False)

# Inner join on Rental Suburbs and Standard_Suburb
joined_df = mapping_sal.join(df_valid_rent, 
                    mapping_sal['Standard_Suburb'] == df_valid_rent['suburb'],
                    how="inner")

joined_df = joined_df.drop('suburb')

joined_df.show(truncate=False)

24/09/22 16:29:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Suburb Cluster, Rental suburbs, SAL suburbs (gazetted localities)
 Schema: _c0, Suburb Cluster, Rental suburbs, SAL suburbs (gazetted localities)
Expected: _c0 but found: 
CSV file: file:///mnt/e/Coding_things/ADS/project-2-group-real-estate-industry-project-8/data/raw/SAL_mapping.csv


+-------------------------------------+-----------------+
|Rental Suburbs                       |Standard_Suburb  |
+-------------------------------------+-----------------+
|Albert Park-Middle Park-West St Kilda|Albert Park      |
|Albert Park-Middle Park-West St Kilda|Middle Park      |
|Albert Park-Middle Park-West St Kilda|St Kilda West    |
|Balwyn                               |Balwyn           |
|Balwyn                               |Balwyn North     |
|Balwyn                               |Deepdene         |
|Aspendale-Chelsea-Carrum             |Aspendale        |
|Aspendale-Chelsea-Carrum             |Aspendale Gardens|
|Aspendale-Chelsea-Carrum             |Edithvale        |
|Aspendale-Chelsea-Carrum             |Chelsea          |
|Aspendale-Chelsea-Carrum             |Chelsea Heights  |
|Aspendale-Chelsea-Carrum             |Bonbeach         |
|Aspendale-Chelsea-Carrum             |Patterson Lakes  |
|Aspendale-Chelsea-Carrum             |Bonbeach         |
|Altona       

24/09/22 16:29:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Suburb Cluster, Rental suburbs, SAL suburbs (gazetted localities)
 Schema: _c0, Suburb Cluster, Rental suburbs, SAL suburbs (gazetted localities)
Expected: _c0 but found: 
CSV file: file:///mnt/e/Coding_things/ADS/project-2-group-real-estate-industry-project-8/data/raw/SAL_mapping.csv
24/09/22 16:29:10 WARN TaskSetManager: Stage 252 contains a task of very large size (2307 KiB). The maximum recommended task size is 1000 KiB.


+----------------------------------+---------------+-----------------------------------------------------------------------------+-------------------------------------------+----------------+-----------------+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+----+----

In [104]:
aggregated_df = joined_df.groupBy("Standard_Suburb", "Rental Suburbs").agg(
    avg("beds").alias("average_beds"),
    avg("baths").alias("average_baths"),
    avg("parking_space").alias("average_parking"),
    avg("cost").alias("average_cost")
)

# Show the result
aggregated_df.show(truncate=False)

24/09/22 16:31:39 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Suburb Cluster, Rental suburbs, SAL suburbs (gazetted localities)
 Schema: _c0, Suburb Cluster, Rental suburbs, SAL suburbs (gazetted localities)
Expected: _c0 but found: 
CSV file: file:///mnt/e/Coding_things/ADS/project-2-group-real-estate-industry-project-8/data/raw/SAL_mapping.csv
24/09/22 16:31:40 WARN TaskSetManager: Stage 258 contains a task of very large size (2307 KiB). The maximum recommended task size is 1000 KiB.

+---------------+----------------------------------+------------------+------------------+------------------+-----------------+
|Standard_Suburb|Rental Suburbs                    |average_beds      |average_baths     |average_parking   |average_cost     |
+---------------+----------------------------------+------------------+------------------+------------------+-----------------+
|Kingsbury      |Bundoora-Greensborough-Hurstbridge|2.5               |1.75              |1.625             |474.375          |
|Greensborough  |Bundoora-Greensborough-Hurstbridge|3.272727272727273 |2.0               |1.9090909090909092|633.1818181818181|
|Watsonia North |Bundoora-Greensborough-Hurstbridge|4.0               |1.5               |1.5               |630.0            |
|Balwyn North   |Balwyn                            |3.4166666666666665|2.0555555555555554|1.9166666666666667|879.8611111111111|
|Balwyn         |Balwyn                            |2.7333333333333334|1.6666666666666667|1.4           

                                                                                