In [31]:
import pyspark, re, json, os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col, regexp_replace, lower, explode, array_contains, collect_set, lit, coalesce, array, count, when, isnull, mean, rand
from pyspark.sql.types import IntegerType, StringType, FloatType, StructType, StructField
from pyspark.ml.feature import StringIndexer
import pandas as pd

In [32]:
# Initialize the Spark session used by every later cell.
# getOrCreate() hands back an existing session when one is already running,
# so re-running this cell does not build a second one.
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    # Eager evaluation: presumably what lets a bare `sdf` at the end of a cell
    # render as a table in this notebook — confirm against the Spark docs.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Session time zone is pinned to UTC
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

In [33]:
# Load the scraped listings from the landing JSON file.
# The encoding is passed explicitly: the listings contain non-ASCII characters
# (e.g. the "−" placeholder in the parking field) and, without it, open()
# falls back to the platform locale encoding, which is not UTF-8 everywhere.
with open('../data/landing/domain_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build a pandas DataFrame with one row per top-level JSON key; the keys
# (the listing URLs) become the index.
df = pd.DataFrame.from_dict(data, orient='index')

df

Unnamed: 0,name,cost_text,rooms,parking,description,prop_type,additional_features
https://www.domain.com.au/1001-39-queen-street-melbourne-vic-3000-17221118,"1001/39 Queen Street, Melbourne VIC 3000",$825 per week,"[3 Beds, 1 Bath]",[− Parking],This stunning residence in the historic Madiso...,Apartment / Unit / Flat,
https://www.domain.com.au/2817s-220-spencer-street-melbourne-vic-3000-17223080,"2817s/220 Spencer Street, Melbourne VIC 3000",$800 per week,"[2 Beds, 2 Baths]",[1 Parking],"Stunning Furnished Two Bedroom, Two Bathroom a...",Apartment / Unit / Flat,[Furnished]
https://www.domain.com.au/3207-80-a-beckett-street-melbourne-vic-3000-17119770,"3207/80 A'Beckett Street, Melbourne VIC 3000",$760/w FURNISHED,"[2 Beds, 1 Bath]",[− Parking],The MY80 development gives you the best that t...,Apartment / Unit / Flat,"[Gym, Intercom, Built in wardrobes, Dishwasher..."
https://www.domain.com.au/1413-199-william-street-melbourne-vic-3000-17225646,"1413/199 William Street, Melbourne VIC 3000",$650,"[2 Beds, 1 Bath]",[− Parking],Excellent FULLY FURNISHED property not to miss...,Apartment / Unit / Flat,"[Balcony, Outdoor Entertainment Area, Swimming..."
https://www.domain.com.au/2505-371-little-lonsdale-street-melbourne-vic-3000-17224306,"2505/371 Little Lonsdale Street, Melbourne VIC...",$550,"[1 Bed, 1 Bath]",[− Parking],"380 Melbourne elevates style, convenience, and...",Apartment / Unit / Flat,
...,...,...,...,...,...,...,...
https://www.domain.com.au/19-grazing-road-weir-views-vic-3338-17214974,"19 Grazing Road, Weir Views VIC 3338",$460 per week,"[3 Beds, 2 Baths]",[2 Parking],Spacious 3-bedroom open plan living home offer...,House,[Ensuite]
https://www.domain.com.au/9-norwood-avenue-weir-views-vic-3338-17111843,"9 Norwood Avenue, Weir Views VIC 3338",$460/week,"[4 Beds, 2 Baths]",[2 Parking],A perfect blend of comfort and modern living. ...,House,[]
https://www.domain.com.au/16-shackell-st-weir-views-vic-3338-17222155,"16 Shackell St, Weir Views VIC 3338",$460 weekly,"[4 Beds, 2 Baths]",[2 Parking],"Welcome to your new home at 16 Shackell St, We...",House,[]
https://www.domain.com.au/34-metroon-drive-weir-views-vic-3338-17204576,"34 Metroon Drive, Weir Views VIC 3338",$450,"[3 Beds, 2 Baths]",[2 Parking],The Roof Real Estate is proudly showcasing thi...,House,


In [34]:
# Move the URL index into an ordinary column called 'property_url', so each
# listing keeps its identifier once the index becomes a plain 0..n-1 range.
df_reset = df.reset_index().rename(columns={'index': 'property_url'})
df_reset

Unnamed: 0,property_url,name,cost_text,rooms,parking,description,prop_type,additional_features
0,https://www.domain.com.au/1001-39-queen-street...,"1001/39 Queen Street, Melbourne VIC 3000",$825 per week,"[3 Beds, 1 Bath]",[− Parking],This stunning residence in the historic Madiso...,Apartment / Unit / Flat,
1,https://www.domain.com.au/2817s-220-spencer-st...,"2817s/220 Spencer Street, Melbourne VIC 3000",$800 per week,"[2 Beds, 2 Baths]",[1 Parking],"Stunning Furnished Two Bedroom, Two Bathroom a...",Apartment / Unit / Flat,[Furnished]
2,https://www.domain.com.au/3207-80-a-beckett-st...,"3207/80 A'Beckett Street, Melbourne VIC 3000",$760/w FURNISHED,"[2 Beds, 1 Bath]",[− Parking],The MY80 development gives you the best that t...,Apartment / Unit / Flat,"[Gym, Intercom, Built in wardrobes, Dishwasher..."
3,https://www.domain.com.au/1413-199-william-str...,"1413/199 William Street, Melbourne VIC 3000",$650,"[2 Beds, 1 Bath]",[− Parking],Excellent FULLY FURNISHED property not to miss...,Apartment / Unit / Flat,"[Balcony, Outdoor Entertainment Area, Swimming..."
4,https://www.domain.com.au/2505-371-little-lons...,"2505/371 Little Lonsdale Street, Melbourne VIC...",$550,"[1 Bed, 1 Bath]",[− Parking],"380 Melbourne elevates style, convenience, and...",Apartment / Unit / Flat,
...,...,...,...,...,...,...,...,...
11753,https://www.domain.com.au/19-grazing-road-weir...,"19 Grazing Road, Weir Views VIC 3338",$460 per week,"[3 Beds, 2 Baths]",[2 Parking],Spacious 3-bedroom open plan living home offer...,House,[Ensuite]
11754,https://www.domain.com.au/9-norwood-avenue-wei...,"9 Norwood Avenue, Weir Views VIC 3338",$460/week,"[4 Beds, 2 Baths]",[2 Parking],A perfect blend of comfort and modern living. ...,House,[]
11755,https://www.domain.com.au/16-shackell-st-weir-...,"16 Shackell St, Weir Views VIC 3338",$460 weekly,"[4 Beds, 2 Baths]",[2 Parking],"Welcome to your new home at 16 Shackell St, We...",House,[]
11756,https://www.domain.com.au/34-metroon-drive-wei...,"34 Metroon Drive, Weir Views VIC 3338",$450,"[3 Beds, 2 Baths]",[2 Parking],The Roof Real Estate is proudly showcasing thi...,House,


In [35]:
# Persist the pandas frame as parquet (integer index omitted) so that Spark
# can read it in the next cell.
df_reset.to_parquet('../data/raw/domain_data_with_id.parquet', index=False)

In [36]:
# Re-load the listings as a Spark DataFrame and preview the first rows.
sdf = spark.read.parquet('../data/raw/domain_data_with_id.parquet')
sdf.limit(15)

property_url,name,cost_text,rooms,parking,description,prop_type,additional_features
https://www.domai...,1001/39 Queen Str...,$825 per week,"[3 Beds, 1 Bath]",[− Parking],This stunning res...,Apartment / Unit ...,
https://www.domai...,2817s/220 Spencer...,$800 per week,"[2 Beds, 2 Baths]",[1 Parking],Stunning Furnishe...,Apartment / Unit ...,[Furnished]
https://www.domai...,3207/80 A'Beckett...,$760/w FURNISHED,"[2 Beds, 1 Bath]",[− Parking],The MY80 developm...,Apartment / Unit ...,"[Gym, Intercom, B..."
https://www.domai...,1413/199 William ...,$650,"[2 Beds, 1 Bath]",[− Parking],Excellent FULLY F...,Apartment / Unit ...,"[Balcony, Outdoor..."
https://www.domai...,2505/371 Little L...,$550,"[1 Bed, 1 Bath]",[− Parking],380 Melbourne ele...,Apartment / Unit ...,
https://www.domai...,208/547 Flinders ...,$525 weekly,"[2 Beds, 1 Bath]",[− Parking],***TO BOOK A TIME...,Apartment / Unit ...,
https://www.domai...,1910/22 - 24 Jane...,$525.00 pw,"[1 Bed, 1 Bath]",[1 Parking],This unfurnished ...,Apartment / Unit ...,[Air conditioning...
https://www.domai...,303/18-20 Bank Pl...,$525 per week,"[1 Bed, 1 Bath]",[− Parking],HOW DO I REGISTER...,Apartment / Unit ...,[Built in wardrob...
https://www.domai...,819/555 Flinders ...,$525 per week NOT...,"[1 Bed, 1 Bath]",[1 Parking],Perched high on t...,Apartment / Unit ...,"[Ducted Cooling, ..."
https://www.domai...,2704/288 Spencer ...,$480 per week,"[1 Bed, 1 Bath]",[− Parking],**Booking an insp...,Apartment / Unit ...,"[Gym, Built in wa..."


In [37]:
# Inspect the column types Spark read from the parquet file.
sdf.dtypes

[('property_url', 'string'),
 ('name', 'string'),
 ('cost_text', 'string'),
 ('rooms', 'array<string>'),
 ('parking', 'array<string>'),
 ('description', 'string'),
 ('prop_type', 'string'),
 ('additional_features', 'array<string>')]

In [38]:
# Lower-case every plain-string column in a single projection; the array
# columns (rooms, parking, additional_features) pass through unchanged and
# the column order is preserved.
string_columns = ['property_url', 'name', 'cost_text', 'description', 'prop_type']
sdf = sdf.select(
    *[
        lower(col(c)).alias(c) if c in string_columns else col(c)
        for c in sdf.columns
    ]
)

sdf

property_url,name,cost_text,rooms,parking,description,prop_type,additional_features
https://www.domai...,1001/39 queen str...,$825 per week,"[3 Beds, 1 Bath]",[− Parking],this stunning res...,apartment / unit ...,
https://www.domai...,2817s/220 spencer...,$800 per week,"[2 Beds, 2 Baths]",[1 Parking],stunning furnishe...,apartment / unit ...,[Furnished]
https://www.domai...,3207/80 a'beckett...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],the my80 developm...,apartment / unit ...,"[Gym, Intercom, B..."
https://www.domai...,1413/199 william ...,$650,"[2 Beds, 1 Bath]",[− Parking],excellent fully f...,apartment / unit ...,"[Balcony, Outdoor..."
https://www.domai...,2505/371 little l...,$550,"[1 Bed, 1 Bath]",[− Parking],380 melbourne ele...,apartment / unit ...,
https://www.domai...,208/547 flinders ...,$525 weekly,"[2 Beds, 1 Bath]",[− Parking],***to book a time...,apartment / unit ...,
https://www.domai...,1910/22 - 24 jane...,$525.00 pw,"[1 Bed, 1 Bath]",[1 Parking],this unfurnished ...,apartment / unit ...,[Air conditioning...
https://www.domai...,303/18-20 bank pl...,$525 per week,"[1 Bed, 1 Bath]",[− Parking],how do i register...,apartment / unit ...,[Built in wardrob...
https://www.domai...,819/555 flinders ...,$525 per week not...,"[1 Bed, 1 Bath]",[1 Parking],perched high on t...,apartment / unit ...,"[Ducted Cooling, ..."
https://www.domai...,2704/288 spencer ...,$480 per week,"[1 Bed, 1 Bath]",[− Parking],**booking an insp...,apartment / unit ...,"[Gym, Built in wa..."


In [39]:
# Write the lower-cased listings out as a Spark parquet dataset, replacing
# whatever a previous run left at this path.
sdf.write.parquet("../data/raw/domain_data", mode="overwrite")

In [40]:
# Every (name, description) pair that appears on more than one listing.
duplicates = sdf.groupBy("name", "description").agg(count("property_url").alias("count")).filter(col("count") > 1)

# Anti-join against those pairs: a row survives only when its
# (name, description) is NOT present in `duplicates`.
# NOTE(review): this removes every copy of a duplicated listing, not just the
# extra ones — confirm that is intended; dropDuplicates(["name", "description"])
# would keep one copy per pair instead.
sdf = sdf.join(duplicates, on=["name", "description"], how="left_anti")

# Number of listings left after the anti-join
sdf.count()

11528

In [41]:
# Define a UDF to extract number of beds
def extract_beds(rooms):
    """Return the bedroom count found in a listing's ``rooms`` entries.

    Args:
        rooms: Iterable of strings such as ``['3 Beds', '1 Bath']``. May be
            ``None`` (null array column) or contain ``None`` entries.

    Returns:
        The integer that precedes ``Bed``/``Beds`` in the first entry that
        has one, or ``None`` when no entry carries a bed count.
    """
    # A null array reaches the UDF as None; iterating it would raise TypeError
    # and fail the whole Spark job, so treat it as "no entries".
    for room in rooms or ():
        if not room:
            continue  # skip null / empty entries inside the array
        match = re.search(r'(\d+)\s*Beds?', room)
        if match:
            return int(match.group(1))
    return None  # no bed information found

# Define a UDF to extract number of baths
def extract_baths(rooms):
    """Return the bathroom count found in a listing's ``rooms`` entries.

    Args:
        rooms: Iterable of strings such as ``['3 Beds', '1 Bath']``. May be
            ``None`` (null array column) or contain ``None`` entries.

    Returns:
        The integer that precedes ``Bath``/``Baths`` in the first entry that
        has one, or ``None`` when no entry carries a bath count.
    """
    # A null array reaches the UDF as None; iterating it would raise TypeError
    # and fail the whole Spark job, so treat it as "no entries".
    for room in rooms or ():
        if not room:
            continue  # skip null / empty entries inside the array
        match = re.search(r'(\d+)\s*Baths?', room)
        if match:
            return int(match.group(1))
    return None  # no bath information found

# Wrap the two Python extractors as Spark UDFs returning integers
extract_beds_udf = udf(extract_beds, returnType=IntegerType())
extract_baths_udf = udf(extract_baths, returnType=IntegerType())

# Derive the 'beds' and 'baths' columns from the 'rooms' array in one chain
sdf = (
    sdf
    .withColumn('beds', extract_beds_udf(col('rooms')))
    .withColumn('baths', extract_baths_udf(col('rooms')))
)

# Display the frame with the two new columns
sdf

name,description,property_url,cost_text,rooms,parking,prop_type,additional_features,beds,baths
1001/39 queen str...,this stunning res...,https://www.domai...,$825 per week,"[3 Beds, 1 Bath]",[− Parking],apartment / unit ...,,3,1
2817s/220 spencer...,stunning furnishe...,https://www.domai...,$800 per week,"[2 Beds, 2 Baths]",[1 Parking],apartment / unit ...,[Furnished],2,2
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1
1413/199 william ...,excellent fully f...,https://www.domai...,$650,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Balcony, Outdoor...",2,1
2505/371 little l...,380 melbourne ele...,https://www.domai...,$550,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,,1,1
208/547 flinders ...,***to book a time...,https://www.domai...,$525 weekly,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,,2,1
1910/22 - 24 jane...,this unfurnished ...,https://www.domai...,$525.00 pw,"[1 Bed, 1 Bath]",[1 Parking],apartment / unit ...,[Air conditioning...,1,1
303/18-20 bank pl...,how do i register...,https://www.domai...,$525 per week,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,[Built in wardrob...,1,1
819/555 flinders ...,perched high on t...,https://www.domai...,$525 per week not...,"[1 Bed, 1 Bath]",[1 Parking],apartment / unit ...,"[Ducted Cooling, ...",1,1
2704/288 spencer ...,**booking an insp...,https://www.domai...,$480 per week,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Built in wa...",1,1


In [42]:
# Define a UDF to extract number of parking spaces
def extract_parkings(parkings):
    """Return the parking-space count found in a listing's ``parking`` entries.

    Args:
        parkings: Iterable of strings such as ``['1 Parking']`` or
            ``['− Parking']``. May be ``None`` (null array column) or contain
            ``None`` entries.

    Returns:
        The integer that precedes ``Parking`` in the first entry that has
        one, or ``None`` when no entry carries a number (e.g. ``'− Parking'``).
    """
    # A null array reaches the UDF as None; iterating it would raise TypeError
    # and fail the whole Spark job, so treat it as "no entries".
    for parking in parkings or ():
        if not parking:
            continue  # skip null / empty entries inside the array
        # Number immediately before the word 'Parking'
        match = re.search(r'(\d+)\s*Parking', parking)
        if match:
            return int(match.group(1))
    return None

# Wrap the parking extractor as a Spark UDF returning an integer
extract_parkings_udf = udf(extract_parkings, returnType=IntegerType())

# Derive the 'parkings' count column from the 'parking' array column
sdf = sdf.withColumn('parkings', extract_parkings_udf(col('parking')))

# Display the frame with the new column
sdf

name,description,property_url,cost_text,rooms,parking,prop_type,additional_features,beds,baths,parkings
1001/39 queen str...,this stunning res...,https://www.domai...,$825 per week,"[3 Beds, 1 Bath]",[− Parking],apartment / unit ...,,3,1,
2817s/220 spencer...,stunning furnishe...,https://www.domai...,$800 per week,"[2 Beds, 2 Baths]",[1 Parking],apartment / unit ...,[Furnished],2,2,1.0
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,
1413/199 william ...,excellent fully f...,https://www.domai...,$650,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Balcony, Outdoor...",2,1,
2505/371 little l...,380 melbourne ele...,https://www.domai...,$550,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,,1,1,
208/547 flinders ...,***to book a time...,https://www.domai...,$525 weekly,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,,2,1,
1910/22 - 24 jane...,this unfurnished ...,https://www.domai...,$525.00 pw,"[1 Bed, 1 Bath]",[1 Parking],apartment / unit ...,[Air conditioning...,1,1,1.0
303/18-20 bank pl...,how do i register...,https://www.domai...,$525 per week,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,[Built in wardrob...,1,1,
819/555 flinders ...,perched high on t...,https://www.domai...,$525 per week not...,"[1 Bed, 1 Bath]",[1 Parking],apartment / unit ...,"[Ducted Cooling, ...",1,1,1.0
2704/288 spencer ...,**booking an insp...,https://www.domai...,$480 per week,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Built in wa...",1,1,


In [43]:
# Define a UDF to extract numeric cost from 'cost_text'
def extract_numeric_cost(cost_text):
    """Extract the whole-dollar amount from a free-text price string.

    Thousands separators are removed first. The first dollar-prefixed number
    is preferred; if the text has no ``$`` amount, the first run of digits is
    used instead. Any cents after a decimal point are dropped.

    Args:
        cost_text: Price text such as ``'$825 per week'`` or ``'$525.00 pw'``;
            may be ``None`` or empty.

    Returns:
        The amount as an ``int``, or ``None`` when the text has no digits.
    """
    if not cost_text:
        return None
    cleaned = cost_text.replace(',', '')
    # Taking the first digit run blindly picks up non-price numbers that come
    # before the price (e.g. '2 bed unit $450 pw' -> 2), so look for a
    # '$'-prefixed amount first and only then fall back to any digits.
    match = re.search(r'\$\s*(\d+)', cleaned) or re.search(r'(\d+)', cleaned)
    if match:
        return int(match.group(1))
    return None  # no digits anywhere in the text

# Wrap the cost parser as a Spark UDF returning an integer
extract_numeric_cost_udf = udf(extract_numeric_cost, returnType=IntegerType())

# Derive the numeric 'cost' column from the free-text 'cost_text' column
sdf = sdf.withColumn('cost', extract_numeric_cost_udf(col('cost_text')))

# Display the frame with the new 'cost' column
sdf

name,description,property_url,cost_text,rooms,parking,prop_type,additional_features,beds,baths,parkings,cost
1001/39 queen str...,this stunning res...,https://www.domai...,$825 per week,"[3 Beds, 1 Bath]",[− Parking],apartment / unit ...,,3,1,,825
2817s/220 spencer...,stunning furnishe...,https://www.domai...,$800 per week,"[2 Beds, 2 Baths]",[1 Parking],apartment / unit ...,[Furnished],2,2,1.0,800
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760
1413/199 william ...,excellent fully f...,https://www.domai...,$650,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Balcony, Outdoor...",2,1,,650
2505/371 little l...,380 melbourne ele...,https://www.domai...,$550,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,,1,1,,550
208/547 flinders ...,***to book a time...,https://www.domai...,$525 weekly,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,,2,1,,525
1910/22 - 24 jane...,this unfurnished ...,https://www.domai...,$525.00 pw,"[1 Bed, 1 Bath]",[1 Parking],apartment / unit ...,[Air conditioning...,1,1,1.0,525
303/18-20 bank pl...,how do i register...,https://www.domai...,$525 per week,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,[Built in wardrob...,1,1,,525
819/555 flinders ...,perched high on t...,https://www.domai...,$525 per week not...,"[1 Bed, 1 Bath]",[1 Parking],apartment / unit ...,"[Ducted Cooling, ...",1,1,1.0,525
2704/288 spencer ...,**booking an insp...,https://www.domai...,$480 per week,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Built in wa...",1,1,,480


In [44]:
# Return type of the address UDF: five nullable string fields, listed in the
# same order as the tuple the parser returns.
address_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("unit_floor", "street", "suburb", "state", "postal_code")
])

# Define the UDF to handle complex address formats, including 'suburb state postcode'
def extract_address_components(name):
    """Split a listing address into its components.

    The address is split on commas. The last part is parsed as
    ``suburb [state] postcode``. With two parts the first is the street; with
    three or more the first is reported as the unit/floor and the middle parts
    are joined back together as the street.

    Args:
        name: Address such as ``"5 Smith St, Kew VIC 3101"``; may be ``None``.

    Returns:
        A tuple ``(unit_floor, street, suburb, state, postcode)`` of
        lower-cased strings, with ``None`` for every component that could not
        be determined.
    """
    unit_floor, street, suburb, state, postcode = None, None, None, None, None

    # A null address reaches the UDF as None; re.sub would raise TypeError.
    if name is None:
        return (unit_floor, street, suburb, state, postcode)

    # Collapse any run of consecutive commas into a single comma
    name = re.sub(r",{2,}", ",", name)

    # str.split always yields at least one element, so parts[-1] is safe
    parts = [part.strip().lower() for part in name.split(',')]

    # Last part: '<suburb> [<state>] <postcode>'. The word boundaries stop the
    # state group from biting into the tail of the suburb (e.g. 'melbourne
    # 3000' must not give state 'rne'); the state is optional so the postcode
    # is still captured when the listing omits it.
    locality = parts[-1]
    tail = re.search(r'(?:\b([a-z]{2,3})\s+)?\b(\d{4})$', locality)
    if tail:
        state = tail.group(1)
        postcode = tail.group(2)
        # Everything before the match is the suburb, regardless of how much
        # whitespace separates the pieces.
        suburb = locality[:tail.start()].strip()

    if len(parts) == 2:
        # 'street, suburb state postcode'
        street = parts[0]
    elif len(parts) >= 3:
        # 'unit/floor, street..., suburb state postcode'
        unit_floor = parts[0]
        street = ', '.join(parts[1:-1])  # Combine middle parts as street

    return (unit_floor, street, suburb, state, postcode)

# Register the address parser as a UDF whose return type is address_schema
extract_address_udf = udf(extract_address_components, address_schema)

# Run the parser once per row into a single struct column, then expand the
# struct's fields into top-level columns next to the existing ones.
# The struct column itself is still part of the frame after this cell.
sdf = sdf.withColumn("AddressComponents", extract_address_udf(sdf['name']))
sdf = sdf.select(*sdf.columns, "AddressComponents.*")

In [45]:
# The struct column has been expanded into separate columns, so it is redundant
columns_to_drop = ['AddressComponents']

# Drop the listed columns from the DataFrame and display the result
sdf = sdf.drop(*columns_to_drop)
sdf

name,description,property_url,cost_text,rooms,parking,prop_type,additional_features,beds,baths,parkings,cost,unit_floor,street,suburb,state,postal_code
1001/39 queen str...,this stunning res...,https://www.domai...,$825 per week,"[3 Beds, 1 Bath]",[− Parking],apartment / unit ...,,3,1,,825,,1001/39 queen street,melbourne,vic,3000
2817s/220 spencer...,stunning furnishe...,https://www.domai...,$800 per week,"[2 Beds, 2 Baths]",[1 Parking],apartment / unit ...,[Furnished],2,2,1.0,800,,2817s/220 spencer...,melbourne,vic,3000
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000
1413/199 william ...,excellent fully f...,https://www.domai...,$650,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Balcony, Outdoor...",2,1,,650,,1413/199 william ...,melbourne,vic,3000
2505/371 little l...,380 melbourne ele...,https://www.domai...,$550,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,,1,1,,550,,2505/371 little l...,melbourne,vic,3000
208/547 flinders ...,***to book a time...,https://www.domai...,$525 weekly,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,,2,1,,525,,208/547 flinders ...,melbourne,vic,3000
1910/22 - 24 jane...,this unfurnished ...,https://www.domai...,$525.00 pw,"[1 Bed, 1 Bath]",[1 Parking],apartment / unit ...,[Air conditioning...,1,1,1.0,525,,1910/22 - 24 jane...,melbourne,vic,3000
303/18-20 bank pl...,how do i register...,https://www.domai...,$525 per week,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,[Built in wardrob...,1,1,,525,,303/18-20 bank place,melbourne,vic,3000
819/555 flinders ...,perched high on t...,https://www.domai...,$525 per week not...,"[1 Bed, 1 Bath]",[1 Parking],apartment / unit ...,"[Ducted Cooling, ...",1,1,1.0,525,,819/555 flinders ...,melbourne,vic,3000
2704/288 spencer ...,**booking an insp...,https://www.domai...,$480 per week,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Built in wa...",1,1,,480,,2704/288 spencer ...,melbourne,vic,3000


In [46]:
# Drop listings whose cost_text contains no digit at all: the pattern matches
# strings made up entirely of non-digit characters (including the empty string).
sdf = sdf.filter(~col("cost_text").rlike("^[^0-9]*$"))

In [47]:
# Remove car-space listings. A row is dropped when either
#   * its description mentions "car space" AND it has zero beds or zero baths
#     AND its cost is below 100, or
#   * its prop_type is exactly "carspace".
# NOTE(review): comparisons on null columns yield null in Spark and filter()
# only keeps rows whose condition is true, so rows with a null prop_type may
# also be dropped here — verify that no wanted listings are lost.
sdf = sdf.filter(~
    (((col("description").contains("car space")) & 
    ((col("beds") == 0) | (col("baths") == 0)) &
    (col("cost") < 100)) | (col("prop_type") == "carspace"))
)
sdf.show(truncate=False)

+---------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+-----------------------------+-----------------+-----------+-----------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+-----+--------+----+----------+-------------------------------+---------+-----+-----------

In [48]:
# Replace a missing additional_features value with an empty array so every
# row holds an array before the column is exploded / searched below.
sdf = sdf.withColumn("additional_features", coalesce(col("additional_features"), array()))

In [49]:
# Inspect listings whose extracted cost exceeds 3000 (display only; sdf is
# not modified). Presumably an outlier check — the threshold is not explained
# anywhere in the visible notebook.
sdf.filter(col("cost") > 3000).show(truncate=False)

+-----------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------+--------------------------------------------------+------------------+-----------+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+-----+--------+-------+----------+---------------------------+--------------+-----+-----------+
|name                           

In [50]:
# Sanity check of the parsed suburb names: collect the distinct values to the
# driver and print them one per line.

unique_suburbs = sdf.select("suburb").distinct().collect()

for row in unique_suburbs:
    print(row['suburb'])

cremorne
carnegie
mordialloc
mambourin
hurstbridge
noble park north
gladstone st south melbourne
gladstone park
balaclava
cranbourne west
warrandyte
box hill north
frankston north
mornington
keilor
hastings
briar hill
manton road clayton
north warrandyte
kooyong
williams landing
box hill
seaholme
pascoe vale
jacana
meadow heights
croydon
seaford
wyndham vale
craigieburn
richmond
williamstown north
st kilda
bentleigh
middle park
skye
berwick
burnside heights
mitcham
knoxfield
narre warren
lynbrook
south morang
parkville
burwood
bentleigh east
travancore
croydon south
essendon north
east melbourne
rosanna
patterson lakes
cranbourne south
brunswick
preston
johnson st balnarring
south melbourne
toorak
malvern
dandenong north
vermont south
glen huntly
spotswood
caroline springs
mill park
mooroolbark
moorabbin
deepdene
templestowe lower
upper ferntree gully
gowanbrae
nunawading
devon meadows
melbourne
sydenham
derrimut
flinders
prahran
highett
gembrook
boronia
west footscray
bayswater
oaklei

In [51]:
# How many of the most frequent additional features are kept (used as the
# limit when ranking features by count).
DIMENSION = 25

In [52]:
# Explode the additional_features array so each feature gets its own row
features_df = sdf.withColumn("feature", explode(col("additional_features")))

# Count the occurrences of each feature and keep the DIMENSION most frequent
unique_features_df = features_df.groupBy("feature").count().orderBy(col("count").desc()).limit(DIMENSION)

# Collect the names of those top DIMENSION features to the driver
unique_features = [row['feature'] for row in unique_features_df.collect()]

In [53]:
# Add one integer column per selected feature, named after the feature:
# the boolean result of array_contains on additional_features cast to integer.
for feature in unique_features:
    sdf = sdf.withColumn(feature, array_contains(col("additional_features"), feature).cast("integer"))

# Optionally, show the DataFrame with new one-hot encoded columns
sdf.show(truncate=False)

# NOTE(review): features_df is never cached or persisted in the visible cells,
# so this unpersist() likely has nothing to release; as the last expression of
# the cell, its return value (the exploded frame) is also what gets displayed.
features_df.unpersist()

+---------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+-----------------------------+-----------------+-----------+-----------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+-----+--------+----+----------+-------------------------------+---------+-----+-----------

name,description,property_url,cost_text,rooms,parking,prop_type,additional_features,beds,baths,parkings,cost,unit_floor,street,suburb,state,postal_code,feature
2817s/220 spencer...,stunning furnishe...,https://www.domai...,$800 per week,"[2 Beds, 2 Baths]",[1 Parking],apartment / unit ...,[Furnished],2,2,1.0,800,,2817s/220 spencer...,melbourne,vic,3000,Furnished
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000,Gym
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000,Intercom
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000,Built in wardrobes
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000,Dishwasher
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000,Split System Heating
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000,Split System Air ...
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000,Outdoor Entertain...
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000,Balcony
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000,Swimming Pool


In [54]:
# Function to clean column names
def clean_column_name(column_name):
    """Normalise a column name to lowercase snake_case.

    Args:
        column_name: The original column name (str).

    Returns:
        The name lowercased, with every run of non-word characters replaced
        by a single underscore and ALL leading/trailing underscores removed.
    """
    # Lowercase all characters
    clean_name = column_name.lower()
    # Collapse each run of spaces/special characters into one underscore
    clean_name = re.sub(r'\W+', '_', clean_name)
    # Strip every edge underscore: the previous `^_|_$` pattern removed only
    # one per side, so inputs such as " _Foo" still came out as "_foo".
    return clean_name.strip('_')

# Snapshot the existing header, then map every name through the cleaner
current_columns = sdf.columns
new_columns = list(map(clean_column_name, current_columns))

# toDF replaces the full set of column names positionally
sdf = sdf.toDF(*new_columns)

# Display the DataFrame with its renamed columns
sdf

name,description,property_url,cost_text,rooms,parking,prop_type,additional_features,beds,baths,parkings,cost,unit_floor,street,suburb,state,postal_code,built_in_wardrobes,dishwasher,air_conditioning,floorboards,secure_parking,intercom,heating,ensuite,balcony_deck,gym,furnished,close_to_shops,close_to_transport,fully_fenced,close_to_schools,ducted_heating,split_system_heating,swimming_pool,remote_garage,balcony,study,garden_courtyard,pets_allowed,internal_laundry,alarm_system
1001/39 queen str...,this stunning res...,https://www.domai...,$825 per week,"[3 Beds, 1 Bath]",[− Parking],apartment / unit ...,[],3,1,,825,,1001/39 queen street,melbourne,vic,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2817s/220 spencer...,stunning furnishe...,https://www.domai...,$800 per week,"[2 Beds, 2 Baths]",[1 Parking],apartment / unit ...,[Furnished],2,2,1.0,800,,2817s/220 spencer...,melbourne,vic,3000,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3207/80 a'beckett...,the my80 developm...,https://www.domai...,$760/w furnished,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Intercom, B...",2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000,1,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0
1413/199 william ...,excellent fully f...,https://www.domai...,$650,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,"[Balcony, Outdoor...",2,1,,650,,1413/199 william ...,melbourne,vic,3000,1,1,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0
2505/371 little l...,380 melbourne ele...,https://www.domai...,$550,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,[],1,1,,550,,2505/371 little l...,melbourne,vic,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
208/547 flinders ...,***to book a time...,https://www.domai...,$525 weekly,"[2 Beds, 1 Bath]",[− Parking],apartment / unit ...,[],2,1,,525,,208/547 flinders ...,melbourne,vic,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1910/22 - 24 jane...,this unfurnished ...,https://www.domai...,$525.00 pw,"[1 Bed, 1 Bath]",[1 Parking],apartment / unit ...,[Air conditioning...,1,1,1.0,525,,1910/22 - 24 jane...,melbourne,vic,3000,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
303/18-20 bank pl...,how do i register...,https://www.domai...,$525 per week,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,[Built in wardrob...,1,1,,525,,303/18-20 bank place,melbourne,vic,3000,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
819/555 flinders ...,perched high on t...,https://www.domai...,$525 per week not...,"[1 Bed, 1 Bath]",[1 Parking],apartment / unit ...,"[Ducted Cooling, ...",1,1,1.0,525,,819/555 flinders ...,melbourne,vic,3000,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
2704/288 spencer ...,**booking an insp...,https://www.domai...,$480 per week,"[1 Bed, 1 Bath]",[− Parking],apartment / unit ...,"[Gym, Built in wa...",1,1,,480,,2704/288 spencer ...,melbourne,vic,3000,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0


In [55]:
# Inspect which property types occur in the data
sdf.select("prop_type").distinct()

prop_type
new house & land
villa
new apartments / ...
apartment / unit ...
block of units
duplex
townhouse
semi-detached
studio
terrace


In [56]:
# Encode the categorical 'prop_type' strings as a numeric 'prop_type_index' column
indexer = StringIndexer(inputCol="prop_type", outputCol="prop_type_index")

# Learn the label -> index mapping from the data, then apply it
indexer_model = indexer.fit(sdf)
sdf = indexer_model.transform(sdf)

# Preview the original label next to its assigned index
sdf.select("prop_type", "prop_type_index").show(truncate=False)

+-----------------------+---------------+
|prop_type              |prop_type_index|
+-----------------------+---------------+
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|studio                 |3.0            |
|studio                 |3.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
|apartment / unit / flat|0.0            |
+-----------------------+---------

In [57]:
# Source columns that are not carried forward into the curated dataset
columns_to_drop = [
    'cost_text',
    'rooms',
    'parking',
    'description',
    'prop_type',
    'additional_features',
]

# Keep every other column, preserving the existing column order
kept_columns = [c for c in sdf.columns if c not in columns_to_drop]
sdf = sdf.select(*kept_columns)

# Display the trimmed DataFrame
sdf

name,property_url,beds,baths,parkings,cost,unit_floor,street,suburb,state,postal_code,built_in_wardrobes,dishwasher,air_conditioning,floorboards,secure_parking,intercom,heating,ensuite,balcony_deck,gym,furnished,close_to_shops,close_to_transport,fully_fenced,close_to_schools,ducted_heating,split_system_heating,swimming_pool,remote_garage,balcony,study,garden_courtyard,pets_allowed,internal_laundry,alarm_system,prop_type_index
1001/39 queen str...,https://www.domai...,3,1,,825,,1001/39 queen street,melbourne,vic,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
2817s/220 spencer...,https://www.domai...,2,2,1.0,800,,2817s/220 spencer...,melbourne,vic,3000,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
3207/80 a'beckett...,https://www.domai...,2,1,,760,,3207/80 a'beckett...,melbourne,vic,3000,1,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0.0
1413/199 william ...,https://www.domai...,2,1,,650,,1413/199 william ...,melbourne,vic,3000,1,1,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0.0
2505/371 little l...,https://www.domai...,1,1,,550,,2505/371 little l...,melbourne,vic,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
208/547 flinders ...,https://www.domai...,2,1,,525,,208/547 flinders ...,melbourne,vic,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
1910/22 - 24 jane...,https://www.domai...,1,1,1.0,525,,1910/22 - 24 jane...,melbourne,vic,3000,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
303/18-20 bank pl...,https://www.domai...,1,1,,525,,303/18-20 bank place,melbourne,vic,3000,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0
819/555 flinders ...,https://www.domai...,1,1,1.0,525,,819/555 flinders ...,melbourne,vic,3000,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0.0
2704/288 spencer ...,https://www.domai...,1,1,,480,,2704/288 spencer ...,melbourne,vic,3000,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0.0


In [58]:
# Persist the curated listings as Parquet, replacing any previous output
sdf.write.mode("overwrite").parquet("../data/curated/domain_data.parquet")

                                                                                

In [59]:
# Load the per-property location statistics CSV from the curated data folder.
# NOTE(review): the output recorded for this cell is a PATH_NOT_FOUND
# AnalysisException, yet the following cells still use `properties_df` --
# presumably left over from an earlier kernel run. Confirm the file exists at
# this path before relying on Restart & Run All.
properties_df = spark.read.csv(
    "../data/curated/properties_stats.csv",
    header=True,  # Assumes first row is header
    inferSchema=True  # Infers the input schema automatically from data
)
properties_df.show(truncate=False)

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/home/anshan/project-2-group-real-estate-industry-project-21/data/curated/properties_stats.csv.

In [281]:
# Row count of the location-statistics table
properties_df.count()

237

In [282]:
# List the distinct categories present in the "Location Type" column
properties_df.select("Location Type").distinct()

Location Type
parks
schools
supermarkets
hospitals
shopping_districts
CBD
train_stations


In [283]:
# Source column -> alias for each per-(URL, location type) average
mean_aliases = {
    "Location Latitude": "Average Latitude",
    "Location Longitude": "Average Longitude",
    "Travel Time (minutes)": "Average Travel Time",
    "Travel Distance (km)": "Average Travel Distance",
}
mean_exprs = [F.mean(source).alias(alias) for source, alias in mean_aliases.items()]

# Average the location metrics per property URL and location type, then derive
# the product of average travel time and average travel distance.
grouped_df = (
    properties_df
    .groupBy("URLS", "Location Type")
    .agg(*mean_exprs)
    .orderBy("URLS")
    .withColumn(
        "Time_Distance_Product",
        F.col("Average Travel Time") * F.col("Average Travel Distance"),
    )
)

grouped_df.show(truncate=False)

+-----------------------------------------------------------------------------------------+------------------+-------------------+------------------+-------------------+-----------------------+---------------------+
|URLS                                                                                     |Location Type     |Average Latitude   |Average Longitude |Average Travel Time|Average Travel Distance|Time_Distance_Product|
+-----------------------------------------------------------------------------------------+------------------+-------------------+------------------+-------------------+-----------------------+---------------------+
|https://www.domain.com.au/17-246-albert-street-east-melbourne-vic-3002-17146354          |schools           |-37.81125945       |144.99623920000002|5.98               |2.8899999999999997     |17.2822              |
|https://www.domain.com.au/17-246-albert-street-east-melbourne-vic-3002-17146354          |hospitals         |-37.81072065714285 |144.98

In [284]:
# Explicit pivot values fix the output column set and order
location_types = [
    "parks",
    "schools",
    "supermarkets",
    "hospitals",
    "shopping_districts",
    "CBD",
    "train_stations",
]

# One row per URL, one column per location type holding its Time_Distance_Product
pivot_df = (
    grouped_df
    .groupBy("URLS")
    .pivot("Location Type", location_types)
    .agg(F.first("Time_Distance_Product"))
)
pivot_df

URLS,parks,schools,supermarkets,hospitals,shopping_districts,CBD,train_stations
https://www.domai...,,17.2822,3.911882,0.978061224489796,,14.161,4.3152
https://www.domai...,,4.4407,3.16755375,2.0303250000000004,2.4369125,2.4232,4.4897
https://www.domai...,,4.1629000000000005,3.86,3.768220408163265,,7.5992,2.3484
https://www.domai...,,4.76259375,1.7990628257887509,,1.5214833333333335,0.4428,9.2718
https://www.domai...,4.352,3.932499999999999,2.5041088435374146,,1.7250800000000002,1.1448,6.3879
https://www.domai...,,100.18506944444444,1.3477383742911149,,1.768536,1.0675,
https://www.domai...,4.925066666666666,4.775725,3.5125017301038057,,4.741708,5.1561,7.33935
https://www.domai...,6.216900000000001,179.5314888888889,3.5388396694214883,1.90850625,,17.232400000000002,4.5499


In [285]:
# Left-join the pivoted location features (pivot_df) onto the listings by URL;
# listings without a match keep nulls in the location columns.
url_match = sdf["property_url"] == pivot_df["URLS"]
joined_df = sdf.join(pivot_df, on=url_match, how="left").drop("URLS")
joined_df

                                                                                

name,property_url,beds,baths,parkings,cost,unit_floor,street,suburb,state,postal_code,built_in_wardrobes,dishwasher,air_conditioning,floorboards,secure_parking,intercom,heating,ensuite,balcony_deck,gym,furnished,close_to_shops,close_to_transport,fully_fenced,close_to_schools,ducted_heating,split_system_heating,swimming_pool,remote_garage,balcony,study,garden_courtyard,pets_allowed,internal_laundry,alarm_system,prop_type_index,parks,schools,supermarkets,hospitals,shopping_districts,CBD,train_stations
1001/39 queen str...,https://www.domai...,3,1,,825,,1001/39 queen street,melbourne v,vic,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,,,,,,,
2817s/220 spencer...,https://www.domai...,2,2,1.0,800,,2817s/220 spencer...,melbourne v,vic,3000,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,,,,,,,
3207/80 a'beckett...,https://www.domai...,2,1,,760,,3207/80 a'beckett...,melbourne v,vic,3000,1,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0.0,,,,,,,
1413/199 william ...,https://www.domai...,2,1,,650,,1413/199 william ...,melbourne v,vic,3000,1,1,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0.0,,,,,,,
2505/371 little l...,https://www.domai...,1,1,,550,,2505/371 little l...,melbourne v,vic,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,,,,,,,
1910/22 - 24 jane...,https://www.domai...,1,1,1.0,525,,1910/22 - 24 jane...,melbourne v,vic,3000,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,,,,,,,
208/547 flinders ...,https://www.domai...,2,1,,525,,208/547 flinders ...,melbourne v,vic,3000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,,,,,,,
303/18-20 bank pl...,https://www.domai...,1,1,,525,,303/18-20 bank place,melbourne v,vic,3000,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,,,,,,,
2704/288 spencer ...,https://www.domai...,1,1,,480,,2704/288 spencer ...,melbourne v,vic,3000,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0.0,,,,,,,
819/555 flinders ...,https://www.domai...,1,1,1.0,525,,819/555 flinders ...,melbourne v,vic,3000,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0.0,,,,,,,


In [286]:
# Persist the joined listings + location features as Parquet, replacing any previous output
joined_df.write.mode("overwrite").parquet("../data/curated/prop_data.parquet")

                                                                                

## NULL ANALYSIS

In [296]:
# Reload the joined property dataset that was written to Parquet earlier in the notebook
sdf = spark.read.parquet("../data/curated/prop_data.parquet")

In [297]:
# Count the nulls in each column of the freshly reloaded `sdf`.
# (This previously inspected `joined_df`, a variable left over from the join
# cell, instead of the DataFrame this section actually loads and cleans.)
# `when(isnull(c), c)` yields a non-null literal only for null rows, so
# `count` tallies exactly the null entries of column `c`.
null_counts = sdf.select([count(when(isnull(c), c)).alias(c) for c in sdf.columns])

# Display the count of nulls per column
null_counts.show(truncate=False)

[Stage 965:>                                                        (0 + 1) / 1]

+----+------------+----+-----+--------+----+----------+------+------+-----+-----------+------------------+----------+----------------+-----------+--------------+--------+-------+-------+------------+---+---------+--------------+------------------+------------+----------------+--------------+--------------------+-------------+-------------+-------+-----+----------------+------------+----------------+------------+---------------+-----+-------+------------+---------+------------------+-----+--------------+
|name|property_url|beds|baths|parkings|cost|unit_floor|street|suburb|state|postal_code|built_in_wardrobes|dishwasher|air_conditioning|floorboards|secure_parking|intercom|heating|ensuite|balcony_deck|gym|furnished|close_to_shops|close_to_transport|fully_fenced|close_to_schools|ducted_heating|split_system_heating|swimming_pool|remote_garage|balcony|study|garden_courtyard|pets_allowed|internal_laundry|alarm_system|prop_type_index|parks|schools|supermarkets|hospitals|shopping_districts|CBD 

                                                                                

In [300]:
location_columns = ["parks", "schools", "supermarkets", "hospitals", "shopping_districts", "CBD", "train_stations"]

# Fixed base seed so the random imputation below can be reproduced between runs.
# NOTE: Spark's rand() is only repeatable while the partitioning of `sdf` is unchanged.
IMPUTE_SEED = 42

# Compute min and max of every location column in ONE aggregation pass
# (the per-column agg() calls used before launched two Spark jobs per column).
bounds_row = sdf.agg(
    *[F.min(c).alias(f"{c}__min") for c in location_columns],
    *[F.max(c).alias(f"{c}__max") for c in location_columns],
).first()
ranges = {
    c: (bounds_row[f"{c}__min"], bounds_row[f"{c}__max"])
    for c in location_columns
}

# Replace each null with a uniform random draw from that column's observed [min, max] range
for seed_offset, (column, (min_val, max_val)) in enumerate(ranges.items()):
    # Use a distinct seed per column so the imputed values are not identical across columns
    random_in_range = lit(min_val) + (lit(max_val) - lit(min_val)) * rand(seed=IMPUTE_SEED + seed_offset)
    sdf = sdf.withColumn(
        column,
        when(col(column).isNull(), random_in_range).otherwise(col(column)),
    )

# Show the DataFrame to confirm changes
sdf.show(truncate=False)

+---------------------------------------------------+-------------------------------------------------------------------------------------+----+-----+--------+----+----------+-------------------------------+-----------+-----+-----------+------------------+----------+----------------+-----------+--------------+--------+-------+-------+------------+---+---------+--------------+------------------+------------+----------------+--------------+--------------------+-------------+-------------+-------+-----+----------------+------------+----------------+------------+---------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|name                                               |property_url                                                                         |beds|baths|parkings|cost|unit_floor|street                         |suburb     |state|postal_code|built_in_wardrobes|dishwasher|air_conditioning|f

In [305]:
# Default values substituted for nulls in the listed columns.
# NOTE(review): fillna skips columns whose type does not match the fill value --
# confirm these columns have the expected numeric/string types.
fill_defaults = {
    'unit_floor': 1,
    'beds': 1,
    'baths': 1,
    'parkings': 1,
    'street': 'empty',
}
sdf = sdf.fillna(fill_defaults)

In [306]:
# Build one null-counting aggregate per column, then evaluate them in a single select
null_count_exprs = [
    count(when(isnull(name), name)).alias(name)
    for name in sdf.columns
]
null_counts = sdf.select(null_count_exprs)

# Print the per-column null totals without truncating the header
null_counts.show(truncate=False)

+----+------------+----+-----+--------+----+----------+------+------+-----+-----------+------------------+----------+----------------+-----------+--------------+--------+-------+-------+------------+---+---------+--------------+------------------+------------+----------------+--------------+--------------------+-------------+-------------+-------+-----+----------------+------------+----------------+------------+---------------+-----+-------+------------+---------+------------------+---+--------------+
|name|property_url|beds|baths|parkings|cost|unit_floor|street|suburb|state|postal_code|built_in_wardrobes|dishwasher|air_conditioning|floorboards|secure_parking|intercom|heating|ensuite|balcony_deck|gym|furnished|close_to_shops|close_to_transport|fully_fenced|close_to_schools|ducted_heating|split_system_heating|swimming_pool|remote_garage|balcony|study|garden_courtyard|pets_allowed|internal_laundry|alarm_system|prop_type_index|parks|schools|supermarkets|hospitals|shopping_districts|CBD|tr

In [307]:
# Persist the null-free dataset as Parquet, replacing any previous output
sdf.write.mode("overwrite").parquet("../data/curated/prop_data_cleaned.parquet")

In [308]:
# Shut down the Spark session now that the cleaned data has been written
spark.stop()