In [239]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, regexp_extract, regexp_replace

In [240]:
# Create a SparkSession
spark = (SparkSession.builder.appName("Project 2")
         .config("spark.sql.repl.eagerEval.enabled", True)
         .config("spark.sql.parquet.cacheMetadata", "true")
         .config("spark.sql.session.timeZone", "Etc/UTC")
         .getOrCreate()
)

In [241]:
# Read the parquet file 
domain_current = spark.read.parquet('../data/raw/domain_current_listings.parquet')
domain_current.limit(5)

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197


In [242]:
# Count the number of properties
domain_current.count()

14199

In [243]:
# Print the data types of the columns
domain_current.printSchema()

root
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- car_parks: integer (nullable = true)
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- cost_text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)



In [244]:
# Dectect the rows that have missing values in the columns 'name', 'cost_text', 'type', 'latitude' and 'longitude'
domain_current.where(domain_current['name'].isNull()).count(), domain_current.where(domain_current['cost_text'].isNull()).count(), domain_current.where(domain_current['type'].isNull()).count(), domain_current.where(domain_current['latitude'].isNull()).count(), domain_current.where(domain_current['longitude'].isNull()).count()

(0, 0, 0, 0, 0)

In [245]:
# Drop the rows that have missing values in the columns 'bedrooms', 'bathrooms' and 'car_parks'
domain_current = domain_current.dropna(subset=['bedrooms', 'bathrooms', 'car_parks'])
domain_current.count()

14166

In [246]:
# Creat a new column 'suburb' by extracting the suburbs from 'name'
domain_current = domain_current.withColumn('suburb', split(domain_current['name'], ',')[1])
domain_current.limit(5)

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794,Docklands VIC 3008
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928,Watsonia VIC 3087
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622,South Yarra VIC ...
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803,Geelong VIC 3220
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197,Prahran VIC 3181


In [247]:
# Create a new column 'postcode' by extracting the postcodes from 'suburb'
domain_current = domain_current.withColumn('postcode', regexp_extract(domain_current['suburb'], r'\b(\d{4})\b', 0))
domain_current.limit(5)

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb,postcode
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794,Docklands VIC 3008,3008
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928,Watsonia VIC 3087,3087
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622,South Yarra VIC ...,3141
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803,Geelong VIC 3220,3220
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197,Prahran VIC 3181,3181


In [248]:
# Create a new column 'rent_pw' by extracting the rents from 'cost_text'
domain_current = domain_current.withColumn('rent_pw', regexp_extract(regexp_replace(domain_current['cost_text'], ',', ''), r'\b(\d+)\b', 0))
domain_current.limit(5)

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb,postcode,rent_pw
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794,Docklands VIC 3008,3008,600
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928,Watsonia VIC 3087,3087,650
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622,South Yarra VIC ...,3141,1150
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803,Geelong VIC 3220,3220,475
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197,Prahran VIC 3181,3181,2200


In [249]:
# Convert the data type of 'rent_pw' to integer
domain_current = domain_current.withColumn('rent_pw', domain_current['rent_pw'].cast('int'))
domain_current.printSchema()

root
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- car_parks: integer (nullable = true)
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- cost_text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- suburb: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- rent_pw: integer (nullable = true)



In [250]:
# Dectect the rows that have missing values in the column 'rent_pw'
domain_current.where(domain_current['rent_pw'].isNull()).count()

838

In [251]:
# Drop the the rows that have missing values in the column 'rent_pw'
domain_current = domain_current.dropna(subset=['rent_pw'])
domain_current.count()

13328

In [252]:
# Save the cleaned data to a parquet file
domain_current.write.mode('overwrite').parquet('../data/landing/cleaned_domain_current_listings.parquet')

In [253]:
# Aggregate the properties by 'postcode', calculate the average rent per week, and ordered by average rent per week in descending order
avg_rent_per_suburb = domain_current.groupBy('postcode').avg('rent_pw').orderBy('avg(rent_pw)', ascending=False)
# Correct the average rent per week to 2 decimal places
avg_rent_per_suburb = avg_rent_per_suburb.withColumn('avg(rent_pw)', avg_rent_per_suburb['avg(rent_pw)'].cast('decimal(7,2)'))
avg_rent_per_suburb.show(10)


+--------+------------+
|postcode|avg(rent_pw)|
+--------+------------+
|    3699|    39823.33|
|    3730|     6901.56|
|    3722|     2763.40|
|    3428|     1800.00|
|    3564|     1733.07|
|    3953|     1560.60|
|    3233|     1553.89|
|    3944|     1412.50|
|    3186|     1357.15|
|    3113|     1275.00|
+--------+------------+
only showing top 10 rows



In [254]:
# Aggregate the properties by 'type', calculate the average rent per week, and ordered by average rent per week in descending order
avg_rent_per_type = domain_current.groupBy('type').avg('rent_pw').orderBy('avg(rent_pw)', ascending=False)
# Correct the average rent per week to 2 decimal places
avg_rent_per_type = avg_rent_per_type.withColumn('avg(rent_pw)', avg_rent_per_type['avg(rent_pw)'].cast('decimal(6,2)'))
avg_rent_per_type.show()
avg_rent_per_type.count()

+--------------------+------------+
|                type|avg(rent_pw)|
+--------------------+------------+
|    New House & Land|     6768.53|
|Acreage / Semi-Rural|      886.46|
|             Terrace|      794.55|
|              Duplex|      771.25|
|           Townhouse|      766.33|
|New Apartments / ...|      734.60|
|       Semi-Detached|      661.43|
|               House|      651.68|
|Apartment / Unit ...|      637.10|
|               Villa|      571.96|
|      Block of Units|      450.00|
|              Studio|      416.19|
|            Carspace|      202.69|
|            New land|      200.00|
+--------------------+------------+



14