This notebook preprocesses the current property listings from Domain.

Running all the cells in this notebook saves the preprocessed dataset to the raw layer at '../data/2. raw/cleaned_domain_current_listings.parquet'.

Before running this notebook, make sure you have:
- 'domain_current_listings.parquet' in the landing layer, i.e. at '../data/1. landing/domain_current_listings.parquet'.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, regexp_extract, regexp_replace
import pandas as pd

In [None]:
# Build (or reuse) the SparkSession for this notebook.
# The options are kept in one mapping so they are easy to scan and tune.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
}

spark_builder = SparkSession.builder.appName("Project 2")
for option_name, option_value in spark_options.items():
    spark_builder = spark_builder.config(option_name, option_value)

spark = spark_builder.getOrCreate()

In [3]:
# Load the landing-layer listings and preview the first five rows
landing_path = '../data/1. landing/domain_current_listings.parquet'
domain_current = spark.read.parquet(landing_path)
domain_current.limit(5)

                                                                                

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197


In [4]:
# How many listings were loaded from the landing layer?
domain_current.count()

14199

In [5]:
# Show each column's name, data type and nullability
domain_current.printSchema()

root
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- car_parks: integer (nullable = true)
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- cost_text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)



In [6]:
# Detect missing values in the string columns.
# The result is a tuple with one null count per column, in the order listed below.
string_columns = ['name', 'cost_text', 'type', 'latitude', 'longitude']
tuple(
    domain_current.where(domain_current[column_name].isNull()).count()
    for column_name in string_columns
)

(0, 0, 0, 0, 0)

In [7]:
# Discard listings with a null in any of the room/parking count columns,
# then report how many listings remain
room_count_columns = ['bedrooms', 'bathrooms', 'car_parks']
domain_current = domain_current.dropna(subset=room_count_columns)
domain_current.count()

14166

In [8]:
# Create a new column 'suburb' from the second comma-separated segment of 'name'.
# As the preview shows, the value still carries the state and postcode
# (e.g. 'Docklands VIC 3008'), not the suburb name alone.
# NOTE(review): a name with more or fewer than one comma would yield a different
# segment (or null) here — confirm the address format is uniform.
name_segments = split(domain_current['name'], ',')
domain_current = domain_current.withColumn('suburb', name_segments[1])
domain_current.limit(5)

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794,Docklands VIC 3008
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928,Watsonia VIC 3087
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622,South Yarra VIC ...
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803,Geelong VIC 3220
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197,Prahran VIC 3181


In [9]:
# Create a new column 'postcode' holding the first standalone 4-digit number found in 'suburb'.
# Group index 0 selects the whole regex match; the result is kept as a string.
postcode_pattern = r'\b(\d{4})\b'
domain_current = domain_current.withColumn(
    'postcode',
    regexp_extract(domain_current['suburb'], postcode_pattern, 0),
)
domain_current.limit(5)

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb,postcode
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794,Docklands VIC 3008,3008
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928,Watsonia VIC 3087,3087
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622,South Yarra VIC ...,3141
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803,Geelong VIC 3220,3220
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197,Prahran VIC 3181,3181


In [10]:
# Create a new column 'rent_pw' from 'cost_text':
#   1. strip commas so that '$1,150/week' reads as '1150'
#   2. take the first standalone run of digits (group index 0 = whole match)
# The result is still a string at this point.
cost_without_commas = regexp_replace(domain_current['cost_text'], ',', '')
domain_current = domain_current.withColumn(
    'rent_pw',
    regexp_extract(cost_without_commas, r'\b(\d+)\b', 0),
)
domain_current.limit(5)

bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb,postcode,rent_pw
2,1,1,https://www.domai...,705/8 Marmion Pla...,$600 per week,Apartment / Unit ...,-37.8134708,144.9424794,Docklands VIC 3008,3008,600
3,2,2,https://www.domai...,5/18-20 Ibbottson...,$650 Per Week,Townhouse,-37.70987239999999,145.0844928,Watsonia VIC 3087,3087,650
3,2,1,https://www.domai...,2109/35 Malcolm S...,"$1,150/week",Apartment / Unit ...,-37.8369683,144.9964622,South Yarra VIC ...,3141,1150
2,1,1,https://www.domai...,4/17a The Esplana...,$475.00 per week,Apartment / Unit ...,-38.1345686,144.3548803,Geelong VIC 3220,3220,475
3,2,2,https://www.domai...,501/446 Malvern R...,$2200 Per Week,Apartment / Unit ...,-37.8479885,145.0012197,Prahran VIC 3181,3181,2200


In [11]:
# Cast 'rent_pw' from string to integer and confirm the new type in the schema
rent_as_int = domain_current['rent_pw'].cast('int')
domain_current = domain_current.withColumn('rent_pw', rent_as_int)
domain_current.printSchema()

root
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- car_parks: integer (nullable = true)
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- cost_text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- suburb: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- rent_pw: integer (nullable = true)



In [12]:
# Detect how many listings ended up with a null 'rent_pw' after the integer cast
rent_is_missing = domain_current['rent_pw'].isNull()
domain_current.where(rent_is_missing).count()

838

In [13]:
# Drop the listings whose 'rent_pw' is null, then report how many listings remain
rent_columns = ['rent_pw']
domain_current = domain_current.dropna(subset=rent_columns)
domain_current.count()

13328

In [14]:
# Collect the Spark DataFrame into a pandas DataFrame.
# From this cell onwards 'domain_current' is a pandas object, not a Spark one.
domain_current = domain_current.toPandas()

In [None]:
# Lift the row limit so the full list of matches is displayed
pd.set_option('display.max_rows', None)

# Flag listings whose 'cost_text' mentions 'annual', 'month' or 'season' (case-insensitive)
non_weekly_terms = 'annual|month|season'
mentions_non_weekly = domain_current['cost_text'].str.contains(non_weekly_terms, case=False)
domain_current[mentions_non_weekly]

In [16]:
# Manually correct 'rent_pw'.
# Maps the row label in the pandas DataFrame to the corrected weekly rent.
# NOTE(review): these labels are positions in the frame produced by toPandas(),
# so they presumably only point at the intended listings while the upstream row
# set and order are unchanged — re-check them against the rows listed by the
# 'annual|month|season' detection cell whenever the landing data is refreshed.
rent_corrections = {
    25: 507,
    154: 350,
    473: 70,
    927: 70,
    1795: 3850,
    1827: 450,
    1912: 608,
    1929: 76,
    2308: 627,
    2693: 18,
    2865: 76,
    3514: 608,
    3756: 51,
    3813: 558,
    4004: 455,
    4081: 35,
    4372: 29,
    4446: 456,
    5021: 637,
    5167: 51,
    6169: 58,
    6223: 26,
    6769: 425,
    6996: 54,
    7515: 558,
    7564: 70,
    7658: 3500,
    7979: 558,
    8501: 58,
    9351: 3850,
    9808: 747,
    10585: 3500,
    10684: 627,
    11143: 456,
    11411: 54,
    11427: 61,
    11536: 56,
    11664: 425,
    11998: 26,
    12448: 70,
    12571: 506,
    12826: 3850,
    12996: 517,
}

# Assigning through .loc to a label that does not exist silently appends a new row
# whose other columns are missing, so fail loudly instead of corrupting the data.
missing_labels = [label for label in rent_corrections if label not in domain_current.index]
if missing_labels:
    raise KeyError(f"Row labels not found in domain_current; corrections are stale: {missing_labels}")

for row_label, weekly_rent in rent_corrections.items():
    domain_current.loc[row_label, 'rent_pw'] = weekly_rent

In [17]:
# Show the distinct values that appear in the 'type' column
domain_current['type'].unique()

array(['Apartment / Unit / Flat', 'Townhouse', 'House', 'Studio', 'Villa',
       'Terrace', 'Duplex', 'Carspace', 'New House & Land',
       'Semi-Detached', 'Acreage / Semi-Rural',
       'New Apartments / Off the Plan', 'New land', 'Block of Units'],
      dtype=object)

In [18]:
# Keep only the listings whose 'type' does not mention 'Carspace' (case-insensitive)
is_carspace = domain_current['type'].str.contains('Carspace', case=False)
domain_current = domain_current[~is_carspace]

In [None]:
# Convert the cleaned pandas DataFrame back to Spark and write it to the raw layer,
# replacing any output left by a previous run
raw_output_path = '../data/2. raw/cleaned_domain_current_listings.parquet'
domain_current = spark.createDataFrame(domain_current)
domain_current.write.mode('overwrite').parquet(raw_output_path)