Jupyter notebook that walks through cleaning the property listing data.

In [148]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession, functions as F
import os
# Build (or reuse) the Spark session that runs every Spark job in this notebook.
# Settings are listed in one table so they can be read and tuned at a glance.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,      # render DataFrames when a cell ends with one
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
    "spark.executor.memory": "2g",
}

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [149]:
# Read the landing-layer property listings and record the starting row count,
# so the number of rows removed by cleaning can be measured later.
landing_path = '../data/landing/property_data.parquet'
df = spark.read.parquet(landing_path)
initial_instances = df.count()

In [150]:
import re
pdf = df.toPandas()

# Drop rows whose cost_text does not contain a price.
# A price needs at least one digit. The previous pattern ([\d\.]+) also accepted
# a lone "." (e.g. "Contact agent."), and re.search raised TypeError on missing
# values. `na=False` treats missing/non-string cost_text as "no price".
has_price = pdf['cost_text'].str.contains(r"\d", regex=True, na=False)
pdf = pdf[has_price]

pdf.head(100)

Unnamed: 0,url,postcode,suburb,name,cost_text,beds,baths,parking,property_type
0,https://www.domain.com.au/-leased-3-yarra-stre...,3141,south-yarra,"(Leased) 3 Yarra Street, South Yarra VIC 3141",$460,1,1,1,Apartment / Unit / Flat
1,https://www.domain.com.au/04-390-burwood-highw...,3125,burwood,"04/390 Burwood Highway, Burwood VIC 3125","$310 per week, with AC",1,1,0,Apartment / Unit / Flat
2,https://www.domain.com.au/1-2-32-folkstone-cre...,3156,ferntree-gully,"1 & 2/32 Folkstone Crescent, Ferntree Gully VI...",$800,4,3,1,House
3,https://www.domain.com.au/1-acacia-street-torq...,3228,torquay,"1 Acacia Street, Torquay VIC 3228","$1,000 per week",4,2,2,House
4,https://www.domain.com.au/1-aleppo-crescent-fr...,3200,frankston-north,"1 Aleppo Crescent, Frankston North VIC 3200",$450,3,1,1,House
...,...,...,...,...,...,...,...,...,...
97,https://www.domain.com.au/1-17-view-street-pas...,3044,pascoe-vale,"1/17 View Street, Pascoe Vale VIC 3044",$690pw / $2998pcm,3,2,1,Townhouse
98,https://www.domain.com.au/1-177-widford-street...,3047,broadmeadows,"1/177 Widford Street, Broadmeadows VIC 3047",$550 Per Week,3,2,2,House
99,https://www.domain.com.au/1-18-arnott-street-c...,3168,clayton,"1/18 Arnott Street, Clayton VIC 3168",$280 per week,1,1,1,House
100,https://www.domain.com.au/1-18-arthur-street-s...,3141,south-yarra,"1/18 Arthur Street, South Yarra VIC 3141",$750pw,2,2,1,Apartment / Unit / Flat


In [151]:
# Hand the pandas-filtered listings back to Spark for the column-wise cleaning below.
sdf = spark.createDataFrame(data=pdf)

In [152]:
# Derive a numeric weekly rent (cost_pw) and a furnished flag from the free-text
# cost_text column.
#
# The previous version of this cell extracted the price but then overwrote it:
# the per-annum and per-month steps divided the raw cost_text string (which
# casts to null), the per-month step discarded the per-annum result, and the
# final "strip every non-digit" step glued all digits in the text together
# (e.g. "$500 - $550" -> 500550). Here the conversion is done once, on the
# extracted number.

WEEKS_PER_YEAR = 52
MONTHS_PER_YEAR = 12

# A number with optional thousands separators and decimals, e.g. 1,000 or 460.50
PRICE_PATTERN = r"(\d+(?:,\d+)*(?:\.\d+)?)"

cost_lower = F.lower(F.col('cost_text'))

# Billing-period markers found in the listing text.
is_weekly = (
    cost_lower.contains('pw')
    | cost_lower.contains('p/w')
    | cost_lower.contains('week')
)
is_monthly = cost_lower.contains('month') | cost_lower.contains('pcm')
# "pa" / "p.a" / "p/a" / "per annum" / "per_annum", but only when not embedded in
# a longer word: a plain substring test for "pa" also matches "parking", "space", ...
is_annual = cost_lower.rlike(r"(?<![a-z])(?:p\.?a|p/a|per[ _]annum)(?![a-z])")

periods_mentioned = (
    is_weekly.cast('int') + is_monthly.cast('int') + is_annual.cast('int')
)

# Prefer the first figure written with a "$"; fall back to the first number of
# any kind. regexp_extract returns "" when nothing matches, which casts to null.
dollar_figure = F.regexp_extract(F.col('cost_text'), r"\$\s*" + PRICE_PATTERN, 1)
any_figure = F.regexp_extract(F.col('cost_text'), PRICE_PATTERN, 1)
first_price = F.regexp_replace(
    F.when(dollar_figure != '', dollar_figure).otherwise(any_figure), ',', ''
).cast('double')

weekly_rent = (
    # Text quoting more than one billing period is ambiguous: leave it null.
    F.when(periods_mentioned > 1, F.lit(None).cast('double'))
    # Priced per annum: spread over 52 weeks.
    .when(is_annual, F.round(first_price / WEEKS_PER_YEAR, 2))
    # Priced per calendar month: 12 months over 52 weeks (not simply / 4).
    .when(is_monthly, F.round(first_price * MONTHS_PER_YEAR / WEEKS_PER_YEAR, 2))
    # Weekly, or no period stated: take the figure as the weekly rent.
    .otherwise(first_price)
)

raw_sdf = sdf.withColumn(
    # weekly rent as a float; computed before cost_text is modified below
    'cost_pw',
    weekly_rent.cast('float')
).withColumn(
    # removes decimal places from the text (kept so cost_text is unchanged from
    # what this cell produced before)
    'cost_text',
    F.regexp_replace(F.col("cost_text"), r"\.\d+", "")
).withColumn(
    # see if property is furnished - note: this will contain partially furnished as well
    'furnished',
    F.when(
        cost_lower.contains('furnish') | cost_lower.contains('furniture'), True
    ).otherwise(False)
)

# cost_pw is null when the text mixes billing periods (e.g. "$690pw / $2998pcm")
# or when no number could be extracted.

raw_sdf

url,postcode,suburb,name,cost_text,beds,baths,parking,property_type,cost_pw,furnished
https://www.domai...,3141,south-yarra,(Leased) 3 Yarra ...,$460,1,1,1,Apartment / Unit ...,460.0,False
https://www.domai...,3125,burwood,04/390 Burwood Hi...,"$310 per week, wi...",1,1,0,Apartment / Unit ...,310.0,False
https://www.domai...,3156,ferntree-gully,1 & 2/32 Folkston...,$800,4,3,1,House,800.0,False
https://www.domai...,3228,torquay,"1 Acacia Street, ...","$1,000 per week",4,2,2,House,1000.0,False
https://www.domai...,3200,frankston-north,1 Aleppo Crescent...,$450,3,1,1,House,450.0,False
https://www.domai...,3011,footscray,1 Bed 1 Bath/34 C...,$480 P/W,1,1,0,Apartment / Unit ...,480.0,False
https://www.domai...,3011,footscray,1 Bed 1 Bath/48 C...,$500 P/W,1,1,1,Apartment / Unit ...,500.0,False
https://www.domai...,3030,point-cook,1 Bensonhurst Par...,$570 per week,4,2,2,House,570.0,False
https://www.domai...,3047,broadmeadows,"1 Biltris Court, ...",$500,3,1,2,House,500.0,False
https://www.domai...,3149,mount-waverley,1 Birralee Street...,$625 per week,3,1,1,House,625.0,False


In [153]:
# Flag listings whose cost_text spells out a billing period.
# mult_pricing is True when the lower-cased text contains ANY of these
# substrings, so single-price texts such as "$310 per week, ..." are flagged too.
# NOTE(review): the column name suggests "multiple prices", and the bare 'pa'
# substring also occurs inside ordinary words - confirm the intended meaning.
period_markers = (
    'pw', 'p/w', 'month', 'pcm', 'p.a', 'pa',
    'per annum', 'per_annum', 'p/a', 'week',
)

lowered_cost = F.lower(F.col('cost_text'))
marker_hits = [lowered_cost.contains(marker) for marker in period_markers]

# OR all the individual substring checks together.
any_marker = marker_hits[0]
for hit in marker_hits[1:]:
    any_marker = any_marker | hit

raw_sdf = raw_sdf.withColumn(
    'mult_pricing',
    F.when(any_marker, True).otherwise(False)
)

raw_sdf

url,postcode,suburb,name,cost_text,beds,baths,parking,property_type,cost_pw,furnished,mult_pricing
https://www.domai...,3141,south-yarra,(Leased) 3 Yarra ...,$460,1,1,1,Apartment / Unit ...,460.0,False,False
https://www.domai...,3125,burwood,04/390 Burwood Hi...,"$310 per week, wi...",1,1,0,Apartment / Unit ...,310.0,False,True
https://www.domai...,3156,ferntree-gully,1 & 2/32 Folkston...,$800,4,3,1,House,800.0,False,False
https://www.domai...,3228,torquay,"1 Acacia Street, ...","$1,000 per week",4,2,2,House,1000.0,False,True
https://www.domai...,3200,frankston-north,1 Aleppo Crescent...,$450,3,1,1,House,450.0,False,False
https://www.domai...,3011,footscray,1 Bed 1 Bath/34 C...,$480 P/W,1,1,0,Apartment / Unit ...,480.0,False,True
https://www.domai...,3011,footscray,1 Bed 1 Bath/48 C...,$500 P/W,1,1,1,Apartment / Unit ...,500.0,False,True
https://www.domai...,3030,point-cook,1 Bensonhurst Par...,$570 per week,4,2,2,House,570.0,False,True
https://www.domai...,3047,broadmeadows,"1 Biltris Court, ...",$500,3,1,2,House,500.0,False,False
https://www.domai...,3149,mount-waverley,1 Birralee Street...,$625 per week,3,1,1,House,625.0,False,True
