jupyter notebook which goes through data cleaning 

In [530]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession, functions as F
import os
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

import warnings
warnings.filterwarnings("ignore") # suppress warnings

In [531]:
df = spark.read.parquet('../data/landing/property_data.parquet')
initial_instances = df.count()

In [532]:
import re
pdf = df.toPandas()

# Delete Rows that do not contain the price in cost_text
valid_costs = [x if re.search(r"([\d\.]+)", x) else 0 for x in pdf['cost_text']]
pdf['cost_text'] = valid_costs
pdf = pdf[pdf['cost_text']!=0]

# Remove commas
pdf['cost'] = pdf['cost_text'].replace(',','', regex=True)

# Manual deletions 
pdf = pdf.drop(8235) # invalid cost

In [533]:
sdf = spark.createDataFrame(pdf)

In [534]:
raw_sdf = sdf.withColumn(
    # Properties priced per week
    'Week',
    F.when(F.lower(F.col('cost_text')).contains('pw') \
           | F.lower(F.col('cost_text')).contains('p/w') \
           | F.lower(F.col('cost_text')).contains('wk') \
           | F.lower(F.col('cost_text')).contains('week'), True).otherwise(False)
).withColumn(
    # see if property is priced per annum
    'month',
    F.when(F.lower(F.col('cost_text')).contains('month') \
        | F.lower(F.col('cost_text')).contains('per month') \
        | F.lower(F.col('cost_text')).contains('pcm'), True ).otherwise(False)
).withColumn(
    # see if property is priced per annum
    'contains_pa',
    F.when(F.lower(F.col('cost_text')).contains('p.a') \
        | F.lower(F.col('cost_text')).contains('pa') \
        | F.lower(F.col('cost_text')).contains('per annum') \
        | F.lower(F.col('cost_text')).contains('per_annum') \
        | F.lower(F.col('cost_text')).contains('p/a'), True ).otherwise(False)
).withColumn(
    # remove $ and ,
    'cost',
    F.regexp_replace(F.col("cost"), r"[$]", "") # casting to float makes it include the decimal point
).withColumn(
    # removes decimal places
    'cost',
    F.regexp_replace(F.col("cost"), r"\.\d+", "")
)

In [535]:
pdf = raw_sdf.toPandas()

In [536]:
pdf['cost'] = [(re.findall(r'\d\d+', x)) for x in pdf['cost']]
pdf[(pdf['month']==True)&(pdf['Week']==False)]

Unnamed: 0,url,postcode,suburb,name,cost_text,beds,baths,parking,property_type,cost,Week,month,contains_pa
108,https://www.domain.com.au/1-1a-civic-avenue-ec...,3564,echuca,"1/1a Civic Avenue, Echuca VIC 3564",$2275 calendar month,3.0,2.0,3,House,[2275],False,True,False
608,https://www.domain.com.au/120-127-mc-kenzie-st...,3995,wonthaggi,"120-127 Mc Kenzie Street, Wonthaggi VIC 3995",$77 per calendar month,1.0,1.0,0,Apartment / Unit / Flat,[77],False,True,False
637,https://www.domain.com.au/125-stawell-street-e...,3564,echuca,"125 Stawell Street, Echuca VIC 3564",$2774 calendar month,3.0,2.0,2,House,[2774],False,True,False
747,https://www.domain.com.au/14-jubilee-street-my...,3737,myrtleford,"14 Jubilee Street, Myrtleford VIC 3737",From $95.00 per month,,,0,House,[95],False,True,False
1304,https://www.domain.com.au/2-7-aylesbury-cresce...,3043,gladstone-park,"2/7 Aylesbury Crescent, Gladstone Park VIC 3043","UNDER APPLICATION | $3,042 PCM",3.0,2.0,2,Townhouse,[3042],False,True,False
1427,https://www.domain.com.au/21-hannah-street-ben...,3672,benalla,"21 Hannah Street, Benalla VIC 3672",Prices Starting From $155 pcm,,,0,House,[155],False,True,False
1558,https://www.domain.com.au/23c-oakes-avenue-cla...,3169,clayton-south,"23C Oakes Avenue, Clayton South VIC 3169",$1000 pcm,4.0,3.0,2,House,[1000],False,True,False
2044,https://www.domain.com.au/317-ogilvie-avenue-e...,3564,echuca,"317 Ogilvie Avenue, Echuca VIC 3564",$1950 calendar month,3.0,2.0,3,House,[1950],False,True,False
2120,https://www.domain.com.au/35-king-street-myrtl...,3737,myrtleford,"35 King Street, Myrtleford VIC 3737",$325 per month + initial bond,1.0,1.0,0,House,[325],False,True,False
2226,https://www.domain.com.au/4-cochrane-street-ec...,3564,echuca,"4 Cochrane Street, Echuca VIC 3564",$2607 per calander month,4.0,2.0,4,House,[2607],False,True,False


In [537]:
pdf['cost'] = [x[0] if (len(x)>=1) else 0 for x in pdf['cost'] ] # extract lowest estimate  of price when a range is given

for i in range(len(pdf)):
    if ((pdf['month'][i] == True)&(pdf['Week'][i]==False)):
        cost = re.findall(r'\d+', pdf['cost'][i])[0]
        cost_pw = int(cost)/4
        pdf['cost'][i] = int(cost)/4

pdf

Unnamed: 0,url,postcode,suburb,name,cost_text,beds,baths,parking,property_type,cost,Week,month,contains_pa
0,https://www.domain.com.au/-leased-3-yarra-stre...,3141,south-yarra,"(Leased) 3 Yarra Street, South Yarra VIC 3141",$460,1,1,1,Apartment / Unit / Flat,460,False,False,False
1,https://www.domain.com.au/04-390-burwood-highw...,3125,burwood,"04/390 Burwood Highway, Burwood VIC 3125","$310 per week, with AC",1,1,0,Apartment / Unit / Flat,310,True,False,False
2,https://www.domain.com.au/1-2-32-folkstone-cre...,3156,ferntree-gully,"1 & 2/32 Folkstone Crescent, Ferntree Gully VI...",$800,4,3,1,House,800,False,False,False
3,https://www.domain.com.au/1-acacia-street-torq...,3228,torquay,"1 Acacia Street, Torquay VIC 3228","$1,000 per week",4,2,2,House,1000,True,False,False
4,https://www.domain.com.au/1-aleppo-crescent-fr...,3200,frankston-north,"1 Aleppo Crescent, Frankston North VIC 3200",$450,3,1,1,House,450,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11978,https://www.domain.com.au/unit-2-198-knight-st...,3630,shepparton,"Unit 2/198 Knight St, Shepparton VIC 3630",$350.00 Per Week,2,1,1,Townhouse,350,True,False,False
11979,https://www.domain.com.au/unit-2-23-chesterton...,3029,tarneit,"Unit 2/23 Chesterton Avenue, Tarneit VIC 3029",$490,3,2,1,Apartment / Unit / Flat,490,False,False,False
11980,https://www.domain.com.au/unit-6-34-grevillia-...,3046,oak-park,"Unit 6/34 Grevillia Road, Oak Park VIC 3046",$525 weekly,2,2,1,Townhouse,525,True,False,False
11981,https://www.domain.com.au/unit-63-394-collins-...,3000,melbourne,"Unit 63/394 Collins Street, Melbourne VIC 3000",$559 pw Furnished,1,1,0,Apartment / Unit / Flat,559,True,False,False


In [538]:
pdf[(pdf['month']==True)&(pdf['Week']==False)] # Cost is working

Unnamed: 0,url,postcode,suburb,name,cost_text,beds,baths,parking,property_type,cost,Week,month,contains_pa
108,https://www.domain.com.au/1-1a-civic-avenue-ec...,3564,echuca,"1/1a Civic Avenue, Echuca VIC 3564",$2275 calendar month,3.0,2.0,3,House,568.75,False,True,False
608,https://www.domain.com.au/120-127-mc-kenzie-st...,3995,wonthaggi,"120-127 Mc Kenzie Street, Wonthaggi VIC 3995",$77 per calendar month,1.0,1.0,0,Apartment / Unit / Flat,19.25,False,True,False
637,https://www.domain.com.au/125-stawell-street-e...,3564,echuca,"125 Stawell Street, Echuca VIC 3564",$2774 calendar month,3.0,2.0,2,House,693.5,False,True,False
747,https://www.domain.com.au/14-jubilee-street-my...,3737,myrtleford,"14 Jubilee Street, Myrtleford VIC 3737",From $95.00 per month,,,0,House,23.75,False,True,False
1304,https://www.domain.com.au/2-7-aylesbury-cresce...,3043,gladstone-park,"2/7 Aylesbury Crescent, Gladstone Park VIC 3043","UNDER APPLICATION | $3,042 PCM",3.0,2.0,2,Townhouse,760.5,False,True,False
1427,https://www.domain.com.au/21-hannah-street-ben...,3672,benalla,"21 Hannah Street, Benalla VIC 3672",Prices Starting From $155 pcm,,,0,House,38.75,False,True,False
1558,https://www.domain.com.au/23c-oakes-avenue-cla...,3169,clayton-south,"23C Oakes Avenue, Clayton South VIC 3169",$1000 pcm,4.0,3.0,2,House,250.0,False,True,False
2044,https://www.domain.com.au/317-ogilvie-avenue-e...,3564,echuca,"317 Ogilvie Avenue, Echuca VIC 3564",$1950 calendar month,3.0,2.0,3,House,487.5,False,True,False
2120,https://www.domain.com.au/35-king-street-myrtl...,3737,myrtleford,"35 King Street, Myrtleford VIC 3737",$325 per month + initial bond,1.0,1.0,0,House,81.25,False,True,False
2226,https://www.domain.com.au/4-cochrane-street-ec...,3564,echuca,"4 Cochrane Street, Echuca VIC 3564",$2607 per calander month,4.0,2.0,4,House,651.75,False,True,False


In [539]:
# Convert pdf back to spark
raw_sdf = spark.createDataFrame(pdf)
raw_sdf

url,postcode,suburb,name,cost_text,beds,baths,parking,property_type,cost,Week,month,contains_pa
https://www.domai...,3141,south-yarra,(Leased) 3 Yarra ...,$460,1,1,1,Apartment / Unit ...,460,False,False,False
https://www.domai...,3125,burwood,04/390 Burwood Hi...,"$310 per week, wi...",1,1,0,Apartment / Unit ...,310,True,False,False
https://www.domai...,3156,ferntree-gully,1 & 2/32 Folkston...,$800,4,3,1,House,800,False,False,False
https://www.domai...,3228,torquay,"1 Acacia Street, ...","$1,000 per week",4,2,2,House,1000,True,False,False
https://www.domai...,3200,frankston-north,1 Aleppo Crescent...,$450,3,1,1,House,450,False,False,False
https://www.domai...,3011,footscray,1 Bed 1 Bath/34 C...,$480 P/W,1,1,0,Apartment / Unit ...,480,True,False,False
https://www.domai...,3011,footscray,1 Bed 1 Bath/48 C...,$500 P/W,1,1,1,Apartment / Unit ...,500,True,False,False
https://www.domai...,3030,point-cook,1 Bensonhurst Par...,$570 per week,4,2,2,House,570,True,False,False
https://www.domai...,3047,broadmeadows,"1 Biltris Court, ...",$500,3,1,2,House,500,False,False,False
https://www.domai...,3149,mount-waverley,1 Birralee Street...,$625 per week,3,1,1,House,625,True,False,False


In [540]:
raw_sdf = raw_sdf.withColumn(
    # see if property is furnished - note: this will contain partially furnished as well
    'furnished', 
    F.when(F.lower(F.col('cost_text')).contains('furnish') \
        | F.lower(F.col('cost_text')).contains('furniture'), True ).otherwise(False)
)

raw_sdf

url,postcode,suburb,name,cost_text,beds,baths,parking,property_type,cost,Week,month,contains_pa,furnished
https://www.domai...,3141,south-yarra,(Leased) 3 Yarra ...,$460,1,1,1,Apartment / Unit ...,460,False,False,False,False
https://www.domai...,3125,burwood,04/390 Burwood Hi...,"$310 per week, wi...",1,1,0,Apartment / Unit ...,310,True,False,False,False
https://www.domai...,3156,ferntree-gully,1 & 2/32 Folkston...,$800,4,3,1,House,800,False,False,False,False
https://www.domai...,3228,torquay,"1 Acacia Street, ...","$1,000 per week",4,2,2,House,1000,True,False,False,False
https://www.domai...,3200,frankston-north,1 Aleppo Crescent...,$450,3,1,1,House,450,False,False,False,False
https://www.domai...,3011,footscray,1 Bed 1 Bath/34 C...,$480 P/W,1,1,0,Apartment / Unit ...,480,True,False,False,False
https://www.domai...,3011,footscray,1 Bed 1 Bath/48 C...,$500 P/W,1,1,1,Apartment / Unit ...,500,True,False,False,False
https://www.domai...,3030,point-cook,1 Bensonhurst Par...,$570 per week,4,2,2,House,570,True,False,False,False
https://www.domai...,3047,broadmeadows,"1 Biltris Court, ...",$500,3,1,2,House,500,False,False,False,False
https://www.domai...,3149,mount-waverley,1 Birralee Street...,$625 per week,3,1,1,House,625,True,False,False,False
