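Jupyter notebook that cleans the raw property listing data: it parses the advertised price text into a weekly cost, flags furnished listings, and writes the curated result to parquet.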

In [133]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession, functions as F

# Build (or reuse) the Spark session that runs every Spark job in this notebook.
# Settings are listed once as (key, value) pairs and applied in order.
spark_settings = [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ('spark.driver.memory', '4g'),
    ('spark.executor.memory', '2g'),
]

session_builder = SparkSession.builder.appName("MAST30034 Project 2")
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [134]:
# Load the raw listings and remember how many rows we started with,
# so the number of rows removed by cleaning can be reported later.
RAW_PROPERTY_PATH = '../data/raw/property_data.parquet'

df = spark.read.parquet(RAW_PROPERTY_PATH)
initial_instances = df.count()

In [135]:
import re
# Work in pandas so the price text can be tested with Python regexes.
listings = df.toPandas()

# Keep only rows whose cost_text contains at least one digit-or-dot run.
# NOTE(review): the pattern also accepts a lone "." — tightening it would change
# which rows survive and therefore which row labels the later manual drops hit.
has_price = [bool(re.search(r"([\d\.]+)", text)) for text in listings['cost_text']]

pdf = (
    listings[has_price]
    # 'cost' starts as the price text with thousands separators stripped
    .assign(cost=lambda frame: frame['cost_text'].replace(',', '', regex=True))
    # Manual deletion: row label 8235 carries an invalid cost
    .drop(8235)
)

In [136]:
# Hand the filtered pandas frame back to Spark for the column-wise keyword flags below.
sdf = spark.createDataFrame(pdf)

In [137]:
def _mentions_any(text_col, needles):
    """Return a boolean Column that is true when `text_col` contains any of `needles`.

    Each needle is matched as a literal substring (no regex interpretation).
    """
    condition = F.lit(False)
    for needle in needles:
        condition = condition | text_col.contains(needle)
    return condition


# Lower-cased listing text: every keyword test below is case-insensitive.
cost_text_lc = F.lower(F.col('cost_text'))

WEEK_KEYWORDS = ('pw', 'p/w', 'wk', '/w', 'week')
MONTH_KEYWORDS = ('month', 'per month', 'pcm')
ANNUAL_KEYWORDS = ('p.a', 'per annum', 'per year', 'per_annum', 'p/a')
# A bare "pa" only counts when it is not embedded in a longer word. A plain
# substring test also fires on "parking", "spacious", "apartment", ... and those
# listings would later have their price divided by 52.
STANDALONE_PA = r'(?<![a-z])pa(?![a-z])'

raw_sdf = (
    sdf
    # True when the price text quotes a weekly rate
    .withColumn(
        'Week',
        F.when(_mentions_any(cost_text_lc, WEEK_KEYWORDS), True).otherwise(False),
    )
    # True when the price text quotes a monthly rate
    .withColumn(
        'month',
        F.when(_mentions_any(cost_text_lc, MONTH_KEYWORDS), True).otherwise(False),
    )
    # True when the price text quotes an annual rate
    .withColumn(
        'contains_pa',
        F.when(
            _mentions_any(cost_text_lc, ANNUAL_KEYWORDS) | cost_text_lc.rlike(STANDALONE_PA),
            True,
        ).otherwise(False),
    )
    # Strip the dollar sign from the working 'cost' string
    .withColumn('cost', F.regexp_replace(F.col("cost"), r"[$]", ""))
    # Drop any fractional part (".50") so only whole-dollar digits remain
    .withColumn('cost', F.regexp_replace(F.col("cost"), r"\.\d+", ""))
)

In [138]:
# Collect the flagged Spark frame into pandas; the next cells parse 'cost' with Python regexes.
pdf = raw_sdf.toPandas()

In [139]:
# Replace each cost string with the list of every run of two or more digits found in it.
multi_digit_run = re.compile(r'\d\d+')
pdf['cost'] = [multi_digit_run.findall(cost_string) for cost_string in pdf['cost']]

In [140]:
# Divisors used to express monthly / annual prices as a weekly figure.
# NOTE(review): 4 weeks per month is the notebook's existing approximation
# (52 / 12 is about 4.33) — confirm it is intended before changing it.
WEEKS_PER_MONTH = 4
WEEKS_PER_YEAR = 52


def _weekly_cost(matches, is_weekly, is_monthly, is_annual):
    """Convert one listing's extracted amounts into a weekly cost.

    Parameters
    ----------
    matches : list of str
        Digit runs extracted from the price text; the first is used, which is
        the lower bound when a range was advertised.
    is_weekly, is_monthly, is_annual : bool
        Keyword flags for the listing ('Week', 'month', 'contains_pa').

    Returns
    -------
    The first amount unchanged (a str) when the price is weekly or carries no
    period flag, a float when a monthly or annual price was converted, or 0
    when no amount was found.
    """
    amount = matches[0] if len(matches) >= 1 else 0
    if is_weekly:
        # An explicit weekly flag always wins over monthly / annual mentions.
        return amount
    if is_monthly:
        return int(amount) / WEEKS_PER_MONTH
    if is_annual:
        return int(amount) / WEEKS_PER_YEAR
    return amount


# Build the whole column in one assignment. This avoids chained
# `pdf['cost'][i] = ...` writes (a silent no-op under pandas copy-on-write),
# does not depend on the frame having a 0..n-1 index, and tolerates the
# integer 0 placeholder used when no amount was found.
pdf['cost'] = [
    _weekly_cost(matches, is_weekly, is_monthly, is_annual)
    for matches, is_weekly, is_monthly, is_annual in zip(
        pdf['cost'], pdf['Week'], pdf['month'], pdf['contains_pa']
    )
]

In [141]:
def _annual_only_rows(frame):
    """Return the rows flagged per-annum that have neither the weekly nor the monthly flag."""
    not_monthly = frame['month'] == False
    not_weekly = frame['Week'] == False
    annual = frame['contains_pa'] == True
    return frame[not_monthly & not_weekly & annual]


# Look at the annual-only listings before the manual clean-up
_annual_only_rows(pdf)

# Remove the rows whose cost was judged invalid, one row label at a time
for invalid_row in (4951, 10581):
    pdf = pdf.drop(invalid_row)

# Displayed result: the annual-only listings that remain
_annual_only_rows(pdf)


Unnamed: 0,url,postcode,suburb,name,cost_text,beds,baths,parking,property_type,cost,Week,month,contains_pa
3106,https://www.domain.com.au/8-chamberlain-road-r...,3523,redcastle,"8 Chamberlain Road, Redcastle VIC 3523","$28,000 Per Year !!",,,0,Vacant land,538.461538,False,False,True
4900,https://www.domain.com.au/22-enterprise-way-ya...,3730,yarrawonga,"22 Enterprise Way, Yarrawonga VIC 3730","$52,000 per annum plus GST",1.0,1.0,4,New House & Land,1000.0,False,False,True
6357,https://www.domain.com.au/667-glenhuntly-road-...,3162,caulfield,"667 Glenhuntly Road, Caulfield VIC 3162","$38,000 p.a. Incl. Outgoings + GST",,,0,House,730.769231,False,False,True
7664,https://www.domain.com.au/12-panama-street-wol...,3750,wollert,"12 Panama Street, Wollert VIC 3750","LEASED for $33,500 PA in ONE HOUR!",4.0,2.0,2,House,644.230769,False,False,True
9720,https://www.domain.com.au/30-south-concourse-b...,3193,beaumaris,"30 South Concourse, Beaumaris VIC 3193","$39,000 p.a + Outgoings",0.0,1.0,0,Studio,750.0,False,False,True


In [142]:
# Convert pdf back to spark
raw_sdf = spark.createDataFrame(pdf)

# Delete? # pdf = pdf.drop(4950)
# pdf = pdf.drop(10579)

In [143]:
# Matches "furnish..." unless it is immediately preceded by "un", "un-" or "un ",
# so "unfurnished" is no longer counted as furnished. Any mention of "furniture"
# still counts. Partially furnished listings are deliberately included.
# NOTE(review): negations such as "not furnished" / "no furniture" are still
# flagged True — extend the pattern if such listings exist.
FURNISHED_PATTERN = r'(?<!un)(?<!un[ -])furnish|furniture'

raw_sdf = raw_sdf.withColumn(
    # True when the (lower-cased) price text advertises the property as furnished
    'furnished',
    F.when(F.lower(F.col('cost_text')).rlike(FURNISHED_PATTERN), True).otherwise(False)
)

raw_sdf

url,postcode,suburb,name,cost_text,beds,baths,parking,property_type,cost,Week,month,contains_pa,furnished
https://www.domai...,3141,south-yarra,(Leased) 3 Yarra ...,$460,1,1,1,Apartment / Unit ...,460,False,False,False,False
https://www.domai...,3125,burwood,04/390 Burwood Hi...,"$310 per week, wi...",1,1,0,Apartment / Unit ...,310,True,False,False,False
https://www.domai...,3156,ferntree-gully,1 & 2/32 Folkston...,$800,4,3,1,House,800,False,False,False,False
https://www.domai...,3228,torquay,"1 Acacia Street, ...","$1,000 per week",4,2,2,House,1000,True,False,False,False
https://www.domai...,3200,frankston-north,1 Aleppo Crescent...,$450,3,1,1,House,450,False,False,False,False
https://www.domai...,3011,footscray,1 Bed 1 Bath/34 C...,$480 P/W,1,1,0,Apartment / Unit ...,480,True,False,False,False
https://www.domai...,3011,footscray,1 Bed 1 Bath/48 C...,$500 P/W,1,1,1,Apartment / Unit ...,500,True,False,False,False
https://www.domai...,3030,point-cook,1 Bensonhurst Par...,$570 per week,4,2,2,House,570,True,False,False,False
https://www.domai...,3047,broadmeadows,"1 Biltris Court, ...",$500,3,1,2,House,500,False,False,False,False
https://www.domai...,3149,mount-waverley,1 Birralee Street...,$625 per week,3,1,1,House,625,True,False,False,False


In [144]:
# Snapshot of the flagged frame in pandas.
# NOTE(review): `pdf` is reassigned after the select/cast step without this
# value being read in between — this conversion looks redundant; confirm and remove.
pdf = raw_sdf.toPandas()

In [145]:
from pyspark.sql.functions import col
# Keep only the columns needed downstream and give each an explicit Spark type.
# (No aggregation happens here; this is a straight select + cast.)
column_types = [
    ('cost', 'float'),
    ('postcode', 'int'),
    ('suburb', 'string'),
    ('furnished', 'int'),
    ('property_type', 'string'),
    ('beds', 'int'),
    ('baths', 'int'),
    ('parking', 'int'),
]

typed_columns = [col(column_name).cast(spark_type) for column_name, spark_type in column_types]
raw_sdf = raw_sdf.select(*typed_columns)

# Pandas copy of the typed frame for the final row-level clean-up
pdf = raw_sdf.toPandas()

In [146]:
# Spot-check the rows at positions 7521 up to (not including) 7532.
# NOTE(review): the original note here queried a listing with 12 beds —
# TODO confirm whether that row needs special handling.
pdf[7521:7532]

Unnamed: 0,cost,postcode,suburb,furnished,property_type,beds,baths,parking
7521,300.0,3630,shepparton,0,Apartment / Unit / Flat,2.0,1.0,1
7522,385.0,3182,st-kilda,0,Apartment / Unit / Flat,1.0,1.0,1
7523,475.0,3181,prahran,0,Apartment / Unit / Flat,2.0,1.0,0
7524,375.0,3040,essendon,0,Apartment / Unit / Flat,1.0,1.0,1
7525,320.0,3012,west-footscray,0,Apartment / Unit / Flat,1.0,1.0,1
7526,520.0,3163,glen-huntly,0,Apartment / Unit / Flat,2.0,1.0,1
7527,465.0,3142,toorak,0,Apartment / Unit / Flat,1.0,1.0,1
7528,520.0,3145,malvern-east,0,Apartment / Unit / Flat,2.0,1.0,1
7529,540.0,3196,chelsea,0,Apartment / Unit / Flat,2.0,1.0,2
7530,540.0,3175,dandenong,0,Apartment / Unit / Flat,3.0,1.0,1


In [147]:
# Discard listings whose cost is zero, then any row with a missing value.
has_cost = pdf['cost'].ne(0)
pdf = pdf.loc[has_cost].dropna()

# Report every row removed since the raw file was loaded (all cleaning steps combined).
num_deletions = initial_instances - len(pdf)
deletion_report = "Deleted {} instances"
print(deletion_report.format(num_deletions))

Deleted 335 instances


In [148]:
# Persist the cleaned listings as the curated dataset, replacing any previous run's output.
CURATED_FURNISHING_PATH = "../data/curated/furnishing.parquet"

final_pdf = pdf
final_sdf = spark.createDataFrame(final_pdf)

final_sdf.write.parquet(CURATED_FURNISHING_PATH, mode="overwrite")

                                                                                