notebook that checks if entries are valid


In [46]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession, functions as F
import os
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

In [47]:
df = spark.read.parquet('../data/curated/furnishing.parquet')

In [48]:
df.show(5)

+-----+--------+----------+---------+-------------+----+-----+-------+
| cost|postcode|    suburb|furnished|property_type|beds|baths|parking|
+-----+--------+----------+---------+-------------+----+-----+-------+
|520.0|    3029| truganina|        0|        House| 3.0|  1.0|      1|
|480.0|    3500|   mildura|        0|        House| 3.0|  1.0|      2|
|500.0|    3810|  pakenham|        0|        House| 3.0|  2.0|      1|
|480.0|    3216|   belmont|        0|        House| 3.0|  1.0|      2|
|530.0|    3030|point-cook|        0|    Townhouse| 3.0|  2.0|      2|
+-----+--------+----------+---------+-------------+----+-----+-------+
only showing top 5 rows



In [49]:
small_df= df.limit(20)

In [50]:
small_df.show()

+-----+--------+--------------+---------+--------------------+----+-----+-------+
| cost|postcode|        suburb|furnished|       property_type|beds|baths|parking|
+-----+--------+--------------+---------+--------------------+----+-----+-------+
|520.0|    3029|     truganina|        0|               House| 3.0|  1.0|      1|
|480.0|    3500|       mildura|        0|               House| 3.0|  1.0|      2|
|500.0|    3810|      pakenham|        0|               House| 3.0|  2.0|      1|
|480.0|    3216|       belmont|        0|               House| 3.0|  1.0|      2|
|530.0|    3030|    point-cook|        0|           Townhouse| 3.0|  2.0|      2|
|650.0|    3550| north-bendigo|        1|               House| 3.0|  2.0|      1|
|490.0|    3335|thornhill-park|        0|               House| 4.0|  2.0|      2|
|560.0|    3029|       tarneit|        0|               House| 4.0|  2.0|      2|
|600.0|    3155|       boronia|        0|               House| 3.0|  1.0|      2|
|500.0|    3216|

In [51]:
updated_df = df \
    .filter((F.col('beds') > 0) & (F.col('beds') <= 6)) \
    .filter((F.col('baths') > 0) & (F.col('baths') <= 6)) \
    .filter((F.col('parking') >= 0) & (F.col('parking') <= 6)) \

# updated_df = updated_df.withColumn(
#     "contains_price",
#     F.col("cost_text").rlike(r"(\$*\d+(?:,\d*)*(?:\.\d*)?)")
# ).filter(F.col("contains_price") == True)

In [52]:
updated_df.orderBy(F.desc("beds"))

# updated_df.orderBy(F.desc("parking"))

cost,postcode,suburb,furnished,property_type,beds,baths,parking
720.0,3750,wollert,0,House,6.0,3.0,2
670.0,3977,cranbourne,0,House,6.0,2.0,2
1495.0,3183,st-kilda-east,0,House,6.0,2.0,1
780.0,3132,mitcham,0,House,6.0,3.0,5
1200.0,3193,beaumaris,0,House,6.0,3.0,2
1100.0,3149,mount-waverley,0,House,6.0,2.0,2
1800.0,3227,barwon-heads,0,House,6.0,2.0,4
2600.0,3106,templestowe,0,House,6.0,4.0,5
570.0,3219,newcomb,0,House,6.0,3.0,2
1500.0,3806,berwick,0,Acreage / Semi-Rural,6.0,3.0,4


In [53]:
spec = updated_df.filter(updated_df.suburb == "cranbourne-east").orderBy(F.desc("beds"))

In [54]:
spec.show()

+------+--------+---------------+---------+-------------+----+-----+-------+
|  cost|postcode|         suburb|furnished|property_type|beds|baths|parking|
+------+--------+---------------+---------+-------------+----+-----+-------+
| 680.0|    3977|cranbourne-east|        0|        House| 6.0|  2.0|      2|
| 580.0|    3977|cranbourne-east|        0|        House| 4.0|  2.0|      2|
| 540.0|    3977|cranbourne-east|        0|        House| 4.0|  2.0|      2|
| 650.0|    3977|cranbourne-east|        0|        House| 4.0|  2.0|      2|
| 625.0|    3977|cranbourne-east|        0|        House| 4.0|  2.0|      2|
| 580.0|    3977|cranbourne-east|        0|        House| 4.0|  2.0|      2|
| 580.0|    3977|cranbourne-east|        0|        House| 4.0|  2.0|      1|
| 520.0|    3977|cranbourne-east|        0|        House| 4.0|  2.0|      1|
| 650.0|    3977|cranbourne-east|        0|        House| 4.0|  2.0|      2|
| 560.0|    3977|cranbourne-east|        0|    Townhouse| 4.0|  3.0|      2|

In [55]:
# spec.withColumn(
#     'cost_text',
#     F.split(F.col('cost_text'), " ").getItem(0)
# )

df

cost,postcode,suburb,furnished,property_type,beds,baths,parking
520.0,3029,truganina,0,House,3.0,1.0,1
480.0,3500,mildura,0,House,3.0,1.0,2
500.0,3810,pakenham,0,House,3.0,2.0,1
480.0,3216,belmont,0,House,3.0,1.0,2
530.0,3030,point-cook,0,Townhouse,3.0,2.0,2
650.0,3550,north-bendigo,1,House,3.0,2.0,1
490.0,3335,thornhill-park,0,House,4.0,2.0,2
560.0,3029,tarneit,0,House,4.0,2.0,2
600.0,3155,boronia,0,House,3.0,1.0,2
500.0,3216,waurn-ponds,0,House,3.0,2.0,2


In [56]:
print("removing {} invalid rows".format(df.count() - updated_df.count()))


removing 156 invalid rows


In [57]:
updated_df.write.mode("overwrite").parquet("../data/curated/valid_df.parquet")

In [58]:
# creating a function for this task
def find_valid(df: SparkSession.DataFrame , output_path: str) -> None:
    """Function that filters for rows that are valid and outputs into a parquet file.
    A row is deemed valid if:
                    - beds : (0,6]
                    - baths : (0,6]
                    - parking : [0, 6]
    The justification for these values is based on looking at the data and deciding that prices with properties which state these amounts of beds, baths and parking are far too cheap
    
    Parameters:
    df - Dataframe which we want to find valid rows for
    output_path - string that brings us to our desired output path

    """
    updated_df = df \
    .filter((F.col('beds') > 0) & (F.col('beds') <= 6)) \
    .filter((F.col('baths') > 0) & (F.col('baths') <= 6)) \
    .filter((F.col('parking') >= 0) & (F.col('parking') <= 6)) \

    updated_df = updated_df.withColumn(
        "contains_price",
        F.col("cost_text").rlike(r"(\$*\d+(?:,\d*)*(?:\.\d*)?)")
    ).filter(F.col("contains_price") == True).drop(F.col("contains_price"))

    print("removing {} invalid rows".format(df.count() - updated_df.count()))
    # write to file
    updated_df.write.mode("overwrite").parquet(output_path)


AttributeError: type object 'SparkSession' has no attribute 'DataFrame'