## Clean Rental Rate Notebook

This notebook aims to clean the rental rates for the given data.

First we perform necessary imports.

In [70]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
import json
import re

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Crime Curated")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

FILE_PATH = '../data/raw/example.json'

All the functions used are defined below:

In [80]:
def extract_number(text, keyword):
    """
    Takes a string input and extracts the numeric value, casting it to an int.
    """
    match = re.search(rf'(\d+)\s*{keyword}', text, re.IGNORECASE)
    return int(match.group(1)) if match else None

def find_largest_number(text, number_pattern):
    """
    Finds the largest number based on a number pattern in a given string.
    """
    matches = re.findall(number_pattern, text)

    valid_numbers = []
    for match in matches:
        # clean the match by removing non numeric chars
        cleaned = match.replace('$', '').replace(',', '').strip()
        # attempt to convert/clean non-empty valid strings
        if cleaned:  
            try:
                valid_numbers.append(float(cleaned))
            except ValueError:
                pass
    
    # if there's a valid number, return it
    return max(valid_numbers) if valid_numbers else None

Now load the data into a data frame.

In [5]:
# 0: define the schema for the dataframe
schema = StructType([
    StructField("url", StringType(), True),
    StructField("cost_text", StringType(), True),
    StructField("bedrooms", IntegerType(), True),
    StructField("bathrooms", IntegerType(), True)
])

# 1: read the file
with open(FILE_PATH, 'r') as fp:
    data = json.load(fp)

# 2: extract the cost, along with the number of bedrooms and bathrooms
extracted_data = []
for url, entity in data.items():
    cost_text = entity.get("cost_text", "")
    rooms = entity.get("rooms", [])
    
    bedrooms = None
    bathrooms = None
    for room in rooms:
        if 'bed' in room.lower():
            bedrooms = extract_number(room, 'bed')
        elif 'bath' in room.lower():
            bathrooms = extract_number(room, 'bath')
    
    extracted_data.append((url, cost_text, bedrooms, bathrooms))

# 3: create the dataframe from this data
df = spark.createDataFrame(extracted_data, schema=schema)

# verify the dataframe is working as expected
print(f"Dataframe of {df.count()} rows.")
df.limit(5)

Dataframe of 977 rows.


url,cost_text,bedrooms,bathrooms
https://www.domai...,Price on Application,9.0,9.0
https://www.domai...,"$38,000 p.a. Incl...",,
https://www.domai...,"$12,500 pw",4.0,4.0
https://www.domai...,"$9,999 pw",5.0,5.0
https://www.domai...,"$5,750.00",2.0,2.0


First, we will do some exploratory analysis. 

1. How many don't have the price included at all?

In [82]:
# df.where(df['cost_text'])

COST_COLUMNS = ['unknown_rate', 'per_month', 'per_week', 'per_year']

number_pattern = r'\$?\d+[\d,]*(\.\d+)?'

per_annum_text = r'p\.?a\.?|per annum|py|p\.y\.|per year|yr|year|yearly|p/y'
per_calendar_month_text = r'p\.?m\.?|per month|p\.?c\.?m\.?|month|monthly|mth|p/m'
per_week_text = r'p\.?w\.?|per week|week|weekly|p/w|wk|w'

between_amount_and_rate = f'\s*/?\s*'

# match if just a number
df_extracted = df.withColumn("unknown_rate", F.regexp_extract(F.col("cost_text"), fr'^{number_pattern}$', 0)) \

# match with per week strings
df_extracted = df_extracted.withColumn("per_week", F.regexp_extract(F.col("cost_text"), fr'(?i){number_pattern}{between_amount_and_rate}({per_week_text})',
                                                          0))

# match with per year strings
df_extracted = df_extracted.withColumn("per_year", F.regexp_extract(F.col("cost_text"), fr'(?i){number_pattern}{between_amount_and_rate}({per_annum_text})',
                                                          0))

# match with per month strings
df_extracted = df_extracted.withColumn("per_month", F.regexp_extract(F.col("cost_text"), fr'(?i){number_pattern}{between_amount_and_rate}({per_calendar_month_text})',
                                                          0))

# finally match with the largest number in the text
find_largest_number_udf = F.udf(lambda text: find_largest_number(text, number_pattern), FloatType())

# find and save the largest number to a new column
df_extracted = df_extracted.withColumn("unknown_rate_within_text", find_largest_number_udf(F.col("cost_text"))) \
                            .withColumn(
                                "unknown_rate", 
                                F.coalesce(F.col("unknown_rate"), F.col("unknown_rate_within_text"))
                            ).drop("unknown_rate_within_text")

for col_name in COST_COLUMNS:
    df_extracted = df_extracted.withColumn(col_name, F.regexp_replace(F.col(col_name), r'\D', '')) \
    .withColumn(col_name, F.col(col_name).cast("float"))

df_extracted.limit(20)

24/09/06 08:54:12 ERROR Executor: Exception in task 0.0 in stage 129.0 (TID 224)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/ipykernel_16340/1434314076.py", line 29, in <lambda>
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in find_largest_number
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in <listcomp>
ValueError: could not convert string to float: ''

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collectio

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_16340/1434314076.py", line 29, in <lambda>
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in find_largest_number
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in <listcomp>
ValueError: could not convert string to float: ''


24/09/06 08:54:13 ERROR Executor: Exception in task 0.0 in stage 130.0 (TID 225)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/ipykernel_16340/1434314076.py", line 29, in <lambda>
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in find_largest_number
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in <listcomp>
ValueError: could not convert string to float: ''

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collectio

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_16340/1434314076.py", line 29, in <lambda>
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in find_largest_number
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in <listcomp>
ValueError: could not convert string to float: ''


In [76]:
# check for rows where all is null
null_condition = F.col(COST_COLUMNS[0]).isNull()

for col in COST_COLUMNS[1:]:
    null_condition = null_condition & F.col(col).isNull()

# apply the filter (+ also filter out where no numbers in text for clarity)
df_view = df_extracted.filter(null_condition).filter(F.col("cost_text").rlike(r'\d'))

print("Length: ", df_view.count())
df_view.show(20, truncate=False)

24/09/06 08:51:31 ERROR Executor: Exception in task 2.0 in stage 119.0 (TID 209)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/ipykernel_16340/836938486.py", line 29, in <lambda>
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in find_largest_number
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in <listcomp>
ValueError: could not convert string to float: ''

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_16340/836938486.py", line 29, in <lambda>
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in find_largest_number
  File "/tmp/ipykernel_16340/3228535467.py", line 14, in <listcomp>
ValueError: could not convert string to float: ''


In [24]:
df.take(2)[1]

Row(url='https://www.domain.com.au/667-glenhuntly-road-caulfield-vic-3162-11598047', cost_text='$38,000 p.a. Incl. Outgoings + GST', bedrooms=None, bathrooms=None)