## Clean Rental Rate Notebook

This notebook aims to clean the rental rates for the given data.

First we perform necessary imports.

In [18]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
import json
import re

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Crime Curated")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .config("spark.driver.port", "6066")
    .getOrCreate()
)

FILE_PATH = '../data/raw/domain_listing_openRouteServices'

All the functions used are defined below:

In [19]:
def extract_number(text, keyword):
    """
    Takes a string input and extracts the numeric value, casting it to an int.
    """
    match = re.search(rf'(\d+)\s*{keyword}', text, re.IGNORECASE)
    return int(match.group(1)) if match else None

def find_largest_number(text, number_pattern):
    """
    Finds the largest number based on a number pattern in a given string.
    """
    number_pattern = r'\$?\d+[\d,]*(?:\.\d+)?'
    matches = re.findall(number_pattern, text)

    if matches:
        numbers = [float(match.replace('$', '').replace(',', '')) for match in matches]
        return max(numbers)
    return None

Now load the data into a data frame.

In [20]:
## Make df from domain listing distance joined
domain_openRouteServices = spark.read.parquet(FILE_PATH)


# 3: create the dataframe from this data
df = domain_openRouteServices.select("url", "cost_text", "bedrooms", "bathrooms")

# verify the dataframe is working as expected
print(f"Dataframe of {df.count()} rows.")


Dataframe of 13328 rows.


First, we will do some exploratory analysis. 

1. How many don't have the price included at all?

In [21]:
# df.where(df['cost_text'])

COST_COLUMNS = ['unknown_rate', 'per_month', 'per_week', 'per_year']

number_pattern = r'\$?\d+[\d,]*(\.\d+)?'

per_annum_text = r'p\.?a\.?|per annum|py|p\.y\.|per year|yr|year|yearly|p/y'
per_calendar_month_text = r'p\.?m\.?|per month|p\.?c\.?m\.?|month|monthly|mth|p/m'
per_week_text = r'p\.?w\.?|per week|week|weekly|p/w|wk|w'

between_amount_and_rate = f'\s*/?\s*'

# match if just a number
df_extracted = df.withColumn("unknown_rate", F.regexp_extract(F.col("cost_text"), fr'^{number_pattern}$', 0)) \

# match with per week strings
df_extracted = df_extracted.withColumn("per_week", F.regexp_extract(F.col("cost_text"), fr'(?i){number_pattern}{between_amount_and_rate}({per_week_text})',
                                                          0))

# match with per year strings
df_extracted = df_extracted.withColumn("per_year", F.regexp_extract(F.col("cost_text"), fr'(?i){number_pattern}{between_amount_and_rate}({per_annum_text})',
                                                          0))

# match with per month strings
df_extracted = df_extracted.withColumn("per_month", F.regexp_extract(F.col("cost_text"), fr'(?i){number_pattern}{between_amount_and_rate}({per_calendar_month_text})',
                                                          0))

# finally match with the largest number in the text
find_largest_number_udf = F.udf(lambda text: find_largest_number(text, number_pattern), FloatType())

# convert columns to floats
for col_name in COST_COLUMNS:
    df_extracted = df_extracted.withColumn(col_name, F.regexp_replace(F.col(col_name), r'[^\d.]', '')) \
    .withColumn(col_name, F.col(col_name).cast("float"))

# find and save the largest number to a new column
df_extracted = df_extracted.withColumn("unknown_rate_within_text", find_largest_number_udf(F.col("cost_text"))) \
                            .withColumn(
                                "unknown_rate", 
                                F.coalesce(F.col("unknown_rate"), F.col("unknown_rate_within_text"))
                            ).drop("unknown_rate_within_text")



# df_extracted.where(F.col('unknown_rate_within_text').isNotNull())

df_extracted.limit(5)

url,cost_text,bedrooms,bathrooms,unknown_rate,per_week,per_year,per_month
https://www.domai...,$460,2,1,460.0,,,
https://www.domai...,$515.00,4,2,515.0,,,
https://www.domai...,$550 weekly,5,2,550.0,550.0,,
https://www.domai...,$500.00,2,1,500.0,,,
https://www.domai...,$500,3,1,500.0,,,


Checking to see how many entries have unknown rates.

In [22]:
KNOWN_RATE_COLUMNS = ['per_month', 'per_week', 'per_year']

unknown_rate_condition = F.col(KNOWN_RATE_COLUMNS[0]).isNull()

for col in KNOWN_RATE_COLUMNS[1:]:
    unknown_rate_condition = unknown_rate_condition & F.col(col).isNull()

# apply the filter (+ also filter out where no numbers in text for clarity)
print(f"There are: {df_extracted.filter(unknown_rate_condition).count()} of {df_extracted.count()} total columns with unknown rates.")

There are: 5235 of 13328 total columns with unknown rates.


In [23]:
null_counts = df_extracted.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df_extracted.columns])

# Show the result
null_counts.show()

+---+---------+--------+---------+------------+--------+--------+---------+
|url|cost_text|bedrooms|bathrooms|unknown_rate|per_week|per_year|per_month|
+---+---------+--------+---------+------------+--------+--------+---------+
|  0|        0|       0|        0|           0|    5275|   13323|    12985|
+---+---------+--------+---------+------------+--------+--------+---------+



                                                                                

In [24]:
from pyspark.sql import functions as F

# Create a new column 'rate_type' based on the non-null values in per_week, per_month, and per_year
df_labeled = df_extracted.withColumn(
    "rate_type", 
    F.when(F.col("per_week").isNotNull(), "weekly")
     .when(F.col("per_month").isNotNull(), "monthly")
     .when(F.col("per_year").isNotNull(), "yearly")
     .otherwise(None)
)

df_full = df_labeled.toPandas()
df_null = df_labeled.filter(F.col("rate_type").isNull())

# Drop rows where rate_type is null
df_labeled = df_labeled.filter(F.col("rate_type").isNotNull())

df_pd = df_labeled.toPandas()


In [25]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
import numpy as np

# Prepare the data
X = df_pd[['bedrooms', 'bathrooms', 'unknown_rate']]  # Features
y = df_pd['rate_type']  # Target variable

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Cross-validation function
def evaluate_model(model, X, y):
    scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation
    return np.mean(scores), np.std(scores)

# Models to evaluate
models = {
    "ZeroR": DummyClassifier(strategy="most_frequent"),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVC": SVC(random_state=42)
}

# Evaluate each model using cross-validation
for model_name, model in models.items():
    mean_accuracy, std_accuracy = evaluate_model(model, X, y_encoded)
    print(f"{model_name} - Mean Accuracy: {mean_accuracy:.8f} ± {std_accuracy:.8f}")


ZeroR - Mean Accuracy: 0.99505746 ± 0.00000150
Random Forest - Mean Accuracy: 0.99505715 ± 0.00146302
SVC - Mean Accuracy: 0.99555174 ± 0.00024653


In [26]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Prepare Features (X) and Target (y)
X = df_pd[['bedrooms', 'bathrooms', 'unknown_rate']]  # Features
y = df_pd['rate_type']  # Target variable

# Step 2: Encode Target Variable (rate_type)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Transform categorical target to numerical values

# Step 3: Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Step 4: Create and Train the SVC Model
svc_model = SVC(random_state=42)  # Initialize the model
svc_model.fit(X_train, y_train)  # Train the model

# Step 5: Make Predictions
y_pred = svc_model.predict(X_test)


In [27]:
import pandas as pd

# Assuming df_pd is your original DataFrame with some null rate_type rows
# 1. Filter the rows with missing rate_type
missing_rate_df = df_full[df_full['rate_type'].isnull()]

# Separate the features (e.g., bedrooms, bathrooms, unknown_rate) for prediction
X_missing = missing_rate_df[['bedrooms', 'bathrooms', 'unknown_rate']]

# 2. Predict the rate_type using your trained SVC model
# svc_model is the trained model
predicted_rate_type = svc_model.predict(X_missing)

# 3. Assign the predicted values to the missing_rate_df DataFrame
missing_rate_df['rate_type'] = predicted_rate_type

# 4. Replace the null rate_type values in the original DataFrame with the predicted ones
df_full.loc[df_full['rate_type'].isnull(), 'rate_type'] = missing_rate_df['rate_type']

# Now df_pd has the missing rate_type values filled with the predicted rate_type
df_full.head(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rate_df['rate_type'] = predicted_rate_type


Unnamed: 0,url,cost_text,bedrooms,bathrooms,unknown_rate,per_week,per_year,per_month,rate_type
0,https://www.domain.com.au/1-4-hayley-court-mil...,$460,2,1,460.0,,,,1
1,https://www.domain.com.au/28-barbra-drive-char...,$515.00,4,2,515.0,,,,1
2,https://www.domain.com.au/59-botanic-dr-doncas...,$550 weekly,5,2,550.0,550.0,,,weekly
3,https://www.domain.com.au/5-admiral-place-geel...,$500.00,2,1,500.0,,,,1
4,https://www.domain.com.au/33-burleigh-drive-gr...,$500,3,1,500.0,,,,1


In [28]:
# Mapping numeric values to corresponding strings
rate_type_mapping = {
    1: 'weekly',
    2: 'monthly',
    3: 'yearly'
}

# Apply the mapping to convert any numeric values to strings
df_full['rate_type'] = df_full['rate_type'].replace(rate_type_mapping)

# Now the rate_type column should have only strings: 'weekly', 'monthly', or 'yearly'
df_full.head(5)


Unnamed: 0,url,cost_text,bedrooms,bathrooms,unknown_rate,per_week,per_year,per_month,rate_type
0,https://www.domain.com.au/1-4-hayley-court-mil...,$460,2,1,460.0,,,,weekly
1,https://www.domain.com.au/28-barbra-drive-char...,$515.00,4,2,515.0,,,,weekly
2,https://www.domain.com.au/59-botanic-dr-doncas...,$550 weekly,5,2,550.0,550.0,,,weekly
3,https://www.domain.com.au/5-admiral-place-geel...,$500.00,2,1,500.0,,,,weekly
4,https://www.domain.com.au/33-burleigh-drive-gr...,$500,3,1,500.0,,,,weekly


Since this is a large portion of rents, a future step is to create a model to predict if the rate is monthly, weekly or per year. This will be trained as a supervised model on the known rate types.

In [29]:
# for now, merge everything into one column 

def yearly_to_weekly_rate(yearly_rate):
    """ 
    Converts a yearly rental rate to a weekly rate.
    """
    return yearly_rate / 52.14

def monthly_to_weekly_rate(monthly_rate):
    """
    Converts a monthly rental rate to a weekly rate.
    """
    return yearly_to_weekly_rate(monthly_rate * 12)

In [30]:
import pandas as pd

# Example function to convert rates
def calculate_cost_per_week(row):
    if row['rate_type'] == 'weekly':
        return row['unknown_rate']  # Already weekly rate
    elif row['rate_type'] == 'monthly':
        return monthly_to_weekly_rate(row['unknown_rate'])  # Convert monthly to weekly
    elif row['rate_type'] == 'yearly':
        return yearly_to_weekly_rate(row['unknown_rate'])  # Convert yearly to weekly
    return None

# Apply the function to create the new column 'cost_per_week'
df_full['cost_per_week'] = df_full.apply(calculate_cost_per_week, axis=1)

# Display the updated DataFrame
df_full.head(5)

Unnamed: 0,url,cost_text,bedrooms,bathrooms,unknown_rate,per_week,per_year,per_month,rate_type,cost_per_week
0,https://www.domain.com.au/1-4-hayley-court-mil...,$460,2,1,460.0,,,,weekly,460.0
1,https://www.domain.com.au/28-barbra-drive-char...,$515.00,4,2,515.0,,,,weekly,515.0
2,https://www.domain.com.au/59-botanic-dr-doncas...,$550 weekly,5,2,550.0,550.0,,,weekly,550.0
3,https://www.domain.com.au/5-admiral-place-geel...,$500.00,2,1,500.0,,,,weekly,500.0
4,https://www.domain.com.au/33-burleigh-drive-gr...,$500,3,1,500.0,,,,weekly,500.0


In [31]:
pd_openroute = domain_openRouteServices.toPandas()
df_full.drop(["cost_text", "per_week",
              "per_year", "per_month", 
              "bedrooms", "bathrooms",
              "url", "cost_text", "unknown_rate"], 
              axis = 1, inplace =True)
final_df = pd.concat([pd_openroute, df_full], axis=1)
final_df.head(5)

Unnamed: 0,bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb,...,bank_walking,bank_driving,restaurant_name,restaurant_walking,restaurant_driving,PTV_name,PTV_walking,PTV_driving,rate_type,cost_per_week
0,2,1,1,https://www.domain.com.au/1-4-hayley-court-mil...,"1/4 Hayley court, Mill Park VIC 3082",$460,Townhouse,-37.664825,145.040977,Mill Park VIC 3082,...,24.19,24.19,Amalfi Pizza & Pasta,4.06,16.48,Broadmeadows Railway Station (Broadmeadows),3.27,37.47,weekly,460.0
1,4,2,2,https://www.domain.com.au/28-barbra-drive-char...,"28 Barbra Drive, Charlemont VIC 3217",$515.00,House,-38.214272,144.3557042,Charlemont VIC 3217,...,11.13,12.74,Town and Country Pizza,16.99,16.99,Marshall Railway Station (Marshall),1.74,15.65,weekly,515.0
2,5,2,2,https://www.domain.com.au/59-botanic-dr-doncas...,"59 Botanic Dr, Doncaster VIC 3108",$550 weekly,House,-37.7784419,145.1387232,Doncaster VIC 3108,...,20.63,20.63,Rajam Restaurant,24.38,27.31,Caulfield Railway Station (Caulfield East),15.97,29.84,weekly,550.0
3,2,1,1,https://www.domain.com.au/5-admiral-place-geel...,"5 Admiral Place, Geelong VIC 3220",$500.00,Townhouse,-38.1525946,144.3676351,Geelong VIC 3220,...,8.67,22.33,Eastern Spice,11.64,11.64,South Geelong Railway Station (South Geelong),0.11,0.11,weekly,500.0
4,3,1,2,https://www.domain.com.au/33-burleigh-drive-gr...,"33 Burleigh Drive, Grovedale VIC 3216",$500,House,-38.2075368,144.3481287,Grovedale VIC 3216,...,11.13,12.74,Real Thai Cafe,18.17,18.17,Marshall Railway Station (Marshall),1.74,15.65,weekly,500.0


In [32]:
final_df.dtypes

bedrooms                int64
bathrooms               int64
car_parks               int64
url                    object
name                   object
cost_text              object
type                   object
latitude               object
longitude              object
suburb                 object
postcode               object
rent_pw                 int64
school_name            object
school_walking        float64
school_driving        float64
hospital_name          object
hospital_walking      float64
hospital_driving      float64
doctors_name           object
doctors_walking       float64
doctors_driving       float64
bank_name              object
bank_walking          float64
bank_driving          float64
restaurant_name        object
restaurant_walking    float64
restaurant_driving    float64
PTV_name               object
PTV_walking           float64
PTV_driving           float64
rate_type              object
cost_per_week         float64
dtype: object

This section is for debugging purposes. Use this to see if there's any phrases still not being matched.

In [41]:
final_df_spark = spark.createDataFrame(final_df)

# write to raw layer
import os

output_directory = '../data/raw/domain_cost_per_week'
os.makedirs(output_directory, exist_ok=True)

final_df_spark.write.mode('overwrite').parquet(output_directory)

                                                                                

In [39]:
# Count nulls in each column
null_counts = df_full.isnull().sum()

# Display the result
print(null_counts)

rate_type        0
cost_per_week    0
dtype: int64
