# Properties data preprocessing

In [21]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

# read in data
df = pd.read_json("../data/landing/property.json")

# properties datashape
df.shape

(5, 12014)

In [22]:
properties_df = df.transpose()
properties_df.head()

Unnamed: 0,name,cost_text,coordinates,rooms,desc
https://www.domain.com.au/904-265-exhibition-street-melbourne-vic-3000-16637685,904/265 Exhibition Street Melbourne VIC 3000,$850 Per Week,"[-37.8095116, 144.9691204]","[2 Beds, 2 Baths, 1 Parking]",Darren Krause
https://www.domain.com.au/210-422-collins-street-melbourne-vic-3000-16637229,210/422 Collins Street Melbourne VIC 3000,$475 Per Week,"[-37.8170971, 144.9601487]","[1 Bed, 1 Bath]",Annalise Zamparo
https://www.domain.com.au/1902-200-spencer-street-melbourne-vic-3000-16636968,1902/200 Spencer Street Melbourne VIC 3000,$630 pw,"[-37.816228, 144.9532465]","[2 Beds, 1 Bath, 1 Parking]",Paola Faba
https://www.domain.com.au/312b-399-bourke-street-melbourne-vic-3000-16096996,312B/399 Bourke Street Melbourne VIC 3000,$450,"[-37.8147259, 144.9621291]","[1 Bed, 1 Bath]",Natalie Subotsch
https://www.domain.com.au/3313-228-la-trobe-street-melbourne-vic-3000-16636845,3313/228 La Trobe Street Melbourne VIC 3000,$900,"[-37.8096052, 144.962371]","[2 Beds, 1 Bath, 2 Parking]",Paula Tran


In [23]:
def extract_rooms(rooms_list):
    """
    extract the nested rooms_list into beds, baths and parking
    """
    beds, baths, parkings = None, None, None
    for item in rooms_list:
        if 'Bed' in item:
            beds = int(item.split(' ')[0])
        elif 'Bath' in item:
            baths = int(item.split(' ')[0])
        elif 'Parking' in item:
            parkings = int(item.split(' ')[0])
    return beds, baths, parkings

In [24]:
# Applying the function and splitting the results
properties_df['beds'], properties_df['baths'], properties_df['parkings'] = zip(*properties_df['rooms'].apply(lambda x: extract_rooms(x)))
properties_df.drop('rooms', axis=1, inplace=True)

In [25]:
properties_df.head()

Unnamed: 0,name,cost_text,coordinates,desc,beds,baths,parkings
https://www.domain.com.au/904-265-exhibition-street-melbourne-vic-3000-16637685,904/265 Exhibition Street Melbourne VIC 3000,$850 Per Week,"[-37.8095116, 144.9691204]",Darren Krause,2.0,2.0,1.0
https://www.domain.com.au/210-422-collins-street-melbourne-vic-3000-16637229,210/422 Collins Street Melbourne VIC 3000,$475 Per Week,"[-37.8170971, 144.9601487]",Annalise Zamparo,1.0,1.0,
https://www.domain.com.au/1902-200-spencer-street-melbourne-vic-3000-16636968,1902/200 Spencer Street Melbourne VIC 3000,$630 pw,"[-37.816228, 144.9532465]",Paola Faba,2.0,1.0,1.0
https://www.domain.com.au/312b-399-bourke-street-melbourne-vic-3000-16096996,312B/399 Bourke Street Melbourne VIC 3000,$450,"[-37.8147259, 144.9621291]",Natalie Subotsch,1.0,1.0,
https://www.domain.com.au/3313-228-la-trobe-street-melbourne-vic-3000-16636845,3313/228 La Trobe Street Melbourne VIC 3000,$900,"[-37.8096052, 144.962371]",Paula Tran,2.0,1.0,2.0


In [26]:
def convert_to_weekly(price_text):
    """ 
    Converting price to weekly rent
    """
    try:
        # Identifying common text for different frequencies
        week = ["pw", "PW", "week", "/w", " pw"]
        month = ["pcm", "PCM", "pm", "PM", "mth", "month", "per month", "/mo"]
        annual = ["p.a", "pa", "annually"]
        
        # Check for range indications
        if " to " in price_text:
            return np.nan

        # Extract the first numerical value for the price, correctly handling commas and decimals
        price = float(re.search(r'(\d+\.\d+|\.\d+|\d+)', price_text.replace(',', '')).group(1))

        
        # Check for frequency and convert to weekly
        if any(ext in price_text.lower() for ext in week):
            return price
        elif any(ext in price_text.lower() for ext in month):
            return price / 4.33
        elif any(ext in price_text.lower() for ext in annual):
            return price / 52
        else:
            return price  # default to weekly if no specific frequency is mentioned
    except:
        return np.nan

# Apply the function to the cost_text column
properties_df['weekly_rent'] = properties_df['cost_text'].apply(convert_to_weekly)

# Create invalid df with rows that have NaN in weekly_rent
invalid_df = properties_df[properties_df['weekly_rent'].isna()]

# Drop these rows from the main dataframe
properties_df = properties_df.dropna(subset=['weekly_rent'])

In [27]:
properties_df.shape

(11854, 8)

In [20]:
# inspecting properties with rent over 10000
properties_df[properties_df['weekly_rent'] >= 10000]

Unnamed: 0,name,cost_text,coordinates,desc,beds,baths,parkings,weekly_rent
https://www.domain.com.au/2107-228-la-trobe-street-melbourne-vic-3000-15791133,2107/228 La Trobe Street Melbourne VIC 3000,"$480,000 - $500,000","[-37.8097514, 144.9626141]",* Unverified feature,1.0,1.0,,480000.0
https://www.domain.com.au/34-2-fastline-rd-truganina-vic-3029-16311935,34/2 fastline rd Truganina VIC 3029,contact 0477796377,"[-37.8079496, 144.7374707]",nbn® Fibre to the Premises (FTTP) is available...,1.0,1.0,,477796377.0
https://www.domain.com.au/37-8-graham-street-port-melbourne-vic-3207-15213601,37/8 Graham Street Port Melbourne VIC 3207,"$640,00 per week","[-37.84207500000001, 144.9444458]",* Unverified feature,2.0,1.0,2.0,64000.0
https://www.domain.com.au/7-recreation-road-mount-clear-vic-3350-16326287,7 Recreation Road Mount Clear VIC 3350,"$1,050,000","[-37.6035207, 143.8699667]",nbn® Fibre to the Premises (FTTP) is available...,4.0,2.0,2.0,1050000.0
https://www.domain.com.au/8-chamberlain-road-redcastle-vic-3523-16505678,8 Chamberlain Road Redcastle VIC 3523,"$28,000 Per Year !!","[-36.7247292, 144.7609424]",Century 21 Paramount Realtors proudly presents...,,,,28000.0
https://www.domain.com.au/2-3-schuss-street-falls-creek-vic-3699-16370571,2/3 Schuss Street Falls Creek VIC 3699,"$49,500 for the season","[-36.8649138, 147.2778861]",* Unverified feature,4.0,1.0,,49500.0
https://www.domain.com.au/2-3-schuss-street-falls-creek-vic-3699-16360345,2/3 Schuss Street Falls Creek VIC 3699,"$49,500 for the season","[-36.8649138, 147.2778861]",* Unverified feature,4.0,1.0,,49500.0
https://www.domain.com.au/28a-mccartin-street-leongatha-vic-3953-16157644,28A McCartin Street Leongatha VIC 3953,"$11,471.00 exc GST","[-38.4767246, 145.9449292]",nbn® Fibre to the Node (FTTN) is available in ...,1.0,1.0,,11471.0


In [28]:
# remove invalid records
# it appears that the prperties has some unhandled cases, but given that the we have a large amount of data
# we decided to drop them
filtered_df = properties_df[(properties_df['beds'] > 0) & (properties_df['baths'] > 0) & (properties_df['weekly_rent'] < 10000)]

In [29]:
# Drop duplicate rows based on the 'name' column
filtered_df = filtered_df.drop_duplicates(subset='name', keep='first')

# Fill NaN values in the 'parkings' column with 0
filtered_df['parkings'] = filtered_df['parkings'].fillna(0)


In [30]:
filtered_df.shape

(11542, 8)

In [31]:
filtered_df.to_csv("../data/raw/properties.csv")