# Properties data preprocessing

In [204]:
import re

import numpy as np
import pandas as pd

# Read the property listings from JSON. The recorded shape below, (5, 12014),
# shows the raw frame has far more columns than rows.
df = pd.read_json("../data/property.json")
df.shape

(5, 12014)

In [205]:
# Flip rows and columns so the five fields (name, cost_text, coordinates,
# rooms, desc) become columns and each row is one listing.
properties_df = df.T
properties_df.head()


Unnamed: 0,name,cost_text,coordinates,rooms,desc
https://www.domain.com.au/904-265-exhibition-street-melbourne-vic-3000-16637685,904/265 Exhibition Street Melbourne VIC 3000,$850 Per Week,"[-37.8095116, 144.9691204]","[2 Beds, 2 Baths, 1 Parking]",Darren Krause
https://www.domain.com.au/210-422-collins-street-melbourne-vic-3000-16637229,210/422 Collins Street Melbourne VIC 3000,$475 Per Week,"[-37.8170971, 144.9601487]","[1 Bed, 1 Bath]",Annalise Zamparo
https://www.domain.com.au/1902-200-spencer-street-melbourne-vic-3000-16636968,1902/200 Spencer Street Melbourne VIC 3000,$630 pw,"[-37.816228, 144.9532465]","[2 Beds, 1 Bath, 1 Parking]",Paola Faba
https://www.domain.com.au/312b-399-bourke-street-melbourne-vic-3000-16096996,312B/399 Bourke Street Melbourne VIC 3000,$450,"[-37.8147259, 144.9621291]","[1 Bed, 1 Bath]",Natalie Subotsch
https://www.domain.com.au/3313-228-la-trobe-street-melbourne-vic-3000-16636845,3313/228 La Trobe Street Melbourne VIC 3000,$900,"[-37.8096052, 144.962371]","[2 Beds, 1 Bath, 2 Parking]",Paula Tran


In [206]:
def extract_rooms(rooms_list):
    """Parse a listing's room feature strings into bed, bath and parking counts.

    Parameters
    ----------
    rooms_list : iterable of str, or a missing value
        Feature strings such as ``"2 Beds"``, ``"1 Bath"`` or ``"1 Parking"``.
        A non-iterable value (``None``, ``NaN``) is treated as "no features".

    Returns
    -------
    tuple
        ``(beds, baths, parkings)``. Each entry is an ``int``, or ``None``
        when the feature is absent or its count cannot be parsed.
    """
    counts = {'Bed': None, 'Bath': None, 'Parking': None}

    # A missing `rooms` value is not iterable; report "nothing known" instead
    # of aborting the whole column-wise apply.
    try:
        items = iter(rooms_list)
    except TypeError:
        return None, None, None

    for item in items:
        if not isinstance(item, str):
            continue
        # Labels are tried in Bed -> Bath -> Parking order and only the first
        # matching label is used for an item.
        for label in counts:
            if label in item:
                try:
                    # The count is the leading token, e.g. "2" in "2 Beds".
                    counts[label] = int(item.split(' ')[0])
                except ValueError:
                    # Non-numeric leading token: keep any earlier value.
                    pass
                break

    return counts['Bed'], counts['Bath'], counts['Parking']

In [207]:
# Parse every listing's room strings, then unzip the resulting
# (beds, baths, parkings) tuples into three separate columns.
(
    properties_df['beds'],
    properties_df['baths'],
    properties_df['parkings'],
) = zip(*properties_df['rooms'].apply(extract_rooms))

# The raw list column is no longer needed once the counts are extracted.
properties_df.drop(columns=['rooms'], inplace=True)

In [208]:
# Preview the frame: `rooms` has been dropped and the beds / baths / parkings
# columns added.
properties_df.head()

Unnamed: 0,name,cost_text,coordinates,desc,beds,baths,parkings
https://www.domain.com.au/904-265-exhibition-street-melbourne-vic-3000-16637685,904/265 Exhibition Street Melbourne VIC 3000,$850 Per Week,"[-37.8095116, 144.9691204]",Darren Krause,2.0,2.0,1.0
https://www.domain.com.au/210-422-collins-street-melbourne-vic-3000-16637229,210/422 Collins Street Melbourne VIC 3000,$475 Per Week,"[-37.8170971, 144.9601487]",Annalise Zamparo,1.0,1.0,
https://www.domain.com.au/1902-200-spencer-street-melbourne-vic-3000-16636968,1902/200 Spencer Street Melbourne VIC 3000,$630 pw,"[-37.816228, 144.9532465]",Paola Faba,2.0,1.0,1.0
https://www.domain.com.au/312b-399-bourke-street-melbourne-vic-3000-16096996,312B/399 Bourke Street Melbourne VIC 3000,$450,"[-37.8147259, 144.9621291]",Natalie Subotsch,1.0,1.0,
https://www.domain.com.au/3313-228-la-trobe-street-melbourne-vic-3000-16636845,3313/228 La Trobe Street Melbourne VIC 3000,$900,"[-37.8096052, 144.962371]",Paula Tran,2.0,1.0,2.0


In [209]:
def convert_to_weekly(price_text):
    """Convert a free-text rental price into a weekly amount.

    The first number in the text is taken as the price. The payment frequency
    is detected case-insensitively from keywords in the text; when none is
    found the price is assumed to already be weekly.

    Parameters
    ----------
    price_text : str
        Advertised price, e.g. ``"$850 Per Week"``, ``"$1,297 per month"``.

    Returns
    -------
    float
        Weekly rent, or ``np.nan`` when the value is not a string, contains
        no number, or describes a range (``"$400 to $450"``).
    """
    # Missing or non-text prices cannot be parsed.
    if not isinstance(price_text, str):
        return np.nan

    text = price_text.lower()

    # A range has no single price to report.
    if " to " in text:
        return np.nan

    # First number in the text, with thousands separators removed so that
    # "1,297" is read as 1297.
    found = re.search(r'\d+\.\d+|\.\d+|\d+', text.replace(',', ''))
    if found is None:
        return np.nan
    price = float(found.group())

    # Frequency markers that are matched as plain substrings.
    week = ("pw", "week", "/w")
    month = ("pcm", "mth", "month")
    annual = ("p.a", "annually")

    # "pm" and "pa" are short enough to occur inside ordinary words
    # ("equipment", "parking"), so they only count when not next to a letter.
    has_pm = re.search(r'(?<![a-z])pm(?![a-z])', text) is not None
    has_pa = re.search(r'(?<![a-z])pa(?![a-z])', text) is not None

    # Weekly markers win over monthly/annual ones when several are present.
    if any(marker in text for marker in week):
        return price
    if has_pm or any(marker in text for marker in month):
        return price / 4.33  # approximate number of weeks in a month
    if has_pa or any(marker in text for marker in annual):
        return price / 52
    return price  # default to weekly if no specific frequency is mentioned

# Parse every listing's advertised price into a weekly rent figure.
properties_df['weekly_rent'] = properties_df['cost_text'].apply(convert_to_weekly)

# Set aside the listings whose price could not be parsed (weekly_rent is NaN)...
invalid_df = properties_df.loc[properties_df['weekly_rent'].isna()]

# ...and keep only the successfully parsed listings in the main dataframe.
properties_df = properties_df.loc[properties_df['weekly_rent'].notna()]

In [202]:
import re
import numpy as np

def convert_to_weekly(price_text):
    """Convert a free-text rental price into a weekly amount.

    Frequency keywords are matched case-sensitively. Unlike a "default to
    weekly" parser, text that contains letters but no recognised frequency
    or descriptive keyword is rejected.

    Parameters
    ----------
    price_text : str
        Advertised price, e.g. ``"$850 Per Week"``, ``"$1,297 per month"``.

    Returns
    -------
    float
        Weekly rent, or ``np.nan`` when the value is not a string, contains
        no number, or its frequency cannot be determined.
    """
    # Missing or non-text prices cannot be parsed.
    if not isinstance(price_text, str):
        return np.nan

    # Identifying common text for different frequencies
    week = ["/w", "pw", "PW", "Week", "week", "w.", "w,", "PER WEEK", "/W", "wk", "p.w", "/ WK", "/ WEEK"]
    month = ["pcm", "PCM", "mth", "month", "Month", "MONTH", "m.", "m,"]
    annual = ["p.a", "Annually"]
    # "pm"/"PM" and "pa" are short enough to occur inside ordinary words
    # ("equipment", "parking"), so they only count when not next to a letter.
    month_abbrev = re.compile(r'(?<![A-Za-z])(?:pm|PM)(?![A-Za-z])')
    annual_abbrev = re.compile(r'(?<![A-Za-z])pa(?![A-Za-z])')

    # Extract every number; the first alternative keeps comma-grouped
    # thousands ("1,297", "1,297.50") together as a single amount.
    amounts = [
        float(token.replace(',', ''))
        for token in re.findall(r'\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+\.\d+|\d+', price_text)
    ]
    if not amounts:
        return np.nan
    first_amount = amounts[0]

    # Weekly keyword, or a bare figure with no letters at all -> already weekly.
    if any(keyword in price_text for keyword in week) or not any(char.isalpha() for char in price_text):
        return first_amount

    if any(keyword in price_text for keyword in month) or month_abbrev.search(price_text):
        # "<weekly> / <monthly> pcm": with a '/' and two amounts the first
        # one is taken as the weekly figure.
        if '/' in price_text and len(amounts) > 1:
            return first_amount
        return first_amount / 4.33  # approximate number of weeks in a month

    if any(keyword in price_text for keyword in annual) or annual_abbrev.search(price_text):
        return first_amount / 52  # Convert yearly to weekly

    # If there are descriptive words without specified frequency, assume it as weekly rent
    descriptions = ["furnished", "unfurnished", "bedroom", "spot", "renovated"]
    if any(desc.lower() in price_text.lower() for desc in descriptions):
        return first_amount
    return np.nan


# NOTE(review): this cell repeats the apply / split / drop steps of an earlier
# cell, and its execution count (202) is lower than the cells around it, so it
# looks like a stale leftover — confirm which version is intended to run.
# On a top-to-bottom run it executes on a `properties_df` that has already had
# its NaN weekly_rent rows dropped, and it overwrites `weekly_rent` and
# `invalid_df`.

# Apply the function to the cost_text column
properties_df['weekly_rent'] = properties_df['cost_text'].apply(convert_to_weekly)

# Create invalid df with rows that have NaN in weekly_rent
invalid_df = properties_df[properties_df['weekly_rent'].isna()]

# Drop these rows from the main dataframe
properties_df = properties_df.dropna(subset=['weekly_rent'])

In [210]:
# Write the cost_text of the rows whose weekly rent could not be parsed to a
# CSV file.
invalid_df['cost_text'].to_csv("../data/test.csv")

In [211]:
# Listings recorded with fewer than one bedroom; rows with a missing bed
# count are not selected.
no_beds = properties_df.loc[properties_df['beds'].lt(1)]

In [212]:
# Display the listings with fewer than one bedroom.
no_beds

Unnamed: 0,name,cost_text,coordinates,desc,beds,baths,parkings,weekly_rent
https://www.domain.com.au/2805-220-spencer-street-melbourne-vic-3000-16630441,2805/220 Spencer Street Melbourne VIC 3000,$430 per week,"[-37.815781, 144.9529156]",Paula Tran,0.0,1.0,,430.000000
https://www.domain.com.au/100-19-exploration-lane-melbourne-vic-3000-16615038,100/19 Exploration Lane Melbourne VIC 3000,"FROM $480PW, FURNITURE AND BILLS INCLUDED","[-37.8089339, 144.9680888]",Exploration Lane Community Manager,0.0,1.0,,480.000000
https://www.domain.com.au/2105s-220-spencer-street-melbourne-vic-3000-16620782,2105s/220 Spencer Street Melbourne VIC 3000,$430 Per Week,"[-37.815781, 144.9529156]",Paula Tran,0.0,1.0,,430.000000
https://www.domain.com.au/4206-220-spencer-street-melbourne-vic-3000-16607288,4206/220 Spencer Street Melbourne VIC 3000,$430 per week,"[-37.815781, 144.9529156]",Paula Tran,0.0,1.0,,430.000000
https://www.domain.com.au/236-la-trobe-street-melbourne-vic-3000-16142465,236 La Trobe Street Melbourne VIC 3000,"Furnished, all inclusive modern studios from $...","[-37.8098249, 144.9624192]",nbn® Fibre to the Premises (FTTP) is available...,0.0,1.0,,650.000000
...,...,...,...,...,...,...,...,...
https://www.domain.com.au/128-murray-valley-highway-yarrawonga-vic-3730-16418231,128 Murray Valley Highway Yarrawonga VIC 3730,$280,"[-36.020273, 145.9908219]",* Unverified feature,0.0,2.0,,280.000000
https://www.domain.com.au/16-echidna-street-loch-sport-vic-3851-16616900,16 Echidna Street Loch Sport VIC 3851,$280 per week,"[-38.037121, 147.5988684]",nbn® Fibre to the Node (FTTN) is available in ...,0.0,1.0,,280.000000
https://www.domain.com.au/4-10-duiker-court-langwarrin-vic-3910-16627726,4/10 Duiker court Langwarrin VIC 3910,"$1,297 per month","[-38.1406873, 145.1757278]",* Unverified feature,0.0,1.0,,299.538106
https://www.domain.com.au/2-52-childers-street-cranbourne-vic-3977-13535264,2/52 Childers Street Cranbourne VIC 3977,$320 per week,"[-38.112789, 145.289034]",Offering the ultimate in first-class accommoda...,0.0,1.0,1.0,320.000000
