## Helpers

In [200]:
# Helper variables
street_types = ['street', 'st', 'drive', 'dr', 'lane', 'road', 'rd', 'avenue', 'ave', 'av', 'estate']
street_types_regex = '|'.join(street_types)

mapping = {
    'House': 'House',
    'Apartment / Unit / Flat': 'Apartment',
    'Townhouse': 'House',
    'Studio': 'Apartment', 
    'Villa': 'House',
    'New House & Land': 'House',
    'New Apartments / Off the Plan': 'Apartment',
    'Semi-Detached': 'House',
    'Duplex': 'House',
    'Terrace': 'House',
    'Block of Units': 'Apartment'} ## double check whether apartment or house

# Helper functions

def extract_parkings(text):
    ''' A function that returns the number of parking spaces'''
    match = re.search(r'\d+', text)
    if match:
        return int(match.group())
    else:
        return 0

def extract_bed_bath(rooms):
    ''' A function that extracts the number of bedrooms and bathrooms'''
    bed_match = re.search(r"(\d+)\s*Bed", rooms)
    bath_match = re.search(r"(\d+)\s*Bath", rooms)

    bed = int(bed_match.group(1)) if bed_match else None
    bath = int(bath_match.group(1)) if bath_match else None
    
    return pd.Series([bed, bath])

def contains_number(text):
    ''' A function that returns True if the text contains numbers'''
    return bool(re.search(r'\d+', text))

def remove_postcode(address):
    '''A function that removes all instances of the postcode'''
    return re.sub(r'\b\d{4}\b', '', address).strip()

def insert_commas(address):
    '''A function that inserts a comma after street types due to inconsistencies in address formats'''
    
    # Create a regex pattern to match any of the street types followed by a space (to check for the next word)
    pattern = r'(\b(?:' + street_types_regex + r')\b)(\s)'
    
    # Insert a comma after the matched street type
    return re.sub(pattern, r'\1,\2', address, flags=re.IGNORECASE)

def extract_suburb(address):
    '''A function that extracts the suburb of an address'''

    # Extract the part before 'VIC'
    vic_part = re.search(r'(.+?)\bVIC\b', address)

    if vic_part:
        # Get everything before VIC and split by commas or street types
        pre_vic = vic_part.group(1)

        # Split by street types or commas
        parts = re.split(f',|\\b(?:{street_types_regex})\\b', pre_vic)

        # Take the last part, which would be just the suburb
        return parts[-1].strip()
        
    return None

def contains_furnished_and_unfurnished(text):
    '''A function that returns True when the rental price specifies both furnished and unfurnished options'''
    furnished = re.search(r'\bfurnished\b', text, re.IGNORECASE)
    unfurnished = re.search(r'\bunfurnished\b', text, re.IGNORECASE)
    return bool(furnished and unfurnished)

def is_price_furnished(text):
    '''A function that returns indicates whether the price specficies furnishings'''
    text = text.lower()
    furnished_pattern = re.compile(r'\bfurnished\b', re.IGNORECASE)
    unfurnished_pattern = re.compile(r'\bunfurnished\b', re.IGNORECASE)
    if unfurnished_pattern.search(text):
        return False
    if furnished_pattern.search(text):
        return True
    return None

# We assume that if there's an asterisk that comes after furnished, then it's unfurnished due to the unverifiability 
def is_feature_furnished(features_str):
    '''A function that determines the features indicate whether a property is furnished'''
    pattern = r'\bFurnished\b(?!\s*\*)'
    if features_str.strip() == "[]":
        return None
    return bool(re.search(pattern, features_str, re.IGNORECASE))

def is_desc_furnished(desc):
    '''A function that determines whether a property is furnished from the descriptions'''
    desc = desc.lower()
    if 'not furn' in desc or 'unfurn' in desc:
        return False
    elif 'furnished' in desc:
        return True
    else:
        return None

## Start

In [399]:
import warnings
import regex as re
import pandas as pd

# Suppress all warnings
warnings.filterwarnings('ignore')

# Read in the csv file (14073 entries)
properties = pd.read_csv('../data/landing/scraped_properties.csv')

# Drop url and availability columns (irrelevant to analysis)
properties = properties.drop(columns=['url', 'availability'])

# Drop properties not in Victoria (3 properties were in NSW)
properties = properties[properties['address'].str.contains('VIC')]

# Extract the postcode of the adress
properties['postcode'] = properties['address'].str.extract(r'(\d{4})$')

# Insert commas after street names to ensure consistency
properties['address'] = properties['address'].apply(insert_commas)

# Remove all instances of the postcode from the address
properties['address'] = properties['address'].apply(remove_postcode)

# Extract the suburb of the address
properties['suburb'] = properties['address'].apply(extract_suburb)

# Get rid of address as we've extracted the main information
properties = properties.drop('address', axis = 1)

# Get rid of properties that do not specify rental price (~139)
properties = properties[properties['rental_price'].apply(contains_number)]

# Only keep apartments and houses (~37)
properties['property_type'] = properties['property_type'].map(mapping)
properties = properties[~properties['property_type'].isna()]

# Remove properties that are shipping containers (~4)
properties = properties[~properties['desc'].str.contains('shipping', case=False)]

# Remove properties with 0 information on number of rooms (~23)
properties = properties[properties['rooms'] != '[]']

# Extract the number of bedrooms and bathrooms per property
properties[['num_bed', 'num_bath']] = properties['rooms'].apply(extract_bed_bath)

# Remove properties with 0 bedrooms or 0 bathrooms (~131)
properties = properties[properties['num_bed'] != 0]
properties = properties[properties['num_bath'] != 0]

# Get rid of the rooms feature
properties = properties.drop('rooms', axis = 1)

# Extract the number of parking spaces and remove the original column
properties['num_parkings'] = properties['parking'].apply(extract_parkings)
properties = properties.drop('parking', axis = 1)

# Get rid of properties with more than 12 parking spaces (~2)
properties = properties[properties['num_parkings'] <= 12]

# Copy for testing (13734)
test = properties.copy()

In [418]:
# Find if the property offers both furnished and unfurnished options
test['contains_both'] = test['rental_price'].apply(contains_furnished_and_unfurnished)

# Determine if a property is furnished from the price column
test['price_furnished'] = test['rental_price'].apply(is_price_furnished)

# Determine if a property is furnished from the features column
test['features_furnished'] = test['features'].apply(is_feature_furnished)

# Determine if a property is furnished from the description column
test['desc_furnished'] = test['desc'].apply(is_desc_furnished)

# If any of the price, features or description explicitly mention furnished, we assume furnished
test['furnished'] = test[['price_furnished', 'features_furnished', 'desc_furnished']].any(axis=1)

In [None]:
# get rid of anything above 5000

In [281]:
# Extract the numerical value of the rental price 
test['rental_price_numbers'] = test['rental_price'].str.extract(r'(\d+(?:,\d{3})*(?:\.\d+)?)')
test['rental_price_numbers'] = test['rental_price_numbers'].str.replace(',', '')
test['rental_price_numbers'] = test['rental_price_numbers'].astype(float)

test[test['rental_price_numbers'] >= 4000]

Unnamed: 0,address,rental_price,features,desc,bond,property_type,coordinates,postcode,suburb,num_bed,num_bath,num_parkings,rental_price_numbers
997,"7 Jeffcott Street, West Melbourne VIC","$9,999 pw",[],This ad is for Off - Market Property Applicati...,,Apartment,"-37.8116713,144.9532276",3003,West Melbourne,5,5,5,9999.0
1193,"/1 Queensbridge Square, Southbank VIC","$5,750.00",[],Luxurious 2-bedroom apartment plus study on th...,$35000,Apartment,"-37.821734,144.9621361",3006,Southbank,2,2,2,5750.0
1407,"/35 Queensbridge Street, Southbank VIC","$12,500 pw","['Air conditioning', 'Heating', 'Area Views', ...",Unparalleled space and bathed in natural light...,$75000,Apartment,"-37.8228837,144.9612147",3006,Southbank,4,4,2,12500.0
1769,"110 Beevers Street, Footscray VIC","$95,000.00","['Internal Laundry*', 'Broadband internet acce...",Situated in central Footscray and within walki...,,Apartment,"-37.7958662,144.9059775",3011,Footscray,12,3,6,95000.0
3213,"28A/300 Point Cook Road, Point Cook VIC",5000,[],AAAAAA,$500,House,"-37.8953338,144.7526675",3030,Point Cook,2,2,6,5000.0
5253,"2/4 Lahinch Street, Preston VIC",$57240 per week,"['Built in wardrobes*', 'Gas*', 'Secure Parkin...",Barry Plant Inner North welcome you to inspect...,$2477,House,"-37.7455858,145.0174923",3072,Preston,2,2,1,57240.0
7215,"G01/16 Kurneh Place, South Yarra VIC",$4500 Per Week,['Furnished'],Partially furnished at $4500 Per WeekIncluding...,$27000,Apartment,"-37.8342289,144.9845427",3141,South Yarra,3,3,3,4500.0
7502,"14A Ethel Street, Malvern VIC","Fully Furn - $5000 p/w, 3-Mos+ Lease, Avail No...","['Fireplace(s)*', 'Air conditioning', 'In grou...",Phone enquiry code for this property : 1013,$20000,House,"-37.8480587,145.0380481",3144,Malvern,4,4,2,5000.0
12574,"20 High Plains Road, Bogong VIC","$45,000 for winter season",[],Be part of the brand-new reenergised Bogong Vi...,$1000,House,"-36.8047187,147.2236166",3699,Bogong,3,2,4,45000.0
12575,"House 2 High Plains Road, Bogong VIC","$45,000 for winter season",['Balcony / Deck*'],Be part of the brand-new reenergised Bogong Vi...,$1000,House,"-36.8047187,147.2236166",3699,Bogong,3,2,4,45000.0


In [None]:
# Rental price with just the number and no text, no dollar sign
test[test['rental_price'].str.match(r'^\d+$')]

In [None]:
test['word_count'] = test['desc'].apply(lambda x: len(x.split()))

test[test['word_count'] == 1]

In [138]:
# if the description is just one word - get rid
test[len(test['desc'])) == 1]

TypeError: descriptor 'split' for 'str' objects doesn't apply to a 'Series' object

In [169]:

test[test['parking'] == "['12 Parking']"]

Unnamed: 0,url,address,rental_price,rooms,parking,features,desc,bond,property_type,coordinates,postcode,suburb,word_count,rental_price_numbers
2741,https://www.domain.com.au/797-sayers-road-hopp...,"797 Sayers Road, Hoppers Crossing VIC",$650,"['3 Beds', '2 Baths']",['12 Parking'],"['Secure Parking*', 'Fully fenced*']",Centrally located and full of potential - 797 ...,$2824,House,"-37.8467795,144.6702628",3029,Hoppers Crossing,33,650.0
6617,https://www.domain.com.au/224-burwood-highway-...,"224 Burwood Highway, Burwood VIC",APPLY NOW - from $385 per week,"['1 Bed', '1 Bath']",['12 Parking'],"['Heating*', 'Air conditioning', 'Balcony / De...",HUGE - 39-41sqm 1 Bedroom Apartment and 58+sqm...,,Apartment / Unit / Flat,"-37.8502085,145.1143992",3125,Burwood,13,385.0
12909,https://www.domain.com.au/75-75a-belgrave-hall...,"75 & 75A Belgrave Hallam Road, Hallam VIC",$1500 pw,"['6 Beds', '3 Baths']",['12 Parking'],"['Internal Laundry*', 'Fireplace(s)*', 'Shed*'...",Experience the epitome of luxury living in thi...,$6518,House,"-37.9960963,145.2750106",3803,Hallam,57,1500.0


In [101]:
test[test['rental_price'].str.contains('partly', case=False)]

Unnamed: 0,url,rental_price,rooms,parking,features,desc,availability,bond,property_type,coordinates,postcode,suburb
68,https://www.domain.com.au/6901-462-elizabeth-s...,$800/w PARTLY-FURNSIHED,"['2 Beds', '2 Baths']",['− Parking'],"['Gym', 'Intercom', 'Floorboards', 'Built in w...",Victoria One is an icon on the corner of Eliza...,Available Now,$3476,Apartment / Unit / Flat,"-37.807859,144.960611",3000,Melbourne
261,https://www.domain.com.au/627-118-franklin-str...,$480/week - partly furnished,"['0 Beds', '1 Bath']",['− Parking'],"['Bath*', 'Heating*', 'Balcony / Deck*', 'Buil...",Experience prime city living in this well main...,"Available fromThursday, 19th September 2024",$2086,Studio,"-37.8082052,144.9589035",3000,Melbourne
267,https://www.domain.com.au/1701-228-a-beckett-s...,$750 per week Partly Furnished,"['2 Beds', '2 Baths']",['1 Parking'],"['Alarm System', 'Intercom', 'In ground pool',...",his beautiful partly furnished 2 bedroom apart...,"Available fromFriday, 6th September 2024",$3259,Apartment / Unit / Flat,"-37.8102641,144.9566396",3000,Melbourne
4255,https://www.domain.com.au/1310-28-44-bouverie-...,$550 pw partly furnished with white goods,"['1 Bed', '1 Bath']",['− Parking'],"['Air conditioning', 'Built in wardrobes', 'Di...",This immaculately presented 2 bedrooms or 1 be...,"Available fromWednesday, 18th September 2024",$2390,Apartment / Unit / Flat,"-37.8056339,144.9618379",3053,Carlton


In [347]:
import pandas as pd
import re

# Example DataFrame
data = {
    'features': [
        "['Heating*', 'Study*', 'Gym', 'Furnished']",
        "['Heating*', 'Study*', 'Gym']",
        "['Furnished*', 'City Views']",
        "['Balcony', 'Dishwasher', 'Furnished']",
        "[]",
        "['Furnished']",
        "['Unfurnished']"
    ]
}

df = pd.DataFrame(data)



# Apply the function to the 'features' column
df['features_furnished'] = df['features'].apply(is_furnished)

print(df)


                                     features  features_furnished
0  ['Heating*', 'Study*', 'Gym', 'Furnished']                   1
1               ['Heating*', 'Study*', 'Gym']                   0
2                ['Furnished*', 'City Views']                   0
3      ['Balcony', 'Dishwasher', 'Furnished']                   1
4                                          []                   0
5                               ['Furnished']                   1
6                             ['Unfurnished']                   0


In [59]:
def contains_phrase(text):
    phrases = ['pw', 'per week', '/w', 'weekly', 'week', 'p.w.', 'p.w']
    pattern = '|'.join(phrases)

    # Search for any of the phrases in the text
    return not bool(re.search(pattern, text, re.IGNORECASE))

def contains_words_excluding_dollar(text):
# Regex pattern to find words that are not preceded by a dollar sign
    pattern = r'\b(?!\$)[a-zA-Z]+\b'
    return bool(re.search(pattern, text))

In [61]:
without_week = test[test['rental_price'].apply(contains_phrase)]
outliers = without_week[without_week['rental_price'].apply(contains_words_excluding_dollar)]

In [62]:
print(outliers['rental_price'].unique())

['$630 Bills and Wifi Included'
 'Furnished, all inclusive modern studios from $749' '$770 Furnished'
 '$150 per month' '$530 Fully Furnished' '$380 Furnished!'
 '$820 Furnished' '$230 per month' '$950 with 6 Mths Only'
 '$220 per month' '$850 **INSPECTION UPON REQUEST**' '$50car park only'
 '$780 **INSPECTION UPON REQUEST**' '$790 **INSPECTION UPON REQUEST**'
 '$950 **INSPECTION UPON REQUEST**' '$870 **INSPECTION UPON REQUEST**'
 '$920 **INSPECTION UPON REQUEST**' '$600 **INSPECTION UPON REQUEST**'
 '$680 **FULLY FURNISHED**' '$550 **INSPECTION UPON REQUEST**'
 '$550 **GREAT CONDITION, CARPARK INCLUDED**'
 '$1250 **INSPECTION UPON REQUEST**'
 '$720 **OPPOSITE TO MELBOURNE UNIVERSITY**'
 '$750 **INSPECTION UPON REQUEST**' '$530 **AVAILABLE NOW**'
 '$80 **CARPARK AVAILABLE**' '$250 per month' '$700 W Unfurnished'
 '$300 per month' '$250/month' '750 Fully furnished'
 '$1200 **INSPECTION UPON REQUEST**' '$410/ Week' '$2,500 - Furnished'
 'Fully Furnished $690' '$260 per month'
 '$650 **IN

In [None]:
## Assumptions

# get suburbs and postcode for address (DONE)
# rental_price per week
# split rooms into bed and bath (DONE)
# parking: list format into just number
# features: furnished, pets
# number of features

# * is unconfirmed --> assume 0
# just 0's and 1's for all hot-encoded columns
# number of features would be len of features list
# check if furnished from description (only look for 'furnished' / 'unfurnished' in description)

# keep house & apartment
# townhouse in house
# new house & land to house
# semi detached --> house
# villa in house
# duplex to house

# studio in apt
# new apartments to apts

# if contain furnished --> flag, unless the str also contains 'extra'

# pw, per week, /w, weekly, a week, p.w, p.w., week, / week, /week, wk, / wk, 
# some just have the number --> assume weekly for these
# p.c.m., per month, /month, pcm, calendar month, pm

#################################################################################################################################################################################

# 5000 max per week

# $630 bills and wifi included
# could contain furnished / 'fully furnished' / 'partly-furnished', 'furnish', 'furnished!', 'fully furn'
# 'furnished option extra pw'

# problems: rent2own, for winter season, for the season, per night (get rid of these)
# (starting) from xx per month
# '$650 ROOM FOR EVERYONE' '$800 1,2 or 3 Month Lease' '$620 ROOM FOR BOAT' '$11,471.00 exc GST' 'From $160 - $170 per month' (take the lowest or highest maybe, or average maybe)
# $786 - $1572

# for calendar month: x 12 / 52? 

# Assume if rent doesn't have like p/w or pcm, then if its less than 5000 --> pw, if greater than 5000 --> per month or per year?
# 95000 per year

# Get rid of all rental prices above 5000

# Get rid of parkings above 12 (12 max)

In [396]:
sample = test[['suburb', 'postcode', 'coordinates']].sample(n=10, random_state=42)
sample.to_csv('../../../sample.csv')