## Important

### Before running this notebook, run `domain_rental_scrape.py` (found under the `scripts` directory) and confirm that `scraped_properties.json` exists in the `/data/landing` directory.

## Import Libraries

In [4]:
import os
import json
import warnings
import regex as re
import pandas as pd

# Ignore warnings
warnings.filterwarnings('ignore')

## Converting json to csv

In [1]:
# Input produced by the scraper and the CSV this cell writes from it
json_file_path = '../../data/landing/scraped_properties.json'
output_path = '../../data/landing/scraped_properties.csv'

# Parse the scraped JSON (a mapping of listing URL -> listing details)
with open(json_file_path, 'r') as f:
    data = json.load(f)

# One flat record per listing: the URL first, then every detail field
flattened_data = [{"url": url, **details} for url, details in data.items()]

# Tabulate the records and write them out without the index column
df = pd.DataFrame(flattened_data)
df.to_csv(output_path, index=False)

## Helper Functions and Variables

In [2]:
# Street-type words (lower case) recognised when parsing addresses
street_types = [
    'street', 'st',
    'drive', 'dr',
    'lane',
    'road', 'rd',
    'avenue', 'ave', 'av',
    'estate',
]
# The same words joined as a regex alternation
street_types_regex = '|'.join(street_types)

# Substrings that signal which payment period a rental price is quoted in
weekly_terms = ['pw', 'week', '/w', 'p.w.', 'p.w', 'p/w', 'wk']
monthly_terms = ['pcm', 'month', 'p.c.m', 'pm', 'p/m']
yearly_terms = ['year', 'annum']

# Collapse the listed property types into two categories
mapping = {
    # Treated as houses
    'House': 'House',
    'Townhouse': 'House',
    'Villa': 'House',
    'New House & Land': 'House',
    'Semi-Detached': 'House',
    'Duplex': 'House',
    'Terrace': 'House',
    # Treated as apartments
    'Apartment / Unit / Flat': 'Apartment',
    'Studio': 'Apartment',
    'New Apartments / Off the Plan': 'Apartment',
}

# Helper functions
def extract_parkings(text):
    '''A function that returns the first integer found in the parking text, or 0 if it has none'''

    digits = re.search(r'\d+', text)
    return int(digits.group()) if digits else 0

def extract_bed_bath(rooms):
    '''A function that returns a two-element Series: [number of bedrooms, number of bathrooms]

    Each entry is None when the corresponding count is not present in the text.
    '''

    def first_count(pattern):
        # The captured group is the number immediately preceding the keyword
        found = re.search(pattern, rooms)
        return int(found.group(1)) if found else None

    return pd.Series([first_count(r"(\d+)\s*Bed"), first_count(r"(\d+)\s*Bath")])

def contains_number(text):
    ''' A function that returns True if the text contains at least 2 consecutive digits'''

    # A single digit is not enough: it would accept entries like "6 month lease available"
    return re.search(r'\d{2,}', text) is not None

def remove_postcode(address):
    '''A function that deletes every standalone four-digit number (the postcode) and trims the result'''

    without_postcode = re.sub(r'\b\d{4}\b', '', address)
    return without_postcode.strip()

def insert_commas(address):
    '''A function that inserts a comma after street types due to inconsistencies in address formats

    A street type only receives a comma when it is followed by whitespace and is
    not itself the first word after an existing comma. Without that second
    condition, a suburb that begins with a street-type word is split apart:
    "10 Acland Street, St Kilda VIC" would become "10 Acland Street, St, Kilda VIC"
    and the suburb would later be read as "Kilda".

    Note: an address with no comma at all before such a suburb
    ("10 Acland St St Kilda VIC") is still split after both words.
    '''

    # The lookbehinds reject a street-type word that directly follows a comma
    # (with or without one whitespace character): that word opens a new address
    # component, so it is part of a name rather than a street suffix.
    pattern = r'(?<!,\s)(?<!,)(\b(?:' + street_types_regex + r')\b)(\s)'

    # Insert a comma between the matched street type and the whitespace after it
    return re.sub(pattern, r'\1,\2', address, flags=re.IGNORECASE)

def extract_suburb(address):
    '''A function that extracts the suburb of an address

    Returns the text between the last separator (a comma or one of the street
    types) and the first standalone 'VIC', or None when the address has no 'VIC'.
    '''

    # Everything up to (not including) the first standalone 'VIC'
    before_vic = re.search(r'(.+?)\bVIC\b', address)
    if not before_vic:
        return None

    # Separators are commas or street types; no flags are passed, so the
    # street types only match in the lower-case spelling held in street_types_regex
    pieces = re.split(f',|\\b(?:{street_types_regex})\\b', before_vic.group(1))

    # The final piece is the suburb
    return pieces[-1].strip()

def contains_furnished_and_unfurnished(text):
    '''A function that returns True when the text mentions both "furnished" and "unfurnished" as whole words'''

    # Word boundaries keep the "furnished" inside "unfurnished" from counting on its own
    required_words = (r'\bfurnished\b', r'\bunfurnished\b')
    return all(re.search(word, text, re.IGNORECASE) for word in required_words)

def is_price_furnished(text):
    '''A function that indicates whether the price text specifies furnishings

    Returns True/False when the text mentions "furnished" or "unfurnished",
    and None when it mentions neither.
    '''

    lowered = text.lower()

    def mentions(word_pattern):
        return re.compile(word_pattern, re.IGNORECASE).search(lowered) is not None

    # "unfurnished" present: furnished only if a furnished option is offered as well
    if mentions(r'\bunfurnished\b'):
        return mentions(r'\bfurnished\b')

    # Only "furnished" present: when it is quoted as an 'extra', the listed price
    # is the unfurnished one, so the property is not treated as furnished
    if mentions(r'\bfurnished\b'):
        return not mentions(r'\bextra\b')

    return None

def is_feature_furnished(features_str):
    '''A function that determines if the features indicate whether a property is furnished

    Returns None for an empty feature list ("[]"); otherwise True when the
    whole word "Furnished" appears without a following asterisk.
    '''

    if features_str.strip() == "[]":
        return None

    # The negative lookahead skips "Furnished" entries that carry an asterisk
    found = re.search(r'\bFurnished\b(?!\s*\*)', features_str, re.IGNORECASE)
    return found is not None

def is_desc_furnished(desc):
    '''A function that determines whether a property is furnished from the description

    Returns False when the description says it is not furnished, or mentions
    furnishing alongside 'option' / 'extra'; True for any other mention of
    'furnished'; None when furnishing is not mentioned at all.
    '''

    lowered = desc.lower()

    # An explicit negative takes priority over everything else
    if any(marker in lowered for marker in ('not furn', 'unfurn')):
        return False

    if 'furnished' not in lowered:
        return None

    return not ('option' in lowered or 'extra' in lowered)

def ensure_space_after_number(text):
    '''A function that puts a space between a number and a letter that directly follows it'''

    # \1 is the run of digits, \2 the letter that immediately follows it
    return re.sub(r'(\d+)([a-zA-Z])', r'\1 \2', text)

def find_with_dollar(price):
    '''A function that returns the numerical values that come after a dollar sign

    Returns a list of floats, or None when no dollar amount is found.
    '''

    amounts = []
    for token in re.findall(r'\$\s*\d+(?:,\d{3})*(?:\.\d+)?', price):
        # Strip the dollar sign, thousands separators and any padding
        digits = token.replace('$', '').replace(',', '').strip()
        amounts.append(float(digits))

    return amounts or None

def extract_numbers(text):
    '''A function that returns all numerical values from the text, excluding phone numbers and dates

    Returns a list of floats, or None when no number remains.
    '''

    # Spans that look like phone numbers or slash-separated dates
    phone_pattern = r'\b\d{3,4}[\s/-]\d{3,4}[\s/-]\d{3,4}\b|\b\d{10,11}\b'
    date_pattern = r'\b\d{1,2}[//]\d{1,2}[//]\d{2,4}\b'

    # Blank those spans out, phone numbers first
    cleaned = text
    for noise in (phone_pattern, date_pattern):
        cleaned = re.sub(noise, '', cleaned)

    # Any remaining number (any length), with optional thousands separators and decimals
    tokens = re.findall(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b', cleaned)
    values = [float(token.replace(',', '')) for token in tokens]

    return values or None

def normalise_rent(rent_list):
    '''A function that collapses a list of more than two extracted numbers into a single average

    Only values above 100 are averaged. Lists with at most two numbers, or with
    no value above 100, are returned unchanged.
    '''

    if len(rent_list) <= 2:
        return rent_list

    plausible = [value for value in rent_list if value > 100]
    if not plausible:
        return rent_list

    return [sum(plausible) / len(plausible)]

def remove_bond_from_rental_price(row):
    '''A function that removes instances of the bond from the rental price column

    Reads row['bond'] and row['rental_price'] and returns the rental price as a
    string with every occurrence of the bond amount removed.
    '''

    bond_value = row['bond']
    rental_price = str(row['rental_price'])

    # Nothing to strip when the bond is missing
    if pd.isna(bond_value):
        return rental_price

    whole_bond = int(bond_value)

    # The bond may be written with or without thousands separators,
    # optionally followed by a decimal part
    bond_patterns = (
        fr'\b{whole_bond:,}(\.\d+)?\b',
        fr'\b{whole_bond}(\.\d+)?\b',
    )
    for bond_pattern in bond_patterns:
        rental_price = re.sub(bond_pattern, '', rental_price)

    # Collapse the whitespace left behind by the removals
    return re.sub(r'\s+', ' ', rental_price).strip()


def calculate_final_rent(row):
    '''A function that returns the final (weekly) rental price for a property

    Reads row['rent_nums'] together with the period flags on the row and
    returns a single number, or None when no usable rent can be derived.
    '''

    rents = row['rent_nums']

    # Both furnished and unfurnished prices quoted: keep the higher one
    if row['contains_both_furnished_options']:
        return max(rents)

    # Yearly figure -> weekly
    if row['is_yearly']:
        return rents[0] * 7 / 365

    count = len(rents)

    # No payment period stated
    if row['not_indicated']:
        if count > 1:
            return sum(rents) / count
        if count == 1:
            # A lone figure above $5000 is assumed not to be a weekly rent
            return None if rents[0] > 5000 else rents[0]
        return None

    # Monthly only: convert to a weekly rate (12 months * 7 days / 365 days)
    if row['is_monthly'] and not row['is_weekly']:
        if count > 1:
            return sum(rents) / count * 84 / 365
        if count == 1:
            # A lone figure below $500 is assumed not to be a monthly rent
            return None if rents[0] < 500 else rents[0] * 84 / 365
        return None

    # Weekly only: average when several prices are quoted
    if row['is_weekly'] and not row['is_monthly']:
        if count > 1:
            return sum(rents) / count
        if count == 1:
            return rents[0]
        return None

    # Weekly and monthly both quoted: the smallest figure is the weekly rate
    if row['is_both']:
        return min(rents)

    return None

def summarise(df):
    '''A function that summarises the properties, grouping them by suburb

    Returns one row per suburb with the median rent, median bathrooms and
    parking spaces (rounded to integers), counts of furnished / unfurnished
    and pets-allowed / pets-not-allowed properties, and the number of properties.
    '''

    def count_true(flags):
        return (flags == True).sum()

    def count_false(flags):
        return (flags == False).sum()

    # Output column -> (source column, aggregation)
    aggregations = {
        'median_rent': ('weekly_rent', 'median'),
        'median_bath': ('num_bath', 'median'),
        'median_parkings': ('num_parkings', 'median'),
        'furnished_count': ('furnished', count_true),
        'unfurnished_count': ('furnished', count_false),
        'pets_allowed': ('pets_allowed', count_true),
        'pets_not_allowed': ('pets_allowed', count_false),
        'num_properties': ('suburb', 'size'),
    }
    final = df.groupby('suburb').agg(**aggregations).reset_index()

    # Medians can land on .5, so round them to whole baths / parking spaces
    for column in ('median_bath', 'median_parkings'):
        final[column] = final[column].round().astype(int)

    return final

## Import Libraries & Read in Data

In [5]:
# Load the flattened listings written by the conversion step (14073 entries)
landing_csv_path = '../../data/landing/scraped_properties.csv'
properties = pd.read_csv(landing_csv_path)

## Extract Postcode & Suburb

In [6]:
# Drop url and availability columns (irrelevant to analysis)
properties = properties.drop(columns=['url', 'availability'])

# Drop properties not in Victoria (3 properties were in NSW).
# 'VIC' must appear as a standalone word, which is the same rule extract_suburb
# applies; na=False drops rows with a missing address instead of failing on the mask.
# The explicit copy makes the column assignments below operate on an independent frame.
is_victorian = properties['address'].str.contains(r'\bVIC\b', na=False)
properties = properties[is_victorian].copy()

# Extract the postcode of the address (the four digits that end it, ignoring trailing whitespace)
properties['postcode'] = properties['address'].str.extract(r'(\d{4})\s*$', expand=False)

# Insert commas after street names to ensure consistency
properties['address'] = properties['address'].apply(insert_commas)

# Remove all instances of the postcode from the address
properties['address'] = properties['address'].apply(remove_postcode)

# Extract the suburb of the address
properties['suburb'] = properties['address'].apply(extract_suburb)

# Get rid of address as we've extracted the main information
properties = properties.drop(columns='address')

## Extract Number of Beds, Baths & Parkings

In [7]:
# Keep only listings whose price text contains a number (~139 removed)
has_price = properties['rental_price'].apply(contains_number)
properties = properties[has_price]

# Collapse property types to House / Apartment; types missing from the mapping
# become NaN and are dropped (~37)
properties['property_type'] = properties['property_type'].map(mapping)
properties = properties[properties['property_type'].notna()]

# Drop listings whose description mentions shipping containers (~4)
mentions_shipping = properties['desc'].str.contains('shipping', case=False)
properties = properties[~mentions_shipping]

# Drop listings with an empty rooms list (~23)
properties = properties[properties['rooms'] != '[]']

# Split the rooms text into bedroom and bathroom counts
properties[['num_bed', 'num_bath']] = properties['rooms'].apply(extract_bed_bath)

# Drop listings reporting 0 bedrooms or 0 bathrooms (~131)
properties = properties[(properties['num_bed'] != 0) & (properties['num_bath'] != 0)]

# The rooms text is no longer needed
properties = properties.drop(columns='rooms')

# Replace the parking text with a parking-space count
properties['num_parkings'] = properties['parking'].apply(extract_parkings)
properties = properties.drop(columns='parking')

# Drop listings with more than 12 parking spaces (~2)
properties = properties[properties['num_parkings'] <= 12]

## Extract Furnished & Pets Allowed Features

In [8]:
# Separate numbers from letters that directly follow them (e.g. "500pw" -> "500 pw")
properties['rental_price'] = properties['rental_price'].apply(ensure_space_after_number)

# Flag prices that quote both a furnished and an unfurnished option
properties['contains_both_furnished_options'] = properties['rental_price'].apply(contains_furnished_and_unfurnished)

# Judge "furnished" independently from the price, the features and the description
furnished_sources = {
    'price_furnished': ('rental_price', is_price_furnished),
    'features_furnished': ('features', is_feature_furnished),
    'desc_furnished': ('desc', is_desc_furnished),
}
for flag_column, (source_column, detector) in furnished_sources.items():
    properties[flag_column] = properties[source_column].apply(detector)

# A property counts as furnished when any of the three sources says so
furnished_flags = ['price_furnished', 'features_furnished', 'desc_furnished']
properties['furnished'] = properties[furnished_flags].any(axis=1)

# Pets are allowed when the features list "Pets Allowed" without a trailing asterisk
properties['pets_allowed'] = properties['features'].apply(
    lambda x: bool(re.search(r'\bPets Allowed\b(?!\*)', str(x)))
)

# Drop the text columns and the intermediate flags
properties = properties.drop(columns=['desc', 'features'] + furnished_flags)

## Extract Weekly Rent

In [9]:
# Remove dollar signs and commas from bond
properties['bond'] = properties['bond'].str.replace('$', '', regex=False)
properties['bond'] = properties['bond'].str.replace(',', '', regex=False)

# Create columns to specify whether rent is specified by week, month, both, or none (or year)
# NOTE(review): the terms are joined into a regex unescaped, so the '.' in
# 'p.w.' / 'p.c.m' matches any character - confirm whether that is intended.
properties['is_weekly'] = properties['rental_price'].str.contains('|'.join(weekly_terms), case=False)
properties['is_monthly'] = properties['rental_price'].str.contains('|'.join(monthly_terms), case=False)
properties['is_yearly'] = properties['rental_price'].str.contains('|'.join(yearly_terms), case=False)
properties['is_both'] = properties['is_weekly'] & properties['is_monthly']
properties['not_indicated'] = ~(properties['is_weekly'] | properties['is_monthly'] | properties['is_both'])

# Remove instances of the bond from the rental price
properties['rental_price'] = properties.apply(remove_bond_from_rental_price, axis=1)

# Extract rental price, separating by whether the price contains the dollar sign
# Without the dollar sign, we have to consider other numbers e.g. phone numbers and dates
has_dollar = properties['rental_price'].str.contains(r'\$', na=False)

# Work on explicit copies so that adding 'rent_nums' is not a chained assignment
# on a slice of `properties`
with_dollar = properties[has_dollar].copy()
with_dollar['rent_nums'] = with_dollar['rental_price'].apply(find_with_dollar)

without_dollar = properties[~has_dollar].copy()
without_dollar['rent_nums'] = without_dollar['rental_price'].apply(extract_numbers)

# Concatenate back into one dataframe once rent is extracted
properties = pd.concat([with_dollar, without_dollar], ignore_index=True)

# Remove properties whose rent is not available (1) --> 13,681
properties = properties[~properties['rent_nums'].isna()]

# Normalise rent if we extracted more than 2 numbers
properties['rent_nums'] = properties['rent_nums'].apply(normalise_rent)

# Find weekly rent
properties['weekly_rent'] = properties.apply(calculate_final_rent, axis=1)

# Remove NA values (rent too low / too high)
properties = properties[~properties['weekly_rent'].isna()]

# Remove weekly rent less than 100 (most likely to be carparks)
properties = properties[properties['weekly_rent'] > 100]

## Clean Up

In [11]:
# Normalise suburb names to lower case
properties['suburb'] = properties['suburb'].str.lower()

# Intermediate columns that were only needed to derive the weekly rent
helper_columns = [
    'rental_price',
    'contains_both_furnished_options',
    'is_weekly',
    'is_monthly',
    'is_yearly',
    'is_both',
    'not_indicated',
    'rent_nums',
]
properties = properties.drop(columns=helper_columns)

# Final column order
final_columns = [
    'suburb', 'postcode', 'property_type', 'weekly_rent', 'bond',
    'num_bed', 'num_bath', 'num_parkings', 'furnished', 'pets_allowed',
    'coordinates',
]
properties = properties[final_columns]

# Consistent schema: integer postcode, nullable-integer bond
properties['postcode'] = properties['postcode'].astype('int')
properties['bond'] = properties['bond'].astype('Int64')

# Final shape: 13,620 rows
# Write the cleaned listings out as csv
properties.to_csv('../../data/raw/properties.csv', index=False)

## Aggregate by Suburb

In [12]:
properties = pd.read_csv('../../data/raw/properties.csv')


def _subset(num_bed, property_type):
    '''Rows of `properties` with the given number of bedrooms and property type'''
    matches = (properties['num_bed'] == num_bed) & (properties['property_type'] == property_type)
    return properties[matches]


# Apartments with 1-3 bedrooms
one_bed_flat = _subset(1, 'Apartment')
two_bed_flat = _subset(2, 'Apartment')
three_bed_flat = _subset(3, 'Apartment')

# Houses with 2-4 bedrooms
two_bed_house = _subset(2, 'House')
three_bed_house = _subset(3, 'House')
four_bed_house = _subset(4, 'House')

# Every selected listing in one frame
all_properties = pd.concat(
    [one_bed_flat, two_bed_flat, three_bed_flat, two_bed_house, three_bed_house, four_bed_house],
    ignore_index=True,
)

# Replace each listing-level frame with its per-suburb summary
one_bed_flat, two_bed_flat, three_bed_flat = (
    summarise(frame) for frame in (one_bed_flat, two_bed_flat, three_bed_flat)
)
two_bed_house, three_bed_house, four_bed_house = (
    summarise(frame) for frame in (two_bed_house, three_bed_house, four_bed_house)
)
all_properties = summarise(all_properties)

In [13]:
# Save to csvs
def save_domain_data():
    '''A function that writes each per-suburb summary to its own CSV under data/curated/domain

    Reads the module-level summary dataframes (one_bed_flat ... all_properties)
    and creates the output directory if it does not exist yet.
    '''

    # Define the base path
    base_path = '../../data/curated/domain'

    # Create the directory if needed; exist_ok avoids the gap between a
    # separate existence check and the creation
    os.makedirs(base_path, exist_ok=True)

    # Output file name -> dataframe to write
    outputs = {
        'domain_one_bed_flat_rent.csv': one_bed_flat,
        'domain_two_bed_flat_rent.csv': two_bed_flat,
        'domain_three_bed_flat_rent.csv': three_bed_flat,
        'domain_two_bed_house_rent.csv': two_bed_house,
        'domain_three_bed_house_rent.csv': three_bed_house,
        'domain_four_bed_house_rent.csv': four_bed_house,
        'domain_all_properties_rent.csv': all_properties,
    }

    # Save each dataframe to a CSV file
    for filename, frame in outputs.items():
        frame.to_csv(os.path.join(base_path, filename), index=False)

# Call the function
save_domain_data()