## Helpers

In [11]:
# Helper variables
street_types = ['street', 'st', 'drive', 'dr', 'lane', 'road', 'rd', 'avenue', 'ave', 'av', 'estate']
street_types_regex = '|'.join(street_types)

weekly_terms = ['pw', 'week', '/w', 'p.w.', 'p.w', 'p/w', 'wk']
monthly_terms = ['pcm', 'month', 'p.c.m', 'pm', 'p/m']

mapping = {
    'House': 'House',
    'Apartment / Unit / Flat': 'Apartment',
    'Townhouse': 'House',
    'Studio': 'Apartment', 
    'Villa': 'House',
    'New House & Land': 'House',
    'New Apartments / Off the Plan': 'Apartment',
    'Semi-Detached': 'House',
    'Duplex': 'House',
    'Terrace': 'House',
    'Block of Units': 'Apartment'} ## double check whether apartment or house

# Helper functions

def extract_parkings(text):
    ''' A function that returns the number of parking spaces'''
    match = re.search(r'\d+', text)
    if match:
        return int(match.group())
    else:
        return 0

def extract_bed_bath(rooms):
    ''' A function that extracts the number of bedrooms and bathrooms'''
    bed_match = re.search(r"(\d+)\s*Bed", rooms)
    bath_match = re.search(r"(\d+)\s*Bath", rooms)

    bed = int(bed_match.group(1)) if bed_match else None
    bath = int(bath_match.group(1)) if bath_match else None
    
    return pd.Series([bed, bath])

def contains_number(text):
    ''' A function that returns True if the text contains at least 2 digits'''
    # 2 digits to avoid entries like "6 month lease available"
    return bool(re.search(r'\d{2,}', text))

def remove_postcode(address):
    '''A function that removes all instances of the postcode'''
    return re.sub(r'\b\d{4}\b', '', address).strip()

def insert_commas(address):
    '''A function that inserts a comma after street types due to inconsistencies in address formats'''
    
    # Create a regex pattern to match any of the street types followed by a space (to check for the next word)
    pattern = r'(\b(?:' + street_types_regex + r')\b)(\s)'
    
    # Insert a comma after the matched street type
    return re.sub(pattern, r'\1,\2', address, flags=re.IGNORECASE)

def extract_suburb(address):
    '''A function that extracts the suburb of an address'''

    # Extract the part before 'VIC'
    vic_part = re.search(r'(.+?)\bVIC\b', address)

    if vic_part:
        # Get everything before VIC and split by commas or street types
        pre_vic = vic_part.group(1)

        # Split by street types or commas
        parts = re.split(f',|\\b(?:{street_types_regex})\\b', pre_vic)

        # Take the last part, which would be just the suburb
        return parts[-1].strip()
        
    return None

def contains_furnished_and_unfurnished(text):
    '''A function that returns True when the rental price specifies both furnished and unfurnished options'''

    furnished = re.search(r'\bfurnished\b', text, re.IGNORECASE)
    unfurnished = re.search(r'\bunfurnished\b', text, re.IGNORECASE)

    return bool(furnished and unfurnished)

def is_price_furnished(text):
    '''A function that returns indicates whether the price specficies furnishings'''
    text = text.lower()
    furnished_pattern = re.compile(r'\bfurnished\b', re.IGNORECASE)
    unfurnished_pattern = re.compile(r'\bunfurnished\b', re.IGNORECASE)
    extra_pattern = re.compile(r'\bextra\b', re.IGNORECASE)

    if unfurnished_pattern.search(text):
        if furnished_pattern.search(text):
            return True
        return False
    if furnished_pattern.search(text):
        if extra_pattern.search(text):
            return False
        return True
        
    return None

# We assume that if there's an asterisk that comes after furnished, then it's unfurnished due to the unverifiability 
def is_feature_furnished(features_str):
    '''A function that determines if the features indicate whether a property is furnished'''
    pattern = r'\bFurnished\b(?!\s*\*)'
    if features_str.strip() == "[]":
        return None
    return bool(re.search(pattern, features_str, re.IGNORECASE))

def is_desc_furnished(desc):
    '''A function that determines whether a property is furnished from the descriptions'''
    desc = desc.lower()
    if 'not furn' in desc or 'unfurn' in desc:
        return False
    elif 'furnished' in desc:
        if 'option' in desc or 'extra' in desc:
            return False
        return True
    else:
        return None


def ensure_space_after_number(text):
    # Define the pattern to find numbers followed immediately by alphabet characters
    pattern = r'(\d+)([a-zA-Z])'
    
    # Replace matches with the number followed by a space and then the alphabet character
    result = re.sub(pattern, r'\1 \2', text)
    
    return result

def find_with_dollar(price):

    pattern = r'\$\s*\d+(?:,\d{3})*(?:\.\d+)?'

    matches = re.findall(pattern, price)

    numerical_values = [float(match.replace('$', '').replace(',', '').strip()) for match in matches]

    # Return the list of numerical values if any matches are found; otherwise, return None
    return numerical_values if numerical_values else None

def extract_numbers(text):
    # Define patterns to exclude phone numbers and dates
    phone_pattern = r'\b\d{3,4}[\s/-]\d{3,4}[\s/-]\d{3,4}\b|\b\d{10,11}\b'
    date_pattern = r'\b\d{1,2}[//]\d{1,2}[//]\d{2,4}\b'
    
    # Remove phone numbers and dates from the text
    text_cleaned = re.sub(phone_pattern, '', text)
    text_cleaned = re.sub(date_pattern, '', text_cleaned)
    
    # Pattern to match numbers of at least 3 digits, including commas and decimals
    pattern = r'\b\d+(?:,\d{3})*(?:\.\d+)?\b'
    
    # Find all matches in the cleaned text
    matches = re.findall(pattern, text_cleaned)
    
    # Convert matches to float after cleaning commas
    numerical_values = [float(match.replace(',', '')) for match in matches]
    
    # Handle cases where text is too short or does not fit the expected pattern
    if numerical_values:
        return numerical_values
    else:
        return None

def normalise_rent(rent_list):
    if len(rent_list) > 2:
        filtered_values = [x for x in rent_list if x > 100]
        if filtered_values:
            return [sum(filtered_values) / len(filtered_values)]
    return rent_list

def convert_to_num(rent_list):
    if len(rent_list) == 1:
        return int(rent_list[0])
    else:
        return rent_list

def avg(lst): 
    return sum(lst) / len(lst) 

## Start

In [12]:
import warnings
import regex as re
import pandas as pd

# Suppress all warnings
warnings.filterwarnings('ignore')

# Read in the csv file (14073 entries)
properties = pd.read_csv('../data/landing/scraped_properties.csv')

# Drop url and availability columns (irrelevant to analysis)
properties = properties.drop(columns=['url', 'availability'])

# Drop properties not in Victoria (3 properties were in NSW)
properties = properties[properties['address'].str.contains('VIC')]

# Extract the postcode of the adress
properties['postcode'] = properties['address'].str.extract(r'(\d{4})$')

# Insert commas after street names to ensure consistency
properties['address'] = properties['address'].apply(insert_commas)

# Remove all instances of the postcode from the address
properties['address'] = properties['address'].apply(remove_postcode)

# Extract the suburb of the address
properties['suburb'] = properties['address'].apply(extract_suburb)

# Get rid of address as we've extracted the main information
properties = properties.drop('address', axis = 1)

# Get rid of properties that do not specify rental price (~139)
properties = properties[properties['rental_price'].apply(contains_number)]

# Only keep apartments and houses (~37)
properties['property_type'] = properties['property_type'].map(mapping)
properties = properties[~properties['property_type'].isna()]

# Remove properties that are shipping containers (~4)
properties = properties[~properties['desc'].str.contains('shipping', case=False)]

# Remove properties with 0 information on number of rooms (~23)
properties = properties[properties['rooms'] != '[]']

# Extract the number of bedrooms and bathrooms per property
properties[['num_bed', 'num_bath']] = properties['rooms'].apply(extract_bed_bath)

# Remove properties with 0 bedrooms or 0 bathrooms (~131)
properties = properties[properties['num_bed'] != 0]
properties = properties[properties['num_bath'] != 0]

# Get rid of the rooms feature
properties = properties.drop('rooms', axis = 1)

# Extract the number of parking spaces and remove the original column
properties['num_parkings'] = properties['parking'].apply(extract_parkings)
properties = properties.drop('parking', axis = 1)

# Get rid of properties with more than 12 parking spaces (~2)
properties = properties[properties['num_parkings'] <= 12]

properties['rental_price'] = properties['rental_price'].apply(ensure_space_after_number)

# Copy for testing (13734) --> 13682 (i think from removing places with only 1 digit numbers)
test = properties.copy()

In [13]:
# Find if the property offers both furnished and unfurnished options
test['contains_both_furnished_options'] = test['rental_price'].apply(contains_furnished_and_unfurnished)

# Determine if a property is furnished from the price column
test['price_furnished'] = test['rental_price'].apply(is_price_furnished)

# Determine if a property is furnished from the features column
test['features_furnished'] = test['features'].apply(is_feature_furnished)

# Determine if a property is furnished from the description column
test['desc_furnished'] = test['desc'].apply(is_desc_furnished)

# If any of the price, features or description explicitly mention furnished, we assume furnished
test['furnished'] = test[['price_furnished', 'features_furnished', 'desc_furnished']].any(axis=1)

# Create a new column that specifies whether pets are allowed
test['pets_allowed'] = test['features'].apply(lambda x: True if re.search(r'\bPets Allowed\b(?!\*)', str(x)) else False)

# Drop unused columns
test = test.drop(columns=['desc', 'features', 'price_furnished', 'features_furnished', 'desc_furnished'])

############################################################################################################################################################################

# Remove dollar signs and commas from both bond
test['bond'] = test['bond'].str.replace('$', '', regex=False)
test['bond'] = test['bond'].str.replace(',', '', regex=False)

# Create columns to specify whether rent is specified by week, month or both
test['is_weekly'] = test['rental_price'].str.contains('|'.join(weekly_terms), case=False)
test['is_monthly'] = test['rental_price'].str.contains('|'.join(monthly_terms), case=False)
test['is_both'] = test['is_weekly'] & test['is_monthly']
test['not_indicated'] = ~(test['is_weekly'] | test['is_monthly'] | test['is_both'])

In [14]:
# Extract rental price, separating by whether the price contains the dollar sign
# Without the dollar sign, we have to consider other numbers e.g. phone numbers and dates

with_dollar = test[test['rental_price'].str.contains(r'\$', na=False)]
without_dollar = test[~test['rental_price'].str.contains(r'\$', na=False)]

with_dollar['rent_nums'] = with_dollar['rental_price'].apply(find_with_dollar)
without_dollar['rent_nums'] = without_dollar['rental_price'].apply(extract_numbers)

# Concatenate back into one dataframe once rent is extracted
test = pd.concat([with_dollar, without_dollar], ignore_index=True)

# Remove properties whose rent is not available
test = test[~test['rent_nums'].isna()]

# Normalise rent if we extracted more than 2 rental prices
test['rent_nums'] = test['rent_nums'].apply(normalise_rent)

# For properties with only one specified rent price, convert to integer
# test['rent'] = test['rent'].apply(convert_to_num)

In [18]:
# Month only
#test[(test['is_monthly'] == True) & (test['is_weekly'] == False) & (test['rent_nums'].apply(lambda x: len(x) > 1))]

# Weekly only
#test[(test['is_monthly'] == False) & (test['is_weekly'] == True) & (test['rent_nums'].apply(lambda x: len(x) > 1))]

# Both
# test[(test['is_both'] == True) & (test['rent_nums'].apply(lambda x: len(x) == 1))]

In [None]:
test[(test['not_indicated'] == True) & (test['rent_nums'].apply(lambda x: len(x) > 1))]

In [None]:
test[test['not_indicated'] == True]

In [None]:
# there's an entry that's 800 but considered monthly due to "1,2,3 month lease"
# i think we can just get rid of this one property LOL idk how else 

# if contains_both_furnished_options = True, take the maximum of rent_nums & output to final_rent (new column)

# remove contains_both_furnished options

# if not_indicated == True 
    # if len(rent_nums) > 1, we take the average
        # issue with one property: $600/$2607 
    # if len == 1
        # if rent_nums[0] > 5000 (we assume per week), we remove
        # else output to final_rent
        # if > 5000, we remove

# if monthly = True & weekly = False
    # if length rent_nums > 1, 
    #   we take the max of rent_nums , x 52 / 12, and output to final_rent
    # else if len == 1
        # if rent_nums[0] < 500, remove (most likly to be car park / storage)
        # else x 12 / 52 and output to final_rent

# if weekly = True & monthly = False
    # if len rent_nums > 1, 
    #   we take the minimum of rent_nums and output to final_rent (bc some rental prices include bond UGHH)
    # else we output to final_rent

    # if less than 125, we remove

# if is_both = True,
    # we take the minimum of rent_nums and output to final_rent


# if len rent_nums == 2
    # if is_both == False:
        # take average of rent_nums and output to final_rent
    # if is_both == True:
        # take minimum




#

In [23]:
def calculate_final_rent(row):

    # Case when contains_both_furnished_options is True
    if row['contains_both_furnished_options']:
        return max(row['rent_nums'])

    # Case when not_indicated is True
    if row['not_indicated']:
        if len(row['rent_nums']) > 1:
            return sum(row['rent_nums']) / len(row['rent_nums'])
        elif len(row['rent_nums']) == 1:
            if row['rent_nums'][0] > 5000:
                return None  # This entry is removed
            return row['rent_nums'][0]
        return None

    # Case when is_monthly is True and is_weekly is False
    if row['is_monthly'] and not row['is_weekly']:
        if len(row['rent_nums']) > 1:
            return max(row['rent_nums']) * 52 / 12
        elif len(row['rent_nums']) == 1:
            if row['rent_nums'][0] < 500:
                return None  # This entry is removed
            return row['rent_nums'][0] * 12 / 52
        return None

    # Case when is_weekly is True and is_monthly is False
    if row['is_weekly'] and not row['is_monthly']:
        if len(row['rent_nums']) > 1:
            return min(row['rent_nums'])
        elif len(row['rent_nums']) == 1:
            return row['rent_nums'][0]
        return None

    # Case when is_both is True
    if row['is_both']:
        return min(row['rent_nums'])

    # If none of the above conditions are met, return None
    return None

In [None]:
test['final_rent'] = test.apply(calculate_final_rent, axis=1)
test

In [None]:
sample = test.sample(n=20, random_state=123123)
sample

In [None]:
# $65 PER WEEK | $282 P.C.M	Tired of looking for parking?
# should we get rid of rents less than 100

In [None]:
## None of these are furnished
# 610 per week until 03/05/2025 then 640 per week
# $650 pw till end Nov then $700 pw
# $539.90 pw $2346.00 pcm to 822025 then rent in...
# $550 PW FOR 3 MTHS THEN $570 PW

# if its furnished, i'm getting the max price - if not i'm getting the mean? 
# 

test[test['rental_price'].str.contains('THEN', case=False)]

In [None]:
# Find the dollar sign numericals
# test['rental_numerical'] = test['rental_price'].str.findall(r'\$\d+(?:,\d{3})*(?:\.\d+)?')

# Function to find standalone numbers
def find_standalone_numbers(text):
    # Regex pattern to match standalone numbers
    pattern = r'(?<!\S)\d+(?:,\d{3})*(?:\.\d+)?(?!\S)'
    
    # Find all standalone number matches
    matches = re.findall(pattern, text)
    
    # Convert matches to float
    standalone_numbers = [float(num.replace(',', '')) for num in matches]
    
    return standalone_numbers if standalone_numbers else None

In [None]:
# if none & if less than 5000, assume weekly

    # remove greater than 5000

    # if pcm, x 12 / 52

    # if weekly, just take the number

    # if both, take the minimum

# find numerical values (if 1 and less than 5000, assume pw)

# if month, keep just the number

# if contains both furnished options, take the max

# If not_indicated = True & less than 5000, then we assume weekly

# remove $ from bond if not nan

# make suburb lower case
# test['suburb'] = test['suburb'].str.lower()

# reorder:  property_type, suburb, postcode, weekly_rent, bond, num_bed, num_bath, num_parkings, furnished, pets_allowed

In [None]:
test['rental_numerical'] = test['rental_price'].str.findall(r'\d+\.?\d*')
test[test['rental_numerical'].apply(lambda x: len(x) > 1 if x else False)]['rental_price'].unique()

In [101]:
def extract_numbers(price):
    numbers = re.findall(r'\d{1,4}(?:,\d{3})*(?:\.\d+)?', price)

    return [float(num.replace(',', '')) for num in numbers]

# Create the new column 'rental_numerical'
test['rental_numerical'] = test['rental_price'].apply(extract_numbers)

In [None]:
test[test['rental_price'].str.contains('650')]['rental_price'].unique()

In [None]:
test[test['rental_numerical'].apply(lambda x: len(x) > 1 if x else False)]['rental_price'].unique()

In [None]:
def contains_phrase(text):
    phrases = 
    pattern = '|'.join(phrases)

    # Search for any of the phrases in the text
    return not bool(re.search(pattern, text, re.IGNORECASE))

def contains_words_excluding_dollar(text):
# Regex pattern to find words that are not preceded by a dollar sign
    pattern = r'\b(?!\$)[a-zA-Z]+\b'
    return bool(re.search(pattern, text))

In [11]:
print(list(test[test['rental_price'].str.contains('w', case=False)]['rental_price'].unique()))

['$530 pw', '$500 per week', '$470 pw', '$400 per week', '$520 per week', '$475 pw', '$675 per week', '$585 weekly', '$550 pw', '$525 weekly', '$440 pw', '$650 per week furnished', '$450 per week', '$800 pw', '$650 pw', '$700 weekly', '$670 per week', '$490 pw', '$525 per week', '$680 per week', '$480 pw', '$575.00 pw', '$525 pw', '$1900 per week', '$625 per week', '$460 pw', '$750 pw', '$575 pw', '$700 per week', '$500 pw', '$2200 per week', '$550 per week', '$585 per week', '$580 Per Week Includes Bills and WIFI', '$630 Bills and Wifi Included', '$690 per week', '$400 Per Week', '$520 pw', '$690.00 pw', '$1,995 pw', '$580 Per Week', '$590 PW Fully Furnished, All Bills WiFi Included', '$520 Per Week Includes Bills, WIFI, GYM, Coworking Space, etc...', '$475 weekly', '$540 per week', '$475 per week', '$1,200 weekly', '$65 PER WEEK | $282 P.C.M', '$800/w PARTLY-FURNSIHED', '$450 pw', '$850 per week', '$430 Per Week', '$750pw', '$700 Per Week', '$680 pw', '$620 pw', '$590 per week', '$62

In [None]:
['pw', 'per week', '/w', 'weekly', 'week', 'p.w.', 'p.w', 'p/w', '/week', 'a week']

# could contain both pw and pcm

In [16]:
import pandas as pd
import numpy as np

# Sample DataFrame
df = pd.DataFrame({
    'rental_price': ['$1500 and $500', 'The price is $200', 'Rent is $600, bond $150'],
    'bond': [500, 200, np.nan]
})

def remove_bond_from_rental_price(row):
    # Convert bond to string, handling NaN
    bond_value = str(row['bond']) if pd.notna(row['bond']) else None
    rental_price = str(row['rental_price'])
    
    if bond_value:
        # Remove occurrences of the bond value from rental_price
        updated_rental_price = rental_price.replace(bond_value, '').strip()
        
        # Remove any extra spaces or separators
        updated_rental_price = ' '.join(updated_rental_price.split())
        
        return updated_rental_price
    else:
        # If bond is NaN, return the rental_price unchanged
        return rental_price

# Apply the function row-wise
df['rental_price'] = df.apply(remove_bond_from_rental_price, axis=1)

print(df)


   bond          rental_price
0   500              $1 and $
1   200        The price is $
2   150  Rent is $600, bond $


In [20]:
# Extract the numerical value of the rental price 
test['rental_price_numbers'] = test['rental_price'].str.extract(r'(\d+(?:,\d{3})*(?:\.\d+)?)')
test['rental_price_numbers'] = test['rental_price_numbers'].str.replace(',', '')
test['rental_price_numbers'] = test['rental_price_numbers'].astype(float)

test[test['rental_price_numbers'].isna()]

Unnamed: 0,rental_price,desc,bond,property_type,coordinates,postcode,suburb,num_bed,num_bath,num_parkings,contains_both_furnished_options,furnished,pets_allowed,is_weekly,is_monthly,is_both,not_indicated,rental_price_numbers


In [None]:
## Assumptions

# Rental price only have numbers --> assume weekly for these

# get suburbs and postcode for address (DONE)
# rental_price per week
# split rooms into bed and bath (DONE)
# parking: list format into just number (DONE)
# features: furnished, pets (DONE)

# * is unconfirmed --> assume 0
# just 0's and 1's for all hot-encoded columns
# number of features would be len of features list
# check if furnished from description (only look for 'furnished' / 'unfurnished' in description)

# keep house & apartment
# townhouse in house
# new house & land to house
# semi detached --> house
# villa in house
# duplex to house

# studio in apt
# new apartments to apts

# if contain furnished --> flag, unless the str also contains 'extra'

# pw, per week, /w, weekly, a week, p.w, p.w., week, / week, /week, wk, / wk, 
# some just have the number --> assume weekly for these
# p.c.m., per month, /month, pcm, calendar month, pm

#################################################################################################################################################################################

# 5000 max per week

# $630 bills and wifi included
# could contain furnished / 'fully furnished' / 'partly-furnished', 'furnish', 'furnished!', 'fully furn'
# 'furnished option extra pw'

# problems: rent2own, for winter season, for the season, per night (get rid of these)
# (starting) from xx per month
# '$650 ROOM FOR EVERYONE' '$800 1,2 or 3 Month Lease' '$620 ROOM FOR BOAT' '$11,471.00 exc GST' 'From $160 - $170 per month' (take the lowest or highest maybe, or average maybe)
# $786 - $1572

# for calendar month: x 12 / 52? 

# Assume if rent doesn't have like p/w or pcm, then if its less than 5000 --> pw, if greater than 5000 --> per month or per year?
# 95000 per year

# Get rid of all rental prices above 5000

# Get rid of parkings above 12 (12 max)

In [17]:
sample = test[['suburb', 'postcode', 'coordinates']].sample(n=20, random_state=42)
sample.to_csv('../../../sample.csv')

# Playground

In [None]:
def remove_single_digits(price):
    # Remove single standalone digits
    price = re.sub(r'\b\d\b', '', price)

    # Remove digits followed immediately by letters
    price = re.sub(r'\b\d(?=[a-zA-Z])', '', price)

    return price

# Remove numbers from records with like $xx pw 6 months only (so that it doesn't affect price extraction)
test['rental_price'] = test['rental_price'].apply(remove_single_digits)

# Define weekly terms
weekly_terms = ['pw', 'per week', '/w', 'weekly', 'week', 'p.w.', 'p.w', 'p/w', '/week', 'a week']
weekly_terms_pattern = '|'.join(re.escape(term) for term in weekly_terms)

def extract_numbers(price):
    # Pattern to match standalone numbers or numbers followed by weekly terms
    pattern = fr'\b\d+(?:,\d{{3}})*(?:\.\d+)?\b(?=\s*(?:{weekly_terms_pattern})|\b)(?![\d/\s])'

    # Find all matches in the input string
    matches = re.findall(pattern, price)

    # Convert matches to float after cleaning commas
    numerical_values = [float(match.replace(',', '').strip()) for match in matches]

    # Return the extracted numbers or None if there are no matches
    return numerical_values if numerical_values else None

import re

# Define the weekly terms
weekly_terms = ['pw', 'per week', '/w', 'weekly', 'week', 'p.w.', 'p.w', 'p/w', '/week', 'a week']

# Convert the weekly terms into a regex pattern
weekly_terms_pattern = '|'.join(map(re.escape, weekly_terms))

# Function to extract numbers based on conditions
def extract_numbers(rental_price):
    # Pattern for extracting numbers after a dollar sign
    dollar_pattern = r'\$\d{1,3}(?:,\d{3})*(?:\.\d+)?'  # Handles commas and decimals
    
    # Pattern for standalone numbers or those followed by an alphabet or weekly terms
    non_dollar_pattern = fr'\b\d{{1,3}}(?:,\d{{3}})*(?:\.\d+)?(?=\b|[a-zA-Z]|(?:\s*(?:{weekly_terms_pattern})))'
    
    # Extract numbers after a dollar sign
    dollar_matches = re.findall(dollar_pattern, rental_price)

    # Extract standalone numbers or those followed by an alphabet or weekly terms
    non_dollar_matches = re.findall(non_dollar_pattern, rental_price)

    # Combine all matches and clean up the formatting
    matches = [match.replace('$', '').replace(',', '') for match in dollar_matches + non_dollar_matches]

    return [float(num) for num in matches] if matches else None

# Example usage with a DataFrame
import pandas as pd

# Sample data
df = pd.DataFrame({
    'rental_price': [
        '$650', 
        '1,200 per week', 
        '560pw', 
        'only 650 dollars', 
        'price is 700 /w', 
        '$1,500 and 750 weekly'
    ]
})

# Apply the function to the rental_price column
df['rental_numerical'] = df['rental_price'].apply(extract_numbers)

print(df)
