# Preprocessing Rental Data

In this section we will proceed with some cleaning and preprocessing of the *domain.com.au* rental data.

### Importing Libraries and Functions

In [5]:
import pandas as pd
import json
import folium
import re


### Reading in the Rental Data

In [6]:
# Load the example raw scraped data we were given so we can inspect it.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform's default locale encoding (which differs
# between operating systems and can fail on non-ASCII characters).
with open('../data/landing/testing_scraping.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

print(data)

{'https://www.domain.com.au/667-glenhuntly-road-caulfield-vic-3162-11598047': {'name': '667 Glenhuntly Road, Caulfield VIC 3162', 'cost_text': '$38,000 p.a. Incl. Outgoings + GST', 'rooms': [], 'parking': [], 'desc': 'Can you hear it? Opportunity knocking!!', 'property_type': 'House', 'date_available': 'Available Now', 'bond': nan, 'property_features': [], 'coordinates': ['-37.8860233', '145.0173065']}, 'https://www.domain.com.au/8-chamberlain-road-redcastle-vic-3523-16505678': {'name': '8 Chamberlain Road, Redcastle VIC 3523', 'cost_text': '$28,000 Per Year !!', 'rooms': [], 'parking': [], 'desc': 'Century 21 Paramount Realtors proudly presents to you this beautiful home located in Redcastle.', 'property_type': 'Vacant land', 'date_available': 'Available Now', 'bond': nan, 'property_features': [], 'coordinates': ['-36.7247292', '144.7609424']}, 'https://www.domain.com.au/6501-35-queensbridge-street-southbank-vic-3006-16869807': {'name': '6501/35 Queensbridge Street, Southbank VIC 3006

### Getting Initial Insights on the Data

* Checking what keys we have in the data

In [7]:
# Peek at the first listing to see which fields a record carries.
first_item = data[next(iter(data))]
headings = [*first_item]

print(headings)

['name', 'cost_text', 'rooms', 'parking', 'desc', 'property_type', 'date_available', 'bond', 'property_features', 'coordinates']


* Looking at the values that can be found in the *cost_text* key

In [8]:
# Collect every distinct 'cost_text' value so we can see which price formats
# the extraction step has to cope with.

unique_cost_text_values = {v['cost_text'] for v in data.values() if 'cost_text' in v}

# Print the unique values in sorted order. Iterating the set directly gives a
# different order on every kernel restart (string hashes are randomised per
# process), which makes the output impossible to reproduce or diff.
# key=str keeps the sort working even if a value is not a string.
print("Unique values in the 'cost_text' column:")
for value in sorted(unique_cost_text_values, key=str):
    print(value)

Unique values in the 'cost_text' column:
$2,600 per week.
$3,850
$3,250.00
$3400 Per Week
$2900 pw
$3,750 Per Week
$12,500 pw
Fine Furnished Residences @ Westprecinct
$2995 Per Week
See property description for tariffs.
$45,000 for winter season
$2,950 per week
$3,375 per week
$5,750.00
$49,500 for the season
$9,999 pw
$3,800.00 per week
$38,000 p.a. Incl. Outgoings + GST
5000
$3900 pw Furnished
Fully Furn - $5000 p/w, 3-Mos+ Lease, Avail Nov 18
$3,300.00
$3,000.00
Holiday Let-Per night rate
$28,000 Per Year !!
$2700 Per Week
$2990 per week
$4,000 per week
$2,650 Per week
$3,800 weekly
$2,650.00
$2750 Per Week
See property description for tariffs
$2,900
$4500 Per Week
$3,302
$2800.00


### Feature Engineering

1. EXTRACTING THE WEEKLY RATES FROM COST TEXT

In [9]:
# Function to extract weekly cost
def extract_weekly_cost(cost_text):
    """Estimate a weekly rent in dollars from a free-text price string.

    The text is checked, in this order, for:

    1. a dollar amount directly followed by a weekly unit ("per week", "pw",
       "p/w", "p.w.", "week", "weekly") - returned unchanged;
    2. a dollar amount directly followed by an annual unit ("p.a.", "pa",
       "annum", "per year", "annual", "year") - divided by 52;
    3. a dollar amount directly followed by a monthly unit ("p/m", "month",
       "pm", "p.m.") - divided by 4.3;
    4. the word "season" anywhere, plus a dollar amount - divided by 13;
    5. nothing but a number, optionally prefixed with "$" - assumed weekly.

    Matching is case-insensitive.

    Parameters
    ----------
    cost_text : str
        Raw price text. Any non-string value (for example None or NaN) is
        treated as unparseable.

    Returns
    -------
    float or None
        The estimated weekly cost, or None when no rule applies.
    """
    # Scraped fields may be missing or NaN (a float); re.search would raise
    # TypeError on those, so treat anything that is not text as "no price".
    if not isinstance(cost_text, str):
        return None

    # An amount must start with a digit so that a stray "," or "$," can never
    # be captured and then fail the float() conversion.
    amount = r"\d[\d,]*\.?\d*"

    # A unit keyword must not run straight into more letters; otherwise "pa"
    # would match the start of "Partially" / "Parking" and the price would be
    # wrongly treated as annual. (IGNORECASE makes [a-z] cover capitals too.)
    unit_end = r"(?![a-z])"

    # (unit alternatives, number of weeks the quoted amount covers)
    period_rules = [
        (r"per week(?:ly)?|pw|p/w|p\.w\.|week(?:ly)?", 1),
        (r"p\.a\.|pa|annum|per year|annual(?:ly)?|year(?:ly)?", 52),
        (r"p/m|month(?:ly)?|pm|p\.m\.", 4.3),
    ]

    def to_number(amount_text):
        # Drop thousands separators before converting.
        return float(amount_text.replace(',', ''))

    # Weekly first, then annual, then monthly: the first rule that matches wins.
    for units, weeks_in_period in period_rules:
        pattern = r"\$(" + amount + r")\s*(?:" + units + r")" + unit_end
        match = re.search(pattern, cost_text, re.IGNORECASE)
        if match:
            return to_number(match.group(1)) / weeks_in_period

    # Check for seasonal cost (divide by 13 for an estimate of weekly cost)
    if re.search(r'\b(season)\b', cost_text, re.IGNORECASE):
        match = re.search(r"\$(" + amount + r")", cost_text)
        if match:
            return to_number(match.group(1)) / 13

    # If there's only a number or just a dollar amount, assume it's the weekly cost
    match = re.search(r"^\$?(" + amount + r")$", cost_text.strip())
    if match:
        return to_number(match.group(1))

    # If none of the patterns match, return None
    return None


In [10]:
# Run the extractor over every listing, keeping the raw text and the
# coordinates alongside the parsed weekly figure.
weekly_costs = {}
for url, listing in data.items():
    raw_cost = listing.get('cost_text', '')
    weekly_costs[url] = dict(
        weekly_cost=extract_weekly_cost(raw_cost),
        cost_text=raw_cost,
        coordinates=listing.get('coordinates', []),
    )

In [11]:
# Build a table with one row per listing, keep only the rows where a weekly
# cost could be parsed, and make sure that column is numeric.
df = (
    pd.DataFrame.from_dict(weekly_costs, orient='index')
    .dropna(subset=['weekly_cost'])
    .assign(weekly_cost=lambda frame: pd.to_numeric(frame['weekly_cost']))
)

# The ten listings with the highest weekly cost
top_10_expensive = df.nlargest(10, 'weekly_cost')

### Data Visualisation

1. Top 10 most expensive rental properties

In [12]:
# Create the base map centered on Victoria, Australia
m = folium.Map(location=[-37.4713, 144.7852],  # Coordinates for Victoria, Australia
               tiles="cartodb positron",
               zoom_start=7,
               zoom_control=False,
               width=475,
               height=500)

# Add markers for the top 10 most expensive places
for coordinates, weekly_cost in zip(top_10_expensive['coordinates'],
                                    top_10_expensive['weekly_cost']):
    # A listing may have no usable coordinates (empty list, missing value or
    # non-numeric text). Skip it rather than letting one bad record abort the
    # whole map.
    try:
        lat, lon = float(coordinates[0]), float(coordinates[1])
    except (IndexError, TypeError, ValueError):
        continue
    if lat != lat or lon != lon:  # NaN is the only value not equal to itself
        continue

    folium.Marker(
        location=[lat, lon],
        # Two decimals: converted annual/seasonal prices are otherwise shown
        # with the full float expansion (e.g. 730.7692307692307).
        popup=f"Cost: ${weekly_cost:,.2f}",
        icon=folium.Icon(icon='home', color='red')  # icon is a house :)
    ).add_to(m)

# Display the map
m
