# Preprocessing

In this section we will clean and preprocess all rental property data.

We will also fetch the locations of a selection of important amenities and of the major city centres within Victoria, and calculate the driving distance from each property to them.

The locations and the driving distances come from different API services: the Overpass API supplies the locations, and Open Route Service (ORS) supplies the driving distances.

### Importing Libraries and Functions

In [2]:
import pandas as pd
import json
import folium
import overpy
import os
import sys
sys.path.append("../")
from scripts.preproccessing import extract_weekly_cost, extract_house_details, extract_latitude, extract_longitude, extract_suburb
from scripts.preproccessing import check_empty_or_zero, clean_property_type, add_data
from scripts.preproccessing import split_by_gcc


### Reading in the Rental Data

In [4]:
# Read in raw domain data.
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of relying on the locale's default encoding.
with open('../data/landing/all_properties_metadata.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

### Feature Engineering of Domain Data

1. Extracting the weekly rates from the *'cost_text'* field

In [5]:
# Build a copy of every listing record with an added (or overwritten)
# 'weekly_cost' entry parsed from its 'cost_text' field. Listings without a
# 'cost_text' are parsed from an empty string.
weekly_costs = {
    listing_key: {
        **listing,
        'weekly_cost': extract_weekly_cost(listing.get('cost_text', '')),
    }
    for listing_key, listing in data.items()
}

In [6]:
# One row per listing (indexed by the dictionary key), keeping only rows that
# have a weekly cost and storing that cost as a numeric column.
domain_data_df = (
    pd.DataFrame.from_dict(weekly_costs, orient='index')
    .dropna(subset=['weekly_cost'])
    .assign(weekly_cost=lambda frame: pd.to_numeric(frame['weekly_cost']))
)

domain_data_df.head()

Unnamed: 0,name,cost_text,rooms,parking,desc,property_type,date_available,bond,property_features,coordinates,weekly_cost
https://www.domain.com.au/5-2-elata-street-oakleigh-south-vic-3167-17176641,"5/2 Elata Street, Oakleigh South VIC 3167",$660 per week,"[3 Beds, 2 Baths]",[3 Parking],This spacious and stylish 3 bedroom unit with ...,Apartment / Unit / Flat,Available Now,$2868,"[Air conditioning, Ducted Heating, Split-Syste...","[-37.9112993, 145.0956022]",660.0
https://www.domain.com.au/10-trevena-close-rowville-vic-3178-17192717,"10 Trevena Close, Rowville VIC 3178",$670,"[3 Beds, 2 Baths]",[2 Parking],* Unverified feature,House,,$2911,"[Internal Laundry*, Shed*, Balcony / Deck*]","[-37.9119049, 145.25462]",670.0
https://www.domain.com.au/20a-kirstina-road-glen-waverley-vic-3150-17182160,"20A Kirstina Road, Glen Waverley VIC 3150","$1,000","[4 Beds, 3 Baths]",[3 Parking],David Hua,Townhouse,Available Now,$4345,"[Ensuite, Floorboards, Built in wardrobes, Int...","[-37.890415, 145.1665739]",1000.0
https://www.domain.com.au/3-20-rankin-road-boronia-vic-3155-17196273,"3/20 Rankin Road, Boronia VIC 3155",$550 pw,"[3 Beds, 2 Baths]",[2 Parking],* Unverified feature,Townhouse,,$2390,"[Internal Laundry*, Secure Parking*, Heating*,...","[-37.8635282, 145.2720704]",550.0
https://www.domain.com.au/98-halcyon-road-deanside-vic-3336-17161911,"98 Halcyon Road, Deanside VIC 3336",$560 pw,"[4 Beds, 2 Baths]",[2 Parking],Engage Real Estate proudly present this stunni...,House,Available Now,$2433,[],"[-37.7357938, 144.6931697]",560.0


2. Preparation for merging with additional house data

In [7]:
# PROCESSING AND SAVING DATA
out_dir = '../data/raw/domain/'
# exist_ok=True avoids the check-then-create race and is a no-op on re-runs
os.makedirs(out_dir, exist_ok=True)

# Extract house details for merge
processed_domain_data = extract_house_details(domain_data_df)

# Keep only listings whose coordinates are neither empty nor zero. The .copy()
# makes the column assignments below act on an independent frame rather than
# on a filtered selection of the extracted data.
has_coordinates = ~processed_domain_data['coordinates'].apply(check_empty_or_zero)
processed_domain_data = processed_domain_data[has_coordinates].copy()
processed_domain_data = clean_property_type(processed_domain_data)

# Split the coordinates into separate columns, then drop incomplete rows and
# the now-redundant 'coordinates' column
processed_domain_data['latitude'] = processed_domain_data['coordinates'].apply(extract_latitude)
processed_domain_data['longitude'] = processed_domain_data['coordinates'].apply(extract_longitude)
processed_domain_data = processed_domain_data.dropna(subset=['longitude', 'latitude', 'suburb'])
processed_domain_data = processed_domain_data.drop(columns=['coordinates'], errors='ignore')

# Quick preprocessing on suburb
processed_domain_data['suburb'] = processed_domain_data['suburb'].apply(extract_suburb)

# Remove rows whose cleaned suburb starts with a digit.
# - raw string: '\d' in a normal string literal is an invalid escape sequence
# - na=False: a missing suburb counts as "no match" instead of putting NaN in
#   the mask, which boolean indexing rejects
starts_with_digit = processed_domain_data['suburb'].str.match(r'^\d', na=False)
processed_domain_data = processed_domain_data[~starts_with_digit]

processed_domain_data.to_csv(f"{out_dir}all_domain_properties.csv", index=False)


In [8]:
# Compare the suburb column before and after cleaning by printing the first
# few values of each in sorted order.
raw_suburbs = extract_house_details(domain_data_df)['suburb'].sort_values()
cleaned_suburbs = processed_domain_data['suburb'].sort_values()

for suburbs in (raw_suburbs, cleaned_suburbs):
    print(suburbs.head())

https://www.domain.com.au/5005-45-clarke-street-5005-263-city-southbank-vic-3006-17119175    (5005/263 city), southbank
https://www.domain.com.au/1-98-blyth-street-altona-vic-3018-17056290                                           , altona
https://www.domain.com.au/4-19-mcilwriath-st-carlton-vic-3053-14413684                                        , carlton
https://www.domain.com.au/3002b-11-rose-lane-melbourne-vic-3000-17195136                                    , melbourne
https://www.domain.com.au/2e-hines-lane-mount-egerton-vic-3352-16834853                                 , mount egerton
Name: suburb, dtype: object
https://www.domain.com.au/109-6-acacia-place-abbotsford-vic-3067-16969687         abbotsford
https://www.domain.com.au/4-church-street-abbotsford-vic-3067-16934548            abbotsford
https://www.domain.com.au/4-16-princes-st-abbotsford-vic-3067-17192365            abbotsford
https://www.domain.com.au/302-312-johnston-street-abbotsford-vic-3067-17007645    abbotsford


### Split property data by *Greater Melbourne* & *Rest of Vic* for separate analysis

In [9]:
# Reading in the datasets
# Domain listings as saved by the domain processing cell above
domain_df = pd.read_csv("../data/raw/domain/all_domain_properties.csv")
# Historical listings from the landing directory
oldlistings_df = pd.read_csv("../data/landing/oldlisting/oldlisting.csv")


In [None]:
# Base directory handed to split_by_gcc for its output
output_dir = "../data/raw"

# Split each dataset by GCC region (Greater Melbourne / Rest of Vic).
# NOTE(review): the label passed here is "oldlistings", while later cells read
# from an "oldlisting/" directory — confirm how split_by_gcc names its output.
split_by_gcc(oldlistings_df, output_dir, "oldlistings")
split_by_gcc(domain_df, output_dir, "domain")

### Driving Distances of Properties to Points of Interest (POIs)

#### Importing Necessary Libraries and Functions

In [None]:
from scripts.driving_dist_functions import fetch_amenities, get_amenity_distances
from scripts.driving_dist_functions import get_cities, get_dist_to_city

#### Accessing OverPass API

In [None]:
# Initialize the Overpass API client; it is passed to fetch_amenities and
# get_cities in the cells below
overpass_api = overpy.Overpass()

#### Reading in the datasets
*(optional)*

In [None]:
# Domain datasets
# NOTE(review): both the Greater Melbourne and Rest of Vic frames are loaded
# from the SAME file, so they are identical here. This looks unintended —
# confirm the per-region paths produced by split_by_gcc and point each frame
# at its own file.
gm_domain_df = pd.read_parquet("../data/curated/properties.parquet")
rv_domain_df = pd.read_parquet("../data/curated/properties.parquet")

# oldlistings datasets
gm_oldlisting_df = pd.read_csv("../data/raw/oldlisting/gm_oldlisting.csv")
rv_oldlisting_df = pd.read_csv("../data/raw/oldlisting/rv_oldlisting.csv")

#### Fetching the Amenities Data

In [None]:
# Dictionary storing the queries for different amenities (nodes and ways).
# Each value is a two-item list: [node statements, way statements]. The
# statements are fragments that reference `area.searchArea` without defining
# it — presumably fetch_amenities wraps them in a complete Overpass query
# (verify against scripts.driving_dist_functions).
queries = {
    "education": ["""
        node["amenity"="kindergarten"](area.searchArea);
        node["amenity"="school"](area.searchArea);
    """, 
    """
        way["amenity"="kindergarten"](area.searchArea);
        way["amenity"="school"](area.searchArea);
    """],
    "parks_and_gardens": ["""
        node["leisure"="park"](area.searchArea);
        node["leisure"="garden"](area.searchArea);
    """,
    """
        way["leisure"="park"](area.searchArea);
        way["leisure"="garden"](area.searchArea);
    """],
    "train_station": ["""
        node["railway"="station"](area.searchArea);
    """, 
    """
        way["railway"="station"](area.searchArea);
    """],
    "shopping": ["""
        node["shop"="supermarket"](area.searchArea);
        node["shop"="mall"](area.searchArea);
    """, 
    """
        way["shop"="supermarket"](area.searchArea);
        way["shop"="mall"](area.searchArea);
    """],
    "healthcare": ["""
        node["amenity"="hospital"](area.searchArea);
        node["amenity"="clinic"](area.searchArea);
    """,
    """
        way["amenity"="hospital"](area.searchArea);
        way["amenity"="clinic"](area.searchArea);
    """]
}


In [None]:
# Maps each amenity type to the DataFrame returned for it
amenities_dfs = {}

# Fetch every amenity type in turn. A failure is reported and skipped so that
# one bad query does not stop the remaining fetches.
for amenity_type, query_pair in queries.items():
    try:
        # First item holds the node statements, second the way statements
        fetched = fetch_amenities(overpass_api, query_pair[0], query_pair[1])
    except Exception as exc:
        print(f"Error fetching data for {amenity_type}: {exc}")
    else:
        amenities_dfs[amenity_type] = fetched
        print(f"Successfully fetched data for {amenity_type}")

Successfully fetched data for education
Successfully fetched data for parks_and_gardens
Successfully fetched data for train_station
Successfully fetched data for shopping
Successfully fetched data for healthcare


In [None]:
# Display the fetched amenity DataFrames, keyed by amenity type
amenities_dfs

{'education':               id                                     name       amenity  \
 0      148544339                        Syndal Pre-School  kindergarten   
 1      191834621                       Tally Ho Preschool  kindergarten   
 2      207718805                      St Johns Pre-School  kindergarten   
 3      246969693             Waverley Foothills Preschool  kindergarten   
 4      247169615           Brunswick Crèche & Day Nursery  kindergarten   
 ...          ...                                      ...           ...   
 3475  1315871094  Shine Bright St Margaret's Kindergarten  kindergarten   
 3476  1316142653                 St Albans East Preschool  kindergarten   
 3477  1318009340             Country Bunch Early Learning  kindergarten   
 3478  1318025539                    Montessori Beginnings  kindergarten   
 3479  1318235998                   Happy Turtle Childcare  kindergarten   
 
                                  lat                            lon  
 0

#### Fetching Major City Coordinates in Victoria

In [None]:
# Define the query to get major cities within Victoria. Only nodes tagged
# place=city are selected; towns are not matched by this query.
query = """
[out:json];
area[name="Victoria"]->.searchArea;
(
  node["place"="city"](area.searchArea);
);
out body;
"""


In [None]:
# Fetches the cities matched by the query and stores them as a dataframe
cities_df = get_cities(overpass_api, query)
# Preview up to the first ten cities
cities_df.head(10)

Unnamed: 0,name,place_type,lat,lon
0,Melbourne,city,-37.8142454,144.9631732
1,Mildura,city,-34.195274,142.1503146
2,Bendigo,city,-36.7590183,144.2826718
3,Geelong,city,-38.1493248,144.3598241
4,Ballarat,city,-37.5623013,143.8605645
5,Shepparton,city,-36.3831633,145.3988874
6,Warrnambool,city,-38.3826242,142.4814199
7,Traralgon,city,-38.1946636,146.5381646
8,Wodonga,city,-36.1205539,146.8880837


#### Getting Driving Distance to Cities and Amenities

In this section you will need to create an Open Route Service (ORS) account and obtain your own personal API key, which you then paste into the *api_keys* list below.

Then you may run the function. We have left 2 API keys if you would like to test out the code.

In [None]:
# Initialising ORS api_keys list

api_keys = [
    #'5b3ce3597851110001cf62484999c1f7edce4ac5a072b1c9fb50ffa2', # 500 call limit
    #'5b3ce3597851110001cf6248c9d76723ef574cf3a8479cd0665e80fa', # 2500 call limit 
            ]



1. Getting all the distances to the Major Cities

In [None]:
# Greater Melbourne - oldlisting data

gm_c_oldlisting_df = get_dist_to_city(gm_oldlisting_df, cities_df, api_keys)
# index=False keeps the row index out of the file; otherwise it comes back as
# a spurious "Unnamed: 0" column when the CSV is read again. This matches the
# other to_csv calls in this notebook.
gm_c_oldlisting_df.to_csv("../data/raw/oldlisting/gm_c_oldlisting.csv", index=False)

In [None]:
# Rest of Vic - oldlisting data

rv_c_oldlisting_df = get_dist_to_city(rv_oldlisting_df, cities_df, api_keys)
# index=False keeps the row index out of the file; otherwise it comes back as
# a spurious "Unnamed: 0" column when the CSV is read again. This matches the
# other to_csv calls in this notebook.
rv_c_oldlisting_df.to_csv("../data/raw/oldlisting/rv_c_oldlisting.csv", index=False)

In [None]:
# Greater Melbourne - Domain data

gm_c_domain_df = get_dist_to_city(gm_domain_df, cities_df, api_keys)
# index=False keeps the row index out of the file; otherwise it comes back as
# a spurious "Unnamed: 0" column when the CSV is read again. This matches the
# other to_csv calls in this notebook.
gm_c_domain_df.to_csv("../data/raw/domain/gm_c_domain.csv", index=False)

In [None]:
# Rest of Vic - Domain data

rv_c_domain_df = get_dist_to_city(rv_domain_df, cities_df, api_keys)
# index=False keeps the row index out of the file; otherwise it comes back as
# a spurious "Unnamed: 0" column when the CSV is read again (as the amenity
# cell for this dataset does). This matches the other to_csv calls in this
# notebook.
rv_c_domain_df.to_csv("../data/raw/domain/rv_c_domain.csv", index=False)

KeyboardInterrupt: 

2. Getting all the distances to the Amenities

In [None]:
# Greater Melbourne - oldlisting data

gm_c_a_oldlisting_df = get_amenity_distances(gm_c_oldlisting_df, amenities_dfs, api_keys)
# index=False keeps the row index out of the file, so the oldlisting
# preprocessing step does not receive an extra "Unnamed: 0" column.
gm_c_a_oldlisting_df.to_csv("../data/raw/oldlisting/gm_c+a_oldlisting.csv", index=False)

Processing education...
Error with batch 0.0: 403 ({'error': 'Quota exceeded'})
Quota limit exceeded for API key 5b3ce3597851110001cf62484999c1f7edce4ac5a072b1c9fb50ffa2
Using a new key... Waiting for 10 seconds before continuing.
Error with batch 360.0: HTTP Error: 502
Unhandled error occurred: HTTP Error: 502. Retrying after 10 seconds...
Processing parks_and_gardens...
Error with batch 0.0: 403 ({'error': 'Quota exceeded'})
Quota limit exceeded for API key 5b3ce3597851110001cf62484999c1f7edce4ac5a072b1c9fb50ffa2
Using a new key... Waiting for 10 seconds before continuing.
Error with batch 148.0: 403 ({'error': 'Quota exceeded'})
Quota limit exceeded for API key 5b3ce3597851110001cf6248fdd0ae85071d43598ef26e7a446a4f78
Using a new key... Waiting for 10 seconds before continuing.
Processing train_station...
Error with batch 0.0: 403 ({'error': 'Quota exceeded'})
Quota limit exceeded for API key 5b3ce3597851110001cf62484999c1f7edce4ac5a072b1c9fb50ffa2
Using a new key... Waiting for 10 s

In [None]:
# Rest of Vic - oldlisting data

rv_c_a_oldlisting_df = get_amenity_distances(rv_c_oldlisting_df, amenities_dfs, api_keys)
# index=False keeps the row index out of the file, so the oldlisting
# preprocessing step does not receive an extra "Unnamed: 0" column.
rv_c_a_oldlisting_df.to_csv("../data/raw/oldlisting/rv_c+a_oldlisting.csv", index=False)

Processing education...
Processing parks_and_gardens...
Processing train_station...
Processing shopping...
Error with batch 186.0: HTTP Error: 502
Unexpected error occurred: HTTP Error: 502.
Retrying after 3 seconds...
Processing healthcare...


In [None]:
# Greater Melbourne - Domain data

gm_c_a_domain_df = get_amenity_distances(gm_c_domain_df, amenities_dfs, api_keys)
# index=False keeps the row index out of the file; otherwise it comes back as
# a spurious "Unnamed: 0" column when the CSV is read for the merge.
gm_c_a_domain_df.to_csv("../data/raw/domain/gm_c+a_domain.csv", index=False)

In [None]:
# Rest of Vic - Domain data
# The city-distance result is reloaded from disk rather than taken from memory,
# so this cell can run without repeating the city-distance step.
rv_c_domain_df = pd.read_csv("../data/raw/domain/rv_c_domain.csv")
rv_c_a_domain_df = get_amenity_distances(rv_c_domain_df, amenities_dfs, api_keys)
# index=False keeps the row index out of the file; otherwise it comes back as
# a spurious "Unnamed: 0" column when the CSV is read for the merge.
rv_c_a_domain_df.to_csv("../data/raw/domain/rv_c+a_domain.csv", index=False)

Processing education...
Processing parks_and_gardens...
Processing train_station...
Processing shopping...
Processing healthcare...


### Preprocess Oldlisting Data

In [5]:
# Import preprocessing functions
from scripts.preprocess_oldlistings import lowercase_string_attributes
from scripts.preprocess_oldlistings import preprocess_dates
from scripts.preprocess_oldlistings import preprocess_bbp
from scripts.preprocess_oldlistings import preprocess_address
from scripts.preprocess_oldlistings import preprocess_house_type
from scripts.preprocess_oldlistings import get_weekly_price

In [None]:
# Defining datasets and inputs
read_dir = '../data/raw/oldlisting/'
out_dir = '../data/raw/oldlisting/'

# Creating the directory if it doesn't yet exist.
# exist_ok=True replaces the separate existence check, removing the
# check-then-create race and making re-runs a no-op.
os.makedirs(out_dir, exist_ok=True)

# Adding all wanted dataset names into list
DATASETS = ['gm_c+a_oldlisting.csv', 'rv_c+a_oldlisting.csv']

In [None]:
# Applies preprocessing steps to all datasets

# Output file for each input dataset. An explicit mapping replaces the old
# if/else, whose catch-all branch would have written ANY dataset other than the
# Greater Melbourne one over the Rest-of-Vic output.
FINAL_NAMES = {
    'gm_c+a_oldlisting.csv': 'gm_oldlisting_final.csv',
    'rv_c+a_oldlisting.csv': 'rv_oldlisting_final.csv',
}

for i, region in enumerate(DATASETS, start=1):

    # Fail before doing any work if a dataset has no configured output name
    if region not in FINAL_NAMES:
        raise KeyError(f"No output filename configured for dataset '{region}'")

    print(f"\n{i}. Preprocessing {region}...\n")

    # Step 1: Read in dataframe
    listings_df = pd.read_csv(f"{read_dir}{region}")

    # Step 2: Drops any index columns that were added on when opening and
    # saving dataset previously
    cols_to_remove = [col for col in listings_df.columns if "Unnamed:" in col]
    listings_df = listings_df.drop(columns=cols_to_remove)

    # Step 3: Dropping duplicate rows
    listings_df = listings_df.drop_duplicates()

    # Step 4: Lowercasing all the values that are strings
    # NOTE: per the original author, this only lowercases 3 columns; there are
    # more string columns.
    listings_df = lowercase_string_attributes(listings_df)

    # Step 5: Formatting suburb names for readability.
    # regex=False makes "+" a literal plus sign explicitly, instead of relying
    # on str.replace's default, which differs between pandas versions.
    listings_df["suburb"] = listings_df["suburb"].str.replace("+", " ", regex=False)

    # Step 6: Converting dates from [yyyy, MM] to [yyyy]
    listings_df['dates'] = listings_df['dates'].apply(preprocess_dates)

    # Step 7: Handling incorrect or missing values for no. of beds, baths and
    # parking spaces
    listings_df = preprocess_bbp(listings_df)

    # Step 8: Formatting address into "House No., Street Name"
    # TODO: remove the trailing comma from the end of street names
    listings_df = preprocess_address(listings_df)

    # Step 9: Filtering the house types
    listings_df = preprocess_house_type(listings_df)

    # Step 10: Converting price to weekly cost
    listings_df = get_weekly_price(listings_df)

    # Preview the processed rows
    print(listings_df.head())

    # Saving the finalised dataframe into its output file
    listings_df.to_csv(f"{out_dir}{FINAL_NAMES[region]}", index=False)


### Merge Oldlisting and Domain Data by GCC Region

In [30]:
# Merge Greater Melbourne Data
read_dir = '../data/raw/'

# Columns kept from both sources, so the two frames share one schema before
# being stacked.
# NOTE(review): the amenity queries include "shopping", but no shopping
# distance column is selected here — confirm this is intentional.
gm_merge_cols = [
    'SA2_CODE21', 'SA2_NAME21', 'GCC_NAME21', 'suburb', 'postcode',
    'address', 'latitude', 'longitude', 'beds', 'baths', 'parking',
    'dist_to_city', 'dist_to_education',
    'dist_to_parks_and_gardens', 'dist_to_train_station',
    'dist_to_healthcare', 'date_available', 'weekly_cost',
]

gm_oldlisting_df = pd.read_csv(f"{read_dir}oldlisting/gm_oldlisting_final.csv")[gm_merge_cols]
gm_domain_df = pd.read_csv(f"{read_dir}domain/gm_c+a_domain.csv")[gm_merge_cols]

# Stack oldlisting rows above domain rows with a fresh 0..n-1 index
all_gm_df = pd.concat([gm_oldlisting_df, gm_domain_df], axis=0, ignore_index=True)

In [None]:
# Merge Rest of Vic Data
read_dir = '../data/raw/'

# Columns kept from both sources, so the two frames share one schema before
# being stacked.
# NOTE(review): the amenity queries include "shopping", but no shopping
# distance column is selected here — confirm this is intentional.
rv_merge_cols = [
    'SA2_CODE21', 'SA2_NAME21', 'GCC_NAME21', 'suburb', 'postcode',
    'address', 'latitude', 'longitude', 'beds', 'baths', 'parking',
    'dist_to_city', 'dist_to_education',
    'dist_to_parks_and_gardens', 'dist_to_train_station',
    'dist_to_healthcare', 'date_available', 'weekly_cost',
]

rv_oldlisting_df = pd.read_csv(f"{read_dir}oldlisting/rv_oldlisting_final.csv")[rv_merge_cols]
rv_domain_df = pd.read_csv(f"{read_dir}domain/rv_c+a_domain.csv")[rv_merge_cols]

# Stack oldlisting rows above domain rows with a fresh 0..n-1 index
all_rv_df = pd.concat([rv_oldlisting_df, rv_domain_df], axis=0, ignore_index=True)

#### Add External Datasets

In [34]:
# Prep Greater Melbourne data to merge with External Data:
# 'date_available' becomes 'year', stored as a string.
# NOTE(review): astype(str) turns missing values into the text 'nan', which a
# later dropna() will not remove — confirm no years are missing at this point.
all_gm_df = (
    all_gm_df
    .rename(columns={'date_available': 'year'})
    .astype({'year': str})
)

In [35]:

# Prep Rest of Vic data with External Data:
# 'date_available' becomes 'year', stored as a string.
# NOTE(review): astype(str) turns missing values into the text 'nan', which a
# later dropna() will not remove — confirm no years are missing at this point.
all_rv_df = (
    all_rv_df
    .rename(columns={'date_available': 'year'})
    .astype({'year': str})
)

In [36]:
# Combine with external data: add_data is applied to each region's merged
# listings, and the result is kept separately from the input frame
gm_with_external_df = add_data(all_gm_df)
rv_with_external_df = add_data(all_rv_df)

In [37]:
# Check for missing values post-merge and drop

# Row counts before removing incomplete records
gm_rows_before = len(gm_with_external_df)
rv_rows_before = len(rv_with_external_df)

# Keep only rows with no missing value in any column
gm_with_external_df = gm_with_external_df.dropna()
rv_with_external_df = rv_with_external_df.dropna()

gm_rows_removed = gm_rows_before - len(gm_with_external_df)
rv_rows_removed = rv_rows_before - len(rv_with_external_df)

print(f"NaN records removed from GM Forecast: {gm_rows_removed}")
print(f"NaN records removed from RV Forecast: {rv_rows_removed}")


NaN records removed from GM Forecast: 0
NaN records removed from RV Forecast: 2


#### Create Forecast Template to Predict Prices On

In [38]:
# Forecast horizon: the years 2025-2029, held as strings
years = [str(year) for year in range(2025, 2030)]

# One row per forecast year, plus a constant 'key' column used as the dummy
# join key for the cross join with the property table
years_df = pd.DataFrame({"year": years}).assign(key=1)

In [39]:
# One row per distinct address (first occurrence kept), without the
# year/price columns, plus the constant 'key' column used as the dummy join
# key for the cross join with the forecast years.
unique_gm_properties_df = (
    all_gm_df
    .drop_duplicates(["address"])
    .drop(columns=['year', 'weekly_cost'])
    .assign(key=1)
)
unique_rv_properties_df = (
    all_rv_df
    .drop_duplicates(["address"])
    .drop(columns=['year', 'weekly_cost'])
    .assign(key=1)
)

In [40]:
# Pair every unique property with every forecast year by joining on the
# constant dummy key, then discard the key.
gm_template_df = unique_gm_properties_df.merge(years_df, on='key').drop(columns='key')
rv_template_df = unique_rv_properties_df.merge(years_df, on='key').drop(columns='key')


In [41]:
# Apply add_data to the forecast templates, as was done for the merged
# listings, so both sets of frames pass through the same step
gm_template_external_df = add_data(gm_template_df)
rv_template_external_df = add_data(rv_template_df)

In [42]:
# Check for missing values post-merge and drop

# Row counts before removing incomplete records
gm_template_rows_before = len(gm_template_external_df)
rv_template_rows_before = len(rv_template_external_df)

# Keep only rows with no missing value in any column
gm_template_external_df = gm_template_external_df.dropna()
rv_template_external_df = rv_template_external_df.dropna()

gm_template_rows_removed = gm_template_rows_before - len(gm_template_external_df)
rv_template_rows_removed = rv_template_rows_before - len(rv_template_external_df)

print(f"NaN records removed from GM Forecast: {gm_template_rows_removed}")
print(f"NaN records removed from RV Forecast: {rv_template_rows_removed}")

NaN records removed from GM Forecast: 0
NaN records removed from RV Forecast: 10


#### Drop Redundant Columns and Save

In [43]:
# Define output directories: one for the training datasets, one for the
# forecast templates
out_dir = '../data/curated/final_datasets/'
template_out_dir =  '../data/curated/forecast/'

# exist_ok=True replaces the separate existence checks, removing the
# check-then-create race and making re-runs a no-op
for directory in (out_dir, template_out_dir):
    os.makedirs(directory, exist_ok=True)

In [44]:
# Columns removed from the datasets before they are saved
COLS_TO_DROP = [
    'suburb',
    'year',
    'postcode',
    'address',
    'latitude',
    'longitude',
    'SA2_CODE21',
    'SA2_NAME21',
    'GCC_NAME21',
    'median_weekly_rent',
]

In [45]:
# Save corresponding suburbs and years as separate dataframes, since both
# columns are about to be dropped from the main frames
identifier_cols = ['suburb', 'year']

gm_train_identifiers = gm_with_external_df[identifier_cols]
rv_train_identifiers = rv_with_external_df[identifier_cols]
gm_predict_identifiers = gm_template_external_df[identifier_cols]
rv_predict_identifiers = rv_template_external_df[identifier_cols]

# Training identifiers go to out_dir, prediction identifiers to template_out_dir
identifier_outputs = [
    (gm_train_identifiers, f"{out_dir}gm_train_identifiers.csv"),
    (rv_train_identifiers, f"{out_dir}rv_train_identifiers.csv"),
    (gm_predict_identifiers, f"{template_out_dir}gm_predict_identifiers.csv"),
    (rv_predict_identifiers, f"{template_out_dir}rv_predict_identifiers.csv"),
]
for identifiers, csv_path in identifier_outputs:
    identifiers.to_csv(csv_path, index=False)

# Drop Columns (in place, so the existing variable names keep referring to the
# reduced frames)
for frame in (gm_with_external_df, rv_with_external_df,
              gm_template_external_df, rv_template_external_df):
    frame.drop(columns=COLS_TO_DROP, inplace=True)

In [46]:
# Save Data to CSV: training sets into out_dir, forecast templates into
# template_out_dir, all without the row index
final_outputs = [
    (gm_with_external_df, f"{out_dir}greater_melbourne_train.csv"),
    (rv_with_external_df, f"{out_dir}rest_of_vic_train.csv"),
    (gm_template_external_df, f"{template_out_dir}greater_melbourne_predict.csv"),
    (rv_template_external_df, f"{template_out_dir}rest_of_vic_predict.csv"),
]
for frame, csv_path in final_outputs:
    frame.to_csv(csv_path, index=False)

### Data Visualisation

1. Top 10 most expensive rental properties

In [6]:
# Get the top 10 most expensive places (largest 'weekly_cost' first)
top_10_expensive = domain_data_df.nlargest(10, 'weekly_cost')
# head() prints only the first five of the ten rows
print(top_10_expensive.head())

                                                                                               name  \
https://www.domain.com.au/110-beevers-street-fo...           110 Beevers Street, Footscray VIC 3011   
https://www.domain.com.au/6501-35-queensbridge-...  6501/35 Queensbridge Street, Southbank VIC 3006   
https://www.domain.com.au/7-jeffcott-street-wes...       7 Jeffcott Street, West Melbourne VIC 3003   
https://www.domain.com.au/5604-1-queensbridge-s...   5604/1 Queensbridge Square, Southbank VIC 3006   
https://www.domain.com.au/28a-300-point-cook-ro...     28A/300 Point Cook Road, Point Cook VIC 3030   

                                                     cost_text  \
https://www.domain.com.au/110-beevers-street-fo...  $95,000.00   
https://www.domain.com.au/6501-35-queensbridge-...  $12,500 pw   
https://www.domain.com.au/7-jeffcott-street-wes...   $9,999 pw   
https://www.domain.com.au/5604-1-queensbridge-s...   $5,750.00   
https://www.domain.com.au/28a-300-point-cook-ro... 

In [7]:
# Create the base map centered on Victoria, Australia
m = folium.Map(
    location=[-37.4713, 144.7852],  # Coordinates for Victoria, Australia
    tiles="cartodb positron",
    zoom_start=7,
    zoom_control=False,
    width=475,
    height=500,
)

# Add one red house marker per listing in the top 10; the popup shows the
# weekly cost with thousands separators
for _, listing in top_10_expensive.iterrows():
    coords = listing['coordinates']  # first item is latitude, second longitude
    marker = folium.Marker(
        location=[float(coords[0]), float(coords[1])],
        popup=f"Cost: ${listing['weekly_cost']:,}",
        icon=folium.Icon(icon='home', color='red'),
    )
    marker.add_to(m)

# Display the map
m
