# Preprocessing Rental Data

In this section we will proceed with some cleaning and preprocessing of the *domain.com* rental data.

### Importing Libraries and Functions

In [1]:
import pandas as pd
import json
import folium
import re
import geopandas as gpd
import sys 
import os
sys.path.append(os.path.abspath(".."))
from scripts.preproccessing import extract_weekly_cost, extract_house_details, check_empty_or_zero, combine_SA2
import pyarrow



### Reading in the Rental Data

In [2]:
# Load the raw scraped listing metadata (one record per listing URL).
# JSON is UTF-8 by specification, so the encoding is stated explicitly rather than
# relying on the platform default.
with open('../data/landing/all_properties_metadata.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Preview just the first two listings; printing the whole dictionary floods the
# notebook output with every scraped record.
dict(list(data.items())[:2])

{'https://www.domain.com.au/7b-norland-street-cheltenham-vic-3192-17139649': {'name': '7B Norland Street, Cheltenham VIC 3192', 'cost_text': '$700 weekly', 'rooms': ['3 Beds', '2 Baths'], 'parking': ['1 Parking'], 'desc': '* Unverified feature', 'property_type': 'Townhouse', 'date_available': 'Available Now', 'bond': '$3042', 'property_features': ['Built in wardrobes*'], 'coordinates': ['-37.9683452', '145.0688744']}, 'https://www.domain.com.au/1008-915-collins-street-docklands-vic-3008-17199836': {'name': '1008/915 Collins Street, Docklands VIC 3008', 'cost_text': '$900 per week', 'rooms': ['2 Beds', '2 Baths'], 'parking': ['1 Parking'], 'desc': '* Unverified feature', 'property_type': 'Apartment / Unit / Flat', 'date_available': 'Available Now', 'bond': '$3911', 'property_features': ['Internal Laundry*', 'Ensuite', 'Built in wardrobes', 'Gym', 'Ducted Heating', 'Ducted Cooling', 'Intercom', 'Swimming Pool', 'Secure Parking', 'Balcony'], 'coordinates': ['-37.8206745', '144.9401081']},

### Feature Engineering

1. EXTRACTING THE WEEKLY RATES FROM COST TEXT

In [3]:
# Build a copy of every listing record with a parsed 'weekly_cost' field added
# (overwriting it if already present). Listings without a 'cost_text' entry are
# parsed from an empty string.
weekly_costs = {
    url: {**listing, 'weekly_cost': extract_weekly_cost(listing.get('cost_text', ''))}
    for url, listing in data.items()
}

In [4]:
# One row per listing (indexed by the dictionary key), keeping only listings whose
# weekly cost could be extracted, with that cost stored as a numeric column
domain_data_df = (
    pd.DataFrame.from_dict(weekly_costs, orient='index')
    .dropna(subset=['weekly_cost'])
    .assign(weekly_cost=lambda listings: pd.to_numeric(listings['weekly_cost']))
)



Some preparation for merging with additional house data

In [5]:
# PROCESSING AND SAVING DATA

# Extract house details for merge
processed_domain_data = extract_house_details(domain_data_df)

# Drop rows that `check_empty_or_zero` flags (coordinates list empty or containing '0').
# The explicit .copy() makes the result an independent frame rather than a slice of the
# unfiltered one, so later code that adds columns to it does not hit pandas'
# "value is trying to be set on a copy of a slice" warning.
has_bad_coordinates = processed_domain_data['coordinates'].apply(check_empty_or_zero)
processed_domain_data = processed_domain_data[~has_bad_coordinates].copy()

# Save to parquet (index=False: the row index is not written to the file) and view
processed_domain_data.to_parquet("../data/raw/all_domain_properties.parquet", index=False)
processed_domain_data.head()

Unnamed: 0,name,rooms,parking,property_type,date_available,bond,coordinates,weekly_cost,address,suburb,postcode
https://www.domain.com.au/7b-norland-street-cheltenham-vic-3192-17139649,"7B Norland Street, Cheltenham VIC 3192","[3 Beds, 2 Baths]",[1 Parking],Townhouse,09/24,$3042,"[-37.9683452, 145.0688744]",700.0,7b norland street,cheltenham,3192
https://www.domain.com.au/1008-915-collins-street-docklands-vic-3008-17199836,"1008/915 Collins Street, Docklands VIC 3008","[2 Beds, 2 Baths]",[1 Parking],Apartment / Unit / Flat,09/24,$3911,"[-37.8206745, 144.9401081]",900.0,1008/915 collins street,docklands,3008
https://www.domain.com.au/1009-555-swanston-street-carlton-vic-3053-17172492,"1009/555 Swanston Street, Carlton VIC 3053","[1 Bed, 1 Bath]",[− Parking],Apartment / Unit / Flat,09/24,$1955,"[-37.8057745, 144.9626168]",450.0,1009/555 swanston street,carlton,3053
https://www.domain.com.au/2-decore-drive-south-morang-vic-3752-12868271,"2 Decore Drive, South Morang VIC 3752","[3 Beds, 1 Bath]",[2 Parking],House,09/24,$2086,"[-37.6334274, 145.0684827]",480.0,2 decore drive,south morang,3752
https://www.domain.com.au/4-11-lane-road-ferntree-gully-vic-3156-17181626,"4/11 Lane Road, Ferntree Gully VIC 3156","[2 Beds, 1 Bath]",[1 Parking],Apartment / Unit / Flat,09/24,$2390,"[-37.8776374, 145.2921217]",550.0,4/11 lane road,ferntree gully,3156


## EXPERIMENTING WITH SOME VISUALISATIONS:

In [17]:
# Select the ten listings with the highest weekly cost
top_10_expensive = domain_data_df.nlargest(10, 'weekly_cost')

# Preview as the cell's last expression so the notebook renders a table
# instead of a plain-text print of the frame
top_10_expensive.head()

                                                    weekly_cost   cost_text  \
https://www.domain.com.au/110-beevers-street-fo...      95000.0  $95,000.00   
https://www.domain.com.au/6501-35-queensbridge-...      12500.0  $12,500 pw   
https://www.domain.com.au/7-jeffcott-street-wes...       9999.0   $9,999 pw   
https://www.domain.com.au/5604-1-queensbridge-s...       5750.0   $5,750.00   
https://www.domain.com.au/28a-300-point-cook-ro...       5000.0        5000   

                                                                   coordinates  
https://www.domain.com.au/110-beevers-street-fo...  [-37.7958662, 144.9059775]  
https://www.domain.com.au/6501-35-queensbridge-...  [-37.8228837, 144.9612147]  
https://www.domain.com.au/7-jeffcott-street-wes...  [-37.8116713, 144.9532276]  
https://www.domain.com.au/5604-1-queensbridge-s...   [-37.821734, 144.9621361]  
https://www.domain.com.au/28a-300-point-cook-ro...  [-37.8953338, 144.7526675]  


### Data Visualisation

1. Top 10 most expensive rental properties

In [18]:
# Base map centred on Victoria, Australia
m = folium.Map(
    location=[-37.4713, 144.7852],
    tiles="cartodb positron",
    zoom_start=7,
    zoom_control=False,
    width=475,
    height=500,
)

# One red house marker per listing in the top 10, with its weekly cost in the popup.
# Coordinates are stored as [latitude, longitude] strings, hence the float conversion.
for coords, rent in zip(top_10_expensive['coordinates'], top_10_expensive['weekly_cost']):
    marker = folium.Marker(
        location=[float(coords[0]), float(coords[1])],
        popup=f"Cost: ${rent:,}",
        icon=folium.Icon(icon='home', color='red'),
    )
    marker.add_to(m)

# Display the map
m


In [2]:
# Now let's connect each house to its corresponding SA2

In [6]:
# Attach SA2 region attributes to each listing via the project helper `combine_SA2`,
# using the 'coordinates' column as the location source.
# NOTE(review): the warnings captured from this cell show the helper building
# Point(longitude, latitude) geometries and running a left `gpd.sjoin(..., op='within')`,
# with a CRS mismatch (left EPSG:4326 vs right EPSG:7844) and a SettingWithCopyWarning
# raised inside the helper; both look worth resolving in scripts/preproccessing. Confirm.
gdf_joined = combine_SA2(processed_domain_data, 'coordinates')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['point'] = df[column].apply(lambda x: Point(x[1], x[0]))  # Point(longitude, latitude)
  exec(code_obj, self.user_global_ns, self.user_ns)
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: EPSG:7844

  gdf_joined = gpd.sjoin(gdf_points, sf, how='left', op='within') # join our SA2 points with all listings


In [7]:
# Preview the first rows of the joined frame
gdf_joined.head()

Unnamed: 0,name,rooms,parking,property_type,date_available,bond,weekly_cost,address,suburb,postcode,point,SA2_CODE21,SA2_NAME21,GCC_NAME21,AREASQKM21
https://www.domain.com.au/7b-norland-street-cheltenham-vic-3192-17139649,"7B Norland Street, Cheltenham VIC 3192","[3 Beds, 2 Baths]",[1 Parking],Townhouse,09/24,$3042,700.0,7b norland street,cheltenham,3192,POINT (145.06887 -37.96835),208031188,Highett (East) - Cheltenham,Greater Melbourne,8.7376
https://www.domain.com.au/1008-915-collins-street-docklands-vic-3008-17199836,"1008/915 Collins Street, Docklands VIC 3008","[2 Beds, 2 Baths]",[1 Parking],Apartment / Unit / Flat,09/24,$3911,900.0,1008/915 collins street,docklands,3008,POINT (144.94011 -37.82067),206041118,Docklands,Greater Melbourne,2.444
https://www.domain.com.au/1009-555-swanston-street-carlton-vic-3053-17172492,"1009/555 Swanston Street, Carlton VIC 3053","[1 Bed, 1 Bath]",[− Parking],Apartment / Unit / Flat,09/24,$1955,450.0,1009/555 swanston street,carlton,3053,POINT (144.96262 -37.80577),206041117,Carlton,Greater Melbourne,1.8187
https://www.domain.com.au/2-decore-drive-south-morang-vic-3752-12868271,"2 Decore Drive, South Morang VIC 3752","[3 Beds, 1 Bath]",[2 Parking],House,09/24,$2086,480.0,2 decore drive,south morang,3752,POINT (145.06848 -37.63343),209041436,South Morang - South,Greater Melbourne,7.7445
https://www.domain.com.au/4-11-lane-road-ferntree-gully-vic-3156-17181626,"4/11 Lane Road, Ferntree Gully VIC 3156","[2 Beds, 1 Bath]",[1 Parking],Apartment / Unit / Flat,09/24,$2390,550.0,4/11 lane road,ferntree gully,3156,POINT (145.29212 -37.87764),211011447,Ferntree Gully - North,Greater Melbourne,6.9851


In [34]:
# Now let's visualise which suburbs are the most expensive

In [8]:
# Per-SA2 totals used to derive the average cost:
#   weekly_cost  -> sum of weekly costs of the listings in the SA2
#   total_houses -> number of sampled listings in the SA2
sa2_groups = gdf_joined[['weekly_cost', 'SA2_NAME21']].groupby('SA2_NAME21')

ave_cost_SA2 = sa2_groups.agg({'weekly_cost': 'sum', 'SA2_NAME21': 'count'})
ave_cost_SA2 = ave_cost_SA2.rename(columns={'SA2_NAME21': 'total_houses'})


In [9]:
# Average weekly cost per SA2 = summed weekly cost / number of listings,
# then move the SA2 name out of the index and rank from most to least expensive
ave_cost_SA2 = (
    ave_cost_SA2
    .assign(ave_cost=lambda totals: totals['weekly_cost'] / totals['total_houses'])
    .reset_index()
    .sort_values('ave_cost', ascending=False)
)

ave_cost_SA2.head()

Unnamed: 0,SA2_NAME21,weekly_cost,total_houses,ave_cost
345,Panton Hill - St Andrews,1950.0,1,1950.0
54,Bright - Mount Beauty,18338.461538,13,1410.650888
350,Pearcedale - Tooradin,1400.0,1,1400.0
55,Brighton (Vic.),77265.0,58,1332.155172
176,Footscray,167814.0,138,1216.043478


In [10]:
# Serialise the SA2 names and boundary geometries to GeoJSON for the choropleth map.
# NOTE(review): `sf` is never assigned in this notebook; the captured run of this cell
# raised NameError. It looks like the SA2 boundary GeoDataFrame that is local to
# `combine_SA2`, so it must be loaded here (or returned by the helper) first. Confirm.
geoJSON = sf[['SA2_NAME21', 'geometry']].to_json()

NameError: name 'sf' is not defined

In [11]:
# Base map centred on Victoria, Australia
m = folium.Map(
    location=[-37.4713, 144.7852],
    tiles="cartodb positron",
    zoom_start=7,
    zoom_control=False,
    width=475,
    height=500,
)

# Shade each SA2 polygon by its average weekly cost. Data rows are bound to polygons
# through the SA2_NAME21 property of the GeoJSON; NaN values are filled black.
choropleth_options = dict(
    geo_data=geoJSON,
    name='choropleth',
    data=ave_cost_SA2.reset_index(),
    columns=['SA2_NAME21', 'ave_cost'],
    key_on='properties.SA2_NAME21',
    fill_color='YlOrRd',
    nan_fill_color='black',
    legend_name='Average Cost (AUD) Per Week',
)
c = folium.Choropleth(**choropleth_options)
c.add_to(m)

# Display the map
m

NameError: name 'geoJSON' is not defined

In [None]:
## Population Data

In [82]:
# Load the population GeoPackage into a GeoDataFrame
population_gpkg = "../data/population/population_extracted/32180_ERP_2023_SA2_GDA2020.gpkg"
population_gdf = gpd.read_file(population_gpkg)


In [83]:
# Keep Victorian rows only, then narrow to the SA2 name plus one ERP
# (estimated residential population) column per year from 2001 to 2023
is_victoria = population_gdf['State_name_2021'] == 'Victoria'
erp_columns = [f'ERP_{year}' for year in range(2001, 2024)]

population_gdf = population_gdf[is_victoria]
population_gdf = population_gdf[['SA2_name_2021'] + erp_columns]

In [84]:
# Inspect the filtered population table
population_gdf

Unnamed: 0,SA2_name_2021,ERP_2001,ERP_2002,ERP_2003,ERP_2004,ERP_2005,ERP_2006,ERP_2007,ERP_2008,ERP_2009,...,ERP_2014,ERP_2015,ERP_2016,ERP_2017,ERP_2018,ERP_2019,ERP_2020,ERP_2021,ERP_2022,ERP_2023
642,Alfredton,5756.0,6092.0,6293.0,6480.0,6648.0,6761.0,7034.0,7272.0,7614.0,...,10338.0,11039.0,11852,12649,13537,14434,15507,16841,18002,18997
643,Ballarat,11497.0,11708.0,12015.0,12189.0,12269.0,12356.0,12408.0,12480.0,12476.0,...,12327.0,12300.0,12301,12266,12244,12320,12196,12071,11938,11809
644,Buninyong,5320.0,5399.0,5557.0,5620.0,5857.0,6037.0,6131.0,6252.0,6431.0,...,7082.0,7191.0,7311,7409,7418,7458,7377,7229,7247,7323
645,Delacombe,4154.0,4225.0,4371.0,4465.0,4704.0,5041.0,5206.0,5349.0,5557.0,...,6583.0,6846.0,7195,7622,8183,8890,9755,10648,11798,12869
646,Smythes Creek,3317.0,3378.0,3411.0,3473.0,3508.0,3542.0,3594.0,3658.0,3714.0,...,3945.0,3966.0,3990,4004,4042,4112,4152,4211,4223,4268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159,Otway,3452.0,3479.0,3511.0,3511.0,3492.0,3459.0,3489.0,3501.0,3490.0,...,3519.0,3538.0,3556,3635,3710,3802,3911,3979,3974,3983
1160,Moyne - East,6718.0,6704.0,6676.0,6643.0,6638.0,6652.0,6606.0,6631.0,6703.0,...,6734.0,6716.0,6709,6717,6746,6798,6883,6990,7046,7132
1161,Moyne - West,8317.0,8387.0,8450.0,8487.0,8517.0,8601.0,8694.0,8792.0,8878.0,...,9383.0,9467.0,9603,9686,9783,9845,9859,9967,10098,10148
1162,Warrnambool - North,17053.0,17449.0,17726.0,17937.0,18172.0,18528.0,18877.0,19107.0,19369.0,...,20930.0,21217.0,21442,21688,21954,22184,22416,22470,22586,22762


In [85]:
# Re-wrap the population table as a plain pandas DataFrame for the later join
population_df = pd.DataFrame(population_gdf)

In [None]:
## Homelessness

In [87]:
# Requires openpyxl (install with conda or pip)
# NOTE(review): the captured run of this cell failed with FileNotFoundError for this path
homelessness_xlsx = '../data/homelessness/homelessness.xlsx'
homelessness_df = pd.read_excel(homelessness_xlsx, sheet_name='Table_5.3')

FileNotFoundError: [Errno 2] No such file or directory: '../data/homelessness/homelessness.xlsx'

In [25]:
# Locate the row(s) whose 'Unnamed: 1' cell reads 'Capital Region' in the raw sheet
in_capital_region = homelessness_df['Unnamed: 1'].eq('Capital Region')
homelessness_df[in_capital_region]

Unnamed: 0,Australian Bureau of Statistics,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
8,,Capital Region,,,757


In [44]:
# Restrict to rows labelled 775-1375 of the raw sheet, keep those with a value in
# 'Unnamed: 3', and retain only the last two columns under readable names.
# NOTE(review): presumably this row range is the Victorian SA2 block; confirm against the workbook
sheet_rows = homelessness_df.loc[775:1375]
has_sa2_entry = sheet_rows['Unnamed: 3'].notna()

homelessness_df = sheet_rows[has_sa2_entry].iloc[:, -2:]
homelessness_df.columns = ['SA2_name_2021', 'all_homeless_persons_2021']

In [45]:
# Inspect the cleaned homelessness table
homelessness_df

Unnamed: 0,SA2_name_2021,all_homeless_persons_2021
775,Buninyong,43
776,Delacombe,43
777,Smythes Creek,5
778,Wendouree - Miners Rest,93
779,Ballarat East - Warrenheip,148
...,...,...
1370,Otway,27
1372,Moyne - East,25
1373,Moyne - West,6
1374,Warrnambool - North,95


In [46]:
# socioeconomic

In [47]:
# Requires openpyxl (install with conda or pip)
socioeconomic_xlsx = '../data/socioeconomic/socioeconomic.xlsx'
socioeconomic_df = pd.read_excel(socioeconomic_xlsx, sheet_name='Table 1')

In [51]:
# Restrict to rows labelled 634-1149 of the raw sheet, keep the columns at
# positions 1, 4 and 5, and give them readable names.
# NOTE(review): presumably this row range is the Victorian SA2 block; confirm against the workbook
sheet_rows = socioeconomic_df.loc[634:1149]

socioeconomic_df = sheet_rows.iloc[:, [1, 4, 5]]
socioeconomic_df.columns = [
    'SA2_name_2021',
    'Index of Relative Socio-economic Advantage and Disadvantage',
    'Decile',
]

In [52]:
# Inspect the cleaned socio-economic table
socioeconomic_df

Unnamed: 0,SA2_name_2021,Index of Relative Socio-economic Advantage and Disadvantage,Decile
634,Alfredton,1011,6
635,Ballarat,1040,7
636,Buninyong,1040,7
637,Delacombe,947,3
638,Smythes Creek,1005,6
...,...,...,...
1145,Otway,977,5
1146,Moyne - East,986,5
1147,Moyne - West,1005,6
1148,Warrnambool - North,956,4


In [17]:
## inflation

In [18]:
# Let's extract the housing CPI index, as well as the overall CPI with housing removed as a measure of all other inflation.
# Keeping the two series separate is best for analysis, as it keeps both variables as independent as possible.

In [75]:
# Sheet 'Data1' of the inflation workbook (the housing index column is selected from it later)
inflation_xlsx = '../data/inflation/inflation.xlsx'
housing_cpi_df = pd.read_excel(inflation_xlsx, sheet_name='Data1')

In [21]:
# Sheet 'Data2' of the inflation workbook (the CPI-excluding-housing column is selected from it later)
inflation_xlsx = '../data/inflation/inflation.xlsx'
excluding_housing_cpi_df = pd.read_excel(inflation_xlsx, sheet_name='Data2')

In [76]:
# Keep the first sheet column ('Unnamed: 0') and the Melbourne housing index series
housing_columns = ['Unnamed: 0', 'Index Numbers ;  Housing ;  Melbourne ;']
housing_cpi_df = housing_cpi_df[housing_columns]

In [28]:
# Keep the first sheet column ('Unnamed: 0') and the Melbourne all-groups-excluding-housing series
excluding_housing_columns = [
    'Unnamed: 0',
    'Index Numbers ;  All groups CPI excluding Housing ;  Melbourne ;',
]
excluding_housing_cpi_df = excluding_housing_cpi_df[excluding_housing_columns]

In [77]:
# Give the first sheet column a meaningful name
housing_cpi_df = housing_cpi_df.rename({'Unnamed: 0': "quarter"}, axis='columns')

In [78]:
# Place the two CPI series side by side (rows are aligned on the shared row index),
# drop the leftover 'Unnamed: 0' column, discard incomplete rows, skip the first
# nine remaining rows, and shorten the series names.
# NOTE(review): the nine skipped rows are presumably sheet metadata; confirm against the workbook
short_names = {
    'Index Numbers ;  Housing ;  Melbourne ;': "housing_index",
    'Index Numbers ;  All groups CPI excluding Housing ;  Melbourne ;': 'CPI_without_housing',
}

inflation_df = pd.concat([housing_cpi_df, excluding_housing_cpi_df], axis=1)
inflation_df = inflation_df.drop(columns='Unnamed: 0')
inflation_df = inflation_df.dropna()
inflation_df = inflation_df.iloc[9:]
inflation_df = inflation_df.rename(columns=short_names)

In [79]:
# Inspect the combined inflation table
inflation_df

Unnamed: 0,quarter,housing_index,CPI_without_housing
105,1972-09-01 00:00:00,11.2,11.3
106,1972-12-01 00:00:00,11.3,11.5
107,1973-03-01 00:00:00,11.5,11.7
108,1973-06-01 00:00:00,11.7,12.1
109,1973-09-01 00:00:00,12,12.6
...,...,...,...
308,2023-06-01 00:00:00,144,130.6
309,2023-09-01 00:00:00,147.5,131.8
310,2023-12-01 00:00:00,148.2,132.7
311,2024-03-01 00:00:00,149.3,134.1


In [80]:
# Now let's combine our external datasets

In [86]:
# Combine the SA2-level external datasets into one table keyed on the SA2 name.
# `DataFrame.join` does not accept `on=` when given a list of frames, and the other
# frames carry the SA2 name as an ordinary column rather than as their index, so the
# combination is done with chained left merges on the shared 'SA2_name_2021' column.
# Left merges keep every population row, matching the default behaviour of `join`;
# SA2s missing from a dataset get NaN in that dataset's columns.
external_df = (
    population_df
    .merge(socioeconomic_df, on='SA2_name_2021', how='left')
    .merge(homelessness_df, on='SA2_name_2021', how='left')
)

NameError: name 'socioeconomic_df' is not defined