In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:

# read data about all properties from Domain website
df=pd.read_csv('../data/curated/property_full.csv')

# remove feature wouldn't be useful towards model training
df=df.drop(columns=['Unnamed: 0','Unnamed: 0_x','Unnamed: 0.1','name','coordinates','closest_school_name','closest_school_coor',
'closest_sc_name','closest_sc_coor','closest_station_coor','2001population','2002population', '2003population', 
'2004population', '2005population', '2006population', '2007population', '2008population', '2009population',
'2010population', '2011population', '2012population', '2013population','2014population', '2015population', 
'2016population', '2017population', '2018population', '2019population', '2020population',
'2014-15 MEDIAN EMPLOYMENT INCOME PER JOB', '2015-16 MEDIAN EMPLOYMENT INCOME PER JOB',
'2016-17 MEDIAN EMPLOYMENT INCOME PER JOB', '2017-18 MEDIAN EMPLOYMENT INCOME PER JOB',
'2011_unemploy_rate', '2012_unemploy_rate', '2013_unemploy_rate', '2014_unemploy_rate',
'2015_unemploy_rate', '2016_unemploy_rate', '2017_unemploy_rate', '2018_unemploy_rate', '2019_unemploy_rate', 
'2020_unemploy_rate', '2014-15 #jobs*000', '2015-16 #jobs*000', '2016-17 #jobs*000', '2017-18 #jobs*000',])

# change all numeric feature to numeric data type
cols = ['cost_text', 'rooms', 'baths', 'parking', 'driving_time_to_school',
       'distance_to_school', 'driving_time_to_CBD', 'distance_to_CBD',
       'driving_time_to_sc', 'distance_to_sc', 'driving_time_to_station',
       'distance_to_station', 'SA2_code', '2021population',
       '2011-2021no', '2011-2021%', 'Area', 'Population density 2021',
       '2018-19 #jobs*000', '2018-19 MEDIAN EMPLOYMENT INCOME PER JOB',
       '2021_unemploy_rate']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

# rename the features
df = df.rename(columns={'2021population':'population_21','2011-2021no':'num_increase',
'2011-2021%':'percent_increase','Population density 2021':'pop_dens_21','2018-19 #jobs*000':'num_job_19',
'2018-19 MEDIAN EMPLOYMENT INCOME PER JOB':'income_rate_19','2021_unemploy_rate':'unemployment_rate_21'})

#calculating the lower and upper limit for mean +- 1.5IQR for rent price
lower = df["cost_text"].quantile(0.25) - 1.5*(df["cost_text"].quantile(0.75)-df["cost_text"].quantile(0.25))
upper = df["cost_text"].quantile(0.75) + 1.5*(df["cost_text"].quantile(0.75)-df["cost_text"].quantile(0.25))
#remove instance which has trip time outside the 1.5IQR range from mean
df = df[(df["cost_text"]>lower) & (df["cost_text"]<upper)]
df = df.dropna()

df


Unnamed: 0,cost_text,rooms,baths,parking,driving_time_to_school,distance_to_school,driving_time_to_CBD,distance_to_CBD,driving_time_to_sc,distance_to_sc,...,SA2_name,SA2_code,population_21,num_increase,percent_increase,Area,pop_dens_21,num_job_19,income_rate_19,unemployment_rate_21
49,800.0,2.0,2.0,0.0,176.84,1249.71,404.75,3475.62,196.15,1459.27,...,Docklands,206041118.0,15942.0,9746.0,157.3,2.4,6522.9,16.759,23164.0,3.80
62,780.0,2.0,2.0,1.0,36.89,308.35,617.21,5766.55,165.09,1617.60,...,Albert Park,206051128.0,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33
79,750.0,2.0,2.0,2.0,67.66,497.75,584.29,4783.38,195.87,1807.00,...,Albert Park,206051128.0,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33
93,700.0,2.0,2.0,1.0,85.37,638.52,688.69,6086.00,102.52,813.65,...,Carlton,206041117.0,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35
119,670.0,2.0,2.0,1.0,79.55,517.57,639.61,5879.70,91.10,675.30,...,Carlton,206041117.0,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14762,420.0,2.0,1.0,0.0,132.33,1041.53,7441.26,155197.00,832.96,14089.57,...,Wonthaggi - Inverloch,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30
14763,400.0,2.0,1.0,1.0,54.61,618.56,7373.15,154441.34,763.54,13273.38,...,Wonthaggi - Inverloch,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30
14764,400.0,3.0,1.0,2.0,50.78,375.82,7405.83,154777.39,800.09,13675.76,...,Wonthaggi - Inverloch,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30
14765,400.0,3.0,1.0,1.0,239.71,2671.90,7485.04,155043.69,873.94,13940.19,...,Wonthaggi - Inverloch,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30


In [3]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import train_test_split

In [4]:
SA2_name=df['SA2_name']

In [5]:
df1=df.drop(columns='SA2_name')
df1=df1.dropna()
df1=df1.reset_index()
df1=df1.drop(columns=['index'])
df1

Unnamed: 0,cost_text,rooms,baths,parking,driving_time_to_school,distance_to_school,driving_time_to_CBD,distance_to_CBD,driving_time_to_sc,distance_to_sc,...,distance_to_station,SA2_code,population_21,num_increase,percent_increase,Area,pop_dens_21,num_job_19,income_rate_19,unemployment_rate_21
0,800.0,2.0,2.0,0.0,176.84,1249.71,404.75,3475.62,196.15,1459.27,...,472.01,206041118.0,15942.0,9746.0,157.3,2.4,6522.9,16.759,23164.0,3.80
1,780.0,2.0,2.0,1.0,36.89,308.35,617.21,5766.55,165.09,1617.60,...,1769.16,206051128.0,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33
2,750.0,2.0,2.0,2.0,67.66,497.75,584.29,4783.38,195.87,1807.00,...,1686.95,206051128.0,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33
3,700.0,2.0,2.0,1.0,85.37,638.52,688.69,6086.00,102.52,813.65,...,2145.33,206041117.0,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35
4,670.0,2.0,2.0,1.0,79.55,517.57,639.61,5879.70,91.10,675.30,...,2024.38,206041117.0,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9513,420.0,2.0,1.0,0.0,132.33,1041.53,7441.26,155197.00,832.96,14089.57,...,14121.79,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30
9514,400.0,2.0,1.0,1.0,54.61,618.56,7373.15,154441.34,763.54,13273.38,...,13305.61,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30
9515,400.0,3.0,1.0,2.0,50.78,375.82,7405.83,154777.39,800.09,13675.76,...,13707.98,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30
9516,400.0,3.0,1.0,1.0,239.71,2671.90,7485.04,155043.69,873.94,13940.19,...,13972.42,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30


In [6]:
df1.columns

Index(['cost_text', 'rooms', 'baths', 'parking', 'driving_time_to_school',
       'distance_to_school', 'driving_time_to_CBD', 'distance_to_CBD',
       'driving_time_to_sc', 'distance_to_sc', 'driving_time_to_station',
       'distance_to_station', 'SA2_code', 'population_21', 'num_increase',
       'percent_increase', 'Area', 'pop_dens_21', 'num_job_19',
       'income_rate_19', 'unemployment_rate_21'],
      dtype='object')

In [7]:
X=df1[['rooms', 'baths', 'parking', 'driving_time_to_school',
       'distance_to_school', 'driving_time_to_CBD', 'distance_to_CBD',
       'driving_time_to_sc', 'distance_to_sc', 'driving_time_to_station',
       'distance_to_station', 'SA2_code', 'population_21', 'num_increase',
       'percent_increase', 'Area', 'pop_dens_21', 'num_job_19',
       'income_rate_19', 'unemployment_rate_21']]
y=df1['cost_text']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
reg_rf=RandomForestRegressor(verbose=0,n_jobs=8,random_state=42)

In [9]:
reg_rf.fit(X_train,y_train)
y_predict_rf = reg_rf.predict(X_test)

In [10]:
from sklearn import metrics
import numpy as np
print("Randomforest Evaluation")
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predict_rf))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predict_rf))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_predict_rf)))
print('R-square:', r2_score(y_test,y_predict_rf))

Randomforest Evaluation
Mean Absolute Error: 53.28775468680689
Mean Squared Error: 5371.339922974662
Root Mean Squared Error: 73.28942572414292
R-square: 0.6745710904929996


In [11]:
#predict the 2023 price
y_predict_rf_reg=reg_rf.predict(X)
y_predict_rf_reg

array([639.30333333, 716.25      , 723.55555556, ..., 402.35      ,
       353.05      , 406.2       ])

In [12]:
df1['2023']=y_predict_rf_reg
df1

Unnamed: 0,cost_text,rooms,baths,parking,driving_time_to_school,distance_to_school,driving_time_to_CBD,distance_to_CBD,driving_time_to_sc,distance_to_sc,...,SA2_code,population_21,num_increase,percent_increase,Area,pop_dens_21,num_job_19,income_rate_19,unemployment_rate_21,2023
0,800.0,2.0,2.0,0.0,176.84,1249.71,404.75,3475.62,196.15,1459.27,...,206041118.0,15942.0,9746.0,157.3,2.4,6522.9,16.759,23164.0,3.80,639.303333
1,780.0,2.0,2.0,1.0,36.89,308.35,617.21,5766.55,165.09,1617.60,...,206051128.0,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,716.250000
2,750.0,2.0,2.0,2.0,67.66,497.75,584.29,4783.38,195.87,1807.00,...,206051128.0,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,723.555556
3,700.0,2.0,2.0,1.0,85.37,638.52,688.69,6086.00,102.52,813.65,...,206041117.0,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,642.720000
4,670.0,2.0,2.0,1.0,79.55,517.57,639.61,5879.70,91.10,675.30,...,206041117.0,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,660.620000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9513,420.0,2.0,1.0,0.0,132.33,1041.53,7441.26,155197.00,832.96,14089.57,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,387.750000
9514,400.0,2.0,1.0,1.0,54.61,618.56,7373.15,154441.34,763.54,13273.38,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,368.150000
9515,400.0,3.0,1.0,2.0,50.78,375.82,7405.83,154777.39,800.09,13675.76,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,402.350000
9516,400.0,3.0,1.0,1.0,239.71,2671.90,7485.04,155043.69,873.94,13940.19,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,353.050000


In [13]:
df2=df1.drop(columns=['cost_text'])
df2

Unnamed: 0,rooms,baths,parking,driving_time_to_school,distance_to_school,driving_time_to_CBD,distance_to_CBD,driving_time_to_sc,distance_to_sc,driving_time_to_station,...,SA2_code,population_21,num_increase,percent_increase,Area,pop_dens_21,num_job_19,income_rate_19,unemployment_rate_21,2023
0,2.0,2.0,0.0,176.84,1249.71,404.75,3475.62,196.15,1459.27,82.14,...,206041118.0,15942.0,9746.0,157.3,2.4,6522.9,16.759,23164.0,3.80,639.303333
1,2.0,2.0,1.0,36.89,308.35,617.21,5766.55,165.09,1617.60,227.91,...,206051128.0,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,716.250000
2,2.0,2.0,2.0,67.66,497.75,584.29,4783.38,195.87,1807.00,230.04,...,206051128.0,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,723.555556
3,2.0,2.0,1.0,85.37,638.52,688.69,6086.00,102.52,813.65,254.97,...,206041117.0,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,642.720000
4,2.0,2.0,1.0,79.55,517.57,639.61,5879.70,91.10,675.30,249.16,...,206041117.0,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,660.620000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9513,2.0,1.0,0.0,132.33,1041.53,7441.26,155197.00,832.96,14089.57,845.36,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,387.750000
9514,2.0,1.0,1.0,54.61,618.56,7373.15,154441.34,763.54,13273.38,775.95,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,368.150000
9515,3.0,1.0,2.0,50.78,375.82,7405.83,154777.39,800.09,13675.76,812.49,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,402.350000
9516,3.0,1.0,1.0,239.71,2671.90,7485.04,155043.69,873.94,13940.19,886.35,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,353.050000


In [14]:
#train the data
X_2023=df2[['rooms', 'baths', 'parking', 'driving_time_to_school',
       'distance_to_school', 'driving_time_to_CBD', 'distance_to_CBD',
       'driving_time_to_sc', 'distance_to_sc', 'driving_time_to_station',
       'distance_to_station', 'SA2_code', 'population_21', 'num_increase',
       'percent_increase', 'Area', 'pop_dens_21', 'num_job_19',
       'income_rate_19', 'unemployment_rate_21']]
y_2023=df2['2023']
X_train, X_test, y_train, y_test = train_test_split(X_2023, y_2023, test_size=0.33, random_state=42)
reg_rf=RandomForestRegressor(verbose=0,n_jobs=8,random_state=42)
reg_rf.fit(X_2023,y_2023)

RandomForestRegressor(n_jobs=8, random_state=42)

In [15]:
#predict the 2024 price
y_2024=reg_rf.predict(X_2023)
y_2024

array([639.30333333, 675.44948532, 719.57925615, ..., 407.755     ,
       354.94      , 410.8695    ])

In [16]:
df2['2024']=y_2024
df2

Unnamed: 0,rooms,baths,parking,driving_time_to_school,distance_to_school,driving_time_to_CBD,distance_to_CBD,driving_time_to_sc,distance_to_sc,driving_time_to_station,...,population_21,num_increase,percent_increase,Area,pop_dens_21,num_job_19,income_rate_19,unemployment_rate_21,2023,2024
0,2.0,2.0,0.0,176.84,1249.71,404.75,3475.62,196.15,1459.27,82.14,...,15942.0,9746.0,157.3,2.4,6522.9,16.759,23164.0,3.80,639.303333,639.303333
1,2.0,2.0,1.0,36.89,308.35,617.21,5766.55,165.09,1617.60,227.91,...,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,716.250000,675.449485
2,2.0,2.0,2.0,67.66,497.75,584.29,4783.38,195.87,1807.00,230.04,...,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,723.555556,719.579256
3,2.0,2.0,1.0,85.37,638.52,688.69,6086.00,102.52,813.65,254.97,...,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,642.720000,642.156856
4,2.0,2.0,1.0,79.55,517.57,639.61,5879.70,91.10,675.30,249.16,...,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,660.620000,657.545031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9513,2.0,1.0,0.0,132.33,1041.53,7441.26,155197.00,832.96,14089.57,845.36,...,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,387.750000,377.031500
9514,2.0,1.0,1.0,54.61,618.56,7373.15,154441.34,763.54,13273.38,775.95,...,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,368.150000,366.377500
9515,3.0,1.0,2.0,50.78,375.82,7405.83,154777.39,800.09,13675.76,812.49,...,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,402.350000,407.755000
9516,3.0,1.0,1.0,239.71,2671.90,7485.04,155043.69,873.94,13940.19,886.35,...,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,353.050000,354.940000


In [17]:
df3=df2.drop(columns=['2023'])
df3

Unnamed: 0,rooms,baths,parking,driving_time_to_school,distance_to_school,driving_time_to_CBD,distance_to_CBD,driving_time_to_sc,distance_to_sc,driving_time_to_station,...,SA2_code,population_21,num_increase,percent_increase,Area,pop_dens_21,num_job_19,income_rate_19,unemployment_rate_21,2024
0,2.0,2.0,0.0,176.84,1249.71,404.75,3475.62,196.15,1459.27,82.14,...,206041118.0,15942.0,9746.0,157.3,2.4,6522.9,16.759,23164.0,3.80,639.303333
1,2.0,2.0,1.0,36.89,308.35,617.21,5766.55,165.09,1617.60,227.91,...,206051128.0,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,675.449485
2,2.0,2.0,2.0,67.66,497.75,584.29,4783.38,195.87,1807.00,230.04,...,206051128.0,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,719.579256
3,2.0,2.0,1.0,85.37,638.52,688.69,6086.00,102.52,813.65,254.97,...,206041117.0,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,642.156856
4,2.0,2.0,1.0,79.55,517.57,639.61,5879.70,91.10,675.30,249.16,...,206041117.0,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,657.545031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9513,2.0,1.0,0.0,132.33,1041.53,7441.26,155197.00,832.96,14089.57,845.36,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,377.031500
9514,2.0,1.0,1.0,54.61,618.56,7373.15,154441.34,763.54,13273.38,775.95,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,366.377500
9515,3.0,1.0,2.0,50.78,375.82,7405.83,154777.39,800.09,13675.76,812.49,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,407.755000
9516,3.0,1.0,1.0,239.71,2671.90,7485.04,155043.69,873.94,13940.19,886.35,...,205031093.0,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,354.940000


In [18]:
#train the data for predicting the data 
X_2024=df3[['rooms', 'baths', 'parking', 'driving_time_to_school',
       'distance_to_school', 'driving_time_to_CBD', 'distance_to_CBD',
       'driving_time_to_sc', 'distance_to_sc', 'driving_time_to_station',
       'distance_to_station', 'SA2_code', 'population_21', 'num_increase',
       'percent_increase', 'Area', 'pop_dens_21', 'num_job_19',
       'income_rate_19', 'unemployment_rate_21']]
y_2024=df3['2024']
X_train, X_test, y_train, y_test = train_test_split(X_2024, y_2024, test_size=0.33, random_state=42)
reg_rf=RandomForestRegressor(verbose=0,n_jobs=8,random_state=42)
reg_rf.fit(X_2024,y_2024)

RandomForestRegressor(n_jobs=8, random_state=42)

In [19]:
#predict the 2025 price
y_2025=reg_rf.predict(X_2023)
y_2025

array([639.30333333, 650.48209125, 717.71724824, ..., 410.004445  ,
       357.323951  , 411.847415  ])

In [20]:
df3['2025']=y_2025
df3

Unnamed: 0,rooms,baths,parking,driving_time_to_school,distance_to_school,driving_time_to_CBD,distance_to_CBD,driving_time_to_sc,distance_to_sc,driving_time_to_station,...,population_21,num_increase,percent_increase,Area,pop_dens_21,num_job_19,income_rate_19,unemployment_rate_21,2024,2025
0,2.0,2.0,0.0,176.84,1249.71,404.75,3475.62,196.15,1459.27,82.14,...,15942.0,9746.0,157.3,2.4,6522.9,16.759,23164.0,3.80,639.303333,639.303333
1,2.0,2.0,1.0,36.89,308.35,617.21,5766.55,165.09,1617.60,227.91,...,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,675.449485,650.482091
2,2.0,2.0,2.0,67.66,497.75,584.29,4783.38,195.87,1807.00,230.04,...,16023.0,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,719.579256,717.717248
3,2.0,2.0,1.0,85.37,638.52,688.69,6086.00,102.52,813.65,254.97,...,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,642.156856,640.408805
4,2.0,2.0,1.0,79.55,517.57,639.61,5879.70,91.10,675.30,249.16,...,17198.0,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,657.545031,655.441481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9513,2.0,1.0,0.0,132.33,1041.53,7441.26,155197.00,832.96,14089.57,845.36,...,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,377.031500,367.046279
9514,2.0,1.0,1.0,54.61,618.56,7373.15,154441.34,763.54,13273.38,775.95,...,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,366.377500,364.584062
9515,3.0,1.0,2.0,50.78,375.82,7405.83,154777.39,800.09,13675.76,812.49,...,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,407.755000,410.004445
9516,3.0,1.0,1.0,239.71,2671.90,7485.04,155043.69,873.94,13940.19,886.35,...,26723.0,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,354.940000,357.323951


In [21]:
df1['2024']=y_2024
df1['2025']=y_2025
df1

Unnamed: 0,cost_text,rooms,baths,parking,driving_time_to_school,distance_to_school,driving_time_to_CBD,distance_to_CBD,driving_time_to_sc,distance_to_sc,...,num_increase,percent_increase,Area,pop_dens_21,num_job_19,income_rate_19,unemployment_rate_21,2023,2024,2025
0,800.0,2.0,2.0,0.0,176.84,1249.71,404.75,3475.62,196.15,1459.27,...,9746.0,157.3,2.4,6522.9,16.759,23164.0,3.80,639.303333,639.303333,639.303333
1,780.0,2.0,2.0,1.0,36.89,308.35,617.21,5766.55,165.09,1617.60,...,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,716.250000,675.449485,650.482091
2,750.0,2.0,2.0,2.0,67.66,497.75,584.29,4783.38,195.87,1807.00,...,1014.0,6.8,4.7,3427.6,15.822,29427.5,3.33,723.555556,719.579256,717.717248
3,700.0,2.0,2.0,1.0,85.37,638.52,688.69,6086.00,102.52,813.65,...,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,642.720000,642.156856,640.408805
4,670.0,2.0,2.0,1.0,79.55,517.57,639.61,5879.70,91.10,675.30,...,2076.0,13.7,1.8,9456.2,16.224,12760.0,7.35,660.620000,657.545031,655.441481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9513,420.0,2.0,1.0,0.0,132.33,1041.53,7441.26,155197.00,832.96,14089.57,...,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,387.750000,377.031500,367.046279
9514,400.0,2.0,1.0,1.0,54.61,618.56,7373.15,154441.34,763.54,13273.38,...,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,368.150000,366.377500,364.584062
9515,400.0,3.0,1.0,2.0,50.78,375.82,7405.83,154777.39,800.09,13675.76,...,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,402.350000,407.755000,410.004445
9516,400.0,3.0,1.0,1.0,239.71,2671.90,7485.04,155043.69,873.94,13940.19,...,6327.0,31.0,728.8,36.7,16.428,21912.5,6.30,353.050000,354.940000,357.323951


In [22]:
df4=df1.groupby('SA2_code').mean()
df4.head()

Unnamed: 0_level_0,cost_text,rooms,baths,parking,driving_time_to_school,distance_to_school,driving_time_to_CBD,distance_to_CBD,driving_time_to_sc,distance_to_sc,...,num_increase,percent_increase,Area,pop_dens_21,num_job_19,income_rate_19,unemployment_rate_21,2023,2024,2025
SA2_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201011001.0,435.25641,3.564103,1.897436,2.0,163.580513,1428.693846,5427.199231,121144.55359,257.708205,2448.217949,...,8371.0,99.0,52.7,319.2,11.038,28005.0,3.0,434.268339,435.837926,436.833582
201011002.0,413.472222,2.944444,1.361111,1.277778,125.538611,1050.494444,5059.028333,114844.282778,208.058333,1893.676389,...,-289.0,-2.3,12.4,975.5,10.22,24587.0,4.25,409.569857,414.461089,417.814154
201011005.0,432.857143,3.571429,1.714286,1.285714,217.98,1646.151429,5374.818571,117215.971429,889.321429,10394.03,...,507.0,7.5,51.6,140.2,5.614,27272.0,2.77,439.878571,442.168983,443.553587
201011006.0,452.948718,3.897436,2.025641,2.153846,303.162308,2675.248205,5599.432821,119381.88641,300.210256,2393.878462,...,4802.0,82.3,34.2,311.5,6.568,27209.0,5.55,459.368497,458.016846,456.810066
201011008.0,373.695652,2.956522,1.434783,1.652174,146.567826,1462.468261,5001.787826,118067.323478,238.78,2259.183478,...,841.0,5.8,67.6,226.6,9.982,25915.5,7.9,402.642935,408.168429,412.389512


In [23]:
df4=df4.iloc[1:,:]
df4.head()

Unnamed: 0_level_0,cost_text,rooms,baths,parking,driving_time_to_school,distance_to_school,driving_time_to_CBD,distance_to_CBD,driving_time_to_sc,distance_to_sc,...,num_increase,percent_increase,Area,pop_dens_21,num_job_19,income_rate_19,unemployment_rate_21,2023,2024,2025
SA2_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201011002.0,413.472222,2.944444,1.361111,1.277778,125.538611,1050.494444,5059.028333,114844.282778,208.058333,1893.676389,...,-289.0,-2.3,12.4,975.5,10.22,24587.0,4.25,409.569857,414.461089,417.814154
201011005.0,432.857143,3.571429,1.714286,1.285714,217.98,1646.151429,5374.818571,117215.971429,889.321429,10394.03,...,507.0,7.5,51.6,140.2,5.614,27272.0,2.77,439.878571,442.168983,443.553587
201011006.0,452.948718,3.897436,2.025641,2.153846,303.162308,2675.248205,5599.432821,119381.88641,300.210256,2393.878462,...,4802.0,82.3,34.2,311.5,6.568,27209.0,5.55,459.368497,458.016846,456.810066
201011008.0,373.695652,2.956522,1.434783,1.652174,146.567826,1462.468261,5001.787826,118067.323478,238.78,2259.183478,...,841.0,5.8,67.6,226.6,9.982,25915.5,7.9,402.642935,408.168429,412.389512
201021009.0,385.0,3.0,1.5,2.5,142.9025,1971.44,3400.175,76043.1575,1023.2625,21842.94,...,861.0,15.2,1038.8,6.3,4.977,27824.0,3.85,451.125,465.440034,473.461903


In [25]:
df4=df4.reset_index()
df4=df4.drop(columns=['index'])
df4

Unnamed: 0,SA2_code,cost_text,rooms,baths,parking,driving_time_to_school,distance_to_school,driving_time_to_CBD,distance_to_CBD,driving_time_to_sc,...,num_increase,percent_increase,Area,pop_dens_21,num_job_19,income_rate_19,unemployment_rate_21,2023,2024,2025
0,201011002.0,413.472222,2.944444,1.361111,1.277778,125.538611,1050.494444,5059.028333,114844.282778,208.058333,...,-289.0,-2.3,12.4,975.5,10.220,24587.0,4.25,409.569857,414.461089,417.814154
1,201011005.0,432.857143,3.571429,1.714286,1.285714,217.980000,1646.151429,5374.818571,117215.971429,889.321429,...,507.0,7.5,51.6,140.2,5.614,27272.0,2.77,439.878571,442.168983,443.553587
2,201011006.0,452.948718,3.897436,2.025641,2.153846,303.162308,2675.248205,5599.432821,119381.886410,300.210256,...,4802.0,82.3,34.2,311.5,6.568,27209.0,5.55,459.368497,458.016846,456.810066
3,201011008.0,373.695652,2.956522,1.434783,1.652174,146.567826,1462.468261,5001.787826,118067.323478,238.780000,...,841.0,5.8,67.6,226.6,9.982,25915.5,7.90,402.642935,408.168429,412.389512
4,201021009.0,385.000000,3.000000,1.500000,2.500000,142.902500,1971.440000,3400.175000,76043.157500,1023.262500,...,861.0,15.2,1038.8,6.3,4.977,27824.0,3.85,451.125000,465.440034,473.461903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,217031476.0,325.555556,3.444444,1.555556,1.555556,138.784444,1401.961111,9625.526667,196466.757778,3965.891111,...,558.0,16.3,1505.9,2.6,3.370,14554.0,3.65,348.288889,346.406778,347.499392
370,217041477.0,420.000000,3.500000,1.500000,1.000000,747.650000,12172.510000,10121.907500,226987.482500,2587.572500,...,210.0,3.1,3275.8,2.1,6.523,17586.0,3.28,445.862500,450.498000,451.515434
371,217041478.0,376.000000,2.800000,1.200000,1.200000,562.630000,10636.514000,12580.866000,286117.264000,2089.912000,...,939.0,10.4,2037.8,4.9,8.399,20000.0,2.50,431.878000,421.814518,417.285935
372,217041479.0,399.736842,2.578947,1.315789,1.263158,150.326316,1189.777895,11446.829474,255943.605263,374.212632,...,2583.0,13.0,62.4,359.9,19.115,22262.5,3.75,396.784298,394.386486,391.966645


In [26]:
df4=df4[['SA2_code','cost_text','2023','2024','2025']]
df4

Unnamed: 0,SA2_code,cost_text,2023,2024,2025
0,201011002.0,413.472222,409.569857,414.461089,417.814154
1,201011005.0,432.857143,439.878571,442.168983,443.553587
2,201011006.0,452.948718,459.368497,458.016846,456.810066
3,201011008.0,373.695652,402.642935,408.168429,412.389512
4,201021009.0,385.000000,451.125000,465.440034,473.461903
...,...,...,...,...,...
369,217031476.0,325.555556,348.288889,346.406778,347.499392
370,217041477.0,420.000000,445.862500,450.498000,451.515434
371,217041478.0,376.000000,431.878000,421.814518,417.285935
372,217041479.0,399.736842,396.784298,394.386486,391.966645


In [27]:
#find the SA2 name according to the SA2 code
SA2=[]
for i in range(df4.shape[0]):
    for j in range(df.shape[0]):
        if df4.iloc[i]['SA2_code'] == df.iloc[j]['SA2_code']:
            SA2.append(df.iloc[j]['SA2_name'])
            break
SA2

['Ballarat',
 'Buninyong',
 'Delacombe',
 'Wendouree - Miners Rest',
 'Bacchus Marsh Surrounds',
 'Creswick - Clunes',
 'Daylesford',
 'Gordon (Vic.)',
 'Maryborough (Vic.)',
 'Bendigo',
 'California Gully - Eaglehawk',
 'East Bendigo - Kennington',
 'Flora Hill - Spring Gully',
 'Kangaroo Flat - Golden Square',
 'Maiden Gully',
 'Strathfieldsaye',
 'White Hills - Ascot',
 'Bendigo Surrounds - South',
 'Castlemaine',
 'Castlemaine Surrounds',
 'Heathcote',
 'Kyneton',
 'Woodend',
 'Bendigo Surrounds - North',
 'Loddon',
 'Bannockburn',
 'Winchelsea',
 'Belmont',
 'Geelong',
 'Geelong West - Hamlyn Heights',
 'Highton',
 'Lara',
 'Leopold',
 'Newcomb - Moolap',
 'Newtown (Vic.)',
 'North Geelong - Bell Park',
 'Clifton Springs',
 'Lorne - Anglesea',
 'Portarlington',
 'Point Lonsdale - Queenscliff',
 'Torquay',
 'Alexandra',
 'Kilmore - Broadford',
 'Mansfield (Vic.)',
 'Seymour',
 'Yea',
 'Benalla',
 'Benalla Surrounds',
 'Rutherglen',
 'Wangaratta',
 'Wangaratta Surrounds',
 'Beechwor

In [28]:
#get the final dataframe
df4['SA2_name']=SA2
df4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['SA2_name']=SA2


Unnamed: 0,SA2_code,cost_text,2023,2024,2025,SA2_name
0,201011002.0,413.472222,409.569857,414.461089,417.814154,Ballarat
1,201011005.0,432.857143,439.878571,442.168983,443.553587,Buninyong
2,201011006.0,452.948718,459.368497,458.016846,456.810066,Delacombe
3,201011008.0,373.695652,402.642935,408.168429,412.389512,Wendouree - Miners Rest
4,201021009.0,385.000000,451.125000,465.440034,473.461903,Bacchus Marsh Surrounds
...,...,...,...,...,...,...
369,217031476.0,325.555556,348.288889,346.406778,347.499392,Otway
370,217041477.0,420.000000,445.862500,450.498000,451.515434,Moyne - East
371,217041478.0,376.000000,431.878000,421.814518,417.285935,Moyne - West
372,217041479.0,399.736842,396.784298,394.386486,391.966645,Warrnambool - North


In [29]:
df4.to_csv('../data/curated/rfmodel.csv',index=False)

In [30]:
# calculate increase in rent each year
df4['22to23dif']=df4['2023']-df4['cost_text']
df4['23to24dif']=df4['2024']-df4['2023']
df4['24to25dif']=df4['2025']-df4['2024']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['22to23dif']=df4['2023']-df4['cost_text']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['23to24dif']=df4['2024']-df4['2023']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['24to25dif']=df4['2025']-df4['2024']


In [31]:
# calculate how much the rent increase for each suburb by percentage
df4['22to23rate']=((df4['2023']-df4['cost_text'])/df4['cost_text'])*100
df4['23to24rate']=((df4['2024']-df4['2023'])/df4['2023'])*100
df4['24to25rate']=((df4['2025']-df4['2024'])/df4['2024'])*100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['22to23rate']=((df4['2023']-df4['cost_text'])/df4['cost_text'])*100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['23to24rate']=((df4['2024']-df4['2023'])/df4['2023'])*100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['24to25rate']=((df4['2025']-df4['2024'])/df4['2024'])*100


In [32]:
# 2023 top 10 suburb with highest rent increasing
df4.sort_values('22to23dif',ascending=False).head(10)[['SA2_name','22to23dif']]

Unnamed: 0,SA2_name,22to23dif
17,Bendigo Surrounds - South,182.75
341,Mildura Surrounds,131.8
245,Yarra Valley,116.1
168,Kinglake,98.59
250,Narre Warren North,70.5
335,Nhill Region,67.25
4,Bacchus Marsh Surrounds,66.125
57,Yackandandah,65.5
371,Moyne - West,55.878
68,Leongatha,53.590556


In [33]:
# 2024 top 10 suburb with highest rent increasing
df4.sort_values('23to24dif',ascending=False).head(10)[['SA2_name','23to24dif']]

Unnamed: 0,SA2_name,23to24dif
245,Yarra Valley,57.488583
338,West Wimmera,28.671
57,Yackandandah,23.4965
361,Glenelg (Vic.),19.28895
78,Yarram,15.41
7,Gordon (Vic.),14.69805
4,Bacchus Marsh Surrounds,14.315034
339,Yarriambiack,13.96425
351,Kyabram,13.919
212,Rowville - North,12.358456


In [34]:
# 2025 top 10 suburb with highest rent increasing
df4.sort_values('24to25dif',ascending=False).head(10)[['SA2_name','24to25dif']]

Unnamed: 0,SA2_name,24to25dif
245,Yarra Valley,38.762607
338,West Wimmera,18.11443
361,Glenelg (Vic.),17.007364
7,Gordon (Vic.),13.532491
57,Yackandandah,13.500669
212,Rowville - North,9.734611
25,Bannockburn,9.43258
183,South Morang - South,9.298928
250,Narre Warren North,9.240042
4,Bacchus Marsh Surrounds,8.021869


In [35]:
# 2023 top 10 suburb with highest rent increasing by rate(percentage)
df4.sort_values('22to23rate',ascending=False).head(10)[['SA2_name','22to23rate']]

Unnamed: 0,SA2_name,22to23rate
341,Mildura Surrounds,59.909091
245,Yarra Valley,58.05
17,Bendigo Surrounds - South,45.6875
57,Yackandandah,34.473684
335,Nhill Region,27.44898
168,Kinglake,25.279487
68,Leongatha,20.159457
338,West Wimmera,18.217391
4,Bacchus Marsh Surrounds,17.175325
354,Moira,16.502247


In [36]:
# 2024 top 10 suburb with highest rent increasing by rate(percentage)
df4.sort_values('23to24rate',ascending=False).head(10)[['SA2_name','23to24rate']]

Unnamed: 0,SA2_name,23to24rate
245,Yarra Valley,18.186834
338,West Wimmera,10.544686
57,Yackandandah,9.196282
361,Glenelg (Vic.),6.631354
339,Yarriambiack,4.785145
347,Kerang,4.380028
242,Mount Evelyn,4.056106
351,Kyabram,3.935256
7,Gordon (Vic.),3.254901
4,Bacchus Marsh Surrounds,3.173186


In [37]:
# 2025 top 10 suburb with highest rent increasing by rate(percentage)
df4.sort_values('24to25rate',ascending=False).head(10)[['SA2_name','24to25rate']]

Unnamed: 0,SA2_name,24to25rate
245,Yarra Valley,10.375747
338,West Wimmera,6.026673
361,Glenelg (Vic.),5.483346
57,Yackandandah,4.83901
7,Gordon (Vic.),2.902319
242,Mount Evelyn,2.756645
355,Numurkah,2.289651
339,Yarriambiack,2.249354
183,South Morang - South,2.034031
212,Rowville - North,1.991376
