In [1]:
import numpy as np
import pandas as pd
import folium
import geopandas as gpd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error

In [2]:
df = pd.read_csv("../data/curated/predict_data.csv")
df.head()

Unnamed: 0,url,postcode,longitude,latitude,price,bed,bath,parking,cbd_dist,school_counts,furnished,2022 no.,2023 no.,2024 no.,2025 no.
0,https://www.domain.com.au/9-21-rockley-road-so...,3141,144.999757,-37.837338,400.0,1,1,1.0,5.9614,6.0,0,6515.0,6508.0,6512.0,6520.0
1,https://www.domain.com.au/35-233-canterbury-ro...,3182,144.976195,-37.85915,360.0,1,1,1.0,4.1882,5.0,0,16152.0,16303.0,16522.0,16692.0
2,https://www.domain.com.au/2-31-howitt-street-s...,3141,145.002817,-37.846776,320.0,1,1,0.0,6.9114,6.0,0,6515.0,6508.0,6512.0,6520.0
3,https://www.domain.com.au/2-17-railway-parade-...,3163,145.069344,-37.891722,360.0,1,1,1.0,14.3864,5.0,0,14114.0,14380.0,14662.0,14942.0
4,https://www.domain.com.au/2-16-vale-st-st-kild...,3182,144.983362,-37.866152,320.0,1,1,1.0,5.8505,5.0,0,16152.0,16303.0,16522.0,16692.0


In [3]:
# select columns
df = df.iloc[:, [1,4,5,6,7,8,9,10,11,12,13,14]]
df

Unnamed: 0,postcode,price,bed,bath,parking,cbd_dist,school_counts,furnished,2022 no.,2023 no.,2024 no.,2025 no.
0,3141,400.0,1,1,1.0,5.9614,6.0,0,6515.0,6508.0,6512.0,6520.0
1,3182,360.0,1,1,1.0,4.1882,5.0,0,16152.0,16303.0,16522.0,16692.0
2,3141,320.0,1,1,0.0,6.9114,6.0,0,6515.0,6508.0,6512.0,6520.0
3,3163,360.0,1,1,1.0,14.3864,5.0,0,14114.0,14380.0,14662.0,14942.0
4,3182,320.0,1,1,1.0,5.8505,5.0,0,16152.0,16303.0,16522.0,16692.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3427,3754,350.0,2,1,1.0,37.3377,10.0,0,4348.0,4353.0,4359.0,4376.0
3428,3186,510.0,2,1,1.0,8.7118,8.0,0,19519.0,19744.0,19999.0,20259.0
3429,3181,550.0,2,1,0.0,7.0455,3.0,0,20158.0,20756.0,21280.0,21647.0
3430,3122,600.0,2,2,1.0,9.1834,8.0,0,16028.0,16249.0,16618.0,17208.0


In [4]:
df.isnull().sum() # check null values

postcode         0
price            0
bed              0
bath             0
parking          0
cbd_dist         0
school_counts    0
furnished        0
2022 no.         0
2023 no.         0
2024 no.         0
2025 no.         0
dtype: int64

In [4]:
# deal with categorical variables
dummies = pd.get_dummies(df.furnished, prefix = ['f'])
df = pd.concat([df, dummies],axis = 1)
del df['furnished']

In [5]:
df

Unnamed: 0,postcode,price,bed,bath,parking,cbd_dist,school_counts,2022 no.,2023 no.,2024 no.,2025 no.,['f']_0,['f']_1
0,3141,400.0,1,1,1.0,5.9614,6.0,6515.0,6508.0,6512.0,6520.0,1,0
1,3182,360.0,1,1,1.0,4.1882,5.0,16152.0,16303.0,16522.0,16692.0,1,0
2,3141,320.0,1,1,0.0,6.9114,6.0,6515.0,6508.0,6512.0,6520.0,1,0
3,3163,360.0,1,1,1.0,14.3864,5.0,14114.0,14380.0,14662.0,14942.0,1,0
4,3182,320.0,1,1,1.0,5.8505,5.0,16152.0,16303.0,16522.0,16692.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3427,3754,350.0,2,1,1.0,37.3377,10.0,4348.0,4353.0,4359.0,4376.0,1,0
3428,3186,510.0,2,1,1.0,8.7118,8.0,19519.0,19744.0,19999.0,20259.0,1,0
3429,3181,550.0,2,1,0.0,7.0455,3.0,20158.0,20756.0,21280.0,21647.0,1,0
3430,3122,600.0,2,2,1.0,9.1834,8.0,16028.0,16249.0,16618.0,17208.0,1,0


In [6]:
# calculate population change each year
df['pop23-22'] = df['2023 no.'] - df['2022 no.']
df['pop24-23'] = df['2024 no.'] - df['2023 no.']
df['pop25-24'] = df['2025 no.'] - df['2024 no.']
df = df.drop(columns = ['2022 no.', '2023 no.', '2024 no.', '2025 no.'])
df

Unnamed: 0,postcode,price,bed,bath,parking,cbd_dist,school_counts,['f']_0,['f']_1,pop23-22,pop24-23,pop25-24
0,3141,400.0,1,1,1.0,5.9614,6.0,1,0,-7.0,4.0,8.0
1,3182,360.0,1,1,1.0,4.1882,5.0,1,0,151.0,219.0,170.0
2,3141,320.0,1,1,0.0,6.9114,6.0,1,0,-7.0,4.0,8.0
3,3163,360.0,1,1,1.0,14.3864,5.0,1,0,266.0,282.0,280.0
4,3182,320.0,1,1,1.0,5.8505,5.0,1,0,151.0,219.0,170.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3427,3754,350.0,2,1,1.0,37.3377,10.0,1,0,5.0,6.0,17.0
3428,3186,510.0,2,1,1.0,8.7118,8.0,1,0,225.0,255.0,260.0
3429,3181,550.0,2,1,0.0,7.0455,3.0,1,0,598.0,524.0,367.0
3430,3122,600.0,2,2,1.0,9.1834,8.0,1,0,221.0,369.0,590.0


In [7]:
y = np.array(df['price'])
x = df.drop(columns = ['postcode', 'price', 'pop24-23', 'pop25-24'], axis = 1)
x_list = list(x.columns)
x = np.array(x)
x

array([[  1.,   1.,   1., ...,   1.,   0.,  -7.],
       [  1.,   1.,   1., ...,   1.,   0., 151.],
       [  1.,   1.,   0., ...,   1.,   0.,  -7.],
       ...,
       [  2.,   1.,   0., ...,   1.,   0., 598.],
       [  2.,   2.,   1., ...,   1.,   0., 221.],
       [  2.,   1.,   1., ...,   1.,   0.,  50.]])

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.28, random_state = 42)

In [9]:
# fit a linear regression model
model = LinearRegression().fit(x_train, y_train)

In [10]:
y_pred = model.predict(x)

In [12]:
#mse_1 = mean_squared_error(y, y_pred)
#print ("Train ERROR = ", mse_1)

pred1_train = model.predict(x_train)
mse_1 = mean_squared_error(pred1_train,y_train)
print ("Train ERROR = ", mse_1)

pred1_test = model.predict(x_test)
mse_2 = mean_squared_error(pred1_test,y_test)
print("Test ERROR = ", mse_2)

Train ERROR =  10049.085728366746
Test ERROR =  18753.082089837142


In [21]:
df['price_2023'] = y_pred
df

Unnamed: 0,postcode,price,bed,bath,parking,cbd_dist,school_counts,['f']_0,['f']_1,pop23-22,pop24-23,pop25-24,price_2023
0,3141,400.0,1,1,1.0,5.9614,6.0,1,0,-7.0,4.0,8.0,366.621240
1,3182,360.0,1,1,1.0,4.1882,5.0,1,0,151.0,219.0,170.0,375.578976
2,3141,320.0,1,1,0.0,6.9114,6.0,1,0,-7.0,4.0,8.0,354.256832
3,3163,360.0,1,1,1.0,14.3864,5.0,1,0,266.0,282.0,280.0,373.131511
4,3182,320.0,1,1,1.0,5.8505,5.0,1,0,151.0,219.0,170.0,374.899602
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3427,3754,350.0,2,1,1.0,37.3377,10.0,1,0,5.0,6.0,17.0,430.635748
3428,3186,510.0,2,1,1.0,8.7118,8.0,1,0,225.0,255.0,260.0,457.364861
3429,3181,550.0,2,1,0.0,7.0455,3.0,1,0,598.0,524.0,367.0,480.996261
3430,3122,600.0,2,2,1.0,9.1834,8.0,1,0,221.0,369.0,590.0,582.594692


In [22]:
# new y is the price of 2023
y2 = np.array(df['price_2023'])
x2 = df.iloc[:, [1, 10]]
x2_list = list(x2.columns)
x2 = np.array(x2)
x2

array([[400.,   4.],
       [360., 219.],
       [320.,   4.],
       ...,
       [550., 524.],
       [600., 369.],
       [490.,  39.]])

In [23]:
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size = 0.28, random_state = 42)
m2 = LinearRegression().fit(x2_train, y2_train)
y2_pred = m2.predict(x2)
df['price_2024'] = y2_pred
df

Unnamed: 0,postcode,price,bed,bath,parking,cbd_dist,school_counts,['f']_0,['f']_1,pop23-22,pop24-23,pop25-24,price_2023,price_2024
0,3141,400.0,1,1,1.0,5.9614,6.0,1,0,-7.0,4.0,8.0,366.621240,391.199273
1,3182,360.0,1,1,1.0,4.1882,5.0,1,0,151.0,219.0,170.0,375.578976,380.333405
2,3141,320.0,1,1,0.0,6.9114,6.0,1,0,-7.0,4.0,8.0,354.256832,362.888233
3,3163,360.0,1,1,1.0,14.3864,5.0,1,0,266.0,282.0,280.0,373.131511,381.297349
4,3182,320.0,1,1,1.0,5.8505,5.0,1,0,151.0,219.0,170.0,374.899602,366.177885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3427,3754,350.0,2,1,1.0,37.3377,10.0,1,0,5.0,6.0,17.0,430.635748,373.535475
3428,3186,510.0,2,1,1.0,8.7118,8.0,1,0,225.0,255.0,260.0,457.364861,433.967431
3429,3181,550.0,2,1,0.0,7.0455,3.0,1,0,598.0,524.0,367.0,480.996261,452.238841
3430,3122,600.0,2,2,1.0,9.1834,8.0,1,0,221.0,369.0,590.0,582.594692,467.561632


In [24]:
# new y is the price of 2024
y3 = np.array(df['price_2024'])
x3 = df.iloc[:, [11, 12]]
x3_list = list(x3.columns)
x3 = np.array(x3)
x3

array([[  8.        , 366.62123956],
       [170.        , 375.57897631],
       [  8.        , 354.25683213],
       ...,
       [367.        , 480.99626128],
       [590.        , 582.5946918 ],
       [ 35.        , 502.51306545]])

In [25]:
x3_train, x3_test, y3_train, y3_test = train_test_split(x3, y3, test_size=0.28, random_state=42)
m3 = LinearRegression().fit(x3_train, y3_train)
y3_pred = m3.predict(x3)
df['price_2025'] = y3_pred
df

Unnamed: 0,postcode,price,bed,bath,parking,cbd_dist,school_counts,['f']_0,['f']_1,pop23-22,pop24-23,pop25-24,price_2023,price_2024,price_2025
0,3141,400.0,1,1,1.0,5.9614,6.0,1,0,-7.0,4.0,8.0,366.621240,391.199273,379.334441
1,3182,360.0,1,1,1.0,4.1882,5.0,1,0,151.0,219.0,170.0,375.578976,380.333405,385.024144
2,3141,320.0,1,1,0.0,6.9114,6.0,1,0,-7.0,4.0,8.0,354.256832,362.888233,374.950472
3,3163,360.0,1,1,1.0,14.3864,5.0,1,0,266.0,282.0,280.0,373.131511,381.297349,385.863137
4,3182,320.0,1,1,1.0,5.8505,5.0,1,0,151.0,219.0,170.0,374.899602,366.177885,384.783263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3427,3754,350.0,2,1,1.0,37.3377,10.0,1,0,5.0,6.0,17.0,430.635748,373.535475,402.171305
3428,3186,510.0,2,1,1.0,8.7118,8.0,1,0,225.0,255.0,260.0,457.364861,433.967431,415.418899
3429,3181,550.0,2,1,0.0,7.0455,3.0,1,0,598.0,524.0,367.0,480.996261,452.238841,425.457961
3430,3122,600.0,2,2,1.0,9.1834,8.0,1,0,221.0,369.0,590.0,582.594692,467.561632,464.941172


In [26]:
rate_df = df.iloc[:, [0, 1, 2, 12, 13, 14]]
rate_df

Unnamed: 0,postcode,price,bed,price_2023,price_2024,price_2025
0,3141,400.0,1,366.621240,391.199273,379.334441
1,3182,360.0,1,375.578976,380.333405,385.024144
2,3141,320.0,1,354.256832,362.888233,374.950472
3,3163,360.0,1,373.131511,381.297349,385.863137
4,3182,320.0,1,374.899602,366.177885,384.783263
...,...,...,...,...,...,...
3427,3754,350.0,2,430.635748,373.535475,402.171305
3428,3186,510.0,2,457.364861,433.967431,415.418899
3429,3181,550.0,2,480.996261,452.238841,425.457961
3430,3122,600.0,2,582.594692,467.561632,464.941172


In [27]:
# calculate growth rate
rate_df['rate_22-23'] = (rate_df['price_2023'] - rate_df['price']) / rate_df['price']
rate_df['rate_23-24'] = (rate_df['price_2024'] - rate_df['price_2023']) / rate_df['price_2023']
rate_df['rate_24-25'] = (rate_df['price_2025'] - rate_df['price_2024']) / rate_df['price_2024']
rate_df['avg_rate'] = (rate_df['rate_22-23'] + rate_df['rate_23-24'] + rate_df['rate_24-25']) / 3
rate_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rate_df['rate_22-23'] = (rate_df['price_2023'] - rate_df['price']) / rate_df['price']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rate_df['rate_23-24'] = (rate_df['price_2024'] - rate_df['price_2023']) / rate_df['price_2023']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rate_df['rate_24-25'] =

Unnamed: 0,postcode,price,bed,price_2023,price_2024,price_2025,rate_22-23,rate_23-24,rate_24-25,avg_rate
0,3141,400.0,1,366.621240,391.199273,379.334441,-0.083447,0.067039,-0.030329,-0.015579
1,3182,360.0,1,375.578976,380.333405,385.024144,0.043275,0.012659,0.012333,0.022756
2,3141,320.0,1,354.256832,362.888233,374.950472,0.107053,0.024365,0.033240,0.054886
3,3163,360.0,1,373.131511,381.297349,385.863137,0.036476,0.021885,0.011974,0.023445
4,3182,320.0,1,374.899602,366.177885,384.783263,0.171561,-0.023264,0.050810,0.066369
...,...,...,...,...,...,...,...,...,...,...
3427,3754,350.0,2,430.635748,373.535475,402.171305,0.230388,-0.132595,0.076662,0.058151
3428,3186,510.0,2,457.364861,433.967431,415.418899,-0.103206,-0.051157,-0.042742,-0.065702
3429,3181,550.0,2,480.996261,452.238841,425.457961,-0.125461,-0.059787,-0.059218,-0.081489
3430,3122,600.0,2,582.594692,467.561632,464.941172,-0.029009,-0.197450,-0.005605,-0.077354


In [46]:
rate_df.to_csv(r'../data/curated/rate.csv', index = False)

In [39]:
# output the top 10 suburbs with highest average growth rate
growth_df = rate_df.groupby('postcode')['avg_rate'].mean().sort_values(ascending = False)
growth_df.head(10)

postcode
3048    0.658791
3202    0.214578
3085    0.207713
3465    0.202106
3305    0.201719
3429    0.188992
3125    0.173976
3585    0.162220
3350    0.159621
3840    0.156779
Name: avg_rate, dtype: float64