In [165]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import log
from statsmodels.formula.api import ols

df = pd.read_csv('../data/curated/ques2data.csv')
df = df[(df['Type'] == 'All properties') | (df['Year'] > 2020)]

print(len(df['suburb'].unique()))
df = df.drop('Type', axis=1)
df.loc[:, 'Median_Price'] = df['Median_Price'].apply(pd.to_numeric, errors='coerce')
df = df.dropna(subset=['population', 'income'], axis=0)
print(len(df['suburb'].unique()))
df.columns

384
312


Index(['suburb', 'Year', 'population', 'income', 'Median_Price'], dtype='object')

In [166]:
known_suburbs = df[df['Year'] <= 2020]['suburb'].unique()

df = df[df['suburb'].isin(known_suburbs)]

print(len(df['suburb'].unique()))


144


In [167]:
df.loc[:, 'income_pct_change'] = df.groupby('suburb')['income'].apply(pd.Series.pct_change) + 1
df.loc[:, 'income_change'] = df.groupby('suburb')['income_pct_change'].apply(pd.Series.cumprod) - 1

df.loc[:, 'pop_pct_change'] = df.groupby('suburb')['population'].apply(pd.Series.pct_change) + 1
df.loc[:, 'population_change'] = df.groupby('suburb')['pop_pct_change'].apply(pd.Series.cumprod) - 1
df.drop(['income_pct_change', 'pop_pct_change'], axis=1, inplace=True)

In [168]:
model = ols(formula="""Median_Price ~ suburb + income_change + population_change""",
            data=df)

model.fit().summary()

0,1,2,3
Dep. Variable:,Median_Price,R-squared:,0.963
Model:,OLS,Adj. R-squared:,0.95
Method:,Least Squares,F-statistic:,76.59
Date:,"Sun, 09 Oct 2022",Prob (F-statistic):,2.1499999999999998e-240
Time:,22:34:40,Log-Likelihood:,-2268.6
No. Observations:,576,AIC:,4829.0
Df Residuals:,430,BIC:,5465.0
Df Model:,145,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,443.1609,7.627,58.102,0.000,428.169,458.152
suburb[T.Albert Park],105.5010,10.340,10.203,0.000,85.177,125.825
suburb[T.Alphington],-47.6795,10.350,-4.607,0.000,-68.023,-27.336
suburb[T.Altona],-72.0784,10.250,-7.032,0.000,-92.225,-51.932
suburb[T.Armadale],28.5547,10.354,2.758,0.006,8.203,48.906
suburb[T.Ascot Vale],-22.5062,10.354,-2.174,0.030,-42.857,-2.155
suburb[T.Ashburton],-3.5651,10.291,-0.346,0.729,-23.792,16.661
suburb[T.Aspendale],-31.6141,10.350,-3.054,0.002,-51.957,-11.271
suburb[T.Avondale Heights],-51.5172,10.298,-5.003,0.000,-71.758,-31.276

0,1,2,3
Omnibus:,62.742,Durbin-Watson:,2.179
Prob(Omnibus):,0.0,Jarque-Bera (JB):,214.086
Skew:,-0.465,Prob(JB):,3.25e-47
Kurtosis:,5.838,Cond. No.,149.0


In [169]:
future_df = df[df['Year'] > 2020]

In [170]:
future_df.loc[:, 'Median_Price'] = model.fit().predict(future_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_df.loc[:, 'Median_Price'] = model.fit().predict(future_df)


In [171]:
future_df.sort_values('income_change')

Unnamed: 0,suburb,Year,population,income,Median_Price,income_change,population_change
5649,Port Melbourne,2025,18137.345904,43602.287980,566.406062,-0.598832,0.063782
5648,Port Melbourne,2024,18016.178669,44257.012448,565.256083,-0.592808,0.056675
5647,Port Melbourne,2023,17895.123755,46621.494706,565.849841,-0.571054,0.049575
5646,Port Melbourne,2022,17773.459914,50930.818986,568.416043,-0.531405,0.042439
5645,Port Melbourne,2021,17652.756421,57346.293187,573.142577,-0.472379,0.035360
...,...,...,...,...,...,...,...
4555,Mentone,2021,13331.031292,136024.216986,471.244901,1.032918,-0.026645
4556,Mentone,2022,13258.049977,169081.747885,524.594547,1.526971,-0.031973
4557,Mentone,2023,13185.098940,218402.637071,604.861552,2.264085,-0.037300
4558,Mentone,2024,13112.132501,294503.446082,729.450577,3.401432,-0.042628


In [172]:
df.loc[df['Year'] > 2020, 'Median_Price'] = model.fit().predict(df[df['Year'] > 2020])

In [173]:
df[df['Year'] > 2020]

Unnamed: 0,suburb,Year,population,income,Median_Price,income_change,population_change
35,Abbotsford,2021,9337.364459,87739.135073,480.788008,0.210807,0.055858
36,Abbotsford,2022,9404.488164,92916.656874,490.641225,0.282257,0.063448
37,Abbotsford,2023,9878.605420,99085.265448,513.776344,0.367385,0.117061
38,Abbotsford,2024,10022.687593,106485.898547,529.252082,0.469514,0.133353
39,Abbotsford,2025,10190.912148,115433.759061,547.790376,0.592995,0.152376
...,...,...,...,...,...,...,...
7435,Yarraville,2021,15910.505666,91911.558135,476.045022,0.205745,-0.000617
7436,Yarraville,2022,15908.663387,96980.309295,483.379130,0.272240,-0.000733
7437,Yarraville,2023,15907.936920,102997.167982,492.108535,0.351172,-0.000778
7438,Yarraville,2024,15906.499919,110186.147542,502.529343,0.445481,-0.000868


In [174]:
sub_postcodeinfo = pd.read_csv('../data/raw/realestate.csv')
sub_postcodeinfo = sub_postcodeinfo[['suburb', 'postcode']]
sub_postcode = sub_postcodeinfo.groupby(['suburb', 'postcode']).count().reset_index()
sub_postcode
df = df.merge(sub_postcode, on='suburb', how='inner')
df

Unnamed: 0,suburb,Year,population,income,Median_Price,income_change,population_change,postcode
0,Abbotsford,2016,8843.391594,72463.341789,450.000000,,,3067
1,Abbotsford,2017,9360.969572,75929.483052,450.000000,0.047833,0.058527,3067
2,Abbotsford,2018,9595.265665,78784.652725,475.000000,0.087235,0.085021,3067
3,Abbotsford,2019,9661.113233,81883.887314,495.000000,0.130004,0.092467,3067
4,Abbotsford,2020,9738.337037,86715.715407,490.000000,0.196684,0.101199,3067
...,...,...,...,...,...,...,...,...
1445,Yarraville,2021,15910.505666,91911.558135,476.045022,0.205745,-0.000617,3013
1446,Yarraville,2022,15908.663387,96980.309295,483.379130,0.272240,-0.000733,3013
1447,Yarraville,2023,15907.936920,102997.167982,492.108535,0.351172,-0.000778,3013
1448,Yarraville,2024,15906.499919,110186.147542,502.529343,0.445481,-0.000868,3013


In [175]:
df = df[df['Year'].isin([2022, 2025])]
df.loc[:, 'Median_Price_Change'] = df.groupby('suburb')['Median_Price'].apply(pd.Series.pct_change)
df.dropna(inplace=True)
df = df[['Median_Price_Change', 'suburb', 'postcode', 'Median_Price']]
df.to_csv('../data/curated/2025estimates.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'Median_Price_Change'] = df.groupby('suburb')['Median_Price'].apply(pd.Series.pct_change)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [186]:
highest_growth_df = df.sort_values('Median_Price_Change').tail(11)
highest_growth_df

Unnamed: 0,Median_Price_Change,suburb,postcode,Median_Price
888,0.120442,Melbourne,3000,531.802452
369,0.12311,Clayton,3168,418.955355
1089,0.130755,Parkville,3052,449.013863
409,0.137765,Collingwood,3066,559.044234
1029,0.146711,North Melbourne,3051,489.086321
1419,0.147742,West Melbourne,3003,538.68928
749,0.167294,Hoppers Crossing,3029,492.926202
279,0.174685,Carlton,3053,436.221906
419,0.179983,Craigieburn,3064,538.58125
1189,0.184531,Roxburgh Park,3064,527.33125


In [192]:
# discard Mentone, becasue it seems like an outlier

list(highest_growth_df[0:10]['suburb'])[::-1]

['Roxburgh Park',
 'Craigieburn',
 'Carlton',
 'Hoppers Crossing',
 'West Melbourne',
 'North Melbourne',
 'Collingwood',
 'Parkville',
 'Clayton',
 'Melbourne']