In [29]:
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from statsmodels.formula.api import ols

pd.options.mode.chained_assignment = None

suburb = pd.read_csv('../data/curated/suburb_final.csv')
suburb_2023 = pd.read_csv('../data/curated/suburb_2023.csv')
suburb_2024 = pd.read_csv('../data/curated/suburb_2024.csv')


In [30]:
suburb = suburb.fillna(0)
suburb

Unnamed: 0.1,Unnamed: 0,LOC_PID,LOC_NAME,rental_price,suburb_population,population_density,offence_count_scaled,income,num_stations,num_schools,num_hospitals
0,0,loc0067a4549ed1,Korumburra,278.333333,4897.000000,81.797573,0.165407,47097.371969,0.0,3.0,0.0
1,1,loc00a9769647d7,Kew,621.281250,26158.000000,2486.279947,0.107577,71097.931084,0.0,11.0,6.0
2,2,loc00d1503504f1,Glen Waverley,561.269841,41928.000000,2489.998587,0.105967,44548.852850,2.0,13.0,2.0
3,3,loc00e6e39d335b,Sailors Falls,450.000000,1057.555556,108.121093,0.001891,46234.245242,0.0,0.0,0.0
4,4,loc00f0949ea0ad,Sunbury,471.617647,39266.294118,297.403254,0.130443,62622.919507,1.0,14.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
719,719,locff6258c8ea42,Montmorency,525.555556,9003.705882,2366.881699,0.050979,66315.679359,1.0,4.0,0.0
720,720,locff62fb6a898a,Carrum Downs,458.823529,22420.000000,1109.905643,0.165120,59654.130953,0.0,4.0,0.0
721,721,locffa1c8993b70,Mambourin,407.352941,5488.609375,154.062304,0.035164,59687.062945,0.0,0.0,0.0
722,722,locffb43e78ab10,Murtoa,300.000000,2133.333333,6.995021,0.056250,50199.145749,0.0,2.0,0.0


In [31]:
from sklearn import model_selection
import statsmodels.api as sm

train, test = model_selection.train_test_split(suburb, test_size=0.2, random_state=1234)
model = sm.formula.ols('rental_price ~ population_density + offence_count_scaled + income', data=train).fit()
print('The partial regression coefficients of the models were: \n', model.params)

test_X = test.drop(labels='rental_price', axis=1)
pred = model.predict(exog=test_X)

print('Comparing the difference between predicted and actual values:\n', pd.DataFrame({'Suburb': test.LOC_NAME, 'Prediction': pred, 'Real':test.rental_price}))


The partial regression coefficients of the models were: 
 Intercept               421.869545
population_density       -0.002202
offence_count_scaled   -254.822498
income                    0.002439
dtype: float64
Comparing the difference between predicted and actual values:
             Suburb  Prediction        Real
21        Watsonia  544.005479  446.428571
587  North Bendigo  496.797816  448.913043
271     Allansford  505.046901   40.000000
519  Wheelers Hill  523.027869  591.428571
489     Heidelberg  512.088790  452.516129
..             ...         ...         ...
227         Yinnar  530.856955  380.000000
288     Hughesdale  530.012512  567.500000
214        Donvale  558.178275  570.937500
594   Mirboo North  519.851823  295.000000
319    Melton West  501.175118  358.235294

[145 rows x 3 columns]


In [32]:
accuracy_list = []
for i in pred:
    for j in test.rental_price:
        if i > j:
            accuracy_list.append(j/i)
        else:
            accuracy_list.append(i/j)
            
accuracy = sum(accuracy_list) / len(accuracy_list)

print('Accuracy of this model is:\n', accuracy)

Accuracy of this model is:
 0.803289172066259


In [33]:
train, test = model_selection.train_test_split(suburb, test_size=0.2, random_state=1234)
model_more = sm.formula.ols('rental_price ~ population_density + offence_count_scaled + income + num_stations + num_schools + num_hospitals', data=train).fit()
print('The partial regression coefficients of the models were: \n', model_more.params)

test_X = test.drop(labels='rental_price', axis=1)
pred1 = model_more.predict(exog=test_X)

print('Comparing the difference between predicted and actual values:\n', pd.DataFrame({'Suburb': test.LOC_NAME, 'Prediction': pred1, 'Real':test.rental_price}))


The partial regression coefficients of the models were: 
 Intercept               434.285484
population_density       -0.000565
offence_count_scaled   -237.185984
income                    0.002648
num_stations             -6.064746
num_schools             -12.858653
num_hospitals            20.207447
dtype: float64
Comparing the difference between predicted and actual values:
             Suburb  Prediction        Real
21        Watsonia  545.033499  446.428571
587  North Bendigo  500.088618  448.913043
271     Allansford  518.530609   40.000000
519  Wheelers Hill  447.318547  591.428571
489     Heidelberg  592.964046  452.516129
..             ...         ...         ...
227         Yinnar  543.320078  380.000000
288     Hughesdale  525.151004  567.500000
214        Donvale  542.100979  570.937500
594   Mirboo North  517.441901  295.000000
319    Melton West  453.342166  358.235294

[145 rows x 3 columns]


In [34]:
accuracy_list1 = []
for i in pred1:
    for j in test.rental_price:
        if i > j:
            accuracy_list1.append(j/i)
        else:
            accuracy_list1.append(i/j)
            
accuracy = sum(accuracy_list1) / len(accuracy_list1)

print('Accuracy of this external model is:\n', accuracy)

Accuracy of this external model is:
 0.8008104605683144


In [35]:
sub_2023 = suburb_2023.drop(labels='rental_price', axis=1)
pred_2023 = model.predict(exog=sub_2023)

print('Prediction of rental price in 2023\n', pd.DataFrame({'Suburb': suburb_2023.LOC_NAME, 'Prediction': pred_2023}))

Prediction of rental price in 2023
             Suburb  Prediction
0       Korumburra  497.413056
1              Kew  566.806551
2    Glen Waverley  499.732491
3    Sailors Falls  537.480701
4          Sunbury  545.191032
..             ...         ...
719    Montmorency  570.614553
720   Carrum Downs  527.375005
721      Mambourin  561.814272
722         Murtoa  533.890503
723        Chelsea  544.301199

[724 rows x 2 columns]


In [36]:
sub_2024 = suburb_2024.drop(labels='rental_price', axis=1)
pred_2024 = model.predict(exog=sub_2024)

print('Prediction of rental price in 2024\n', pd.DataFrame({'Suburb': suburb_2024.LOC_NAME, 'Prediction': pred_2024}))

Prediction of rental price in 2024
             Suburb  Prediction
0       Korumburra  500.491129
1              Kew  571.331653
2    Glen Waverley  501.451553
3    Sailors Falls  541.156230
4          Sunbury  549.797142
..             ...         ...
719    Montmorency  575.981347
720   Carrum Downs  532.041627
721      Mambourin  565.570526
722         Murtoa  537.948015
723        Chelsea  550.579480

[724 rows x 2 columns]


In [37]:
price_pred = pd.DataFrame({'Suburb': suburb.LOC_NAME, 'Rental_price_now': suburb.rental_price,'Prediction_2023': pred_2023, 'Prediction_2024': pred_2024})

In [38]:
growth_rate_2023 = (price_pred.Prediction_2023 - price_pred.Rental_price_now) / price_pred.Rental_price_now
growth_rate_2024 = (price_pred.Prediction_2024 - price_pred.Prediction_2023) / price_pred.Prediction_2023
price_pred['avg_growth_rate'] = (growth_rate_2023 + growth_rate_2024) /2 



In [39]:
price_pred.to_csv('../data/raw/Prediction of rental price.csv')

In [40]:
top10_2022 = price_pred.sort_values(['Rental_price_now'],ascending=False).head(10)[['Suburb','Rental_price_now']]
top10_2023 = price_pred.sort_values(['Prediction_2023'],ascending=False).head(10)[['Suburb','Prediction_2023']]
top10_2024 = price_pred.sort_values(['Prediction_2024'],ascending=False).head(10)[['Suburb','Prediction_2024']]
top10_rate = price_pred.sort_values(['avg_growth_rate'],ascending=False).head(10)

In [41]:
top10_2022

Unnamed: 0,Suburb,Rental_price_now
630,Skenes Creek,3850.0
478,Merriang,2800.0
263,Flinders,2425.0
171,Seaspray,2250.0
586,Marengo,2100.0
184,Balnarring Beach,1995.0
346,Myrtleford,1788.571429
212,Apollo Bay,1657.5
11,Indented Head,1589.0
239,Cape Bridgewater,1540.0


In [42]:
top10_2023

Unnamed: 0,Suburb,Prediction_2023
395,South Kingsville,601.810564
542,Newport,601.207416
499,Ivanhoe East,597.775842
138,Eaglemont,597.567925
23,Mount Macedon,595.712861
362,Toorak,594.203339
364,Spotswood,591.821835
91,Yarraville,589.802167
491,Glen Iris,589.784045
275,Alphington,585.163586


In [43]:
top10_2024

Unnamed: 0,Suburb,Prediction_2024
395,South Kingsville,610.150136
542,Newport,609.546988
499,Ivanhoe East,603.864183
138,Eaglemont,603.656266
23,Mount Macedon,603.409809
364,Spotswood,600.161408
362,Toorak,599.542009
91,Yarraville,596.697153
491,Glen Iris,595.609511
275,Alphington,590.898539


In [49]:
top10_rate.to_csv("../data/curated/top_10_rate.csv")
top10_rate

Unnamed: 0,Suburb,Rental_price_now,Prediction_2023,Prediction_2024,avg_growth_rate
271,Allansford,40.0,508.15559,511.340411,5.855079
36,Bundalong,175.0,536.878445,541.67786,1.038408
5,Natimuk,200.0,559.164271,565.048668,0.903172
569,Nichols Point,195.0,540.498739,545.977402,0.890962
607,Watchem,200.0,533.155561,536.767367,0.836276
226,Kadnook,230.0,574.978192,582.926945,0.756865
189,Portland West,220.0,540.48803,543.305654,0.730988
713,Ouyen,220.0,537.027545,541.42953,0.724616
268,Trafalgar,221.666667,529.161768,531.962701,0.696245
500,Penshurst,230.0,544.204845,549.108565,0.687559


In [45]:
Top10 = pd.DataFrame({'Suburbs with highest rental price in 2022': top10_2022.Suburb, 'Suburbs with highest rental price in 2023': top10_2023.Suburb,'Suburbs with highest rental price in 2024': top10_2024.Suburb, 'Suburbs with highest average growth rate': top10_rate.Suburb})

In [46]:
Top10

Unnamed: 0,Suburbs with highest rental price in 2022,Suburbs with highest rental price in 2023,Suburbs with highest rental price in 2024,Suburbs with highest average growth rate
5,,,,Natimuk
11,Indented Head,,,
23,,Mount Macedon,Mount Macedon,
36,,,,Bundalong
91,,Yarraville,Yarraville,
138,,Eaglemont,Eaglemont,
171,Seaspray,,,
184,Balnarring Beach,,,
189,,,,Portland West
212,Apollo Bay,,,


In [47]:
model.summary()

0,1,2,3
Dep. Variable:,rental_price,R-squared:,0.017
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,3.4
Date:,"Thu, 06 Oct 2022",Prob (F-statistic):,0.0176
Time:,01:35:13,Log-Likelihood:,-4103.6
No. Observations:,579,AIC:,8215.0
Df Residuals:,575,BIC:,8233.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,421.8695,80.652,5.231,0.000,263.461,580.278
population_density,-0.0022,0.007,-0.308,0.758,-0.016,0.012
offence_count_scaled,-254.8225,94.227,-2.704,0.007,-439.894,-69.751
income,0.0024,0.001,1.674,0.095,-0.000,0.005

0,1,2,3
Omnibus:,675.205,Durbin-Watson:,2.039
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52653.04
Skew:,5.59,Prob(JB):,0.0
Kurtosis:,48.36,Cond. No.,456000.0


In [48]:
model_more.summary()

0,1,2,3
Dep. Variable:,rental_price,R-squared:,0.03
Model:,OLS,Adj. R-squared:,0.02
Method:,Least Squares,F-statistic:,2.956
Date:,"Thu, 06 Oct 2022",Prob (F-statistic):,0.00753
Time:,01:35:13,Log-Likelihood:,-4099.8
No. Observations:,579,AIC:,8214.0
Df Residuals:,572,BIC:,8244.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,434.2855,82.708,5.251,0.000,271.838,596.733
population_density,-0.0006,0.007,-0.077,0.939,-0.015,0.014
offence_count_scaled,-237.1860,96.449,-2.459,0.014,-426.623,-47.749
income,0.0026,0.001,1.775,0.076,-0.000,0.006
num_stations,-6.0647,19.984,-0.303,0.762,-45.316,33.186
num_schools,-12.8587,5.313,-2.420,0.016,-23.295,-2.422
num_hospitals,20.2074,15.600,1.295,0.196,-10.432,50.847

0,1,2,3
Omnibus:,670.531,Durbin-Watson:,2.056
Prob(Omnibus):,0.0,Jarque-Bera (JB):,51826.246
Skew:,5.524,Prob(JB):,0.0
Kurtosis:,48.013,Cond. No.,466000.0
