In [17]:
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from statsmodels.formula.api import ols

pd.options.mode.chained_assignment = None

suburb = pd.read_csv('../data/raw/suburb_final.csv')
suburb_2023 = pd.read_csv('../data/raw/suburb_2023.csv')
suburb_2024 = pd.read_csv('../data/raw/suburb_2024.csv')


In [20]:
suburb['income'] = suburb['2022_income']
suburb = suburb.fillna(0)
suburb

Unnamed: 0.1,Unnamed: 0,LOC_PID,LOC_NAME,rental_price,suburb_population,population_density,offence_count_scaled,2022_income,num_stations,num_schools,Hospital ID,income
0,0,loc0067a4549ed1,Korumburra,278.333333,4897.000000,81.797573,0.168266,47097.371969,0.0,3.0,0.0,47097.371969
1,1,loc00a9769647d7,Kew,621.281250,26158.000000,2486.279947,0.112317,71097.931084,0.0,11.0,6.0,71097.931084
2,2,loc00d1503504f1,Glen Waverley,561.269841,41928.000000,2489.998587,0.109139,44548.852850,2.0,13.0,2.0,44548.852850
3,3,loc00e6e39d335b,Sailors Falls,450.000000,1057.555556,108.121093,0.000946,46234.245242,0.0,0.0,0.0,46234.245242
4,4,loc00f0949ea0ad,Sunbury,471.617647,39266.294118,297.403254,0.138185,62622.919507,1.0,14.0,1.0,62622.919507
...,...,...,...,...,...,...,...,...,...,...,...,...
719,719,locff6258c8ea42,Montmorency,525.555556,9003.705882,2366.881699,0.053311,66315.679359,1.0,4.0,0.0,66315.679359
720,720,locff62fb6a898a,Carrum Downs,458.823529,22420.000000,1109.905643,0.165076,59654.130953,0.0,4.0,0.0,59654.130953
721,721,locffa1c8993b70,Mambourin,407.352941,5488.609375,154.062304,0.021681,59687.062945,0.0,0.0,0.0,59687.062945
722,722,locffb43e78ab10,Murtoa,300.000000,2133.333333,6.995021,0.048281,50199.145749,0.0,2.0,0.0,50199.145749


In [22]:
from sklearn import model_selection
import statsmodels.api as sm

train, test = model_selection.train_test_split(suburb, test_size=0.2, random_state=1234)
model = sm.formula.ols('rental_price ~ population_density + offence_count_scaled + income', data=train).fit()
print('The partial regression coefficients of the models were: \n', model.params)

test_X = test.drop(labels='rental_price', axis=1)
pred = model.predict(exog=test_X)

print('Comparing the difference between predicted and actual values:\n', pd.DataFrame({'Suburb': test.LOC_NAME, 'Prediction': pred, 'Real':test.rental_price}))


The partial regression coefficients of the models were: 
 Intercept               423.016501
population_density       -0.002393
offence_count_scaled   -253.077554
income                    0.002442
dtype: float64
Comparing the difference between predicted and actual values:
             Suburb  Prediction        Real
21        Watsonia  539.880065  446.428571
587  North Bendigo  497.018331  448.913043
271     Allansford  514.714497   40.000000
519  Wheelers Hill  523.656306  591.428571
489     Heidelberg  503.886878  452.516129
..             ...         ...         ...
227         Yinnar  529.817949  380.000000
288     Hughesdale  534.884148  567.500000
214        Donvale  558.765877  570.937500
594   Mirboo North  527.941126  295.000000
319    Melton West  459.307364  358.235294

[145 rows x 3 columns]


In [23]:
accuracy_list = []
for i in pred:
    for j in test.rental_price:
        if i > j:
            accuracy_list.append(j/i)
        else:
            accuracy_list.append(i/j)
            
accuracy = sum(accuracy_list) / len(accuracy_list)

print('Accuracy of this model is:\n', accuracy)

Accuracy of this model is:
 0.8019375358290044


In [13]:
suburb['num_hospital'] = suburb['Hospital ID']

In [14]:
train, test = model_selection.train_test_split(suburb, test_size=0.2, random_state=1234)
model_more = sm.formula.ols('rental_price ~ population_density + offence_count_scaled + income2022 + num_stations + num_schools + num_hospital', data=train).fit()
print('The partial regression coefficients of the models were: \n', model.params)

test_X = test.drop(labels='rental_price', axis=1)
pred1 = model_more.predict(exog=test_X)

print('Comparing the difference between predicted and actual values:\n', pd.DataFrame({'Suburb': test.LOC_NAME, 'Prediction': pred1, 'Real':test.rental_price}))


The partial regression coefficients of the models were: 
 Intercept               423.016501
population_density       -0.002393
offence_count_scaled   -253.077554
income2022                0.002442
dtype: float64
Comparing the difference between predicted and actual values:
             Suburb  Prediction        Real
21        Watsonia  541.337893  446.428571
587  North Bendigo  500.123346  448.913043
271     Allansford  527.359019   40.000000
519  Wheelers Hill  447.999221  591.428571
489     Heidelberg  585.463386  452.516129
..             ...         ...         ...
227         Yinnar  542.159600  380.000000
288     Hughesdale  529.337580  567.500000
214        Donvale  542.505794  570.937500
594   Mirboo North  524.878177  295.000000
319    Melton West  414.420376  358.235294

[145 rows x 3 columns]


In [15]:
accuracy_list1 = []
for i in pred1:
    for j in test.rental_price:
        if i > j:
            accuracy_list1.append(j/i)
        else:
            accuracy_list1.append(i/j)
            
accuracy = sum(accuracy_list1) / len(accuracy_list1)

print('Accuracy of this external model is:\n', accuracy)

Accuracy of this external model is:
 0.7995242096382887


In [26]:
sub_2023 = suburb_2023.drop(labels='rental_price', axis=1)
pred_2023 = model.predict(exog=sub_2023)

print('Prediction of rental price in 2023\n', pd.DataFrame({'Suburb': suburb_2023.LOC_NAME, 'Prediction': pred_2023}))


             Suburb  Prediction
0       Korumburra  498.275533
1              Kew  566.718162
2    Glen Waverley  499.942462
3    Sailors Falls  539.013490
4          Sunbury  544.771012
..             ...         ...
719    Montmorency  571.044261
720   Carrum Downs  528.821150
721      Mambourin  566.615523
722         Murtoa  537.328962
723        Chelsea  542.423649

[724 rows x 2 columns]


In [27]:
sub_2024 = suburb_2024.drop(labels='rental_price', axis=1)
pred_2024 = model.predict(exog=sub_2024)

print('Prediction of rental price in 2024\n', pd.DataFrame({'Suburb': suburb_2024.LOC_NAME, 'Prediction': pred_2024}))

Prediction of rental price in 2024
             Suburb  Prediction
0       Korumburra  501.357941
1              Kew  571.249636
2    Glen Waverley  501.663945
3    Sailors Falls  542.694195
4          Sunbury  549.383608
..             ...         ...
719    Montmorency  576.418613
720   Carrum Downs  533.494343
721      Mambourin  570.377067
722         Murtoa  541.392188
723        Chelsea  548.710772

[724 rows x 2 columns]


In [30]:
price_pred = pd.DataFrame({'Suburb': suburb.LOC_NAME, 'Rental_price_now': suburb.rental_price,'Prediction_2023': pred_2023, 'Prediction_2024': pred_2024})

In [35]:
growth_rate_2023 = (price_pred.Prediction_2023 - price_pred.Rental_price_now) / price_pred.Rental_price_now
growth_rate_2024 = (price_pred.Prediction_2024 - price_pred.Prediction_2023) / price_pred.Prediction_2023
price_pred['avg_growth_rate'] = (growth_rate_2023 + growth_rate_2024) /2 



In [37]:
price_pred.to_csv('../data/raw/Prediction of rental price.csv')

In [57]:
top10_2022 = price_pred.sort_values(['Rental_price_now'],ascending=False).head(10)[['Suburb','Rental_price_now']]
top10_2023 = price_pred.sort_values(['Prediction_2023'],ascending=False).head(10)[['Suburb','Prediction_2023']]
top10_2024 = price_pred.sort_values(['Prediction_2024'],ascending=False).head(10)[['Suburb','Prediction_2024']]
top10_rate = price_pred.sort_values(['avg_growth_rate'],ascending=False).head(10)

In [58]:
top10_2022

Unnamed: 0,Suburb,Rental_price_now
630,Skenes Creek,3850.0
478,Merriang,2800.0
263,Flinders,2425.0
171,Seaspray,2250.0
586,Marengo,2100.0
184,Balnarring Beach,1995.0
346,Myrtleford,1788.571429
212,Apollo Bay,1657.5
11,Indented Head,1589.0
239,Cape Bridgewater,1540.0


In [59]:
top10_2023

Unnamed: 0,Suburb,Prediction_2023
395,South Kingsville,603.889764
542,Newport,603.456768
499,Ivanhoe East,598.640658
138,Eaglemont,597.32923
23,Mount Macedon,597.150605
364,Spotswood,595.92914
362,Toorak,594.777284
91,Yarraville,591.520312
491,Glen Iris,589.039356
9,Williamstown North,586.521086


In [60]:
top10_2024

Unnamed: 0,Suburb,Prediction_2024
395,South Kingsville,612.241081
542,Newport,611.808085
23,Mount Macedon,604.858393
499,Ivanhoe East,604.737573
364,Spotswood,604.280457
138,Eaglemont,603.426146
362,Toorak,600.123473
91,Yarraville,598.425009
491,Glen Iris,594.873026
9,Williamstown North,591.721608


In [61]:
top10_rate

Unnamed: 0,Suburb,Rental_price_now,Prediction_2023,Prediction_2024,avg_growth_rate
271,Allansford,40.0,517.827564,521.016871,5.975924
36,Bundalong,175.0,537.686861,542.493035,1.040717
5,Natimuk,200.0,559.183593,565.076277,0.903228
569,Nichols Point,195.0,540.987025,546.473404,0.892217
607,Watchem,200.0,535.649688,539.26658,0.8425
226,Kadnook,230.0,576.339712,584.299659,0.759818
189,Portland West,220.0,541.00503,543.826622,0.732165
713,Ouyen,220.0,537.077829,541.486014,0.724735
268,Trafalgar,221.666667,529.432501,532.237379,0.696858
500,Penshurst,230.0,542.587647,547.498273,0.684064


In [65]:
Top10 = pd.DataFrame({'Suburbs with highest rental price in 2022': top10_2022.Suburb, 'Suburbs with highest rental price in 2023': top10_2023.Suburb,'Suburbs with highest rental price in 2024': top10_2024.Suburb, 'Suburbs with highest average growth rate': top10_rate.Suburb})

In [63]:
Top10

Unnamed: 0,Suburbs with highest rental price in 2022,Suburbs with highest rental price in 2023,Suburbs with highest rental price in 2024,Suburbs with highest average growth rate
5,,,,Natimuk
9,,Williamstown North,Williamstown North,
11,Indented Head,,,
23,,Mount Macedon,Mount Macedon,
36,,,,Bundalong
91,,Yarraville,Yarraville,
138,,Eaglemont,Eaglemont,
171,Seaspray,,,
184,Balnarring Beach,,,
189,,,,Portland West


In [16]:
model.summary()

0,1,2,3
Dep. Variable:,rental_price,R-squared:,0.019
Model:,OLS,Adj. R-squared:,0.013
Method:,Least Squares,F-statistic:,3.623
Date:,"Mon, 03 Oct 2022",Prob (F-statistic):,0.013
Time:,16:43:28,Log-Likelihood:,-4103.2
No. Observations:,579,AIC:,8214.0
Df Residuals:,575,BIC:,8232.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,423.0165,80.589,5.249,0.000,264.732,581.301
population_density,-0.0024,0.007,-0.336,0.737,-0.016,0.012
offence_count_scaled,-253.0776,89.584,-2.825,0.005,-429.029,-77.126
income2022,0.0024,0.001,1.677,0.094,-0.000,0.005

0,1,2,3
Omnibus:,674.23,Durbin-Watson:,2.04
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52415.788
Skew:,5.577,Prob(JB):,0.0
Kurtosis:,48.258,Cond. No.,437000.0


In [9]:
model_more.summary()

0,1,2,3
Dep. Variable:,rental_price,R-squared:,0.031
Model:,OLS,Adj. R-squared:,0.021
Method:,Least Squares,F-statistic:,3.049
Date:,"Mon, 03 Oct 2022",Prob (F-statistic):,0.00607
Time:,16:42:42,Log-Likelihood:,-4099.5
No. Observations:,579,AIC:,8213.0
Df Residuals:,572,BIC:,8244.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,435.5238,82.673,5.268,0.000,273.144,597.903
population_density,-0.0008,0.007,-0.108,0.914,-0.015,0.014
offence_count_scaled,-235.4090,91.687,-2.568,0.010,-415.494,-55.324
income2022,0.0026,0.001,1.773,0.077,-0.000,0.006
num_stations,-5.5397,19.982,-0.277,0.782,-44.787,33.708
num_schools,-12.8071,5.311,-2.411,0.016,-23.239,-2.375
num_hospital,20.1881,15.575,1.296,0.195,-10.404,50.780

0,1,2,3
Omnibus:,669.609,Durbin-Watson:,2.057
Prob(Omnibus):,0.0,Jarque-Bera (JB):,51604.659
Skew:,5.511,Prob(JB):,0.0
Kurtosis:,47.917,Cond. No.,447000.0
