This notebook implements and evaluates different models for predicting the yearly growth rate of the median rent given suburb and year, and the rental price of a property given its attributes.

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Read the growth rate data
yearly_growth = pd.read_csv('../data/landing/yearly_growth_rate_of_median_rent_by_suburb.csv')
yearly_growth.head()

Unnamed: 0,suburb,year,growth_rate
0,Albert Park-Middle Park-West St Kilda,2003,-1.666667
1,Armadale,2003,0.858369
2,Carlton North,2003,0.0
3,Carlton-Parkville,2003,3.703704
4,CBD-St Kilda Rd,2003,-3.125


In [3]:
# Build a model to predict the yearly growth rate of the median rent given suburb and year
le = LabelEncoder()
X = yearly_growth[['suburb', 'year']]
X['suburb'] = le.fit_transform(X['suburb'])
y = yearly_growth['growth_rate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30034)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['suburb'] = le.fit_transform(X['suburb'])


In [4]:
# Fit a baseline model
# In this case, choose a suburb-wise mean baseline
suburb_mean = yearly_growth.groupby('suburb')['growth_rate'].mean()
y_pred = X_test['suburb'].apply(lambda x: suburb_mean[x])
baseline_mse = mean_squared_error(y_test, y_pred)
print(f'Baseline MSE: {baseline_mse}')

Baseline MSE: 18.884595292333167


  y_pred = X_test['suburb'].apply(lambda x: suburb_mean[x])


In [5]:
# Fit a baseline model
# In this case, choose a year-wise mean baseline
year_mean = yearly_growth.groupby('year')['growth_rate'].mean()
y_pred = X_test['year'].apply(lambda x: year_mean[x])
baseline_mse = mean_squared_error(y_test, y_pred)
print(f'Baseline MSE: {baseline_mse}')

Baseline MSE: 11.318108840861491


In [6]:
# Fit a linear regression model
from sklearn.linear_model import LinearRegression
LR = LinearRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

Mean squared error: 19.39025083623132


Since the linear regressor assumes linearities in the data, based on the MSEs some non-linearities in the data are expected.

In [7]:
# Fit a polynomial regression model
degree = 3
PR = make_pipeline(PolynomialFeatures(degree), LinearRegression())
PR.fit(X_train, y_train)
y_pred = PR.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

# Predict the growth rate for each suburb in 2024
suburbs = le.fit_transform(yearly_growth['suburb'].unique())
years = [2024] * len(suburbs)
X_pred = pd.DataFrame({'suburb': suburbs, 'year': years})
y_pred = PR.predict(X_pred)

# The top 10 suburbs with the highest predicted growth rate in 2024
suburbs = le.inverse_transform(suburbs)
growth_rate = pd.DataFrame({'suburb': suburbs, 'growth_rate': y_pred})
growth_rate = growth_rate.sort_values('growth_rate', ascending=False)
growth_rate.head(10)

Mean squared error: 15.62106961630631


Unnamed: 0,suburb,growth_rate
63,Yarraville-Seddon,10.759485
96,Yarra Ranges,10.737153
145,Wodonga,10.714817
62,Williamstown,10.692479
88,Whittlesea,10.670141
61,West Footscray,10.647803
76,West Brunswick,10.625468
60,Werribee-Hoppers Crossing,10.603136
120,Wendouree-Alfredton,10.58081
144,Warrnambool,10.55849


In [8]:
# Fit a random forest regressor
RFR = RandomForestRegressor(n_estimators=100, random_state=30034)
RFR.fit(X_train, y_train)
y_pred = RFR.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

# Predict the growth rate for each suburb in 2024
suburbs = le.fit_transform(yearly_growth['suburb'].unique())
years = [2024] * len(suburbs)
X_pred = pd.DataFrame({'suburb': suburbs, 'year': years})
y_pred = RFR.predict(X_pred)

# The top 10 suburbs with the highest predicted growth rate in 2024
suburbs = le.inverse_transform(suburbs)
growth_rate = pd.DataFrame({'suburb': suburbs, 'growth_rate': y_pred})
growth_rate = growth_rate.sort_values('growth_rate', ascending=False)
growth_rate.head(10)

Mean squared error: 16.473171092781257


Unnamed: 0,suburb,growth_rate
28,Canterbury-Surrey Hills-Mont Albert,31.710033
19,Southbank,28.986747
2,Carlton North,28.173828
27,Camberwell-Glen Iris,28.125757
4,CBD-St Kilda Rd,27.026678
3,Carlton-Parkville,25.120764
43,Carnegie,24.778679
6,Docklands,22.382418
18,South Yarra,20.411094
44,Caulfield,20.045454


In [9]:
# Read the current listings data
domain_current = pd.read_parquet('../data/landing/cleaned_domain_current_listings.parquet')
domain_current.head()

Unnamed: 0,bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb,postcode,rent_pw
0,2,1,1,https://www.domain.com.au/705-8-marmion-place-...,"705/8 Marmion Place, Docklands VIC 3008",$600 per week,Apartment / Unit / Flat,-37.8134708,144.9424794,Docklands VIC 3008,3008,600
1,3,2,2,https://www.domain.com.au/5-18-20-ibbottson-st...,"5/18-20 Ibbottson Street, Watsonia VIC 3087",$650 Per Week,Townhouse,-37.70987239999999,145.0844928,Watsonia VIC 3087,3087,650
2,3,2,1,https://www.domain.com.au/2109-35-malcolm-st-s...,"2109/35 Malcolm St, South Yarra VIC 3141","$1,150/week",Apartment / Unit / Flat,-37.8369683,144.9964622,South Yarra VIC 3141,3141,1150
3,2,1,1,https://www.domain.com.au/4-17a-the-esplanade-...,"4/17a The Esplanade, Geelong VIC 3220",$475.00 per week,Apartment / Unit / Flat,-38.1345686,144.3548803,Geelong VIC 3220,3220,475
4,3,2,2,https://www.domain.com.au/501-446-malvern-road...,"501/446 Malvern Road, Prahran VIC 3181",$2200 Per Week,Apartment / Unit / Flat,-37.8479885,145.0012197,Prahran VIC 3181,3181,2200


In [10]:
# Build a model to predict the rental price of a property given its attributes
# including the postcode, type, number of bedrooms, number of bathrooms, and number of parking spaces
le = LabelEncoder()
X = domain_current[['postcode', 'type', 'bedrooms', 'bathrooms', 'car_parks']]
X['postcode'] = le.fit_transform(X['postcode'])
X['type'] = le.fit_transform(X['type'])
y = domain_current['rent_pw']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30034)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['postcode'] = le.fit_transform(X['postcode'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['type'] = le.fit_transform(X['type'])


In [11]:
# Use the dummy regressor as a baseline
from sklearn.dummy import DummyRegressor
DR = DummyRegressor(strategy='mean')
DR.fit(X_train, y_train)
y_pred = DR.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

Mean squared error: 2333467.754144234


In [12]:
# Fit a K-nearest neighbors regressor
from sklearn.neighbors import KNeighborsRegressor
KNR = KNeighborsRegressor(n_neighbors=5)
KNR.fit(X_train, y_train)
y_pred = KNR.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

Mean squared error: 1440777.9509677421
