This notebook implements and evaluates different models for:
- Predicting the yearly median rent given suburb, year and relevant suburban features,
- Listing the top 10 suburbs with the highest predicted growth rate,
- Figuring out the most liveable and affordable suburbs according to some metrics,
- Predicting the rental price of a property given its attributes.

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [2]:
# Read the median rent data
yearly_median = pd.read_csv('../data/curated/yearly_median_rent_by_suburb.csv')
yearly_median.head()

Unnamed: 0,suburb,year,median_rent,best_school_number_vce_subjects,best_school_satisfactory_complete_vce_percent,best_school_median_study_score,best_school_study_score_over_40_percent,best_school_percentage_applying_to_victorian_uni,best_school_school_name,avg_school_number_vce_subjects,...,b60_bribery,total_crimes,total_area,developed_area,built_percentage,urban_percentage,disturbed_percentage,water_percentage,developed_percentage,nature_percentage
0,Albert Park-Middle Park-West St Kilda,2002,300,57.0,99.0,31.0,6.3,83.0,albert park college,57.0,...,0.0,2586.379367,7043125.0,4691875.0,0.032212,0.595794,0.038158,0.047387,0.666164,0.28645
1,Armadale,2002,233,30.0,100.0,34.0,19.8,97.0,lauriston girls school,30.0,...,0.0,605.270037,2181875.0,2112500.0,0.026353,0.934403,0.007448,0.0,0.968204,0.031796
2,Carlton North,2002,290,43.0,99.0,31.0,0.0,78.0,,42.583333,...,0.0,780.000033,2305625.0,1977500.0,0.007861,0.843589,0.006235,0.0,0.857685,0.142315
3,Carlton-Parkville,2002,270,61.0,97.0,31.0,13.6,91.0,university high school,61.0,...,0.0,1749.415541,5855625.0,4111875.0,0.066816,0.612766,0.022628,0.0,0.702209,0.297791
4,CBD-St Kilda Rd,2002,320,9.0,100.0,29.0,5.7,54.0,holmes grammar school,9.0,...,0.0,10326.999768,2367500.0,2187500.0,0.579461,0.287223,0.057286,0.007656,0.92397,0.068374


In [3]:
# Normalize the numeric features
scaler = MinMaxScaler()
numeric_cols = yearly_median.select_dtypes(include=['float64', 'int64']).columns
yearly_median[numeric_cols] = scaler.fit_transform(yearly_median[numeric_cols])
yearly_median.head()

Unnamed: 0,suburb,year,median_rent,best_school_number_vce_subjects,best_school_satisfactory_complete_vce_percent,best_school_median_study_score,best_school_study_score_over_40_percent,best_school_percentage_applying_to_victorian_uni,best_school_school_name,avg_school_number_vce_subjects,...,b60_bribery,total_crimes,total_area,developed_area,built_percentage,urban_percentage,disturbed_percentage,water_percentage,developed_percentage,nature_percentage
0,Albert Park-Middle Park-West St Kilda,0.0,0.30303,0.767123,0.96,0.529412,0.137555,0.773333,albert park college,0.859375,...,0.0,0.205757,0.002199,0.082507,0.05545,0.615293,0.174042,0.321817,0.667721,0.285611
1,Armadale,0.0,0.206349,0.39726,1.0,0.705882,0.432314,0.96,lauriston girls school,0.4375,...,0.0,0.033086,0.000312,0.018526,0.045338,0.969952,0.03397,0.0,0.974606,0.025526
2,Carlton North,0.0,0.2886,0.575342,0.96,0.529412,0.0,0.706667,,0.634115,...,0.0,0.048316,0.00036,0.015177,0.01342,0.874834,0.028437,0.0,0.862314,0.138402
3,Carlton-Parkville,0.0,0.25974,0.821918,0.88,0.529412,0.296943,0.88,university high school,0.921875,...,0.0,0.132808,0.001738,0.06812,0.115176,0.633069,0.103208,0.0,0.704344,0.297194
4,CBD-St Kilda Rd,0.0,0.33189,0.109589,1.0,0.411765,0.124454,0.386667,holmes grammar school,0.109375,...,0.0,0.880418,0.000384,0.020386,1.0,0.292095,0.261288,0.051993,0.929663,0.062884


In [4]:
# Compute the Pearson's correlation between 'median_rent' and the other numeric features (excluding 'suburb')
# Assumptions: linearity, homoscedasticity
pearson_corr = yearly_median[numeric_cols].corr()['median_rent'].sort_values(ascending=False)
pearson_corr

median_rent                                            1.000000
year                                                   0.730696
developed_percentage                                   0.412972
urban_percentage                                       0.371749
avg_school_percentage_applying_to_victorian_uni        0.338044
avg_school_median_study_score                          0.311855
best_school_percentage_applying_to_victorian_uni       0.302528
zoned_school_study_score_over_40_percent               0.275304
best_school_median_study_score                         0.263954
zoned_school_percentage_applying_to_victorian_uni      0.244467
built_percentage                                       0.213614
avg_school_satisfactory_complete_vce_percent           0.192729
best_school_satisfactory_complete_vce_percent          0.187115
avg_school_study_score_over_40_percent                 0.182035
zoned_school_number_vce_subjects                       0.153939
best_school_study_score_over_40_percent 

In [5]:
# Build a model to predict the yearly median rent given suburb, year and relevant suburban features
le = LabelEncoder()
X = yearly_median[['suburb'] + list(numeric_cols)]
X['suburb'] = le.fit_transform(X['suburb'])
y = yearly_median['median_rent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30034)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['suburb'] = le.fit_transform(X['suburb'])


In [6]:
# Fit a baseline model
# In this case, choose a suburb-wise mean baseline
suburb_mean = yearly_median.groupby('suburb')['median_rent'].mean()
y_pred = X_test['suburb'].apply(lambda x: suburb_mean[x])
baseline_mse = mean_squared_error(y_test, y_pred)
print(f'Baseline MSE: {baseline_mse}')

Baseline MSE: 0.014167800924229788


  y_pred = X_test['suburb'].apply(lambda x: suburb_mean[x])


In [7]:
# Fit a baseline model
# In this case, choose a year-wise mean baseline
year_mean = yearly_median.groupby('year')['median_rent'].mean()
y_pred = X_test['year'].apply(lambda x: year_mean[x])
baseline_mse = mean_squared_error(y_test, y_pred)
print(f'Baseline MSE: {baseline_mse}')

Baseline MSE: 0.011028198602621277


In [8]:
# Fit a linear regression model
LR = LinearRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

Mean squared error: 3.2996718483716504e-28


In [9]:
# Read the current listings data
domain_current = pd.read_parquet('../data/landing/cleaned_domain_current_listings.parquet')
domain_current.head()

Unnamed: 0,bedrooms,bathrooms,car_parks,url,name,cost_text,type,latitude,longitude,suburb,postcode,rent_pw
0,2,1,1,https://www.domain.com.au/705-8-marmion-place-...,"705/8 Marmion Place, Docklands VIC 3008",$600 per week,Apartment / Unit / Flat,-37.8134708,144.9424794,Docklands VIC 3008,3008,600
1,3,2,2,https://www.domain.com.au/5-18-20-ibbottson-st...,"5/18-20 Ibbottson Street, Watsonia VIC 3087",$650 Per Week,Townhouse,-37.70987239999999,145.0844928,Watsonia VIC 3087,3087,650
2,3,2,1,https://www.domain.com.au/2109-35-malcolm-st-s...,"2109/35 Malcolm St, South Yarra VIC 3141","$1,150/week",Apartment / Unit / Flat,-37.8369683,144.9964622,South Yarra VIC 3141,3141,1150
3,2,1,1,https://www.domain.com.au/4-17a-the-esplanade-...,"4/17a The Esplanade, Geelong VIC 3220",$475.00 per week,Apartment / Unit / Flat,-38.1345686,144.3548803,Geelong VIC 3220,3220,475
4,3,2,2,https://www.domain.com.au/501-446-malvern-road...,"501/446 Malvern Road, Prahran VIC 3181",$2200 Per Week,Apartment / Unit / Flat,-37.8479885,145.0012197,Prahran VIC 3181,3181,2200


In [10]:
# Compute the Pearson's correlation between 'rent_pw' and the other numeric features
# Assumptions: linearity, homoscedasticity
pearson_corr = domain_current[['bedrooms', 'bathrooms', 'car_parks', 'rent_pw']].corr()['rent_pw'].sort_values(ascending=False)
pearson_corr

rent_pw      1.000000
bathrooms    0.100481
bedrooms     0.085069
car_parks    0.068947
Name: rent_pw, dtype: float64

In [11]:
# Build a model to predict the rental price of a property given its attributes
# including the postcode, type, number of bedrooms, number of bathrooms, and number of parking spaces
le = LabelEncoder()
X = domain_current[['postcode', 'type', 'bedrooms', 'bathrooms', 'car_parks']]
X['postcode'] = le.fit_transform(X['postcode'])
X['type'] = le.fit_transform(X['type'])
y = domain_current['rent_pw']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30034)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['postcode'] = le.fit_transform(X['postcode'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['type'] = le.fit_transform(X['type'])


In [12]:
# Use the dummy regressor as a baseline
from sklearn.dummy import DummyRegressor
DR = DummyRegressor(strategy='mean')
DR.fit(X_train, y_train)
y_pred = DR.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

Mean squared error: 2333467.754144234


In [13]:
# Fit a K-nearest neighbors regressor
from sklearn.neighbors import KNeighborsRegressor
KNR = KNeighborsRegressor(n_neighbors=5)
KNR.fit(X_train, y_train)
y_pred = KNR.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

Mean squared error: 1440777.9509677421
