# random forest
This notebook fits a random forest regressor to predict the weekly rental price.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn import metrics
import _pickle as cPickle
import seaborn as sns

In [None]:
# Overall approach: get some data -> process -> model.
# The scraped listings, together with the features that were added to them,
# are what the machine learning models use to help answer the three questions.

# Columns that are taken out of the frame before modelling.
remove = [
    'address', 'suburb', 'postcode', 'url', 'loc_address',
    'lat', 'lon',
    'list_date', 'list_history', 'list_count',
    'SA2_NAME_2016', 'SA2', 'lgaregion',
    'primary_school_name', 'secondary_school_name',
    'pri_lat', 'pri_lon', 'sec_lat', 'sec_lon',
    'closest_ed_name', 'ed_lat', 'ed_lon',
    'train_stop', 'train_lat', 'train_lon',
]

# Load the curated listings, slice off the first CSV column, then discard
# the columns named in `remove`.
listing_path = '../data/curated/full_listing.csv'
df = pd.read_csv(listing_path).iloc[:, 1:].drop(columns=remove)
df.head()

In [None]:
# `type` holds categorical data, so it is converted to a pandas Categorical
# and the integer code of each category is stored in a new `type_code` column
# for the analysis.
df['type'] = pd.Categorical(df['type'])
df['type_code'] = df['type'].cat.codes

In [None]:
# Keep complete rows only: a row is dropped as soon as any of its values is missing.
df = df.dropna(axis=0, how='any')

In [None]:
# Target: the weekly rent.
# Features: every other column except the raw `type` column.
target_column = 'weekly_rent'
y = df[target_column]
x = df.drop(columns=['type', target_column])

In [None]:
# A random forest is used here to predict the rental price.
# We do not think the relationship between the rental price and the features
# is linear, so a random forest is more appropriate; it also lowers the risk
# of overfitting and is efficient on this kind of large dataset.
forest_params = {'n_estimators': 500, 'random_state': 0}
regressor = RandomForestRegressor(**forest_params)

In [None]:
# Train the forest on all rows of x / y (no hold-out split is made in this cell).
regressor.fit(x, y)

In [None]:
# Horizontal bar chart: one bar per feature column, bar length = its importance
# as reported by the fitted forest.
plt.barh(y=x.columns, width=regressor.feature_importances_)

In [None]:
# Write the feature names (first row) and their importances (second row) to
# CSV so they can be exported to Keynote for an aesthetically pleasing plot
# that matches the Keynote theme.
importance_rows = [x.columns, regressor.feature_importances_]
pd.DataFrame(importance_rows).to_csv('../plots/rf_importance.csv')

In [None]:
# The internal features of the property seem to be very important.
# In future, a more complex property dataset could be used, such as the dataset/API provided by CoreLogic.
# It would be able to give more in-depth features for each property,
# such as air conditioning, swimming pool, etc.
# This should help to produce better results.
# https://www.corelogic.com.au
# This proposal was rejected by Calvin at the very start of the project.

In [None]:
# Predict the rent for every row of x.
# NOTE(review): x is the same data that was passed to regressor.fit, so these
# are in-sample predictions and any metric computed from them will look
# optimistic. train_test_split is imported but not used in the visible cells;
# consider evaluating on a held-out split instead.
predictions = regressor.predict(x)

In [None]:
# gt stands for ground truth: the values of y converted to a plain Python list.
# It is the reference the predictions are scored against in the statistics below.
gt = y.tolist()

In [None]:
# Error statistics of the predictions against the ground truth.
# adapted from https://stackoverflow.com/a/58167031
abs_error = metrics.mean_absolute_error(gt, predictions)
sq_error = metrics.mean_squared_error(gt, predictions)
root_sq_error = np.sqrt(sq_error)
# Mean absolute percentage error, kept as a fraction (scaled to % when printed).
mape = np.mean(np.abs((gt - predictions) / np.abs(gt)))
r_squared = metrics.r2_score(gt, predictions)

# 'Accuracy' is reported as 100 % minus the MAPE expressed in percent.
# Every figure is rounded to 4 decimal places.
for label, value in (
    ('Mean Absolute Error (MAE):', abs_error),
    ('Mean Squared Error (MSE):', sq_error),
    ('Root Mean Squared Error (RMSE):', root_sq_error),
    ('Mean Absolute Percentage Error (MAPE):', mape * 100),
    ('Accuracy:', 100*(1 - mape)),
    ('R-squared:', r_squared),
):
    print(label, round(value, 4))

In [None]:
# Serialise the random forest model to disk in binary pickle format.
model_path = '../models/rf_regressor'
with open(model_path, 'wb') as model_file:
    cPickle.dump(regressor, model_file)

In [None]:
# Put the ground truth and the prediction side by side, one row per listing,
# so the stats can be observed.
side_by_side = {
    'gt': y.tolist(),
    'prediction': predictions.tolist(),
}
compare_df = pd.DataFrame(side_by_side)
compare_df.head()

In [None]:
# 1st percentile of the target y (displayed as the cell output).
y.quantile(0.01)

In [None]:
# 99th percentile of the target y (displayed as the cell output).
y.quantile(0.99)

In [None]:
# Trim the extremes: keep only the rows whose ground truth lies between the
# 1st and the 99th percentile of y (both bounds inclusive).
lower_cut = y.quantile(0.01)
upper_cut = y.quantile(0.99)
compare_df = compare_df[compare_df['gt'].between(lower_cut, upper_cut)]

In [None]:
# Smallest predicted value among the rows left in compare_df (displayed as the cell output).
min(compare_df['prediction'])

In [None]:
# Plot comparison between ground truth and prediction: draw a random subset
# of compare_df for the scatter plot.
# The size is capped at the number of rows available, because
# DataFrame.sample raises ValueError when n exceeds the population
# (sampling is without replacement by default).
# A fixed seed makes the plotted subset reproducible between runs.
sample_size = min(3000, len(compare_df))
sample = compare_df.sample(n=sample_size, random_state=0)

In [None]:
# Scatter plot of predicted against actual rent with a fitted regression line.
# Render and save figures at 300 dpi.
sns.set(rc={"figure.dpi": 300, 'savefig.dpi': 300})
# x and y are passed by keyword: seaborn deprecated positional data arguments
# in 0.11 and made them keyword-only in 0.12, where the positional form
# raises TypeError.
sns_plot = sns.regplot(
    x=sample['gt'],
    y=sample['prediction'],
    line_kws={"color": "red"},  # draw the regression line in red
)
sns_plot.set_xlabel('rental price')
sns_plot.set_ylabel('predicted rental price')