In [None]:
# Find the fastest-growing suburbs
# using past rent changes and population statistics

In [None]:
import pandas as pd
from ast import literal_eval
import statistics
from tqdm import tqdm
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt
from sklearn import metrics
import seaborn as sns

In [None]:
# Load the curated listing table (path is relative to the notebook's directory).
listing_csv = '../data/curated/full_listing.csv'
df = pd.read_csv(listing_csv)

In [None]:
# Columns used to derive change rates: the listing address, its `list_history`,
# and a (current, projected) population column pair per age group.
# The order matters: a later cell walks columns 2.. in consecutive pairs by position.
keep_columns = [
    'address', 'list_history',
    'population_total', 'proj_population_total',
    'population_children', 'proj_population_children',
    'population_early_working', 'proj_population_early_working',
    'population_mature_working', 'proj_population_mature_working',
    'population_prime_working', 'proj_population_prime_working',
    'population_elderly', 'proj_population_elderly',
]
df = df.loc[:, keep_columns]

In [None]:
# Dimensions of the trimmed frame. `df.shape` also works when the frame has no
# rows, where indexing `df.iloc[0]` to count columns would raise IndexError.
n_rows, n_cols = df.shape

In [None]:
# Show how many rows the trimmed frame has.
n_rows

In [None]:
# Build one row of yearly change rates per listing:
#   [rent change, then one population change per (current, projected) column pair]
# Original note: increase rate = (newer rent - older rent) / older amount / year difference
# NOTE(review): the code only averages entry[1] / entry[0] over `list_history`, so each
# entry presumably already holds (year difference, relative rent change) -- TODO confirm.
result = []
for i in tqdm(range(n_rows)):
    row = df.iloc[i]  # fetch the row once instead of re-slicing the frame for every cell
    record_list = literal_eval(row['list_history'])
    if len(record_list) > 0:
        rent_change = statistics.mean(entry[1] / entry[0] for entry in record_list)
    else:
        # Keep the rent slot (as NaN) so every row has the same length. Without it the
        # population rates shift one position left when the rows are turned into a frame
        # and end up under the wrong column names.
        rent_change = np.nan
    changes = [rent_change]
    # Columns 2.. are consecutive (current, projected) pairs. Read them by position via
    # .iloc: integer keys in `Series[...]` are deprecated as positional lookups.
    for j in range(2, n_cols, 2):
        current = row.iloc[j]
        projected = row.iloc[j + 1]
        # NOTE(review): the divisor 5 is presumably the number of years between the
        # current and projected population figures -- TODO confirm.
        changes.append((projected - current) / current / 5)
    result.append(changes)

In [None]:
# Re-read the full table (the earlier frame was trimmed to a few columns) and
# skip its first column by position.
# NOTE(review): the first column is presumably a saved index -- TODO confirm.
df = (
    pd.read_csv('../data/curated/full_listing.csv')
    .iloc[:, 1:]
)

In [None]:
# Remove the raw history/population columns (everything in keep_columns except
# 'address'); their derived change rates are attached in a later cell.
df = df.drop(columns=keep_columns[1:])

In [None]:
# Turn the per-listing change rates into a frame; the names follow the order in
# which the rates were appended to each row of `result`.
change_columns = [
    'c_rent',
    'c_total_population',
    'c_children',
    'c_early',
    'c_mature',
    'c_prime',
    'c_elderly',
]
df2 = pd.DataFrame(result, columns=change_columns)

In [None]:
# Attach the change-rate columns to the listing attributes, side by side
# (rows are matched on the index of the two frames).
result_df = pd.concat((df, df2), axis='columns')

In [None]:
# Columns excluded from modelling: identifiers, free-text names and coordinates.
remove = [
    'address', 'postcode', 'url', 'loc_address',
    'lat', 'lon', 'list_date',
    'SA2_NAME_2016', 'SA2', 'lgaregion',
    'primary_school_name', 'secondary_school_name',
    'pri_lat', 'pri_lon', 'sec_lat', 'sec_lon',
    'closest_ed_name', 'ed_lat', 'ed_lon',
    'train_stop', 'train_lat', 'train_lon',
]
result_df = result_df.drop(columns=remove)

In [None]:
# Encode the listing type as integer category codes for the model.
# Use bracket access on result_df itself: attribute-style assignment only works
# for columns that already exist, and reading the values from `df` would break
# on a re-run because the row counts differ once dropna() has removed rows.
result_df['type'] = pd.Categorical(result_df['type'])
result_df['type_code'] = result_df['type'].cat.codes
# Drop every listing that still has a missing value in any column.
result_df = result_df.dropna()

In [None]:
# Split into model inputs and target: the rent change rate is the target; the raw
# 'type' (already encoded as type_code) and 'suburb' columns are not used as features.
target_column = 'c_rent'
non_feature_columns = ['type', 'suburb', target_column]
X = result_df.drop(columns=non_feature_columns)
y = result_df[target_column]

In [None]:
# Drop rows containing any missing value. X and y were extracted in the previous
# cell and are not recomputed here, so this only affects result_df.
result_df = result_df.dropna(axis=0, how='any')

In [None]:
# Random forest of 500 trees with a fixed seed, fitted on all available rows.
forest = RandomForestRegressor(n_estimators=500, random_state=0)
reg = forest.fit(X, y)  # fit() returns the fitted estimator itself

In [None]:
# Score the model against the observed rent change rates.
# NOTE(review): predictions are made on the same X the model was fitted on, so
# these are in-sample figures, not an estimate of out-of-sample error.
predictions = reg.predict(X)
gt = y.tolist()

mae = metrics.mean_absolute_error(gt, predictions)
mse = metrics.mean_squared_error(gt, predictions)
rmse = np.sqrt(mse)
# Mean absolute percentage error, as a fraction.
# NOTE(review): a target of exactly 0 makes this division blow up -- verify the data.
mape = np.mean(np.abs((gt - predictions) / np.abs(gt)))
r2 = metrics.r2_score(gt, predictions)

print('Mean Absolute Error (MAE):', round(mae, 4))
print('Mean Squared Error (MSE):', round(mse, 4))
print('Root Mean Squared Error (RMSE):', round(rmse, 4))
print('Mean Absolute Percentage Error (MAPE):', round(mape * 100, 4))
print('Accuracy:', round(100 * (1 - mape), 4))
print('R-squared:', round(r2, 4))

In [None]:
# Pair every observed target value with the model's prediction for the same row.
compare_df = pd.DataFrame({'gt': y.tolist(), 'prediction': predictions.tolist()})
# Keep only targets between the 1st and 99th percentile (bounds inclusive) so
# extreme values do not dominate the plot.
lower, upper = y.quantile(0.01), y.quantile(0.99)
compare_df = compare_df[compare_df['gt'].between(lower, upper)]
# Plot at most 3000 points. Cap at the rows available so small data sets do not
# raise, and seed the draw so the figure is reproducible.
sample = compare_df.sample(n=min(3000, len(compare_df)), random_state=0)

sns.set(rc={"figure.dpi": 300, 'savefig.dpi': 300})
# x and y must be passed by keyword: recent seaborn releases reject them as
# positional arguments.
sns_plot = sns.regplot(x=sample['gt'], y=sample['prediction'], line_kws={"color": "red"})
# NOTE(review): the plotted values are the 'c_rent' change rates, so these
# "rental price" labels look mislabelled -- confirm before publishing.
sns_plot.set_xlabel('rental price')
sns_plot.set_ylabel('predicted rental price')

In [None]:
# Average every numeric column per suburb. numeric_only=True is required because
# result_df still carries the categorical 'type' column, which pandas >= 2.0
# refuses to average (older versions silently dropped such columns).
rank_df = result_df.groupby('suburb').mean(numeric_only=True)

In [None]:
# Give every suburb the same listing attributes so that the predictions differ
# only through the suburb-level features.
# NOTE(review): 2 bed / 1 bath / 1 car / type code 4 is presumably a "typical"
# listing -- confirm which property type code 4 corresponds to.
fixed_listing = {'bed': 2, 'bath': 1, 'car': 1, 'type_code': 4}
for column_name, constant in fixed_listing.items():
    rank_df[column_name] = constant

In [None]:
# Suburb-level feature matrix: everything except the target, complete rows only.
x = rank_df.drop(columns='c_rent').dropna()
# Predict a change rate for each suburb and rank suburbs from highest to lowest.
predicted_growth = reg.predict(x)
prediction = pd.DataFrame({'suburb': x.index.tolist(), 'grow': predicted_growth})
prediction = prediction.sort_values(by='grow', ascending=False)

In [None]:
# Listings per suburb: non-null count of the first remaining column of `df`.
listing_counts = df.groupby('suburb').count().iloc[:, 0]
stats_df = prediction.merge(listing_counts, on='suburb')
stats_df.columns = ['suburb', 'growth_rate', 'n_listing']
# Keep only suburbs at or above the 25th percentile of listing counts, so that
# suburbs backed by very few listings do not appear in the ranking.
min_listing = stats_df['n_listing'].quantile(0.25)
enough_listings = stats_df['n_listing'] >= min_listing
stats_df = stats_df[enough_listings]

In [None]:
# First ten suburbs of the ranking (rows keep the descending predicted-growth order).
stats_df.iloc[:10]

In [None]:
# Horizontal bar chart of the forest's importance score for each feature.
feature_names = x.columns
importances = reg.feature_importances_
plt.barh(feature_names, importances)

In [None]:
# Five most important features, highest importance first.
# NOTE(review): the column key 'feature_imporance' is misspelled ("importance");
# kept unchanged here so the displayed output stays the same.
importance_table = pd.DataFrame({
    'feature_name': x.columns,
    'feature_imporance': reg.feature_importances_,
})
importance_table.sort_values(by='feature_imporance', ascending=False).head()