# Rental Price Predictions and Growth Rates

### Importing Libraries and Functions

In [61]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

### Reading in the models

In [62]:
model_read_dir = '../models/'

# Load in saved models for Greater Melbourne 
with open(f"{model_read_dir}gm_lr.pkl", 'rb') as file:
    gm_lr_model = pickle.load(file)

with open(f"{model_read_dir}gm_rf.pkl", 'rb') as file:
    gm_rf_model = pickle.load(file)
    
# Load in saved models for Rest of Vic
with open(f"{model_read_dir}rv_lr.pkl", 'rb') as file:
    rv_lr_model = pickle.load(file)

with open(f"{model_read_dir}rv_rf.pkl", 'rb') as file:
    rv_rf_model = pickle.load(file)

In [63]:
# Ensure the models have been correctly loaded in

print(f"Greater Melbourne Linear Regression Type: {type(gm_lr_model)}") 
print(f"Greater Melbourne Random Forest Type: {type(gm_rf_model)}") 
print(f"Rest of Vic Linear Regression Type: {type(rv_lr_model)}") 
print(f"Rest of Vic Random Forest Type: {type(rv_rf_model)}") 

Greater Melbourne Linear Regression Type: <class 'sklearn.linear_model._base.LinearRegression'>
Greater Melbourne Random Forest Type: <class 'sklearn.ensemble._forest.RandomForestRegressor'>
Rest of Vic Linear Regression Type: <class 'sklearn.linear_model._base.LinearRegression'>
Rest of Vic Random Forest Type: <class 'sklearn.ensemble._forest.RandomForestRegressor'>


### Reading in Data for Predictions

In [64]:
# Read in prediction data and their identifiers for later
forecast_read_dir = '../data/curated/forecast/'

gm_df = pd.read_csv(f"{forecast_read_dir}greater_melbourne_predict.csv")
gm_identifiers_df = pd.read_csv(f"{forecast_read_dir}gm_predict_identifiers.csv")

rv_df = pd.read_csv(f"{forecast_read_dir}rest_of_vic_predict.csv")
rv_identifiers_df = pd.read_csv(f"{forecast_read_dir}rv_predict_identifiers.csv")

### Scaling the Datasets

In [65]:
# Apply necessary transformations on test dataset 

# Linear Regression Transformations for Greater Melbourne and Rest of Vic in the same order as 
# modelling_properties.ipynb

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
scaler = StandardScaler()

gm_lr_np = scaler.fit_transform(gm_df)
gm_lr_np = poly.fit_transform(gm_lr_np)

rv_lr_np = poly.fit_transform(rv_df)
rv_lr_np = scaler.fit_transform(rv_lr_np)

# Random Forest Transformation - only StandardScaler 
gm_rf_np = scaler.fit_transform(gm_df)
rv_rf_np = scaler.fit_transform(rv_df)

### Predicting Future Prices

In [66]:
# Predict forecasted data

gm_lr_predictions = gm_lr_model.predict(gm_lr_np)
gm_rf_predictions = gm_rf_model.predict(gm_rf_np)

rv_lr_predictions = rv_lr_model.predict(rv_lr_np)
rv_rf_predictions = rv_rf_model.predict(rv_rf_np)

In [67]:
# Convert predictions to dataframe type
gm_lr_pred_df = pd.DataFrame(gm_lr_predictions, columns=['weekly_cost'])
gm_rf_pred_df = pd.DataFrame(gm_rf_predictions, columns=['weekly_cost'])

rv_lr_pred_df = pd.DataFrame(rv_lr_predictions, columns=['weekly_cost'])
rv_rf_pred_df = pd.DataFrame(rv_rf_predictions, columns=['weekly_cost'])

In [68]:
# Back-transform predictions, as they were initially log-transformed
gm_lr_orig_pred = np.exp(gm_lr_pred_df)
gm_rf_orig_pred = np.exp(gm_rf_pred_df)

rv_lr_orig_pred = np.exp(rv_lr_pred_df)
rv_rf_orig_pred = np.exp(rv_rf_pred_df)

  result = func(self.values, **kwargs)


In [69]:
print(type(gm_lr_orig_pred))

<class 'pandas.core.frame.DataFrame'>


In [70]:
# Join with identifiers for analysis
gm_lr_pred_final = pd.concat([gm_identifiers_df, gm_lr_orig_pred], axis=1)
gm_rf_pred_final = pd.concat([gm_identifiers_df, gm_rf_orig_pred], axis=1)

rv_lr_pred_final = pd.concat([rv_identifiers_df, rv_lr_orig_pred], axis=1)
rv_rf_pred_final = pd.concat([rv_identifiers_df, rv_rf_orig_pred], axis=1)

### Results

In [71]:
print("GM Linear Regression Prediction Head: ")
print(gm_lr_pred_final.head())
print("GM Random Forest Prediction Head: ")
print(gm_rf_pred_final.head())

print("RV Linear Regression Prediction Head: ")
print(rv_lr_pred_final.head())
print("RV Random Forest Prediction Head: ")
print(rv_rf_pred_final.head())

GM Linear Regression Prediction Head: 
        suburb  year  weekly_cost
0  albert park  2025   482.952433
1  albert park  2026   483.899851
2  albert park  2027   558.603027
3  albert park  2028   591.819938
4  albert park  2029   637.488714
GM Random Forest Prediction Head: 
        suburb  year  weekly_cost
0  albert park  2025   444.158913
1  albert park  2026   445.717426
2  albert park  2027   457.640942
3  albert park  2028   695.314139
4  albert park  2029   708.844757
RV Linear Regression Prediction Head: 
   suburb  year  weekly_cost
0  ararat  2025          inf
1  ararat  2026          inf
2  ararat  2027          inf
3  ararat  2028          inf
4  ararat  2029          inf
RV Random Forest Prediction Head: 
   suburb  year  weekly_cost
0  ararat  2025   354.397564
1  ararat  2026   359.167350
2  ararat  2027   388.530877
3  ararat  2028   436.962740
4  ararat  2029   449.387281


In [72]:
# Group by suburb, year and transform to have feature of predicted weekly cost per year for each
# suburb

gm_predicted_average_costs = gm_rf_pred_final.groupby(['suburb', 'year'])['weekly_cost'].mean().reset_index()
rv_predicted_average_costs = rv_rf_pred_final.groupby(['suburb', 'year'])['weekly_cost'].mean().reset_index()

# TransformingD
gm_pred_costs_transformed = gm_predicted_average_costs.pivot(index='suburb', columns='year', values='weekly_cost')
gm_pred_costs_transformed.columns = [f'{year}_weekly_cost' for year in gm_pred_costs_transformed.columns]  # Rename columns
gm_pred_costs_transformed.reset_index(inplace=True)  # Reset index to bring 'suburb' back as a column

rv_pred_costs_transformed = rv_predicted_average_costs.pivot(index='suburb', columns='year', values='weekly_cost')
rv_pred_costs_transformed.columns = [f'{year}_weekly_cost' for year in rv_pred_costs_transformed.columns]  # Rename columns
rv_pred_costs_transformed.reset_index(inplace=True)  # Reset index to bring 'suburb' back as a column

In [73]:
# Read in current data to compare 2024 prices
data_read_dir = '../data/curated/final_datasets/'

gm_real = pd.read_csv(f"{data_read_dir}greater_melbourne_train.csv")
gm_real_identifiers = pd.read_csv(f"{data_read_dir}gm_train_identifiers.csv")

rv_real = pd.read_csv(f"{data_read_dir}rest_of_vic_train.csv")
rv_real_identifiers = pd.read_csv(f"{data_read_dir}rv_train_identifiers.csv")

In [74]:
# Combine with identifiers and group by year, suburb, keep only 2024 data
gm_data = pd.concat([gm_real_identifiers, gm_real], axis=1)
rv_data = pd.concat([rv_real_identifiers, rv_real], axis=1)

gm_current_average_costs = gm_data.groupby(['suburb', 'year'])['weekly_cost'].mean().reset_index()
rv_current_average_costs = rv_data.groupby(['suburb', 'year'])['weekly_cost'].mean().reset_index()


gm_current_average_costs = gm_current_average_costs[gm_current_average_costs['year'] == 2024]\
    .rename(columns={'weekly_cost': '2024_weekly_cost'})\
    .drop('year', axis=1)
rv_current_average_costs = rv_current_average_costs[rv_current_average_costs['year'] == 2024]\
    .rename(columns={'weekly_cost': '2024_weekly_cost'})\
    .drop('year', axis=1)


In [75]:
df = pd.read_csv(f"../data/curated/forecast/greater_melbourne_predict.csv")
df_identifiers = pd.read_csv(f"../data/curated/forecast/gm_predict_identifiers.csv")
ndf = pd.concat([df_identifiers, df], axis=1)
ndf.head()

Unnamed: 0,suburb,year,beds,baths,parking,dist_to_city,dist_to_education,dist_to_parks_and_gardens,dist_to_train_station,dist_to_healthcare,...,median_age,median_weekly_rent,percent_aboriginal_torres_strait_islander,percent_au_citizen,percent_overseas_born,percent_rental_properties,population,percent_unemployed,housing_index,cpi_without_housing
0,albert park,2025,2.0,1.0,1.0,4.42985,0.52632,0.19365,3.36613,0.44046,...,42.8,512.9,0.323333,4.113333,35.52,43.86,14420.682864,4.603333,149.056579,130.086404
1,albert park,2026,2.0,1.0,1.0,4.42985,0.52632,0.19365,3.36613,0.44046,...,43.14,516.0,0.333333,3.833333,35.9,44.0,14624.772379,4.633333,152.824737,132.342675
2,albert park,2027,2.0,1.0,1.0,4.42985,0.52632,0.19365,3.36613,0.44046,...,43.48,519.1,0.343333,3.553333,36.28,44.14,14820.286445,4.663333,156.592895,134.598947
3,albert park,2028,2.0,1.0,1.0,4.42985,0.52632,0.19365,3.36613,0.44046,...,43.82,522.2,0.353333,3.273333,36.66,44.28,15014.808184,4.693333,160.361053,136.855219
4,albert park,2029,2.0,1.0,1.0,4.42985,0.52632,0.19365,3.36613,0.44046,...,44.16,525.3,0.363333,2.993333,37.04,44.42,15208.063939,4.723333,164.129211,139.111491


In [77]:
# TO DO: DEAL WITH WHY 2024 WEEKLY AVG COST IS HIGHER - HAVENT REMOVED EXPENSIVE top 5% properties 

# Merge data
gm_weekly_costs = pd.merge(gm_pred_costs_transformed, gm_current_average_costs, on='suburb', how='inner')
rv_weekly_costs = pd.merge(rv_pred_costs_transformed, rv_current_average_costs, on='suburb', how='inner')

YEARS = ["2025", "2026", "2027", "2028", "2029"]
for year in YEARS:
    gm_weekly_costs[f"{year}_percent_increase"] = gm_weekly_costs[f"{year}_weekly_cost"]/\
                                                  gm_weekly_costs['2024_weekly_cost']
    rv_weekly_costs[f"{year}_percent_increase"] = rv_weekly_costs[f"{year}_weekly_cost"]/\
                                                  rv_weekly_costs['2024_weekly_cost']

# gm_weekly_costs = gm_weekly_costs[gm_weekly_costs['2029_weekly_cost'] > gm_weekly_costs['2024_weekly_cost']]
#gm_weekly_costs.sort_values(by="2029_percent_increase", ascending=False).head(10)
gm_weekly_costs[gm_weekly_costs['suburb'] == "coburg"].head()


# gm_weekly_costs = gm_weekly_costs[gm_weekly_costs['2029_weekly_cost'] > gm_weekly_costs['2024_weekly_cost']]
#rv_weekly_costs.sort_values(by="2029_percent_increase", ascending=False).head(10)

Unnamed: 0,suburb,2025_weekly_cost,2026_weekly_cost,2027_weekly_cost,2028_weekly_cost,2029_weekly_cost,2024_weekly_cost,2025_percent_increase,2026_percent_increase,2027_percent_increase,2028_percent_increase,2029_percent_increase
110,coburg,453.325635,460.834284,469.597408,557.310988,579.174582,617.912621,0.73364,0.745792,0.759974,0.901925,0.937308
