In [8]:
import sys
sys.path.insert(0, '../scripts/')
from helper_functions import convert_census_to_postcode

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from statsmodels.tsa.vector_ar.var_model import VAR

import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.formula.api import ols

import geopandas as gpd

from scipy.interpolate import interp1d

# Relative path of the directory that curated outputs are written to
# (used below when exporting the per-postcode median rent CSV).
output_dir = '../data/curated/'

import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation and model-fitting warnings from the libraries above.
# Consider narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')


<h3> Preparation for forecasting </h3>

In [9]:
# Property listings. Postcodes are cast to strings so they can be used as
# string group keys / dictionary keys further down.
properties_df = (
    pd.read_csv("../data/curated/properties_processed.csv", index_col=0)
    .assign(Postcode=lambda listings: listings['Postcode'].astype(str))
)

# Census table plus the SA2 <-> postcode mapping used to re-key it.
census_df = pd.read_csv("../data/curated/census_data.csv")
sa2_postcode_map = pd.read_csv("../data/curated/sa2_postcode_mapping_2021.csv")

In [10]:
# Re-key the census table by postcode using the SA2 <-> postcode mapping.
# NOTE(review): 'mean_no_zero' presumably selects a mean that ignores zero
# entries -- confirm against helper_functions.convert_census_to_postcode.
census_df_postcode_agg = convert_census_to_postcode(
    census_df,
    sa2_postcode_map,
    'mean_no_zero',
)

Median Rental Prices

In [45]:
# Median of the `Cost` column for each postcode (Series indexed by Postcode).
median_rent = (
    properties_df
    .groupby(["Postcode"])["Cost"]
    .median()
)


In [12]:
# Persist the per-postcode medians into the curated output directory.
median_rent_path = f'{output_dir}median_rental_postcode.csv'
median_rent.to_csv(median_rent_path)

In [13]:
# Forecast `avg_med_rent` per postcode.
#
# For every postcode row of `census_df_postcode_agg`:
#   1. linearly interpolate each census feature from the census years
#      (`init_years`) onto every year in `new_years`,
#   2. fit a VAR model on the interpolated yearly series,
#   3. forecast `forecasting_steps` further yearly steps and keep the
#      `avg_med_rent` component.
# Result: `forecasted_dict` maps int postcode -> list of forecast rent values.

# change according to how many years in the future forecasting is required for
forecasting_steps = 5

# Features modelled jointly. The census table is expected to hold one column
# per feature and census year, named '<feature>_<yy>'.
impute_features = ['tot_population', 'avg_med_mortg_rep', 'avg_med_person_inc', 'avg_med_rent', 'avg_med_hh_inc', 'tot_avg_hh_size']
init_years = [11, 16, 21]
new_years = list(range(11, 22))

# Column position of the rent series in the forecast output. Derived from
# `impute_features` (rather than a hard-coded 3) so that reordering or
# extending the feature list cannot silently select the wrong series.
target_feature = 'avg_med_rent'
target_position = impute_features.index(target_feature)

# Yearly date index shared by every postcode (loop-invariant, so built once).
interpolated_index = pd.to_datetime([f'20{year}' for year in new_years]).rename('date')

forecasted_dict = dict()

for _, data in census_df_postcode_agg.iterrows():

    # interpolation: observed census values -> one value per year
    interpolated_dict = dict()
    for feature in impute_features:
        init_y = data[[f'{feature}_{year}' for year in init_years]].tolist()
        interp_func = interp1d(init_years, init_y)
        interpolated_dict[feature] = list(interp_func(new_years))

    # creation of new dataframe (columns follow the order of `impute_features`)
    interpolated_df = pd.DataFrame(interpolated_dict, index=interpolated_index)

    # model
    # NOTE(review): newer statsmodels releases spell the "no trend" option 'n'
    # instead of 'nc' -- confirm against the installed version.
    model = VAR(endog=interpolated_df)
    model_fit = model.fit(trend='nc')

    # Seed the forecast with as many trailing observations as the fitted lag
    # order requires, instead of assuming a single lag.
    forecasted_values = model_fit.forecast(interpolated_df.values[-model_fit.k_ar:], forecasting_steps)

    forecasted_avg_med_rent = list(forecasted_values[:, target_position])

    forecasted_dict[int(data['postcode_2021'])] = forecasted_avg_med_rent

In [14]:
# Inspect the forecasts: int postcode -> list of `forecasting_steps` forecast
# avg_med_rent values (one per forecast step).
forecasted_dict


{3000: [426.14564705882356,
  435.6320207612458,
  446.5590783635256,
  458.84207375390656,
  472.4012458860297],
 3002: [465.3687058823529,
  483.43419377162616,
  503.7272411968243,
  526.1168152440698,
  550.4795908179478],
 3003: [384.2588235294125,
  383.30830449827096,
  382.63134541013784,
  382.2118545036596,
  382.03468659168016],
 3004: [455.6243529411762,
  472.6844498269892,
  491.80171748422504,
  512.855145867506,
  535.7308431694178],
 3006: [478.8235294117649,
  499.72802768166116,
  523.5322613474459,
  550.0656577387728,
  579.1676778717863],
 3008: [494.50588235294083,
  490.61730103806167,
  486.3574598005284,
  481.7481974593205,
  476.81006819700724],
 3011: [413.3117647058821,
  445.3934256055361,
  480.5526358640341,
  518.60836316615,
  559.3902241563768],
 3012: [400.91058823529426,
  432.93467128027703,
  468.077337675555,
  506.1551413416988,
  546.9954271451284],
 3013: [441.5056470588237,
  476.25907958477524,
  514.3798396092003,
  555.6698490439531,
  59

In [57]:
# Growth rate per postcode, based on the current medians derived from the
# property data.
#
# The growth rate is the relative change from the current median rental price
# (scraped property data) to the last forecast value produced by the time
# series model above (2026 when forecasting 5 steps past the 2021 census):
#     rate = (final_forecast - current_median) / current_median
indexed_median_rent = median_rent.reset_index()
median_rent_dict = median_rent.to_dict()

growth_rate = dict()

# Postcodes that appear in the property data but have no census-based
# forecast. They are skipped (and recorded here for inspection) instead of
# aborting the whole cell with a KeyError.
postcodes_without_forecast = []

for postcode, current_value in median_rent_dict.items():

    forecast_path = forecasted_dict.get(int(postcode))
    if forecast_path is None:
        postcodes_without_forecast.append(postcode)
        continue

    final_forecast = forecast_path[-1]

    rate = (final_forecast - current_value) / current_value

    growth_rate[int(postcode)] = rate

growth_rate_df = pd.DataFrame.from_dict(growth_rate, orient='index', columns = ['Growth Rate'])


Unnamed: 0,Growth Rate
3000,-0.073723
3002,-0.143222
3003,-0.265318
3004,-0.025944
3006,0.050168
...,...
3981,0.126868
3984,0.081229
3987,-0.144583
3995,0.024686


In [62]:
# Ten postcodes with the highest forecast growth rate, largest first.
(
    growth_rate_df
    .sort_values('Growth Rate', ascending=False)
    .head(10)
)

Unnamed: 0,Growth Rate
3277,7.903577
3424,1.108529
3796,1.061733
3799,1.020913
3713,0.840182
3061,0.818834
3028,0.788013
3027,0.677426
3824,0.659491
3737,0.65901
