In [1]:
import sys
sys.path.insert(0, '../scripts/')
from helper_functions import convert_census_to_postcode

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from statsmodels.tsa.vector_ar.var_model import VAR

import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.formula.api import ols

import geopandas as gpd

from scipy.interpolate import interp1d

# Relative path of the directory that curated outputs are written to.
output_dir = '../data/curated/'

import warnings
# Silence every warning for the rest of the notebook. Note that this also
# hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')


<h3> Preparation for forecasting </h3>

In [3]:
# Census table and the 2021 SA2/postcode mapping file.
sa2_postcode_map = pd.read_csv("../data/curated/sa2_postcode_mapping_2021.csv")
census_df = pd.read_csv("../data/curated/census_data.csv")

# Processed property data: the first CSV column becomes the index and the
# postcodes are stored as strings.
properties_df = pd.read_csv(
    "../data/curated/properties_processed.csv", index_col=0
).assign(Postcode=lambda frame: frame['Postcode'].astype(str))

In [4]:
# Convert the census table to postcode level using the SA2/postcode mapping
# (helper imported from ../scripts/helper_functions).
# NOTE(review): 'mean_no_zero' presumably selects a mean that ignores zero
# values - confirm against the helper's implementation.
census_df_postcode_agg = convert_census_to_postcode(census_df, sa2_postcode_map, 'mean_no_zero')

Median Rental Prices

In [5]:
# Median of the `Cost` column for each `Postcode` in the property data.
median_rent = properties_df.groupby("Postcode")["Cost"].median()


In [6]:
# Persist the per-postcode medians to the curated data directory.
median_rent.to_csv(f'{output_dir}median_rental_postcode.csv')

In [7]:
# Number of yearly steps to forecast past the last census year; change
# according to how many years in the future forecasting is required for.
forecasting_steps = 5

# Census features that are interpolated and modelled jointly. Each feature is
# read from columns named `<feature>_<yy>`, one per census year in `init_years`.
impute_features = ['tot_population', 'avg_med_mortg_rep', 'avg_med_person_inc', 'avg_med_rent', 'avg_med_hh_inc', 'tot_avg_hh_size']
init_years = [11, 16, 21]
# Every year from the first to the last census year, inclusive.
new_years = list(range(init_years[0], init_years[-1] + 1))

# The forecast that is kept. Its column position in the VAR output is looked up
# by name, so reordering `impute_features` cannot silently pick another feature.
target_feature = 'avg_med_rent'
target_idx = impute_features.index(target_feature)

# postcode (int) -> list of `forecasting_steps` forecasted target values
forecasted_dict = dict()

for i in range(len(census_df_postcode_agg)):
    data = census_df_postcode_agg.iloc[i]

    # Linear interpolation of every feature between the census years, giving
    # one value per year in `new_years`.
    interpolated_dict = dict()
    interpolated_dict['date'] = [f'20{year}' for year in new_years]

    for feature in impute_features:
        # Works for any number of census years, not only three.
        init_y = data[[f'{feature}_{year}' for year in init_years]].tolist()

        interp_func = interp1d(init_years, init_y)
        new_y = list(interp_func(new_years))

        interpolated_dict[feature] = new_y

    # Yearly frame indexed by date, one column per feature (same order as
    # `impute_features`).
    interpolated_df = pd.DataFrame(interpolated_dict)
    interpolated_df['date'] = pd.to_datetime(interpolated_df['date'])
    interpolated_df = interpolated_df.set_index('date')

    # Vector autoregression over all features, fitted without a trend term.
    # NOTE(review): newer statsmodels releases spell the no-trend option 'n'
    # and treat 'nc' as a legacy alias - confirm against the installed version.
    model = VAR(endog=interpolated_df)
    model_fit = model.fit(trend='nc')

    # Seed the forecast with as many trailing observations as the fitted lag
    # order requires, rather than assuming a lag order of one.
    forecasted_values = model_fit.forecast(interpolated_df.values[-model_fit.k_ar:], steps=forecasting_steps)

    forecasted_avg_med_rent = [row[target_idx] for row in forecasted_values]

    forecasted_dict[int(data['postcode_2021'])] = forecasted_avg_med_rent

In [8]:
# Inspect the result: postcode (int) -> list of forecasted `avg_med_rent`
# values, one per forecast step.
forecasted_dict


{3000: [426.14564705882356,
  435.63202076124594,
  446.55907836352577,
  458.8420737539067,
  472.4012458860301],
 3002: [465.36870588235297,
  483.43419377162616,
  503.7272411968245,
  526.11681524407,
  550.4795908179482],
 3003: [384.2588235294124,
  383.3083044982709,
  382.6313454101378,
  382.2118545036595,
  382.03468659168],
 3004: [455.6243529411762,
  472.6844498269892,
  491.80171748422515,
  512.855145867506,
  535.7308431694178],
 3006: [478.8235294117648,
  499.7280276816614,
  523.5322613474463,
  550.0656577387729,
  579.1676778717863],
 3008: [494.50588235294117,
  490.61730103806224,
  486.3574598005291,
  481.7481974593214,
  476.8100681970082],
 3011: [413.3117647058821,
  445.3934256055364,
  480.5526358640344,
  518.6083631661503,
  559.3902241563771],
 3012: [400.9105882352942,
  432.934671280277,
  468.07733767555493,
  506.1551413416988,
  546.9954271451284],
 3013: [441.5056470588237,
  476.25907958477524,
  514.3798396092002,
  555.669849043953,
  599.94268

In [9]:
# Growth rate per postcode, based on the current medians derived from the
# property data.
indexed_median_rent = median_rent.reset_index()
median_rent_dict = median_rent.to_dict()

growth_rate = dict()
# Postcodes for which no growth rate could be computed (no forecast available,
# or a current median that is missing or not positive).
skipped_postcodes = []

for postcode in indexed_median_rent['Postcode']:

    current_value = median_rent_dict[str(postcode)]
    forecast = forecasted_dict.get(int(postcode))

    # Skip instead of failing with a KeyError / division by zero; `not (x > 0)`
    # is also true for NaN.
    if forecast is None or not (current_value > 0):
        skipped_postcodes.append(postcode)
        continue

    # Last forecast step, i.e. `forecasting_steps` years after the final year
    # of the interpolated series.
    forecasted_final = forecast[-1]

    # Growth rate is defined as the relative change between the current median
    # rental price from the scraped property data and the final value
    # forecasted by the VAR model above:
    #     (forecast - current) / current
    rate = (forecasted_final - current_value) / current_value

    growth_rate[int(postcode)] = rate

growth_rate_df = pd.DataFrame.from_dict(growth_rate, orient='index', columns=['Growth Rate'])


In [10]:
# Ten postcodes with the highest growth rate, largest first.
(
    growth_rate_df
    .sort_values('Growth Rate', ascending=False)
    .head(10)
)

Unnamed: 0,Growth Rate
3277,7.903577
3424,1.108529
3796,1.061733
3799,1.020913
3713,0.840182
3061,0.818834
3028,0.788013
3027,0.677426
3824,0.659491
3737,0.65901
