In [66]:

# Recurrent Neural Networks Real Estate Price Prediction - Multivariate Time Series

"""
In folder /Processed Data, hpi_econo_and_constmaterial_merged_df is the dataframe we will start working with for our ML multivariate prediction model.

Predicting housing prices, like predicting the prices of financial assets, is a complex task given the large number of variables that have a direct influence, positive or negative, on 
real estate prices. Hard-to-quantify events such as political decisions, economic cycles, market sentiment and the weather all exert forces on housing prices; moreover, many variables
are interdependent, which makes statistical modeling of housing difficult.

We will go with the standard approach:
1_ selecting features
2_ scaling the data of selected features
3_ slicing the data with a sliding window approach
4_ training the model on past data
5_ validating our trained model
6_ making predictions
7_ unscaling predictions to have coherent values.

We will 

"""

'\nIn folder /Processed Data, hpi_econo_and_constmaterial_merged_df is the dataframe we will start working with for our ML multivariate prediction model.\n\nPredicting housing prices like financial assets is a complex task given the confluence of a vast number of variable that have a direct influence, positive or negative, on \nreal estate prices. Unquantifiable events such as political decisions, economic cycles, market sentiment and the weather all do exert forces on housing prices; plus, many variables\nare interdependent which makes statistical modeling difficult when it comes to housing.\n\nWe will go with the standard approach:\n1_ selecting features\n2_ scaling the data of selected features\n3_ slicing the data with a sliding window approach\n4_ training the model on past data\n5_ validating our trained model\n6_ making predictions\n7_ unscaling predictions to have coherent values.\n\nWe will \n\n'

In [67]:
# pip install keras

In [68]:
import math
import os

import numpy as np
import pandas as pd

# Visualization and date formatting packages
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
sns.set_style('white', {'axes.spines.right': False, 'axes.spines.top': False})
# Date functions
from datetime import date, timedelta, datetime 
from pandas.plotting import register_matplotlib_converters

# ML modules / deep learning for RNN
import tensorflow as tf
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Environment report: installed TensorFlow release and the number of GPUs it can see

print('Tensorflow version is ' + tf.__version__)

gpu = tf.config.list_physical_devices(device_type='GPU')
print("# of GPUs", len(gpu))


Tensorflow version is 2.10.0
# of GPUs 0


In [69]:
# Loading csv in dataframe

# Location of the merged HPI / economic / construction-material CSV.
# The default is the original author's local Windows path; set the HPI_DATA_PATH
# environment variable to load the file from elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "HPI_DATA_PATH",
    r"C:\Users\hp\Desktop\Projects Coding\Affinity_Propagation_Canada_Real_Estate_Market\Processed Data\hpi_econo_and_constmaterial_merged_df.csv",
)

# "Unnamed: 0" is removed right after loading. `columns=` already identifies the
# axis, so the redundant `axis = 1` argument is omitted.
df = pd.read_csv(DATA_PATH).drop(columns="Unnamed: 0")

# Preview
df.head(200)

Unnamed: 0,REF_DATE,stone_VALUE,crushedStone_VALUE,logsbolts_VALUE,ironOres_concentrates_VALUE,REF_DATE.1,HPI_nfland_labrador,HPI_charlottetown_pei,HPI_halifax_ns,HPI_stjohn_fredericton_moncton,...,HPI_sudbury_ont,HPI_winnipeg_manitoba,HPI_edmonton_alb,HPI_calgary_alb,HPI_victoria_bc,HPI_vancouver_bc,HPI_kelowna_bc,Interest_Rate_Central_Bank,CPI_inflation,unemployment_rate
0,1981-01,6221.6308,5454.0270,7130.6353,9534.4472,1981-01,37.5,,,61.4,...,54.6,29.3,36.5,27.8,206.8,96.1,,17.00,12.200957,7.4
1,1981-02,6282.2311,5454.0270,7130.6353,10261.6508,1981-02,37.5,,,62.1,...,55.6,29.7,36.8,28.1,209.1,97.5,,17.14,12.056738,7.4
2,1981-03,6403.4317,5494.4272,7231.6358,10201.0505,1981-03,37.5,,,62.1,...,55.6,30.3,36.8,28.6,210.6,97.5,,16.59,12.676056,7.3
3,1981-04,6443.8319,5534.8274,7312.4362,8968.8444,1981-04,37.5,,,62.1,...,57.0,30.5,36.9,30.1,210.6,97.7,,17.40,12.587413,7.1
4,1981-05,6423.6318,5413.6268,7393.2366,9009.2446,1981-05,37.7,,,63.3,...,57.0,31.1,38.2,30.1,212.4,97.7,,19.06,12.211982,7.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1997-04,11009.0545,10625.2526,19371.8959,10605.0525,1997-04,49.3,88.4,59.1,80.5,...,88.1,46.6,48.9,36.8,126.5,80.2,,3.25,1.691094,9.5
196,1997-05,10988.8544,10625.2526,19533.4967,10443.4517,1997-05,49.8,88.4,58.2,80.1,...,87.8,46.6,49.0,36.9,125.5,80.1,,3.25,1.460674,9.5
197,1997-06,11110.0550,10908.0540,19695.0975,10483.8519,1997-06,49.6,88.5,58.3,80.0,...,87.8,46.8,49.1,37.2,126.4,79.4,,3.25,1.685393,9.3
198,1997-07,11231.2556,10887.8539,19836.4982,10443.4517,1997-07,49.6,88.8,58.3,79.8,...,87.7,46.8,49.1,37.3,126.4,78.6,,3.50,1.685393,9.1


In [72]:
# # Plotting line charts

# df_plot = df.copy()
# df_plot = df_plot.drop(columns = {"REF_DATE"}, axis = 1)

# ncols = 2 
# nrows = int(round(df_plot.shape[1] / ncols, 0))

# fig, ax = plt.subplots(nrows = nrows, ncols = ncols, sharex = True, figsize=(35, 54))
# for i, ax in enumerate(fig.axes):
#     sns.lineplot(data = df_plot.iloc[:, i], ax = ax)
#     ax.tick_params(axis = "x", rotation = 35, labelsize = 0, length = 0)
#     ax.xaxis.set_major_locator(mdates.AutoDateLocator())
#     # ax.set_title()
# fig.tight_layout()
# plt.show()

In [73]:
# Some columns contain long successions of NaN values, so we drop these columns
# (together with REF_DATE.1) before building our model.
df = df.drop(
    columns=[
        "HPI_nfland_labrador",
        "HPI_charlottetown_pei",
        "HPI_halifax_ns",
        "HPI_kelowna_bc",
        "REF_DATE.1",
        "HPI_sherbrooke_qc",
        "HPI_troisriv_qc",
        "HPI_ottawa_gatineau_qc",
        "HPI_oshawa_ont",
    ]
)
df.head()

Unnamed: 0,REF_DATE,stone_VALUE,crushedStone_VALUE,logsbolts_VALUE,ironOres_concentrates_VALUE,HPI_stjohn_fredericton_moncton,HPI_quebec_qc,HPI_mtl_qc,HPI_ottawa_gatineau_ont,HPI_toronto_ont,...,HPI_windsor_ont,HPI_sudbury_ont,HPI_winnipeg_manitoba,HPI_edmonton_alb,HPI_calgary_alb,HPI_victoria_bc,HPI_vancouver_bc,Interest_Rate_Central_Bank,CPI_inflation,unemployment_rate
0,1981-01,6221.6308,5454.027,7130.6353,9534.4472,61.4,34.9,30.0,30.8,22.6,...,64.9,54.6,29.3,36.5,27.8,206.8,96.1,17.0,12.200957,7.4
1,1981-02,6282.2311,5454.027,7130.6353,10261.6508,62.1,35.4,30.2,30.8,23.2,...,64.9,55.6,29.7,36.8,28.1,209.1,97.5,17.14,12.056738,7.4
2,1981-03,6403.4317,5494.4272,7231.6358,10201.0505,62.1,35.4,30.5,31.9,24.8,...,64.1,55.6,30.3,36.8,28.6,210.6,97.5,16.59,12.676056,7.3
3,1981-04,6443.8319,5534.8274,7312.4362,8968.8444,62.1,35.7,30.8,32.0,26.5,...,63.9,57.0,30.5,36.9,30.1,210.6,97.7,17.4,12.587413,7.1
4,1981-05,6423.6318,5413.6268,7393.2366,9009.2446,63.3,36.1,31.1,32.7,27.7,...,63.9,57.0,31.1,38.2,30.1,212.4,97.7,19.06,12.211982,7.2


In [77]:
# Show the column labels currently held by df
print(df.columns)

Index(['REF_DATE', 'stone_VALUE', 'crushedStone_VALUE', 'logsbolts_VALUE',
       'ironOres_concentrates_VALUE', 'HPI_stjohn_fredericton_moncton',
       'HPI_quebec_qc', 'HPI_mtl_qc', 'HPI_ottawa_gatineau_ont',
       'HPI_toronto_ont', 'HPI_hamilton_ont', 'HPI_guelph_ont',
       'HPI_london_ont', 'HPI_windsor_ont', 'HPI_sudbury_ont',
       'HPI_winnipeg_manitoba', 'HPI_edmonton_alb', 'HPI_calgary_alb',
       'HPI_victoria_bc', 'HPI_vancouver_bc', 'Interest_Rate_Central_Bank',
       'CPI_inflation', 'unemployment_rate'],
      dtype='object')


In [85]:
# Work on a copy of df ordered by REF_DATE
train_df = df.sort_values(by="REF_DATE").copy()

# Columns fed to the model
FEATURES = [
    'stone_VALUE',
    'crushedStone_VALUE',
    'logsbolts_VALUE',
    'ironOres_concentrates_VALUE',
    'HPI_stjohn_fredericton_moncton',
    'HPI_quebec_qc',
    'HPI_mtl_qc',
    'HPI_ottawa_gatineau_ont',
    'HPI_toronto_ont',
    'HPI_hamilton_ont',
    'HPI_guelph_ont',
    'HPI_london_ont',
    'HPI_windsor_ont',
    'HPI_sudbury_ont',
    'HPI_winnipeg_manitoba',
    'HPI_edmonton_alb',
    'HPI_calgary_alb',
    'HPI_victoria_bc',
    'HPI_vancouver_bc',
    'Interest_Rate_Central_Bank',
    'CPI_inflation',
    'unemployment_rate',
]

# Restrict the sorted dataset to the FEATURES columns
data = pd.DataFrame(train_df)
data_filter = data[FEATURES]

# Append a 'Prediction' column, initialised with the HPI_mtl_qc values,
# so the frame is ready before scaling
data_filter_ext = data_filter.assign(Prediction=data_filter['HPI_mtl_qc'])

# Display the last rows of the extended frame
data_filter_ext.tail()

Unnamed: 0,stone_VALUE,crushedStone_VALUE,logsbolts_VALUE,ironOres_concentrates_VALUE,HPI_stjohn_fredericton_moncton,HPI_quebec_qc,HPI_mtl_qc,HPI_ottawa_gatineau_ont,HPI_toronto_ont,HPI_hamilton_ont,...,HPI_sudbury_ont,HPI_winnipeg_manitoba,HPI_edmonton_alb,HPI_calgary_alb,HPI_victoria_bc,HPI_vancouver_bc,Interest_Rate_Central_Bank,CPI_inflation,unemployment_rate,Prediction
496,24765.3226,24765.3226,35289.5747,34420.9704,121.1,132.6,161.1,187.4,114.6,121.9,...,122.3,158.6,113.5,133.4,132.1,127.2,1.25,7.730496,5.0,161.1
497,26017.7288,26017.7288,34845.1725,34198.7693,121.1,133.3,160.9,188.9,114.6,121.9,...,122.5,158.6,113.6,133.6,132.1,127.2,1.75,8.132956,5.1,160.9
498,26502.5312,26502.5312,35531.9759,28684.142,121.1,133.3,160.6,190.2,114.6,121.9,...,122.5,158.6,113.2,133.2,132.1,127.6,2.75,7.589599,5.0,160.6
499,25330.9254,25330.9254,34764.3721,28704.3421,,,,,,,...,,,,,,,,,,
500,24462.3211,24462.3211,29936.5482,27128.7343,,,,,,,...,,,,,,,,,,
