In [9]:

# Recurrent Neural Networks Real Estate Price Prediction - Multivariate Time Series

"""
In folder /Processed Data, hpi_econo_and_constmaterial_merged_df is the dataframe we will start working with for our ML multivariate prediction model.

Predicting housing prices like financial assets is a complex task given the confluence of a vast number of variable that have a direct influence, positive or negative, on 
real estate prices. Unquantifiable events such as political decisions, economic cycles, market sentiment and the weather all do exert forces on housing prices; plus, many variables
are interdependent which makes statistical modeling difficult when it comes to housing.

We will go with the standard approach:
1_ selecting features
2_ scaling the data of selected features
3_ slicing the data with a sliding window approach
4_ training the model on past data
5_ validating our trained model
6_ making predictions
7_ unscaling predictions to have coherent values.

"""

'\nIn folder /Processed Data, hpi_econo_and_constmaterial_merged_df is the dataframe we will start working with for our ML multivariate prediction model.\n\nPredicting housing prices like financial assets is a complex task given the confluence of a vast number of variable that have a direct influence, positive or negative, on \nreal estate prices. Unquantifiable events such as political decisions, economic cycles, market sentiment and the weather all do exert forces on housing prices; plus, many variables\nare interdependent which makes statistical modeling difficult when it comes to housing.\n\nWe will go with the standard approach:\n1_ selecting features\n2_ scaling the data of selected features\n3_ slicing the data with a sliding window approach\n4_ training the model on past data\n5_ validating our trained model\n6_ making predictions\n7_ unscaling predictions to have coherent values.\n\n'

In [10]:
# pip install keras

In [3]:
import numpy as np
import pandas as pd
import math

# Visualization and date formatting packages
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
sns.set_style('white', {'axes.spines.right': False, 'axes.spines.top': False})
# Date functions
from datetime import date, timedelta, datetime 
from pandas.plotting import register_matplotlib_converters

# ML modules / deep learning for RNN
import tensorflow as tf
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [7]:
# Loading csv in dataframe 

# We will use the hpi_econo_and_constmaterial_merged_df dataframe which includes House prices indexes, Econ data and construction materials Price indexes.

df = pd.read_csv(r"C:\Users\hp\Desktop\Projects Coding\Affinity_Propagation_Canada_Real_Estate_Market\Processed Data\hpi_econo_and_constmaterial_merged_df.csv")
df = df.drop(columns="Unnamed: 0", axis = 1)
df = df.drop(columns="REF_DATE.1", axis = 1)
df = df.drop(columns="REF_DATE", axis = 1)

# Finding all columns with NaN values
df.isna().any()

stone_VALUE                       False
crushedStone_VALUE                False
logsbolts_VALUE                   False
ironOres_concentrates_VALUE       False
HPI_nfland_labrador                True
HPI_charlottetown_pei              True
HPI_halifax_ns                     True
HPI_stjohn_fredericton_moncton     True
HPI_quebec_qc                      True
HPI_sherbrooke_qc                  True
HPI_troisriv_qc                    True
HPI_mtl_qc                         True
HPI_ottawa_gatineau_qc             True
HPI_ottawa_gatineau_ont            True
HPI_oshawa_ont                     True
HPI_toronto_ont                    True
HPI_hamilton_ont                   True
HPI_guelph_ont                     True
HPI_london_ont                     True
HPI_windsor_ont                    True
HPI_sudbury_ont                    True
HPI_winnipeg_manitoba              True
HPI_edmonton_alb                   True
HPI_calgary_alb                    True
HPI_victoria_bc                    True


In [8]:
# With nan values in all columns of df 
# Let's fill all NaN cells with mean value of column 

df.apply(lambda x: x.fillna(x.mean()),axis=0)

# df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mean())

Unnamed: 0,stone_VALUE,crushedStone_VALUE,logsbolts_VALUE,ironOres_concentrates_VALUE,HPI_nfland_labrador,HPI_charlottetown_pei,HPI_halifax_ns,HPI_stjohn_fredericton_moncton,HPI_quebec_qc,HPI_sherbrooke_qc,HPI_troisriv_qc,HPI_mtl_qc,HPI_ottawa_gatineau_qc,HPI_ottawa_gatineau_ont,HPI_oshawa_ont,HPI_toronto_ont,HPI_hamilton_ont,HPI_guelph_ont,HPI_london_ont,HPI_windsor_ont,HPI_sudbury_ont,HPI_winnipeg_manitoba,HPI_edmonton_alb,HPI_calgary_alb,HPI_victoria_bc,HPI_vancouver_bc,HPI_kelowna_bc,Interest_Rate_Central_Bank,CPI_inflation,unemployment_rate
0,6221.6308,5454.0270,7130.6353,9534.4472,37.500000,97.493051,76.06122,61.400000,34.900000,103.114706,104.547059,30.000000,108.619118,30.800000,107.919118,22.600000,24.800000,113.629412,27.000000,64.900000,54.60000,29.300000,36.500000,27.800000,206.800000,96.100000,109.138235,17.00000,12.200957,7.400000
1,6282.2311,5454.0270,7130.6353,10261.6508,37.500000,97.493051,76.06122,62.100000,35.400000,103.114706,104.547059,30.200000,108.619118,30.800000,107.919118,23.200000,24.900000,113.629412,27.500000,64.900000,55.60000,29.700000,36.800000,28.100000,209.100000,97.500000,109.138235,17.14000,12.056738,7.400000
2,6403.4317,5494.4272,7231.6358,10201.0505,37.500000,97.493051,76.06122,62.100000,35.400000,103.114706,104.547059,30.500000,108.619118,31.900000,107.919118,24.800000,25.300000,113.629412,28.200000,64.100000,55.60000,30.300000,36.800000,28.600000,210.600000,97.500000,109.138235,16.59000,12.676056,7.300000
3,6443.8319,5534.8274,7312.4362,8968.8444,37.500000,97.493051,76.06122,62.100000,35.700000,103.114706,104.547059,30.800000,108.619118,32.000000,107.919118,26.500000,25.600000,113.629412,28.600000,63.900000,57.00000,30.500000,36.900000,30.100000,210.600000,97.700000,109.138235,17.40000,12.587413,7.100000
4,6423.6318,5413.6268,7393.2366,9009.2446,37.700000,97.493051,76.06122,63.300000,36.100000,103.114706,104.547059,31.100000,108.619118,32.700000,107.919118,27.700000,26.400000,113.629412,28.600000,63.900000,57.00000,31.100000,38.200000,30.100000,212.400000,97.700000,109.138235,19.06000,12.211982,7.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,24765.3226,24765.3226,35289.5747,34420.9704,106.700000,125.500000,127.50000,121.100000,132.600000,113.100000,112.600000,161.100000,120.500000,187.400000,125.300000,114.600000,121.900000,134.100000,156.100000,147.400000,122.30000,158.600000,113.500000,133.400000,132.100000,127.200000,128.300000,1.25000,7.730496,5.000000
497,26017.7288,26017.7288,34845.1725,34198.7693,107.400000,126.900000,127.50000,121.100000,133.300000,113.100000,112.600000,160.900000,120.500000,188.900000,125.300000,114.600000,121.900000,134.100000,156.100000,149.000000,122.50000,158.600000,113.600000,133.600000,132.100000,127.200000,128.300000,1.75000,8.132956,5.100000
498,26502.5312,26502.5312,35531.9759,28684.1420,107.400000,127.300000,127.50000,121.100000,133.300000,113.100000,112.600000,160.600000,120.500000,190.200000,125.300000,114.600000,121.900000,134.100000,156.100000,149.000000,122.50000,158.600000,113.200000,133.200000,132.100000,127.600000,128.400000,2.75000,7.589599,5.000000
499,25330.9254,25330.9254,34764.3721,28704.3421,66.149299,97.493051,76.06122,87.164329,75.387375,103.114706,104.547059,72.429259,108.619118,74.542084,107.919118,59.391984,66.150501,113.629412,70.253908,91.206212,86.50521,64.551102,68.176754,59.740681,131.412425,88.898397,109.138235,5.18008,2.974651,8.282365


In [4]:
# Given columns with long successions of NaN values let's drop these column before building our Model.

# df = df.drop(columns={"HPI_nfland_labrador", "HPI_charlottetown_pei", "HPI_halifax_ns", "HPI_kelowna_bc", "HPI_sherbrooke_qc", "HPI_troisriv_qc", "HPI_ottawa_gatineau_qc", "HPI_oshawa_ont"}, axis = 1)
# df.head()

Unnamed: 0,REF_DATE,stone_VALUE,crushedStone_VALUE,logsbolts_VALUE,ironOres_concentrates_VALUE,HPI_stjohn_fredericton_moncton,HPI_quebec_qc,HPI_mtl_qc,HPI_ottawa_gatineau_ont,HPI_toronto_ont,...,HPI_windsor_ont,HPI_sudbury_ont,HPI_winnipeg_manitoba,HPI_edmonton_alb,HPI_calgary_alb,HPI_victoria_bc,HPI_vancouver_bc,Interest_Rate_Central_Bank,CPI_inflation,unemployment_rate
0,1981-01,6221.6308,5454.027,7130.6353,9534.4472,61.4,34.9,30.0,30.8,22.6,...,64.9,54.6,29.3,36.5,27.8,206.8,96.1,17.0,12.200957,7.4
1,1981-02,6282.2311,5454.027,7130.6353,10261.6508,62.1,35.4,30.2,30.8,23.2,...,64.9,55.6,29.7,36.8,28.1,209.1,97.5,17.14,12.056738,7.4
2,1981-03,6403.4317,5494.4272,7231.6358,10201.0505,62.1,35.4,30.5,31.9,24.8,...,64.1,55.6,30.3,36.8,28.6,210.6,97.5,16.59,12.676056,7.3
3,1981-04,6443.8319,5534.8274,7312.4362,8968.8444,62.1,35.7,30.8,32.0,26.5,...,63.9,57.0,30.5,36.9,30.1,210.6,97.7,17.4,12.587413,7.1
4,1981-05,6423.6318,5413.6268,7393.2366,9009.2446,63.3,36.1,31.1,32.7,27.7,...,63.9,57.0,31.1,38.2,30.1,212.4,97.7,19.06,12.211982,7.2


In [5]:
# print(df.columns)

Index(['REF_DATE', 'stone_VALUE', 'crushedStone_VALUE', 'logsbolts_VALUE',
       'ironOres_concentrates_VALUE', 'HPI_stjohn_fredericton_moncton',
       'HPI_quebec_qc', 'HPI_mtl_qc', 'HPI_ottawa_gatineau_ont',
       'HPI_toronto_ont', 'HPI_hamilton_ont', 'HPI_guelph_ont',
       'HPI_london_ont', 'HPI_windsor_ont', 'HPI_sudbury_ont',
       'HPI_winnipeg_manitoba', 'HPI_edmonton_alb', 'HPI_calgary_alb',
       'HPI_victoria_bc', 'HPI_vancouver_bc', 'Interest_Rate_Central_Bank',
       'CPI_inflation', 'unemployment_rate'],
      dtype='object')


In [6]:
train_df = df.sort_values(by=["REF_DATE"]).copy()

FEATURES = ['stone_VALUE', 'crushedStone_VALUE', 'logsbolts_VALUE',
       'ironOres_concentrates_VALUE', 'HPI_stjohn_fredericton_moncton',
       'HPI_quebec_qc', 'HPI_mtl_qc', 'HPI_ottawa_gatineau_ont',
       'HPI_toronto_ont', 'HPI_hamilton_ont', 'HPI_guelph_ont',
       'HPI_london_ont', 'HPI_windsor_ont', 'HPI_sudbury_ont',
       'HPI_winnipeg_manitoba', 'HPI_edmonton_alb', 'HPI_calgary_alb',
       'HPI_victoria_bc', 'HPI_vancouver_bc', 'Interest_Rate_Central_Bank',
       'CPI_inflation']

# print([f for f in FEATURES])

# Dataset with features and filtering data to the list of FEATURES

data = pd.DataFrame(train_df)
data_filter = data[FEATURES]

# Prediction column with dummy values to prepare data before scaling

data_filter_ext = data_filter.copy()
data_filter_ext['Prediction'] = data_filter_ext['HPI_mtl_qc']

# Show last values, tail, of the dataframe
data_filter_ext.tail()

Unnamed: 0,stone_VALUE,crushedStone_VALUE,logsbolts_VALUE,ironOres_concentrates_VALUE,HPI_stjohn_fredericton_moncton,HPI_quebec_qc,HPI_mtl_qc,HPI_ottawa_gatineau_ont,HPI_toronto_ont,HPI_hamilton_ont,...,HPI_windsor_ont,HPI_sudbury_ont,HPI_winnipeg_manitoba,HPI_edmonton_alb,HPI_calgary_alb,HPI_victoria_bc,HPI_vancouver_bc,Interest_Rate_Central_Bank,CPI_inflation,Prediction
496,24765.3226,24765.3226,35289.5747,34420.9704,121.1,132.6,161.1,187.4,114.6,121.9,...,147.4,122.3,158.6,113.5,133.4,132.1,127.2,1.25,7.730496,161.1
497,26017.7288,26017.7288,34845.1725,34198.7693,121.1,133.3,160.9,188.9,114.6,121.9,...,149.0,122.5,158.6,113.6,133.6,132.1,127.2,1.75,8.132956,160.9
498,26502.5312,26502.5312,35531.9759,28684.142,121.1,133.3,160.6,190.2,114.6,121.9,...,149.0,122.5,158.6,113.2,133.2,132.1,127.6,2.75,7.589599,160.6
499,25330.9254,25330.9254,34764.3721,28704.3421,,,,,,,...,,,,,,,,,,
500,24462.3211,24462.3211,29936.5482,27128.7343,,,,,,,...,,,,,,,,,,


In [7]:
# Scaling the input data with MinMaxScaler for scaling data in a range 0:1
# Model trained on scaled input imply that we unscale the output/predictions 
# scalar_model will be 23 x 23 like the number of features 

nrows = data_filter.shape[0] 

np_data_unscaled = np.array(data_filter)
np_data = np.reshape(np_data_unscaled, (nrows, -1))
print(np_data.shape)

scaler = MinMaxScaler()
np_data_scaled = scaler.fit_transform(np_data_unscaled)

# Separate scaler that works on single column for scaling our predictions
scaler_pred = MinMaxScaler()
df_prediction = pd.DataFrame(data_filter_ext['HPI_mtl_qc'])
np_prediction_scaled = scaler_pred.fit_transform(df_prediction)

(501, 21)


In [8]:
# Sequence length: time frame to make a prediction
sequence_length = 50

# Prediction Index
index_prediction = data.columns.get_loc("HPI_mtl_qc")

# Splitting data between train and test datasets
train_data_len = math.ceil(np_data_scaled.shape[0] * 0.8)
train_data = np_data_scaled[0:train_data_len, :]
test_data = np_data_scaled[train_data_len - sequence_length:, :]

# Recurrent Neural Networks needs to be given data in format: Samples, Time steps, Features

def partition_dataset(sequence_length, data):
    x, y = [], []
    data_len = data.shape[0]
    for i in range(sequence_length, data_len):
        x.append(data[i - sequence_length:i, :])   # sequence_length values 
        y.append(data[i, index_prediction]) # these are prediction values

    # x, y to numpy arrays
    x = np.array(x)
    y = np.array(y)
    return x, y

# Creating our training and testing datasets

x_train, y_train = partition_dataset(sequence_length, train_data)
x_test, y_test = partition_dataset(sequence_length, test_data)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(351, 50, 21) (351,)
(100, 50, 21) (100,)


In [9]:
# Training multivariate prediction model

model = Sequential()

n_neurons = x_train.shape[1] * x_train.shape[2]
print(n_neurons, x_train.shape[1], x_train.shape[2])

model.add(LSTM(n_neurons, return_sequences = True, input_shape=(x_train.shape[1], x_train.shape[2])))
model.add(LSTM(n_neurons, return_sequences = False ))
model.add(Dense(5))
model.add(Dense(1))

model.compile(optimizer = 'adam', loss = 'mse')

1050 50 21


In [10]:
# Starting the training process by running the code below

epochs = 75
batch_size = 18

early_stop = EarlyStopping(monitor = 'loss', patience = 5, verbose = 1)
history = model.fit(x_train, y_train, 
                    batch_size = batch_size, 
                    epochs = epochs,
                    validation_data = (x_test, y_test))

Epoch 1/75


In [None]:
import pickle

# save the model to disk
filename = 'canada_hpi_model.sav'
pickle.dump(model, open(filename, 'wb'))