In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
#annual_data=pd.read_csv('../input/usa-housing-market-factors/Annual_Macroeconomic_Factors.csv',parse_dates=['Date'])
#monthly_data=pd.read_csv('../input/usa-housing-market-factors/Monthly_Macroeconomic_Factors.csv',parse_dates=['Date'])
# Load the housing/macroeconomic factors table, parsing the Date column as datetimes.
factors_data = pd.read_csv(
    '../input/usa-housing-market-factors/Housing_Macroeconomic_Factors_US (2).csv',
    parse_dates=['Date'],
)

In [3]:
# Display the loaded frame (the cell's last expression is rendered by the notebook).
factors_data

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
# House price index over time, with points coloured by employment_rate.
sns.scatterplot(
    data=factors_data,
    x='Date',
    y='house_price_index',
    hue='employment_rate',
)
# Relationship between the house price index and date. There is a spike near 2005, probably connected to the recession.
# Dips in the house price index occur when unemployment is high (shown here as a low employment_rate hue).

In [6]:
# Employment rate over time.
sns.scatterplot(
    data=factors_data,
    x='Date',
    y='employment_rate',
)

In [7]:
# Population plotted against the mortgage rate.
sns.scatterplot(
    data=factors_data,
    x='mortgage_rate',
    y='population',
)

In [8]:
from sklearn.preprocessing import LabelEncoder

def labelencoder(df):
    """Integer-encode every object-dtype column of ``df`` in place.

    For each column whose dtype is ``object``, missing values are first
    replaced with the string ``'N'`` and the column is then overwritten with
    the integer codes produced by a freshly fitted ``LabelEncoder``.
    Columns of any other dtype are left untouched.

    Args:
        df: DataFrame to encode. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        df[name] = df[name].fillna('N')
        raw_values = df[name].values
        # fit() returns the encoder itself, so fit and transform can be chained.
        df[name] = LabelEncoder().fit(list(raw_values)).transform(raw_values)
    return df

In [9]:
# Encode object columns (labelencoder modifies factors_data in place and returns it),
# then separate the target column from the remaining columns.
new_factors_data = labelencoder(factors_data)
data = new_factors_data
y = data['house_price_index']
X = data.drop(columns=['house_price_index'])


In [10]:
# Keep only complete rows and index them by date, then preview the first ten.
new_factors_data = (
    factors_data
    .dropna()
    .set_index('Date')
)
new_factors_data.head(10)

In [11]:
# Pairwise correlations between the columns of the date-indexed frame.
factors_corr = new_factors_data.corr()
# 1.0 on and above the diagonal, 0.0 below: hides the redundant upper triangle.
mask = np.triu(np.ones_like(factors_corr))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(12, 10))
    ax = sns.heatmap(
        factors_corr,
        mask=mask,
        vmin=-1,
        vmax=1,
        cmap='mako',
        annot=True,
        square=True,
        linewidths=.5,
    )
    plt.title('Correlation Heatmap of Macroeconomic factors', fontsize = 15)
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    #plt.tight_layout()

In [12]:
# The strongest positive and negative correlations can be read directly from this figure.

In [13]:
# Normalizing data: work on a copy of the frame without the Date column.
# NOTE(review): reset_index() without drop=True keeps the previous index as an
# `index` column, which then becomes part of the scaled data — confirm this is intended.
copy_factors_data = (
    factors_data
    .copy()
    .reset_index()
    .drop(columns=['Date'])
)


In [14]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every column, keeping the original column labels and row index.
scaler = MinMaxScaler()
normal_factors_data = pd.DataFrame(
    data=scaler.fit_transform(copy_factors_data),
    index=copy_factors_data.index,
    columns=copy_factors_data.columns,
)
normal_factors_data.tail(5)

In [15]:
# Per column: True if the scaled frame has at least one missing value in it.
normal_factors_data.isna().any()

In [16]:
# Columns whose missing values are replaced by the column's own mean.
mean_filled_columns = ['gdp', 'mortgage_rate', 'employment_rate', 'permit_new', 'ppi_res', 'm3', 'cci']

# Missing-value counts taken BEFORE imputation. They are kept in a variable and
# shown as the cell's last expression; a bare `.isna().sum()` in the middle of
# a cell is evaluated and then silently discarded.
missing_before_fill = normal_factors_data[mean_filled_columns].isna().sum()

# Same fill as before, written once instead of seven copy-pasted lines.
for column in mean_filled_columns:
    normal_factors_data[column] = normal_factors_data[column].fillna(normal_factors_data[column].mean())

missing_before_fill


In [17]:
# Number of missing values in the delinquency_rate column.
normal_factors_data['delinquency_rate'].isna().sum()

In [18]:
# Number of missing values in the hcai column.
normal_factors_data['hcai'].isna().sum()

In [19]:
#33% of the hcai column is missing, so filling it in would introduce bias into the model. Dropping it may be a good idea because this factor seems to be strongly correlated with others.
#Our best bet is to test the model score both with and without the hcai column.
#12% of the delinquency_rate column is missing. This is still workable if we fill the column, because the correlation is low but not zero.

In [20]:
# Build the modelling frame without the hcai column (drop returns a new frame; normal_factors_data is unchanged).
copy_n_factors_data = normal_factors_data.drop(columns=['hcai'])

In [21]:
# Fill the missing delinquency_rate values with the column mean.
# The mean must be the ARGUMENT of fillna: calling .mean() on the result of
# fillna(...) reduces the column to one scalar, and assigning that scalar
# overwrites every row with the same constant.
copy_n_factors_data['delinquency_rate'] = copy_n_factors_data['delinquency_rate'].fillna(
    copy_n_factors_data['delinquency_rate'].mean()
)

In [22]:
# Display the frame that will be split into training and test sets.
copy_n_factors_data

In [23]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing (fixed seed); the features are every
# column except the house_price_index target.
X_train, X_test, y_train, y_test = train_test_split(
    copy_n_factors_data.drop(columns="house_price_index"),
    copy_n_factors_data["house_price_index"],
    test_size=0.30,
    random_state=42,
)

In [24]:
from tensorflow.keras import models, layers, backend as K, utils
import shap

In [25]:
# Small fully connected regression network: n_features -> 6 -> 3 -> 1.
# The input width is read from the training frame instead of being hard-coded,
# so it always matches the feature columns produced by the preprocessing cells.
n_features = X_train.shape[1]
model = models.Sequential(name="Perceptron", layers=[
    layers.Dense(              # first hidden layer
        name="dense1",
        input_dim=n_features,  # only the first layer needs to be told the input width
        units=6,
        activation='relu',     # f(x) = max(0, x); lowercase is the identifier Keras documents
    ),
    layers.Dense(              # second hidden layer; its input size is inferred from dense1
        name="dense2",
        units=3,
        activation='relu',
    ),
    layers.Dense(              # output layer: one node for the single regression target
        name="dense3",
        units=1,
        # NOTE(review): a ReLU output cannot go below 0; the target is min-max
        # scaled so that range fits, but confirm a linear output is not preferred.
        activation='relu',
    ),
])
model.summary()

In [26]:
# define metrics
def R2(y_train, y_test):
    """Coefficient of determination, written with Keras backend ops.

    The function is passed to ``model.compile(metrics=[...])``, so despite the
    parameter names the first argument holds the true targets and the second
    the model's predictions for the same samples.

    Args:
        y_train: true target values.
        y_test: predicted values.

    Returns:
        ``1 - SS_res / (SS_tot + epsilon)``; the backend epsilon guards
        against division by zero when the targets are constant.
    """
    residual_ss = K.sum(K.square(y_train - y_test))
    total_ss = K.sum(K.square(y_train - K.mean(y_train)))
    return 1 - residual_ss / (total_ss + K.epsilon())
# compile the neural network: Adam optimizer, mean-absolute-error loss,
# and the custom R2 function tracked as a metric
model.compile(
    loss='mean_absolute_error',
    optimizer='adam',
    metrics=[R2],
)

In [27]:
# train/validation: 30% of the training rows are held back by Keras for validation
training = model.fit(x=X_train, y=y_train, batch_size=32, epochs=100, shuffle=True, verbose=0, validation_split=0.30)

# plot
# Names of the tracked metrics: every history key that is neither a loss nor a
# validation ("val_...") entry.
metrics = [k for k in training.history.keys() if ("loss" not in k) and ("val" not in k)]
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(15,3))

# One secondary y-axis per panel for the metric curves (loss stays on the primary axis).
ax11, ax22 = (panel.twinx() for panel in ax)

# (loss axis, score axis, panel title, history-key prefix); both panels are drawn by
# the same code, so the validation panel now gets the same legend as the training one.
panels = [
    (ax[0], ax11, "Training", ""),
    (ax[1], ax22, "Validation", "val_"),
]
for loss_ax, score_ax, panel_title, key_prefix in panels:
    loss_ax.set(title=panel_title)
    loss_ax.plot(training.history[key_prefix + 'loss'], color='black')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss', color='black')
    for metric in metrics:
        score_ax.plot(training.history[key_prefix + metric], label=metric)
    score_ax.set_ylabel("Score", color='steelblue')
    score_ax.legend()
plt.show()

In [28]:
# Draw the network architecture (with layer names and shapes) and write it to model.png.
utils.plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)

In [29]:
# Predict the (scaled) house price index for the held-out test rows.
preds = model.predict(X_test)

In [30]:
# Predicted vs. true values on the test set, with a y = x reference line.
# The model's output layer has a single unit, so predict() yields one column per
# sample; flatten it so predictions behave like a plain 1-D vector.
preds_flat = np.ravel(preds)

plt.figure(figsize=(10,10))
plt.scatter(y_test, preds_flat, c='crimson')
# NOTE(review): the data is min-max scaled, so exact zeros can occur and cannot
# be drawn on a log axis — confirm log scaling is really wanted here.
plt.yscale('log')
plt.xscale('log')

# Plain scalar bounds for the reference line. The builtin max()/min() over the
# 2-D prediction array returned 1-element arrays, so the line endpoints could
# be a mix of arrays and floats, which plt.plot may reject.
p1 = max(float(np.max(preds_flat)), float(np.max(y_test)))
p2 = min(float(np.min(preds_flat)), float(np.min(y_test)))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.title("Accuracy Plot")
plt.axis('equal')
plt.show()

In [31]:
# Evaluate on the held-out test set. The model was compiled with an MAE loss and
# the R2 metric, so this is presumably [loss, R2] — verify against the output.
score = model.evaluate(X_test, y_test, batch_size=16)
score