In [59]:
## The program below was written by modifying the following code:
#    https://towardsdatascience.com/keras-101-a-simple-and-interpretable-neural-network-model-for-house-pricing-regression-31b1a77f05ae

In [60]:
# Set up

# Import dependencies
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import math
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

# Dataset is the boston housing dataset from keras.
merged_df = pd.read_csv("../Resources/merged_data.csv")

merged_df.head()

Unnamed: 0,sold_price,bathroom_ct,bedroom_ct,home_sqft,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Population 25 and Over,...,Rate 25 and Over w/ Some or Completed Middle School,Rate 25 and Over w/ Some High School,Rate 25 and Over w/ Completed High School or Equivalent,"Rate 25 and Over w/ Some college, less than 1 year","Rate 25 and Over w/ Some college, 1 or more years",Rate 25 and Over w/ Associate's degree,Rate 25 and Over w/ Bachelor's degree,Rate 25 and Over w/ Master's degree,Rate 25 and Over w/ Professional school degree,Rate 25 and Over w/ Doctorate degree
0,617700.0,2.5,3.0,2422.0,8419.0,39.5,42936.0,37766.0,0.196223,6750.0,...,0.019704,0.082074,0.186222,0.05037,0.178815,0.045185,0.225037,0.104444,0.034222,0.022667
1,585000.0,2.0,3.0,1666.0,8419.0,39.5,42936.0,37766.0,0.196223,6750.0,...,0.019704,0.082074,0.186222,0.05037,0.178815,0.045185,0.225037,0.104444,0.034222,0.022667
2,575000.0,2.0,2.0,2204.0,8419.0,39.5,42936.0,37766.0,0.196223,6750.0,...,0.019704,0.082074,0.186222,0.05037,0.178815,0.045185,0.225037,0.104444,0.034222,0.022667
3,789000.0,2.5,3.0,1930.0,8419.0,39.5,42936.0,37766.0,0.196223,6750.0,...,0.019704,0.082074,0.186222,0.05037,0.178815,0.045185,0.225037,0.104444,0.034222,0.022667
4,359000.0,2.5,2.0,1310.0,8419.0,39.5,42936.0,37766.0,0.196223,6750.0,...,0.019704,0.082074,0.186222,0.05037,0.178815,0.045185,0.225037,0.104444,0.034222,0.022667


In [61]:
# Data is split for testing and training, then normalized
X = merged_df.loc[:, merged_df.columns != 'sold_price']
y = merged_df.loc[:, merged_df.columns == 'sold_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=456)


# Normalization is done by replacing all X values with their normalized values
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)

X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

In [62]:
# Model is crafted
# Sequential is used to add depth, input shape uses the number of x_train columns, mean-square-error and linear activation are
#     used in the last layer as this is a linear regression.
model = Sequential()
model.add(Dense(256, input_shape=(21, ), activation='relu', name='dense_1'))
model.add(Dense(128, activation='relu', name='dense_2'))
#model.add(Dense(32, activation='relu', name='dense_3'))
model.add(Dense(1, activation='linear', name='dense_output'))
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               5632      
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_output (Dense)         (None, 1)                 129       
Total params: 38,657
Trainable params: 38,657
Non-trainable params: 0
_________________________________________________________________


In [63]:
# Model is trained
history = model.fit(X_train, y_train, epochs=100, validation_split=0.05)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100


Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [64]:
# Visual representation to make a simpler way of verifying model is not overfitting
fig = go.Figure()

fig.add_trace(go.Scattergl(y=history.history['loss'],
                    name='Train'))

fig.add_trace(go.Scattergl(y=history.history['val_loss'],
                    name='Valid'))

fig.update_layout(height=500, width=700,
                  xaxis_title='Epoch',
                  yaxis_title='Loss')
fig.show()

In [65]:
# Visual representation to make a simpler way of verifying model using mean absolute error
fig = go.Figure()

fig.add_trace(go.Scattergl(y=history.history['mae'],
                    name='Train'))

fig.add_trace(go.Scattergl(y=history.history['val_mae'],
                    name='Valid'))

fig.update_layout(height=500, width=700,
                  xaxis_title='Epoch',
                  yaxis_title='Mean Absolute Error')

fig.show() 

In [66]:
# Model is tested
mse_nn, mae_nn = model.evaluate(X_test, y_test)

print('Mean squared error on test data: ', mse_nn)
print('Mean absolute error on test data: ', mae_nn)

Mean squared error on test data:  7502981632.0
Mean absolute error on test data:  58093.5


In [67]:
# Model is saved
model.save("housing_model.h5")

In [68]:
# Save the predicted values in an easily accessed array 
predictions = model.predict(X_test)
predicted = [predict[0]for predict in predictions]
predicted

[348200.53,
 183478.05,
 306013.75,
 287713.16,
 246191.17,
 382767.38,
 467242.12,
 513971.44,
 459733.97,
 433150.12,
 236283.64,
 465566.6,
 380261.53,
 470735.78,
 267640.47,
 297885.9,
 646527.75,
 314130.4,
 426083.62,
 285215.72,
 222412.23,
 304118.62,
 306483.6,
 342015.3,
 411740.1,
 267031.78,
 233683.22,
 260042.56,
 452028.4,
 366147.6,
 269702.28,
 164981.38,
 394514.34,
 248253.34,
 117955.72,
 322709.97,
 314736.03,
 316012.03,
 290244.72,
 583675.44,
 193212.45,
 2099059.5,
 274528.22,
 415902.38,
 257888.92,
 312306.12,
 292929.94,
 452903.62,
 577469.75,
 445412.78,
 251343.67,
 293532.97,
 560451.0,
 364612.56,
 460587.03,
 283865.97,
 245184.64,
 237757.1,
 278318.34,
 524421.94,
 264362.66,
 212705.89,
 253639.28,
 198866.78,
 239536.4,
 319113.56,
 240703.34,
 685765.1,
 354120.34,
 211233.08,
 310778.06,
 253390.67,
 296641.97,
 344157.47,
 133136.25,
 219266.98,
 301473.8,
 283990.6,
 304818.2,
 423803.56,
 411219.25,
 307691.53,
 262542.5,
 273924.16,
 296972.

In [69]:
# Difference between expected an actual value
diff = (predicted-y_test["sold_price"])/predicted
diff

9185    -0.108557
10698    0.784715
10096    0.088276
5044     0.066779
7582     0.086076
           ...   
3039     0.272146
969     -0.057508
3416     0.365211
17343    0.045858
6544     0.072665
Name: sold_price, Length: 5354, dtype: float64

In [70]:
# Mean of the absolute values of the difference between expected and actual
abs(diff).mean()

0.1853599531993999

In [71]:
# Range of values for difference
print(diff.min())
print(diff.max())

-2.9591154707909357
0.951168606702192


In [72]:
print(mean)
print(std)

bathroom_ct                                                    2.215294
bedroom_ct                                                     3.167414
home_sqft                                                   1780.877902
Population                                                 37791.788711
Median Age                                                    37.140160
Household Income                                           57504.766533
Per Capita Income                                          28233.510809
Poverty Rate                                                   0.161099
Population 25 and Over                                     24473.959488
Rate 25 and Over w/ less than 1st grade                        0.018247
Rate 25 and Over w/ Some or Completed Elementary School        0.029605
Rate 25 and Over w/ Some or Completed Middle School            0.023870
Rate 25 and Over w/ Some High School                           0.085176
Rate 25 and Over w/ Completed High School or Equivalent        0

In [132]:
# Normalization Values are saved to csv
mean_trans = pd.DataFrame(mean).transpose()
mean_trans.to_csv("mean_norm.csv")
std_trans = pd.DataFrame(std).transpose()
std_trans.to_csv("std_norm.csv")