In [1]:
# import the necessary packages
import glob
import locale
import os

import cv2
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, Activation
from keras.layers import Input
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler

Introduction to the House Price Estimation Dataset
This dataset was introduced in the 2016 paper "House Price Estimation from Visual and Textual Features".


##### https://github.com/emanhamed/Houses-dataset
##### https://arxiv.org/pdf/1609.08399.pdf

In [2]:
# The listing file has no header row and is space-delimited, so the
# column names are supplied explicitly
cols = ["bedrooms", "bathrooms", "area", "zipcode", "price"]
data_url = "https://raw.githubusercontent.com/emanhamed/Houses-dataset/master/Houses%20Dataset/HousesInfo.txt"
df = pd.read_csv(data_url, sep=" ", header=None, names=cols)

# Preview the first rows
df.head()

Unnamed: 0,bedrooms,bathrooms,area,zipcode,price
0,4,4.0,4053,85255,869500
1,4,3.0,3343,36372,865200
2,3,4.0,3923,85266,889000
3,5,5.0,4022,85262,910000
4,3,4.0,4116,85266,971226


In [11]:
# Distinct zipcodes together with the number of rows carrying each one
zipcodes, counts = np.unique(df["zipcode"].to_numpy(), return_counts=True)

In [12]:
# Number of listings per zipcode, most frequent first
df["zipcode"].value_counts()

Unnamed: 0_level_0,count
zipcode,Unnamed: 1_level_1
92276,100
93510,60
93446,54
92880,49
94501,41
91901,32
92677,26
94531,22
85255,12
96019,12


In [13]:
# (rows, columns) of the listings table before any filtering
df.shape

(535, 5)

In [14]:
# Keep only zipcodes with at least 25 listings. The per-zipcode counts are
# computed once, and the surviving rows are selected with a boolean mask
# instead of dropping rows from `df` in place
zip_counts = df["zipcode"].value_counts()
low_count_zips = zip_counts[zip_counts < 25].index
df = df[~df["zipcode"].isin(low_count_zips)]



In [15]:
# (rows, columns) after the low-count zipcode rows were removed
df.shape

(362, 5)

In [16]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split the same from run to run
train, test = train_test_split(df, test_size=0.25, random_state=42)
for split in (train, test):
    print(split.shape)

(271, 5)
(91, 5)


## Preprocessing

In [17]:
# Divide every price by the largest price seen in the *training* split, so
# the largest training target becomes 1. The test prices are divided by
# that same constant.
maxPrice = train["price"].max()
trainY, testY = (split["price"] / maxPrice for split in (train, test))

In [18]:
# Numeric feature columns to be rescaled
continuous = ["bedrooms", "bathrooms", "area"]

# Perform min-max scaling: the scaler is fitted on the training rows only,
# and both splits are then transformed with that fitted scaler
scaler = MinMaxScaler()
scaler.fit(train[continuous])
trainContinuous = scaler.transform(train[continuous])
testContinuous = scaler.transform(test[continuous])

In [19]:
# One-hot encode the zipcode column. The binarizer is fitted on the
# zipcodes of the whole (filtered) frame and then applied to each split.
zipBinarizer = LabelBinarizer()
zipBinarizer.fit(df["zipcode"])
trainCategorical, testCategorical = (
    zipBinarizer.transform(split["zipcode"]) for split in (train, test)
)

In [20]:
# Zipcode labels the binarizer was fitted on
zipBinarizer.classes_

array([91901, 92276, 92677, 92880, 93446, 93510, 94501])

In [21]:
# Shape of the encoded training zipcodes
trainCategorical.shape

(271, 7)

In [22]:
# Assemble the feature matrices column-wise: the encoded zipcode columns
# come first, followed by the scaled numeric columns
trainX = np.concatenate((trainCategorical, trainContinuous), axis=1)
testX = np.concatenate((testCategorical, testContinuous), axis=1)

for matrix in (trainX, testX):
    print(matrix.shape)

(271, 10)
(91, 10)


## Model Architecture

In [25]:
# Number of input features per sample (columns of the training matrix)
dim = trainX.shape[1]

# define our MLP network: two ReLU hidden layers (8 and 4 units) and a
# single linear output unit for the regressed price. The input shape is
# declared with an explicit Input layer rather than `input_dim=` on the
# first Dense layer, which recent Keras versions warn about.
model = Sequential([
    Input(shape=(dim,)),
    Dense(8, activation="relu"),
    Dense(4, activation="relu"),
    Dense(1, activation="linear"),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Compile Model

In [26]:
# `Adam` is the class imported from `keras.optimizers` in the imports cell,
# i.e. the same package that provides the model and its layers; it is not
# re-imported from a different namespace here
opt = Adam(learning_rate=1e-3)

# Train against the mean absolute percentage error between the predicted
# and the actual (scaled) prices
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)

## Training Model

In [27]:
# Train for 200 epochs in mini-batches of 8 samples; the test split is
# passed as validation data, so its loss is reported after every epoch
model.fit(
    x=trainX,
    y=trainY,
    validation_data=(testX, testY),
    epochs=200,
    batch_size=8,
)

Epoch 1/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - loss: 359.5607 - val_loss: 87.7315
Epoch 2/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 78.5842 - val_loss: 59.4321
Epoch 3/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 60.8043 - val_loss: 50.8352
Epoch 4/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 53.0693 - val_loss: 49.9522
Epoch 5/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 52.5476 - val_loss: 45.1922
Epoch 6/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 44.8414 - val_loss: 42.1018
Epoch 7/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 45.6417 - val_loss: 40.5503
Epoch 8/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 34.8826 - val_loss: 39.6934
Epoch 9/200
[1m34/34[0m [32

<keras.src.callbacks.history.History at 0x7de0760cfb10>

In [29]:
# Predict scaled prices for the test features (the evaluation cell below
# repeats this same call)
preds = model.predict(testX)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


In [30]:
# make predictions on the testing data
preds = model.predict(testX)

# compute the difference between the *predicted* house prices and the
# *actual* house prices (both on the scaled target), then express it as a
# percentage of the actual price and take its absolute value
diff = preds.flatten() - testY
percentDiff = (diff / testY) * 100
absPercentDiff = np.abs(percentDiff)

# compute the mean and standard deviation of the absolute percentage
# difference
mean = np.mean(absPercentDiff)
std = np.std(absPercentDiff)

# finally, show some statistics on our model. The dollar amounts are
# rendered with a plain format spec instead of locale.currency(), so this
# cell neither requires the "en_US.UTF-8" locale to be installed nor
# changes the process-wide locale
print("avg. house price: ${:,.2f}, std house price: ${:,.2f}".format(
    df["price"].mean(),
    df["price"].std()))
print("mean: {:.2f}%, std: {:.2f}%".format(mean, std))

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
avg. house price: $533,388.27, std house price: $493,403.08
mean: 22.97%, std: 25.22%
