In [1]:
import pandas as pd
import tensorflow as tf
from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
# Read the housing dataset from the sibling data directory and preview the first ten rows.
df = pd.read_csv(Path("..") / "data" / "housing_price_dataset.csv")
df.head(10)

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065
5,2095,2,3,Suburb,2020,198208.803907
6,2724,2,1,Suburb,1993,343429.31911
7,2044,4,3,Rural,1957,184992.321268
8,2638,4,3,Urban,1959,377998.588152
9,1121,5,2,Urban,2004,95961.926014


In [3]:
# Count the distinct values per column (missing values are not counted).
df.nunique(axis=0, dropna=True)

SquareFeet       2000
Bedrooms            4
Bathrooms           3
Neighborhood        3
YearBuilt          72
Price           50000
dtype: int64

In [4]:
# Frequency of each SquareFeet value, most common first.
square_feet_vc = df["SquareFeet"].value_counts(sort=True, ascending=False)
square_feet_vc

2486    43
1897    41
2946    40
2573    40
1398    40
        ..
1019    12
1635    12
2330    12
2087    11
1828    10
Name: SquareFeet, Length: 2000, dtype: int64

In [5]:
# Frequency of each YearBuilt value, most common first.
year_built_vc = df["YearBuilt"].value_counts(sort=True, ascending=False)
year_built_vc

1968    777
1977    742
1983    741
1967    729
1999    728
       ... 
1950    643
2016    642
2006    640
2012    638
2001    621
Name: YearBuilt, Length: 72, dtype: int64

In [6]:
# Frequency of each Neighborhood category, most common first.
neighborhood_vc = df["Neighborhood"].value_counts(sort=True, ascending=False)
neighborhood_vc

Suburb    16721
Rural     16676
Urban     16603
Name: Neighborhood, dtype: int64

In [7]:
# Frequency of each Bedrooms count, most common first.
bedroom_vc = df["Bedrooms"].value_counts(sort=True, ascending=False)
bedroom_vc

3    12661
5    12468
2    12436
4    12435
Name: Bedrooms, dtype: int64

In [8]:
# Frequency of each Bathrooms count, most common first.
bathroom_vc = df["Bathrooms"].value_counts(sort=True, ascending=False)
bathroom_vc

1    16755
2    16719
3    16526
Name: Bathrooms, dtype: int64

In [9]:
# One-hot encode the categorical column(s) of df.
# The indicator dtype is pinned to uint8 so the result is numeric 0/1 on every
# pandas version: pandas >= 2.0 defaults to bool, and a frame mixing int and
# bool columns converts to an object array that Keras typically cannot turn
# into a tensor.
full_df = pd.get_dummies(df, dtype="uint8")
full_df.head(10)

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,YearBuilt,Price,Neighborhood_Rural,Neighborhood_Suburb,Neighborhood_Urban
0,2126,4,1,1969,215355.283618,1,0,0
1,2459,3,2,1980,195014.221626,1,0,0
2,1860,2,1,1970,306891.012076,0,1,0
3,2294,2,1,1996,206786.787153,0,0,1
4,2130,5,2,2001,272436.239065,0,1,0
5,2095,2,3,2020,198208.803907,0,1,0
6,2724,2,1,1993,343429.31911,0,1,0
7,2044,4,3,1957,184992.321268,1,0,0
8,2638,4,3,1959,377998.588152,0,0,1
9,1121,5,2,2004,95961.926014,0,0,1


In [10]:
# Separate the encoded frame into model inputs (every column except Price)
# and the regression target (Price, kept as a one-column DataFrame).
X = full_df.drop(columns=["Price"])
y = full_df.loc[:, ["Price"]]

print("Features")
print(X.head(10))
print("\nValues")
print(y)

Features
   SquareFeet  Bedrooms  Bathrooms  YearBuilt  Neighborhood_Rural  \
0        2126         4          1       1969                   1   
1        2459         3          2       1980                   1   
2        1860         2          1       1970                   0   
3        2294         2          1       1996                   0   
4        2130         5          2       2001                   0   
5        2095         2          3       2020                   0   
6        2724         2          1       1993                   0   
7        2044         4          3       1957                   1   
8        2638         4          3       1959                   0   
9        1121         5          2       2004                   0   

   Neighborhood_Suburb  Neighborhood_Urban  
0                    0                   0  
1                    0                   0  
2                    1                   0  
3                    0                   1  
4     

In [11]:
# Hold out a test set; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=50,
)
print(X_train)
print(y_train)

       SquareFeet  Bedrooms  Bathrooms  YearBuilt  Neighborhood_Rural  \
49236        2826         2          3       1976                   0   
24779        2665         5          3       1985                   1   
35795        2506         3          2       2010                   0   
44754        2971         3          2       1969                   0   
44356        1910         3          1       1958                   0   
...           ...       ...        ...        ...                 ...   
48417        1989         3          3       1965                   0   
22637        2436         3          1       1987                   0   
42891        2805         2          3       1982                   0   
38368        1226         5          2       2016                   0   
14000        2660         3          3       1996                   0   

       Neighborhood_Suburb  Neighborhood_Urban  
49236                    1                   0  
24779                    

In [12]:
def adam(learn_rate = 0.001):
    """Build an Adam optimizer with the requested learning rate.

    Args:
        learn_rate: Step size handed to the optimizer. Defaults to 0.001,
            the default learning rate for the Adam optimizer in tf.

    Returns:
        A new ``tf.keras.optimizers.Adam`` instance.
    """
    optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate)
    return optimizer

In [16]:
# Seed TensorFlow's global RNG before any layer is created so the initial
# weights are the same every time this cell runs; without a seed each run
# starts from a different random model and the results are not repeatable.
tf.random.set_seed(50)

nn = tf.keras.models.Sequential()

# model for the neural network:
# len(X.columns) inputs -> 10 -> 20 -> 10 -> 10 -> 1, ReLU on every layer
nn.add(tf.keras.layers.Dense(units=10, activation='relu', input_dim=len(X.columns)))

nn.add(tf.keras.layers.Dense(units=20, activation='relu'))

nn.add(tf.keras.layers.Dense(units=10, activation='relu'))

nn.add(tf.keras.layers.Dense(units=10, activation='relu'))

# NOTE(review): a ReLU output unit trained with MSLE can get stuck predicting 0
# (no gradient once the pre-activation is negative). A training loss that stays
# near 150 is the symptom; if that happens, consider scaling the inputs and/or a
# strictly positive output activation. Architecture left unchanged here.
nn.add(tf.keras.layers.Dense(units=1, activation='relu'))

nn.summary()

# Mean squared logarithmic error is both the optimised loss and the reported
# metric; Adam runs with a learning rate of 0.0015 instead of its 0.001 default.
nn.compile(loss="msle", optimizer=adam(0.0015), metrics=["msle"])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 10)                80        
                                                                 
 dense_7 (Dense)             (None, 20)                220       
                                                                 
 dense_8 (Dense)             (None, 10)                210       
                                                                 
 dense_9 (Dense)             (None, 10)                110       
                                                                 
 dense_10 (Dense)            (None, 1)                 11        
                                                                 
Total params: 631
Trainable params: 631
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Train on the training split for 50 epochs; fit's return value is kept in nn_fit.
nn_fit = nn.fit(x=X_train, y=y_train, epochs=50)

Epoch 1/50
Epoch 2/50
  32/1172 [..............................] - ETA: 1s - loss: 150.1799 - msle: 150.1799

In [15]:
# Score the trained network on the held-out test split
# (verbose=2 prints one summary line instead of a progress bar).
model_loss, model_msle = nn.evaluate(x=X_test, y=y_test, verbose=2)
print(f"Loss: {model_loss}, MSLE: {model_msle}")

391/391 - 1s - loss: 0.1353 - msle: 0.1353 - 1s/epoch - 3ms/step
Loss: 0.13533152639865875, MSLE: 0.13533152639865875


In [19]:
# Predict the price of a single hand-written house.
# Feature order: square feet, bedrooms, bathrooms, year built, rural, suburb, urban
value = nn.predict(
    [[2700, 6, 4, 1999, 0, 0, 1]]
)
# One row in, one output unit: the prediction is the only element of the result.
value[0, 0]



295734.06

So far, the optimized model's MSLE is `0.006` lower than the basic model's.

In [20]:
# Create the output folder first: writing the HDF5 file fails if "models/"
# does not exist yet (e.g. on a fresh checkout).
Path("models").mkdir(parents=True, exist_ok=True)
nn.save(filepath="models/optimized_model_6.h5", save_format="h5")