<a href="https://colab.research.google.com/github/MikolajWasowski/Housing-Prices/blob/main/tensorflow_keras_Sequential.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

np.set_printoptions(precision=12, suppress=True, linewidth=150)
pd.options.display.float_format = '{:.6f}'.format


In [25]:
raw_dataset = pd.read_csv('https://storage.googleapis.com/esmartdata-courses-files/ann-course/housing.csv')
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [26]:
dataset = raw_dataset.copy()
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [27]:
dataset.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [28]:
dataset.dropna(inplace=True)

In [29]:
dataset.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [30]:
dataset.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [31]:
px.histogram(dataset, x='median_house_value')

In [32]:
dataset.median_house_value.value_counts()

500001.000000    958
137500.000000    119
162500.000000    116
112500.000000    103
187500.000000     92
                ... 
359200.000000      1
51200.000000       1
39800.000000       1
377600.000000      1
47000.000000       1
Name: median_house_value, Length: 3833, dtype: int64

In [33]:
index_to_drop = dataset[dataset.median_house_value == 500001].index

In [34]:
dataset.drop(index=index_to_drop, inplace=True)

In [35]:
px.histogram(dataset, x='median_house_value')

In [37]:
dataset_dummies = pd.get_dummies(dataset, drop_first=True)
dataset_dummies.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,1,0


In [46]:
data_final = dataset_dummies.iloc[:, 2:]
data_final

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,41.000000,880.000000,129.000000,322.000000,126.000000,8.325200,452600.000000,0,0,1,0
1,21.000000,7099.000000,1106.000000,2401.000000,1138.000000,8.301400,358500.000000,0,0,1,0
2,52.000000,1467.000000,190.000000,496.000000,177.000000,7.257400,352100.000000,0,0,1,0
3,52.000000,1274.000000,235.000000,558.000000,219.000000,5.643100,341300.000000,0,0,1,0
4,52.000000,1627.000000,280.000000,565.000000,259.000000,3.846200,342200.000000,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
20635,25.000000,1665.000000,374.000000,845.000000,330.000000,1.560300,78100.000000,1,0,0,0
20636,18.000000,697.000000,150.000000,356.000000,114.000000,2.556800,77100.000000,1,0,0,0
20637,17.000000,2254.000000,485.000000,1007.000000,433.000000,1.700000,92300.000000,1,0,0,0
20638,18.000000,1860.000000,409.000000,741.000000,349.000000,1.867200,84700.000000,1,0,0,0


In [47]:
from sklearn.model_selection import train_test_split

In [48]:
target = data_final.pop("median_house_value")
target

0       452600.000000
1       358500.000000
2       352100.000000
3       341300.000000
4       342200.000000
             ...     
20635    78100.000000
20636    77100.000000
20637    92300.000000
20638    84700.000000
20639    89400.000000
Name: median_house_value, Length: 19475, dtype: float64

In [58]:
X_train, X_test, y_train, y_test = train_test_split(data_final, target)
X_train

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
15506,9.000000,5685.000000,1442.000000,3773.000000,1250.000000,3.042600,0,0,0,0
19143,23.000000,2846.000000,516.000000,1526.000000,492.000000,3.733000,0,0,0,0
11217,34.000000,3903.000000,717.000000,2054.000000,716.000000,4.273100,0,0,0,0
13501,32.000000,2699.000000,552.000000,2086.000000,551.000000,2.297400,1,0,0,0
2233,8.000000,3468.000000,675.000000,1604.000000,626.000000,4.207100,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
17767,22.000000,2914.000000,768.000000,2962.000000,762.000000,2.203100,0,0,0,0
1398,37.000000,1784.000000,313.000000,788.000000,304.000000,4.291700,0,0,1,0
3576,20.000000,6331.000000,1537.000000,2957.000000,1509.000000,3.389200,0,0,0,0
8419,32.000000,1254.000000,399.000000,1281.000000,386.000000,2.297600,0,0,0,0


In [50]:
from sklearn.preprocessing import StandardScaler

In [59]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
X_train_std


array([[-1.542996573031,  1.443811956209,  2.183017458146, ..., -0.018505196526, -0.345426530659, -0.376217575256],
       [-0.424891355838,  0.109642747979, -0.053504766497, ..., -0.018505196526, -0.345426530659, -0.376217575256],
       [ 0.453619886242,  0.60637288292 ,  0.431960640796, ..., -0.018505196526, -0.345426530659, -0.376217575256],
       ...,
       [-0.664485330951,  1.74739536886 ,  2.412466282488, ..., -0.018505196526, -0.345426530659, -0.376217575256],
       [ 0.2938905695  , -0.638507086295, -0.336089108056, ..., -0.018505196526, -0.345426530659, -0.376217575256],
       [ 1.012672494839, -0.518201585306, -0.309521349448, ..., -0.018505196526, -0.345426530659, -0.376217575256]])

In [72]:
def build_model():
    model = Sequential()
    model.add(Dense(1024, kernel_regularizer='l2', activation='relu', input_shape=[len(X_train.keys())]))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1))

    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['mae', 'mse'])
    return model

In [73]:
model = build_model()
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 1024)              11264     
                                                                 
 dense_5 (Dense)             (None, 512)               524800    
                                                                 
 dense_6 (Dense)             (None, 128)               65664     
                                                                 
 dense_7 (Dense)             (None, 1)                 129       
                                                                 
Total params: 601857 (2.30 MB)
Trainable params: 601857 (2.30 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [74]:
history = model.fit(X_train_std, y_train.values, epochs=100, validation_split=0.2, verbose=1, batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [75]:
def plot_hist(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    hist['rmse'] = np.sqrt(hist['mse'])
    hist['val_rmse'] = np.sqrt(hist['val_mse'])

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['mae'], name='mae', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_mae'], name='val_mae', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='MAE vs. VAL_MAE', xaxis_title='Epochs', yaxis_title='Mean Absolute Error', yaxis_type='log')
    fig.show()

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['rmse'], name='rmse', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_rmse'], name='val_rmse', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='RMSE vs. VAL_RMSE', xaxis_title='Epochs', yaxis_title='Root Mean Squared Error', yaxis_type='log')
    fig.show()

plot_hist(history)


In [76]:
for name, value in zip(model.metrics_names, model.evaluate(X_test_std, y_test.values)):
    print(f'{name:8}{value:.4f}')

loss    3136321536.0000
mae     40120.7656
mse     3136321536.0000


In [77]:

test_predictions = model.predict(X_test_std).flatten()
test_predictions



array([137982.4 , 339155.9 , 146059.64, ..., 103922.57, 211805.31, 290847.12], dtype=float32)

In [78]:
pred = pd.DataFrame(y_test)
pred['predictions'] = test_predictions
pred.head()

Unnamed: 0,median_house_value,predictions
8205,152700.0,137982.40625
9342,414300.0,339155.90625
2616,99600.0,146059.640625
16153,346400.0,241938.453125
14862,150000.0,190217.1875


In [79]:
fig = px.scatter(pred, 'median_house_value', 'predictions')
fig.add_trace(go.Scatter(x=[0, 500000], y=[0, 500000], mode='lines'))
fig.show()

In [80]:
pred['error'] = pred['median_house_value'] - pred['predictions']
pred.head()

Unnamed: 0,median_house_value,predictions,error
8205,152700.0,137982.40625,14717.59375
9342,414300.0,339155.90625,75144.09375
2616,99600.0,146059.640625,-46459.640625
16153,346400.0,241938.453125,104461.546875
14862,150000.0,190217.1875,-40217.1875


In [82]:
px.histogram(pred, 'error', marginal='rug', width=1000)