In [1]:
# California housing: regression of median_house_value with a dense Keras network
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Print NumPy arrays with 12 decimals, without scientific notation, on wide lines.
np.set_printoptions(precision=12, suppress=True, linewidth=150)
# Render pandas floats with six decimal places.
pd.options.display.float_format = '{:.6f}'.format
# Show the TensorFlow version the notebook is running on.
tf.__version__

'2.3.0'

In [26]:
# Read the housing CSV from its remote location and preview the first rows.
DATA_URL = 'https://storage.googleapis.com/esmartdata-courses-files/ann-course/housing.csv'
dataset = pd.read_csv(DATA_URL)
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [27]:
# Column dtypes and non-null counts; a quick way to spot columns with missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [28]:
# Share of missing values per column (mean of the boolean null mask).
dataset.isnull().mean()

longitude            0.000000
latitude             0.000000
housing_median_age   0.000000
total_rooms          0.000000
total_bedrooms       0.010029
population           0.000000
households           0.000000
median_income        0.000000
median_house_value   0.000000
ocean_proximity      0.000000
dtype: float64

In [29]:
# Discard every row containing at least one missing value, then re-check the
# per-column share of nulls.
dataset = dataset.dropna()

dataset.isnull().mean()

longitude            0.000000
latitude             0.000000
housing_median_age   0.000000
total_rooms          0.000000
total_bedrooms       0.000000
population           0.000000
households           0.000000
median_income        0.000000
median_house_value   0.000000
ocean_proximity      0.000000
dtype: float64

In [30]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
dataset.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [31]:
# Summary of the object-typed columns: count, number of unique values, most frequent value.
dataset.describe(include=['object'])

Unnamed: 0,ocean_proximity
count,20433
unique,5
top,<1H OCEAN
freq,9034


In [32]:
# Number of rows in each ocean_proximity category.
dataset['ocean_proximity'].value_counts()

<1H OCEAN     9034
INLAND        6496
NEAR OCEAN    2628
NEAR BAY      2270
ISLAND           5
Name: ocean_proximity, dtype: int64

In [33]:
# Distribution of median_house_value, the column later used as the label.
px.histogram(dataset, x='median_house_value')

In [34]:
# Remove the rows whose median_house_value is exactly 500001 (the maximum
# reported by describe()), then plot the distribution again.
is_capped = dataset['median_house_value'].eq(500001)
index_to_drop = dataset.index[is_capped]
dataset = dataset.drop(index=index_to_drop)
px.histogram(dataset, x='median_house_value')

In [35]:
# One-hot encode the object-typed column(s). The indicator dtype is pinned to
# uint8 so the new columns stay numeric on every pandas version: with boolean
# indicators (the default since pandas 2.0) describe() would leave them out of
# the statistics table and the later standardization would turn them into NaN.
dataset_dummies = pd.get_dummies(dataset, dtype=np.uint8)
dataset_dummies.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [36]:
# Split into training and test sets: 80% of the rows are sampled (fixed seed)
# for training, every remaining row goes to the test set.
train_dataset = dataset_dummies.sample(frac=0.8, random_state=0)
in_train = dataset_dummies.index.isin(train_dataset.index)
test_dataset = dataset_dummies.loc[~in_train]

print(f'train_dataset length: {len(train_dataset)}')
print(f'test_dataset length: {len(test_dataset)}')

train_dataset length: 15580
test_dataset length: 3895


In [37]:
# Pairwise scatter plots of the label against a few selected features,
# coloured by the label value.
px.scatter_matrix(
    train_dataset,
    dimensions=['median_house_value', 'housing_median_age', 'median_income', 'total_rooms'],
    color='median_house_value',
    height=700,
)

In [38]:
# Per-feature summary statistics of the training set (label column excluded),
# transposed so that each feature is a row and 'mean'/'std' are columns.
train_stats = (
    train_dataset
    .describe()
    .drop(columns='median_house_value')
    .transpose()
)

In [39]:
# Separate the label from the features. pop() removes the column from the
# frame in place, so after this cell train_dataset / test_dataset hold the
# features only, and running the cell a second time raises KeyError.
train_labels = train_dataset.pop('median_house_value')
test_labels = test_dataset.pop('median_house_value')

In [40]:
# Standardization (z-score)
def norm(x, stats=None):
    """Standardize the columns of ``x`` with precomputed mean/std statistics.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame whose columns match the index of ``stats``.
    stats : pandas.DataFrame, optional
        Table indexed by feature name with ``'mean'`` and ``'std'`` columns.
        When omitted, the notebook-level ``train_stats`` is used, so existing
        ``norm(x)`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        ``(x - mean) / std`` computed column by column.
    """
    if stats is None:
        stats = train_stats
    std = stats['std']
    # A feature that is constant in the reference data has std == 0; dividing
    # by it would yield NaN/inf. Such a feature is mapped to (x - mean) instead.
    std = std.mask(std == 0, 1.0)
    return (x - stats['mean']) / std

In [41]:
# Standardize both splits with norm(), which relies on the statistics in train_stats.
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [43]:
# Preview the standardized training features.
normed_train_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
19234,-1.560743,1.332727,-0.838742,0.338845,-0.091639,0.000695,0.012727,1.918729,1.141765,-0.707459,-0.013877,-0.346133,-0.37823
1859,-2.28349,2.93495,-0.598855,0.034607,0.089751,-0.200934,-0.018438,-0.889913,-0.875781,-0.707459,-0.013877,-0.346133,2.643727
51,-1.351396,1.011351,1.160315,-0.342501,-0.197646,-0.327599,-0.244389,-1.380585,-0.875781,-0.707459,-0.013877,2.888874,-0.37823
11192,0.811858,-0.8517,-0.039119,-0.080165,0.03557,0.350532,0.171152,-0.416818,1.141765,-0.707459,-0.013877,-0.346133,-0.37823
20355,0.298459,-0.679368,-0.998666,-0.370283,-0.456775,-0.736029,-0.532671,-1.077075,1.141765,-0.707459,-0.013877,-0.346133,-0.37823


In [44]:
# Convert the standardized frames to plain NumPy arrays before passing them to the model.
normed_train_data = normed_train_data.to_numpy()
normed_test_data = normed_test_data.to_numpy()

In [45]:
# Inspect the training feature array.
normed_train_data

array([[-1.560743238953,  1.332726839926, -0.838741814943, ..., -0.013877293686, -0.346133368881, -0.378229535826],
       [-2.283489512019,  2.934950272896, -0.598854942732, ..., -0.013877293686, -0.346133368881,  2.643727473489],
       [-1.351396042617,  1.01135062808 ,  1.16031545348 , ..., -0.013877293686,  2.888874361871, -0.378229535826],
       ...,
       [ 0.697215807244, -0.79580836027 , -0.279005779784, ..., -0.013877293686, -0.346133368881, -0.378229535826],
       [-1.241737987393,  0.913540476649, -0.998666396417, ..., -0.013877293686,  2.888874361871, -0.378229535826],
       [ 1.175723684584, -0.702655835098, -0.838741814943, ..., -0.013877293686, -0.346133368881, -0.378229535826]])

In [46]:
# Model construction
def build_model(input_dim=None):
    """Build and compile the fully connected regression network.

    Architecture: Dense(1024, relu, L2 kernel regularizer) -> Dense(512, relu)
    -> Dense(128, relu) -> Dense(1) with a linear output.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted it is taken from the number of
        columns of the notebook-level ``train_dataset``, so existing
        ``build_model()`` calls behave as before. In that case
        ``train_dataset`` must already hold the feature columns only.

    Returns
    -------
    tensorflow.keras.models.Sequential
        Model compiled with the Adam optimizer, MSE loss and MAE/MSE metrics.
    """
    if input_dim is None:
        input_dim = len(train_dataset.keys())

    model = Sequential([
        Dense(1024, kernel_regularizer='l2', activation='relu', input_shape=[input_dim]),
        Dense(512, activation='relu'),
        Dense(128, activation='relu'),
        Dense(1),
    ])

    # 'mse' is also tracked as a metric: plot_hist reads the 'mse'/'val_mse'
    # history entries, and the reported loss additionally contains the L2
    # penalty of the first layer.
    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['mae', 'mse'])
    return model

In [47]:
# Instantiate the network and list its layers with their parameter counts.
model = build_model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              14336     
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_2 (Dense)              (None, 128)               65664     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 604,929
Trainable params: 604,929
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Train for 15 epochs in mini-batches of 32; the last 20% of the training
# rows are held out by Keras as validation data.
history = model.fit(
    x=normed_train_data,
    y=train_labels.values,
    batch_size=32,
    epochs=15,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [49]:
def plot_hist(history):
    """Plot training vs. validation curves from a Keras ``History`` object.

    Two figures are shown, both with a logarithmic y axis: MAE against
    validation MAE, and RMSE against validation RMSE (the square roots of the
    recorded 'mse' / 'val_mse' values).
    """
    hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
    hist['rmse'] = np.sqrt(hist['mse'])
    hist['val_rmse'] = np.sqrt(hist['val_mse'])

    # (metric column, figure title, y-axis label) for each figure.
    panels = [
        ('mae', 'MAE vs. VAL_MAE', 'Mean Absolute Error'),
        ('rmse', 'RMSE vs. VAL_RMSE', 'Root Mean Squared Error'),
    ]
    for metric, title, y_label in panels:
        fig = go.Figure()
        for column in (metric, f'val_{metric}'):
            fig.add_trace(go.Scatter(x=hist['epoch'], y=hist[column], name=column, mode='markers+lines'))
        fig.update_layout(width=1000, height=500, title=title, xaxis_title='Epoki', yaxis_title=y_label, yaxis_type='log')
        fig.show()

# Show the learning curves of the training run stored in `history`.
plot_hist(history)

In [50]:
# Predict on the standardized test features and collapse the model output
# into a one-dimensional array.
test_predictions = model.predict(normed_test_data)
test_predictions = test_predictions.flatten()
test_predictions

array([229536.17 , 193060.7  , 191666.95 , ...,  96930.266, 114411.31 ,  68153.2  ], dtype=float32)

In [51]:
# Put the true labels and the model predictions side by side.
pred = test_labels.to_frame().assign(predictions=test_predictions)
pred.head()

Unnamed: 0,median_house_value,predictions
10,281500.0,229536.171875
13,191300.0,193060.703125
19,162900.0,191666.953125
26,93800.0,174515.609375
40,155400.0,177726.703125


In [53]:
# True value vs. prediction; the added line from (0, 0) to (500000, 500000)
# marks where prediction equals the true value.
fig = px.scatter(pred, x='median_house_value', y='predictions')
fig.add_trace(go.Scatter(x=[0, 500000], y=[0, 500000]))
fig.show()

In [54]:
# Signed error: true value minus prediction (positive means the model under-predicted).
pred['error'] = pred['median_house_value'].sub(pred['predictions'])
pred.head()

Unnamed: 0,median_house_value,predictions,error
10,281500.0,229536.171875,51963.828125
13,191300.0,193060.703125,-1760.703125
19,162900.0,191666.953125,-28766.953125
26,93800.0,174515.609375,-80715.609375
40,155400.0,177726.703125,-22326.703125


In [55]:
# Distribution of the prediction errors, with a rug plot in the margin.
px.histogram(pred, x='error', marginal='rug', width=1000)