<a href="https://colab.research.google.com/github/Lukas-Swc/neural-network-course/blob/main/05_regression/01_housing_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Problem regresji - prognoza ceny mieszkań

Celem tego notebook'a jest zapoznanie się z problemami regresji. W odróżnieniu od problemów klasyfikacji, gdzie staramy się przyporządkować daną klasę, w modelach regresji przewidujemy ciągłą zmienną docelową.

### Spis treści:
1. [Import bibliotek](#a0)
2. [Załadowanie danych i wstępna eksploracja](#a1)
3. [Podział na zbiór treningowy oraz testowy](#a2)
4. [Standaryzacja danych](#a3)
5. [Budowa modelu](#a4)
6. [Trenowanie sieci](#a5)





### <a name='a0'></a> 1. Import biblotek

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf

np.set_printoptions(precision=12, suppress=True, linewidth=150)
pd.options.display.float_format = '{:.6f}'.format
tf.__version__

'2.18.0'

### <a name='a1'></a> 2. Załadowanie danych i wstępna eksploracja

In [2]:
raw_dataset = pd.read_csv('https://storage.googleapis.com/esmartdata-courses-files/ann-course/housing.csv')
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [3]:
dataset = raw_dataset.copy()
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
dataset.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND
20639,-121.24,39.37,16.0,2785.0,616.0,1387.0,530.0,2.3886,89400.0,INLAND


In [5]:
dataset.isnull().sum() / len(dataset)

Unnamed: 0,0
longitude,0.0
latitude,0.0
housing_median_age,0.0
total_rooms,0.0
total_bedrooms,0.010029
population,0.0
households,0.0
median_income,0.0
median_house_value,0.0
ocean_proximity,0.0


In [6]:
dataset.dropna(inplace=True)

dataset.isnull().sum() / len(dataset)

Unnamed: 0,0
longitude,0.0
latitude,0.0
housing_median_age,0.0
total_rooms,0.0
total_bedrooms,0.0
population,0.0
households,0.0
median_income,0.0
median_house_value,0.0
ocean_proximity,0.0


In [7]:
dataset.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [8]:
dataset.describe(include=['object'])

Unnamed: 0,ocean_proximity
count,20433
unique,5
top,<1H OCEAN
freq,9034


In [9]:
dataset.ocean_proximity.value_counts()

Unnamed: 0_level_0,count
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,9034
INLAND,6496
NEAR OCEAN,2628
NEAR BAY,2270
ISLAND,5


In [10]:
px.histogram(dataset, x='median_house_value')

In [11]:
dataset.median_house_value.value_counts()

Unnamed: 0_level_0,count
median_house_value,Unnamed: 1_level_1
500001.000000,958
137500.000000,119
162500.000000,116
112500.000000,103
187500.000000,92
...,...
321700.000000,1
300800.000000,1
393100.000000,1
203600.000000,1


In [12]:
index_to_drop = dataset[dataset.median_house_value == 500001].index
dataset = dataset.drop(index=index_to_drop)
px.histogram(dataset, x='median_house_value')

In [13]:
dataset_dummies = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset_dummies.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,1,0


### <a name='a2'></a> 3. Podział na zbiór treningowy oraz testowy

In [14]:
train_dataset = dataset_dummies.sample(frac=0.8, random_state=0)
test_dataset = dataset_dummies.drop(train_dataset.index)

print(f'train_dataset length: {len(train_dataset)}')
print(f'test_dataset length: {len(test_dataset)}')

train_dataset length: 15580
test_dataset length: 3895


In [15]:
train_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
19234,-122.69,38.51,18.0,3364.0,501.0,1442.0,506.0,6.6854,313000.0,0,0,0,0
1859,-124.14,41.95,21.0,2696.0,578.0,1208.0,494.0,2.275,122400.0,0,0,0,1
51,-122.27,37.82,43.0,1868.0,456.0,1061.0,407.0,1.5045,93800.0,0,0,1,0
11192,-117.93,33.82,28.0,2444.0,555.0,1848.0,567.0,3.0179,198800.0,0,0,0,0
20355,-118.96,34.19,16.0,1807.0,346.0,587.0,296.0,1.9811,162500.0,0,0,0,0


In [16]:
test_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
10,-122.26,37.85,52.0,2202.0,434.0,910.0,402.0,3.2031,281500.0,0,0,1,0
13,-122.26,37.84,52.0,696.0,191.0,345.0,174.0,2.6736,191300.0,0,0,1,0
19,-122.27,37.84,52.0,1503.0,298.0,690.0,275.0,2.6033,162900.0,0,0,1,0
26,-122.28,37.85,49.0,1130.0,244.0,607.0,239.0,2.4597,93800.0,0,0,1,0
40,-122.26,37.83,52.0,1665.0,419.0,946.0,395.0,2.0978,155400.0,0,0,1,0


In [17]:
px.scatter_matrix(train_dataset, dimensions=['median_house_value', 'housing_median_age', 'median_income', 'total_rooms'], color='median_house_value', height=700)

In [18]:
train_stats = train_dataset.describe()
train_stats.pop('median_house_value')
train_stats = train_stats.transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,15580.0,-119.55878,2.006237,-124.35,-121.75,-118.49,-117.99,-114.47
latitude,15580.0,35.648614,2.147016,32.55,33.93,34.27,37.73,41.95
housing_median_age,15580.0,28.489217,12.505895,1.0,18.0,29.0,37.0,52.0
total_rooms,15580.0,2620.014506,2195.654212,2.0,1441.0,2112.0,3119.0,39320.0
total_bedrooms,15580.0,539.900578,424.499548,2.0,299.0,436.0,647.0,6445.0
population,15580.0,1441.193068,1160.542775,3.0,801.0,1179.0,1746.0,35682.0
households,15580.0,501.099487,385.039835,2.0,283.0,411.0,605.0,6082.0
median_income,15580.0,3.672427,1.570297,0.4999,2.519975,3.4405,4.581425,15.0001
ocean_proximity_INLAND,15580.0,0.333569,0.471503,0.0,0.0,0.0,1.0,1.0
ocean_proximity_ISLAND,15580.0,0.000193,0.013876,0.0,0.0,0.0,0.0,1.0


In [19]:
train_labels = train_dataset.pop('median_house_value')
test_labels = test_dataset.pop('median_house_value')

### <a name='a3'></a> 4. Standaryzacja danych

In [20]:
def norm(x):
    return (x - train_stats['mean']) / train_stats['std']

In [21]:
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [22]:
normed_train_data.isnull().sum()

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,0
population,0
households,0
median_income,0
ocean_proximity_INLAND,0
ocean_proximity_ISLAND,0


In [23]:
normed_train_data.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
19234,-1.560743,1.332727,-0.838742,0.338845,-0.091639,0.000695,0.012727,1.918729,-0.707459,-0.013877,-0.346133,-0.37823
1859,-2.28349,2.93495,-0.598855,0.034607,0.089751,-0.200934,-0.018438,-0.889913,-0.707459,-0.013877,-0.346133,2.643727
51,-1.351396,1.011351,1.160315,-0.342501,-0.197646,-0.327599,-0.244389,-1.380585,-0.707459,-0.013877,2.888874,-0.37823


In [24]:
normed_test_data.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
10,-1.346412,1.025324,1.879976,-0.190383,-0.249472,-0.457711,-0.257375,-0.298878,-0.707459,-0.013877,2.888874,-0.37823
13,-1.346412,1.020666,1.879976,-0.876283,-0.82191,-0.944552,-0.849521,-0.636075,-0.707459,-0.013877,2.888874,-0.37823
19,-1.351396,1.020666,1.879976,-0.508739,-0.569849,-0.647277,-0.587211,-0.680844,-0.707459,-0.013877,2.888874,-0.37823


In [25]:
normed_test_data = normed_test_data.values
normed_train_data = normed_train_data.values

### <a name='a4'></a> 5. Budowa modelu

In [26]:
def build_model():
  model = tf.keras.models.Sequential([
            # Use the shape of the training data to determine the input shape
            tf.keras.layers.Dense(1024, activation='relu', kernel_regularizer=tf.keras.regularizers.l2, input_shape=(normed_train_data.shape[1],)),
            tf.keras.layers.Dense(512, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(1)
            ])

  model.compile(optimizer='adam',
                loss='mse',
                metrics=['mae', 'mse'])
  return model

In [27]:
model = build_model()
model.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



### <a name='a5'></a> 6. Trenowanie sieci

In [28]:
history = model.fit(normed_train_data, train_labels.values, epochs=150, validation_split=0.2, verbose=1, batch_size=32)

Epoch 1/150
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 15ms/step - loss: 31265906688.0000 - mae: 143951.0000 - mse: 31265906688.0000 - val_loss: 4781991936.0000 - val_mae: 49833.2617 - val_mse: 4781991936.0000
Epoch 2/150
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - loss: 4356649984.0000 - mae: 47808.5430 - mse: 4356649984.0000 - val_loss: 3839916544.0000 - val_mae: 45722.3789 - val_mse: 3839916544.0000
Epoch 3/150
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - loss: 3718796032.0000 - mae: 44284.0781 - mse: 3718796032.0000 - val_loss: 3606910720.0000 - val_mae: 44233.0586 - val_mse: 3606910720.0000
Epoch 4/150
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - loss: 3821595392.0000 - mae: 43952.3828 - mse: 3821595392.0000 - val_loss: 3535153152.0000 - val_mae: 43477.1602 - val_mse: 3535153152.0000
Epoch 5/150
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[

In [29]:
def plot_hist(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    hist['rmse'] = np.sqrt(hist['mse'])
    hist['val_rmse'] = np.sqrt(hist['val_mse'])

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['mae'], name='mae', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_mae'], name='val_mae', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='MAE vs. VAL_MAE', xaxis_title='Epoki', yaxis_title='Mean Absolute Error', yaxis_type='log')
    fig.show()

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['rmse'], name='rmse', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_rmse'], name='val_rmse', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='RMSE vs. VAL_RMSE', xaxis_title='Epoki', yaxis_title='Root Mean Squared Error', yaxis_type='log')
    fig.show()

plot_hist(history)

In [30]:
for name, value in zip(model.metrics_names, model.evaluate(normed_test_data, test_labels.values)):
    print(f'{name:8}{value:.4f}')

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 2142464512.0000 - mae: 31618.6719 - mse: 2142464512.0000
loss    2231467008.0000
compile_metrics32431.0605


In [31]:
test_predictions = model.predict(normed_test_data).flatten()
test_predictions

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


array([211950.05, 167345.64, 149483.55, ...,  84230.23, 125128.25,  66750.76], dtype=float32)

In [32]:
pred = pd.DataFrame(test_labels)
pred['predictions'] = test_predictions
pred.head()

Unnamed: 0,median_house_value,predictions
10,281500.0,211950.046875
13,191300.0,167345.640625
19,162900.0,149483.546875
26,93800.0,128152.90625
40,155400.0,157831.453125


In [33]:
fig = px.scatter(pred, 'median_house_value', 'predictions')
fig.add_trace(go.Scatter(x=[0, 500000], y=[0, 500000], mode='lines'))
fig.show()

In [34]:
pred.head()

Unnamed: 0,median_house_value,predictions
10,281500.0,211950.046875
13,191300.0,167345.640625
19,162900.0,149483.546875
26,93800.0,128152.90625
40,155400.0,157831.453125


In [35]:
pred['error'] = pred['median_house_value'] - pred['predictions']
pred.head()

Unnamed: 0,median_house_value,predictions,error
10,281500.0,211950.046875,69549.953125
13,191300.0,167345.640625,23954.359375
19,162900.0,149483.546875,13416.453125
26,93800.0,128152.90625,-34352.90625
40,155400.0,157831.453125,-2431.453125


In [36]:
px.histogram(pred, 'error', marginal='rug', width=1000)