<a href="https://colab.research.google.com/github/MattWroclaw/neural-networks/blob/main/05_regression/01_housing_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Problem regresji - prognoza ceny mieszkań

Celem tego notebook'a jest zapoznanie się z problemami regresji. W odróżnieniu od problemów klasyfikacji, gdzie staramy się przyporządkować daną klasę, w modelach regresji przewidujemy ciągłą zmienną docelową.

### Spis treści:
1. [Import bibliotek](#a0)
2. [Załadowanie danych i wstępna eksploracja](#a1)
3. [Podział na zbiór treningowy oraz testowy](#a2)
4. [Standaryzacja danych](#a3)
5. [Budowa modelu](#a4)
6. [Trenowanie sieci](#a5)





In [56]:

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

np.set_printoptions(precision=12, suppress=True, linewidth=150)
pd.options.display.float_format = '{:.6f}'.format
tf.__version__

'2.17.0'

### <a name='a1'></a> 2. Załadowanie danych i wstępna eksploracja

In [57]:
raw_dataset = pd.read_csv('https://storage.googleapis.com/esmartdata-courses-files/ann-course/housing.csv')
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [58]:
dataset = raw_dataset.copy()
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [59]:
dataset.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND
20639,-121.24,39.37,16.0,2785.0,616.0,1387.0,530.0,2.3886,89400.0,INLAND


In [60]:
dataset.isnull().sum() / len(dataset)

Unnamed: 0,0
longitude,0.0
latitude,0.0
housing_median_age,0.0
total_rooms,0.0
total_bedrooms,0.010029
population,0.0
households,0.0
median_income,0.0
median_house_value,0.0
ocean_proximity,0.0


In [61]:
dataset.dropna(inplace=True)

dataset.isnull().sum() / len(dataset)

Unnamed: 0,0
longitude,0.0
latitude,0.0
housing_median_age,0.0
total_rooms,0.0
total_bedrooms,0.0
population,0.0
households,0.0
median_income,0.0
median_house_value,0.0
ocean_proximity,0.0


In [62]:
dataset.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [63]:
dataset.describe(include=['object'])

Unnamed: 0,ocean_proximity
count,20433
unique,5
top,<1H OCEAN
freq,9034


In [64]:
dataset.ocean_proximity.value_counts()

Unnamed: 0_level_0,count
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,9034
INLAND,6496
NEAR OCEAN,2628
NEAR BAY,2270
ISLAND,5


In [65]:
px.histogram(dataset, x='median_house_value')

In [66]:
dataset.median_house_value.value_counts()

Unnamed: 0_level_0,count
median_house_value,Unnamed: 1_level_1
500001.000000,958
137500.000000,119
162500.000000,116
112500.000000,103
187500.000000,92
...,...
359200.000000,1
51200.000000,1
39800.000000,1
377600.000000,1


In [67]:
index_to_drop = dataset[dataset.median_house_value == 500001].index
dataset = dataset.drop(index=index_to_drop)
px.histogram(dataset, x='median_house_value')

In [68]:
# Teraz zajmiemy się typem Object: ISLAND, NEAR_BAY, NEAR_OCEAN

dataset_dummies = pd.get_dummies(dataset, drop_first=True).astype(int)
dataset_dummies.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122,37,41,880,129,322,126,8,452600,0,0,1,0
1,-122,37,21,7099,1106,2401,1138,8,358500,0,0,1,0
2,-122,37,52,1467,190,496,177,7,352100,0,0,1,0
3,-122,37,52,1274,235,558,219,5,341300,0,0,1,0
4,-122,37,52,1627,280,565,259,3,342200,0,0,1,0


The line dataset_dummies = pd.get_dummies(dataset, drop_first=True) performs one-hot encoding on the categorical features of the dataset and drops the first category for each feature.

Here's a breakdown:

    pd.get_dummies(): This function is part of the Pandas library and is used for one-hot encoding. One-hot encoding converts categorical features into numerical representations by creating new columns for each unique category.

    dataset: This is the input DataFrame containing the data with categorical features.

    drop_first=True: This argument drops the first category for each feature during one-hot encoding. This is done to avoid multicollinearity, which can cause issues in some machine learning models.


### <a name='a2'></a> 3. Podział na zbiór treningowy oraz testowy

In [69]:
train_dataset = dataset_dummies.sample(frac=0.8, random_state=0)
test_dataset = dataset_dummies.drop(train_dataset.index)

print(f'train_dataset length: {len(train_dataset)}')
print(f'test_dataset length: {len(test_dataset)}')

train_dataset length: 15580
test_dataset length: 3895


Okay, let's break down the line of code:

train_dataset = dataset_dummies.sample(frac=0.8, random_state=0)

Use code with caution

This line of code is creating a training dataset by randomly sampling 80% of the data from the dataset_dummies DataFrame.

Here's a breakdown of the components:

    train_dataset: This is the variable that will store the new training dataset.
    dataset_dummies.sample(): This is the function call that does the sampling.
        frac=0.8: This argument specifies that 80% of the data should be included in the sample.
        random_state=0: This argument sets a seed for the random number generator. This ensures that the same random sample will be generated ever

Okay, let's analyze the code:

test_dataset = dataset_dummies.drop(train_dataset.index)

Use code with caution

This line creates the test dataset by removing the rows used in the training dataset (train_dataset) from the original dataset (dataset_dummies).

Here's a breakdown:

    test_dataset: This is the variable that will store the new test dataset.
    dataset_dummies.drop(): This function is used to remove rows or columns from a Pandas DataFrame. In this case, it's removing rows.
    train_dataset.index: This part provides the indices (row labels) of the data points that were selected for the training dataset.


In [70]:
train_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
19234,-122,38,18,3364,501,1442,506,6,313000,0,0,0,0
1859,-124,41,21,2696,578,1208,494,2,122400,0,0,0,1
51,-122,37,43,1868,456,1061,407,1,93800,0,0,1,0
11192,-117,33,28,2444,555,1848,567,3,198800,0,0,0,0
20355,-118,34,16,1807,346,587,296,1,162500,0,0,0,0


In [71]:
test_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
10,-122,37,52,2202,434,910,402,3,281500,0,0,1,0
13,-122,37,52,696,191,345,174,2,191300,0,0,1,0
19,-122,37,52,1503,298,690,275,2,162900,0,0,1,0
26,-122,37,49,1130,244,607,239,2,93800,0,0,1,0
40,-122,37,52,1665,419,946,395,2,155400,0,0,1,0


In [72]:
px.scatter_matrix(train_dataset, dimensions=['median_house_value', 'housing_median_age', 'median_income', 'total_rooms'], color='median_house_value', height=700)

In [73]:
train_stats = train_dataset.describe()
train_stats.pop('median_house_value')
train_stats = train_stats.transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,15580.0,-119.132413,2.006918,-124.0,-121.0,-118.0,-117.0,-114.0
latitude,15580.0,35.118678,2.157684,32.0,33.0,34.0,37.0,41.0
housing_median_age,15580.0,28.489217,12.505895,1.0,18.0,29.0,37.0,52.0
total_rooms,15580.0,2620.014506,2195.654212,2.0,1441.0,2112.0,3119.0,39320.0
total_bedrooms,15580.0,539.900578,424.499548,2.0,299.0,436.0,647.0,6445.0
population,15580.0,1441.193068,1160.542775,3.0,801.0,1179.0,1746.0,35682.0
households,15580.0,501.099487,385.039835,2.0,283.0,411.0,605.0,6082.0
median_income,15580.0,3.194288,1.6037,0.0,2.0,3.0,4.0,15.0
ocean_proximity_INLAND,15580.0,0.333569,0.471503,0.0,0.0,0.0,1.0,1.0
ocean_proximity_ISLAND,15580.0,0.000193,0.013876,0.0,0.0,0.0,0.0,1.0


In [74]:
train_labels = train_dataset.pop('median_house_value')
test_labels = test_dataset.pop('median_house_value')

Ten fragment kodu jest częścią przygotowania danych do trenowania modelu uczenia maszynowego. W szczególności wykonuje on następujące operacje:

### Wyjaśnienie krok po kroku:

1. **`train_dataset.pop('median_house_value')` oraz `test_dataset.pop('median_house_value')`:**
   - W tej linii z zestawu danych (zarówno `train_dataset`, jak i `test_dataset`) usuwana jest kolumna o nazwie **`median_house_value`**.
   - Metoda `.pop()` usuwa kolumnę z ramki danych (DataFrame) i zwraca jej wartości, co oznacza, że po tej operacji dane w tej kolumnie zostaną przypisane do zmiennych `train_labels` oraz `test_labels`.

2. **Dlaczego usuwamy kolumnę?**
   - Kolumna **`median_house_value`** zawiera wartości docelowe, czyli etykiety, które chcemy przewidywać za pomocą modelu. W kontekście problemu przewidywania cen mieszkań, wartości te reprezentują medianę cen domów.
   - Ponieważ ta kolumna stanowi etykiety (target), jest usuwana z zestawu cech (danych wejściowych do modelu), a jej wartości są przechowywane osobno w zmiennych `train_labels` oraz `test_labels`, które będą wykorzystywane jako prawdziwe wartości do nauki modelu oraz do testowania jego skuteczności.

3. **Podział na zbiór treningowy i testowy:**
   - Dane zostały podzielone na zestaw treningowy (`train_dataset`) oraz testowy (`test_dataset`), co jest standardową praktyką przy uczeniu modeli. Zestaw treningowy służy do trenowania modelu, a zestaw testowy do oceny jego działania na nieznanych danych.

### Ostatecznie:

- `train_labels` i `test_labels` będą zawierać wartości median cen mieszkań, które są wartościami docelowymi, a `train_dataset` i `test_dataset` będą zawierać pozostałe kolumny, które będą używane jako cechy do nauki modelu.

### <a name='a3'></a> 4. Standaryzacja danych

In [75]:
def norm(x):
    return (x - train_stats['mean']) / train_stats['std']

Metoda `norm(x)` normalizuje dane, czyli przekształca je w taki sposób, aby miały średnią równą 0 i odchylenie standardowe równe 1. Normalizacja jest często stosowana w uczeniu maszynowym, aby dane wejściowe były porównywalne, co może pomóc w szybszym i bardziej efektywnym trenowaniu modelu.

### Wyjaśnienie krok po kroku:

1. **Argument `x`:**
   - To wejściowa wartość lub zbiór wartości (np. kolumna danych), które chcesz znormalizować.

2. **`train_stats['mean']` i `train_stats['std']`:**
   - Zakładamy, że wcześniej został obliczony słownik statystyk dla zbioru treningowego (`train_stats`).
   - `train_stats['mean']` zawiera średnią wartość dla każdej kolumny w zbiorze treningowym.
   - `train_stats['std']` zawiera odchylenie standardowe dla każdej kolumny w zbiorze treningowym.

3. **Obliczenie normalizacji:**
   - Normalizacja danych jest wykonywana według wzoru:
     \[
     \text{norm}(x) = \frac{x - \text{mean}}{\text{std}}
     \]
   - W tej metodzie, dla każdej wartości `x`, odejmowana jest średnia wartość danej cechy (kolumny) (`train_stats['mean']`), a następnie wynik dzielony przez odchylenie standardowe tej samej cechy (`train_stats['std']`).
   - Dzięki temu uzyskujemy znormalizowaną wartość, która ma średnią 0 i odchylenie standardowe 1.

### Dlaczego to jest ważne?
- **Skalowanie cech:** Modele uczenia maszynowego (np. sieci neuronowe, regresja liniowa) działają lepiej, gdy dane są znormalizowane. Dzięki temu model nie jest zdominowany przez cechy o większych wartościach (np. liczby mogą się różnić skalą: np. powierzchnia w m² i liczba pokoi).
- **Szybsza i bardziej stabilna konwergencja:** Normalizacja sprawia, że model może szybciej znaleźć optymalne rozwiązanie, ponieważ cechy są porównywalne w skali.

### Przykład:
Jeśli jedna z cech to powierzchnia domu, a średnia powierzchnia w zbiorze treningowym wynosi 150 m² z odchyleniem standardowym 50 m², to dla domu o powierzchni 200 m² normalizacja będzie wyglądać tak:
\[
\text{norm}(200) = \frac{200 - 150}{50} = 1
\]
Znormalizowana wartość wynosi 1.

In [76]:
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [77]:
normed_train_data.isnull().sum()

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,0
population,0
households,0
median_income,0
ocean_proximity_INLAND,0
ocean_proximity_ISLAND,0


In [78]:
normed_train_data.head(3)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
19234,-1.428851,1.335378,-0.838742,0.338845,-0.091639,0.000695,0.012727,1.749525,-0.707459,-0.013877,-0.346133,-0.37823
1859,-2.425404,2.725758,-0.598855,0.034607,0.089751,-0.200934,-0.018438,-0.744708,-0.707459,-0.013877,-0.346133,2.643727
51,-1.428851,0.871918,1.160315,-0.342501,-0.197646,-0.327599,-0.244389,-1.368266,-0.707459,-0.013877,2.888874,-0.37823


In [79]:
normed_test_data.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
10,-1.428851,0.871918,1.879976,-0.190383,-0.249472,-0.457711,-0.257375,-0.12115,-0.707459,-0.013877,2.888874,-0.37823
13,-1.428851,0.871918,1.879976,-0.876283,-0.82191,-0.944552,-0.849521,-0.744708,-0.707459,-0.013877,2.888874,-0.37823
19,-1.428851,0.871918,1.879976,-0.508739,-0.569849,-0.647277,-0.587211,-0.744708,-0.707459,-0.013877,2.888874,-0.37823


In [80]:
normed_test_data = normed_test_data.values
normed_train_data = normed_train_data.values
print(normed_test_data)
print(normed_train_data)

[[-1.428851116375  0.871917581847  1.879976070113 ... -0.013877293686  2.888874361871 -0.378229535826]
 [-1.428851116375  0.871917581847  1.879976070113 ... -0.013877293686  2.888874361871 -0.378229535826]
 [-1.428851116375  0.871917581847  1.879976070113 ... -0.013877293686  2.888874361871 -0.378229535826]
 ...
 [-0.930574589457  1.798837572647 -0.678817233469 ... -0.013877293686 -0.346133368881 -0.378229535826]
 [-0.930574589457  1.798837572647 -0.998666396417 ... -0.013877293686 -0.346133368881 -0.378229535826]
 [-0.930574589457  1.798837572647 -0.279005779784 ... -0.013877293686 -0.346133368881 -0.378229535826]]
[[-1.428851116375  1.335377577247 -0.838741814943 ... -0.013877293686 -0.346133368881 -0.378229535826]
 [-2.425404170211  2.725757563448 -0.598854942732 ... -0.013877293686 -0.346133368881  2.64372747349 ]
 [-1.428851116375  0.871917581847  1.16031545348  ... -0.013877293686  2.888874361871 -0.378229535826]
 ...
 [ 0.564254991298 -0.981922399754 -0.279005779784 ... -0.01387

### <a name='a4'></a> 5. Budowa modelu

In [81]:
def build_model():
    model = Sequential()
    model.add(Dense(1024, kernel_regularizer='l2', activation='relu', input_shape=[len(train_dataset.keys())]))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1))

    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['mae', 'mse'])
    return model

In [82]:
model = build_model()
model.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



### <a name='a5'></a> 6. Trenowanie sieci

In [83]:
history = model.fit(
    normed_train_data,
    train_labels.values,
    # epochs=150,
    epochs = 20,
    validation_split=0.2,
    verbose=1, batch_size=32)

Epoch 1/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - loss: 31056889856.0000 - mae: 143150.3125 - mse: 31056889856.0000 - val_loss: 4872975360.0000 - val_mae: 50581.4102 - val_mse: 4872975360.0000
Epoch 2/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - loss: 4590851072.0000 - mae: 49055.9336 - mse: 4590851072.0000 - val_loss: 4032856832.0000 - val_mae: 47224.0469 - val_mse: 4032856832.0000
Epoch 3/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - loss: 4042124800.0000 - mae: 45990.5430 - mse: 4042124800.0000 - val_loss: 3832527360.0000 - val_mae: 45709.1367 - val_mse: 3832527360.0000
Epoch 4/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - loss: 3919965696.0000 - mae: 45327.0859 - mse: 3919965696.0000 - val_loss: 3752738048.0000 - val_mae: 45469.3164 - val_mse: 3752738048.0000
Epoch 5/20
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 27m

In [84]:
def plot_hist(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    hist['rmse'] = np.sqrt(hist['mse'])
    hist['val_rmse'] = np.sqrt(hist['val_mse'])

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['mae'], name='mae', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_mae'], name='val_mae', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='MAE vs. VAL_MAE', xaxis_title='Epoki', yaxis_title='Mean Absolute Error', yaxis_type='log')
    fig.show()

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['rmse'], name='rmse', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_rmse'], name='val_rmse', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='RMSE vs. VAL_RMSE', xaxis_title='Epoki', yaxis_title='Root Mean Squared Error', yaxis_type='log')
    fig.show()

plot_hist(history)

In [85]:
for name, value in zip(model.metrics_names, model.evaluate(normed_test_data, test_labels.values)):
    print(f'{name:8}{value:.4f}')

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3270459904.0000 - mae: 41958.6328 - mse: 3270459904.0000
loss    3406912000.0000
compile_metrics42444.1484


In [86]:
test_predictions = model.predict(normed_test_data).flatten()
test_predictions

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


array([250454.67 , 195982.55 , 195793.55 , ...,  97583.1  , 126955.055,  75452.02 ], dtype=float32)

In [87]:
pred = pd.DataFrame(test_labels)
pred['predictions'] = test_predictions
pred.head()

Unnamed: 0,median_house_value,predictions
10,281500,250454.671875
13,191300,195982.546875
19,162900,195793.546875
26,93800,183636.4375
40,155400,201267.1875


In [88]:
fig = px.scatter(pred, 'median_house_value', 'predictions')
fig.add_trace(go.Scatter(x=[0, 500000], y=[0, 500000], mode='lines'))
fig.show()

In [89]:
pred.head()

Unnamed: 0,median_house_value,predictions
10,281500,250454.671875
13,191300,195982.546875
19,162900,195793.546875
26,93800,183636.4375
40,155400,201267.1875


In [90]:
pred['error'] = pred['median_house_value'] - pred['predictions']
pred.head()

Unnamed: 0,median_house_value,predictions,error
10,281500,250454.671875,31045.328125
13,191300,195982.546875,-4682.546875
19,162900,195793.546875,-32893.546875
26,93800,183636.4375,-89836.4375
40,155400,201267.1875,-45867.1875


In [91]:
px.histogram(pred, 'error', marginal='rug', width=1000)