# Problem regresji - prognoza ceny mieszkań
Celem tego notebooka jest zapoznanie się z problemami regresji. W odróżnieniu od problemów klasyfikacji, w których staramy się przyporządkować obserwację do jednej z klas, w modelach regresji przewidujemy ciągłą zmienną docelową.

# 1. Import bibliotek

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Print NumPy arrays with 12 digits of precision, without scientific notation
# for small values, and wrap lines at 150 characters.
np.set_printoptions(precision=12, suppress=True, linewidth=150)
# Render floats in pandas output with six decimal places.
pd.options.display.float_format = '{:.6f}'.format
# Last expression of the cell: displays the installed TensorFlow version.
tf.__version__

# 2. Załadowanie danych i wstępna eksploracja

In [None]:
# Load the housing data set from the remote CSV file and print a summary of
# its columns (dtypes and non-null counts).
raw_dataset = pd.read_csv('https://storage.googleapis.com/esmartdata-courses-files/ann-course/housing.csv')
raw_dataset.info()

In [None]:
# Work on a copy so that later cleaning steps leave raw_dataset untouched.
dataset = raw_dataset.copy()
# Preview the first rows.
dataset.head()

In [None]:
# Fraction of missing values in each column.
dataset.isna().sum().div(len(dataset))

In [None]:
# Remove, in place, every row that contains at least one missing value.
dataset.dropna(axis=0, how='any', inplace=True)

# Re-check the per-column fraction of missing values after the clean-up.
dataset.isna().sum().div(len(dataset))

In [None]:
# Summary statistics of the numeric columns.
dataset.describe()

In [None]:
# Summary of the object-dtype (text) columns only.
dataset.describe(include=['object'])

In [None]:
# Number of rows per ocean_proximity category.
dataset.ocean_proximity.value_counts()

In [None]:
# Distribution of median_house_value (used later as the regression target).
px.histogram(dataset, x='median_house_value')

In [None]:
# How often each distinct median_house_value occurs, most frequent first.
dataset.median_house_value.value_counts()

In [None]:
# Discard the rows whose target equals exactly 500001.
# NOTE(review): presumably this is the value at which prices were capped in
# the source data (it dominates the value counts) - confirm.
index_to_drop = dataset.index[(dataset['median_house_value'] == 500001).to_numpy()]
dataset = dataset.drop(index=index_to_drop)
# Plot the target distribution again, now without those rows.
px.histogram(dataset, x='median_house_value')

In [None]:
# One-hot encode the non-numeric columns as float indicator columns.
# drop_first=True omits the first level of each encoded column, which is
# implied by the remaining indicators.
dataset_dummies = pd.get_dummies(dataset, drop_first=True, dtype=float)
dataset_dummies.head()

# 3. Podział na zbiór treningowy oraz testowy

In [None]:
# Hold-out split: a reproducible (random_state=0) random sample of 80% of the
# rows forms the training set ...
train_dataset = dataset_dummies.sample(frac=0.8, random_state=0)
# ... and every row whose index label was not sampled forms the test set.
test_dataset = dataset_dummies.drop(index=train_dataset.index)

print(f'train_dataset length: {len(train_dataset)}')
print(f'test_dataset length: {len(test_dataset)}')

In [None]:
# Preview the first rows of the training split.
train_dataset.head()

In [None]:
# Preview the first rows of the test split.
test_dataset.head()

In [None]:
# Pairwise scatter plots of the target and three selected features on the
# training split; points are colored by the target value.
px.scatter_matrix(train_dataset, dimensions=['median_house_value', 'housing_median_age', 'median_income', 'total_rooms'], color='median_house_value', height=700)


In [None]:
# Per-feature summary statistics of the training split: one row per feature,
# one column per statistic (mean, std, ...). The target column is excluded
# because only the input features get standardized.
train_stats = (
    train_dataset.describe()
    .drop(columns=['median_house_value'])
    .transpose()
)
train_stats

In [None]:
# Separate the target from the features. pop() returns the column AND removes
# it from the frame in place, so after this cell train_dataset/test_dataset
# hold features only (re-running the cell raises KeyError).
train_labels = train_dataset.pop('median_house_value')
test_labels = test_dataset.pop('median_house_value')

# 4. Standaryzacja danych

In [None]:
def norm(x, stats=None):
    """Standardize features: subtract the mean and divide by the std.

    Args:
        x: DataFrame whose column labels match the index of ``stats``.
        stats: Optional table with ``'mean'`` and ``'std'`` columns indexed
            by feature name. Defaults to the module-level ``train_stats``,
            so every split is scaled with training-set statistics.

    Returns:
        ``(x - mean) / std``, aligned on feature name. A feature with zero
        std is only centered (its values become 0.0).
    """
    if stats is None:
        stats = train_stats
    # A feature that is constant in the training split has std == 0; dividing
    # by it would fill the column with NaN/inf and poison the training, so
    # such features are divided by 1 instead.
    std = stats['std'].replace(0, 1.0)
    return (x - stats['mean']) / std

In [None]:
# Standardize both splits with norm(), i.e. with the statistics stored in
# train_stats, so the test data is scaled exactly like the training data.
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [None]:
# Sanity check: count NaN values per column after standardization.
normed_train_data.isnull().sum()

In [None]:
# Preview the first three standardized training rows.
normed_train_data.head(3)

In [None]:
# Preview the first three standardized test rows.
normed_test_data.head(3)

In [None]:
# Replace the standardized DataFrames with their underlying NumPy arrays,
# the form in which they are passed to the model below.
normed_train_data = normed_train_data.to_numpy()
normed_test_data = normed_test_data.to_numpy()

# 5. Budowa modelu

In [None]:
def build_model():
    """Create and compile the regression network.

    Architecture: three ReLU hidden layers (1024, 512, 128 units; the first
    one L2-regularized) followed by a single linear output unit. The input
    width is the number of columns in the module-level ``train_dataset``.

    Returns:
        A compiled Sequential model (Adam optimizer, MSE loss, MAE and MSE
        reported as metrics).
    """
    n_features = len(train_dataset.keys())
    layers = [
        Dense(1024, kernel_regularizer='l2', activation='relu', input_shape=[n_features]),
        Dense(512, activation='relu'),
        Dense(128, activation='relu'),
        # No activation on the output: the network predicts a real value.
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])
    return network

In [None]:
# Instantiate the network and print its layer / parameter summary.
model = build_model()
model.summary()

# 6. Trenowanie sieci

In [None]:
# Train for 150 epochs in mini-batches of 32 samples; 20% of the training
# data is set aside for validation. The returned History object keeps the
# per-epoch loss and metric values.
history = model.fit(
    x=normed_train_data,
    y=train_labels.values,
    batch_size=32,
    epochs=150,
    validation_split=0.2,
    verbose=1,
)

In [None]:
def plot_hist(history):
    """Plot training vs. validation curves (MAE and RMSE) per epoch.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain ``mae``, ``val_mae``, ``mse`` and ``val_mse``, and its
            ``epoch`` attribute supplies the x axis.

    Shows two figures, both with a logarithmic y axis: MAE first, then RMSE
    (the square root of the recorded MSE values).
    """
    frame = pd.DataFrame(history.history)
    frame['epoch'] = history.epoch
    frame['rmse'] = np.sqrt(frame['mse'])
    frame['val_rmse'] = np.sqrt(frame['val_mse'])

    # (training curve, validation curve, figure title, y-axis label)
    panels = (
        ('mae', 'val_mae', 'MAE vs. VAL_MAE', 'Mean Absolute Error'),
        ('rmse', 'val_rmse', 'RMSE vs. VAL_RMSE', 'Root Mean Squared Error'),
    )
    for train_key, val_key, title, y_label in panels:
        fig = go.Figure()
        for key in (train_key, val_key):
            fig.add_trace(go.Scatter(x=frame['epoch'], y=frame[key], name=key, mode='markers+lines'))
        fig.update_layout(width=1000, height=500, title=title, xaxis_title='Epoki', yaxis_title=y_label, yaxis_type='log')
        fig.show()

plot_hist(history)

In [None]:
# Evaluate on the held-out test set and print every reported value.
# return_dict=True makes evaluate() pair each value with its own name.
# Zipping model.metrics_names with the list form can mislabel or silently
# drop values on Keras versions where metrics_names does not list each
# compiled metric separately.
test_metrics = model.evaluate(normed_test_data, test_labels.values, return_dict=True)
for name, value in test_metrics.items():
    print(f'{name:8}{value:.4f}')

In [None]:
# Predict on the test set. The output is flattened to a 1-D array
# (presumably predict() returns shape (n, 1) given the single-unit output
# layer - confirm) so it can be stored as a DataFrame column.
test_predictions = model.predict(normed_test_data).flatten()
test_predictions

In [None]:
# Put the true test targets and the model outputs side by side. The
# predictions are attached by position, which matches because
# test_predictions was computed from the test rows in the same order.
pred = test_labels.to_frame().assign(predictions=test_predictions)
pred.head()

In [None]:
# Predicted vs. actual value for every test row.
fig = px.scatter(pred, x='median_house_value', y='predictions')
# Reference diagonal y = x from 0 to 500000: points on this line are
# perfect predictions.
fig.add_trace(
    go.Scatter(
        x=[0, 500000],
        y=[0, 500000],
        mode='lines',
    )
)
fig.show()

In [None]:
# Preview actual vs. predicted values.
pred.head()

In [None]:
# Signed residual: actual minus predicted, so a positive error means the
# model under-estimated the value.
pred['error'] = pred['median_house_value'].sub(pred['predictions'])
pred.head()