In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotnine as pln
import pickle as pkl
import glob
import pathlib

from scipy import stats

from sklearn.preprocessing import OneHotEncoder

random_state = np.random.RandomState(37676373)

# Import the file and preprocess it

In [None]:
# Load the raw table: `date_in` is parsed as a datetime column and `agency_rating` is forced to int64,
# so that the rating is handled as a plain numeric feature.
data = pd.read_csv('./ML case/data.csv', parse_dates=['date_in'], dtype={'agency_rating': np.int64})

`agency_rating`, the only ordinal variable, will be treated as a numeric one for the sake of this analysis.

In [None]:
data.shape

In [None]:
data.info()

In [None]:
# Split the check-in date into its calendar components, one new column per component.
for _date_part in ('year', 'month', 'day', 'weekday'):
    data[_date_part] = getattr(data['date_in'].dt, _date_part)

In [None]:
data['house_pk'].value_counts().plot(kind='hist')

In [None]:
data[['agency_id', 'agency_rating']].pivot_table(index='agency_id', columns='agency_rating', aggfunc=len)

In [None]:
# we want to preserve agency_id for now
# `get_dummies` replaces agency_id with indicator columns (drop_first=True omits the first level),
# so the original column is put back in front of the encoded frame.
data = pd.concat([data['agency_id'], pd.get_dummies(data, columns=['agency_id'], drop_first=True)], axis=1, sort=False)

# Exploratory data analysis (EDA)

First, we will discover the relationship between the target variable and the features.

### Plot y's (price's) distribution

In [None]:
sns.distplot(data['price'])

As the distribution of price is highly non-normal, it is more reasonable to work with log-transformed (log10) data.

In [None]:
# NOTE: `price` is overwritten in place, so running this cell a second time applies log10 twice.
# Restart the kernel and run all cells whenever the original prices are needed again.
data['price'] = np.log10(data['price'])

In [None]:
sns.distplot(data['price'])

### Plot mean price over time and add some context to it with all of the observations on one plot

In [None]:
def _shaded_periods(bounds):
    """Turn (start, finish) date-string pairs into a frame with datetime bounds and unbounded y-limits."""
    periods = pd.DataFrame(bounds, columns=['start', 'finish'])
    for edge in ('start', 'finish'):
        periods[edge] = pd.to_datetime(periods[edge])
    # -inf / +inf y-limits, one pair per period
    return periods.assign(ymin=-np.inf, ymax=np.inf)


# July-August of each year
holidays = _shaded_periods([('2016-07-01', '2016-08-31'),
                            ('2017-07-01', '2017-08-31'),
                            ('2018-07-01', '2018-08-31')])

# 20 December - 15 January of each winter
christmas = _shaded_periods([('2016-12-20', '2017-01-15'),
                             ('2017-12-20', '2018-01-15'),
                             ('2018-12-20', '2019-01-15')])

In [None]:
# Thin gray lines: one price series per house_pk; thick line: mean price per `date_in`;
# green rectangles: the `holidays` periods; red rectangles: the `christmas` periods.
# `price` has already been log10-transformed at this point.
(pln.ggplot(pln.aes('date_in', 'price', group='factor(house_pk)'), data=data)
 + pln.geom_line(color='gray', alpha=0.3, size=0.5)
 + pln.geom_line(pln.aes('date_in', 'price'), data.groupby('date_in')['price'].agg('mean').to_frame().reset_index(drop=False), size=3, inherit_aes=False)
 + pln.geom_rect(pln.aes(xmin='start', xmax='finish', ymin='ymin', ymax='ymax'), holidays, fill='green', alpha=0.25, inherit_aes=False)
 + pln.geom_rect(pln.aes(xmin='start', xmax='finish', ymin='ymin', ymax='ymax'), christmas, fill='red', alpha=0.25, inherit_aes=False)
 + pln.labels.xlab('Date')
 + pln.labels.ylab('Price')
 + pln.theme_bw()
 + pln.theme(axis_text_x=pln.element_text(rotation=90, hjust=0.5)))

### Categorical variables

It pays off to plot the distribution with respect to the year even if it cannot be used as an explanatory variable.

In [None]:
cat_variables = ['agency_id', 'apartment', 'indoor_pool', 'spa', 'internet', 'pets_allowed', 'water_view', 'fire_stove', 'agency_rating', 'year', 
                 'month', 'weekday']

In [None]:
# Grid of box plots, one panel per categorical variable, with a one-way ANOVA p-value in each title.
n_cols = 2
n_rows = -(-len(cat_variables) // n_cols)  # ceil division
# squeeze=False keeps `ax` two-dimensional even when one row is enough
fig, ax = plt.subplots(ncols=n_cols, nrows=n_rows, sharey=True, squeeze=False, figsize=(18, 50))

for i, var in enumerate(cat_variables):
    var_name = var.upper().replace('_', ' ')
    panel = ax.flat[i]  # row-major position, independent of the number of columns
    sns.boxplot(data=data, x=var, y='price', ax=panel)
    # please mind that the data has been already log-transformed
    # One array of prices per level of `var`. The former `groupby(var, squeeze=True)` call is rejected
    # by current pandas, which no longer accepts the `squeeze` argument.
    price_groups = [prices.values for _, prices in data.groupby(var)['price']]
    _, p = stats.f_oneway(*price_groups)
    panel.set_title(f'{var_name}, one-way ANOVA p-value: {np.round(p, 4)}')
    panel.set_xlabel(var_name)
    panel.set_ylabel('PRICE')

# remove unused axes (every spare panel, not only the last one)
for spare_panel in ax.flat[len(cat_variables):]:
    spare_panel.set_visible(False)

### Continuous variables

In [None]:
con_variables = ['dis_water_real', 'dis_shopping', 'no_bedrooms', 'max_persons', 'house_size', 'land_size', 'build_year', 'renovation_year']

Apparently, `no_bedrooms` could be treated as a categorical variable due to the low number of values it takes.

In [None]:
# Grid of scatter plots, one panel per continuous variable, with the Pearson correlation against the
# (log10) price and the p-value of the non-correlation test in each title.
n_cols = 2
n_rows = -(-len(con_variables) // n_cols)  # ceil division

# squeeze=False keeps `ax` two-dimensional even when one row is enough
fig, ax = plt.subplots(ncols=n_cols, nrows=n_rows, sharey=True, squeeze=False, figsize=(18, 30))

for i, var in enumerate(con_variables):
    var_name = var.upper().replace('_', ' ')
    panel = ax.flat[i]  # row-major position, independent of the number of columns
    sns.scatterplot(data=data, x=var, y='price', ax=panel)
    corr, p_value = stats.pearsonr(data['price'], data[var])
    panel.set_title(f'{var_name}, correlation: {np.round(corr, 5)}, non-correlation test p-value: {np.round(p_value, 5)}')
    panel.set_xlabel(var_name)
    panel.set_ylabel('PRICE')

# remove unused axes (every spare panel, not only the last one)
for spare_panel in ax.flat[len(con_variables):]:
    spare_panel.set_visible(False)

### Correlation between features

In [None]:
all_variables = cat_variables + con_variables

In [None]:
palette = sns.xkcd_palette(colors=["windows blue", "amber"])
col_mapping = [palette[0] if var in cat_variables else palette[1] for var in all_variables]

In [None]:
# legend for the rectangles on the plot
sns.palplot(palette)
for i, val in enumerate(['categorical', 'continuous']):
    plt.text(i, 0, val, horizontalalignment='center', verticalalignment='center', rotation=45)
plt.show()

In [None]:
# Pearson correlation matrix of all categorical and continuous variables, with rows and columns
# clustered using Ward linkage; the colour bars mark each variable as categorical or continuous
# according to `col_mapping`.
sns.clustermap(data[all_variables].corr(method='pearson'), method='ward', figsize=(15, 15), row_cluster=True, col_cluster=True, col_colors=col_mapping,
               row_colors=col_mapping)

# Regression model, solely on tabular data, to predict price

In [None]:
# Model inputs: the continuous variables, the categorical variables except `weekday`, plus `day`.
# `agency_id` enters only through its dummy columns; the names below are hard-coded, so they have to
# match the columns produced by the one-hot encoding step.
X_features = ['dis_water_real', 'dis_shopping', 'no_bedrooms', 'max_persons', 'house_size', 'land_size', 'build_year', 'renovation_year', 'apartment', 
              'indoor_pool', 'spa', 'internet', 'pets_allowed', 'water_view', 'fire_stove', 'agency_rating', 'year', 'month', 'day', 'agency_id_121', 
              'agency_id_130', 'agency_id_160']

### Prepare validation set

The validation set consists of 15% of the observations for a given house_pk within a given year and month.

In [None]:
# Sample 15% of the rows of every (house_pk, year, month) group and renumber the result from 0.
# `random_state` is a shared RandomState object whose state advances with every draw, so re-running
# this cell without re-creating `random_state` yields a different sample.
# NOTE(review): groups so small that 15% rounds to zero rows contribute nothing — confirm this is intended.
validation_set = data.groupby(['house_pk', 'year', 'month']).apply(lambda gr: gr.sample(frac=0.15, random_state=random_state)).reset_index(drop=True)

In [None]:
# Anti-join: keep the rows of `data` that have no identical row in `validation_set`.
# No `on=` is given, so the merge uses every column shared by both frames as the key;
# `indicator=True` adds the `_merge` column that marks rows found only on the left side.
training_set = pd.merge(data, validation_set, indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1)

In [None]:
assert validation_set.shape[0] > 0 and training_set.shape[0] > 0 and validation_set.shape[0] + training_set.shape[0] == data.shape[0]

### Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Base forest: 250 trees, sqrt(n_features) candidate features per split, 3 worker threads.
# NOTE(review): newer scikit-learn releases call this criterion 'squared_error' — confirm against
# the installed version before re-fitting.
rf_model = RandomForestRegressor(n_estimators=250, criterion='mse', max_features='sqrt', n_jobs=3, random_state=random_state)

rf_grid = {'max_depth': [20, 22, 25, 27, 30]}  # the grid was chosen after some initial trials

# Exhaustive search over `max_depth`, scored by negative MSE on 5-fold CV repeated 3 times.
# The same `random_state` object is shared by the forest and the CV splitter.
rf_cv = GridSearchCV(rf_model, rf_grid, scoring='neg_mean_squared_error', 
                     cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state), 
                     n_jobs=2, return_train_score=False)

Training the model takes a considerable amount of time, so the fitted model has been saved and is ready to be re-used.

In [None]:
# rf_cv.fit(training_set[X_features], training_set['price'])

In [None]:
# pkl.dump(rf_cv, open('rf_model.pkl', 'wb'))

In [None]:
# Load the fitted grid search from disk; the context manager closes the file handle afterwards.
# NOTE: unpickling executes code stored in the file — only load pickles created by this notebook.
with open('rf_model.pkl', 'rb') as model_file:
    rf_cv = pkl.load(model_file)

In [None]:
rf_cv.best_params_

In [None]:
rf_predictions = rf_cv.predict(validation_set[X_features])

### LightGBM

In [None]:
import lightgbm as lgbm
from skopt import BayesSearchCV
from skopt.space import Real, Integer

In [None]:
# `subsample_freq=1` switches bagging on: LightGBM ignores `subsample` while the bagging frequency
# is 0 (its default), which would make the `subsample` dimension of the search a no-op.
lgbm_model = lgbm.LGBMRegressor(n_jobs=2, num_leaves=2**10, subsample_freq=1, random_state=37676373, importance_type='gain', )

# Search space for the Bayesian optimisation.
# NOTE: a search pickled before this grid was changed still reports the old parameter names; re-fit to apply it.
lgbm_grid = {'n_estimators': Integer(50, 500),
             'learning_rate': Real(0.01, 0.35, 'uniform'),
             'reg_alpha': Real(1e-6, 1, 'log-uniform'), 
             'reg_lambda': Real(1e-6, 1, 'log-uniform'), 
             # LightGBM has no `gamma` parameter (that is the XGBoost name); the minimum loss reduction
             # required to make a split is called `min_split_gain` here
             'min_split_gain': Real(0, 1, 'uniform'), 
             'subsample': Real(0.5, 1, 'uniform'), 
             'colsample_bytree': Real(0.5, 1, 'uniform'),
             'max_depth': Integer(10, 30)
            }

# 25 sequential evaluations, each scored by negative MSE on 5-fold CV repeated 3 times;
# the best configuration is re-fitted on the full training data (refit=True).
lgbm_cv = BayesSearchCV(lgbm_model, lgbm_grid, n_iter=25, scoring='neg_mean_squared_error', refit=True, n_points=1,
                        cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state),
                        n_jobs=3, random_state=random_state)

In [None]:
# lgbm_cv.fit(training_set[X_features], training_set['price'])

In [None]:
# pkl.dump(lgbm_cv, open('lgbm_model.pkl', 'wb'))

In [None]:
# Load the fitted Bayesian search from disk; the context manager closes the file handle afterwards.
# NOTE: unpickling executes code stored in the file — only load pickles created by this notebook.
with open('lgbm_model.pkl', 'rb') as model_file:
    lgbm_cv = pkl.load(model_file)

In [None]:
lgbm_cv.best_params_

In [None]:
lgbm_predictions = lgbm_cv.predict(validation_set[X_features])

### Compare both models side-by-side

In [None]:
# Predicted vs. real (log10) price on the validation set, one panel per model.
# `sharex` / `sharey` are passed by keyword: current matplotlib only accepts them as keyword arguments.
fig, ax = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True, figsize=(20, 7))
# the two lines below are for common labels for both plots
fig.add_subplot(111, frameon=False)
plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)

rf_mse = np.round(mean_squared_error(validation_set["price"], rf_predictions), 5)
ax[0].plot(validation_set['price'], rf_predictions, color='green', marker='x', linewidth=0)
ax[0].set_title(f'Random forest predictions - MSE: {rf_mse}',
                fontdict={'fontsize': 20, 'fontweight': 'bold'})

lgbm_mse = np.round(mean_squared_error(validation_set["price"], lgbm_predictions), 5)
ax[1].plot(validation_set['price'], lgbm_predictions, color='blue', marker='x', linewidth=0)
ax[1].set_title(f'LightGBM predictions - MSE: {lgbm_mse}',
                fontdict={'fontsize': 20, 'fontweight': 'bold'})

# these labels land on the invisible full-figure axes added above
plt.xlabel('log10(Real price)')
plt.ylabel('log10(Predicted price)')
plt.tight_layout()

plt.show()

### Plot feature importance

In [None]:
# Bar chart of the gain-based importances of the refitted LightGBM model, largest first.
(
    pd.DataFrame({'importance': lgbm_cv.best_estimator_.feature_importances_, 'features': X_features})
    .sort_values('importance', ascending=False)
    .plot(x='features', y='importance', kind='bar', figsize=(12, 9), title='Feature importance for LGBM model')
)

In [None]:
data['spa'].value_counts()

# Let's add images

In [None]:
from wand.image import Image as w_Image
from wand.display import display
import matplotlib.image as matimg

In [None]:
complete_imgs_folder = pathlib.Path('./ML case/aerial_photos/complete_photos')

### Brief look at some images

In [None]:
# some pictures are identical
sample_of_house_pk = ['7602', '7603', '7604', '7605', '27735', '27742', '27743']
# file name of the mosaic picture of each sampled house
sample_pictures = [f'{house_pk}.png' for house_pk in sample_of_house_pk]

In [None]:
# Show the sampled mosaics on a grid with `ncols` columns, titled with the house id (file stem).
ncols = 2
# `add_subplot` needs integer grid dimensions; np.ceil returns a float
nrows = int(np.ceil(len(sample_pictures) / float(ncols)))
fig = plt.figure(figsize=(20, 30))

for i, pic in enumerate(sample_pictures):
    img_path = complete_imgs_folder / pic
    a = fig.add_subplot(nrows, ncols, i + 1)
    a.imshow(matimg.imread(str(img_path)))
    a.set_title(img_path.stem)

In [None]:
# Price over time for the sampled houses only.
# `sample_of_house_pk` holds strings, so the comparison is made on the string form of house_pk:
# `isin` never matches a string against a numeric key, which would leave nothing to plot.
(pln.ggplot(pln.aes('date_in', 'price', group='factor(house_pk)'),
            data=data.loc[data['house_pk'].astype(str).isin(sample_of_house_pk)])
 + pln.geom_line(pln.aes(color='factor(house_pk)'), alpha=0.8, size=1)
 + pln.labels.xlab('Date')
 + pln.labels.ylab('Price')
 + pln.labels.labs(color='House_pk')
 + pln.theme_bw()
 + pln.theme(axis_text_x=pln.element_text(rotation=90, hjust=0.5)))

### Crop two parts from a mosaic of images

In [None]:
# The two crops are written to sibling folders of the folder holding the complete mosaics.
# Create them up front so that saving does not depend on the folders already existing.
aerial_dir = complete_imgs_folder.parent / 'aerial'
plans_dir = complete_imgs_folder.parent / 'plans'
for out_dir in (aerial_dir, plans_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

for img_path in glob.glob(str(complete_imgs_folder / '*.png')):
    img_path = pathlib.Path(img_path)
    # the path is handed over as a plain string, exactly like in the `save` calls below
    with w_Image(filename=str(img_path)) as img:
        # crop the complete image so that only top left image is retained
        with img.clone() as aerial_part:
            aerial_part.crop(width=235, height=290)
            aerial_part.trim(fuzz=0)  # remove white boundary around the image
            aerial_part.save(filename=str(aerial_dir / img_path.name))

        # as above, but retain only the plan (Google Map view)
        with img.clone() as plan_part:
            plan_part.crop(left=240, width=320, height=300)
            plan_part.trim(fuzz=0)
            plan_part.save(filename=str(plans_dir / img_path.name))

### Embed the images

In [None]:
import torch
import torch.nn as nn
import torchvision.models as torch_models
import torchvision.transforms as torch_transforms
from torch.autograd import Variable

from PIL import Image

In [None]:
model = torch_models.resnext50_32x4d(pretrained=True)

In [None]:
embed_layer = model._modules.get('fc')

In [None]:
%%capture
model.eval()

In [None]:
# transformations necessary for torchvision models
normalize = torch_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
to_tensor = torch_transforms.ToTensor()

In [None]:
def get_vector(image_name):
    """
    Return the activations fed into the network's last (`fc`) layer for one image.

    Source of the function: https://becominghuman.ai/extract-a-feature-vector-for-any-image-with-pytorch-9717561d1d4c
    Modifications: mine

    Parameters
    ----------
    image_name : str or path-like
        Image file to embed; it is opened with PIL and converted to RGB.

    Returns
    -------
    torch.Tensor
        1-D tensor with 2048 elements: the input received by `embed_layer` during the forward pass.
    """
    # the context manager closes the file; `convert` returns an independent in-memory copy
    with Image.open(image_name) as raw_img:
        img = raw_img.convert('RGB')
    t_img = normalize(to_tensor(img)).unsqueeze(0)  # add the batch dimension
    my_embedding = torch.zeros(2048)

    def copy_data(m, i, o):
        """Let's capture the input that is passed to the last layer of the neural network."""
        my_embedding.copy_(i[0].squeeze().data)

    h = embed_layer.register_forward_hook(copy_data)
    try:
        # inference only: do not build the autograd graph
        with torch.no_grad():
            model(t_img)
    finally:
        # detach the hook even if the forward pass fails, so it cannot fire on later calls
        h.remove()

    return my_embedding

In [None]:
# Embed every cropped aerial photo and store the result as one row per picture:
# the index is the file stem, the columns are X_0 ... X_{n-1}.
images_paths = list(map(pathlib.Path, glob.glob(str(complete_imgs_folder.parent / 'aerial' / '*.png'))))

aerial_embeddings = np.stack([get_vector(img_path) for img_path in images_paths], axis=0)
aerial_embeddings = pd.DataFrame(
    aerial_embeddings,
    index=[path.stem for path in images_paths],
    columns=[f'X_{i}' for i in range(aerial_embeddings.shape[1])],
)
aerial_embeddings.to_csv('aerial_embeddings.csv', index=True)

In [None]:
# Embed every cropped plan (map view) and store the result as one row per picture:
# the index is the file stem, the columns are X_0 ... X_{n-1}.
images_paths = list(map(pathlib.Path, glob.glob(str(complete_imgs_folder.parent / 'plans' / '*.png'))))

plans_embeddings = np.stack([get_vector(img_path) for img_path in images_paths], axis=0)
plans_embeddings = pd.DataFrame(
    plans_embeddings,
    index=[path.stem for path in images_paths],
    columns=[f'X_{i}' for i in range(plans_embeddings.shape[1])],
)
plans_embeddings.to_csv('plans_embeddings.csv', index=True)

### Plot the embeddings

In [None]:
import umap

In [None]:
aerial_embeddings = pd.read_csv('aerial_embeddings.csv', index_col=0).assign(type_='aerial')
plans_embeddings = pd.read_csv('plans_embeddings.csv', index_col=0).assign(type_='plan')

In [None]:
all_embeddings = pd.concat([aerial_embeddings, plans_embeddings])

In [None]:
all_embeddings.shape

In [None]:
umap_dim_reduction = umap.UMAP(metric='cosine', n_neighbors=20, min_dist=0.25, random_state=random_state)
umap_data = umap_dim_reduction.fit_transform(all_embeddings.loc[:, all_embeddings.columns.str.startswith('X')].values)

In [None]:
# Wrap the 2-D UMAP coordinates in a frame that reuses the index of `all_embeddings`, attach the
# embedding type, and move the index into a regular `index` column (used as the text label in the plot).
# NOTE(review): `all_embeddings` stacks the aerial and plan frames, so index labels are presumably
# repeated; the `assign` relies on both indexes being identical — confirm if the frames are ever reordered.
umap_data = pd.DataFrame(umap_data, index=all_embeddings.index, columns=['UMAP_1', 'UMAP_2']).assign(type_=all_embeddings['type_']).reset_index()

In [None]:
(pln.ggplot(pln.aes(x='UMAP_1', y='UMAP_2', label='index', color='type_'), data=umap_data)
  + pln.geom_text()
  + pln.theme_bw()
  + pln.theme(figure_size=(12, 8))
  + pln.ggtitle('UMAP mapping of the two embedding types.')
  + pln.labs(color='Type')
#  + pln.xlim((5, 7.5))
)

### Create models exclusively on the embeddings

In [None]:
# NOTE: this rebinds `rf_grid` and `rf_cv`; the grid search loaded from 'rf_model.pkl' is no longer
# reachable under the name `rf_cv` after this cell.
rf_grid = {'max_depth': [8, 10, 12, 15]}  # change the grid from the last time - we do not have that many observations, so shallower trees are preferred

# Same base forest (`rf_model`) and CV scheme as before, searched over the shallower depths.
rf_cv = GridSearchCV(rf_model, rf_grid, scoring='neg_mean_squared_error', 
                     cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state), 
                     n_jobs=2, return_train_score=False)

In [None]:
# Fit one random-forest grid search per embedding type and pickle it right after fitting.
for type_, embeddings in zip(('Aerial', 'Plan'), (aerial_embeddings, plans_embeddings)):
    embed_features = embeddings.columns[embeddings.columns.str.startswith('X')]
    # one row per house: mean (log10) training price joined with that house's embedding
    embed_training_set = training_set[['house_pk', 'price']].groupby('house_pk').mean().merge(embeddings, left_on='house_pk', right_index=True, sort=False)

    print(f'{type_} embeddings; training dataset shape: {embed_training_set[embed_features].shape}')

    # unfortunately, I did not have sufficient RAM to train LightGBM on this dataset, which is considerably wider
    rf_cv.fit(embed_training_set[embed_features], embed_training_set['price'])

    # the context manager flushes and closes the file before the next model is fitted
    with open(f'rf_model_{type_.lower()}_mean.pkl', 'wb') as model_file:
        pkl.dump(rf_cv, model_file)
    print(f'{type_} model stored.')

### Visualize predictions from the embeddings' models

In [None]:
# Predicted vs. real mean (log10) price per house for the two embedding models.
# `sharex` / `sharey` are passed by keyword: current matplotlib only accepts them as keyword arguments.
fig, ax = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True, figsize=(20, 7))
# the two lines below are for common labels for both plots
fig.add_subplot(111, frameon=False)
plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)

for i, (type_, embeddings) in enumerate(zip(('aerial', 'plan'), (aerial_embeddings, plans_embeddings))):
    embed_features = embeddings.columns[embeddings.columns.str.startswith('X')]
    # Evaluate on the held-out rows: mean validation price per house joined with that house's embedding.
    # (This frame used to be built from `training_set`, i.e. from the data the model was fitted on.)
    embed_validation_set = validation_set[['house_pk', 'price']].groupby('house_pk').mean().merge(embeddings, left_on='house_pk', right_index=True, sort=False)

    # NOTE: unpickling executes code stored in the file — only load pickles created by this notebook
    with open(f'rf_model_{type_.lower()}_mean.pkl', 'rb') as model_file:
        embed_model = pkl.load(model_file)

    embed_model_predictions = embed_model.predict(embed_validation_set[embed_features])
    embed_mse = np.round(mean_squared_error(embed_validation_set["price"], embed_model_predictions), 5)
    ax[i].plot(embed_validation_set['price'], embed_model_predictions, color='green', marker='x', linewidth=0)
    ax[i].set_title(f'Random forest predictions {type_} embedding \n MSE: {embed_mse}',
                    fontdict={'fontsize': 20, 'fontweight': 'bold'})
    # add y=x line in data coordinates; a diagonal in axes coordinates is y=x only when both axes
    # happen to share the same limits
    diagonal = [min(embed_validation_set['price'].min(), embed_model_predictions.min()),
                max(embed_validation_set['price'].max(), embed_model_predictions.max())]
    ax[i].plot(diagonal, diagonal)

# these labels land on the invisible full-figure axes added above
plt.xlabel('log10(Real price)')
plt.ylabel('log10(Predicted price)')
plt.tight_layout()

plt.show()

### Create second order models for houses with pictures

In [None]:
from sklearn.linear_model import ElasticNet

In [None]:
# get the training and validation observations where the picture is provided
# (inner join: rows whose house_pk has no aerial embedding are dropped)
training_set_with_pictures = training_set.merge(aerial_embeddings, left_on='house_pk', right_index=True, sort=False)
validation_set_with_pictures = validation_set.merge(aerial_embeddings, left_on='house_pk', right_index=True, sort=False)


def _load_pickle(path):
    """Unpickle one object from `path` and close the file handle afterwards.

    Unpickling executes code stored in the file, so only load pickles created by this notebook.
    """
    with open(path, 'rb') as pickle_file:
        return pkl.load(pickle_file)


# get all models we want to use
lgbm_cv = _load_pickle('lgbm_model.pkl')
rf_aerial_cv = _load_pickle('rf_model_aerial_mean.pkl')
rf_plan_cv = _load_pickle('rf_model_plan_mean.pkl')

In [None]:
# get predictions from the three models that we trained
# NOTE(review): these three variables are not referenced again in the visible part of the notebook —
# the same predictions are recomputed when the prediction frames are built. Confirm whether this cell
# is still needed.
# The plan features come from a separate merge of `training_set` with `plans_embeddings`, not from
# `training_set_with_pictures`.
lgbm_embed_predictions = lgbm_cv.predict(training_set_with_pictures[X_features])
rf_aerial_predictions = rf_aerial_cv.predict(training_set_with_pictures[aerial_embeddings.columns[aerial_embeddings.columns.str.startswith('X')]])
rf_plan_predictions = rf_plan_cv.predict(training_set.merge(plans_embeddings, left_on='house_pk', right_index=True, sort=False)[plans_embeddings.columns[plans_embeddings.columns.str.startswith('X')]])

In [None]:
def _stack_first_order_predictions(frame):
    """Collect the predictions of the three first-order models for the rows of `frame`.

    Parameters
    ----------
    frame : pd.DataFrame
        Rows to predict for; must contain `X_features`, the aerial embedding columns, `house_pk` and
        `price` (i.e. `training_set_with_pictures` or `validation_set_with_pictures`).

    Returns
    -------
    pd.DataFrame
        Columns 'LGBM', 'Aerial', 'Plan' (model predictions) and 'price', indexed like `frame`.
    """
    aerial_cols = aerial_embeddings.columns[aerial_embeddings.columns.str.startswith('X')]
    plan_cols = plans_embeddings.columns[plans_embeddings.columns.str.startswith('X')]
    # Look the plan embedding up by the house_pk of every row of `frame`, so that the 'Plan' column is
    # aligned row by row with the other columns. Merging the plan embeddings onto a different frame
    # only lines up when both embedding tables cover exactly the same houses.
    plan_features = plans_embeddings.loc[frame['house_pk'].values, plan_cols]
    return pd.DataFrame({
        'LGBM': lgbm_cv.predict(frame[X_features]),
        'Aerial': rf_aerial_cv.predict(frame[aerial_cols]),
        'Plan': rf_plan_cv.predict(plan_features),
        'price': frame['price']
    })


all_training_predictions = _stack_first_order_predictions(training_set_with_pictures)

all_validation_predictions = _stack_first_order_predictions(validation_set_with_pictures)

In [None]:
# Second-order (stacking) model: a linear blend of the three first-order predictions.
elasticnet_model = ElasticNet(random_state=random_state)

elasticnet_grid = {
             # the lower bound is kept strictly positive: alpha = 0 turns the penalty off, and
             # scikit-learn advises against running the coordinate-descent solver in that case;
             # log-uniform matches how the regularisation strengths of the LightGBM search are sampled
             'alpha': Real(1e-6, 1, 'log-uniform'),
             'l1_ratio': Real(0, 1, 'uniform'), 
            }

# 25 sequential evaluations, each scored by negative MSE on 5-fold CV repeated 3 times;
# the best configuration is re-fitted on all rows passed to `fit` (refit=True).
elasticnet_cv = BayesSearchCV(elasticnet_model, elasticnet_grid, n_iter=25, scoring='neg_mean_squared_error', refit=True, n_points=1,
                              cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=random_state),
                              n_jobs=3, random_state=random_state)

In [None]:
elasticnet_cv.fit(all_training_predictions[['LGBM', 'Aerial', 'Plan']], all_training_predictions['price'])

In [None]:
elasticnet_cv.best_estimator_.coef_

In [None]:
# Second-order model vs. LightGBM on the validation rows that have pictures.
# `sharex` / `sharey` are passed by keyword: current matplotlib only accepts them as keyword arguments.
fig, ax = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True, figsize=(20, 14))
# the two lines below are for common labels for both plots
fig.add_subplot(111, frameon=False)
plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)

# predict once and reuse the arrays for every panel and title
second_order_predictions = elasticnet_cv.predict(all_validation_predictions[['LGBM', 'Aerial', 'Plan']])
lgbm_pictures_predictions = lgbm_cv.predict(validation_set_with_pictures[X_features])

second_order_mse = np.round(mean_squared_error(all_validation_predictions["price"], second_order_predictions), 5)
ax[0][0].plot(all_validation_predictions['price'], second_order_predictions,
              color='green', marker='x', linewidth=0)
ax[0][0].set_title(f'Predictions from second order model - MSE: {second_order_mse}',
                   fontdict={'fontsize': 20, 'fontweight': 'bold'})

lgbm_pictures_mse = np.round(mean_squared_error(validation_set_with_pictures["price"], lgbm_pictures_predictions), 5)
ax[0][1].plot(validation_set_with_pictures['price'], lgbm_pictures_predictions, color='blue', marker='x', linewidth=0)
ax[0][1].set_title(f'LightGBM predictions - MSE: {lgbm_pictures_mse}',
                   fontdict={'fontsize': 20, 'fontweight': 'bold'})

# NOTE: this panel compares the two models with each other (x: second-order, y: LightGBM),
# so the shared "real vs. predicted" axis labels below do not describe it.
between_models_mse = np.round(mean_squared_error(second_order_predictions, lgbm_pictures_predictions), 5)
ax[1][0].plot(second_order_predictions, lgbm_pictures_predictions, color='red', marker='o', linewidth=0)
ax[1][0].set_title(f'Comparison of LightGBM and second-order model predictions \n MSE: {between_models_mse}',
                   fontdict={'fontsize': 16, 'fontweight': 'bold'})

# the fourth panel of the 2x2 grid is not used
ax.flat[-1].set_visible(False)

# these labels land on the invisible full-figure axes added above
plt.xlabel('log10(Real price)')
plt.ylabel('log10(Predicted price)')
plt.tight_layout()

plt.show()