In [73]:
import keras
import numpy as np
import pandas as pd
import pickle as pkl


import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

from tensorflow.keras.utils import plot_model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.regularizers import l2, l1
from tensorflow.keras.initializers import Constant
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.losses import MeanAbsoluteError, MeanAbsolutePercentageError, MeanSquaredError
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, History, LearningRateScheduler
from tensorflow.keras.layers import BatchNormalization, Conv2D, Conv1D, MaxPooling2D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling2D, Activation, Dropout, Dense, Input, Multiply, concatenate

#### Pre-process the data

In [44]:
from sklearn.preprocessing import LabelEncoder

# load listing data
listing_data = pd.read_csv('./data/listings.csv')

# copy the data frame to apply changes savely
new_listing_df = listing_data.copy()

# data pre-processing
# drop identifiers & unrequired columns
drop_cols_list = [
    'scrape_id', 'name', 'description', 'neighborhood_overview',
    'picture_url', 'host_id', 'host_url', 'host_name', 'host_location', 'host_about',
    'host_response_time', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_verifications',
    'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'property_type', 'room_type',
    'bathrooms_text', 'amenities', 'license', 'last_scraped', 'source', 'host_since', 'calendar_updated',
    'calendar_last_scraped', 'first_review', 'last_review'
]
new_listing_df.drop(drop_cols_list, axis=1, inplace=True)

# drop records with null [price, host_response_rate, host_acceptance_rate] value
new_listing_df = new_listing_df.dropna(axis=0, subset=['price', 'host_response_rate', 'host_acceptance_rate'])

# encode T/F columns 
tf_columns = [
    'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
    'has_availability', 'instant_bookable',     
]
label_encoder = LabelEncoder().fit(new_listing_df['host_is_superhost'])
for col in tf_columns:
    new_listing_df[col] = label_encoder.transform(new_listing_df[col])

# fill null numerical values with median
cols_to_fill_miss_values = [
    'reviews_per_month', 'review_scores_value', 'review_scores_location', 'review_scores_communication',
    'review_scores_checkin', 'review_scores_cleanliness', 'review_scores_accuracy', 'review_scores_rating', 'beds',
    'bedrooms', 'bathrooms', 
]
for col in cols_to_fill_miss_values:
    new_listing_df[col] = new_listing_df[col].fillna(new_listing_df[col].median())

# convert object values to numerical values
obj_cols = ['host_response_rate', 'host_acceptance_rate']
for col in obj_cols:
    new_listing_df[col] = pd.to_numeric(new_listing_df[col].map(lambda val: val.replace('%', '')))
new_listing_df['price'] = pd.to_numeric(new_listing_df['price'].map(lambda val: val.replace('$', '').replace(',', '')))


# all two columns('images_names', 'images_No') with none values
new_listing_df['images_names'] = [None for _ in range(len(new_listing_df))]
new_listing_df['images_No'] = [ 0 for _ in range(len(new_listing_df))]

new_listing_df.to_csv('./data/preprocessed_listings.csv')

In [38]:
new_listing_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19142 entries, 1 to 37763
Data columns (total 46 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            19142 non-null  int64  
 1   host_response_rate                            19142 non-null  int64  
 2   host_acceptance_rate                          19142 non-null  int64  
 3   host_is_superhost                             19142 non-null  int32  
 4   host_listings_count                           19142 non-null  float64
 5   host_total_listings_count                     19142 non-null  float64
 6   host_has_profile_pic                          19142 non-null  int32  
 7   host_identity_verified                        19142 non-null  int32  
 8   latitude                                      19142 non-null  float64
 9   longitude                                     19142 non-null  floa

* #### We need to fill new columns (images_names, images_No) using script_of_image_scraping file first then follow 

In [2]:
# read and prepare data
img_size = (224, 224)
data = pd.read_csv('./data/preprocessed_listings.csv')
data = data.drop('Unnamed: 0', axis=1)[data['images_No'] == 5] # drop 'Unnamed: 0' column and use only recordes with images. 

  data = pd.read_csv('./data/preprocessed_listings.csv')


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1268 entries, 0 to 1323
Data columns (total 47 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            1268 non-null   int64  
 1   listing_url                                   1268 non-null   object 
 2   host_response_rate                            1268 non-null   int64  
 3   host_acceptance_rate                          1268 non-null   int64  
 4   host_is_superhost                             1268 non-null   int64  
 5   host_listings_count                           1268 non-null   float64
 6   host_total_listings_count                     1268 non-null   float64
 7   host_has_profile_pic                          1268 non-null   int64  
 8   host_identity_verified                        1268 non-null   int64  
 9   latitude                                      1268 non-null   f

### Fine tune CV model

In [4]:
def get_paths_and_prices(df):
    paths = []
    prices = []
    for _, row in df.iterrows():
        for name in row['images_names'].split(','):
            paths.append(f'./images/{name}')
            prices.append(row['price'])
    return paths, prices

def customise_data(data):
    x_train, x_test, _, _ = train_test_split(data, [0 for _ in range(len(data))], test_size=.2, shuffle=True, random_state=303)
    paths, prices = get_paths_and_prices(x_train)
    dict_ = {
        'imgs': paths,
        'prices': prices
    }
    new_x_train = pd.DataFrame(dict_)

    paths, prices = get_paths_and_prices(x_test)
    dict_ = {
        'imgs': paths,
        'prices': prices
    }
    new_x_test = pd.DataFrame(dict_)

    return new_x_train, new_x_test

In [12]:
train_df, test_df = customise_data(data)

# fine tune CV model (Inception v3)
# Create the model
img_input = Input(shape=(img_size[0], img_size[1], 3))
Incep3_base_model = InceptionV3(weights="imagenet", include_top=False, input_tensor=img_input)

# stop weight update
for layer in Incep3_base_model.layers:
    layer.trainable = False

# update inseption model strcture by adding some layers to make it deal with our price prediction task
Incep3_base_model = Incep3_base_model(img_input, training=False)
Incep3_base_model = GlobalAveragePooling2D(name="avg_pool")(Incep3_base_model)
Incep3_base_model = BatchNormalization()(Incep3_base_model)
Incep3_base_model = Dense(64, activation='relu', kernel_initializer='he_normal', name='dense_hidden_cv')(Incep3_base_model)
Incep3_base_model = Dense(1, activation='linear')(Incep3_base_model)
Incep3_model = Model(inputs=img_input, outputs=Incep3_base_model)

# initialize hyper-prameters
lr = 0.1 
verbose = 1
epochs = 100
batch_size = 64 
        
Incep3_model.compile(optimizer=Adam(learning_rate=lr, epsilon=1), metrics=['mse'], loss='mse')
Incep3_model.summary()
plot_model(Incep3_model, show_shapes=True, to_file='./inceptionV3_model_image.png')
stop = EarlyStopping(monitor="val_loss", patience=30, restore_best_weights=True, mode='min', verbose=verbose)
best = ModelCheckpoint(
    filepath='./model/best_iv3_weights.keras',
    save_best_only=True,
    monitor='val_loss',
    verbose=verbose,
    mode='min'
)

train_img_generator = ImageDataGenerator(
    brightness_range=(0.75, 1),
    shear_range=0.1,
    zoom_range=[0.50, 1],
    featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True
)
test_img_generator = ImageDataGenerator(rescale=1.0)

imgs_train = train_img_generator.flow_from_dataframe(
        dataframe=train_df,
        x_col="imgs",  
        y_col="prices",  
        class_mode="raw",  
        color_mode='rgb',
        target_size=img_size,
        batch_size=batch_size
        )
imgs_test = test_img_generator.flow_from_dataframe(
        dataframe=test_df,
        x_col="imgs",
        y_col="prices",
        class_mode="raw",
        color_mode='rgb',
        target_size=img_size,
        batch_size=batch_size
        )

hist = Incep3_model.fit(
    imgs_train, 
    validation_data = imgs_test,
    batch_size=batch_size,
    verbose=verbose,
    epochs=epochs,
    callbacks=[stop, best]
)

Incep3_model.save('./model/last_iv3_model_simple.keras')


You must install pydot (`pip install pydot`) for `plot_model` to work.
Found 5070 validated image filenames.
Found 1270 validated image filenames.




Epoch 1/100


  self._warn_if_super_not_called()


[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - loss: 38124.3594 - mse: 38319.4883
Epoch 1: val_loss improved from inf to 37123.03516, saving model to ./model/best_iv3_weights.keras
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 4s/step - loss: 38085.5391 - mse: 38280.3906 - val_loss: 37123.0352 - val_mse: 37293.1758
Epoch 2/100
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - loss: 34927.4141 - mse: 34932.2891
Epoch 2: val_loss improved from 37123.03516 to 30626.72852, saving model to ./model/best_iv3_weights.keras
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m282s[0m 3s/step - loss: 34898.8828 - mse: 34903.9922 - val_loss: 30626.7285 - val_mse: 30866.5586
Epoch 3/100
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - loss: 34807.6680 - mse: 35101.1484
Epoch 3: val_loss did not improve from 30626.72852
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m306s[0m 4s/step - loss



ValueError: Unable to synchronously create dataset (name already exists)

### Fine tune model for structure data

In [95]:
# read, prepare and split data
data_std = data.drop(['id', 'listing_url', 'images_names', 'images_No'], axis=1)
y_data_std = data_std['price']
x_data_std = data_std.drop('price', axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_data_std, y_data_std, test_size=.2, shuffle=True, random_state=303)

In [96]:
# build std model (simple MLP algorithm)
input_layer = Input(shape=(x_train.shape[1],), name='input_layer_std')
base_model = Dense(x_train.shape[1], activation='relu', kernel_initializer='he_normal')(input_layer)
base_model = Dense(25, activation='relu', name='dense_hidden_std')(base_model)
output_layer = Dense(1, activation='linear')(base_model)
model_std = Model(inputs=input_layer, outputs=output_layer)

print(model_std.summary())

None


In [97]:
# compile and train the model
epochs = 400
lr = ExponentialDecay(0.01, decay_steps=100000, decay_rate=0.96, staircase=True)
stop = EarlyStopping(monitor="val_loss", patience=30, restore_best_weights=True, mode='min', verbose=verbose)
best = ModelCheckpoint(
    filepath='./model/best_std_weights.keras',
    save_best_only=True,
    monitor='val_loss',
    verbose=verbose,
    mode='min'
)

model_std.compile(
    loss="mean_squared_error",
    metrics=[MeanSquaredError()],
    optimizer=Adam(learning_rate=lr, epsilon=1)
)
plot_model(model_std, show_shapes=True, to_file='./model_std.png')

hist_std = model_std.fit(
    x_train, y_train,
    epochs=epochs,
    callbacks=[stop, best],
    validation_data=([x_test, y_test])
)

model_std.save('./model/last_std_weights.keras')

You must install pydot (`pip install pydot`) for `plot_model` to work.
Epoch 1/400
[1m 1/32[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m29s[0m 958ms/step - loss: 60376.2695 - mean_squared_error: 60376.2695
Epoch 1: val_loss improved from inf to 69128.20312, saving model to ./model/best_std_weights.keras
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 57300.7031 - mean_squared_error: 57300.7031 - val_loss: 69128.2031 - val_mean_squared_error: 69128.2031
Epoch 2/400
[1m 1/32[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 20ms/step - loss: 25399.1875 - mean_squared_error: 25399.1875
Epoch 2: val_loss did not improve from 69128.20312
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 30988.6953 - mean_squared_error: 30988.6953 - val_loss: 76306.3750 - val_mean_squared_error: 76306.3750
Epoch 3/400
[1m 1/32[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 17ms/step - loss: 20656.9473 - mean_squared_error: 20656.9473
Epoch 3: val_loss im

### Fine tune multi-modal model

In [115]:
def split_XY_data(df:pd.DataFrame):
    df_copy = df.copy()
    x_imgs = []
    y = []
    for i, row in df.iterrows():
        img_names = row['images_names'].split(',')
        for img_name in img_names:
            img_path = f'./images/{img_name}'
            # print(img_path)
            img = load_img(img_path, target_size=img_size)
            img = img_to_array(img)
            # img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))
            x_imgs.append(img)
            y.append(row['price'])
    Y = pd.DataFrame({'price': y})
    x_std = df_copy.drop(['id', 'listing_url', 'images_names', 'images_No', 'price'], axis=1)
    x_std = pd.DataFrame(np.repeat(x_std.values, 5, axis=0), columns=x_std.columns)
    
    return np.array(x_imgs), x_std, Y 

In [42]:
# def load_imgs(img_list, img_size):
#     all_imgs = []
#     for img_sublist in img_list:
#         imgs_for_one_unit = []
#         for img_name in img_sublist:
#             img_path = f'./images/{img_name}'
#             img = load_img(img_path, img_size)
#             img = img_to_array(img)
#             imgs_for_one_unit.append(img)
#         all_imgs.append(imgs_for_one_unit) 
        
#     all_imgs = np.array(all_imgs)
#     print(all_imgs.shape)
#     return all_imgs

In [116]:
# python code to create the final architecture

train_df, test_df, _, _ = train_test_split(data, [0 for _ in range(len(data))], test_size=.2, shuffle=True, random_state=303)
# load data (training)
x_train_imgs, x_train_std, y_train = split_XY_data(train_df)
print(f'X_train imgs len: {len(x_train_imgs)}')
print(f'X_train std shape: {x_train_std.shape}')
print(f'Y_train std shape: {y_train.shape}')

# load data (testing)
x_test_imgs, x_test_std, y_test = split_XY_data(test_df)
print(f'X_test imgs len: {len(x_test_imgs)}')
print(f'X_test std shape: {x_test_std.shape}')
print(f'Y_test std shape: {y_test.shape}')

# # save loaded images for future use
# with open('./model/x_train_imgs.pkl','wb') as train_file, open('./model/x_test_imgs.pkl','wb') as test_file:
#     pkl.dump(x_train_imgs, train_file)
#     pkl.dump(x_test_imgs, test_file)

X_train imgs len: 5070
X_train std shape: (5070, 42)
Y_train std shape: (5070, 1)
X_test imgs len: 1270
X_test std shape: (1270, 42)
Y_test std shape: (1270, 1)


In [114]:
for arr in x_test_imgs:
    if arr.shape == (1, 224, 224, 3):
        print(arr.shape)

In [98]:
# build multimodal NN

# load fine tuned models
# 1) load computer vision model
cv_tuned_model_path = './model/best_iv3_weights.keras'
cv_tuned_model = keras.models.load_model(cv_tuned_model_path, compile=False)
# 2) load structured model
std_tuned_model_path = './model/best_std_weights.keras'
std_tuned_model = keras.models.load_model(std_tuned_model_path, compile=False)

# remove last layer from each model
# just to simply identify the layers after concatenation we will add postfix (_cv, _std) for layers
# _cv => to layers of computer vision model layers
# _std => to layers of structure NN model layers
cv_tuned_model_layer = Model(inputs=cv_tuned_model.input, outputs=cv_tuned_model.layers[-2].output)
for layer in cv_tuned_model_layer.layers:
    layer._name = f'{layer.name}_cv'
std_tuned_model_layer = Model(inputs=std_tuned_model.input, outputs=std_tuned_model.layers[-2].output)
for layer in std_tuned_model_layer.layers:
    layer._name = f'{layer.name}_std'

In [100]:
# concate models and add 2 final layer
multimodal_structure = concatenate([
    cv_tuned_model_layer.output,
    std_tuned_model_layer.output
])
multimodal_structure = Dense(50, activation='relu', name='dense_hidden_final')(multimodal_structure)
multimodal_structure = Dense(1, activation='linear')(multimodal_structure)

# build model
multimodal_model = keras.Model(inputs=[
    cv_tuned_model_layer.input[0], 
    std_tuned_model_layer.input[0]],
    outputs=[multimodal_structure]
)

# model compilation
lr = 0.01
multimodal_model.compile(
    optimizer=Adam(learning_rate=lr, epsilon=1),
    loss="mean_squared_error",
    metrics=[ MeanSquaredError()]
)

print(multimodal_model.summary())

None


In [101]:
# plot model
plot_model(
    multimodal_model,
    dpi=350,
    show_shapes=True,
    show_dtype=False,
    show_layer_names=True,
    to_file="multimodal_model.png"
)

You must install pydot (`pip install pydot`) for `plot_model` to work.


In [117]:
# fit model
stop = EarlyStopping(monitor="val_loss", patience=15, restore_best_weights=True, mode='min', verbose=verbose)
best = ModelCheckpoint(
    verbose=1,
    mode='min',
    monitor='val_loss',
    save_best_only=True, 
    save_weights_only=False, 
    filepath='./model/best_multimodal_weights.keras'
)

hist_multimodal = multimodal_model.fit(
    [x_train_imgs, x_train_std],
    y_train,                   
    epochs=epochs,
    batch_size = 256,
    validation_data=([x_test_imgs, x_test_std], y_test),
    callbacks=[stop, best]
)

Epoch 1/400
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28s/step - loss: 30794.9785 - mean_squared_error: 30794.9785 
Epoch 1: val_loss did not improve from inf
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m695s[0m 29s/step - loss: 30349.7773 - mean_squared_error: 30349.7773 - val_loss: inf - val_mean_squared_error: inf
Epoch 2/400
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13s/step - loss: 17583.7812 - mean_squared_error: 17583.7812 
Epoch 2: val_loss did not improve from inf
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 14s/step - loss: 17589.6523 - mean_squared_error: 17589.6523 - val_loss: inf - val_mean_squared_error: inf
Epoch 3/400
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - loss: 16568.1992 - mean_squared_error: 16568.1992
Epoch 3: val_loss improved from inf to 50643.82812, saving model to ./model/best_multimodal_weights.keras
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m