In [1]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from tensorflow.python.keras.layers import  Input, Embedding, Dot, Reshape, Dense
from tensorflow.python.keras.models import Model
from tensorflow.keras.models import load_model

In [2]:
#read the two item-metadata CSV files into dataframes
booking = pd.read_csv('final_item_metadata.csv')
trivago = pd.read_csv('trivago_item_metadata.csv')

In [3]:
#function that generates training batches for the embedding model
def generate_batch(pairs, nPositive, negative, data, nProperties = None):
    """Yield shuffled training batches of (hotel, property) examples forever.

    Every batch holds nPositive pairs sampled from `pairs` (label 1) and
    nPositive * negative randomly drawn pairs that are NOT in `pairs`
    (label -1).

    Parameters
    ----------
    pairs : sequence of (hotel index, property index) tuples
        The positive examples. Both entries must be numeric.
    nPositive : int
        Number of positive examples per batch.
    negative : int
        Number of negative examples generated per positive example.
    data : sized object
        Only len(data) is used: it is the exclusive upper bound for the
        random hotel index of a negative example.
    nProperties : int, optional
        Exclusive upper bound for the random property index. When omitted,
        the length of the notebook-level `columns` list is used.

    Yields
    ------
    ({'hotel_id': array, 'property': array}, labels)
        Three 1-d float arrays of length nPositive * (1 + negative).
    """
    #look positives up in a set built from the pairs that were passed in,
    #so sampling never depends on whichever pairs_set the notebook holds
    positivePairs = set(pairs)

    if nProperties is None:
        #backward-compatible fallback to the notebook-level column list
        nProperties = len(columns)

    #sets batch's size
    batchSize = nPositive * (1 + negative)
    label = -1

    #creates a generator
    while True:
        #fresh array for every batch: the yielded arrays are views of it,
        #so reusing one buffer would overwrite a batch the consumer still holds
        batch = np.zeros((batchSize, 3))

        #choose randomly element from positives
        for row, (hotel_id, prop) in enumerate(random.sample(pairs, nPositive)):
            batch[row, :] = (hotel_id, prop, 1)

        #negatives start right after the positives (also valid when nPositive is 0)
        count = nPositive

        #adding negative examples until the batch is full
        while count < batchSize:

            #random selection
            randomCount = random.randrange(len(data))
            randomProp = random.randrange(nProperties)

            #keep the candidate only if it is not a positive example
            if (randomCount, randomProp) not in positivePairs:
                batch[count, :] = (randomCount, randomProp, label)
                count += 1

        #make shuffle and yield the batch
        np.random.shuffle(batch)
        yield {'hotel_id': batch[:, 0], 'property': batch[:, 1]}, batch[:, 2]


In [4]:
#function that creates the embedding model
def embedding_model(data, nProperties = None, embeddingSize = 50):
    """Build and compile the hotel/property embedding model.

    The model takes one hotel index and one property index, looks each up in
    its own embedding table and outputs the normalized dot product of the two
    embedding vectors as a single value per example.

    Parameters
    ----------
    data : object with a `.shape` attribute
        data.shape[0] sets the number of rows of the hotel embedding table.
    nProperties : int, optional
        Number of rows of the property embedding table. When omitted, the
        length of the notebook-level `columns` list is used.
    embeddingSize : int, optional
        Length of each embedding vector (default 50).

    Returns
    -------
    The compiled model (optimizer 'Adam', loss 'mse').
    """
    if nProperties is None:
        #backward-compatible fallback to the notebook-level column list
        nProperties = len(columns)

    #model takes 2 inputs: a hotel index and a property index
    hotel_id = Input(name = 'hotel_id', shape = [1])
    prop = Input(name = 'property', shape = [1])

    #one embedding table per input
    hotel_embedding = Embedding(name = 'hotel_embedding', input_dim = data.shape[0], output_dim = embeddingSize)(hotel_id)
    prop_embedding = Embedding(name = 'prop_embedding', input_dim = nProperties, output_dim = embeddingSize)(prop)

    #normalized dot product of the two embeddings along the embedding axis
    merged = Dot(name = 'dot_product', normalize = True, axes = 2)([hotel_embedding, prop_embedding])
    #reshape the dot product result to a single value per example
    merged = Reshape(target_shape = [1])(merged)

    #sets the inputs and output of the model
    model = Model(inputs = [hotel_id, prop], outputs = merged)
    #sets 'mse' for loss parameter and 'Adam' for optimizer parameter
    model.compile(optimizer = 'Adam', loss = 'mse')

    return model

In [7]:
def create_properties_df(data, id_column = 'id'):
    """Turn the '|'-separated `properties` column into 0/1 indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a `properties` column of '|'-separated strings and the
        column named by `id_column`.
    id_column : str, optional
        Name of the column that holds the hotel id (default 'id').

    Returns
    -------
    pandas.DataFrame
        One row per row of `data`: an 'item_id' column followed by one
        column per distinct property, in order of first appearance, holding
        1 where the hotel has the property and 0 otherwise.
    """
    #split every entry once; a missing (non-string) entry means "no properties"
    split_properties = [
        entry.split('|') if isinstance(entry, str) else []
        for entry in data['properties']
    ]

    #column position of each distinct property, in order of first appearance;
    #only real property names get a column (no column for split padding)
    column_of = {}
    for hotel_properties in split_properties:
        for prop in hotel_properties:
            column_of.setdefault(prop, len(column_of))

    #fill the indicator matrix: 1 where the hotel has the property
    indicator = np.zeros((len(split_properties), len(column_of)), dtype = np.int64)
    for row, hotel_properties in enumerate(split_properties):
        for prop in hotel_properties:
            indicator[row, column_of[prop]] = 1

    properties_df = pd.DataFrame(indicator, columns = list(column_of))
    #ids are taken by position, so any index on `data` is fine
    properties_df.insert(0, 'item_id', data[id_column].to_numpy())

    return properties_df

# EMBEDDING FOR BOOKING DATASET

In [None]:
#build the 0/1 property table for the booking hotels
booking_property = create_properties_df(booking)
#write it to disk without the dataframe index
booking_property.to_csv('booking_property.csv', index = False)
#continue with the copy read back from the CSV
#NOTE(review): presumably so later runs can start from the saved file - confirm
booking_property = pd.read_csv('booking_property.csv')

In [12]:
#hotel ids, property column names (every column after the first) and the 0/1 matrix
item_id = booking_property['item_id'].values
columns = booking_property.columns.to_list()[1:]
properties = booking_property[columns].values

#two-way lookup between a hotel id and its row position
bookingid_to_index = {hotel: position for position, hotel in enumerate(item_id)}
bookingindex_to_id = {position: hotel for hotel, position in bookingid_to_index.items()}

In [None]:
#pair every hotel's row position with the column position of each property it has.
#The property is stored as its position in `columns` (not its name), because the
#batch generator writes it into a numeric array and compares it with random integers.
#One vectorized scan replaces the cell-by-cell dataframe lookups; the pairs come
#out in the same order (hotel by hotel, properties left to right).
hotel_rows, property_cols = np.nonzero(booking_property[columns].to_numpy() == 1)
booking_pairs = list(zip(hotel_rows.tolist(), property_cols.tolist()))

pairs_set = set(booking_pairs)

In [34]:
#build the embedding model (the hotel table is sized by the rows of booking_property)
#and print its layer summary
model = embedding_model(booking_property)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hotel_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
property (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
hotel_embedding (Embedding)     (None, 1, 50)        7565300     hotel_id[0][0]                   
__________________________________________________________________________________________________
prop_embedding (Embedding)      (None, 1, 50)        17350       property[0][0]                   
______________________________________________________________________________________________

In [36]:
#create the batch generator: 1024 positive pairs per batch, 2 negatives per positive
gen = generate_batch(booking_pairs, 1024, 2, booking_property)

#train for 15 epochs straight from the generator;
#Model.fit accepts generators and replaces the deprecated fit_generator
h = model.fit(gen, epochs = 15, steps_per_epoch = len(booking_pairs) // 1024, verbose = 1)

model.save('./booking_embedding_50.h5')

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [9]:
#reload the saved booking model and take the first weight array of its
#'hotel_embedding' layer (one row per hotel)
b_model = load_model('booking_embedding_50.h5')
b_layer = b_model.get_layer('hotel_embedding')
b_weights = b_layer.get_weights()[0]
#show the shape of the weight matrix
b_weights.shape

(151306, 50)

In [17]:
#scale every booking embedding row to unit length
b_row_norms = np.linalg.norm(b_weights, axis = 1, keepdims = True)
b_weights = b_weights / b_row_norms
#sanity check: first ten values of row 0 and its squared length
print(b_weights[0][:10])
print(np.sum(np.square(b_weights[0])))

[ 0.01047642 -0.1007245   0.13313863 -0.09584478 -0.24112956  0.17258188
  0.06434256  0.07664035 -0.1952924   0.03927489]
0.99999994


# EMBEDDING FOR TRIVAGO DATASET

In [None]:
#build the 0/1 property table for the trivago hotels
trivago_property = create_properties_df(trivago)
#write it to disk without the dataframe index
trivago_property.to_csv('trivago_property.csv', index = False)
#continue with the copy read back from the CSV
#NOTE(review): presumably so later runs can start from the saved file - confirm
trivago_property = pd.read_csv('trivago_property.csv')

In [20]:
#hotel ids, property column names (every column after the first) and the 0/1 matrix
#of trivago_property
item_id = trivago_property['item_id'].values
columns = trivago_property.columns.to_list()[1:]
properties = trivago_property[columns].values

#lookups: hotel id <-> row position, row position -> property row, property name -> column position
trivagoid_to_index = {hotel: position for position, hotel in enumerate(item_id)}
trivagoindex_to_id = {position: hotel for hotel, position in trivagoid_to_index.items()}
index_to_properties = dict(enumerate(properties))
properties_to_index = {name: position for position, name in enumerate(columns)}

In [None]:
#pair every hotel's row position with the column position of each property it has.
#One vectorized scan of the 0/1 matrix replaces the cell-by-cell dataframe lookups;
#the pairs come out in the same order (hotel by hotel, properties left to right).
hotel_rows, property_cols = np.nonzero(trivago_property[columns].to_numpy() == 1)
trivago_pairs = list(zip(hotel_rows.tolist(), property_cols.tolist()))

pairs_set = set(trivago_pairs)

In [132]:
#build the embedding model (the hotel table is sized by the rows of trivago_property)
#and print its layer summary
model = embedding_model(trivago_property)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hotel_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
property (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
hotel_embedding (Embedding)     (None, 1, 50)        7565300     hotel_id[0][0]                   
__________________________________________________________________________________________________
prop_embedding (Embedding)      (None, 1, 50)        7900        property[0][0]                   
______________________________________________________________________________________________

In [133]:
#create the batch generator: 1024 positive pairs per batch, 2 negatives per positive
gen = generate_batch(trivago_pairs, 1024, 2, trivago_property)

#train for 15 epochs straight from the generator;
#Model.fit accepts generators and replaces the deprecated fit_generator
h = model.fit(gen, epochs = 15, steps_per_epoch = len(trivago_pairs) // 1024, verbose = 1)

model.save('./trivago_embedding_50.h5')

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [22]:
#reload the saved trivago model and take the first weight array of its
#'hotel_embedding' layer (one row per hotel)
t_model = load_model('trivago_embedding_50.h5')
t_layer = t_model.get_layer('hotel_embedding')
t_weights = t_layer.get_weights()[0]
#show the shape of the weight matrix
t_weights.shape

(151306, 50)

In [23]:
#scale every trivago embedding row to unit length
t_row_norms = np.linalg.norm(t_weights, axis = 1, keepdims = True)
t_weights = t_weights / t_row_norms
#sanity check: first ten values of row 0 and its squared length
print(t_weights[0][:10])
print(np.sum(np.square(t_weights[0])))

[-0.04348783 -0.27479154 -0.03999451 -0.03117155  0.08040609  0.07249643
 -0.04680613  0.03372157  0.04383764 -0.03346203]
1.0


# FINDING SIMILARITY

In [24]:
#function that matches one trivago hotel to its most similar, not yet taken booking hotel
def find_similar(hotel_id, b_weights, t_weights):
    """Greedily match one trivago hotel to a booking hotel.

    The trivago hotel's embedding row is compared with every row of
    `b_weights` by dot product. The booking hotel with the highest score
    that is not already in the notebook-level set `maxi` is chosen; its id
    is added to `maxi` and the tuple (hotel_id, booking id, score) is added
    to the notebook-level set `match`.

    Parameters
    ----------
    hotel_id : key of `trivagoid_to_index`
        Id of the trivago hotel. Unknown ids are skipped silently.
    b_weights : 2-d array
        Booking embedding matrix; row i belongs to bookingindex_to_id[i].
    t_weights : 2-d array
        Trivago embedding matrix, indexed through trivagoid_to_index.

    Returns
    -------
    None. The result is recorded in `maxi` and `match`.
    """
    #hotels without a row in the trivago embedding are skipped
    try:
        trivago_row = trivagoid_to_index[hotel_id]
    except KeyError:
        return

    #dot product of every booking embedding with this hotel's embedding
    #(a higher value means more similar)
    distances = np.dot(b_weights, t_weights[trivago_row])

    #argsort is ascending, so the best candidates sit at the end;
    #keep at most len(trivago) of them
    ranked = np.argsort(distances)
    closest = ranked[-len(trivago):]

    #walk from best to worst and take the first booking hotel that is still free
    for candidate in reversed(closest):
        booking_id = bookingindex_to_id[candidate]
        if booking_id not in maxi:
            maxi.add(booking_id)
            match.add((hotel_id, booking_id, distances[candidate]))
            break

In [None]:
#start from empty bookkeeping sets: booking ids already taken, and the recorded matches
maxi = set()
match = set()

#match the trivago hotels one by one, in row order
for trivago_hotel_id in trivago['item_id'].to_numpy():
    find_similar(trivago_hotel_id, b_weights, t_weights)

In [None]:
#one row per recorded match: trivago id, booking id and their similarity
match_columns = ['Trivago id', 'Booking id', 'Similarity']
match_df = pd.DataFrame(list(match), columns = match_columns)

In [31]:
#display the matched pairs
match_df

Unnamed: 0,Trivago id,Booking id,Similarity
0,1319616,2212240,0.200216
1,9728918,242526,0.043005
2,52782,61590,0.152323
3,1369589,2653900,-0.157994
4,2833400,929753,-0.038480
...,...,...,...
151301,2831716,492066,-0.032943
151302,4728514,1408010,-0.286148
151303,19570,61815,0.019076
151304,57237,2422279,-0.099991


In [32]:
#based on the paired ids:
#join booking's metadata (on 'id') and trivago's metadata (on 'item_id') onto every match

#line the key names up with the metadata tables
match_df = match_df.rename(columns = {'Trivago id': 'item_id', 'Booking id': 'id'})

#attach the booking columns first, then the trivago columns
match_df = match_df.merge(booking, on = 'id').merge(trivago, on = 'item_id')

#NOTE(review): this renames every column whose name contains 'properties', while the drop
#below removes only the column named exactly 'booking_properties' - confirm against the
#column names that actually exist after the two merges
match_df.columns = match_df.columns.str.replace('properties', 'booking_properties')

#remove the columns that are not exported and save the result
unused_columns = ['id', 'Similarity', 'url', 'booking_properties']
match_df = match_df.drop(columns = unused_columns)
match_df.to_csv('real_item_metadata.csv', index = False)

# EMBEDDING FOR MATCHED DATASET

In [27]:
#load the merged metadata file written after the matching step
item_metadata = pd.read_csv('real_item_metadata.csv')
#the property table for the matched dataset is read from the trivago property file
matched_property = pd.read_csv('trivago_property.csv')

In [29]:
#hotel ids, property column names (every column after the first) and the 0/1 matrix
#of matched_property
item_id = matched_property['item_id'].values
columns = matched_property.columns.to_list()[1:]
properties = matched_property[columns].values

#lookups: hotel id <-> row position, row position -> property row, property name -> column position
matchedid_to_index = {hotel: position for position, hotel in enumerate(item_id)}
matchedindex_to_id = {position: hotel for hotel, position in matchedid_to_index.items()}
index_to_properties = dict(enumerate(properties))
properties_to_index = {name: position for position, name in enumerate(columns)}

In [None]:
#pair every hotel's row position with the column position of each property it has.
#One vectorized scan of the 0/1 matrix replaces the cell-by-cell dataframe lookups;
#the pairs come out in the same order (hotel by hotel, properties left to right).
hotel_rows, property_cols = np.nonzero(matched_property[columns].to_numpy() == 1)
matched_pairs = list(zip(hotel_rows.tolist(), property_cols.tolist()))

pairs_set = set(matched_pairs)

In [9]:
#build the embedding model (the hotel table is sized by the rows of matched_property)
#and print its layer summary
model = embedding_model(matched_property)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hotel_id (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
property (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
hotel_embedding (Embedding)     (None, 1, 50)        7565300     hotel_id[0][0]                   
__________________________________________________________________________________________________
prop_embedding (Embedding)      (None, 1, 50)        7900        property[0][0]                   
______________________________________________________________________________________________

In [10]:
#create the batch generator: 1024 positive pairs per batch, 2 negatives per positive
gen = generate_batch(matched_pairs, 1024, 2, matched_property)

#train for 15 epochs straight from the generator;
#Model.fit accepts generators and replaces the deprecated fit_generator
h = model.fit(gen, epochs = 15, steps_per_epoch = len(matched_pairs) // 1024, verbose = 1)

model.save('./real_embedding_50.h5')

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [32]:
#reload the saved matched-dataset model and take the first weight array of its
#'hotel_embedding' layer (one row per hotel)
model = load_model('real_embedding_50.h5')
layer = model.get_layer('hotel_embedding')
weights = layer.get_weights()[0]
#show the shape of the weight matrix
weights.shape

(151306, 50)

In [34]:
#scale every matched-dataset embedding row to unit length
row_norms = np.linalg.norm(weights, axis = 1, keepdims = True)
weights = weights / row_norms
#sanity check: first ten values of row 0 and its squared length
print(weights[0][:10])
print(np.sum(np.square(weights[0])))

[ 0.1264639   0.2942896   0.14372419 -0.29082537 -0.12410337  0.11697369
 -0.06726164  0.13039161 -0.00575748 -0.02209389]
1.0
