In [34]:
import pickle
import numpy as np
import pandas as pd
from glob import glob
import tensorflow as tf
from tensorflow import keras
from scipy.sparse import csr_matrix
from tensorflow.keras import layers
from tensorflow.keras import Input, Model
from sklearn.neighbors import DistanceMetric
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional

In [3]:
train = pd.read_csv('real_train.csv')
test = pd.read_csv('real_test.csv')

In [4]:
#get "impressions" and "prices" from train dataset and split them by |
train['impressions'] = train['impressions'].astype(str)
train['impressions'] = train['impressions'].map(lambda x: x.lstrip('[').rstrip(']'))
impressions = pd.DataFrame(train.impressions.str.split('|').tolist())
train.drop(columns=['impressions'], inplace = True)
train['prices'] = train['prices'].astype(str)
prices = pd.DataFrame(train.prices.str.split('|').tolist())

#concatenate impressions to train dataset as columns 
train = pd.concat([train, impressions], axis=1)
columns = train.columns.to_list()
columns[11:] = ['impressions_'+str(i) for i in range(25)]
train.columns = columns

In [None]:
# function that calculates weights from a neural network model
def calculate_weights(name, model):
    #calculate weights
    layer = model.get_layer(name)
    weights = layer.get_weights()[0]
    
    #normalize weight results
    weights = weights / np.linalg.norm(weights, axis = 1).reshape((-1, 1))
    
    return weights

#return hotel index 
def get_hotel_index(x):
    if x != None:
        try:
            return id_to_index[int(x)] 
        except:
            return 0
    else:
        return np.nan

In [19]:
embedding_model = load_model('real_embedding_50.h5')
hotel_weights = calculate_weights('hotel_embedding', embedding_model)

In [None]:
properties_df = pd.read_csv('real_embedding.csv')

#get item_id's and columns names from properties_df 
item_id = properties_df['item_id'].values

#create a dictionary as item_id to index
id_to_index = {item_id: idx for idx, item_id in enumerate(item_id)}
index_to_id = {idx: item_id for item_id, idx in id_to_index.items()}

# LSTM

In [23]:
#create a LSTM model with multi input to one output with 2 Dense Layer and Bidirectional Layer
def multi_input_one_output_LSTM(weight):
    inputs = Input(shape=(None,)) 
    
    embed = Embedding(hotel_weights.shape[0], 50, weights = [hotel_weights], trainable = False)(inputs) 
    lstm_out = Bidirectional(LSTM(25, return_sequences = False))(embed) 
    
    #outputs are set as softmax
    classifier = Dense(36, activation = 'relu')(lstm_out) 
    outputs = Dense(weight, activation = 'softmax')(classifier) 
    model = Model(inputs = [inputs], outputs = [outputs])
    
    return model

In [None]:
#function that prepare datasets for train_test_split()
def prepare_split_LSTM():
    #get not null rows' ids and indices
    impressionOtel_id = impressions[impressions[0] != 'nan']
    impressionOtel_index = impressionOtel_id.applymap(get_hotel_index)

    X = []
    y = []
    
    #for each line of train dataset impressions
    for i in range(impressionOtel_index.shape[0]):
        notNullCount = impressionOtel_index.iloc[i][:].count()
       
        y.append(id_to_index[train['reference'][i]])
        row = np.array(list(impressionOtel_index.iloc[i][:notNullCount].values)).reshape(-1,1)
        X.append(row)
        
    return X,y

In [None]:
#separates generated X and y arrays 20% as test and 80% as train
X, y = prepare_split_LSTM()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [24]:
#create LSTM model and compile with sparse_categorical_crossentropy as loss and adam as optimizer 
model = multi_input_one_output_LSTM(hotel_weights.shape[0])

model.compile(loss = "sparse_categorical_crossentropy", 
              optimizer = "adam", metrics = [tf.keras.metrics.SparseTopKCategoricalAccuracy(k = 1000)])

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 50)          7565300   
_________________________________________________________________
bidirectional (Bidirectional (None, 50)                15200     
_________________________________________________________________
dense (Dense)                (None, 36)                1836      
_________________________________________________________________
dense_1 (Dense)              (None, 151306)            5598322   
Total params: 13,180,658
Trainable params: 5,615,358
Non-trainable params: 7,565,300
_________________________________________________________________


In [None]:
X_train = [np.array(i).astype('int') for i in X_train]
y_train = [np.array([i]).astype('int') for i in y_train] 

In [None]:
#function that generate batches for model to training
def generator(x, y, epoch):
    for j in range(epoch):
        for i in range(len(y)):
            yield x[i].reshape(1,-1), y[i]
train_gen = generator(X_train, y_train, 5)

In [126]:
#fit the model with 5 epoch and with generator function
model.fit(train_gen, epochs = 5, steps_per_epoch = len(y_train))
model.save('lstm_predict.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fba0d26ea90>

In [25]:
model = load_model('lstm_predict.h5')

In [None]:
#finding the predict result for each element of X_test and save in every 1000 to a file

maximum = []
otel_count = 1
line = 0

for i in X_test:
    temp = model.predict(i.reshape(-1,len(i)))
    temp = temp.reshape(-1)
    line += 1
    if (line % 1000) == 0:
        with open(f"/predict_results/prediction{otel_count}.pkl", 'wb') as fp:
            pickle.dump(maximum,fp)
        otel_count += 1
        maximum = []
    #append largest 20 predicted otels
    maximum.append(temp.argsort()[-20:][::-1])
    

otel_count += 1
with open(f"/predict_results/prediction{otel_count}.pkl", 'wb') as fp:
    pickle.dump(maximum,fp)

files = glob('/predict_results/*.pkl')
predict_df = pd.concat([pd.DataFrame(pd.read_pickle(fp)) for fp in files], ignore_index = True)
predict_df.to_csv('lstm_predict.csv', index = False)

In [30]:
lstm_predict = pd.read_csv('lstm_predict.csv')

In [31]:
lstm_predict

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,123903,36025,75398,136887,51696,41243,16223,113137,35070,35551,35262,92850,63654,24532,96904,71460,32937,109706,63542,88871
1,6632,105666,7525,87228,133153,27623,123345,44867,75800,147797,118887,57587,106741,95331,32937,92038,93266,32318,77274,120525
2,123903,41243,36025,51696,136887,75398,35070,16223,113137,35551,35262,92850,63654,71460,96904,24532,63542,109706,88871,120345
3,11616,55061,83261,29439,142504,127463,104746,109157,92038,133153,115027,25254,34811,111065,42253,124195,30523,119694,83815,6632
4,9516,54953,78379,123220,83261,115707,135050,51829,49541,108574,137802,31842,72436,70896,114273,54028,141937,55693,139913,53093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145201,107412,72400,18000,45400,141937,105981,99611,34811,34024,75800,123345,94530,133153,123212,65619,30523,127336,45263,75577,100538
145202,58625,140719,78379,135050,140782,9829,101059,9516,93326,113414,991,123212,40188,31600,85345,75341,2850,91825,102345,101649
145203,140719,31842,9516,78379,48094,145100,104539,131002,110528,68953,149697,34811,118475,68041,135050,53093,72436,133153,991,127478
145204,66637,71254,31472,103499,15089,20906,144228,96285,135544,56301,79852,97299,115885,490,142012,63511,139584,114391,11125,20360


# KNN ALGORITHM

In [None]:
#function that prepare datasets for train_test_split()
def prepare_split_KNN():
    #get not null rows' ids and indices
    impressionOtel_id = impressions[impressions[0] != 'nan']
    impressionOtel_index = impressionOtel_id.applymap(get_hotel_index)
    
    X = []
    y = []
    
    #for each line of train dataset impressions
    for i in range(impressionOtel_index.shape[0]):
        notNullCount = impressionOtel_index.iloc[i][:].count()
        
        #checks if not null count of impressions list greater than zero 
        if(notNullCount > 0):
            y.append(id_to_index[train['reference'][i]])
            row = list(impressionOtel_index.iloc[i][:notNullCount].values)
            
            #checks if not null count of impressions list less than 25 (if true assign 0)
            if(notNullCount < 25):
                row = row + [0]*(25-notNullCount)

            X.append(np.asarray(row))
            
    return X,y

In [None]:
#separates generated X and y arrays 20% as test and 80% as train
X_KNN, y_KNN = prepare_split_KNN()
X_KNN_train, X_KNN_test, y_KNN_train, y_KNN_test = train_test_split(X_KNN, y_KNN, test_size = 0.2)

In [174]:
#calls KNeighborsClassifier function with n_neighbors parameter as 1 and fit to X_train and y_train
neigh = KNeighborsClassifier(n_neighbors= 1)
neigh.fit(X_KNN_train, y_KNN_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [None]:
#finding the predict result for each element of X_test
predict = []

for i in X_KNN_test:
    temp = neigh.kneighbors(i.reshape(-1, len(i)))[1]
    temp = temp.reshape(-1)
    predict.append(X_KNN_train[temp[0]])

In [184]:
predict = pd.DataFrame(predict)
predict.to_csv("knn_predict.csv", index = False)

In [32]:
knn_predict = pd.read_csv("knn_predict.csv")

In [33]:
knn_predict

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,124593.0,136145.0,114846.0,136272.0,43385.0,24422.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,61458.0,65147.0,58464.0,103121.0,67662.0,134459.0,83684.0,147872.0,80653.0,143656.0,...,135478.0,54919.0,21759.0,89000.0,58546.0,137805.0,15632.0,106901.0,0.0,0.0
2,20787.0,15947.0,139718.0,37600.0,38028.0,116657.0,33687.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,93333.0,122065.0,59533.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,94947.0,55246.0,3649.0,3159.0,13541.0,111845.0,22418.0,97195.0,85270.0,105451.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145201,51622.0,48890.0,135168.0,13673.0,15663.0,82282.0,139044.0,14903.0,103163.0,92526.0,...,22018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
145202,78031.0,38663.0,110273.0,82977.0,3429.0,134067.0,35212.0,101812.0,64816.0,133461.0,...,101909.0,131868.0,54608.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
145203,1544.0,28109.0,81119.0,63760.0,23870.0,122004.0,116243.0,46668.0,145802.0,87990.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
145204,111479.0,63224.0,47069.0,76671.0,130948.0,150152.0,63512.0,104844.0,123354.0,89815.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# EVALUATION

## MRR CALCULATION

In [None]:
#MRR navigates each row of the predictions of true_items' elements and finds id equality, 
#it takes that row position into rank.

#if it cannot find an equal id according to the metric we added, 
#when 125 (according to 20% error) and more properties find equality, 
#it takes its row position into rank.

def mrr(true_items, predictions):
  
    rank = 0.0
    count = 0
    for i in range(len(true_items)):
        predictions[i] = np.array(predictions[i])
        
        for j in range(len(predictions[i])):
 
            if(int(predictions[i][j]) == int(true_items[i])):
                count += 1
                rank += 1/(j+1)
                break
                
            else:
                result = (properties_df.loc[properties_df['item_id'] == index_to_id[int(predictions[i][j])]].values == properties_df.loc[properties_df['item_id'] == index_to_id[int(true_items[i])]].values)
                num = result.reshape(-1).tolist().count(True)
            
                if num > 125:
                    count += 1
                    rank += 1/(j+1)
                    break                 
                  
    print("count:",count)
    print("rank:",rank)
    mrr = rank / len(true_items)
    return mrr

#### MRR RESULT FOR LSTM

In [160]:
print(mrr(y_test, lstm_predict.values.tolist()))

count: 98696
rank: 42516.19599534056
0.2927991680463656


#### MRR RESULT FOR KNN

In [19]:
print(mrr(y_KNN_test, knn_predict.values.tolist()))

count: 79060
rank: 38862.040088987866
0.2676338449443402


## MSE CALCULATION

In [None]:
#similarity rate between the list of properties of 2 hotels
def properties_ratio(true_items, data, pred):
    
    final = []
    for j in range(len(true_items)):
        count = 0
        for i in range(1, len(data.columns)):
            if (list(data.loc[true_items[j]])[i:i+1][0] != list(data.loc[pred[j][0]])[i:i+1][0]): 
                count += 1

        final.append((158-count)/158)
    return final

#### MSE RESULT FOR LSTM

In [None]:
lstm_pred = lstm_predict['0']
lstm_pred = [float(i) for i in lstm_pred]
lstm_pred = np.array(lstm_pred).reshape(-1, 1)
lstm_test = [1.0] * len(lstm_pred['0'])

In [142]:
mean_squared_error(lstm_test, properties_ratio(y_test, properties_df, lstm_pred))

0.07676015461079432

#### MSE RESULT FOR KNN

In [None]:
knn_pred = knn_predict['0']
knn_pred = [float(i) for i in knn_pred]
knn_pred = np.array(knn_pred).reshape(-1, 1)
knn_test = [1.0] * len(knn_pred['0'])

In [199]:
mean_squared_error(knn_test, properties_ratio(y_KNN_test, properties_df, knn_pred))

0.055124941393782885