In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Embedding, Dense, Flatten, Input, Concatenate, GlobalAveragePooling1D, Normalization
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras import backend as K

In [2]:
# Load data
whole_data = pd.read_excel('data/whole_data_cleaned.xlsx')
whole_data.drop(["website","place_links","description","territory_id.1"],axis=1,inplace=True)


In [3]:
# Preprocess whole_data: Convert 'tags' from comma-separated strings to lists of integers
def safe_int_convert(tag_list):
    if isinstance(tag_list, list):
        return tag_list
    elif isinstance(tag_list, (int, float)):
        return [int(tag_list)]
    elif isinstance(tag_list, str):
        try:
            return list(map(int, tag_list.split(',')))
        except ValueError:
            return []
    return []

whole_data['tags'] = whole_data['tags'].apply(safe_int_convert)

In [4]:
#Convert rating to float and handle any missing data
whole_data['rating'] = whole_data['rating'].astype(float).fillna(whole_data['rating'].mean())


In [5]:
# Pad the sequences for tags
tags_padded = pad_sequences(whole_data['tags'], padding='post')
tags_padded

array([[ 1,  9, 14, ...,  0,  0,  0],
       [ 1,  3,  6, ...,  0,  0,  0],
       [13, 14, 15, ...,  0,  0,  0],
       ...,
       [ 1,  2,  3, ...,  0,  0,  0],
       [ 1,  2,  3, ...,  9, 17,  0],
       [ 1,  2,  3, ...,  0,  0,  0]])

In [6]:
whole_data['tags'] = list(tags_padded)
whole_data

Unnamed: 0,name,rating,territory_id,tags,locationYX
0,Kadikoy Ferry Terminal,4.6,2,"[1, 9, 14, 0, 0, 0, 0]","40.99269778351916, 29.023280555674663"
1,Kadikoy Bull Statue,4.5,2,"[1, 3, 6, 0, 0, 0, 0]","40.990473264783475, 29.029131932189433"
2,Kadikoy-moda Streets,4.6,2,"[13, 14, 15, 0, 0, 0, 0]","41.0300084184215, 28.98441527977153"
3,Moda Beach Park,4.7,2,"[9, 11, 12, 0, 0, 0, 0]","40.98000940235465, 29.026848556608424"
4,IDEA Kadikoy,4.6,2,"[9, 14, 16, 0, 0, 0, 0]","40.98015752580784, 29.02810950338273"
...,...,...,...,...,...
183,Palace of the Porphyrogenitus,4.5,3,"[2, 3, 4, 7, 0, 0, 0]","41.03788124723142, 28.93945445055248"
184,Column of Marcian,4.3,3,"[1, 2, 3, 7, 0, 0, 0]","41.02630809319681, 28.95342213409043"
185,Beyazıt Tower,4.5,3,"[1, 2, 3, 7, 0, 0, 0]","41.01817417223928, 28.96347312671263"
186,Grand Bazaar,4.1,3,"[1, 2, 3, 7, 9, 17, 0]","41.014800403046365, 28.967378072061717"


In [7]:
# Extract unique tags
tags_flat = [tag for sublist in whole_data['tags'].tolist() for tag in sublist]
unique_tags = np.unique(tags_flat)
unique_tags

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22])

In [8]:
# Define the model
def create_model(unique_tags, embedding_dim=64, dense_units=128):
    # Tag embedding
    tag_input = Input(shape=(None,), dtype=tf.int32, name='tags')
    tag_lookup = tf.keras.layers.IntegerLookup(vocabulary=unique_tags, mask_token=None)
    tag_embedding = Embedding(input_dim=len(unique_tags) + 1, output_dim=embedding_dim)
    tag_embeddings = tag_embedding(tag_lookup(tag_input))
    tag_embeddings = GlobalAveragePooling1D()(tag_embeddings)
    
    # Normalize rating
    rating_input = Input(shape=(1,), dtype=tf.float32, name='rating')
    rating_normalization = Normalization(axis=None)
    
    # Manually specify input shape
    rating_normalization.build((None, 1))
    
    rating_normalized = rating_normalization(rating_input)
    
    # Concatenate all features
    combined_embeddings = Concatenate()([
        tag_embeddings,
        rating_normalized
    ])
    
    dense = Dense(dense_units, activation='relu')(combined_embeddings)
    output = Dense(embedding_dim)(dense)
    
    model = Model(inputs=[tag_input, rating_input], outputs=output)
    return model

# Adjusted hyperparameters
model = create_model(unique_tags, embedding_dim=64, dense_units=128)


In [9]:
def triplet_loss(anchor, positive, negative, margin=1.0):
    # Compute the distance between the anchor and the positive
    positive_distance = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
    
    # Compute the distance between the anchor and the negative
    negative_distance = tf.reduce_sum(tf.square(anchor - negative), axis=-1)
    
    # Compute the triplet loss
    loss = tf.maximum(positive_distance - negative_distance + margin, 0.0)
    
    return tf.reduce_mean(loss)

In [10]:
# Create a dataset for triplet loss training
def generate_triplets(dataframe):
    anchor_features = {
        'tags': [],
        'rating': []
    }
    positive_features = {
        'tags': [],
        'rating': []
    }
    negative_features = {
        'tags': [],
        'rating': []
    }
    
    for _, row in dataframe.iterrows():
        anchor_features['tags'].append(row['tags'])
        anchor_features['rating'].append(row['rating'])
        
        positive_idx = np.random.choice(dataframe.index)
        negative_idx = np.random.choice(dataframe.index)
        
        positive_row = dataframe.loc[positive_idx]
        negative_row = dataframe.loc[negative_idx]
        
        positive_features['tags'].append(positive_row['tags'])
        positive_features['rating'].append(positive_row['rating'])
        
        negative_features['tags'].append(negative_row['tags'])
        negative_features['rating'].append(negative_row['rating'])
    
    return (
        {k: pad_sequences(v, padding='post') if k == 'tags' else np.array(v) for k, v in anchor_features.items()},
        {k: pad_sequences(v, padding='post') if k == 'tags' else np.array(v) for k, v in positive_features.items()},
        {k: pad_sequences(v, padding='post') if k == 'tags' else np.array(v) for k, v in negative_features.items()}
    )

anchor_features, positive_features, negative_features = generate_triplets(whole_data)

In [11]:
def triplet_generator(anchor_features, positive_features, negative_features, batch_size=32):
    while True:
        indices = np.arange(len(anchor_features['tags']))
        np.random.shuffle(indices)
        
        for start in range(0, len(anchor_features['tags']), batch_size):
            end = start + batch_size
            batch_indices = indices[start:end]
            
            anchor_batch = {k: v[batch_indices] for k, v in anchor_features.items()}
            positive_batch = {k: v[batch_indices] for k, v in positive_features.items()}
            negative_batch = {k: v[batch_indices] for k, v in negative_features.items()}
            
            yield (anchor_batch, positive_batch, negative_batch)

triplet_gen = triplet_generator(anchor_features, positive_features, negative_features, batch_size=32)


In [12]:
# Compile the model with custom training loop
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

@tf.function
def train_step(anchor_batch, positive_batch, negative_batch):
    with tf.GradientTape() as tape:
        anchor_embeddings = model(anchor_batch, training=True)
        positive_embeddings = model(positive_batch, training=True)
        negative_embeddings = model(negative_batch, training=True)
        
        loss = triplet_loss(anchor_embeddings, positive_embeddings, negative_embeddings)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

In [14]:
# Training loop
epochs = 20
steps_per_epoch = len(anchor_features['tags']) 

for epoch in range(epochs):
    for step in range(steps_per_epoch):
        anchor_batch, positive_batch, negative_batch = next(triplet_gen)
        loss = train_step(anchor_batch, positive_batch, negative_batch)
        
        if step % 100 == 0:
            print(f"Epoch {epoch + 1}, Step {step}, Loss: {loss.numpy()}")

Epoch 1, Step 0, Loss: 0.9952486157417297
Epoch 1, Step 100, Loss: 0.9736141562461853
Epoch 2, Step 0, Loss: 0.6865629553794861
Epoch 2, Step 100, Loss: 0.697481095790863
Epoch 3, Step 0, Loss: 0.6584605574607849
Epoch 3, Step 100, Loss: 0.4703589677810669
Epoch 4, Step 0, Loss: 0.48223477602005005
Epoch 4, Step 100, Loss: 0.5489470958709717
Epoch 5, Step 0, Loss: 0.6833053231239319
Epoch 5, Step 100, Loss: 0.35163408517837524
Epoch 6, Step 0, Loss: 0.5739628672599792
Epoch 6, Step 100, Loss: 0.7218126654624939
Epoch 7, Step 0, Loss: 0.41418397426605225
Epoch 7, Step 100, Loss: 0.35457807779312134
Epoch 8, Step 0, Loss: 0.4765431880950928
Epoch 8, Step 100, Loss: 0.4856039881706238
Epoch 9, Step 0, Loss: 0.3882622718811035
Epoch 9, Step 100, Loss: 0.37298956513404846
Epoch 10, Step 0, Loss: 0.3746669590473175
Epoch 10, Step 100, Loss: 0.49482762813568115
Epoch 11, Step 0, Loss: 0.37833356857299805
Epoch 11, Step 100, Loss: 0.26471588015556335
Epoch 12, Step 0, Loss: 0.521055281162262
E

In [15]:
# Save the trained model
model.save('triplet_model.h5')



In [16]:
# Generate embeddings for all locations
features = {
    'tags': np.array(pad_sequences(whole_data['tags'], padding='post')),
    'rating': np.array(whole_data['rating'])
}
location_embeddings = model.predict(features)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


In [28]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Embedding, Dense, Flatten, Input, Concatenate, GlobalAveragePooling1D, Normalization
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras import backend as K

# Load data
whole_data = pd.read_excel('data/whole_data_cleaned.xlsx')
whole_data.drop(["website","place_links","description"],axis=1,inplace=True)

# Preprocess whole_data: Convert 'tags' from comma-separated strings to lists of integers
def safe_int_convert(tag_list):
    if isinstance(tag_list, list):
        return tag_list
    elif isinstance(tag_list, (int, float)):
        return [int(tag_list)]
    elif isinstance(tag_list, str):
        try:
            return list(map(int, tag_list.split(',')))
        except ValueError:
            return []
    return []

whole_data['tags'] = whole_data['tags'].apply(safe_int_convert)

# Convert rating to float and handle any missing data
whole_data['rating'] = whole_data['rating'].astype(float).fillna(whole_data['rating'].mean())

# Drop unnecessary columns
whole_data.drop('territory_id', axis=1, inplace=True)

# Pad the sequences for tags
tags_padded = pad_sequences(whole_data['tags'], padding='post')
whole_data['tags'] = list(tags_padded)
# Extract unique tags
tags_flat = [tag for sublist in whole_data['tags'].tolist() for tag in sublist]
unique_tags = np.unique(tags_flat)

# Define the model
def create_model(unique_tags, embedding_dim=64, dense_units=128):
    # Tag embedding
    tag_input = Input(shape=(None,), dtype=tf.int32, name='tags')
    tag_lookup = tf.keras.layers.IntegerLookup(vocabulary=unique_tags, mask_token=None)
    tag_embedding = Embedding(input_dim=len(unique_tags) + 1, output_dim=embedding_dim)
    tag_embeddings = tag_embedding(tag_lookup(tag_input))
    tag_embeddings = GlobalAveragePooling1D()(tag_embeddings)
    
    # Normalize rating
    rating_input = Input(shape=(1,), dtype=tf.float32, name='rating')
    rating_normalization = Normalization(axis=None)
    
    # Manually specify input shape
    rating_normalization.build((None, 1))
    
    rating_normalized = rating_normalization(rating_input)
    
    # Concatenate all features
    combined_embeddings = Concatenate()([
        tag_embeddings,
        rating_normalized
    ])
    
    dense = Dense(dense_units, activation='relu')(combined_embeddings)
    output = Dense(embedding_dim)(dense)
    
    model = Model(inputs=[tag_input, rating_input], outputs=output)
    return model

# Adjusted hyperparameters
model = create_model(unique_tags, embedding_dim=64, dense_units=128)

def triplet_loss(anchor, positive, negative, margin=1.0):
    # Compute the distance between the anchor and the positive
    positive_distance = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
    
    # Compute the distance between the anchor and the negative
    negative_distance = tf.reduce_sum(tf.square(anchor - negative), axis=-1)
    
    # Compute the triplet loss
    loss = tf.maximum(positive_distance - negative_distance + margin, 0.0)
    
    return tf.reduce_mean(loss)
# Create a dataset for triplet loss training
def generate_triplets(dataframe):
    anchor_features = {
        'tags': [],
        'rating': []
    }
    positive_features = {
        'tags': [],
        'rating': []
    }
    negative_features = {
        'tags': [],
        'rating': []
    }
    
    for _, row in dataframe.iterrows():
        anchor_features['tags'].append(row['tags'])
        anchor_features['rating'].append(row['rating'])
        
        positive_idx = np.random.choice(dataframe.index)
        negative_idx = np.random.choice(dataframe.index)
        
        positive_row = dataframe.loc[positive_idx]
        negative_row = dataframe.loc[negative_idx]
        
        positive_features['tags'].append(positive_row['tags'])
        positive_features['rating'].append(positive_row['rating'])
        
        negative_features['tags'].append(negative_row['tags'])
        negative_features['rating'].append(negative_row['rating'])
    
    return (
        {k: pad_sequences(v, padding='post') if k == 'tags' else np.array(v) for k, v in anchor_features.items()},
        {k: pad_sequences(v, padding='post') if k == 'tags' else np.array(v) for k, v in positive_features.items()},
        {k: pad_sequences(v, padding='post') if k == 'tags' else np.array(v) for k, v in negative_features.items()}
    )

anchor_features, positive_features, negative_features = generate_triplets(whole_data)

def triplet_generator(anchor_features, positive_features, negative_features, batch_size=32):
    while True:
        indices = np.arange(len(anchor_features['tags']))
        np.random.shuffle(indices)
        
        for start in range(0, len(anchor_features['tags']), batch_size):
            end = start + batch_size
            batch_indices = indices[start:end]
            
            anchor_batch = {k: v[batch_indices] for k, v in anchor_features.items()}
            positive_batch = {k: v[batch_indices] for k, v in positive_features.items()}
            negative_batch = {k: v[batch_indices] for k, v in negative_features.items()}
            
            yield (anchor_batch, positive_batch, negative_batch)

triplet_gen = triplet_generator(anchor_features, positive_features, negative_features)

# Compile the model with custom training loop
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

@tf.function
def train_step(anchor_batch, positive_batch, negative_batch):
    with tf.GradientTape() as tape:
        anchor_embeddings = model(anchor_batch, training=True)
        positive_embeddings = model(positive_batch, training=True)
        negative_embeddings = model(negative_batch, training=True)
        
        loss = triplet_loss(anchor_embeddings, positive_embeddings, negative_embeddings)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Training loop
epochs = 50
steps_per_epoch = len(anchor_features['tags'])

for epoch in range(epochs):
    for step in range(steps_per_epoch):
        anchor_batch, positive_batch, negative_batch = next(triplet_gen)
        loss = train_step(anchor_batch, positive_batch, negative_batch)
        
        if step % 100 == 0:
            print(f"Epoch {epoch + 1}, Step {step}, Loss: {loss.numpy()}")
            
# Save the trained model
model.save('triplet_model.h5')

# Generate embeddings for all locations
features = {
    'tags': np.array(pad_sequences(whole_data['tags'], padding='post')),
    'rating': np.array(whole_data['rating'])
}
location_embeddings = model.predict(features)

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(location_embeddings)

# Function to generate recommendations for a specific place
def recommend(input_features, k=20):
    input_dict = {
        'tags': tf.convert_to_tensor([input_features['tags']], dtype=tf.int32),
        'rating': tf.convert_to_tensor([input_features['rating']], dtype=tf.float32)
    }
    
    # Generate the query embedding
    query_embedding = model.predict(input_dict)
    
    # Compute cosine similarity between the query embedding and all location embeddings
    similarities = cosine_similarity(query_embedding, location_embeddings)
    
    # Get the top-k most similar locations
    top_k_indices = similarities[0].argsort()[-k:][::-1]
    
    return top_k_indices

# Example usage
input_features = {
    'tags': [5],  # Example tags
    'rating': 4.8
}

recommendations = recommend(input_features, k=20)
print("Recommended items:", recommendations)

# Fetching the items from the dataset
recommended_items = whole_data.iloc[recommendations]
#fetching the items from the dataset
whole_data.iloc[recommendations]

Epoch 1, Step 0, Loss: 0.9937557578086853
Epoch 1, Step 100, Loss: 0.7765480279922485
Epoch 2, Step 0, Loss: 0.5640192627906799
Epoch 2, Step 100, Loss: 0.6643266677856445
Epoch 3, Step 0, Loss: 0.8852274417877197
Epoch 3, Step 100, Loss: 0.6527173519134521
Epoch 4, Step 0, Loss: 0.51887047290802
Epoch 4, Step 100, Loss: 0.5849913954734802
Epoch 5, Step 0, Loss: 0.5407980680465698
Epoch 5, Step 100, Loss: 0.5340427160263062
Epoch 6, Step 0, Loss: 0.15049847960472107
Epoch 6, Step 100, Loss: 0.45789289474487305
Epoch 7, Step 0, Loss: 0.295703649520874
Epoch 7, Step 100, Loss: 0.45842573046684265
Epoch 8, Step 0, Loss: 0.4715948700904846
Epoch 8, Step 100, Loss: 0.4094574451446533
Epoch 9, Step 0, Loss: 0.46723440289497375
Epoch 9, Step 100, Loss: 0.16244332492351532
Epoch 10, Step 0, Loss: 0.34980344772338867
Epoch 10, Step 100, Loss: 0.4084891378879547
Epoch 11, Step 0, Loss: 0.5343102216720581
Epoch 11, Step 100, Loss: 0.507289469242096
Epoch 12, Step 0, Loss: 0.6919879913330078
Epoch



Epoch 50, Step 100, Loss: 0.07682393491268158
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
Recommended items: [ 66  92 103  79 148 101 115 143  17 114  53  24   4 166 174 150   6  56
 137  85]


Unnamed: 0,name,rating,tags,territory_id.1,locationYX
66,Emaar Aquarium and Underwater Zoo,4.3,"[18, 20, 21, 0, 0, 0, 0]",1,"41.00262778665512, 29.072372921916998"
92,Besiktas Stadium,4.7,"[18, 19, 20, 0, 0, 0, 0]",0,"41.039995299764044, 28.994412698597092"
103,Galatasaray Museum,4.6,"[18, 19, 20, 0, 0, 0, 0]",0,"41.10354479923963, 28.990814351439013"
79,Besiktas JK Museum,4.8,"[4, 19, 0, 0, 0, 0, 0]",0,"41.03882956368591, 28.99520725767128"
148,The Ferhad Pasha Tomb,4.8,"[3, 5, 7, 0, 0, 0, 0]",3,"41.05279175499929, 28.936219464892545"
101,Galata Mevlevihane Museum,4.6,"[2, 3, 4, 5, 0, 0, 0]",0,"41.02854814780152, 28.97444783302122"
115,Huseyin Aga Tomb,4.6,"[3, 4, 5, 0, 0, 0, 0]",0,"41.03568770893633, 28.980668084071656"
143,Tomb of Mehmed the Conqueror,4.7,"[3, 5, 7, 0, 0, 0, 0]",3,"41.02447391739445, 28.951672208895168"
17,Moda Sea Club,4.8,"[13, 16, 18, 0, 0, 0, 0]",2,"40.97906035376859, 29.023443808638966"
114,French Street Cultural Center,4.2,"[2, 13, 14, 15, 16, 0, 0]",0,"41.03218963326699, 28.979591696647134"


In [30]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Embedding, Dense, Flatten, Input, Concatenate, GlobalAveragePooling1D, Normalization
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity

def load_and_preprocess_data(filepath):
    # Load data
    whole_data = pd.read_excel(filepath)
    whole_data.drop(["website", "place_links", "description", "territory_id"], axis=1, inplace=True)

    # Preprocess whole_data: Convert 'tags' from comma-separated strings to lists of integers
    def safe_int_convert(tag_list):
        if isinstance(tag_list, list):
            return tag_list
        elif isinstance(tag_list, (int, float)):
            return [int(tag_list)]
        elif isinstance(tag_list, str):
            try:
                return list(map(int, tag_list.split(',')))
            except ValueError:
                return []
        return []

    whole_data['tags'] = whole_data['tags'].apply(safe_int_convert)

    # Convert rating to float and handle any missing data
    whole_data['rating'] = whole_data['rating'].astype(float).fillna(whole_data['rating'].mean())

    # Pad the sequences for tags
    tags_padded = pad_sequences(whole_data['tags'], padding='post')
    whole_data['tags'] = list(tags_padded)
    
    # Extract unique tags
    tags_flat = [tag for sublist in whole_data['tags'].tolist() for tag in sublist]
    unique_tags = np.unique(tags_flat)
    
    return whole_data, unique_tags

def create_model(unique_tags, embedding_dim=64, dense_units=128):
    # Tag embedding
    tag_input = Input(shape=(None,), dtype=tf.int32, name='tags')
    tag_lookup = tf.keras.layers.IntegerLookup(vocabulary=unique_tags, mask_token=None)
    tag_embedding = Embedding(input_dim=len(unique_tags) + 1, output_dim=embedding_dim)
    tag_embeddings = tag_embedding(tag_lookup(tag_input))
    tag_embeddings = GlobalAveragePooling1D()(tag_embeddings)
    
    # Normalize rating
    rating_input = Input(shape=(1,), dtype=tf.float32, name='rating')
    rating_normalization = Normalization(axis=None)
    
    # Manually specify input shape
    rating_normalization.build((None, 1))
    
    rating_normalized = rating_normalization(rating_input)
    
    # Concatenate all features
    combined_embeddings = Concatenate()([
        tag_embeddings,
        rating_normalized
    ])
    
    dense = Dense(dense_units, activation='relu')(combined_embeddings)
    output = Dense(embedding_dim)(dense)
    
    model = Model(inputs=[tag_input, rating_input], outputs=output)
    return model

def triplet_loss(y_true, y_pred):
    anchor, positive, negative = y_pred[:, 0], y_pred[:, 1], y_pred[:, 2]
    positive_distance = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
    negative_distance = tf.reduce_sum(tf.square(anchor - negative), axis=-1)
    loss = tf.maximum(positive_distance - negative_distance + 1.0, 0.0)
    return tf.reduce_mean(loss)

def generate_triplets(dataframe):
    anchor_features = {'tags': [], 'rating': []}
    positive_features = {'tags': [], 'rating': []}
    negative_features = {'tags': [], 'rating': []}
    
    for _, row in dataframe.iterrows():
        anchor_features['tags'].append(row['tags'])
        anchor_features['rating'].append(row['rating'])
        
        positive_idx = np.random.choice(dataframe.index)
        negative_idx = np.random.choice(dataframe.index)
        
        positive_row = dataframe.loc[positive_idx]
        negative_row = dataframe.loc[negative_idx]
        
        positive_features['tags'].append(positive_row['tags'])
        positive_features['rating'].append(positive_row['rating'])
        negative_features['tags'].append(negative_row['tags'])
        negative_features['rating'].append(negative_row['rating'])
    
    return (
        {k: pad_sequences(v, padding='post') if k == 'tags' else np.array(v) for k, v in anchor_features.items()},
        {k: pad_sequences(v, padding='post') if k == 'tags' else np.array(v) for k, v in positive_features.items()},
        {k: pad_sequences(v, padding='post') if k == 'tags' else np.array(v) for k, v in negative_features.items()}
    )

def triplet_generator(anchor_features, positive_features, negative_features, batch_size=32):
    while True:
        indices = np.arange(len(anchor_features['tags']))
        np.random.shuffle(indices)
        
        for start in range(0, len(anchor_features['tags']), batch_size):
            end = start + batch_size
            batch_indices = indices[start:end]
            
            anchor_batch = {k: v[batch_indices] for k, v in anchor_features.items()}
            positive_batch = {k: v[batch_indices] for k, v in positive_features.items()}
            negative_batch = {k: v[batch_indices] for k, v in negative_features.items()}
            
            yield (anchor_batch, positive_batch, negative_batch)

def train_model(model, anchor_features, positive_features, negative_features, batch_size=32, epochs=20, learning_rate=0.001):
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    @tf.function
    def train_step(anchor_batch, positive_batch, negative_batch):
        with tf.GradientTape() as tape:
            anchor_embeddings = model(anchor_batch, training=True)
            positive_embeddings = model(positive_batch, training=True)
            negative_embeddings = model(negative_batch, training=True)
            
            loss = triplet_loss(None, tf.stack([anchor_embeddings, positive_embeddings, negative_embeddings], axis=1))
        
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return loss

    triplet_gen = triplet_generator(anchor_features, positive_features, negative_features, batch_size=batch_size)
    steps_per_epoch = len(anchor_features['tags']) // batch_size

    for epoch in range(epochs):
        for step in range(steps_per_epoch):
            anchor_batch, positive_batch, negative_batch = next(triplet_gen)
            loss = train_step(anchor_batch, positive_batch, negative_batch)
            
            if step % 100 == 0:
                print(f"Epoch {epoch + 1}, Step {step}, Loss: {loss.numpy()}")

    return model

def save_model(model, filepath):
    model.save(filepath)

def load_model(filepath):
    return tf.keras.models.load_model(filepath, custom_objects={'triplet_loss': triplet_loss})

def compute_embeddings(model, dataframe):
    features = {
        'tags': np.array(pad_sequences(dataframe['tags'], padding='post')),
        'rating': np.array(dataframe['rating'])
    }
    return model.predict(features)

def cosine_sim_matrix(embeddings):
    return cosine_similarity(embeddings)

def recommend(model, input_features, location_embeddings, k=20):
    input_dict = {
        'tags': tf.convert_to_tensor([input_features['tags']], dtype=tf.int32),
        'rating': tf.convert_to_tensor([input_features['rating']], dtype=tf.float32)
    }
    
    query_embedding = model.predict(input_dict)
    similarities = cosine_similarity(query_embedding, location_embeddings)
    top_k_indices = similarities[0].argsort()[-k:][::-1]
    
    return top_k_indices

# Main functions for training and prediction
def train(filepath, model_save_path, embedding_dim=64, dense_units=128, batch_size=32, epochs=20, learning_rate=0.001):
    whole_data, unique_tags = load_and_preprocess_data(filepath)
    model = create_model(unique_tags, embedding_dim, dense_units)
    anchor_features, positive_features, negative_features = generate_triplets(whole_data)
    trained_model = train_model(model, anchor_features, positive_features, negative_features, batch_size, epochs, learning_rate)
    save_model(trained_model, model_save_path)
    return trained_model

def predict(model_save_path, input_features, k=20):
    model = load_model(model_save_path)
    whole_data, _ = load_and_preprocess_data('data/whole_data_cleaned.xlsx')
    location_embeddings = compute_embeddings(model, whole_data)
    recommendations = recommend(model, input_features, location_embeddings, k)
    recommended_items = whole_data.iloc[recommendations]
    return recommended_items

# Training
trained_model = train('data/whole_data_cleaned.xlsx', 'triplet_model.keras', embedding_dim=64, dense_units=128, batch_size=32, epochs=20, learning_rate=0.001)

# Prediction
input_features = {
    'tags': [5],  # Example tags
    'rating': 4.0
}
recommendations = predict('triplet_model.keras', input_features, k=20)
recommendations


Epoch 1, Step 0, Loss: 1.0015225410461426
Epoch 2, Step 0, Loss: 1.0023534297943115
Epoch 3, Step 0, Loss: 1.0052435398101807
Epoch 4, Step 0, Loss: 1.0077370405197144
Epoch 5, Step 0, Loss: 0.9994121193885803
Epoch 6, Step 0, Loss: 0.9862458109855652
Epoch 7, Step 0, Loss: 0.949332058429718
Epoch 8, Step 0, Loss: 0.8920482993125916
Epoch 9, Step 0, Loss: 0.8621934652328491
Epoch 10, Step 0, Loss: 0.822844386100769
Epoch 11, Step 0, Loss: 0.8253400325775146
Epoch 12, Step 0, Loss: 0.8771654367446899
Epoch 13, Step 0, Loss: 0.889432966709137
Epoch 14, Step 0, Loss: 0.8793674111366272
Epoch 15, Step 0, Loss: 0.844650149345398
Epoch 16, Step 0, Loss: 0.9238991141319275
Epoch 17, Step 0, Loss: 0.7589319944381714
Epoch 18, Step 0, Loss: 1.078138828277588
Epoch 19, Step 0, Loss: 0.9903103709220886
Epoch 20, Step 0, Loss: 0.7637349367141724
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step


Unnamed: 0,name,rating,tags,territory_id.1,locationYX
58,Historical Plane Tree of Çengelköy,4.3,"[1, 2, 3, 12, 13, 14, 15]",1,"41.05022883927636, 29.05278206286947"
158,Topkapı Palace,4.7,"[1, 2, 3, 4, 6, 7, 9]",3,"41.01718931703204, 28.984405485753218"
159,Sultanahmet Square,4.6,"[1, 2, 3, 12, 13, 14, 15]",3,"41.00640399569341, 28.976145898680326"
186,Grand Bazaar,4.1,"[1, 2, 3, 7, 9, 17, 0]",3,"41.014800403046365, 28.967378072061717"
82,Dolmabahce Palace,4.7,"[2, 3, 4, 6, 7, 9, 0]",0,"41.039330192801955, 29.000513044178206"
131,Artisans Park,4.4,"[1, 3, 6, 11, 12, 0, 0]",3,"41.033810678476115, 28.982959818457207"
156,Basilica Cistern,4.9,"[1, 2, 3, 4, 7, 9, 0]",3,"41.0135359792602, 28.977187971940108"
170,Great Palace Mosaics Museum,4.4,"[2, 3, 4, 6, 7, 0, 0]",3,"41.004434742520054, 28.977408242329407"
162,Istanbul Archaeology Museum,4.6,"[2, 3, 4, 6, 7, 0, 0]",3,"41.005911914447026, 28.97541304225422"
166,Turkish and Islamic Arts Museum,4.6,"[2, 3, 4, 5, 6, 0, 0]",3,"41.01085761893604, 28.975403362743883"


# Collabrative Filtering

In [27]:
user_df = pd.read_excel('data\generated_user_data.xlsx')
user_df

  user_df = pd.read_excel('data\generated_user_data.xlsx')


Unnamed: 0,user_id,user_gender,place_id,continent,score,user_age
0,1,Male,563837,2,1,54
1,1,Male,776089,2,0,54
2,1,Male,207024,2,1,54
3,1,Male,711495,2,1,54
4,1,Male,463612,2,1,54
...,...,...,...,...,...,...
9995,1000,Male,871687,1,1,51
9996,1000,Male,950867,1,1,51
9997,1000,Male,761356,1,1,51
9998,1000,Male,723599,1,1,51


In [28]:
# Load user data
user_data = pd.read_excel('data\generated_user_data.xlsx')
user_data

  user_data = pd.read_excel('data\generated_user_data.xlsx')


Unnamed: 0,user_id,user_gender,place_id,continent,score,user_age
0,1,Male,563837,2,1,54
1,1,Male,776089,2,0,54
2,1,Male,207024,2,1,54
3,1,Male,711495,2,1,54
4,1,Male,463612,2,1,54
...,...,...,...,...,...,...
9995,1000,Male,871687,1,1,51
9996,1000,Male,950867,1,1,51
9997,1000,Male,761356,1,1,51
9998,1000,Male,723599,1,1,51


In [29]:
user_data[user_data['user_id'] == 1]

Unnamed: 0,user_id,user_gender,place_id,continent,score,user_age
0,1,Male,563837,2,1,54
1,1,Male,776089,2,0,54
2,1,Male,207024,2,1,54
3,1,Male,711495,2,1,54
4,1,Male,463612,2,1,54
5,1,Male,733989,2,1,54
6,1,Male,375467,2,1,54
7,1,Male,992964,2,1,54
8,1,Male,342528,2,1,54
9,1,Male,459627,2,1,54


In [30]:
# Preprocess user_data
user_data['user_id'] = user_data.index

# Extract unique user_ids and place_ids
unique_user_ids = user_data['user_id'].unique()
unique_place_ids = user_data['place_id'].unique()

# Create mappings for user_ids and place_ids
user_id_to_index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
place_id_to_index = {place_id: index for index, place_id in enumerate(unique_place_ids)}

# Map user_ids and place_ids to indices
user_data['user_index'] = user_data['user_id'].map(user_id_to_index)
user_data['place_index'] = user_data['place_id'].map(place_id_to_index)

In [31]:
user_data

Unnamed: 0,user_id,user_gender,place_id,continent,score,user_age,user_index,place_index
0,0,Male,563837,2,1,54,0,0
1,1,Male,776089,2,0,54,1,1
2,2,Male,207024,2,1,54,2,2
3,3,Male,711495,2,1,54,3,3
4,4,Male,463612,2,1,54,4,4
...,...,...,...,...,...,...,...,...
9995,9995,Male,871687,1,1,51,9995,72
9996,9996,Male,950867,1,1,51,9996,85
9997,9997,Male,761356,1,1,51,9997,64
9998,9998,Male,723599,1,1,51,9998,101


In [32]:
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten
from tensorflow.keras.models import Model

def create_collaborative_filtering_model(num_users, num_places, embedding_dim=50):
    # User input
    user_input = Input(shape=(1,), name='user_input')
    user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
    user_embedding = Flatten()(user_embedding)
    
    # Place input
    place_input = Input(shape=(1,), name='place_input')
    place_embedding = Embedding(input_dim=num_places, output_dim=embedding_dim, name='place_embedding')(place_input)
    place_embedding = Flatten()(place_embedding)
    
    # Dot product of user and place embeddings
    dot_product = Dot(axes=1)([user_embedding, place_embedding])
    
    # Add biases
    user_bias = Embedding(input_dim=num_users, output_dim=1, name='user_bias')(user_input)
    user_bias = Flatten()(user_bias)
    
    place_bias = Embedding(input_dim=num_places, output_dim=1, name='place_bias')(place_input)
    place_bias = Flatten()(place_bias)
    
    # Add all components
    output = Add()([dot_product, user_bias, place_bias])
    
    # Build and compile model
    model = Model(inputs=[user_input, place_input], outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    return model

num_users = len(unique_user_ids)
num_places = len(unique_place_ids)
embedding_dim = 50

collab_model = create_collaborative_filtering_model(num_users, num_places, embedding_dim)


In [33]:
# Prepare training data
user_indices = user_data['user_index'].values
place_indices = user_data['place_index'].values
scores = user_data['score'].values

# Train the model
collab_model.fit(
    x=[user_indices, place_indices],
    y=scores,
    batch_size=32,
    epochs=30,
    validation_split=0.2
)


Epoch 1/30
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.8524 - val_loss: 0.7296
Epoch 2/30
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6538 - val_loss: 0.6004
Epoch 3/30
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.4096 - val_loss: 0.5020
Epoch 4/30
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.1491 - val_loss: 0.4425
Epoch 5/30
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0129 - val_loss: 0.4269
Epoch 6/30
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0019 - val_loss: 0.4258
Epoch 7/30
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0013 - val_loss: 0.4262
Epoch 8/30
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 7.9055e-04 - val_loss: 0.4251
Epoch 9/30
[1m250/250[0m [32m━━━━

<keras.src.callbacks.history.History at 0x2c009676d80>

In [34]:
def recommend_places(user_id, model, place_data, k=10):
    user_index = user_id_to_index[user_id]
    user_vector = np.full((len(place_data),), user_index)
    place_indices = np.arange(len(place_data))
    
    # Predict scores for all places for the given user
    predictions = model.predict([user_vector, place_indices])
    top_k_indices = predictions.flatten().argsort()[-k:][::-1]
    
    # Get the recommended places
    recommended_places = place_data.iloc[top_k_indices]
    return recommended_places

# Example usage
user_id = 0  # Replace with the actual user_id
recommended_places = recommend_places(user_id, collab_model, whole_data, k=10)
print("Recommended places for user:", user_id)
recommended_places






[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Recommended places for user: 0


Unnamed: 0,name,rating,tags,territory_id.1,locationYX
0,Kadikoy Ferry Terminal,4.6,"[1, 9, 14, 0, 0, 0, 0]",2,"40.99269778351916, 29.023280555674663"
154,Zal Mahmud Pasha Mosque,4.7,"[1, 2, 3, 5, 0, 0, 0]",3,"41.05393920722448, 28.931139546558196"
148,The Ferhad Pasha Tomb,4.8,"[3, 5, 7, 0, 0, 0, 0]",3,"41.05279175499929, 28.936219464892545"
21,Ciya Sofrası,4.5,"[13, 0, 0, 0, 0, 0, 0]",2,"40.989294633145974, 29.024399407849852"
152,Chora Mosque,4.4,"[1, 2, 3, 5, 0, 0, 0]",3,"41.03139923940517, 28.939209757670586"
9,Caddebostan Beach,4.5,"[11, 12, 14, 0, 0, 0, 0]",2,"40.962204847303646, 29.06226475095148"
4,IDEA Kadikoy,4.6,"[9, 14, 16, 0, 0, 0, 0]",2,"40.98015752580784, 29.02810950338273"
53,Validebag Grove,4.6,"[10, 11, 12, 13, 0, 0, 0]",1,"41.014218144541246, 29.04711087702171"
37,Omer Faruk Toprak Library,4.6,"[8, 0, 0, 0, 0, 0, 0]",2,"40.96463983879542, 29.094771508668305"
72,Filizler Meatball Restaurant,4.0,"[13, 0, 0, 0, 0, 0, 0]",1,"41.02255261847614, 29.00690662144621"


In [48]:
# Example usage
user_id = 987  # Replace with the actual user_id
recommended_places = recommend_places(user_id, collab_model, whole_data, k=10)
print("Recommended places for user:", user_id)
display(recommended_places)
display(user_data[user_data["user_id"] == 987])

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Recommended places for user: 987


Unnamed: 0,name,rating,tags,territory_id.1,locationYX
24,Sukru Saracoglu Stadium,4.4,"[16, 17, 18, 0, 0, 0, 0]",2,"40.98759526258761, 29.036799104926455"
100,Sadberk Hanım Museum,4.7,"[2, 3, 4, 6, 0, 0, 0]",0,"41.16331366950165, 29.04817019579597"
47,Camlica Hill,4.7,"[9, 10, 11, 12, 13, 0, 0]",1,"41.028286494032216, 29.069149118577506"
73,Museum of Ottoman Women Sultans,4.5,"[2, 3, 4, 0, 0, 0, 0]",1,"41.025289232590495, 29.010161074493688"
69,Kuzguncuk Icadiye Street,4.6,"[12, 13, 14, 15, 17, 0, 0]",1,"41.034480060651845, 29.030833399333115"
63,Historical Kuzguncuk Houses,4.7,"[1, 2, 3, 7, 0, 0, 0]",1,"41.035635359473225, 29.029979104928746"
151,Golden Horn Bridge,4.7,"[1, 3, 7, 9, 0, 0, 0]",3,"41.052983479947436, 28.940533293620383"
34,Sekerci Cafer Erol,4.7,"[14, 0, 0, 0, 0, 0, 0]",2,"40.990696769055376, 29.024770154139464"
93,Besiktas Panayia Greek Orthodox Church,4.0,"[1, 2, 3, 5, 0, 0, 0]",0,"41.04390892586081, 29.00515410237262"
89,Çırağan Palace,4.7,"[1, 2, 3, 4, 7, 0, 0]",0,"41.04387308576512, 29.015929923862068"


Unnamed: 0,user_id,user_gender,place_id,continent,score,user_age,user_index,place_index
987,987,Female,188547,0,1,27,987,24


In [36]:
user_data[user_data["user_id"] == 1231]


Unnamed: 0,user_id,user_gender,place_id,continent,score,user_age,user_index,place_index
1231,1231,Female,207141,4,1,53,1231,150


In [37]:
whole_data[whole_data["place_id"] == 274391]

KeyError: 'place_id'

In [38]:
whole_data

Unnamed: 0,name,rating,tags,territory_id.1,locationYX
0,Kadikoy Ferry Terminal,4.6,"[1, 9, 14, 0, 0, 0, 0]",2,"40.99269778351916, 29.023280555674663"
1,Kadikoy Bull Statue,4.5,"[1, 3, 6, 0, 0, 0, 0]",2,"40.990473264783475, 29.029131932189433"
2,Kadikoy-moda Streets,4.6,"[13, 14, 15, 0, 0, 0, 0]",2,"41.0300084184215, 28.98441527977153"
3,Moda Beach Park,4.7,"[9, 11, 12, 0, 0, 0, 0]",2,"40.98000940235465, 29.026848556608424"
4,IDEA Kadikoy,4.6,"[9, 14, 16, 0, 0, 0, 0]",2,"40.98015752580784, 29.02810950338273"
...,...,...,...,...,...
183,Palace of the Porphyrogenitus,4.5,"[2, 3, 4, 7, 0, 0, 0]",3,"41.03788124723142, 28.93945445055248"
184,Column of Marcian,4.3,"[1, 2, 3, 7, 0, 0, 0]",3,"41.02630809319681, 28.95342213409043"
185,Beyazıt Tower,4.5,"[1, 2, 3, 7, 0, 0, 0]",3,"41.01817417223928, 28.96347312671263"
186,Grand Bazaar,4.1,"[1, 2, 3, 7, 9, 17, 0]",3,"41.014800403046365, 28.967378072061717"


In [40]:
# Example usage
user_id = 1  # Replace with the actual user_id
recommended_places = recommend_places(user_id, collab_model, whole_data, k=10)
print("Recommended places for user:", user_id)
print(recommended_places)

# Checking user_data for user_id = 0
print(user_data[user_data["user_id"] == 1])

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Recommended places for user: 1
                                name  rating                       tags  \
19                       Caffe Cadde     4.8    [13, 14, 0, 0, 0, 0, 0]   
141       Sarachane Archaeology Park     4.3     [2, 3, 6, 11, 0, 0, 0]   
106                  Dogancay Museum     4.4      [2, 4, 6, 0, 0, 0, 0]   
10       Caddebostan Cultural Center     4.5     [2, 6, 16, 0, 0, 0, 0]   
95                  Maslak Pavilions     4.6      [1, 3, 4, 7, 0, 0, 0]   
117         Taksim Republic Monument     4.7      [1, 3, 7, 0, 0, 0, 0]   
50   Museum of Islamic Civilizations     4.7      [2, 4, 5, 0, 0, 0, 0]   
172          Sirkeci Railway Station     4.6      [3, 4, 7, 0, 0, 0, 0]   
51                 Fethi Pasha Grove     4.6  [9, 10, 11, 12, 13, 0, 0]   
152                     Chora Mosque     4.4      [1, 2, 3, 5, 0, 0, 0]   

     territory_id.1                              locationYX  
19       