In [68]:
import os

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate, GlobalAveragePooling1D, Normalization, Multiply
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Report the TensorFlow version, the visible GPU devices, and a per-kind device count.
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))
for count_label, device_kind in (
    ("Num GPUs Available: ", 'GPU'),
    ("Num CPUs Available: ", 'CPU'),
    ("Num TPUs Available: ", 'TPU'),
):
    print(count_label, len(tf.config.list_physical_devices(device_kind)))

2.16.1
[]
Num GPUs Available:  0
Num CPUs Available:  1
Num TPUs Available:  0


---

This Python cell performs data preprocessing on a CSV file located at the specified file path. Here's a breakdown of the preprocessing steps:

1. **Reading Data**: Reads the CSV file into a pandas DataFrame named `whole_data`.
2. **Removing Columns**: Drops specific columns ('website', 'place_links', 'description', 'territory_id') from the DataFrame.
3. **Converting Tag Data**: Defines a function `safe_int_convert` to convert tag data into a standardized format. Tags are either stored as lists, single integers, or comma-separated strings. This function converts them into a list of integers.
4. **Applying Data Transformation**: Applies the `safe_int_convert` function to the 'tags' column of the DataFrame. Converts the 'rating' column to float type and fills missing values with the mean.
5. **Padding Sequences**: Utilizes `pad_sequences` from Keras to pad sequences of tags to ensure uniform length.
6. **Extracting Unique Tags**: Flattens the padded tags and extracts unique tags.
7. **Return Values**: Returns the preprocessed DataFrame `whole_data` and an array of unique tags `unique_tags`.

Additionally, it prints information about the preprocessed data, such as the shape of the DataFrame and the unique tags extracted.


In [69]:
# Data Preprocessing
def preprocess_data(file_path):
    """Load the place CSV and normalise its ``tags`` and ``rating`` columns.

    Steps:
      1. Read ``file_path`` and drop the ``website``, ``place_links``,
         ``description`` and ``territory_id`` columns.
      2. Convert every ``tags`` cell to a list of ints.
      3. Cast ``rating`` to float and fill missing values with the column mean.
      4. Zero-pad the tag lists on the right so every row has the same length.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        ``(whole_data, unique_tags)``: the cleaned DataFrame (``tags`` holds one
        padded integer array per row) and the sorted unique values found in the
        padded tags. The padding value 0 is part of ``unique_tags`` whenever
        any row needed padding.
    """
    whole_data = pd.read_csv(file_path).drop(
        ["website", "place_links", "description", "territory_id"], axis=1
    )

    def safe_int_convert(tag_list):
        """Coerce one raw ``tags`` cell into a list of ints (``[]`` if unusable)."""
        if isinstance(tag_list, list):
            return tag_list
        if isinstance(tag_list, (int, float, np.integer, np.floating)):
            # A missing CSV cell is read as float NaN; int(NaN) raises
            # ValueError and int(inf) raises OverflowError, so treat both as
            # "no tags" instead of crashing the whole load.
            if not np.isfinite(tag_list):
                return []
            return [int(tag_list)]
        if isinstance(tag_list, str):
            # Ignore empty tokens so "1,2," or "1,,2" keep their valid tags.
            tokens = [token for token in tag_list.split(',') if token.strip()]
            try:
                return [int(token) for token in tokens]
            except ValueError:
                return []
        return []

    whole_data['tags'] = whole_data['tags'].apply(safe_int_convert)

    # Take the mean from the float-converted column so the fill value is
    # computed on the same numeric data it is inserted into.
    ratings = whole_data['rating'].astype(float)
    whole_data['rating'] = ratings.fillna(ratings.mean())

    tags_padded = pad_sequences(whole_data['tags'], padding='post')
    whole_data['tags'] = list(tags_padded)
    unique_tags = np.unique(tags_padded)

    return whole_data, unique_tags

print("Preprocessing data...")

# Run the preprocessing once and keep both results as notebook-level variables.
place_csv_path = '../../../data/place_data.csv'
whole_data, unique_tags = preprocess_data(place_csv_path)

# Summarise what was loaded: frame shape, tag vocabulary, and column names.
for summary_label, summary_value in (
    ("\nWhole data shape: ", whole_data.shape),
    ("\nUnique tags shape: ", unique_tags.shape),
    ("Unique tags: ", unique_tags),
    ("\nWhole data columns: ", whole_data.columns),
):
    print(summary_label, summary_value)

print("\nWhole data table: ")
whole_data


Preprocessing data...

Whole data shape:  (187, 5)

Unique tags shape:  (23,)
Unique tags:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]

Whole data columns:  Index(['name', 'rating', 'place_id', 'tags', 'locationYX'], dtype='object')

Whole data table: 


Unnamed: 0,name,rating,place_id,tags,locationYX
0,Kadikoy Ferry Terminal,4.6,370509,"[1, 9, 14, 0, 0, 0, 0]","40.99269778351916,29.023280555674663"
1,Kadikoy Bull Statue,4.5,761356,"[1, 3, 6, 0, 0, 0, 0]","40.990473264783475,29.029131932189433"
2,Kadikoy-moda Streets,4.6,877687,"[13, 14, 15, 0, 0, 0, 0]","41.0300084184215,28.98441527977153"
3,Moda Beach Park,4.7,292410,"[9, 11, 12, 0, 0, 0, 0]","40.98000940235465,29.026848556608424"
4,IDEA Kadikoy,4.6,827370,"[9, 14, 16, 0, 0, 0, 0]","40.98015752580784,29.02810950338273"
...,...,...,...,...,...
182,Palace of the Porphyrogenitus,4.5,871687,"[2, 3, 4, 7, 0, 0, 0]","41.03788124723142,28.93945445055248"
183,Column of Marcian,4.3,265458,"[1, 2, 3, 7, 0, 0, 0]","41.02630809319681,28.95342213409043"
184,Beyazıt Tower,4.5,505870,"[1, 2, 3, 7, 0, 0, 0]","41.01817417223928,28.96347312671263"
185,Grand Bazaar,4.1,102220,"[1, 2, 3, 7, 9, 17, 0]","41.014800403046365,28.967378072061717"


---

This Python cell defines a custom triplet loss function commonly used in triplet loss networks for learning embeddings. Here's an explanation of the function:

1. **Input Parameters**:
   - `anchor`: The embedding vector for the anchor sample.
   - `positive`: The embedding vector for the positive sample (same class as anchor).
   - `negative`: The embedding vector for the negative sample (different class from anchor).
   - `margin`: The margin value, a hyperparameter that specifies the minimum difference between the distances of anchor-positive and anchor-negative pairs.

2. **Calculating Distances**:
   - Computes the squared Euclidean distance between the anchor and positive samples, and between the anchor and negative samples (no square root is taken).
   - Uses `tf.reduce_sum` to sum the squared differences along the last axis (feature dimension).

3. **Loss Calculation**:
   - Computes the difference between the positive and negative distances, adds the margin, and takes the element-wise maximum with 0. The loss for a triplet is therefore non-zero only when the positive distance is not smaller than the negative distance by at least the margin.
   - Finally, computes the mean of these losses across all samples.

4. **Return Value**:
   - Returns the mean triplet loss computed over the entire batch of samples.

This loss function encourages the network to learn embeddings such that the distance between the anchor and positive samples is minimized, while the distance between the anchor and negative samples is maximized by at least the specified margin.


In [70]:
# Define Triplet Loss
def triplet_loss(anchor, positive, negative, margin=1.0):
    """Mean hinge triplet loss on squared Euclidean distances.

    For each triplet the loss is ``max(d(a, p) - d(a, n) + margin, 0)``, where
    ``d`` is the sum of squared differences over the last axis. The per-triplet
    values are then averaged into a single scalar.

    Args:
        anchor: Embeddings of the anchor samples.
        positive: Embeddings that should end up close to the anchors.
        negative: Embeddings that should end up far from the anchors.
        margin: Required gap between the negative and positive distances.

    Returns:
        The mean of the hinge values as a scalar tensor.
    """
    def _squared_distance_to_anchor(other):
        # Sum of squared differences along the embedding (last) axis.
        return tf.reduce_sum(tf.square(anchor - other), axis=-1)

    to_positive = _squared_distance_to_anchor(positive)
    to_negative = _squared_distance_to_anchor(negative)
    hinge = tf.maximum(to_positive - to_negative + margin, 0.0)
    return tf.reduce_mean(hinge)


---

This Python cell defines a function to create a triplet model for learning embeddings. Here's an explanation of the function:

1. **Input Parameters**:
   - `unique_tags`: An array containing unique tag IDs used for embedding the tag input.
   - `embedding_dim`: Dimensionality of the embedding space.
   - `dense_units`: Number of units in the dense layer.

2. **Model Architecture**:
   - **Tag Input**: Defines an input layer for tag data.
   - **Tag Lookup**: Utilizes `IntegerLookup` layer from TensorFlow to map tag IDs to embedding indices.
   - **Tag Embedding**: Embeds the tag input using an `Embedding` layer with specified dimensions.
   - **Tag Pooling**: Uses `GlobalAveragePooling1D` to aggregate tag embeddings across time dimension.
   - **Rating Input**: Defines an input layer for the rating data.
   - **Rating Normalization**: Normalizes the rating input using `Normalization` layer.
   - **Combining Inputs**: Concatenates the pooled tag embeddings and normalized rating.
   - **Dense Layers**: Applies a dense layer with ReLU activation to the combined embeddings.
   - **Output Layer**: Outputs the embeddings with dimensions equal to `embedding_dim`.

3. **Model Creation**:
   - Constructs a `Model` object with inputs as tag and rating inputs and output as the embedding layer.

This model architecture combines both tag embeddings and normalized ratings to generate embeddings of the specified dimensionality.


In [71]:
# Create Triplet Model
def create_triplet_model(unique_tags, embedding_dim=64, dense_units=128, rating_samples=None):
    """Build the embedding network that maps ``(tags, rating)`` to a vector.

    Architecture: tag ids -> ``IntegerLookup`` -> ``Embedding`` -> average over
    the sequence axis; rating -> ``Normalization``; both are concatenated and
    passed through ``Dense(dense_units, relu)`` and ``Dense(embedding_dim)``.

    No masking is configured, so padded tag positions are averaged like any
    other tag.

    Args:
        unique_tags: Vocabulary of tag ids handed to ``IntegerLookup``.
        embedding_dim: Size of the tag embeddings and of the output vector.
        dense_units: Width of the hidden dense layer.
        rating_samples: Optional array-like of ratings used to adapt the
            ``Normalization`` layer. When omitted the layer is only built and
            never adapted, which leaves it at its initial statistics (mean 0,
            variance 1 in Keras), so ratings pass through unscaled. Pass the
            training ratings to get real standardisation.

    Returns:
        An uncompiled ``Model`` with inputs named ``tags`` and ``rating``.
    """
    tag_input = Input(shape=(None,), dtype=tf.int32, name='tags')
    tag_lookup = tf.keras.layers.IntegerLookup(vocabulary=unique_tags, mask_token=None)
    # One extra row for the out-of-vocabulary index the lookup layer reserves.
    tag_embedding = Embedding(input_dim=len(unique_tags) + 1, output_dim=embedding_dim)
    tag_embeddings = tag_embedding(tag_lookup(tag_input))
    tag_embeddings = GlobalAveragePooling1D()(tag_embeddings)

    rating_input = Input(shape=(1,), dtype=tf.float32, name='rating')
    rating_normalization = Normalization(axis=None)
    rating_normalization.build((None, 1))
    if rating_samples is not None:
        # axis=None learns one scalar mean/variance over every supplied rating.
        rating_normalization.adapt(
            np.asarray(rating_samples, dtype='float32').reshape(-1, 1)
        )
    rating_normalized = rating_normalization(rating_input)

    combined_embeddings = Concatenate()([tag_embeddings, rating_normalized])
    dense = Dense(dense_units, activation='relu')(combined_embeddings)
    output = Dense(embedding_dim)(dense)

    model = Model(inputs=[tag_input, rating_input], outputs=output)
    return model


---

This Python cell defines a function to generate triplets for training a triplet loss model. Here's an explanation of the function:

1. **Input Parameter**:
   - `dataframe`: Pandas DataFrame containing the preprocessed data.

2. **Data Preparation**:
   - Initializes dictionaries to store features for anchor, positive, and negative samples.
   - Iterates over each row in the provided DataFrame.

3. **Generating Triplets**:
   - For each row, selects a random index from the DataFrame to create positive and negative samples.
   - Retrieves the corresponding rows for positive and negative samples.
   - Appends tag features and rating to the respective dictionaries for anchor, positive, and negative samples.

4. **Data Padding**:
   - Applies padding to tag sequences using `pad_sequences` to ensure uniform length across all samples.

5. **Return Value**:
   - Returns three dictionaries containing features for anchor, positive, and negative samples, respectively. Each dictionary contains tag sequences padded to the maximum length and ratings as numpy arrays.

This function facilitates the creation of triplets necessary for training a triplet loss model, ensuring that each triplet consists of an anchor, a positive, and a negative sample.


In [72]:
# Create Triplets for Training
def generate_triplets(dataframe, seed=None, max_resample=10):
    """Build one (anchor, positive, negative) triplet per row of ``dataframe``.

    Every row acts as an anchor once. For each anchor two other rows are drawn
    at random and ranked by the Jaccard overlap of their tag sets with the
    anchor's tags: the more similar row becomes the positive, the less similar
    one the negative. Drawing both roles from the same uniform distribution
    (and allowing the anchor itself to be drawn) gives the triplet loss no
    similarity signal to learn from, which is what this ranking avoids.

    Args:
        dataframe: Frame with a ``tags`` column (one integer sequence per row)
            and a ``rating`` column.
        seed: Optional seed for reproducible sampling. When ``None`` the global
            ``np.random`` state is used.
        max_resample: How many candidate pairs to try per anchor when both
            candidates are equally similar to it. After that many ties the last
            pair is kept as drawn.

    Returns:
        Three dicts (anchor, positive, negative), each with ``'tags'`` as a
        zero-padded 2-D integer array and ``'rating'`` as a 1-D array, aligned
        row by row.

    Raises:
        ValueError: If ``dataframe`` has no rows.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    tag_rows = dataframe['tags'].tolist()
    ratings = dataframe['rating'].tolist()
    n_rows = len(tag_rows)
    if n_rows == 0:
        raise ValueError("dataframe must contain at least one row to build triplets.")

    # NOTE(review): 0 is treated as padding here (the default pad value of
    # pad_sequences), not as a real tag id -- confirm against the tag catalogue.
    tag_sets = [frozenset(int(tag) for tag in tags if tag != 0) for tags in tag_rows]

    def _jaccard(left, right):
        # Tag-set overlap in [0, 1]; two empty sets count as no overlap.
        union = left | right
        return len(left & right) / len(union) if union else 0.0

    def _draw_candidates(anchor_pos):
        # Pick two candidate row positions for the given anchor.
        if n_rows < 3:
            # Too few rows for two distinct non-anchor candidates: fall back to
            # unconstrained sampling so tiny frames still produce triplets.
            return rng.choice(n_rows, size=2)
        drawn = rng.choice(n_rows - 1, size=2, replace=False)
        # Shift positions at or after the anchor by one so it is never drawn.
        return drawn + (drawn >= anchor_pos)

    anchor_features = {'tags': [], 'rating': []}
    positive_features = {'tags': [], 'rating': []}
    negative_features = {'tags': [], 'rating': []}

    for anchor_pos in range(n_rows):
        for _ in range(max(1, max_resample)):
            first, second = (int(pos) for pos in _draw_candidates(anchor_pos))
            first_similarity = _jaccard(tag_sets[anchor_pos], tag_sets[first])
            second_similarity = _jaccard(tag_sets[anchor_pos], tag_sets[second])
            if first_similarity != second_similarity:
                break

        if first_similarity >= second_similarity:
            positive_pos, negative_pos = first, second
        else:
            positive_pos, negative_pos = second, first

        for features, row_pos in (
            (anchor_features, anchor_pos),
            (positive_features, positive_pos),
            (negative_features, negative_pos),
        ):
            features['tags'].append(tag_rows[row_pos])
            features['rating'].append(ratings[row_pos])

    def _as_arrays(features):
        # Pad the tag sequences to a common length and stack the ratings.
        return {
            'tags': pad_sequences(features['tags'], padding='post'),
            'rating': np.array(features['rating']),
        }

    return (
        _as_arrays(anchor_features),
        _as_arrays(positive_features),
        _as_arrays(negative_features),
    )


---

This Python cell defines a generator function for creating batches of triplets during model training. Here's an explanation of the function:

1. **Input Parameters**:
   - `anchor_features`: Dictionary containing features for anchor samples.
   - `positive_features`: Dictionary containing features for positive samples.
   - `negative_features`: Dictionary containing features for negative samples.
   - `batch_size`: Number of triplets to generate in each batch.

2. **Generator Loop**:
   - Enters an infinite loop to continuously generate batches of triplets.
   - Shuffles the indices of the anchor features to randomize the order of samples in each epoch.

3. **Batch Generation**:
   - Divides the shuffled indices into batches of size `batch_size`.
   - For each batch, retrieves the corresponding features for anchor, positive, and negative samples.
   - Constructs dictionaries for anchor, positive, and negative batches with features.

4. **Yielding Batches**:
   - Yields a tuple containing dictionaries for anchor, positive, and negative batches.

5. **Usage in Training**:
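   - This generator is consumed with `next()` inside a custom training loop. It yields `(anchor, positive, negative)` triplets rather than the `(inputs, targets)` pairs that Keras `fit` expects, so it is not meant to be passed to `fit` directly.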

This generator function enables the creation of batches of triplets on-the-fly, allowing efficient training of triplet loss models without the need to store all possible triplets in memory.


In [73]:
# Triplet Generator
def triplet_generator(anchor_features, positive_features, negative_features, batch_size=32):
    """Yield shuffled batches of aligned (anchor, positive, negative) features forever.

    Each pass reshuffles the row indices and walks them in chunks of
    ``batch_size``; the last chunk of a pass may be smaller. The same indices
    are applied to all three feature dicts, so triplets stay aligned.

    Args:
        anchor_features: Dict of arrays indexed along axis 0; must contain ``'tags'``.
        positive_features: Dict of arrays aligned row by row with the anchors.
        negative_features: Dict of arrays aligned row by row with the anchors.
        batch_size: Maximum number of triplets per yielded batch.

    Yields:
        ``(anchor_batch, positive_batch, negative_batch)`` dicts with the same
        keys as the inputs.

    Raises:
        ValueError: On the first ``next()`` call if there are no triplets or
            ``batch_size`` is not positive. Without this check the endless loop
            would never yield and the caller would hang.
    """
    n_triplets = len(anchor_features['tags'])
    if n_triplets == 0:
        raise ValueError("anchor_features['tags'] is empty; there are no triplets to batch.")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}.")

    while True:
        indices = np.arange(n_triplets)
        np.random.shuffle(indices)

        for start in range(0, n_triplets, batch_size):
            batch_indices = indices[start:start + batch_size]

            anchor_batch = {k: v[batch_indices] for k, v in anchor_features.items()}
            positive_batch = {k: v[batch_indices] for k, v in positive_features.items()}
            negative_batch = {k: v[batch_indices] for k, v in negative_features.items()}

            yield (anchor_batch, positive_batch, negative_batch)


---

This Python cell defines a custom training step using TensorFlow's `tf.function` decorator. Here's an explanation of the function:

1. **Input Parameters**:
   - `model`: The triplet model to be trained.
   - `optimizer`: The optimizer used for updating model weights.
   - `anchor_batch`: Dictionary containing features for anchor samples.
   - `positive_batch`: Dictionary containing features for positive samples.
   - `negative_batch`: Dictionary containing features for negative samples.

2. **Training Step**:
   - Enters a `tf.GradientTape` context to compute gradients with respect to the model's trainable variables.
   - Forward pass: Computes embeddings for anchor, positive, and negative samples using the model.
   - Computes the triplet loss using the embeddings.
   - Backward pass: Computes gradients of the loss with respect to the model's trainable variables.
   - Applies gradients to update the model's weights using the specified optimizer.

3. **Return Value**:
   - Returns the computed loss for the current training step.

4. **Usage in Training Loop**:
   - This function can be called within a custom training loop to perform a single optimization step.
   - It efficiently computes gradients and updates model weights using the triplet loss.

The `@tf.function` decorator ensures that the function is compiled into a TensorFlow graph, optimizing performance during training.


In [74]:
# Custom Training Loop
@tf.function
def train_step(model, optimizer, anchor_batch, positive_batch, negative_batch):
    """Run one optimisation step of the triplet loss and return the batch loss.

    The three batches are embedded by the same ``model`` in training mode, the
    triplet loss of the embeddings is computed under a gradient tape, and the
    gradients are applied to the model's trainable variables via ``optimizer``.
    """
    with tf.GradientTape() as tape:
        # Forward passes share the model weights across the three roles.
        anchor_vectors = model(anchor_batch, training=True)
        positive_vectors = model(positive_batch, training=True)
        negative_vectors = model(negative_batch, training=True)

        batch_loss = triplet_loss(anchor_vectors, positive_vectors, negative_vectors)

    trainable = model.trainable_variables
    grads = tape.gradient(batch_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))
    return batch_loss


---

This Python cell defines a function to train a triplet model using a custom training loop. Here's an explanation of the function:

1. **Input Parameters**:
   - `model`: The triplet model to be trained.
   - `anchor_features`: Dictionary containing features for anchor samples.
   - `positive_features`: Dictionary containing features for positive samples.
   - `negative_features`: Dictionary containing features for negative samples.
   - `epochs`: Number of epochs for training.
   - `batch_size`: Number of triplets to include in each training batch.

2. **Optimizer Initialization**:
   - Initializes an Adam optimizer with a specified learning rate.

3. **Triplet Generator Initialization**:
   - Initializes a triplet generator using the provided anchor, positive, and negative features and the specified batch size.

4. **Training Loop**:
   - Iterates over each epoch.
   - Within each epoch, iterates over each step in the training data.
   - Retrieves the next batch of triplets from the triplet generator.
   - Performs a single training step using the `train_step` function defined previously.
   - Prints the loss at regular intervals for monitoring training progress.

5. **Model Saving**:
   - Saves the trained model after completing all epochs.

This function facilitates the training of a triplet model using a custom training loop, allowing fine-grained control over the training process, including batch size, optimizer choice, and training duration.


In [75]:
# Train Triplet Model
def train_triplet_model(model, anchor_features, positive_features, negative_features, epochs=100, batch_size=32,
                        save_path='../.models/triplet_model.keras'):
    """Train ``model`` on precomputed triplets with a custom loop, then save it.

    Args:
        model: The embedding model to optimise.
        anchor_features: Dict of arrays for the anchors; must contain ``'tags'``.
        positive_features: Dict of arrays aligned with the anchors.
        negative_features: Dict of arrays aligned with the anchors.
        epochs: Number of passes over the triplets.
        batch_size: Maximum number of triplets per optimisation step.
        save_path: Where the trained model is written. Missing parent
            directories are created first, so a long run is not lost to a
            missing folder.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}.")

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    triplet_gen = triplet_generator(anchor_features, positive_features, negative_features, batch_size=batch_size)

    # Ceiling division: the generator also yields the final partial batch, so
    # this keeps one epoch equal to one full pass. Floor division would give
    # zero steps (no training at all) when there are fewer triplets than
    # batch_size.
    n_triplets = len(anchor_features['tags'])
    steps_per_epoch = -(-n_triplets // batch_size)

    for epoch in range(epochs):
        for step in range(steps_per_epoch):
            anchor_batch, positive_batch, negative_batch = next(triplet_gen)
            loss = train_step(model, optimizer, anchor_batch, positive_batch, negative_batch)

            if step % 100 == 0:
                print(f"Epoch {epoch + 1}, Step {step}, Loss: {loss.numpy()}")

    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    model.save(save_path)
    

---

This Python cell defines a function to create and compile a Neural Collaborative Filtering (NCF) model. Here's an explanation of the function:

1. **Input Parameters**:
   - `num_users`: Total number of users in the dataset.
   - `num_places`: Total number of places in the dataset.
   - `embedding_dim`: Dimensionality of the embedding space for user and place embeddings.
   - `hidden_layers`: List of integers specifying the number of units in each hidden layer of the MLP component.

2. **Model Architecture**:
   - **User Input**: Defines an input layer for user IDs.
   - **User Embedding**: Embeds the user input using an `Embedding` layer.
   - **Flattening**: Flattens the user embedding to prepare for concatenation.
   - **Place Input**: Defines an input layer for place IDs.
   - **Place Embedding**: Embeds the place input using another `Embedding` layer.
   - **GMF Component**: Multiplies user and place embeddings element-wise to create a GMF vector.
   - **MLP Component**: Concatenates user and place embeddings to create an input vector for the MLP.
   - **Hidden Layers**: Applies a series of dense layers with ReLU activation to the concatenated input vector.
   - **Combined Vector**: Concatenates the GMF and MLP vectors.
   - **Output Layer**: Outputs a single sigmoid value indicating the likelihood of user-place interaction.

3. **Model Compilation**:
   - Compiles the model using the Adam optimizer and binary cross-entropy loss, with accuracy as the evaluation metric.

4. **Return Value**:
   - Returns the compiled NCF model ready for training.

This function enables the creation of an NCF model architecture suitable for collaborative filtering tasks, with customizable embedding dimensions and hidden layers for the MLP component.


In [76]:
# Create and Train NCF Model
def create_ncf_model(num_users, num_places, embedding_dim=50, hidden_layers=[64, 32, 16, 8]):
    """Build and compile a Neural Collaborative Filtering model.

    User and place ids are embedded separately. Their element-wise product
    (GMF branch) and the output of a ReLU MLP over their concatenation (MLP
    branch) are concatenated and fed to a single sigmoid unit.

    Args:
        num_users: Size of the user embedding table.
        num_places: Size of the place embedding table.
        embedding_dim: Width of both embedding tables.
        hidden_layers: Units of each dense layer in the MLP branch, in order.
            The list is only iterated, never modified.

    Returns:
        A model compiled with Adam, binary cross-entropy and accuracy.
    """
    def _embedding_branch(input_name, embedding_name, vocabulary_size):
        # One id input -> embedding lookup -> flattened vector.
        id_input = Input(shape=(1,), name=input_name)
        embedded = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, name=embedding_name)(id_input)
        return id_input, Flatten()(embedded)

    user_input, user_vector = _embedding_branch('user_input', 'user_embedding', num_users)
    place_input, place_vector = _embedding_branch('place_input', 'place_embedding', num_places)

    gmf_vector = Multiply()([user_vector, place_vector])

    mlp_vector = Concatenate()([user_vector, place_vector])
    for layer_width in hidden_layers:
        mlp_vector = Dense(layer_width, activation='relu')(mlp_vector)

    fused = Concatenate()([gmf_vector, mlp_vector])
    interaction_probability = Dense(1, activation='sigmoid')(fused)

    ncf_model = Model(inputs=[user_input, place_input], outputs=interaction_probability)
    ncf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return ncf_model


---

This Python cell defines a function to train an NCF model using user-place interaction data. Here's an explanation of the function:

1. **Input Parameters**:
   - `model`: The NCF model to be trained.
   - `user_data`: Pandas DataFrame containing user-place interaction data, including 'user_index', 'place_index', and 'score' columns.
   - `batch_size`: Number of samples per gradient update.
   - `epochs`: Number of epochs to train the model.

2. **Data Preparation**:
   - Checks if the required columns ('user_index', 'place_index', and 'score') are present in the user data DataFrame.
   - Extracts user indices, place indices, and interaction scores from the DataFrame.

3. **Model Training**:
   - Calls the `fit` method of the model with user indices and place indices as input features and interaction scores as the target variable.
   - Specifies batch size and number of epochs for training.
   - Splits the data into training and validation sets using a validation split of 0.2.

4. **Model Saving**:
   - Saves the trained model after completing all epochs.

This function facilitates the training of the NCF model using user-place interaction data, enabling the model to learn to predict user preferences for different places.


In [77]:
# Train NCF Model
def train_ncf_model(model, user_data, batch_size=32, epochs=5, save_path='../.models/ncf_model.keras'):
    """Fit the NCF model on user-place interactions and save it.

    Args:
        model: Compiled model taking ``[user_indices, place_indices]`` as input.
        user_data: DataFrame with ``user_index``, ``place_index`` and ``score``
            columns.
        batch_size: Samples per gradient update.
        epochs: Number of training epochs.
        save_path: Where the trained model is written. Missing parent
            directories are created first, so training is not lost to a
            missing folder.

    Returns:
        Whatever ``model.fit`` returns (the training history for a Keras
        model), so callers can inspect the training and validation metrics.

    Raises:
        ValueError: If any required column is missing from ``user_data``.
    """
    required_columns = ('user_index', 'place_index', 'score')
    if not set(required_columns).issubset(user_data.columns):
        raise ValueError("user_data must contain 'user_index', 'place_index', and 'score' columns.")

    user_indices = user_data['user_index'].values
    place_indices = user_data['place_index'].values
    interactions = user_data['score'].values

    history = model.fit(
        x=[user_indices, place_indices],
        y=interactions,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2
    )

    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    model.save(save_path)

    return history


---

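This Python cell defines a function to generate content-based recommendations using an embedding model that takes `tags` and `rating` inputs (such as the triplet model defined earlier). Here's an explanation of the function: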

1. **Input Parameters**:
   - `input_features`: Dictionary containing input features for the recommendation query, including 'tags' and 'rating'.
   - `model`: The trained embedding model (e.g. the triplet model) used to embed both the query and the places.
   - `place_data`: Pandas DataFrame containing information about places, including 'tags' and 'rating'.
   - `k`: Number of recommendations to generate (default is 20).

2. **Input Data Conversion**:
   - Converts the input features into TensorFlow tensors suitable for model prediction.
   - Constructs an input dictionary containing 'tags' and 'rating' tensors.

3. **Query Embedding**:
   - Uses the model to predict the embedding for the input query.

4. **Location Embeddings**:
   - Predicts embeddings for all places in the `place_data` DataFrame using the model.
   - Converts tags to sequences and pads them for compatibility with the model input.

5. **Similarity Calculation**:
   - Computes cosine similarity between the query embedding and embeddings of all places.
   - Determines the top k indices of places with the highest similarity scores.

6. **Recommendations**:
   - Returns the indices of the top k recommended places based on similarity scores.

This function facilitates the generation of recommendations by finding places with embeddings most similar to the embedding of the input query, as predicted by the trained embedding model.


In [78]:
# Generate Recommendations
def recommend(input_features, model, place_data, k=20):
    """Return the positions of the ``k`` places most similar to a query.

    The query and every row of ``place_data`` are embedded with ``model`` and
    ranked by cosine similarity to the query embedding.

    Args:
        input_features: Dict with ``'tags'`` (sequence of tag ids) and
            ``'rating'`` describing the query.
        model: Embedding model accepting a dict with ``'tags'`` and ``'rating'``.
        place_data: DataFrame with ``tags`` and ``rating`` columns.
        k: Number of positions to return. Values larger than the number of
            places return every place.

    Returns:
        A 1-D integer array of row positions into ``place_data``, most similar
        first. It is empty when ``k`` is not positive.
    """
    if k <= 0:
        # `scores[-0:]` selects every element, so asking for zero results
        # would otherwise return all places.
        return np.array([], dtype=np.intp)

    input_dict = {
        'tags': tf.convert_to_tensor([input_features['tags']], dtype=tf.int32),
        'rating': tf.convert_to_tensor([input_features['rating']], dtype=tf.float32)
    }

    query_embedding = model.predict(input_dict)
    location_embeddings = model.predict({
        'tags': pad_sequences(place_data['tags'], padding='post'),
        'rating': place_data['rating'].values
    })

    similarities = cosine_similarity(query_embedding, location_embeddings)
    # argsort is ascending: take the last k and reverse for best-first order.
    top_k_indices = similarities[0].argsort()[-k:][::-1]

    return top_k_indices


---

This Python cell defines a function to generate recommendations for a specific user using an NCF model. Here's an explanation of the function:

1. **Input Parameters**:
   - `user_id`: Identifier of the user for whom recommendations are generated.
   - `model`: The trained NCF model used for generating recommendations.
   - `place_data`: Pandas DataFrame containing information about places.
   - `user_id_to_index`: Dictionary mapping user IDs to their corresponding indices.
   - `k`: Number of recommendations to generate (default is 10).

2. **User ID Validation**:
   - Checks if the provided user ID exists in the `user_id_to_index` dictionary. Raises a KeyError if not found.

3. **User Index Retrieval**:
   - Retrieves the index of the user from the `user_id_to_index` dictionary.

4. **Place Indices Generation**:
   - Creates an array of indices representing all places in the `place_data` DataFrame.
   - Creates an array of user indices, with the user index repeated for each place.

5. **Prediction and Sorting**:
   - Predicts scores for all places for the specified user using the NCF model.
   - Sorts the places based on predicted scores and selects the top k indices.

6. **Top-k Recommendations**:
   - Retrieves the top-k recommended places from the `place_data` DataFrame based on the sorted indices.

This function facilitates the generation of recommendations for a specific user by predicting scores for all places and selecting the top-k recommendations.


In [79]:
def ncf_recommend(user_id, model, place_data, user_id_to_index, k=10):
    """Return the ``k`` rows of ``place_data`` that ``model`` scores highest for a user.

    Every place is scored for the user by passing its row position
    (``0 .. len(place_data) - 1``) to the model as the place index.

    Args:
        user_id: Identifier of the user to recommend for.
        model: Model whose ``predict([user_indices, place_indices])`` returns
            one score per pair.
        place_data: DataFrame of places; rows are selected by position.
        user_id_to_index: Mapping from user id to the user index the model expects.
        k: Number of places to return. Values larger than the number of places
            return every place.

    Returns:
        The selected rows of ``place_data``, highest score first. The frame is
        empty (columns preserved) when ``k`` is not positive.

    Raises:
        KeyError: If ``user_id`` is not present in ``user_id_to_index``.
    """
    if user_id not in user_id_to_index:
        raise KeyError(f"User ID {user_id} not found in user_id_to_index.")

    if k <= 0:
        # `scores[-0:]` selects every element, so asking for zero results
        # would otherwise return all places.
        return place_data.iloc[0:0]

    user_index = user_id_to_index[user_id]
    place_indices = np.arange(len(place_data))
    user_indices = np.full(len(place_data), user_index)

    scores = model.predict([user_indices, place_indices]).flatten()
    # argsort is ascending: take the last k and reverse for best-first order.
    top_k_indices = scores.argsort()[-k:][::-1]
    top_k_places = place_data.iloc[top_k_indices]

    return top_k_places


---

This Python cell defines a function to generate recommendations using a weighted hybrid approach combining content-based and collaborative filtering methods. Here's an explanation of the function:

1. **Input Parameters**:
   - `input_features`: Dictionary containing input features for the recommendation query, including 'tags' and 'rating'.
   - `user_id`: Identifier of the user for whom recommendations are generated.
   - `content_model`: The trained content-based recommendation model.
   - `cf_model`: The trained collaborative filtering (CF) model.
   - `place_data`: Pandas DataFrame containing information about places.
   - `user_id_to_index`: Dictionary mapping user IDs to their corresponding indices.
   - `cb_weight`: Weight assigned to content-based recommendations (default is 0.5).
   - `cf_weight`: Weight assigned to collaborative filtering recommendations (default is 0.5).
   - `k`: Number of recommendations to generate (default is 10).
   - `food_tags`: List of tags representing food-related categories (default is [13, 14, 15, 16]).

2. **Content-Based Recommendations**:
   - Generates content-based recommendations using the `recommend` function.
   - Retrieves top k recommendations based on content similarity.

3. **Collaborative Filtering Recommendations**:
   - Generates collaborative filtering recommendations using the `ncf_recommend` function.
   - Retrieves top k recommendations based on predicted scores from the CF model.

4. **Combining Recommendations**:
   - Concatenates content-based and collaborative filtering recommendations.
   - Marks each recommendation with its source (content-based or collaborative filtering).

5. **Normalization and Weighted Scores**:
   - Normalizes scores obtained from both methods using MinMaxScaler.
   - Calculates weighted scores based on the specified weights for content-based and collaborative filtering recommendations.

6. **Sorting and Filtering**:
   - Sorts recommendations based on weighted scores in descending order.
   - Limits recommendations to top k places.

7. **Food Tag Limitation**:
   - Limits the number of food-tagged places to a maximum of 3.
   - Ensures that the final recommendations list contains exactly k places.

This function enables the generation of recommendations by combining content-based and collaborative filtering approaches with customizable weights.


In [80]:
def weighted_hybrid_recommend(input_features, user_id, content_model, cf_model, place_data, user_id_to_index, cb_weight=0.5, cf_weight=0.5, k=10, food_tags=[13,14,15,16]):
    """Blend content-based and collaborative-filtering candidates into one ranked list.

    Args:
        input_features: Mapping with a 'tags' sequence and a 'rating' value
            describing the query fed to ``content_model``.
        user_id: Key looked up in ``user_id_to_index`` for the CF model.
        content_model: Model whose ``predict`` output is compared with cosine
            similarity to score content-based candidates.
        cf_model: Model whose ``predict`` takes ``[user_indices, place_indices]``.
        place_data: DataFrame of places; the 'tags', 'rating' and 'name'
            columns are read here ('place_id' too when present).
        user_id_to_index: Mapping from user id to the CF model's user index.
        cb_weight: Multiplier applied to normalized content-based scores.
        cf_weight: Multiplier applied to normalized CF scores.
        k: Maximum number of rows to return.
        food_tags: Tags that mark a place as food-related (never mutated here).

    Returns:
        DataFrame with at most ``k`` candidate rows, extended with 'source',
        'score' and 'weighted_score' columns and ordered by 'weighted_score'
        descending. At most 3 of the rows taken from the top ``k`` are
        food-tagged; freed slots are backfilled with the best remaining
        non-food candidates.
    """
    def _has_food_tag(tags):
        # True when at least one of the place's tags is a food tag.
        return any(tag in food_tags for tag in tags)

    # Content-based candidates; the result is used below as positional row
    # selectors into place_data.
    cb_recommendations = recommend(input_features, content_model, place_data, k)

    # Collaborative-filtering candidates. A KeyError (e.g. a user id missing
    # from user_id_to_index) falls back to no CF candidates and all-zero scores.
    try:
        cf_recommendations = ncf_recommend(user_id, cf_model, place_data, user_id_to_index, k)
        cf_scores = cf_model.predict([np.full((len(place_data),), user_id_to_index[user_id]), place_data.index.values])
        cf_scores = cf_scores.flatten()
    except KeyError:
        cf_recommendations = pd.DataFrame()
        cf_scores = np.zeros(len(place_data))

    # Combine both candidate sets and remember where each row came from.
    all_recommendations = pd.concat([place_data.iloc[cb_recommendations], cf_recommendations])
    all_recommendations['source'] = ['content-based'] * len(cb_recommendations) + ['collaborative-filtering'] * len(cf_recommendations)

    if 'place_id' not in all_recommendations.columns:
        all_recommendations['place_id'] = all_recommendations['name'].apply(lambda name: place_data.loc[place_data['name'] == name, 'place_id'].iloc[0])

    # keep='first' favours the content-based copy of a place found by both
    # methods; copy so the new columns are added to an independent frame.
    all_recommendations = all_recommendations.drop_duplicates('place_id').copy()

    # Index labels of the surviving rows per source. Rows stay in concat order
    # (content-based first), which is the order the scores are built in below.
    cb_index = all_recommendations.index[all_recommendations['source'] == 'content-based']
    cf_index = all_recommendations.index[all_recommendations['source'] == 'collaborative-filtering']

    # NOTE(review): place_data.iloc[label] and cf_scores[label] are positional
    # lookups driven by index labels — this assumes place_data (and the frame
    # returned by ncf_recommend) carry a default 0..n-1 index. TODO confirm.
    cb_scores = []
    if len(cb_index) > 0:
        # The query embedding does not depend on the candidate: compute it once
        # instead of once per candidate.
        query_embedding = content_model.predict({
            'tags': tf.convert_to_tensor([input_features['tags']], dtype=tf.int32),
            'rating': tf.convert_to_tensor([input_features['rating']], dtype=tf.float32)
        })
        for idx in cb_index:
            place_row = place_data.iloc[idx]
            place_embedding = content_model.predict({
                'tags': pad_sequences([place_row['tags']], padding='post'),
                'rating': np.array([place_row['rating']])
            })
            cb_scores.append(cosine_similarity(query_embedding, place_embedding)[0].reshape(-1, 1))

    raw_scores = np.concatenate(cb_scores + [cf_scores[cf_index].reshape(-1, 1)])

    # One scaler is fitted over content-based and CF scores together, so both
    # sources share a single [0, 1] range.
    scaler = MinMaxScaler()
    all_recommendations['score'] = scaler.fit_transform(raw_scores).ravel()

    # Weight each normalized score by the weight of the method that produced it.
    source_weight = np.where(all_recommendations['source'] == 'content-based', cb_weight, cf_weight)
    all_recommendations['weighted_score'] = all_recommendations['score'] * source_weight

    # Rank every candidate once (stable sort keeps ties in candidate order) and
    # take the top k.
    ranked = all_recommendations.sort_values('weighted_score', ascending=False, kind='mergesort')
    top_recommendations = ranked.head(k)

    # Keep at most 3 food-tagged places out of the top k.
    food_mask = top_recommendations['tags'].apply(_has_food_tag).astype(bool)
    final_recommendations = pd.concat([top_recommendations[~food_mask], top_recommendations[food_mask].head(3)])

    # Backfill freed slots with the best-scoring unused non-food candidates.
    if len(final_recommendations) < k:
        remaining_slots = k - len(final_recommendations)
        unused = ranked[~ranked['place_id'].isin(final_recommendations['place_id'])]
        unused_non_food = unused[~unused['tags'].apply(_has_food_tag).astype(bool)]
        final_recommendations = pd.concat([final_recommendations, unused_non_food.head(remaining_slots)])

    # Restore score order, which the food/non-food split above disturbed.
    return final_recommendations.sort_values('weighted_score', ascending=False, kind='mergesort').head(k)


---

This Python cell defines the main script for recommending places to a user. Here's an explanation of the script:

1. **Data Preprocessing**:
   - Loads and preprocesses place data to prepare it for model training and recommendation.
   - Extracts unique tags from the place data.

2. **User Data Processing**:
   - Loads historical user interaction data.
   - Preprocesses user data by adding user and place indices.
   - Creates dictionaries mapping user and place IDs to their corresponding indices.

3. **Model Training (NCF)**:
   - Creates an NCF model using the number of unique users and places.
   - Trains the NCF model using historical user interaction data.

4. **Model Loading**:
   - Loads the pre-trained Triplet and NCF models.

5. **Input Features Generation**:
   - Samples five distinct tags at random from the unique tags and pairs them with a rating of 4.5 to form the input features.

6. **Recommendation Generation**:
   - Calls the `weighted_hybrid_recommend` function to generate recommendations.
   - Passes the input features along with the Triplet and NCF models, user data, and other parameters.
   - Prints the recommended places for the specified user ID.

7. **Return Value**:
   - Returns the list of recommended place IDs.

This script orchestrates the entire process of recommending places to a user, including data preprocessing, model training, input feature generation, recommendation generation, and output display.


In [81]:
# Main script
def recommend_places(user_id, n_tags=5, rating=4.5, seed=None):
    """Run the recommendation pipeline for one user and return place ids.

    Loads the place and interaction CSVs, trains an NCF model, loads the saved
    triplet and NCF models, builds query features from randomly sampled tags
    and asks ``weighted_hybrid_recommend`` for 10 places.

    Args:
        user_id: User to recommend for; looked up in the user index mapping
            built from the interaction data.
        n_tags: Number of distinct tags to sample for the query (capped at the
            number of available unique tags).
        rating: Rating value used in the query features.
        seed: Optional seed for the tag sampling. ``None`` uses NumPy's global
            random state, as before.

    Returns:
        List of the 'place_id' values of the recommended places.
    """
    # Place data and the set of tags that occur in it.
    whole_data, unique_tags = preprocess_data('../../../data/place_data.csv')

    # The triplet model is not trained here; the saved one is loaded below.

    # Historical user/place interactions.
    user_data = pd.read_csv('../../../data/historical/historical_interactions.csv')

    # NOTE(review): this replaces 'user_id' with the row index, so every
    # interaction row becomes its own user — confirm this is intended.
    user_data['user_id'] = user_data.index
    unique_user_ids = user_data['user_id'].unique()
    unique_place_ids = user_data['place_id'].unique()

    # Dense 0..n-1 indices for the embedding layers of the NCF model.
    user_id_to_index = {uid: position for position, uid in enumerate(unique_user_ids)}
    place_id_to_index = {pid: position for position, pid in enumerate(unique_place_ids)}
    user_data['user_index'] = user_data['user_id'].map(user_id_to_index)
    user_data['place_index'] = user_data['place_id'].map(place_id_to_index)

    num_users = user_data['user_index'].nunique()
    num_places = user_data['place_index'].nunique()

    # Create and train the NCF model on the interaction data.
    ncf_model = create_ncf_model(num_users, num_places)
    train_ncf_model(ncf_model, user_data)

    # Load the saved models.
    # NOTE(review): this rebinds ncf_model, so the model trained above is only
    # used if train_ncf_model saved it to this path — TODO confirm.
    triplet_model = load_model('../.models/triplet_model.keras', custom_objects={'triplet_loss': triplet_loss})
    ncf_model = load_model('../.models/ncf_model.keras')

    # Query features: n_tags distinct tags sampled from unique_tags plus the
    # given rating. The sample size is capped so replace=False cannot fail.
    # NOTE(review): unique_tags may include a padding value (the recorded run
    # sampled tag 0) — verify against preprocess_data.
    rng = np.random.default_rng(seed) if seed is not None else np.random
    sample_size = min(n_tags, len(unique_tags))
    tags = rng.choice(unique_tags, size=sample_size, replace=False).tolist()

    print("Input features:")
    print("Tags:", tags)
    print("Rating:", rating)

    input_features = {
        'tags': tags,
        'rating': rating
    }

    print("Recommending places for user with id:", user_id)

    recommended_items = weighted_hybrid_recommend(input_features, user_id, triplet_model, ncf_model, whole_data, user_id_to_index, cb_weight=0.8, cf_weight=0.05, k=10)
    print(recommended_items)

    return recommended_items["place_id"].tolist()

# Run the full pipeline for user id 1; the returned list of place ids is shown as the cell output.
recommend_places(1)

Epoch 1/5
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8478 - loss: 0.5444 - val_accuracy: 0.8896 - val_loss: 0.3513
Epoch 2/5
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9085 - loss: 0.2486 - val_accuracy: 0.7787 - val_loss: 0.5095
Epoch 3/5
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9998 - loss: 0.0068 - val_accuracy: 0.5220 - val_loss: 1.4006
Epoch 4/5
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 2.4760e-04 - val_accuracy: 0.4850 - val_loss: 1.6962
Epoch 5/5
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 1.0000 - loss: 1.0265e-04 - val_accuracy: 0.4915 - val_loss: 1.7989
Input features:
Tags: [22, 0, 16, 6, 19]
Rating: 4.5
Recommending places for user with id: 1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m6/6[0m [32m━━━━━

[129727,
 528752,
 424824,
 676421,
 615371,
 755016,
 925873,
 140042,
 286657,
 152392]