In [53]:
import os
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [54]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import pandas as pd
from tensorflow.keras.layers import StringLookup

In [55]:
product = pd.read_csv('product3.csv')

user = pd.read_csv('user3.csv')

In [56]:
# Create TensorFlow datasets from the data
user = tf.data.Dataset.from_tensor_slices({
    "user_id": tf.strings.as_string(user['UserID'].values),  # Ensure user_id is string
    "product_title": user['Title'].values,
    "interaction_type": tf.cast(user['InteractionType'].values, tf.int32)
})

product = tf.data.Dataset.from_tensor_slices({
    "title": product['Title'].values,
    "ingredients": product['Ingredients'].values,
    "combined": product['Combined'].values
})

In [57]:
# Map function to format the user dataset
user = user.map(lambda x: {
    "user_id": tf.cast(x["user_id"], tf.string),  # Ensure user_id is cast to string
    "product_title": x["product_title"],
    "interaction_type": tf.cast(x["interaction_type"], tf.int32)
})

# Map function to format the product dataset
product = product.map(lambda x: {
    "title": x["title"],
    "ingredients": x["ingredients"],
    "combined": x["combined"]
})

In [66]:
# Shuffle and split the user dataset
shuffled_user = user.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# Split into train and test sets
train_user = shuffled_user.take(80_000)
test_user = shuffled_user.skip(80_000).take(20_000)

# Batch the product dataset
batched_product = product.batch(1_000)

# Extract unique movie/product titles
product_titles = batched_product.map(lambda x: x["title"])
product_titles_list = list(product_titles)  # Extract the list of product titles
unique_product_titles = np.unique(np.concatenate(product_titles_list))  # Concatenate and get unique titles

# Extract unique user IDs
user_ids = user.map(lambda x: x["user_id"]).batch(1_000_000)
user_ids_list = list(user_ids)  # Extract the list of user IDs
unique_user_ids = np.unique(np.concatenate(user_ids_list))  # Concatenate and get unique IDs

# Menampilkan contoh produk dan user
print(unique_product_titles[:10])
print(unique_user_ids[:10])

# Membuat pasangan data (user_id, product_title)
user_data = tf.data.Dataset.from_tensor_slices(unique_user_ids)
product_data = tf.data.Dataset.from_tensor_slices(unique_product_titles)

# Gabungkan kedua dataset (user, product) menjadi pasangan
dataset = tf.data.Dataset.zip((user_data, product_data))

# Lakukan batching dataset
batch_size = 4096
dataset = dataset.batch(batch_size)

[b'     rendang ayam kacang merah' b'    ayam penyet'
 b'    kering balado tempe kacang pedas manis'
 b'    lele goreng krenyes no amis' b'    orek tahu warna warni'
 b'   gulai ikan kakap merah khas padang' b'   pesmol ikan kerapu'
 b'   telur dadar kelapa parut' b'  soto bening daging sapi'
 b'  tahu brokoli pedas manis']
[b'1' b'10' b'11' b'12' b'13' b'14' b'15' b'16' b'17' b'18']


In [67]:
# Pastikan user_ids dan product_titles berupa string
unique_user_ids = unique_user_ids.astype(str)
unique_product_titles = unique_product_titles.astype(str)

# Buat vocabulary untuk user IDs dan product titles
user_ids_vocabulary = tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None)
product_titles_vocabulary = tf.keras.layers.StringLookup(vocabulary=unique_product_titles, mask_token=None)


In [121]:
# User Model
user_model = tf.keras.Sequential([
    user_ids_vocabulary,  # Map user IDs to integer indices
    tf.keras.layers.Embedding(
        user_ids_vocabulary.vocabulary_size(),
        256,
        embeddings_regularizer=tf.keras.regularizers.l2(1e-5)
    )
])

# Product Model
product_model = tf.keras.Sequential([
    product_titles_vocabulary,  # Map product titles to integer indices
    tf.keras.layers.Embedding(
        product_titles_vocabulary.vocabulary_size(),
        256,
        embeddings_regularizer=tf.keras.regularizers.l2(1e-5)
    )
])

task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(
        candidates=product.map(lambda x: x["title"]).batch(128).map(product_model)  # Kandidat untuk retrieval
    )
)

In [135]:
# Functional Model for User and Product
def create_product_retrieval_model(user_model: tf.keras.Model, product_model: tf.keras.Model, task: tfrs.tasks.Retrieval):
    # Input layers for user and product
    user_input = tf.keras.Input(shape=(), dtype=tf.string, name="user_id")
    product_input = tf.keras.Input(shape=(), dtype=tf.string, name="product_title")

    # Embedding layers for user and product
    user_embeddings = user_model(user_input)
    product_embeddings = product_model(product_input)

    # Calculate loss using task
    loss = task(user_embeddings, product_embeddings)

    # Define the model
    model = tf.keras.Model(inputs=[user_input, product_input], outputs=loss)
    return model

# Contoh penggunaan
model = create_product_retrieval_model(user_model, product_model, task)

In [136]:
# Create the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='factorized_top_k/top_5_categorical_accuracy',
    patience=500,
    restore_best_weights=True,
    verbose=1
)

model.summary

<bound method Model.summary of <tf_keras.src.engine.functional.Functional object at 0x7f4e9075bbf0>>

In [137]:
class LogEvery100Epochs(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % 100 == 0:  # Print every 100 epochs
            metrics = " - ".join([f"{k}: {v}" for k, v in logs.items()])
            print(f"Epoch {epoch + 1}/{self.params['epochs']} - {metrics}")

# Use the callback during training
history = model.fit(
    train_user.batch(4096),
    epochs=3000,
    callbacks=[LogEvery100Epochs(), early_stopping],
    verbose=0,
)

  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 100/3000 - loss: 0.018226128071546555 - factorized_top_k/top_1_categorical_accuracy: 0.04600000008940697 - factorized_top_k/top_5_categorical_accuracy: 0.148499995470047 - factorized_top_k/top_10_categorical_accuracy: 0.26249998807907104 - factorized_top_k/top_50_categorical_accuracy: 0.9415000081062317 - factorized_top_k/top_100_categorical_accuracy: 1.0
Epoch 200/3000 - loss: 0.0037649255245923996 - factorized_top_k/top_1_categorical_accuracy: 0.03799999877810478 - factorized_top_k/top_5_categorical_accuracy: 0.13899999856948853 - factorized_top_k/top_10_categorical_accuracy: 0.24650000035762787 - factorized_top_k/top_50_categorical_accuracy: 0.8774999976158142 - factorized_top_k/top_100_categorical_accuracy: 1.0
Epoch 300/3000 - loss: 0.0006202742224559188 - factorized_top_k/top_1_categorical_accuracy: 0.03500000014901161 - factorized_top_k/top_5_categorical_accuracy: 0.12549999356269836 - factorized_top_k/top_10_categorical_accuracy: 0.24300000071525574 - factorized_top_k/top

In [139]:
# Access the best accuracy from the early stopping callback
best_accuracy5 = history.history['factorized_top_k/top_5_categorical_accuracy'][early_stopping.best_epoch]
best_accuracy10 = history.history['factorized_top_k/top_10_categorical_accuracy'][early_stopping.best_epoch]
best_accuracy100 = history.history['factorized_top_k/top_100_categorical_accuracy'][early_stopping.best_epoch]
print(f"Best Top-5 Accuracy: {best_accuracy5:.4f}")
print(f"Best Top-10 Accuracy: {best_accuracy10:.4f}")
print(f"Best Top-100 Accuracy: {best_accuracy100:.4f}")


Best Top-5 Accuracy: 0.3610
Best Top-10 Accuracy: 0.3990
Best Top-100 Accuracy: 1.0000


In [145]:
user_id_input = "3"  # Replace with an actual user ID
k = 5  # The number of recommendations you want

# Create the input tensor for the user ID
user_input = tf.constant([user_id_input])

# Get user embeddings from the trained user model
user_embeddings = user_model(user_input)

# Get all product titles and convert them into tensor format
product_titles = product_titles_vocabulary.get_vocabulary()  # Get the list of product titles
product_titles_tensor = tf.constant(product_titles)  # Convert product titles to tensor format

# Use the product model to get product embeddings
product_embeddings = product_model(product_titles_tensor)

# Compute the cosine similarity or any other distance metric between the user and product embeddings
# In this case, we use cosine similarity for simplicity
similarity_scores = tf.linalg.matmul(user_embeddings, product_embeddings, transpose_b=True)

# Get the top K products
top_k_scores, top_k_indices = tf.math.top_k(similarity_scores, k=k)

# Get the actual product titles for the top K indices
top_k_product_titles = [product_titles[i] for i in top_k_indices.numpy()[0]]

print(f"Top {k} recommended products for user {user_id_input}:")
for title in top_k_product_titles:
    print(title)


Top 5 recommended products for user 3:
pecak ikan mujaer
nila goreng cabe ijo
ayam kecap bumbu giling
beef sausage teriyaki w  salad
sate buntel kambing


In [157]:
model.save('model.keras', save_format='tf')

NotImplementedError: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using `save_weights`.

In [153]:
model.save_weights('model_weights.h5')