In [1]:
import os
import tempfile

import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [2]:
import pandas as pd
from os.path import join

dataset_parsed_path = "/home/jupyter/mlspec-blackfriday/dataset/parsed/202104130952/"
train_path = join(dataset_parsed_path, "train.csv")
test_path = join(dataset_parsed_path, "eval.csv")

df_train=pd.read_csv(train_path)
df_test=pd.read_csv(test_path)

print(f"df_train shape:{df_train.shape}")
print(f"df_test shape:{df_test.shape}")

df_train shape:(78366, 12)
df_test shape:(156733, 12)


In [3]:
df_train = df_train[["User_ID", "Gender", "Product_ID"]]
df_train["User_ID"] = df_train["User_ID"].astype(str)
df_train["Gender"] = df_train["Gender"].astype(str)
df_train["Product_ID"] = df_train["Product_ID"].astype(str)

print(df_train.dtypes)
df_train.head(3)

User_ID       object
Gender        object
Product_ID    object
dtype: object


Unnamed: 0,User_ID,Gender,Product_ID
0,1004732,M,P00235842
1,1003903,M,P00180442
2,1002340,M,P00303842


---

In [4]:
unique_gender = df_train["Gender"].unique()
type(unique_gender), unique_gender.dtype, len(unique_gender), unique_gender[0]

(numpy.ndarray, dtype('O'), 2, 'M')

In [5]:
unique_user_id = df_train["User_ID"].unique()
type(unique_user_id), unique_user_id.dtype, len(unique_user_id), unique_user_id[0]

(numpy.ndarray, dtype('O'), 5728, '1004732')

In [6]:
unique_product_id = df_train["Product_ID"].unique()
type(unique_product_id), unique_product_id.dtype, len(unique_product_id), unique_product_id[0]

(numpy.ndarray, dtype('O'), 3250, 'P00235842')

---

In [7]:
class UserModel(tf.keras.Model):
  
    def __init__(self):
        super().__init__()

        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_user_id, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_id) + 1, 32),
        ])

        self.gender_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                    vocabulary=unique_gender, mask_token=None),
            tf.keras.layers.Embedding(len(unique_gender) + 1, 32),
        ])


    def call(self, inputs):
        return tf.concat([
            self.user_embedding(inputs["User_ID"]),
            self.gender_embedding(inputs["Gender"]),
        ], axis=1)

In [8]:
class ProductModel(tf.keras.Model):
  
    def __init__(self):
        super().__init__()

        max_tokens = 10_000

        self.product_embedding = tf.keras.Sequential([
          tf.keras.layers.experimental.preprocessing.StringLookup(
              vocabulary=unique_product_id, mask_token=None),
          tf.keras.layers.Embedding(len(unique_product_id) + 1, 32)
        ])


    def call(self, products_id):
        return tf.concat([
            self.product_embedding(products_id)
        ], axis=1)

---

In [19]:
tf_products = tf.data.Dataset.from_tensor_slices(df_train["Product_ID"].astype(str).values)

In [10]:
class BlackFridayModel(tfrs.models.Model):

    def __init__(self):
        super().__init__()
        self.query_model = tf.keras.Sequential([
          UserModel(),
          tf.keras.layers.Dense(32)
        ])
        self.candidate_model = tf.keras.Sequential([
          ProductModel(),
          tf.keras.layers.Dense(32)
        ])
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=tf_products.batch(128).map(self.candidate_model), # dataset of candidate embeddings from which candidates should be retrieved (embedded)
            ),
        )

    def compute_loss(self, features, training=False):
        query_embeddings = self.query_model({
            "User_ID": features["User_ID"],
            "Gender": features["Gender"],
        })
        product_embeddings = self.candidate_model(features["Product_ID"])

        return self.task(query_embeddings, product_embeddings)

---

In [11]:
tf_dataset = tf.data.Dataset.from_tensor_slices(dict(df_train)) # dict missing!!!
tf_dataset.element_spec

{'User_ID': TensorSpec(shape=(), dtype=tf.string, name=None),
 'Gender': TensorSpec(shape=(), dtype=tf.string, name=None),
 'Product_ID': TensorSpec(shape=(), dtype=tf.string, name=None)}

In [12]:
tf.random.set_seed(42)
shuffled = tf_dataset.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [13]:
cached_train.element_spec, cached_test.element_spec

({'User_ID': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Gender': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Product_ID': TensorSpec(shape=(None,), dtype=tf.string, name=None)},
 {'User_ID': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Gender': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Product_ID': TensorSpec(shape=(None,), dtype=tf.string, name=None)})

In [14]:
model = BlackFridayModel()

In [15]:
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [16]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fbfb82c5710>

In [17]:
train_accuracy = model.evaluate(
    cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
    cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
    
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Consider rewriting this model with the Functional API.


  numdigits = int(np.log10(self.target)) + 1


OverflowError: cannot convert float infinity to integer

---

##### model.fit(cached_train, epochs=10)

In [None]:
train_accuracy = model.evaluate(
    cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
    cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
    
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")