In [1]:
import os
import tempfile
from datetime import datetime
from os.path import join

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

## Data load & process

In [2]:
# Directory holding the parsed dataset; it contains train.csv and eval.csv.
dataset_parsed_path = "/home/jupyter/mlspec-blackfriday/dataset/parsed/202104130952/"
train_path = join(dataset_parsed_path, "train.csv")
test_path = join(dataset_parsed_path, "eval.csv")

# Read each split and cast all columns to strings in a single chain.
df_train = pd.read_csv(train_path).astype(str)
df_test = pd.read_csv(test_path).astype(str)

# Quick sanity check on the size of each split.
print(f"df_train shape:{df_train.shape}")
print(f"df_test shape:{df_test.shape}")

df_train shape:(78366, 12)
df_test shape:(156733, 12)


In [3]:
# Extract user info
# Stack both splits so the unique-value tables built below include values
# that appear in either of them.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; like append, it keeps the original
# row index of each frame.
df = pd.concat([df_train, df_test])

# Column identifying the product (candidate side).
product_features = ["Product_ID"]

# Categorical columns describing the user (query side).
user_features = ["Gender", 
                 "Age", 
                 "Occupation", 
                 "City_Category", 
                 "Stay_In_Current_City_Years",
                 "Marital_Status"
                ]

# feature name -> array of the distinct values of that column
product_unique_values = {
    feature: df[feature].unique() for feature in product_features
}

# feature name -> array of the distinct values of that column
user_unique_values = {
    feature: df[feature].unique() for feature in user_features
}

In [4]:
# Keep only the columns that are used: the user features plus the product id.
all_features = [*user_features, *product_features]

df_train = df_train.loc[:, all_features]
df_test = df_test.loc[:, all_features]

## Model build

**Note**


In [5]:
class UserModel(tf.keras.Model):
    """Query-side model: one embedding per categorical user feature.

    Notes:
    - Parametric user features
    - No closure used

    Args:
        unique_values: maps each feature name to the collection of distinct
            string values used as that feature's lookup vocabulary.
    """
    def __init__(self, unique_values:dict):
        super().__init__()

        # feature name -> Sequential(StringLookup, Embedding)
        self.user_features = {}
        for name, vocabulary in unique_values.items():
            lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=vocabulary, mask_token=None)
            # One more row than the vocabulary has entries; presumably this
            # reserves an index for out-of-vocabulary values — TODO confirm.
            embedding = tf.keras.layers.Embedding(len(vocabulary) + 1, 32)
            self.user_features[name] = tf.keras.Sequential(
                [lookup, embedding], name=name)

    def call(self, inputs):
        """Embed every configured feature and concatenate the results.

        Args:
            inputs: mapping indexed by feature name; must provide an entry
                for every feature this model was built with.

        Returns:
            The per-feature embeddings concatenated along axis 1.
        """
        embedded_features = []
        for feature_name, feature_layer in self.user_features.items():
            # NOTE(review): this prints on every Python-level execution of
            # `call`; the layers themselves are created in __init__.
            print(f"Creating layer for feature {feature_name}")
            embedded_features.append(feature_layer(inputs[feature_name]))
        return tf.concat(embedded_features, axis=1)

In [6]:
class ProductModel(tf.keras.Model):
    """Candidate-side model: maps a product id string to a 32-wide embedding.

    Args:
        product_unique_ids: distinct product ids used as the lookup vocabulary.
    """

    def __init__(self, product_unique_ids:np.ndarray):
        super().__init__()

        id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=product_unique_ids, mask_token=None)
        # One more row than there are ids; presumably this reserves an index
        # for out-of-vocabulary ids — TODO confirm.
        id_embedding = tf.keras.layers.Embedding(len(product_unique_ids) + 1, 32)
        self.product_embedding = tf.keras.Sequential([id_lookup, id_embedding])

    def call(self, products_id):
        """Return the embedding of each product id in ``products_id``."""
        embedded_ids = self.product_embedding(products_id)
        # Single-element concat, kept so further product features can be
        # appended to the list later.
        return tf.concat([embedded_ids], axis=1)

In [53]:
class BlackFridayModel(tfrs.models.Model): # note the main package is tfrs
    """Retrieval model pairing a user-feature query tower with a product tower.

    Note:
    - no closure required

    Args:
        topk_candidates: dataset of raw product ids; it is batched and mapped
            through the candidate model to feed the FactorizedTopK metric.
        user_unique_values: feature name -> distinct values, forwarded to
            ``UserModel``; its keys define which features the query tower reads.
        product_unique_ids: distinct product ids, forwarded to ``ProductModel``.
    """
    def __init__(self, 
                 topk_candidates: tf.data.Dataset,
                 user_unique_values: dict,
                 product_unique_ids: np.ndarray,
                ):
        super().__init__()
        
        # Snapshot the feature names: a bare dict_keys view would keep
        # following later mutations of the caller's dict.
        self.user_features = tuple(user_unique_values.keys())
        self.query_model = tf.keras.Sequential([
                              UserModel(user_unique_values),
                              tf.keras.layers.Dense(32)
                            ])
        self.candidate_model = tf.keras.Sequential([
                              ProductModel(product_unique_ids),
                              tf.keras.layers.Dense(32)
                            ])
        # See https://www.tensorflow.org/recommenders/api_docs/python/tfrs/tasks/Retrieval
        self.task = tfrs.tasks.Retrieval( # Loss function. Defaults to tf.keras.losses.CategoricalCrossentropy.
            metrics=tfrs.metrics.FactorizedTopK( # TODO are we forced to use batch? - REMOVE for training
                candidates=topk_candidates.batch(128).map(self.candidate_model), # dataset of candidate embeddings from which candidates should be retrieved (embedded)
            ),
        )

    def _embed_query(self, features):
        """Select the user features from ``features`` and embed them.

        Returns:
            Tuple ``(query_data, query_embeddings)``: the dict of selected
            user features and the query model's output for it.
        """
        query_data = {feature_name: features[feature_name] for feature_name in self.user_features}
        return query_data, self.query_model(query_data)

    def compute_loss(self, features, training=False):
        """Compute the retrieval loss for one batch.

        Args:
            features: mapping holding every user feature plus "Product_ID".
            training: accepted for interface compatibility; not used here.

        Returns:
            The value returned by the retrieval task for this batch.
        """
        _, query_embeddings = self._embed_query(features)
        product_embeddings = self.candidate_model(features["Product_ID"])
        
        # Retrieval call: https://www.tensorflow.org/recommenders/api_docs/python/tfrs/tasks/Retrieval
        # "The task will try to maximize the affinity of these query, candidate pairs while minimizing 
        # the affinity between the query and candidates belonging to other queries in the batch."
        return self.task(query_embeddings=query_embeddings, 
                         candidate_embeddings=product_embeddings,
                         compute_metrics=True, # disable for better performances
                         candidate_ids = None
                        )

    def call(self, features):
        """Embed the user features of ``features``.

        The task built in ``__init__`` is returned as-is. The previous
        implementation rebuilt it here from ``topk_candidates``, a name that
        only exists as an ``__init__`` parameter, so calling the model raised
        NameError unless a global with that name happened to exist.

        Returns:
            Tuple ``(query_data, query_embeddings, task)``.
        """
        query_data, query_embeddings = self._embed_query(features)
        return query_data, query_embeddings, self.task

## Train

In [54]:
# Candidates to use for metrics: a dataset built from the unique product ids.
tf_unique_products = tf.data.Dataset.from_tensor_slices(
    product_unique_values["Product_ID"]
)
tf_unique_products.element_spec

TensorSpec(shape=(), dtype=tf.string, name=None)

In [55]:
# Instantiate the retrieval model from the candidate dataset and the
# per-feature vocabularies.
model = BlackFridayModel(
    topk_candidates=tf_unique_products,
    user_unique_values=user_unique_values,
    product_unique_ids=product_unique_values["Product_ID"],
)

In [56]:
# Tf dataset
tf.random.set_seed(42)

# dict(...) is important: it turns the frame into a column-name -> values
# mapping, so each dataset element is a dict of features.
train = (
    tf.data.Dataset.from_tensor_slices(dict(df_train))
    .shuffle(100_000, seed=42, reshuffle_each_iteration=False)
)
test = tf.data.Dataset.from_tensor_slices(dict(df_test))

# TODO: double shuffle? `train` is already shuffled once above.
# NOTE(review): despite its name, `cached_train` has no .cache() call.
cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [57]:
# Inspect the batched element structure of both pipelines (feature name -> TensorSpec).
cached_train.element_spec, cached_test.element_spec

({'Gender': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Age': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Occupation': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'City_Category': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Stay_In_Current_City_Years': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Marital_Status': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Product_ID': TensorSpec(shape=(None,), dtype=tf.string, name=None)},
 {'Gender': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Age': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Occupation': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'City_Category': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Stay_In_Current_City_Years': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Marital_Status': TensorSpec(shape=(None,), dtype=tf.string, name=None),
  'Product_ID': TensorSpec(shape=(None,), dtype=tf.s

In [None]:
# Create a callback that saves the model's weights (weights only, not the
# full model). `datetime` comes from the notebook's top import cell.
# The run id is the timestamp taken when this cell runs, so every run writes
# to its own directory under ./models/.
run_id = datetime.today().strftime('%Y%m%d%H%M%S')

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=f"./models/{run_id}/",
    save_weights_only=True,
    verbose=1,
)

In [71]:
# Single-epoch training run with the checkpoint callback attached.
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model.fit(cached_train, epochs=1, callbacks=[cp_callback])

Consider rewriting this model with the Functional API.
Creating layer for feature Gender
Creating layer for feature Age
Creating layer for feature Occupation
Creating layer for feature City_Category
Creating layer for feature Stay_In_Current_City_Years
Creating layer for feature Marital_Status
Consider rewriting this model with the Functional API.
Creating layer for feature Gender
Creating layer for feature Age
Creating layer for feature Occupation
Creating layer for feature City_Category
Creating layer for feature Stay_In_Current_City_Years
Creating layer for feature Marital_Status

Epoch 00001: saving model to ./


<tensorflow.python.keras.callbacks.History at 0x7f5c896f3890>

In [48]:
# Evaluate on the eval split; return_dict=True makes this the whole metrics
# dict, not a single accuracy value, despite the variable name.
test_accuracy = model.evaluate(cached_test, return_dict=True)



In [13]:
# Top-100 retrieval accuracy on both splits.
top100_key = "factorized_top_k/top_100_categorical_accuracy"
train_accuracy = model.evaluate(cached_train, return_dict=True)[top100_key]
test_accuracy = model.evaluate(cached_test, return_dict=True)[top100_key]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Consider rewriting this model with the Functional API.
Creating layer for feature Gender
Creating layer for feature Age
Creating layer for feature Occupation
Creating layer for feature City_Category
Creating layer for feature Stay_In_Current_City_Years
Creating layer for feature Marital_Status
Top-100 accuracy (train): 0.08.
Top-100 accuracy (test): 0.06.


---
## Train more epochs

### 10 epochs

In [128]:
# NOTE(review): `model` is not re-instantiated here, so these 10 epochs
# presumably continue from the weights left by earlier cells — confirm
# that this is intended.
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model.fit(cached_train, epochs=10)

# Top-100 retrieval accuracy on both splits after the extra epochs.
top100_key = "factorized_top_k/top_100_categorical_accuracy"
train_accuracy = model.evaluate(cached_train, return_dict=True)[top100_key]
test_accuracy = model.evaluate(cached_test, return_dict=True)[top100_key]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Epoch 1/10
Consider rewriting this model with the Functional API.
Creating layer for feature User_ID
Creating layer for feature Gender
Creating layer for feature Age
Creating layer for feature Occupation
Creating layer for feature City_Category
Creating layer for feature Stay_In_Current_City_Years
Creating layer for feature Marital_Status
Consider rewriting this model with the Functional API.
Creating layer for feature User_ID
Creating layer for feature Gender
Creating layer for feature Age
Creating layer for feature Occupation
Creating layer for feature City_Category
Creating layer for feature Stay_In_Current_City_Years
Creating layer for feature Marital_Status
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Consider rewriting this model with the Functional API.
Creating layer for feature User_ID
Creating layer for feature Gender
Creating layer for feature Age
Creating layer for feature Occupation
Creating layer for feature City_Cate

### 30 epochs

In [129]:
# NOTE(review): `model` is not re-instantiated here, so these 30 epochs
# presumably continue from the weights left by earlier cells — confirm
# that this is intended.
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model.fit(cached_train, epochs=30)

# Top-100 retrieval accuracy on both splits after the extra epochs.
top100_key = "factorized_top_k/top_100_categorical_accuracy"
train_accuracy = model.evaluate(cached_train, return_dict=True)[top100_key]
test_accuracy = model.evaluate(cached_test, return_dict=True)[top100_key]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Epoch 1/30
Consider rewriting this model with the Functional API.
Creating layer for feature User_ID
Creating layer for feature Gender
Creating layer for feature Age
Creating layer for feature Occupation
Creating layer for feature City_Category
Creating layer for feature Stay_In_Current_City_Years
Creating layer for feature Marital_Status
Consider rewriting this model with the Functional API.
Creating layer for feature User_ID
Creating layer for feature Gender
Creating layer for feature Age
Creating layer for feature Occupation
Creating layer for feature City_Category
Creating layer for feature Stay_In_Current_City_Years
Creating layer for feature Marital_Status
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
E

---
- TODO: check out Netron (`pip install netron`) for visualizing the model.