# Retrieval stage
> "The retrieval stage is responsible for selecting an initial set of hundreds of candidates from all possible candidates. The main objective of this model is to efficiently weed out all candidates that the user is not interested in. Because the retrieval model may be dealing with millions of candidates, it has to be computationally efficient. - [tf](https://github.com/tensorflow/recommenders/blob/main/docs/examples/basic_retrieval.ipynb)

In [1]:
import os
import tempfile

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

from os.path import join

## Data load & process

In [2]:
dataset_parsed_path = "/home/jupyter/mlspec-blackfriday/dataset/parsed/202104130952/"
train_path = join(dataset_parsed_path, "train.csv")
test_path = join(dataset_parsed_path, "eval.csv")

df_train=pd.read_csv(train_path)
df_test=pd.read_csv(test_path)

# Treat al columns as strings
df_train = df_train.astype(str)
df_test = df_test.astype(str)

print(f"df_train shape:{df_train.shape}")
print(f"df_test shape:{df_test.shape}")

df_train shape:(78366, 12)
df_test shape:(156733, 12)


In [3]:
# Define used features

product_features = ["Product_ID"]

user_features = ["Gender", 
                 "Age", 
                 "Occupation", 
                 "City_Category", 
                 "Stay_In_Current_City_Years",
                 "Marital_Status"
                ]

# Remove columns not used
all_features = user_features + product_features

df_train = df_train[all_features]
df_test = df_test[all_features]

# Extract unique info
df = df_train.append(df_test)

product_unique_values = {
    feature: df[feature].unique() for feature in product_features
}

user_unique_values = {
    feature: df[feature].unique() for feature in user_features
}

In [4]:
print("Training set:")
df_train.head(3)

Training set:


Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_ID
0,M,26-35,14,B,2,1,P00235842
1,M,26-35,0,C,1,0,P00180442
2,M,36-45,2,A,2,0,P00303842


## Model build

In [5]:
class UserModel(tf.keras.Model):

    def __init__(self, unique_values:dict): # [Gender, [M, F]]
        super().__init__()
        
        self.user_features = {}
        for feature_name, unique_list in unique_values.items():
            feature_layer = tf.keras.Sequential([
                                tf.keras.layers.experimental.preprocessing.StringLookup(
                                    vocabulary=unique_list, mask_token=None),
                                    tf.keras.layers.Embedding(len(unique_list) + 1, 32),
                            ], feature_name )
            self.user_features[feature_name] = feature_layer
            


    def call(self, inputs):
        layers_stack = []
        for feature_name, feature_layer in self.user_features.items():
            print(f"Creating layer for feature {feature_name}")
            layer_valorized = feature_layer(inputs[feature_name])
            layers_stack.append(layer_valorized)
        return tf.concat(layers_stack, axis=1)

In [6]:
class ProductModel(tf.keras.Model):
    
    def __init__(self, product_unique_ids:np.ndarray):
        super().__init__()
        
        self.product_embedding = tf.keras.Sequential([
          tf.keras.layers.experimental.preprocessing.StringLookup(
              vocabulary=product_unique_ids, mask_token=None),
          tf.keras.layers.Embedding(len(product_unique_ids) + 1, 32)
        ])


    def call(self, products_id):
        return tf.concat([
            self.product_embedding(products_id)
        ], axis=1)

In [7]:
class BlackFridayModel(tfrs.models.Model): # note the main package is tfrs
    """
    Note:
    - no closure required
    """
    def __init__(self, 
                 user_model,
                 product_model,
                 topk_candidates: tf.data.Dataset,
                 user_unique_values: dict,
                 product_unique_ids: np.ndarray,
                ):
        super().__init__()
        self.product_embedder = product_model
        self.user_embedder = user_model
        
        self.user_features = user_unique_values.keys()
        self.query_model = tf.keras.Sequential([
                              user_model(user_unique_values),
                              tf.keras.layers.Dense(32)
                            ])
        self.candidate_model = tf.keras.Sequential([
                              product_model(product_unique_ids),
                              tf.keras.layers.Dense(32)
                            ])
        # See https://www.tensorflow.org/recommenders/api_docs/python/tfrs/tasks/Retrieval
        self.task = tfrs.tasks.Retrieval( # Loss function. Defaults to tf.keras.losses.CategoricalCrossentropy.
            metrics=tfrs.metrics.FactorizedTopK( # TODO are we forced to use batch? - REMOVE for training
                candidates=topk_candidates.batch(128).map(self.candidate_model), # dataset of candidate embeddings from which candidates should be retrieved (embedded)
            ),
        )
    def get_user_tower(self):
        return self.query_model
    
    def get_product_tower(self):
        return self.candidate_model
    
    def compute_loss(self, features, training=False):
        query_data = {feature_name: features[feature_name] for feature_name in self.user_features}
        query_embeddings = self.query_model(query_data)
        product_embeddings = self.candidate_model(features["Product_ID"])
        
        # Retrieval call: https://www.tensorflow.org/recommenders/api_docs/python/tfrs/tasks/Retrieval
        # "The task will try to maximize the affinity of these query, candidate pairs while minimizing 
        # the affinity between the query and candidates belonging to other queries in the batch."
        return self.task(query_embeddings=query_embeddings, 
                         candidate_embeddings=product_embeddings,
                         compute_metrics=True, # disable for better performances
                         candidate_ids = None
                        )

## Train

In [8]:
# Candidates to use for metrics
tf_unique_products = tf.data.Dataset.from_tensor_slices(product_unique_values["Product_ID"])
tf_unique_products.element_spec

TensorSpec(shape=(), dtype=tf.string, name=None)

In [9]:
model = BlackFridayModel(UserModel, ProductModel,
                        topk_candidates = tf_unique_products,
                        user_unique_values = user_unique_values,
                        product_unique_ids = product_unique_values["Product_ID"]
                        )

In [10]:
# Tf dataset
tf.random.set_seed(42)
train = tf.data.Dataset.from_tensor_slices(dict(df_train)) # [!] dict is important
train = train.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

test = tf.data.Dataset.from_tensor_slices(dict(df_test))

cached_train = train.shuffle(100_000).batch(2048) # TODO: double shuffle?
cached_test = test.batch(4096).cache()

In [11]:
# Create a callback that saves the model's weights
from datetime import datetime
run_id = datetime.today().strftime('%Y%m%d%H%M%S')
model_path = f"./models/{run_id}/"
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=model_path,
                                                 save_weights_only=False,
                                                 verbose=1)

In [112]:
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
model.fit(cached_train, epochs=1, callbacks=cp_callback)

Consider rewriting this model with the Functional API.
Creating layer for feature Gender
Creating layer for feature Age
Creating layer for feature Occupation
Creating layer for feature City_Category
Creating layer for feature Stay_In_Current_City_Years
Creating layer for feature Marital_Status
Consider rewriting this model with the Functional API.
Creating layer for feature Gender
Creating layer for feature Age
Creating layer for feature Occupation
Creating layer for feature City_Category
Creating layer for feature Stay_In_Current_City_Years
Creating layer for feature Marital_Status

Epoch 00001: saving model to ./models/20210416153408/


<tensorflow.python.keras.callbacks.History at 0x7fae8e1eb650>

## Prediction

In [113]:
# Specify query embedder:
index = tfrs.layers.factorized_top_k.BruteForce(model.get_user_tower())

# Create the vector space
index.index(tf_unique_products.batch(100).map(model.get_product_tower()), tf_unique_products)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7fae8e1fde50>

In [115]:
# Get recommendations.
input_data = {
    "Gender": tf.constant(["M"]), # <-- [!] require list inside constant
    "Age": tf.constant(["26-35"]),
    "Occupation": tf.constant(["0"]),
    "City_Category": tf.constant(["B"]),
    "Stay_In_Current_City_Years": tf.constant(["4+"]),
    "Marital_Status": tf.constant(["0"]),
}
index(input_data)
# _, products = index(tf.constant(["42"]))
# print(f"Recommendations for user 42: {titles[0, :3]}")

Consider rewriting this model with the Functional API.
Creating layer for feature Gender
Creating layer for feature Age
Creating layer for feature Occupation
Creating layer for feature City_Category
Creating layer for feature Stay_In_Current_City_Years
Creating layer for feature Marital_Status


(<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
 array([[0.83927035, 0.7979298 , 0.61289454, 0.56090105, 0.5586682 ,
         0.55229056, 0.5514853 , 0.5412506 , 0.5301919 , 0.52725685]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 10), dtype=string, numpy=
 array([[b'P00221142', b'P00222442', b'P00218242', b'P00321342',
         b'P00147642', b'P00077842', b'P00129142', b'P00229942',
         b'P00043642', b'P00037342']], dtype=object)>)

-----

In [None]:
class UserModel(tf.keras.Model):

    def __init__(self, unique_values:dict): # [Gender, [M, F]]
        super().__init__()
        
        self.user_features = {}
        for feature_name, unique_list in unique_values.items():
            feature_layer = tf.keras.Sequential([
                                tf.keras.layers.experimental.preprocessing.StringLookup(
                                    vocabulary=unique_list, mask_token=None),
                                    tf.keras.layers.Embedding(len(unique_list) + 1, 32),
                            ], feature_name )
            self.user_features[feature_name] = feature_layer
            


    def call(self, inputs):
        layers_stack = []
        for feature_name, feature_layer in self.user_features.items():
            print(f"Creating layer for feature {feature_name}")
            layer_valorized = feature_layer(inputs[feature_name])
            layers_stack.append(layer_valorized)
        return tf.concat(layers_stack, axis=1)

In [100]:
input_data = {
    "Gender": tf.constant("M"),
    "Age": tf.constant("26-35"),
    "Occupation": tf.constant("0"),
    "City_Category": tf.constant("B"),
    "Stay_In_Current_City_Years": tf.constant("4+"),
    "Marital_Status": tf.constant("0"),
}

model_user = model.get_user_tower()
model_user(input_data)
# model_user.summary() # [!] da errore

Consider rewriting this model with the Functional API.


TypeError: 'Sequential' object is not subscriptable

---
# Test: train with less inputs

In [88]:
class UserModelReduced(tf.keras.Model):
    
    def __init__(self, unique_values:dict):
        super().__init__()
        unique_list = unique_values["Gender"]
        feature_layer = tf.keras.Sequential([
                            tf.keras.layers.experimental.preprocessing.StringLookup(
                                                            vocabulary=unique_list, 
                                                            mask_token=None),
                            tf.keras.layers.Embedding(len(unique_list) + 1, 32),
                        ], 
#             feature_name
        )
        self.feature_layer = feature_layer
            


    def call(self, inputs):
        # https://github.com/tensorflow/recommenders/blob/main/docs/examples/featurization.ipynb
        return tf.concat(self.feature_layer["Gender"], 
                         axis=1)

In [89]:
model = BlackFridayModel(UserModelReduced, ProductModel,
                        topk_candidates = tf_unique_products,
                        user_unique_values = user_unique_values,
                        product_unique_ids = product_unique_values["Product_ID"]
                        )

In [90]:
# Create a callback that saves the model's weights
from datetime import datetime
run_id = datetime.today().strftime('%Y%m%d%H%M%S')

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=f"./models/{run_id}/",
                                                 save_weights_only=False,
                                                 verbose=1)

model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
model.fit(cached_train, epochs=1, callbacks=cp_callback)

Consider rewriting this model with the Functional API.


TypeError: in user code:

    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    <ipython-input-70-4796af393f39>:21 call  *
        axis=1)

    TypeError: 'Sequential' object is not subscriptable


In [None]:
# go deep
model_user = model.get_user_tower()
input_data = {
    "Gender": tf.constant("M"),
}
model_user(input_data)

## Store
**Note**
- We need to store 2 things:
    - The model (BlackFridayModel)
    - The "data embedded" vector space 