In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ================================

## Building Recommender Systems Easily with Merlin Models

In this notebook, we build a Two-Tower model for the item retrieval task.

### Learning objectives
- Preparing the data with NVTabular
- Training and evaluating Two-Tower model with Merlin Models

### Feature Engineering with NVTabular

In [2]:
import os
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"
import cudf
import glob
import gc

import nvtabular as nvt
from nvtabular.ops import *
from example_utils import workflow_fit_transform

from merlin.schema.tags import Tags
from merlin.schema import Schema

import merlin.models.tf as mm
import merlin.models.tf.dataset as tf_dataloader

from merlin.io.dataset import Dataset
from merlin.schema.io.tensorflow_metadata import TensorflowMetadata
from merlin.models.tf.blocks.core.aggregation import CosineSimilarity

import tensorflow as tf

2022-03-16 19:41:40.198723: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:214] Using CUDA malloc Async allocator for GPU: 0
2022-03-16 19:41:40.198866: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16254 MB memory:  -> device: 0, name: Quadro GV100, pci bus id: 0000:15:00.0, compute capability: 7.0


First, we define our input and output paths.

In [3]:
# Root directories can be overridden through environment variables so the
# notebook also runs outside the default /workspace layout. With the variables
# unset, the three paths below are exactly the original hard-coded values.
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/workspace/data")
OUTPUT_DATA_DIR = os.environ.get("OUTPUT_DATA_DIR", "/workspace/retrieval/processed")

# Glob patterns matching the raw train / test parquet files.
train_path = os.path.join(INPUT_DATA_DIR, "train", "*.parquet")
test_path = os.path.join(INPUT_DATA_DIR, "test", "*.parquet")

# Joining with "" guarantees a trailing separator; later cells build paths by
# string concatenation (e.g. output_path + 'train/*.parquet') and rely on it.
output_path = os.path.join(OUTPUT_DATA_DIR, "")

ETL Workflow

We keep only positive interactions: rows where `click == 0` are removed from the dataset with the `Filter()` op.

In [4]:
# Select the raw columns and keep only positive interactions: the Filter op
# retains the rows whose `click` value equals 1.
inputs = (
    [
        "user_id", "item_id", "item_category", "item_shop", "item_brand",
        "user_shops", "user_profile", "user_group",
        "user_gender", "user_age", "user_consumption_2", "user_is_occupied",
        "user_geography", "user_intentions", "user_brands", "user_categories",
        "click",
    ]
    >> Filter(f=lambda interactions: interactions["click"] == 1)
)

# Identifier columns: tagged as user / item ids, then categorified.
user_id = inputs["user_id"] >> AddMetadata(tags=[Tags.USER_ID, Tags.USER]) >> Categorify()
item_id = inputs["item_id"] >> AddMetadata(tags=[Tags.ITEM_ID, Tags.ITEM]) >> Categorify()

# Remaining item-side columns, tagged as item features.
item_features = (
    inputs["item_category", "item_shop", "item_brand"]
    >> AddMetadata(tags=[Tags.ITEM])
    >> Categorify()
)

# Remaining user-side columns, tagged as user features.
user_features = (
    inputs[
        "user_shops", "user_profile", "user_group",
        "user_gender", "user_age", "user_consumption_2", "user_is_occupied",
        "user_geography", "user_intentions", "user_brands", "user_categories",
    ]
    >> AddMetadata(tags=[Tags.USER])
    >> Categorify()
)

# `click` is deliberately not part of the outputs; it is only used by the filter.
outputs = user_id + item_id + item_features + user_features

# Helper imported from example_utils; presumably fits the workflow and writes
# the transformed data under output_path (later cells read 'train/' and
# 'valid/' from there) -- TODO confirm against example_utils.
workflow_fit_transform(outputs, train_path, test_path, output_path, "workflow_retrieval")



## Building Two-Tower Model

We will use a Two-Tower model for the item retrieval task. Real-world large-scale recommender systems have hundreds of millions of items (products) and users. Thus, these systems are often composed of two stages: candidate generation (retrieval) and ranking (scoring the retrieved items). You can read more about two-stage recommender systems [here](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45530.pdf). In this example, we're going to focus on the retrieval stage.

A Two-Tower Model consists of item (candidate) and user (query) encoder towers. With two towers, the model can learn representations (embeddings) for queries and candidates separately. 

<img src="./images/Two-tower.png"  width="30%">

Image Adapted from: [Off-policy Learning in Two-stage Recommender Systems](https://dl.acm.org/doi/abs/10.1145/3366423.3380130)


We use the `schema` object to define our model.

In [5]:
# Load the schema stored alongside the processed training data and convert it
# to a Merlin schema. The directory is derived from `output_path` rather than
# repeating the literal path; the trailing "" keeps the final separator, so
# with the default output_path the argument is unchanged
# ('/workspace/retrieval/processed/train/').
schema = TensorflowMetadata.from_proto_text_file(
    os.path.join(output_path, 'train', '')
).to_merlin_schema()

In [6]:
# Restrict the schema to the columns tagged as item/user ids or item/user features.
schema = schema.select_by_tag([Tags.ITEM_ID, Tags.USER_ID, Tags.ITEM, Tags.USER])

In [7]:
# Display the column names that remain in the selected schema.
schema.column_names

['user_id',
 'item_id',
 'item_category',
 'item_shop',
 'item_brand',
 'user_shops',
 'user_profile',
 'user_group',
 'user_gender',
 'user_age',
 'user_consumption_2',
 'user_is_occupied',
 'user_geography',
 'user_intentions',
 'user_brands',
 'user_categories']

We expect the label names to be empty.

Now, let's build our Two-Tower model. In a nutshell, we aggregate all user features and feed them to the user tower, and we feed the item features to the item tower. Then we compute the positive score by multiplying the user embedding with the item embedding, and we sample negative items (read more about negative sampling [here](https://openreview.net/pdf?id=824xC-SgWgU) and [here](https://medium.com/mlearning-ai/overview-negative-sampling-on-recommendation-systems-230a051c6cd7)), whose item embeddings are also multiplied by the user embedding. Finally, we apply the loss function on top of the positive and negative scores.

In [8]:
# Two-Tower retrieval model defined from the selected schema.
model = mm.TwoTowerModel(
    schema,
    # Query-tower MLP with layer sizes [128, 64]; no activation is applied on
    # the top hidden layer.
    query_tower=mm.MLPBlock([128, 64], no_activation_last_layer=True),
    # Embedding dimensions are inferred from the feature cardinalities in the schema.
    embedding_options=mm.EmbeddingOptions(infer_embedding_sizes=True),
    # Negative sampling strategy used during training.
    samplers=[mm.InBatchSampler()],
    loss="categorical_crossentropy",
    # Top-10 retrieval metrics.
    metrics=[mm.RecallAt(10), mm.NDCGAt(10)],
)

Let's explain the parameters in the TwoTowerModel():
- no_activation_last_layer: when set True, no activation is used for top hidden layer. Learn more [here](https://storage.googleapis.com/pub-tools-public-publication-data/pdf/b9f4e78a8830fe5afcf2f0452862fb3c0d6584ea.pdf).
- infer_embedding_sizes: when set True, automatically defines the embedding dimension from the feature cardinality in the schema

**Metrics:**

The following information retrieval metrics are used to compute the Top-10 accuracy of recommendation lists containing all items:

- **Normalized Discounted Cumulative Gain (NDCG@10)**: NDCG accounts for the rank of the relevant item in the recommendation list and is a more fine-grained metric than hit rate (HR), which only verifies whether the relevant item is among the top-k items.

- **Recall@10**: Also known as HitRate@n when there is only one relevant item in the recommendation list. Recall just verifies whether the relevant item is among the top-n items.

We need to initialize the dataloaders.

In [9]:
# Show the glob pattern of the processed training files. os.path.join avoids
# the doubled separator that `output_path + '/train/*.parquet'` produces when
# output_path already ends with '/'.
os.path.join(output_path, 'train', '*.parquet')

'/workspace/retrieval/processed//train/*.parquet'

In [10]:
batch_size = 4096

# Paths are built with os.path.join so they are correct whether or not
# output_path ends with a separator (plain concatenation silently produced a
# wrong path without one). With the default output_path the resulting strings
# are identical to the concatenated ones.

# Training loader: batches are shuffled.
train_dl = tf_dataloader.BatchedDataset(
    Dataset(os.path.join(output_path, 'train', '*.parquet'), part_size="500MB", schema=schema),
    batch_size=batch_size,
    shuffle=True,
)

# Evaluation loader over the 'valid' split: order is kept (no shuffling).
test_dl = tf_dataloader.BatchedDataset(
    Dataset(os.path.join(output_path, 'valid', '*.parquet'), part_size="500MB", schema=schema),
    batch_size=batch_size,
    shuffle=False,
)

In [11]:
# Adagrad optimizer with a learning rate of 0.003.
opt = tf.keras.optimizers.Adagrad(learning_rate=0.003)
# Compile with the optimizer above; eager execution is explicitly disabled.
model.compile(opt, run_eagerly=False)

# Train for 2 epochs, validating on test_dl. As the last expression of the
# cell, the value returned by fit() is displayed as the cell output.
model.fit(train_dl, epochs=2, validation_data=test_dl)

2022-03-16 19:41:58.771434: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


Epoch 1/2
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


The sampler InBatchSampler returned no samples for this batch.




2022-03-16 19:42:18.646812: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: cond/then/_0/cond/cond/branch_executed/_168


Epoch 2/2


<keras.callbacks.History at 0x7f1f580394c0>

#### Save user tower

In [12]:
# Take the query (user) tower from the trained model's retrieval block and
# save it under the relative path 'query_tower'.
query_tower = model.retrieval_block.query_block()
query_tower.save('query_tower')



INFO:tensorflow:Assets written to: query_tower/assets


INFO:tensorflow:Assets written to: query_tower/assets


#### Extract and save user features

In [13]:
# We use the parquet files processed for the ranking task because they contain more rows: both click == 0 and click == 1 interactions (negatives and positives).

In [14]:
# Collect the user-tagged columns, keyed by the user id column, from the
# ranking dataset and materialize them with compute().
# NOTE(review): `_ensure_unique` is a private (underscore-prefixed) model
# method; presumably it returns one row per user id -- confirm.
# NOTE: this rebinds `user_features`, which earlier in the notebook referred
# to the NVTabular workflow node of the same name.
user_features = model._ensure_unique(
    Dataset('/workspace/ranking/processed/train/*.parquet', schema=schema),
    tag=Tags.USER,
    id_tag=Tags.USER_ID,
).compute()



In [15]:
# Preview the first rows of user_features.
user_features.head()

Unnamed: 0,user_id,user_shops,user_profile,user_group,user_gender,user_age,user_consumption_2,user_is_occupied,user_geography,user_intentions,user_brands,user_categories
0,0,0,1,5,2,2,2,1,0,0,0,0
1,1,109,0,0,0,0,0,0,0,69,131,9
2,2,301,1,1,1,1,1,1,2,57,4709,57
3,3,1876,23,7,2,3,1,1,1,5,63,3
4,4,534,1,2,1,2,1,1,0,40,22,108


In [16]:
# (rows, columns) of user_features.
user_features.shape

(294736, 12)

In [17]:
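# Saving is disabled by default; uncomment the next line to write the user features to a parquet file.
#user_features.to_parquet('user_features.parquet')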

#### Extract and save item features

In [18]:
# The dataset path must point to the processed parquet files.
# Collect the item-tagged columns, keyed by the item id column, and
# materialize them with compute().
# NOTE(review): `_ensure_unique` is a private (underscore-prefixed) model
# method; presumably it returns one row per item id -- confirm.
# NOTE: this rebinds `item_features`, which earlier in the notebook referred
# to the NVTabular workflow node of the same name.
item_features = model._ensure_unique(
    Dataset('/workspace/ranking/processed/train/*.parquet', schema=schema),
    tag=Tags.ITEM,
    id_tag=Tags.ITEM_ID,
).compute()



In [19]:
# Preview the first rows of item_features.
item_features.head()

Unnamed: 0,item_id,item_category,item_shop,item_brand
0,0,0,0,0
1,1,441,432,474
2,2,193,1159,125
3,3,3,1463,872
4,4,282,2479,555


In [20]:
# (rows, columns) of item_features.
item_features.shape

(3078306, 4)

In [21]:
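# Saving is disabled by default; uncomment the next line to write the item features to a parquet file.
#item_features.to_parquet('item_features.parquet')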

#### Extract and save item embeddings

In [22]:
# Compute embeddings for the rows of item_features, using a batch size of 1024.
item_embs = model.item_embeddings(Dataset(item_features, schema=schema), batch_size=1024)
# Materialize the result using the "synchronous" scheduler.
# NOTE(review): presumably `item_embs` is a lazy (Dask-backed) collection -- confirm.
item_embs_df = item_embs.compute(scheduler="synchronous")



INFO:tensorflow:Assets written to: /tmp/tmp91c6hze1/assets


INFO:tensorflow:Assets written to: /tmp/tmp91c6hze1/assets






In [23]:
# Display the item feature columns together with their embedding columns.
item_embs_df

Unnamed: 0,item_id,item_category,item_shop,item_brand,0,1,2,3,4,5,...,54,55,56,57,58,59,60,61,62,63
0,0,0,0,0,0.045441,0.059819,0.129809,-0.108830,-0.082256,-0.084359,...,0.137708,0.153404,0.070102,0.032150,-0.020099,0.059464,-0.155438,0.052263,-0.249170,0.128384
1,1,441,432,474,-0.062309,-0.175517,-0.118690,-0.251775,-0.009162,0.116605,...,-0.134853,0.039249,-0.058282,-0.070432,0.029771,0.018966,-0.180021,0.113205,-0.029373,0.245459
2,2,193,1159,125,0.054750,-0.138316,0.152897,0.015473,-0.073939,-0.012988,...,0.012525,-0.043029,0.075899,0.020771,0.127448,0.031545,-0.030108,-0.019186,-0.153289,0.185170
3,3,3,1463,872,-0.052466,-0.033155,0.109827,-0.064776,0.064255,0.123888,...,0.077336,-0.061820,0.053175,-0.014500,0.048533,0.173381,0.027293,-0.044810,0.018886,0.145906
4,4,282,2479,555,-0.104201,-0.153385,0.098410,-0.190748,-0.135593,0.215999,...,0.182969,-0.001762,0.004312,-0.048311,-0.001306,-0.010827,0.019868,0.079796,-0.360648,0.206935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078301,3078301,2378,75198,24666,0.005531,0.091458,0.001629,-0.168319,0.054186,-0.121674,...,-0.039214,-0.016966,0.181637,-0.019679,-0.059735,-0.024524,-0.017943,0.012964,-0.200432,0.177761
3078302,3078302,1604,366075,175177,0.036174,-0.090309,0.062095,0.003157,0.132412,0.111471,...,0.014639,0.056640,0.085006,0.060594,0.059827,0.025134,0.135456,0.080058,-0.003671,0.096373
3078303,3078303,253,206410,0,0.048918,-0.165730,0.042044,-0.066073,-0.123633,0.108967,...,-0.016723,-0.210796,0.117924,-0.006236,-0.106710,0.037314,-0.014166,-0.073964,-0.218900,0.067258
3078304,3078304,475,255840,335,-0.107561,-0.069512,0.126090,-0.283694,-0.005332,-0.080390,...,0.075199,0.189646,-0.063537,-0.027494,0.009694,0.104136,-0.179885,0.033065,-0.201278,0.072276


In [26]:
# Keep only the embedding columns. The leading columns of item_embs_df are the
# item feature columns that were fed in through `item_features`, so their count
# is used as the slice offset instead of a hard-coded 4 (the value it has for
# the current four item columns).
item_embeddings = item_embs_df.iloc[:, len(item_features.columns):]

In [27]:
# Preview the first rows of the embedding-only frame.
item_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.045441,0.059819,0.129809,-0.10883,-0.082256,-0.084359,-0.002202,0.11914,0.130306,-0.020843,...,0.137708,0.153404,0.070102,0.03215,-0.020099,0.059464,-0.155438,0.052263,-0.24917,0.128384
1,-0.062309,-0.175517,-0.11869,-0.251775,-0.009162,0.116605,-0.16252,0.042893,0.110591,0.105749,...,-0.134853,0.039249,-0.058282,-0.070432,0.029771,0.018966,-0.180021,0.113205,-0.029373,0.245459
2,0.05475,-0.138316,0.152897,0.015473,-0.073939,-0.012988,-0.059543,0.132076,0.021028,0.045453,...,0.012525,-0.043029,0.075899,0.020771,0.127448,0.031545,-0.030108,-0.019186,-0.153289,0.18517
3,-0.052466,-0.033155,0.109827,-0.064776,0.064255,0.123888,-0.098999,0.118428,-0.066744,0.043861,...,0.077336,-0.06182,0.053175,-0.0145,0.048533,0.173381,0.027293,-0.04481,0.018886,0.145906
4,-0.104201,-0.153385,0.09841,-0.190748,-0.135593,0.215999,-0.100042,-0.025396,-0.043621,-0.036547,...,0.182969,-0.001762,0.004312,-0.048311,-0.001306,-0.010827,0.019868,0.079796,-0.360648,0.206935


In [28]:
# (rows, embedding columns) of item_embeddings.
item_embeddings.shape

(3078306, 64)

In [25]:
# Saving is disabled by default; uncomment the next line to write the item embeddings to parquet.
#item_embeddings.to_parquet('item_embeddings')