In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

## Training a DLRM model with TensorFlow

In the previous notebooks, we downloaded the MovieLens data, converted it to parquet files, and then used the NVTabular library to process the data, join data frames, and create input features. In this notebook we will use the NVIDIA Merlin Models library to build and train a Deep Learning Recommendation Model [(DLRM)](https://arxiv.org/abs/1906.00091), an architecture originally proposed by Facebook in 2019.

Figure 1 illustrates the DLRM architecture. The model was introduced as a personalization deep learning model that uses embeddings to process sparse features that represent categorical data and a multilayer perceptron (MLP) to process dense features, then explicitly interacts these features using the statistical techniques proposed [here](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5694074).

![DLRM](images/DLRM.png)

## Import Libraries

In [2]:
import os
import glob
import nvtabular
import numpy as np

import merlin_models.tf as ml
from merlin_standard_lib import Schema, Tag

from nvtabular.loader.tensorflow import KerasSequenceLoader
import tensorflow as tf

In [3]:
import logging
# Silence log records at WARNING severity and below (WARNING, INFO and DEBUG)
# for every logger in the process; only ERROR and CRITICAL remain visible.
logging.disable(logging.WARNING) 

The Merlin Models library relies on a `schema` object to automatically build all the layers needed to represent, normalize, and aggregate input features. `schema.pbtxt` is a protobuf text file that contains feature metadata, including statistics such as cardinality and min and max values, and it also tags features based on their characteristics and dtypes (e.g., categorical, continuous, list, integer). To see its contents, uncomment the `!cat` line in the cell below.

We have already generated our `schema.pbtxt` file in the previous notebook using NVTabular. Now we read this schema file to create a `schema` object.

In [4]:
from merlin_standard_lib import Schema

# Resolve the schema location from the same OUTPUT_DIR environment variable
# (and the same default) that the parquet paths use, so relocating the data
# only requires setting one variable. With OUTPUT_DIR unset this evaluates to
# "/workspace/data/movielens/train/schema.pbtxt", exactly as before.
SCHEMA_PATH = os.path.join(
    os.environ.get("OUTPUT_DIR", "/workspace/data/movielens/"), "train", "schema.pbtxt"
)

# Parse the protobuf-text schema written by the NVTabular workflow into a
# Schema object that the model-building cells consume.
schema = Schema().from_proto_text(SCHEMA_PATH)

# Uncomment to print the raw schema file:
#!cat $SCHEMA_PATH

## Define the Input module

Below we define our input blocks using `ml.ContinuousFeatures` and `ml.EmbeddingFeatures`. Their `from_schema()` method processes the schema and creates the necessary layers to represent features and aggregate them.

In [5]:
# Dimension used for every categorical embedding table.
embedding_dim = 64

# Split the schema by feature type; each group feeds its own input branch.
con_schema = schema.select_by_tag(Tag.CONTINUOUS)
cat_schema = schema.select_by_tag(Tag.CATEGORICAL)

# Input branches of the DLRM:
#   * "continuous"  -> continuous features passed through an MLPBlock([128, 64])
#   * "categorical" -> one embedding per categorical feature
# NOTE(review): the last MLP size (64) presumably has to equal embedding_dim so
# that both branches can be stacked for the interaction step -- confirm.
top_block_inputs = {
    "continuous": ml.ContinuousFeatures.from_schema(con_schema).connect(
        ml.MLPBlock([128, 64])
    ),
    "categorical": ml.EmbeddingFeatures.from_schema(
        cat_schema, embedding_dim_default=embedding_dim
    ),
}

# Interaction step: stack the branch outputs, then apply DotProductInteraction.
dot_product = ml.TabularBlock(aggregation="stack").connect(ml.DotProductInteraction())

# Merge the branches, run the interaction while keeping the "continuous" branch
# as a shortcut, concatenate both, and feed the result to the top MLP.
top_block_outputs = (
    ml.merge(top_block_inputs)
    .connect_with_shortcut(
        dot_product,
        shortcut_filter=ml.Filter("continuous"),
        aggregation="concat",
    )
    .connect(ml.MLPBlock([128, 64]))
)

# Attach the binary classification head that targets the "rating" column.
model = top_block_outputs.connect(ml.BinaryClassificationTask("rating"))

2021-12-07 22:13:42.941979: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-07 22:13:44.040094: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16254 MB memory:  -> device: 0, name: Quadro GV100, pci bus id: 0000:15:00.0, compute capability: 7.0


In [6]:
#model

`merge` is going to return a dictionary of blocks for the categorical and continuous features.

If we remove the `cosine` aggregation, this will create a dictionary. We can use `ItemRetrievalTask`, and it will take care of the aggregation and negative sampling. The current default is in-batch negative sampling.

### Define Data Loader

We're ready to start training. We'll use the NVTabular `KerasSequenceLoader` for reading chunks of parquet files. `KerasSequenceLoader` manages shuffling by loading in chunks of data from different parts of the full dataset, concatenating them and then shuffling, then iterating through this super-chunk sequentially in batches. The number of "parts" of the dataset that get sampled, or "partitions", is controlled by the `parts_per_chunk` kwarg, while the size of each one of these parts is controlled by the `buffer_size` kwarg, which refers to a fraction of available GPU memory (you can read more about it [here](https://nvidia-merlin.github.io/NVTabular/main/training/tensorflow.html) and [here](https://nvidia-merlin.github.io/NVTabular/main/api/tensorflow_dataloader.html?highlight=kerassequence#nvtabular.loader.tensorflow.KerasSequenceLoader)). Using more chunks leads to better randomness, especially at the epoch level where physically disparate samples can be brought into the same batch, but can impact throughput if you use too many. In any case, the speed of the parquet reader makes feasible buffer sizes much larger.

In [7]:
# Feature columns handed to the data loader, grouped by type.
x_cat_names = ['userId', 'movieId', 'genres']            # categorical inputs
x_cont_names = ['TE_movieId_rating', 'userId_count']     # continuous inputs

def get_dataloader(paths_or_dataset, batch_size=4096):
    """Build a batched loader over parquet data with flattened labels.

    Args:
        paths_or_dataset: Forwarded unchanged as the first argument of
            ``KerasSequenceLoader`` (the notebook passes lists of parquet paths).
        batch_size: Number of rows per batch. Defaults to 4096.

    Returns:
        The result of ``KerasSequenceLoader(...).map(...)``: a loader whose
        batches are ``(X, y)`` pairs with ``y`` reshaped to one dimension.
    """

    def _flatten_labels(X, y):
        # Collapse the label tensor to shape (-1,); features pass through as-is.
        return X, tf.reshape(y, (-1,))

    loader = KerasSequenceLoader(
        paths_or_dataset,
        batch_size=batch_size,
        label_names=['rating'],
        cat_names=x_cat_names,
        cont_names=x_cont_names,
    )
    return loader.map(_flatten_labels)

In [8]:
# Root directory of the preprocessed MovieLens data; override it by setting the
# OUTPUT_DIR environment variable.
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "/workspace/data/movielens/")

# glob.glob() returns matches in file-system order, which is not guaranteed to
# be stable across machines. Sorting makes the file order reproducible when a
# split consists of more than one parquet part.
train_paths = sorted(glob.glob(os.path.join(OUTPUT_DIR, "train/*.parquet")))
eval_paths = sorted(glob.glob(os.path.join(OUTPUT_DIR, "valid/*.parquet")))

In [9]:
# Display the matched parquet files to confirm that both globs found data.
train_paths, eval_paths

(['/workspace/data/movielens/train/part_0.parquet'],
 ['/workspace/data/movielens/valid/part_0.parquet'])

In [10]:
# Configure training: Adam optimizer, graph execution (run_eagerly=False).
# No loss is passed here -- presumably the BinaryClassificationTask attached to
# the model supplies it; TODO confirm against the Merlin Models documentation.
model.compile(optimizer="adam", run_eagerly=False)

In [11]:
# Separator line used around the section titles printed below.
banner = '*' * 20

# ---- Training: a single epoch over the training split ----
print(banner)
print("Launch training")
print(banner + '\n')
train_loader = get_dataloader(train_paths)
# NOTE(review): Keras-style `fit` normally returns a History object rather than
# a list of losses; the variable name is kept in case later cells rely on it.
losses = model.fit(train_loader, epochs=1)

# Reset metric state so that evaluation does not include training batches.
model.reset_metrics()

# ---- Evaluation on the validation split ----
eval_loader = get_dataloader(eval_paths)
eval_metrics = model.evaluate(eval_loader, return_dict=True)
print(banner)
print("Eval results")
print('\n' + banner + '\n')
# Report metrics in alphabetical order of their names.
for key, value in sorted(eval_metrics.items()):
    print(" %s = %s" % (key, str(value)))

********************
Launch training
********************



TypeError: Error converting shape to a TensorShape: Dimension value must be integer or None or have an __index__ method, got value 'TensorShape([11139, 1])' with type '<class 'tensorflow.python.framework.tensor_shape.TensorShape'>'.