In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
import os

import cudf
import rmm

import nvtabular as nvt

In [3]:
from merlin_models.tensorflow.models.retrieval import YouTubeDNN

In [4]:
# Reinitialize the RAPIDS memory manager with managed memory turned on.
# The final cell of the notebook turns it back off.
# NOTE(review): presumably this is so the large list columns loaded below can
# exceed physical GPU memory — confirm against the RMM documentation.
rmm.reinitialize(managed_memory=True)

In [5]:
# Location of the prepared example data. An INPUT_DATA_DIR environment variable
# takes precedence; without it, fall back to the default directory under the
# current user's home.
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR")
if INPUT_DATA_DIR is None:
    INPUT_DATA_DIR = os.path.expanduser("~/nvt-examples/end-to-end-poc/data/")

In [6]:
# Read the grouped training examples from INPUT_DATA_DIR into a cuDF DataFrame
# and preview the first rows. In the preview, rows are indexed by (userId, day)
# and several columns (sampled_tag, timestamp, movieId, genre) hold lists.
examples = cudf.read_parquet(os.path.join(INPUT_DATA_DIR, "grouped_examples.parquet"))
examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sampled_tag,timestamp,movieId,movieId_count,target_item,genre
userId,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[41673, 48558, 65151, 39605, 53617, 721, 35459...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",52,7361,"[6, 6, 3, 6, 17, 9, 16, 3, 9, 2, 3, 6, 10, 3, ..."
2,4071,"[52780, 50622, 50055, 64349, 21820, 62132, 397...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",124,2150,"[6, 6, 6, 6, 6, 9, 9, 7, 9, 15, 7, 9, 2, 6, 6,..."
3,7521,"[20978, 55603, 52055, 61112, 39917, 64877, 513...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",178,37729,"[6, 9, 16, 9, 2, 6, 7, 9, 17, 3, 4, 5, 6, 10, ..."
3,7688,"[36028, 71080, 28377, 50807, 52690, 39197, 467...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 40815, 36529, 45186...",9,4344,"[12, 12, 18, 7, 9, 18, 7, 9, 6, 9, 1, 1, 12, 2..."
3,8045,"[18451, 13153, 32067, 30064, 15745, 31465, 522...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",21,27773,"[7, 9, 17, 18, 7, 9, 18, 7, 9, 9, 7, 9, 9, 18,..."


In [7]:
# Largest value of the movieId_count column across all examples.
examples["movieId_count"].max()

1720

In [8]:
# Show the target_item column (used as the label by the data loader below)
# side by side with the per-example timestamp lists.
examples[["target_item", "timestamp"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,target_item,timestamp
userId,day,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4146,7361,"[1147868053, 1147868097, 1147868414, 114786846..."
2,4071,2150,"[1141415528, 1141415566, 1141415576, 114141558..."
3,7521,37729,"[1439472199, 1439472203, 1439472211, 143947221..."
3,7688,4344,"[1453904021, 1453904031, 1453904046, 145390404..."
3,8045,27773,"[1484753654, 1484753766, 1484753808, 148475384..."
...,...,...,...
162538,7513,6668,"[1438780751, 1438780754, 1438780759, 143878083..."
162539,2378,1676,"[995149720, 995149760, 995149788, 995149788, 9..."
162540,5315,44022,"[1248854959, 1248855584, 1248856442, 124885702..."
162540,5317,37720,"[1249028584, 1249028593, 1249029673, 1249029678]"


In [9]:
# TODO: Build feature columns for the input features

## Hyper-parameters

In [10]:
BATCH_SIZE = 16  # number of examples per batch handed to the data loader

# Categorical inputs, split by whether an example carries one value or a list.
CATEGORICAL_COLUMNS = []  # single-hot (none in this dataset)
CATEGORICAL_MH_COLUMNS = [
    "sampled_tag",
    "movieId",
    "genre",
]  # multi-hot

# Continuous inputs.
NUMERIC_COLUMNS = ["movieId_count"]

In [11]:
# Load the saved NVTabular workflow for the movie features from INPUT_DATA_DIR.
# It is only used below to look up embedding table shapes.
movie_workflow = nvt.Workflow.load(os.path.join(INPUT_DATA_DIR, "movie_features_workflow"))

In [12]:
# Ask NVTabular for the embedding table shapes associated with the workflow.
# The result is unpacked into two dicts; the second one (presumably the
# multi-hot columns — confirm against the NVTabular docs) is merged into the
# first in place, so a single dict maps every column name to its shape tuple.
EMBEDDING_TABLE_SHAPES, MH_EMBEDDING_TABLE_SHAPES = nvt.ops.get_embedding_sizes(movie_workflow)
EMBEDDING_TABLE_SHAPES.update(MH_EMBEDDING_TABLE_SHAPES)
EMBEDDING_TABLE_SHAPES

{'movieId': (45252, 512), 'genres': (21, 16), 'tags_unique': (73051, 512)}

In [13]:
# TODO: Figure out how to make column names align (maybe just hack it)

In [14]:
# The workflow names two columns differently from the grouped examples
# ("tags_unique" vs "sampled_tag", "genres" vs "genre"), so move each embedding
# shape over to the name used in the examples data.
#
# The rename is guarded on the old key still being present. An unconditional
# `pop(old_key, None)` would replace the already-renamed entry with None when
# this cell is executed a second time, and would insert None if the workflow
# never had the key in the first place.
for workflow_name, example_name in (("tags_unique", "sampled_tag"), ("genres", "genre")):
    if workflow_name in EMBEDDING_TABLE_SHAPES:
        EMBEDDING_TABLE_SHAPES[example_name] = EMBEDDING_TABLE_SHAPES.pop(workflow_name)

EMBEDDING_TABLE_SHAPES

{'movieId': (45252, 512), 'sampled_tag': (73051, 512), 'genre': (21, 16)}

## DataLoader

In [15]:
# NOTE(review): `os` is already imported in the first import cell; the repeat
# here is harmless.
import os
import tensorflow as tf

# we can control how much memory to give tensorflow with this environment variable
# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise
# TF will have claimed all free GPU memory
# NOTE(review): presumably the NVTabular loader import below is what reads this
# variable, which is why the assignment sits between the two imports — confirm
# before reordering these lines.
os.environ["TF_MEMORY_ALLOCATION"] = "0.7"  # fraction of free memory
# KerasSequenceValidater is only referenced by the commented-out validation
# callback in the training cell.
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater

In [16]:
# Build the training data loader directly over the grouped-examples parquet
# file. `target_item` is the label, the categorical columns (all multi-hot
# here, since CATEGORICAL_COLUMNS is empty) are passed as categorical features,
# and `movieId_count` is the only continuous feature.
# NOTE(review): the exact meaning of buffer_size and parts_per_chunk is defined
# by NVTabular's loader — check its documentation before tuning them.
train_dataset_tf = KerasSequenceLoader(
    os.path.join(INPUT_DATA_DIR, "grouped_examples.parquet"),  
    batch_size=BATCH_SIZE,
    label_names=["target_item"],
    cat_names=CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS,
    cont_names=NUMERIC_COLUMNS,
    engine="parquet",
    shuffle=True,
    buffer_size=0.25,
    parts_per_chunk=1,
)

In [17]:
# Pull a single batch from the loader to inspect what the model will receive.
# In the recorded output the first element is a dict of tensors in which a
# multi-hot column appears as separate entries, e.g. `genre__values` and
# `genre__nnzs`.
batch = next(iter(train_dataset_tf))
batch

({'genre__values': <tf.Tensor: shape=(1454, 1), dtype=int64, numpy=
  array([[ 6],
         [ 6],
         [ 3],
         ...,
         [10],
         [ 9],
         [ 2]])>,
  'genre__nnzs': <tf.Tensor: shape=(16, 1), dtype=int32, numpy=
  array([[102],
         [288],
         [377],
         [ 16],
         [ 49],
         [ 73],
         [153],
         [  1],
         [ 44],
         [ 12],
         [  5],
         [ 62],
         [ 45],
         [ 25],
         [202],
         [  0]], dtype=int32)>,
  'movieId__values': <tf.Tensor: shape=(664, 1), dtype=int64, numpy=
  array([[ 5952],
         [ 1653],
         [ 1250],
         [ 6539],
         [ 6377],
         [ 3448],
         [ 1088],
         [  899],
         [ 2161],
         [ 6711],
         [ 3949],
         [ 8360],
         [ 5878],
         [  306],
         [ 1175],
         [  307],
         [ 1237],
         [ 7327],
         [ 8154],
         [ 7234],
         [ 2843],
         [ 4144],
         [ 7365],
      

In [18]:
# One TensorFlow numeric feature column per continuous input.
continuous_cols = [tf.feature_column.numeric_column(name) for name in NUMERIC_COLUMNS]

continuous_cols

[NumericColumn(key='movieId_count', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [19]:
# Map every column to its latent dimension, i.e. the second entry of the
# column's embedding table shape tuple.
embedding_dims = {name: shape[1] for name, shape in EMBEDDING_TABLE_SHAPES.items()}

In [20]:
# One identity categorical column per categorical input. The bucket count is
# the cardinality, i.e. the first entry of the column's embedding table shape.
categorical_cols = [
    tf.feature_column.categorical_column_with_identity(
        name, EMBEDDING_TABLE_SHAPES[name][0]
    )
    for name in CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS
]

categorical_cols

[IdentityCategoricalColumn(key='sampled_tag', number_buckets=73051, default_value=None),
 IdentityCategoricalColumn(key='movieId', number_buckets=45252, default_value=None),
 IdentityCategoricalColumn(key='genre', number_buckets=21, default_value=None)]

In [21]:
# Instantiate the retrieval model from the continuous and categorical feature
# columns, the per-column embedding dimensions, and hidden_dims of 512, 256
# and 128.
model = YouTubeDNN(continuous_cols, categorical_cols, embedding_dims=embedding_dims, hidden_dims=[512,256,128])

In [22]:
# TODO: Add optimizer and other training config

In [23]:
# Build the input layer explicitly (presumably so that its embedding tables
# exist before training — confirm against the input layer implementation), then
# take the movieId table to use as the class weights of the loss below.
model.input_layer.build({})
item_embeddings = model.input_layer.embedding_tables["movieId"]


def sampled_softmax_loss(y_true, y_pred):
    """Sampled softmax loss of the model output against the movieId embeddings.

    Args:
        y_true: Labels, forwarded unchanged as ``labels``.
        y_pred: Model output, forwarded unchanged as ``inputs``.

    Returns:
        The result of ``tf.nn.sampled_softmax_loss`` using the movieId
        embedding table as weights, all-zero biases, 20 sampled classes, and
        one class per row of the embedding table.
    """
    num_items = item_embeddings.shape[0]
    zero_biases = tf.zeros((num_items,))
    return tf.nn.sampled_softmax_loss(
        weights=item_embeddings,
        biases=zero_biases,
        labels=y_true,
        inputs=y_pred,
        num_sampled=20,
        num_classes=num_items,
    )


model.compile("sgd", sampled_softmax_loss)

In [24]:
# Render a diagram of the model. As the recorded output shows, this needs
# `pydot` and graphviz to be installed; without them only the import-failure
# message is produced.
tf.keras.utils.plot_model(model)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [25]:
# TODO: Train the model!

In [27]:
# Validation callback left disabled: `valid_dataset_tf` is not defined in any
# cell above.
# validation_callback = KerasSequenceValidater(valid_dataset_tf)

# Train for one epoch over the training loader with no callbacks.
# NOTE(review): the recorded run fails here with "The last dimension of the
# inputs to `Dense` should be defined. Found `None`.", raised while a layer is
# being built inside YouTubeDNN.call. The cause has to be confirmed in the
# model code before this cell can complete.
history = model.fit(train_dataset_tf, callbacks=[], epochs=1)

ValueError: in user code:

    /home/karl/miniconda3/envs/nvtabular_dev_11.0/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /home/karl/Projects/nvidia/models/merlin_models/tensorflow/models/retrieval/youtube_dnn.py:55 call  *
        x = layer(x)
    /home/karl/miniconda3/envs/nvtabular_dev_11.0/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1008 __call__  **
        self._maybe_build(inputs)
    /home/karl/miniconda3/envs/nvtabular_dev_11.0/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:2710 _maybe_build
        self.build(input_shapes)  # pylint:disable=not-callable
    /home/karl/miniconda3/envs/nvtabular_dev_11.0/lib/python3.8/site-packages/tensorflow/python/keras/layers/core.py:1182 build
        raise ValueError('The last dimension of the inputs to `Dense` '

    ValueError: The last dimension of the inputs to `Dense` should be defined. Found `None`.


In [None]:
# Root directory for exported models. No earlier cell defines MODEL_BASE_DIR,
# so it is resolved here: the MODEL_BASE_DIR environment variable wins,
# otherwise a default next to the example data directory is used.
MODEL_BASE_DIR = os.environ.get(
    "MODEL_BASE_DIR", os.path.expanduser("~/nvt-examples/end-to-end-poc/models/")
)
MODEL_NAME_TF = os.environ.get("MODEL_NAME_TF", "movielens_tf")
# Resulting layout: <MODEL_BASE_DIR>/<MODEL_NAME_TF>/1/model.savedmodel
MODEL_PATH_TEMP_TF = os.path.join(MODEL_BASE_DIR, MODEL_NAME_TF, "1/model.savedmodel")

model.save(MODEL_PATH_TEMP_TF)

In [None]:
# Reinitialize the RAPIDS memory manager with managed memory turned off again
# (it was turned on near the top of the notebook).
rmm.reinitialize(managed_memory=False)