In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
import os

import cudf

import nvtabular as nvt

In [3]:
from merlin_models.tensorflow.models.retrieval import YouTubeDNN

In [4]:
INPUT_DATA_DIR = os.environ.get(
    "INPUT_DATA_DIR", os.path.expanduser("~/nvt-examples/end-to-end-poc/data/")
)

In [5]:
examples = cudf.read_parquet(os.path.join(INPUT_DATA_DIR, "grouped_examples.parquet"))
examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sampled_tag,timestamp,movieId,movieId_count,target_item,genre
userId,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[41673, 48558, 65151, 39605, 53617, 721, 35459...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",52,7361,"[6, 6, 3, 6, 17, 9, 16, 3, 9, 2, 3, 6, 10, 3, ..."
2,4071,"[52780, 50622, 50055, 64349, 21820, 62132, 397...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",124,2150,"[6, 6, 6, 6, 6, 9, 9, 7, 9, 15, 7, 9, 2, 6, 6,..."
3,7521,"[20978, 55603, 52055, 61112, 39917, 64877, 513...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",178,37729,"[6, 9, 16, 9, 2, 6, 7, 9, 17, 3, 4, 5, 6, 10, ..."
3,7688,"[36028, 71080, 28377, 50807, 52690, 39197, 467...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 40815, 36529, 45186...",9,4344,"[12, 12, 18, 7, 9, 18, 7, 9, 6, 9, 1, 1, 12, 2..."
3,8045,"[18451, 13153, 32067, 30064, 15745, 31465, 522...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",21,27773,"[7, 9, 17, 18, 7, 9, 18, 7, 9, 9, 7, 9, 9, 18,..."


In [6]:
# TODO: Build feature columns for the input features

## Hyper-parameters

In [7]:
BATCH_SIZE = 1024  # Batch Size
CATEGORICAL_COLUMNS = []  # Single-hot
CATEGORICAL_MH_COLUMNS = ["sampled_tag", "movieId", "genre"]  # Multi-hot
NUMERIC_COLUMNS = ["movieId_count"]

In [10]:
movie_workflow = nvt.Workflow.load(os.path.join(INPUT_DATA_DIR, "movie_features_workflow"))

In [12]:
EMBEDDING_TABLE_SHAPES, MH_EMBEDDING_TABLE_SHAPES = nvt.ops.get_embedding_sizes(movie_workflow)
EMBEDDING_TABLE_SHAPES.update(MH_EMBEDDING_TABLE_SHAPES)
EMBEDDING_TABLE_SHAPES

{'movieId': (45252, 512), 'genres': (21, 16), 'tags_unique': (73051, 512)}

In [None]:
# TODO: Figure out how to make column names align (maybe just hack it)

In [14]:
EMBEDDING_TABLE_SHAPES["sampled_tag"] = EMBEDDING_TABLE_SHAPES.pop('tags_unique', None)
EMBEDDING_TABLE_SHAPES

{'movieId': (45252, 512), 'genres': (21, 16), 'sampled_tag': (73051, 512)}

## DataLoader

In [None]:
# TODO: Construct a KSL DataLoader to load the data

In [15]:
import os
import tensorflow as tf

# we can control how much memory to give tensorflow with this environment variable
# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise
# TF will have claimed all free GPU memory
os.environ["TF_MEMORY_ALLOCATION"] = "0.7"  # fraction of free memory
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater

In [17]:
train_dataset_tf = KerasSequenceLoader(
    os.path.join(INPUT_DATA_DIR, "grouped_examples.parquet"),  
    batch_size=BATCH_SIZE,
    label_names=["target_item"],
    cat_names=CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS,
    cont_names=NUMERIC_COLUMNS,
    engine="parquet",
    shuffle=True,
    buffer_size=0.06,
    parts_per_chunk=1,
)

In [None]:
batch = next(iter(train_dataset_tf))
batch[0]

In [None]:
inputs = {}
emb_layers = []

for col in CATEGORICAL_COLUMNS:
    inputs[col] = tf.keras.Input(name=col, dtype=tf.int32, shape=(1,))

# Note that we need two input tensors for multi-hot categorical features
for col in CATEGORICAL_MH_COLUMNS:
    inputs[col + "__values"] = tf.keras.Input(name=f"{col}__values", dtype=tf.int64, shape=(1,))
    inputs[col + "__nnzs"] = tf.keras.Input(name=f"{col}__nnzs", dtype=tf.int64, shape=(1,))

In [None]:
# TODO: Remove embedding column creation here

for col in CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS:
    emb_layers.append(
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity(
                col, EMBEDDING_TABLE_SHAPES[col][0]
            ),  # Input dimension (vocab size)
            EMBEDDING_TABLE_SHAPES[col][1],  # Embedding output dimension
        )
    )
emb_layers

In [None]:
# TODO: Add optimizer and other training config

In [None]:
model = tf.keras.Model(inputs=inputs, outputs=x)
model.compile("sgd", "binary_crossentropy")

In [None]:
tf.keras.utils.plot_model(model)

In [None]:
# TODO: Train the model!

In [None]:
validation_callback = KerasSequenceValidater(valid_dataset_tf)

history = model.fit(train_dataset_tf, callbacks=[validation_callback], epochs=1)

In [None]:
MODEL_NAME_TF = os.environ.get("MODEL_NAME_TF", "movielens_tf")
MODEL_PATH_TEMP_TF = os.path.join(MODEL_BASE_DIR, MODEL_NAME_TF, "1/model.savedmodel")

model.save(MODEL_PATH_TEMP_TF)