In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
import os

import cudf
import rmm

import nvtabular as nvt

In [3]:
from merlin_models.tensorflow.models.retrieval import YouTubeDNN

In [4]:
# Re-initialize the RMM allocator with managed memory switched on before any
# data is read onto the GPU.
# NOTE(review): presumably this lets allocations exceed physical GPU memory
# via CUDA managed (unified) memory -- confirm against the RMM documentation.
rmm.reinitialize(managed_memory=True)

In [5]:
def path_from_env(var_name, default):
    """Return the directory configured in environment variable ``var_name``.

    Falls back to ``default`` when the variable is not set. A leading ``~`` is
    expanded on whichever value ends up being used, so an override such as
    ``INPUT_DATA_DIR=~/datasets`` resolves to the user's home directory instead
    of a literal ``~`` folder.

    Args:
        var_name: Name of the environment variable to read.
        default: Path to use when the variable is not set.

    Returns:
        The configured path as a string, with any leading ``~`` expanded.
    """
    return os.path.expanduser(os.environ.get(var_name, default))


# Locations of the input data, the exported models and the TensorBoard logs.
# Each one can be overridden through the environment variable of the same name.
INPUT_DATA_DIR = path_from_env("INPUT_DATA_DIR", "./data/")
MODEL_BASE_DIR = path_from_env("MODEL_BASE_DIR", "./models/")
MODEL_LOG_DIR = path_from_env("MODEL_LOG_DIR", "./logs/")

In [6]:
# Load the ranking training set with cuDF and preview its first rows to see
# which columns are available.
ranking_training_path = os.path.join(INPUT_DATA_DIR, "ranking_training.parquet")
examples = cudf.read_parquet(ranking_training_path)
examples.head()

Unnamed: 0,user_id,day,user_search_terms,user_genres,user_timestamps,user_movie_ids,user_movie_id_count,movie_tags_unique,movie_genres,movie_tags_nunique,movie_id,label
55102,25690,2383,"[40621, 64668]","[9, 6, 9]","[995542104, 995542222]","[3314, 3471]",2,"[1423, 2817, 3607, 4050, 5274, 6224, 8530, 123...","[9, 15, 18]",98,2621,1
673186,85254,2855,"[40952, 816, 57559, 48023, 52666, 18291, 2816,...","[6, 16, 4, 9, 10, 9, 2, 3, 6, 9, 16, 19, 6, 19...","[1036287980, 1036288019, 1036288083, 103628815...","[5192, 4768, 5723, 5685, 5688, 5690, 5691, 5634]",8,"[457, 472, 3300, 3428, 4494, 11716, 13596, 184...","[7, 9]",38,4274,0
116216,45463,4900,[60889],"[9, 18]",[1212986201],[3849],1,"[19263, 22052, 23714, 34063, 35238, 37615, 391...","[8, 14]",21,12353,1
604409,55998,7937,[45296],"[7, 9]",[1475418791],[315],1,"[256, 2817, 3281, 3284, 3965, 5048, 6191, 9537...","[9, 16]",119,1641,0
11111,464,7627,"[60020, 31525, 968, 2435, 57898, 52589, 36831,...","[3, 10, 3, 10, 2, 3, 17, 9, 11, 2, 3, 3, 6, 9,...","[1448668575, 1448668634, 1448668636, 144866863...","[4888, 5841, 258, 13358, 1169, 16830, 11446, 1...",15,"[379, 389, 685, 721, 1136, 1266, 1916, 1964, 2...","[2, 3, 10]",92,2026,1


## Hyper-parameters

In [7]:
# Number of examples per training batch.
BATCH_SIZE = 512

# Categorical features holding a single value per example (single-hot).
CATEGORICAL_COLUMNS = ["movie_id"]

# Categorical features holding a list of values per example (multi-hot).
CATEGORICAL_MH_COLUMNS = [
    "movie_tags_unique",
    "movie_genres",
    "user_search_terms",
    "user_genres",
    "user_movie_ids",
]

# Continuous (numeric) features.
NUMERIC_COLUMNS = [
    "user_movie_id_count",
    "movie_tags_nunique",
]

In [8]:
# Reload the NVTabular workflow saved for the movie features; it is used next
# to look up the embedding table sizes.
movie_workflow = nvt.Workflow.load(
    os.path.join(INPUT_DATA_DIR, "movie_features_workflow")
)

In [9]:
# The call result is unpacked as two dicts (single-hot tables, multi-hot
# tables); merge the multi-hot entries into the first dict so every table
# shape can be looked up in one place.
embedding_sizes = nvt.ops.get_embedding_sizes(movie_workflow)
EMBEDDING_TABLE_SHAPES, MH_EMBEDDING_TABLE_SHAPES = embedding_sizes
EMBEDDING_TABLE_SHAPES.update(MH_EMBEDDING_TABLE_SHAPES)
EMBEDDING_TABLE_SHAPES

{'movie_id': (62424, 512), 'tags_unique': (73051, 512), 'genres': (21, 16)}

In [10]:
# Several feature columns reuse the shape of a table computed by the movie
# workflow. Each key is a source table; its value lists the feature columns
# that take the same (cardinality, embedding dim) shape.
SHARED_EMBEDDING_TABLES = {
    "movie_id": ("user_movie_ids",),
    "tags_unique": ("movie_tags_unique", "user_search_terms"),
    "genres": ("movie_genres", "user_genres"),
}

for source_table, feature_columns in SHARED_EMBEDDING_TABLES.items():
    for feature_column in feature_columns:
        # Keep the cell re-runnable: once "tags_unique" has been popped below,
        # indexing it again would raise KeyError, so skip columns that were
        # already aliased on an earlier run. A source that is missing on the
        # first run still raises KeyError, as before.
        if source_table not in EMBEDDING_TABLE_SHAPES and feature_column in EMBEDDING_TABLE_SHAPES:
            continue
        EMBEDDING_TABLE_SHAPES[feature_column] = EMBEDDING_TABLE_SHAPES[source_table]

# "tags_unique" is not a model input itself, so drop it after aliasing.
# NOTE(review): the "genres" source entry is left in the dict, so it also
# reaches anything that iterates EMBEDDING_TABLE_SHAPES -- confirm whether
# that is intended.
EMBEDDING_TABLE_SHAPES.pop("tags_unique", None)
EMBEDDING_TABLE_SHAPES

{'movie_id': (62424, 512),
 'genres': (21, 16),
 'user_movie_ids': (62424, 512),
 'movie_tags_unique': (73051, 512),
 'user_search_terms': (73051, 512),
 'movie_genres': (21, 16),
 'user_genres': (21, 16)}

## DataLoader

In [11]:
import os
import tensorflow as tf

# TF_MEMORY_ALLOCATION controls how much GPU memory TensorFlow is given.
# IMPORTANT: set it before TF's runtime is initialized, otherwise TF will
# already have claimed all free GPU memory.
# NOTE(review): the assignment sits deliberately between the two imports;
# presumably the NVTabular loader import below reads this variable when it
# configures TensorFlow, so keep this order -- confirm against the NVTabular docs.
os.environ["TF_MEMORY_ALLOCATION"] = "0.7"  # fraction of free memory
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater

In [12]:
# Build the training data loader over the ranking parquet file, with "label"
# as the target and the categorical / numeric feature lists defined earlier.
train_path = os.path.join(INPUT_DATA_DIR, "ranking_training.parquet")
categorical_feature_names = CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS

train_dataset_tf = KerasSequenceLoader(
    train_path,
    engine="parquet",
    batch_size=BATCH_SIZE,
    label_names=["label"],
    cat_names=categorical_feature_names,
    cont_names=NUMERIC_COLUMNS,
    shuffle=True,
    buffer_size=0.25,
    parts_per_chunk=1,
)

In [13]:
# Pull a single batch from the training loader (handy for inspecting what the
# loader yields before training starts).
batch = next(iter(train_dataset_tf))

In [14]:
# One TensorFlow numeric feature column per continuous input, in the order
# given by NUMERIC_COLUMNS.
continuous_cols = [
    tf.feature_column.numeric_column(name) for name in NUMERIC_COLUMNS
]

continuous_cols

[NumericColumn(key='user_movie_id_count', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='movie_tags_nunique', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [15]:
# Display the per-column embedding table shapes; the first element of each
# tuple is used as the cardinality when the categorical columns are built.
EMBEDDING_TABLE_SHAPES

{'movie_id': (62424, 512),
 'genres': (21, 16),
 'user_movie_ids': (62424, 512),
 'movie_tags_unique': (73051, 512),
 'user_search_terms': (73051, 512),
 'movie_genres': (21, 16),
 'user_genres': (21, 16)}

In [16]:
# One identity categorical column per categorical input (single-hot first,
# then multi-hot). The bucket count is the cardinality stored as the first
# element of that column's embedding table shape.
categorical_cols = [
    tf.feature_column.categorical_column_with_identity(
        name, EMBEDDING_TABLE_SHAPES[name][0]
    )
    for name in CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS
]

categorical_cols

[IdentityCategoricalColumn(key='movie_id', number_buckets=62424, default_value=None),
 IdentityCategoricalColumn(key='movie_tags_unique', number_buckets=73051, default_value=None),
 IdentityCategoricalColumn(key='movie_genres', number_buckets=21, default_value=None),
 IdentityCategoricalColumn(key='user_search_terms', number_buckets=73051, default_value=None),
 IdentityCategoricalColumn(key='user_genres', number_buckets=21, default_value=None),
 IdentityCategoricalColumn(key='user_movie_ids', number_buckets=62424, default_value=None)]

In [17]:
# Give every embedding table the same fixed width of 128 instead of the
# per-table width stored as the second element of EMBEDDING_TABLE_SHAPES.
embedding_dims = dict.fromkeys(EMBEDDING_TABLE_SHAPES, 128)

embedding_dims

{'movie_id': 128,
 'genres': 128,
 'user_movie_ids': 128,
 'movie_tags_unique': 128,
 'user_search_terms': 128,
 'movie_genres': 128,
 'user_genres': 128}

In [25]:
# Assemble the model from the feature columns. The hidden stack narrows from
# 512 to 128 units with swish activations and ends in a single sigmoid unit.
model = YouTubeDNN(
    continuous_cols,
    categorical_cols,
    embedding_dims=embedding_dims,
    hidden_dims=[512, 256, 128, 1],
    activations=["swish", "swish", "swish", "sigmoid"],
)

In [26]:
# Compile with "nadam" and "binary_crossentropy" passed positionally.
# NOTE(review): assuming the standard Keras compile(optimizer, loss) argument
# order applies to YouTubeDNN -- confirm.
model.compile("nadam", "binary_crossentropy")

In [27]:
# Log training to TensorBoard under MODEL_LOG_DIR.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=MODEL_LOG_DIR)

# Validation is currently switched off; to enable it, build a validation
# loader and add this callback to the list passed to fit():
# validation_callback = KerasSequenceValidater(valid_dataset_tf)

history = model.fit(
    train_dataset_tf,
    callbacks=[tensorboard_callback],
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [28]:
# Export the trained model to <MODEL_BASE_DIR>/<MODEL_NAME_TF>/1/model.savedmodel.
# The model name can be overridden through the MODEL_NAME_TF environment variable.
MODEL_NAME_TF = os.environ.get("MODEL_NAME_TF", "movielens_ranking_tf")
MODEL_PATH_TEMP_TF = os.path.join(
    MODEL_BASE_DIR,
    MODEL_NAME_TF,
    "1/model.savedmodel",
)

# Alternative spelling of the same export: tf.keras.models.save_model(model, MODEL_PATH_TEMP_TF)
model.save(MODEL_PATH_TEMP_TF)

INFO:tensorflow:Assets written to: ./models/movielens_ranking_tf/1/model.savedmodel/assets


In [29]:
# Print the layer-by-layer summary and parameter counts of the trained model.
model.summary()

Model: "you_tube_dnn_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_features_1 (DenseFeatu multiple                  34686976  
_________________________________________________________________
dense_4 (Dense)              multiple                  394752    
_________________________________________________________________
dense_5 (Dense)              multiple                  131328    
_________________________________________________________________
dense_6 (Dense)              multiple                  32896     
_________________________________________________________________
dense_7 (Dense)              multiple                  129       
Total params: 35,246,081
Trainable params: 35,246,081
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Training and export are done: re-initialize RMM with managed memory switched
# off again, reverting the managed_memory=True setting applied at the top of
# the notebook.
rmm.reinitialize(managed_memory=False)