In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
import os

import cudf
import rmm

import nvtabular as nvt

In [3]:
from merlin_models.tensorflow.models.retrieval import YouTubeDNN

In [4]:
# Re-initialize RMM with managed memory turned on before any data is loaded.
# NOTE(review): presumably this lets cuDF allocations spill beyond physical GPU
# memory while the parquet data is read — confirm against the RMM docs.
rmm.reinitialize(managed_memory=True)

In [5]:
# Where the input data lives and where trained models are written. Each location
# can be overridden by exporting the environment variable of the same name;
# otherwise a directory relative to the notebook is used.
INPUT_DATA_DIR = os.getenv("INPUT_DATA_DIR", os.path.expanduser("./data/"))
MODEL_BASE_DIR = os.getenv("MODEL_BASE_DIR", os.path.expanduser("./models/"))

In [6]:
# Read the training examples into a cuDF DataFrame and preview the first rows.
training_examples_path = os.path.join(INPUT_DATA_DIR, "training_examples.parquet")
examples = cudf.read_parquet(training_examples_path)
examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,search_terms,genres,timestamps,movie_ids,movie_id_count,target_item
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[2124, 48808, 31732, 44035, 6150, 71933, 0, 15...","[9, 12, 18, 4, 5, 10, 14, 8, 9, 9, 12, 18, 9, ...","[1147868053, 1147868097, 1147868414, 114786846...","[5952, 1653, 1250, 6539, 6377, 3448, 1088, 899...",52,7361
2,4071,"[23266, 64668, 39517, 53849, 9506, 10820, 6627...","[9, 12, 9, 7, 9, 16, 18, 7, 9, 9, 12, 17, 9, 1...","[1141415528, 1141415566, 1141415576, 114141558...","[5952, 497, 1374, 1653, 2640, 5445, 151, 236, ...",124,2150
3,7521,"[54707, 46964, 40789, 41791, 18575, 8371, 1483...","[2, 6, 2, 6, 7, 3, 9, 3, 4, 5, 6, 10, 2, 9, 17...","[1439472199, 1439472203, 1439472211, 143947221...","[356, 593, 1270, 1, 2571, 260, 318, 1196, 527,...",220,37729
3,7688,"[35375, 60889, 15821, 30192, 0, 15866, 64283, ...","[9, 2, 17, 18, 6, 6, 9, 9, 8, 1, 8, 9, 10, 12,...","[1453904021, 1453904031, 1453904046, 145390404...","[1206, 1208, 44191, 32587, 51662, 55820, 40815...",18,4344
3,8045,"[56053, 50289, 58994, 0, 71594, 68845, 61483, ...","[9, 9, 8, 9, 2, 3, 6, 7, 9, 16, 6, 7, 9, 18, 7...","[1484753654, 1484753766, 1484753808, 148475384...","[1089, 4011, 741, 778, 111, 214, 293, 1252, 33...",21,27773


In [7]:
# Largest value found in the `movie_id_count` column of the training examples.
# NOTE(review): presumably the length of the longest `movie_ids` history — confirm.
examples["movie_id_count"].max()

1924

In [8]:
# Display the `target_item` column (used as the label by the data loader below)
# next to the `timestamps` list column.
examples[["target_item", "timestamps"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,target_item,timestamps
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4146,7361,"[1147868053, 1147868097, 1147868414, 114786846..."
2,4071,2150,"[1141415528, 1141415566, 1141415576, 114141558..."
3,7521,37729,"[1439472199, 1439472203, 1439472211, 143947221..."
3,7688,4344,"[1453904021, 1453904031, 1453904046, 145390404..."
3,8045,27773,"[1484753654, 1484753766, 1484753808, 148475384..."
...,...,...,...
162538,7513,6668,"[1438780751, 1438780754, 1438780759, 143878083..."
162539,2378,1676,"[995149720, 995149760, 995149788, 995149788, 9..."
162540,5315,55995,"[1248854959, 1248855507, 1248855584, 124885572..."
162540,5317,37720,"[1249028584, 1249028593, 1249029673, 124902967..."


## Hyper-parameters

In [9]:
# Feature groups consumed by the data loader and the model.
NUMERIC_COLUMNS = ["movie_id_count"]  # continuous features
CATEGORICAL_COLUMNS = []  # single-hot categorical features (none used here)
CATEGORICAL_MH_COLUMNS = ["search_terms", "movie_ids", "genres"]  # multi-hot categorical features

# Number of examples per training batch.
BATCH_SIZE = 16

In [10]:
# Load the previously saved NVTabular workflow stored under
# INPUT_DATA_DIR/movie_features_workflow; the next cell uses it to look up the
# embedding table sizes.
movie_workflow = nvt.Workflow.load(os.path.join(INPUT_DATA_DIR, "movie_features_workflow"))

In [11]:
# Ask NVTabular for the embedding table sizes derived from the workflow. The call
# returns two mappings (unpacked below); the second one is merged into the first
# so every feature can be looked up in a single dict.
workflow_embedding_sizes = nvt.ops.get_embedding_sizes(movie_workflow)
EMBEDDING_TABLE_SHAPES, MH_EMBEDDING_TABLE_SHAPES = workflow_embedding_sizes
EMBEDDING_TABLE_SHAPES.update(MH_EMBEDDING_TABLE_SHAPES)
EMBEDDING_TABLE_SHAPES

{'movie_id': (62424, 512), 'genres': (21, 16), 'tags_unique': (73051, 512)}

In [12]:
# Rename the workflow's column names to the feature names used by the training
# examples ("movie_id" -> "movie_ids", "tags_unique" -> "search_terms").
#
# The rename is guarded so that the cell is idempotent: an unconditional
# `pop(old_name, None)` would, on a second execution of this cell, find the old
# key already gone and overwrite the renamed entry with None, which later breaks
# the `EMBEDDING_TABLE_SHAPES[col][0]` cardinality lookup.
for workflow_name, feature_name in (("movie_id", "movie_ids"), ("tags_unique", "search_terms")):
    if workflow_name in EMBEDDING_TABLE_SHAPES:
        EMBEDDING_TABLE_SHAPES[feature_name] = EMBEDDING_TABLE_SHAPES.pop(workflow_name)
EMBEDDING_TABLE_SHAPES

{'genres': (21, 16), 'movie_ids': (62424, 512), 'search_terms': (73051, 512)}

## DataLoader

In [13]:
# NOTE(review): `os` is already imported in the first import cell of this
# notebook; the repeated import here is harmless but redundant.
import os
import tensorflow as tf

# we can control how much memory to give tensorflow with this environment variable
# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise
# TF will have claimed all free GPU memory
os.environ["TF_MEMORY_ALLOCATION"] = "0.7"  # fraction of free memory
# The loader import is kept below the assignment above on purpose.
# NOTE(review): presumably the NVTabular TensorFlow loader reads
# TF_MEMORY_ALLOCATION when it is imported — confirm. `KerasSequenceValidater`
# is only referenced from commented-out code in the cells visible here.
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater

In [14]:
# Batch loader over the training examples parquet file: `target_item` is the
# label, every categorical column (single- and multi-hot) is passed as a
# categorical feature, and NUMERIC_COLUMNS as continuous features.
training_examples_path = os.path.join(INPUT_DATA_DIR, "training_examples.parquet")
categorical_feature_names = CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS

train_dataset_tf = KerasSequenceLoader(
    training_examples_path,
    label_names=["target_item"],
    cat_names=categorical_feature_names,
    cont_names=NUMERIC_COLUMNS,
    batch_size=BATCH_SIZE,
    engine="parquet",
    shuffle=True,
    buffer_size=0.25,
    parts_per_chunk=1,
)

In [15]:
# One TensorFlow numeric feature column per continuous input feature.
continuous_cols = [tf.feature_column.numeric_column(name) for name in NUMERIC_COLUMNS]

continuous_cols

[NumericColumn(key='movie_id_count', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [16]:
# Show the embedding table shapes again before they are used to build the
# categorical feature columns below.
EMBEDDING_TABLE_SHAPES

{'genres': (21, 16), 'movie_ids': (62424, 512), 'search_terms': (73051, 512)}

In [17]:
# Give every categorical feature the same 128-wide embedding rather than the
# per-table latent dimension stored in EMBEDDING_TABLE_SHAPES (the second
# element of each shape tuple).
embedding_dims = {feature: 128 for feature in EMBEDDING_TABLE_SHAPES}

In [18]:
# One identity categorical column per categorical feature; the bucket count is
# the table cardinality, i.e. the first element of the feature's shape tuple.
categorical_cols = [
    tf.feature_column.categorical_column_with_identity(
        name, EMBEDDING_TABLE_SHAPES[name][0]
    )
    for name in CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS
]

categorical_cols

[IdentityCategoricalColumn(key='search_terms', number_buckets=73051, default_value=None),
 IdentityCategoricalColumn(key='movie_ids', number_buckets=62424, default_value=None),
 IdentityCategoricalColumn(key='genres', number_buckets=21, default_value=None)]

In [19]:
# Build the retrieval model from the feature columns defined above, with three
# hidden layers of width 512, 256 and 128.
model = YouTubeDNN(
    continuous_cols,
    categorical_cols,
    embedding_dims=embedding_dims,
    hidden_dims=[512, 256, 128],
)

In [20]:
# Build the input layer up front so that its embedding tables can be looked up,
# then keep a handle on the `movie_ids` table for use as the softmax weights.
model.input_layer.build({})
item_embeddings = model.input_layer.embedding_tables["movie_ids"]


def sampled_softmax_loss(y_true, y_pred):
    """Sampled-softmax loss over the `movie_ids` embedding table.

    Args:
        y_true: Labels from the data loader; forwarded as `labels`.
        y_pred: Model output; forwarded as `inputs`.

    Returns:
        The result of `tf.nn.sampled_softmax_loss`, computed with the item
        embedding table as class weights, all-zero biases and 20 sampled
        classes.
    """
    num_items = item_embeddings.shape[0]
    zero_biases = tf.zeros((num_items,))
    return tf.nn.sampled_softmax_loss(
        weights=item_embeddings,
        biases=zero_biases,
        labels=y_true,
        inputs=y_pred,
        num_sampled=20,
        num_classes=num_items,
    )


model.compile("nadam", sampled_softmax_loss)

In [22]:
# Train for a single epoch without any callbacks. Validation stays disabled: a
# `KerasSequenceValidater(valid_dataset_tf)` callback could be passed here, but
# no `valid_dataset_tf` loader is created in the cells shown above.
history = model.fit(train_dataset_tf, epochs=1, callbacks=[])



In [23]:
# Model name (overridable through the MODEL_NAME_TF environment variable) and
# the destination <MODEL_BASE_DIR>/<MODEL_NAME_TF>/1/model.savedmodel.
MODEL_NAME_TF = os.getenv("MODEL_NAME_TF", "movielens_retrieval_tf")
MODEL_PATH_TEMP_TF = os.path.join(MODEL_BASE_DIR, MODEL_NAME_TF, "1/model.savedmodel")

# Write the trained model to that location.
model.save(MODEL_PATH_TEMP_TF)

INFO:tensorflow:Assets written to: ./models/movielens_retrieval_tf/1/model.savedmodel/assets


In [24]:
# Training and saving are done: re-initialize RMM with managed memory turned
# off again, reversing the setting applied at the top of the notebook.
rmm.reinitialize(managed_memory=False)