In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
import os

import cudf
import rmm

import nvtabular as nvt

In [3]:
from merlin_models.tensorflow.models.retrieval import YouTubeDNN

In [4]:
# Re-initialize the RAPIDS memory manager with managed memory enabled.
rmm.reinitialize(managed_memory=True)

In [5]:
# Directory locations; each can be overridden through an environment variable
# of the same name and otherwise falls back to a path relative to the notebook.
INPUT_DATA_DIR = os.getenv("INPUT_DATA_DIR", os.path.expanduser("./data/"))
MODEL_BASE_DIR = os.getenv("MODEL_BASE_DIR", os.path.expanduser("./models/"))
MODEL_LOG_DIR = os.getenv("MODEL_LOG_DIR", os.path.expanduser("./logs/"))

In [6]:
# Read the training examples from the input directory and preview the first rows.
examples_path = os.path.join(INPUT_DATA_DIR, "training_examples.parquet")
examples = cudf.read_parquet(examples_path)
examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,search_terms,genres,timestamps,movie_ids,movie_id_count,target_item
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4146,"[17594, 40872, 33339, 57686, 24188, 53948, 397...","[3, 10, 9, 17, 18, 3, 9, 19, 2, 3, 6, 10, 3, 4...","[1147868053, 1147868097, 1147868414, 114786846...","[5841, 1592, 1218, 6417, 6259, 3353, 1062, 879...",52,7237
2,4071,"[54673, 19997, 44011, 67380, 36218, 48078, 469...","[3, 10, 6, 16, 2, 3, 17, 18, 9, 17, 18, 2, 3, ...","[1141415528, 1141415566, 1141415576, 114141558...","[5841, 493, 1339, 1592, 2550, 5338, 150, 234, ...",124,2061
3,7521,"[38027, 927, 30849, 71320, 48032, 41101, 43529...","[6, 9, 16, 19, 7, 12, 18, 3, 6, 17, 3, 4, 5, 6...","[1439472199, 1439472203, 1439472211, 143947221...","[352, 586, 1238, 1, 2481, 258, 315, 1167, 523,...",314,24542
3,7688,"[39905, 26066, 62981, 47185, 41868, 39232, 668...","[7, 9, 17, 18, 2, 9, 19, 6, 16, 8, 20, 6, 10, ...","[1453904021, 1453904031, 1453904046, 145390404...","[1176, 1178, 10678, 9777, 11446, 11930, 10407,...",19,4240
3,8045,"[591, 51913, 67271, 43501, 35740, 20286, 45708...","[7, 15, 18, 2, 18, 6, 7, 18, 4, 17, 6, 7, 9, 7...","[1484753654, 1484753762, 1484753766, 148475380...","[1063, 29365, 3908, 726, 763, 110, 213, 29375,...",47,9335


In [7]:
# Check the dtype of the list-valued search_terms column.
examples["search_terms"].dtype

ListDtype(int32)

In [8]:
# Largest movie_id_count value across all examples.
examples["movie_id_count"].max()

5524

In [9]:
# Show the target_item column side by side with the timestamp lists.
examples[["target_item", "timestamps"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,target_item,timestamps
user_id,day,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4146,7237,"[1147868053, 1147868097, 1147868414, 114786846..."
2,4071,2061,"[1141415528, 1141415566, 1141415576, 114141558..."
3,7521,24542,"[1439472199, 1439472203, 1439472211, 143947221..."
3,7688,4240,"[1453904021, 1453904031, 1453904046, 145390404..."
3,8045,9335,"[1484753654, 1484753762, 1484753766, 148475380..."
...,...,...,...
162538,7513,6546,"[1438780751, 1438780754, 1438780759, 143878083..."
162539,2378,1615,"[995149720, 995149760, 995149788, 995149788, 9..."
162540,5315,11946,"[1248854959, 1248855507, 1248855584, 124885572..."
162540,5317,13355,"[1249028189, 1249028584, 1249028593, 124902967..."


## Hyper-parameters

In [10]:
# Number of examples per training batch (passed to the data loader).
BATCH_SIZE = 16  # Batch Size
# Column groups used to configure the data loader and the feature columns.
CATEGORICAL_COLUMNS = []  # Single-hot
CATEGORICAL_MH_COLUMNS = ["search_terms", "movie_ids", "genres"]  # Multi-hot
# Continuous (numeric) input columns.
NUMERIC_COLUMNS = ["movie_id_count"]

In [11]:
# Load the saved NVTabular workflow stored as "movie_features_workflow" in the input directory.
movie_workflow = nvt.Workflow.load(os.path.join(INPUT_DATA_DIR, "movie_features_workflow"))

In [12]:
# Ask the workflow for its embedding table shapes. The result is unpacked into
# two mappings (single-hot and multi-hot, going by the variable names), which
# are then merged into one dict keyed by column name.
EMBEDDING_TABLE_SHAPES, MH_EMBEDDING_TABLE_SHAPES = nvt.ops.get_embedding_sizes(movie_workflow)
EMBEDDING_TABLE_SHAPES.update(MH_EMBEDDING_TABLE_SHAPES)
EMBEDDING_TABLE_SHAPES

{'movie_id': (62424, 512), 'tags_unique': (73051, 512), 'genres': (21, 16)}

In [13]:
# Rename the workflow's column names to the feature names used by the training
# examples. The rename is guarded on key presence so the cell is idempotent:
# with an unconditional pop(old, None), running the cell a second time would
# overwrite the already-renamed entries with None.
for old_name, new_name in (("movie_id", "movie_ids"), ("tags_unique", "search_terms")):
    if old_name in EMBEDDING_TABLE_SHAPES:
        EMBEDDING_TABLE_SHAPES[new_name] = EMBEDDING_TABLE_SHAPES.pop(old_name)
EMBEDDING_TABLE_SHAPES

{'genres': (21, 16), 'movie_ids': (62424, 512), 'search_terms': (73051, 512)}

## DataLoader

In [14]:
import os
import tensorflow as tf

# We can control how much memory to give TensorFlow with this environment variable.
# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise
# TF will have claimed all free GPU memory.
os.environ["TF_MEMORY_ALLOCATION"] = "0.7"  # fraction of free memory
from nvtabular.loader.tensorflow import KerasSequenceLoader, KerasSequenceValidater

In [15]:
# Batched, shuffled loader over the training examples parquet file.
train_dataset_tf = KerasSequenceLoader(
    os.path.join(INPUT_DATA_DIR, "training_examples.parquet"),
    engine="parquet",
    batch_size=BATCH_SIZE,
    shuffle=True,
    buffer_size=0.25,
    parts_per_chunk=1,
    # Column routing: categorical inputs, continuous inputs, and the label.
    cat_names=[*CATEGORICAL_COLUMNS, *CATEGORICAL_MH_COLUMNS],
    cont_names=NUMERIC_COLUMNS,
    label_names=["target_item"],
)

In [16]:
# Pull a single batch from the loader.
batch = next(iter(train_dataset_tf))

In [17]:
# One numeric feature column per continuous input.
continuous_cols = [
    tf.feature_column.numeric_column(name) for name in NUMERIC_COLUMNS
]

continuous_cols

[NumericColumn(key='movie_id_count', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [18]:
# Display the current embedding table shapes (cardinality, dimension) per feature.
EMBEDDING_TABLE_SHAPES

{'genres': (21, 16), 'movie_ids': (62424, 512), 'search_terms': (73051, 512)}

In [19]:
# Give every embedding table the same latent dimension of 128; the per-table
# dimension stored in EMBEDDING_TABLE_SHAPES (value[1]) is deliberately unused.
embedding_dims = dict.fromkeys(EMBEDDING_TABLE_SHAPES, 128)

In [20]:
# Identity-encoded categorical columns; the bucket count of each column is the
# cardinality (first entry) of its embedding table shape.
categorical_cols = [
    tf.feature_column.categorical_column_with_identity(
        name, EMBEDDING_TABLE_SHAPES[name][0]
    )
    for name in CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS
]

categorical_cols

[IdentityCategoricalColumn(key='search_terms', number_buckets=73051, default_value=None),
 IdentityCategoricalColumn(key='movie_ids', number_buckets=62424, default_value=None),
 IdentityCategoricalColumn(key='genres', number_buckets=21, default_value=None)]

In [21]:
# Display the embedding dimension chosen for each categorical feature.
embedding_dims

{'genres': 128, 'movie_ids': 128, 'search_terms': 128}

In [22]:
# Assemble the retrieval model from the feature columns defined above.
model = YouTubeDNN(
    continuous_cols,
    categorical_cols,
    embedding_dims=embedding_dims,
    hidden_dims=[512, 256, 128],
)

In [23]:
# Build the input layer explicitly, then grab the movie_ids embedding table
# (presumably the tables only exist once the layer is built — TODO confirm).
model.input_layer.build({})
item_embeddings = model.input_layer.embedding_tables["movie_ids"]


def sampled_softmax_loss(y_true, y_pred):
    """Sampled-softmax loss using the ``movie_ids`` embedding table as class weights.

    Args:
        y_true: Label tensor, forwarded as ``labels``.
        y_pred: Model output tensor, forwarded as ``inputs``.

    Returns:
        The value of ``tf.nn.sampled_softmax_loss`` computed with 20 sampled
        classes over all rows of the embedding table.
    """
    num_items = item_embeddings.shape[0]
    # Every class shares the same constant bias of 0.01.
    item_biases = tf.fill((num_items,), 0.01)
    return tf.nn.sampled_softmax_loss(
        weights=item_embeddings,
        biases=item_biases,
        labels=y_true,
        inputs=y_pred,
        num_sampled=20,
        num_classes=num_items,
    )


model.compile("nadam", sampled_softmax_loss)

In [24]:
# Validation is currently disabled (no valid_dataset_tf is defined in the cells shown):
# validation_callback = KerasSequenceValidater(valid_dataset_tf)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=MODEL_LOG_DIR)

# Train for a single epoch, logging to TensorBoard.
history = model.fit(
    train_dataset_tf,
    epochs=1,
    callbacks=[tensorboard_callback],
)



In [25]:
# Export location: <MODEL_BASE_DIR>/<MODEL_NAME_TF>/1/model.savedmodel
# (the model name can be overridden via the MODEL_NAME_TF environment variable).
MODEL_NAME_TF = os.getenv("MODEL_NAME_TF", "movielens_retrieval_tf")
MODEL_PATH_TEMP_TF = os.path.join(
    MODEL_BASE_DIR, MODEL_NAME_TF, "1/model.savedmodel"
)

# model.save(MODEL_PATH_TEMP_TF)
tf.keras.models.save_model(model, MODEL_PATH_TEMP_TF)

INFO:tensorflow:Assets written to: ./models/movielens_retrieval_tf/1/model.savedmodel/assets


In [26]:
# Re-initialize RMM again, this time with managed memory turned off.
rmm.reinitialize(managed_memory=False)

In [27]:
# Print the per-layer parameter summary of the trained model.
model.summary()

Model: "you_tube_dnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_features (DenseFeature multiple                  17343488  
_________________________________________________________________
dense (Dense)                multiple                  197632    
_________________________________________________________________
dense_1 (Dense)              multiple                  131328    
_________________________________________________________________
dense_2 (Dense)              multiple                  32896     
Total params: 17,705,344
Trainable params: 17,705,344
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Hand-written example inputs for one user. Each feature is a list of
# one-element rows; the following cell transposes them into (1, n) tensors.
# NOTE(review): movie_id_count is 1 although 19 movie ids are listed, and
# genres has 46 entries — confirm these values are intended.
user_id = [[6]]
movie_id_count = [[1]]
movie_ids = [[882], [841], [1641], [523], [258], [315], [586], [601], [2767], [1167], [1174], [1183], [904], [1169], [892], [893], [1863], [899], [1071]]
genres = [[9], [16], [7], [9], [9], [16], [9], [19], [2], [3], [17], [7], [9], [7], [12], [18], [6], [7], [9], [18], [9], [16], [2], [3], [17], [9], [7], [9], [3], [9], [17], [2], [3], [9], [16], [11], [15], [9], [14], [3], [5], [10], [14], [5], [9], [17]]
search_terms = [[50389], [ 968], [10263], [8259], [20445], [58863], [18916], [66212], [63819], [9384], [45278], [20555], [39349], [18846], [39990], [53832], [2816], [62617], [42691]]

In [29]:
# Convert every raw feature list to a tensor and transpose it, so that each
# feature becomes a single row holding all of its values.
x = {
    name: tf.transpose(tf.convert_to_tensor(values))
    for name, values in {
        "user_id": user_id,
        "movie_id_count": movie_id_count,
        "movie_ids": movie_ids,
        "genres": genres,
        "search_terms": search_terms,
    }.items()
}

for name, tensor in x.items():
    print(f"{name} shape: {tensor.shape}")

# Run the model on the single example to obtain its output vector.
user_vector = model.predict(x)
user_vector

user_id shape: (1, 1)
movie_id_count shape: (1, 1)
movie_ids shape: (1, 19)
genres shape: (1, 46)
search_terms shape: (1, 19)


array([[-0.1383512 , -0.05716562, -0.12354594,  0.13409431, -0.10257582,
         0.11927378, -0.03737998, -0.15381287, -0.0724098 , -0.10208561,
         0.21632077,  0.17246972, -0.14380702, -0.1398768 , -0.13346617,
        -0.09590293,  0.15704662, -0.13829824, -0.0993218 ,  0.16822325,
         0.10212812,  0.22724546,  0.15726045, -0.02961836,  0.16176428,
        -0.06514155, -0.06683823,  0.14190286, -0.12948775, -0.17168887,
         0.23065363,  0.21334466,  0.16438077, -0.07132617,  0.20561174,
        -0.05111433,  0.17571762, -0.10241346,  0.22389354,  0.17004342,
        -0.04548051, -0.06756829, -0.14260867,  0.19560201,  0.18748654,
         0.23943728,  0.12494116,  0.22873135,  0.22130905, -0.07256585,
         0.20321113, -0.06308627, -0.14939538, -0.13626337, -0.13091938,
        -0.0744931 ,  0.21433242,  0.15492506,  0.23045251, -0.13307036,
        -0.06050499, -0.06900201,  0.16676171,  0.19484402, -0.13856459,
        -0.04030246,  0.14304031, -0.16113551, -0.1

In [30]:
# Count the entries of the user vector that are exactly zero.
int((user_vector == 0).sum())

0