In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Training HugeCTR Model with Pretrained Embeddings

In this notebook, we will train a deep neural network to predict a user's rating (a binary target: 1 for ratings `>3` and 0 for ratings `<=3`). The two categorical features are `userId` and `movieId`.

We will also make use of the movies' pretrained embeddings, extracted in the previous notebooks.

## Loading pretrained movie features into non-trainable embedding layer

In [2]:
# Load the NVTabular category mapping for movieId.
import pandas as pd
import os

# Root directory of the data; the saved "workflow-hugectr" artifacts are read from beneath it.
INPUT_DATA_DIR = './data'
# Table of unique movieId categories stored with the workflow. Its row index is
# used later in this notebook as the key of each pretrained embedding row.
movie_mapping = pd.read_parquet(os.path.join(INPUT_DATA_DIR, "workflow-hugectr/categories/unique.movieId.parquet"))

In [3]:
# Show the last rows of the mapping: row index -> original movieId.
movie_mapping.tail()

Unnamed: 0,movieId,movieId_size
56581,209155,1
56582,209157,1
56583,209159,1
56584,209169,1
56585,209171,1


In [4]:
# Load the pretrained movie features: a movieId column followed by
# poster_feature_* and text_feature_* columns.
feature_df = pd.read_parquet('feature_df.parquet')
print(feature_df.shape)
feature_df.head()

(62423, 3073)


Unnamed: 0,movieId,poster_feature_0,poster_feature_1,poster_feature_2,poster_feature_3,poster_feature_4,poster_feature_5,poster_feature_6,poster_feature_7,poster_feature_8,...,text_feature_1014,text_feature_1015,text_feature_1016,text_feature_1017,text_feature_1018,text_feature_1019,text_feature_1020,text_feature_1021,text_feature_1022,text_feature_1023
0,1.0,0.02626,0.857608,0.410247,0.066654,0.382803,0.899998,0.511562,0.592291,0.565434,...,0.636716,0.578369,0.996169,0.402107,0.412318,0.859952,0.293852,0.341114,0.727113,0.085829
1,2.0,0.141265,0.721758,0.679958,0.955634,0.391091,0.324611,0.505211,0.258331,0.048264,...,0.161505,0.431864,0.836532,0.525013,0.654566,0.823841,0.818313,0.85628,0.638048,0.685537
2,3.0,0.119418,0.911146,0.470762,0.762258,0.626335,0.768947,0.241833,0.775992,0.23634,...,0.865548,0.387806,0.668321,0.552122,0.750238,0.863707,0.382173,0.894487,0.565142,0.164083
3,4.0,0.538184,0.980678,0.643513,0.928519,0.794906,0.201022,0.744666,0.962188,0.91532,...,0.777534,0.9042,0.167337,0.875194,0.180481,0.815904,0.808288,0.036711,0.902779,0.580946
4,5.0,0.772951,0.239788,0.061874,0.162997,0.38831,0.236311,0.162757,0.207134,0.111078,...,0.250022,0.335043,0.091674,0.121507,0.418124,0.15002,0.803506,0.059504,0.002342,0.932321


In [5]:
# Index the feature table by movieId so that rows can be looked up by movie id.
# The guard makes the cell idempotent: once movieId has become the index it is
# no longer a column, and an unguarded second run would raise a KeyError.
if 'movieId' in feature_df.columns:
    feature_df = feature_df.set_index('movieId')

In [6]:
from tqdm import tqdm
import numpy as np

# One matrix row per entry of the movieId mapping.
num_tokens = len(movie_mapping)
# Width of a full pretrained feature vector (presumably 2048 poster + 1024 text
# features, matching the poster_feature_* / text_feature_* columns -- confirm).
embedding_dim = 2048 + 1024
hits, misses = 0, 0

# Rows for which no pretrained features exist keep this all-zero initialisation.
embedding_matrix = np.zeros((num_tokens, embedding_dim))

print("Loading pretrained embedding matrix...")
for i, row in tqdm(movie_mapping.iterrows(), total=num_tokens):
    movieId = row['movieId']
    if movieId not in feature_df.index:
        # No pretrained features for this mapping entry: leave the zero row.
        misses += 1
        continue
    # Features found: copy them into the row addressed by the mapping index.
    embedding_vector = feature_df.loc[movieId]
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Found features for %d movies (%d misses)" % (hits, misses))


Loading pretrained embedding matrix...


100%|████████████████████████████████████| 56586/56586 [00:14<00:00, 3967.46it/s]

Found features for 56585 movies (1 misses)





In [7]:
# Display the width of the full pretrained feature vector.
embedding_dim

3072

In [8]:
# Inspect the assembled matrix (numpy abbreviates the printed array).
embedding_matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.17294852, 0.15285189, 0.26095702, ..., 0.75369112, 0.29602144,
        0.78917433],
       [0.13539355, 0.84843078, 0.70951219, ..., 0.10441725, 0.72871966,
        0.11719463],
       ...,
       [0.18514273, 0.72422918, 0.04273015, ..., 0.1404219 , 0.54169348,
        0.96875489],
       [0.08307642, 0.3673532 , 0.15777258, ..., 0.01297393, 0.36267638,
        0.14848055],
       [0.82188376, 0.56516905, 0.70838085, ..., 0.45119769, 0.9273439 ,
        0.42464321]])

Next, we write the pretrained embedding to a raw format supported by HugeCTR.

Note: As of version 3.2, HugeCTR only supports a maximum embedding size of 1024. Hence, we use the first 512 elements of the image embedding plus the first 512 elements of the text embedding.

In [9]:
import struct

# Size of the embedding vectors written for HugeCTR.
PRETRAINED_EMBEDDING_SIZE = 1024

def convert_pretrained_embeddings_to_sparse_model(keys, pre_trained_sparse_embeddings, hugectr_sparse_model, embedding_vec_size, text_feature_offset=2048):
    """Write pretrained embeddings as a `key` / `emb_vector` file pair.

    Each output vector is the concatenation of the first half of the image
    features and the first elements of the text features of the same row, so
    that it has exactly ``embedding_vec_size`` elements.

    Args:
        keys: Iterable of integer keys; ``keys[i]`` is written for row ``i`` of
            ``pre_trained_sparse_embeddings``.
        pre_trained_sparse_embeddings: 2-D array indexed as ``[row, column]``
            holding the image features followed by the text features.
        hugectr_sparse_model: Output directory; created if it does not exist.
        embedding_vec_size: Number of float values written per key.
        text_feature_offset: Column at which the text features start. Defaults
            to 2048, the width of the image block in this notebook's matrix
            (``embedding_dim = 2048+1024``). A value of 1024 would still read
            image columns rather than text columns.

    Raises:
        ValueError: If a row is too short to provide ``embedding_vec_size``
            values with the requested offset.
    """
    # Split the vector between image and text features; the text part absorbs
    # the remainder so the total is always embedding_vec_size.
    image_len = embedding_vec_size // 2
    text_len = embedding_vec_size - image_len
    text_end = text_feature_offset + text_len
    vec_format = str(embedding_vec_size) + "f"

    # os.makedirs avoids spawning a shell with a path-derived command line.
    os.makedirs(hugectr_sparse_model, exist_ok=True)
    with open(os.path.join(hugectr_sparse_model, "key"), 'wb') as key_file, \
        open(os.path.join(hugectr_sparse_model, "emb_vector"), 'wb') as vec_file:

        for i, key in enumerate(keys):
            row = pre_trained_sparse_embeddings[i]
            vec = np.concatenate([row[:image_len], row[text_feature_offset:text_end]])
            if len(vec) != embedding_vec_size:
                raise ValueError(
                    "row %d provides %d values, expected %d (text_feature_offset=%d)"
                    % (i, len(vec), embedding_vec_size, text_feature_offset)
                )
            # Keys are packed as native 64-bit integers ('q'), vectors as native floats ('f').
            key_file.write(struct.pack('q', key))
            vec_file.write(struct.pack(vec_format, *vec))

# The row index of movie_mapping provides the key for each embedding row
# (presumably the NVTabular-encoded movieId -- confirm against the workflow).
keys = list(movie_mapping.index)
# Write the key / vector files into the 'hugectr_pretrained_embedding.model' directory.
convert_pretrained_embeddings_to_sparse_model(keys, embedding_matrix, 'hugectr_pretrained_embedding.model', embedding_vec_size=PRETRAINED_EMBEDDING_SIZE) # HugeCTR not supporting embedding size > 1024

## Define and train model

In this section, we define and train the model. The model comprises trainable embedding layers for the categorical features (`userId`, `movieId`) and a pretrained (non-trainable) embedding layer for the movie features.

We will write the model to `./model.py` and execute it afterwards.

First, we need the cardinalities of each categorical feature to assign as `slot_size_array` in the model below.

In [10]:
import nvtabular as nvt
from nvtabular.ops import get_embedding_sizes

# Load the NVTabular workflow saved under INPUT_DATA_DIR/workflow-hugectr.
workflow = nvt.Workflow.load(os.path.join(INPUT_DATA_DIR, "workflow-hugectr"))

# Per categorical column, a pair whose first element is the cardinality that is
# needed for `slot_size_array` in the model definition.
embeddings = get_embedding_sizes(workflow)
print(embeddings)

# Output recorded from a previous run, kept for reference:
#{'userId': (162542, 512), 'movieId': (56586, 512), 'movieId_duplicate': (56586, 512)}

{'userId': (162542, 512), 'movieId': (56586, 512), 'movieId_duplicate': (56586, 512)}


We use `graph_to_json` to convert the model to a JSON configuration, which is required for inference.

In [11]:
%%writefile './model.py'

# Training script: builds a HugeCTR model with one trainable sparse embedding
# ("data1") and one pretrained, frozen sparse embedding ("movieId"), trains it,
# and exports the graph configuration as JSON.
import hugectr
from mpi4py import MPI  # noqa
INPUT_DATA_DIR = './data/'

# Solver: training/evaluation batch size of 2048, 64-bit input keys, no mixed
# precision, dataset repeated. vvgpu=[[0]] lists GPU 0 only.
solver = hugectr.CreateSolver(
    vvgpu=[[0]],
    batchsize=2048,
    batchsize_eval=2048,
    max_eval_batches=160,
    i64_input_key=True,
    use_mixed_precision=False,
    repeat_dataset=True,
)
optimizer = hugectr.CreateOptimizer(optimizer_type=hugectr.Optimizer_t.Adam)
# Parquet reader over the train / validation file lists.
# slot_size_array holds one size per slot, four in total: three slots for
# "data1" plus one for "movieId" (see the Input layer below). 162542 and 56586
# are presumably the userId / movieId cardinalities and 21 the size of a third
# categorical column -- confirm against the NVTabular workflow.
reader = hugectr.DataReaderParams(
    data_reader_type=hugectr.DataReaderType_t.Parquet,
    source=[INPUT_DATA_DIR + "train-hugectr/_file_list.txt"],
    eval_source=INPUT_DATA_DIR + "valid-hugectr/_file_list.txt",
    check_type=hugectr.Check_t.Non,
    slot_size_array=[162542, 56586, 21, 56586],
)

model = hugectr.Model(solver, reader, optimizer)

# Input layer: a 1-dim label, no dense features, and two sparse inputs:
#   "data1"   - 3 variable-length slots feeding the trainable embedding
#   "movieId" - 1 fixed-length slot feeding the pretrained embedding
model.add(
    hugectr.Input(
        label_dim=1,
        label_name="label",
        dense_dim=0,
        dense_name="dense",
        data_reader_sparse_param_array=[
            hugectr.DataReaderSparseParam("data1", nnz_per_slot=[1, 1, 2], is_fixed_length=False, slot_num=3),
            hugectr.DataReaderSparseParam("movieId", nnz_per_slot=[1], is_fixed_length=True, slot_num=1)
        ],
    )
)
# Trainable embedding over "data1" with 16-dim vectors.
model.add(
    hugectr.SparseEmbedding(
        embedding_type=hugectr.Embedding_t.LocalizedSlotSparseEmbeddingHash,
        workspace_size_per_gpu_in_mb=3000,
        embedding_vec_size=16,
        combiner="sum",
        sparse_embedding_name="sparse_embedding1",
        bottom_name="data1",
        optimizer=optimizer,
    )
)

# pretrained embedding
# 1024-dim vectors; the weights are loaded from disk and frozen after compile().
model.add(
    hugectr.SparseEmbedding(
        embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
        workspace_size_per_gpu_in_mb=3000,
        embedding_vec_size=1024,
        combiner="sum",
        sparse_embedding_name="pretrained_embedding",
        bottom_name="movieId",
        optimizer=optimizer,
    )
)

# Flatten the trainable embedding output; 48 presumably = 3 slots x 16 dims.
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["sparse_embedding1"],
                            top_names = ["reshape1"],
                            leading_dim=48))

# Flatten the pretrained embedding output; 1024 presumably = 1 slot x 1024 dims.
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["pretrained_embedding"],
                            top_names = ["reshape2"],
                            leading_dim=1024))

# Concatenate both embedding outputs as input to the fully connected layers.
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat,
                            bottom_names = ["reshape1", "reshape2"],
                            top_names = ["concat1"]))

# Fully connected stack: 128 -> ReLU -> 128 -> ReLU -> 1 output.
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.InnerProduct,
        bottom_names=["concat1"],
        top_names=["fc1"],
        num_output=128,
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.ReLU,
        bottom_names=["fc1"],
        top_names=["relu1"],
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.InnerProduct,
        bottom_names=["relu1"],
        top_names=["fc2"],
        num_output=128,
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.ReLU,
        bottom_names=["fc2"],
        top_names=["relu2"],
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.InnerProduct,
        bottom_names=["relu2"],
        top_names=["fc3"],
        num_output=1,
    )
)
# Binary cross-entropy loss between the "fc3" output and the label.
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
        bottom_names=["fc3", "label"],
        top_names=["loss"],
    )
)
model.compile()
model.summary()

# Load the pretrained embedding layer
# The weights come from the ./hugectr_pretrained_embedding.model directory and
# the layer is then frozen so that training leaves it unchanged.
model.load_sparse_weights({"pretrained_embedding": "./hugectr_pretrained_embedding.model"})
model.freeze_embedding("pretrained_embedding")

# Train for up to 10001 iterations, then export the graph configuration as JSON.
model.fit(max_iter=10001, display=100, eval_interval=200, snapshot=5000)
model.graph_to_json(graph_config_file="hugectr-movielens.json")

Overwriting ./model.py


We train our model.

In [None]:
!python model.py

HugeCTR Version: 3.2
[HUGECTR][01:09:00][INFO][RANK0]: Global seed is 476440390
[HUGECTR][01:09:00][INFO][RANK0]: Device to NUMA mapping:
  GPU 0 ->  node 0

[HUGECTR][01:09:01][INFO][RANK0]: Start all2all warmup
[HUGECTR][01:09:01][INFO][RANK0]: End all2all warmup
[HUGECTR][01:09:01][INFO][RANK0]: Using All-reduce algorithm: NCCL
[HUGECTR][01:09:01][INFO][RANK0]: Device 0: Tesla V100-SXM2-16GB
[HUGECTR][01:09:01][INFO][RANK0]: num of DataReader workers: 1
[HUGECTR][01:09:01][INFO][RANK0]: Vocabulary size: 275735
[HUGECTR][01:09:01][INFO][RANK0]: max_vocabulary_size_per_gpu_=16384000
[HUGECTR][01:09:01][INFO][RANK0]: max_vocabulary_size_per_gpu_=256000
[HUGECTR][01:09:01][INFO][RANK0]: Graph analysis to resolve tensor dependency
[HUGECTR][01:09:04][INFO][RANK0]: gpu0 start to init embedding
[HUGECTR][01:09:04][INFO][RANK0]: gpu0 init embedding done
[HUGECTR][01:09:04][INFO][RANK0]: gpu0 start to init embedding
[HUGECTR][01:09:04][INFO][RANK0]: gpu0 init embedding done
[HUGECTR][01:09:0