In [2]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="https://developer.download.nvidia.com/notebooks/dlsw-notebooks/merlin_merlin_getting-started-movielens-03-training-with-pytorch/nvidia_logo.png" style="width: 90px; float: right;">

# Getting Started MovieLens: Training with PyTorch

This notebook is created using the latest stable [merlin-pytorch-training](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-pytorch-training/tags) container.

## Overview

We observed that PyTorch training pipelines can be slow as the dataloader is a bottleneck. The native dataloader in PyTorch randomly sample each item from the dataset, which is very slow. In our experiments, we are able to speed-up existing PyTorch pipelines using a highly optimized dataloader.<br><br>

In this tutorial we will be using the highly optimized Merlin Dataloader. To learn more about it, please consult the examples in its repository [here](https://github.com/NVIDIA-Merlin/dataloader/tree/main/examples).

### Learning objectives

This notebook explains, how to use the NVTabular dataloader to accelerate PyTorch training.

1. Use **Merlin dataloader** with PyTorch
2. Leverage **multi-hot encoded input features**

### MovieLens25M

The [MovieLens25M](https://grouplens.org/datasets/movielens/25m/) is a popular dataset for recommender systems and is used in academic publications. The dataset contains 25M movie ratings for 62,000 movies given by 162,000 users. Many projects use only the user/item/rating information of MovieLens, but the original dataset provides metadata for the movies, as well. For example, which genres a movie has. Although we may not improve state-of-the-art results with our neural network architecture, the purpose of this notebook is to explain how to integrate multi-hot categorical features into a neural network.

In [3]:
# External dependencies
import os
import gc
import glob

import nvtabular as nvt

  from .autonotebook import tqdm as notebook_tqdm


We define our base directory, containing the data.

In [4]:
INPUT_DATA_DIR = os.environ.get(
    "INPUT_DATA_DIR", os.path.expanduser("~/nvt-examples/movielens/data/")
)

### Defining Hyperparameters

First, we define the data schema and differentiate between single-hot and multi-hot categorical features. Note, that we do not have any numerical input features. 

In [5]:
BATCH_SIZE = 1024 * 32  # Batch Size
CATEGORICAL_COLUMNS = ["userId", "movieId"]  # Single-hot
CATEGORICAL_MH_COLUMNS = ["genres"]  # Multi-hot
NUMERIC_COLUMNS = []

# Output from ETL-with-NVTabular
TRAIN_PATHS = sorted(glob.glob(os.path.join(INPUT_DATA_DIR, "train", "*.parquet")))
VALID_PATHS = sorted(glob.glob(os.path.join(INPUT_DATA_DIR, "valid", "*.parquet")))

In the previous notebook, we used NVTabular for ETL and stored the workflow to disk. We can load the NVTabular workflow to extract important metadata for our training pipeline.

In [6]:
proc = nvt.Workflow.load(os.path.join(INPUT_DATA_DIR, "workflow"))

The embedding table shows the cardinality of each categorical variable along with its associated embedding size. Each entry is of the form `(cardinality, embedding_size)`.

In [7]:
EMBEDDING_TABLE_SHAPES, MH_EMBEDDING_TABLE_SHAPES = nvt.ops.get_embedding_sizes(proc)
EMBEDDING_TABLE_SHAPES, MH_EMBEDDING_TABLE_SHAPES

({'userId': (162542, 512), 'movieId': (56635, 512)}, {'genres': (21, 16)})

### Initializing NVTabular Dataloader for PyTorch

We import PyTorch and the Merlin Dataloader for PyTorch.

In [11]:
import torch
from merlin.loader.torch import Loader

from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader
from nvtabular.framework_utils.torch.models import Model
from nvtabular.framework_utils.torch.utils import process_epoch

First, we take a look on our dataloader and how the data is represented as tensors. The NVTabular dataloaders are initialized as usual and we specify both single-hot and multi-hot categorical features as cats. The dataloader can automatically recognize the single/multi-hot columns and represent them accordingly.

In [None]:
# # TensorItrDataset returns a single batch of x_cat, x_cont, y.

# train_dataset = Loader(
#     nvt.Dataset(TRAIN_PATHS),
#     batch_size=BATCH_SIZE,
# )

# train_loader = DLDataLoader(
#     train_dataset, batch_size=None, collate_fn=lambda x: x, pin_memory=False, num_workers=0
# )

# valid_dataset = TorchAsyncItr(
#     nvt.Dataset(VALID_PATHS),
#     batch_size=BATCH_SIZE,
#     cats=CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS,
#     conts=NUMERIC_COLUMNS,
#     labels=["rating"],
# )
# valid_loader = DLDataLoader(
#     valid_dataset, batch_size=None, collate_fn=lambda x: x, pin_memory=False, num_workers=0
# )

In [12]:
# TensorItrDataset returns a single batch of x_cat, x_cont, y.

train_dataset = TorchAsyncItr(
    nvt.Dataset(TRAIN_PATHS),
    batch_size=BATCH_SIZE,
    cats=CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS,
    conts=NUMERIC_COLUMNS,
    labels=["rating"],
)
train_loader = DLDataLoader(
    train_dataset, batch_size=None, collate_fn=lambda x: x, pin_memory=False, num_workers=0
)

valid_dataset = TorchAsyncItr(
    nvt.Dataset(VALID_PATHS),
    batch_size=BATCH_SIZE,
    cats=CATEGORICAL_COLUMNS + CATEGORICAL_MH_COLUMNS,
    conts=NUMERIC_COLUMNS,
    labels=["rating"],
)
valid_loader = DLDataLoader(
    valid_dataset, batch_size=None, collate_fn=lambda x: x, pin_memory=False, num_workers=0
)

Let's generate a batch and take a look on the input features.<br><br>
The single-hot categorical features (`userId` and `movieId`) have a shape of `(32768, 1)`, which is the batch size (as usually). For the multi-hot categorical feature `genres`, we receive two Tensors `genres__values` and `genres__nnzs`.<br><br>
- `values` are the actual data, containing the genre IDs. Note that the Tensor has more values than the batch_size. The reason is, that one datapoint in the batch can contain more than one genre (multi-hot).<br>
- `nnzs` are a supporting Tensor, describing how many genres are associated with each datapoint in the batch.<br><br>
For example,
- if the first two values in `nnzs` is `0`, `2`, then the first 2 values (0, 1) in `values` are associated with the first datapoint in the batch (movieId/userId).<br>
- if the next value in `nnzs` is `6`, then the 3rd, 4th and 5th value in `values` are associated with the second datapoint in the batch (continuing after the previous value stopped).<br> 
- if the third value in `nnzs` is `7`, then the 6th value in `values` are associated with the third datapoint in the batch. 
- and so on

In [13]:
batch = next(iter(train_loader))
batch

({'genres': (tensor([ 2,  3,  4,  ...,  1, 16,  4], device='cuda:0'),
   tensor([[    0],
           [    1],
           [    3],
           ...,
           [88629],
           [88630],
           [88634]], device='cuda:0', dtype=torch.int32)),
  'userId': tensor([[101587],
          [   227],
          [  2441],
          ...,
          [ 40914],
          [ 23715],
          [112017]], device='cuda:0'),
  'movieId': tensor([[ 662],
          [ 243],
          [2930],
          ...,
          [ 862],
          [  10],
          [5302]], device='cuda:0')},
 tensor([0., 1., 0.,  ..., 1., 0., 0.], device='cuda:0'))

In [14]:
del train_dataset

In [15]:
ds = nvt.Dataset(TRAIN_PATHS)

In [16]:
ds.compute()

Unnamed: 0,userId,movieId,genres,rating
0,101587,662,[2],0.0
1,227,243,"[3, 4]",1.0
2,2441,2930,"[1, 16, 6]",0.0
3,101761,578,"[1, 6]",1.0
4,84264,181,"[1, 7]",0.0
...,...,...,...,...
20000071,3125,320,"[3, 5, 1, 7, 4]",1.0
20000072,28915,57,"[13, 10, 9, 16, 6, 15]",1.0
20000073,45526,1536,"[1, 6]",1.0
20000074,143912,5379,"[12, 4]",1.0


In [17]:
ds.npartitions

1

In [34]:
train = Loader(nvt.Dataset(TRAIN_PATHS), batch_size=2**16)

In [37]:
train

AttributeError: 'Loader' object has no attribute 'map'

In [19]:
# !pip install -U merlin-dataloader

In [20]:
for batch in train: break

In [21]:
batch

({'genres': (tensor([ 2,  3,  4,  ...,  7,  1, 14], device='cuda:0'),
   tensor([[     0],
           [     1],
           [     3],
           ...,
           [177340],
           [177343],
           [177345]], device='cuda:0', dtype=torch.int32)),
  'userId': tensor([[101587],
          [   227],
          [  2441],
          ...,
          [  2972],
          [   403],
          [ 24978]], device='cuda:0'),
  'movieId': tensor([[ 662],
          [ 243],
          [2930],
          ...,
          [2631],
          [2814],
          [   8]], device='cuda:0'),
  'rating': tensor([0., 1., 0.,  ..., 0., 1., 1.], device='cuda:0')},
 None)

`X_cat_multihot` is a tuple of two Tensors. For the multi-hot categorical feature `genres`, we receive two Tensors `values` and `nnzs`.

In [22]:
X_cat_multihot = batch[0]['genres']
X_cat_multihot

(tensor([ 2,  3,  4,  ...,  7,  1, 14], device='cuda:0'),
 tensor([[     0],
         [     1],
         [     3],
         ...,
         [177340],
         [177343],
         [177345]], device='cuda:0', dtype=torch.int32))

In [23]:
X_cat_multihot[0].shape

torch.Size([177347])

In [24]:
X_cat_multihot[1].shape

torch.Size([65536, 1])

As each datapoint can have a different number of genres, it is more efficient to represent the genres as two flat tensors: One with the actual values (`values`) and one with the length for each datapoint (`nnzs`).

In [25]:
del batch
gc.collect()

56

### Defining Neural Network Architecture

We implemented a simple PyTorch architecture.

* Single-hot categorical features are fed into an Embedding Layer
* Each value of a multi-hot categorical features is fed into an Embedding Layer and the multiple Embedding outputs are combined via summing
* The output of the Embedding Layers are concatenated
* The concatenated layers are fed through multiple feed-forward layers (Dense Layers, BatchNorm with ReLU activations)

You can see more details by checking out the implementation.

In [26]:
# ??Model

We initialize the model. `EMBEDDING_TABLE_SHAPES` needs to be a Tuple representing the cardinality for single-hot and multi-hot input features.

In [27]:
EMBEDDING_TABLE_SHAPES_TUPLE = (
    {
        CATEGORICAL_COLUMNS[0]: EMBEDDING_TABLE_SHAPES[CATEGORICAL_COLUMNS[0]],
        CATEGORICAL_COLUMNS[1]: EMBEDDING_TABLE_SHAPES[CATEGORICAL_COLUMNS[1]],
    },
    {CATEGORICAL_MH_COLUMNS[0]: MH_EMBEDDING_TABLE_SHAPES[CATEGORICAL_MH_COLUMNS[0]]},
)
EMBEDDING_TABLE_SHAPES_TUPLE

({'userId': (162542, 512), 'movieId': (56635, 512)}, {'genres': (21, 16)})

In [28]:
model = Model(
    embedding_table_shapes=EMBEDDING_TABLE_SHAPES_TUPLE,
    num_continuous=0,
    emb_dropout=0.0,
    layer_hidden_dims=[128, 128, 128],
    layer_dropout_rates=[0.0, 0.0, 0.0],
).to("cuda")
model

Model(
  (initial_cat_layer): ConcatenatedEmbeddings(
    (embedding_layers): ModuleList(
      (0): Embedding(162542, 512)
      (1): Embedding(56635, 512)
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (mh_cat_layer): MultiHotEmbeddings(
    (embedding_layers): ModuleList(
      (0): EmbeddingBag(21, 16, mode=sum)
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (initial_cont_layer): BatchNorm1d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=1040, out_features=128, bias=True)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): Dropout(p=0.0, inplace=False)
    )
    (1): Sequential(
      (0): Linear(in_features=128, out_features=128, bias=True)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): Dropout(p=0.0, in

We initialize the optimizer.

In [29]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [30]:
train_loader = train

We use the `process_epoch` function to train and validate our model. It iterates over the dataset and calculates as usually the loss and optimizer step.

In [32]:
%%time
from time import time
EPOCHS = 1
for epoch in range(EPOCHS):
    start = time()
    train_loss, y_pred, y = process_epoch(train_loader,
                                          model,
                                          train=True,
                                          optimizer=optimizer)
    valid_loss, y_pred, y = process_epoch(valid_loader,
                                          model,
                                          train=False)
    print(f"Epoch {epoch:02d}. Train loss: {train_loss:.4f}. Valid loss: {valid_loss:.4f}.")
t_final = time() - start
total_rows = train_dataset.num_rows_processed + valid_dataset.num_rows_processed
print(
    f"run_time: {t_final} - rows: {total_rows * EPOCHS} - epochs: {EPOCHS} - dl_thru: {(total_rows * EPOCHS) / t_final}"
)

AttributeError: 'NoneType' object has no attribute 'detach'

In [33]:
%debug

> [0;32m/usr/local/lib/python3.8/dist-packages/nvtabular/framework_utils/torch/utils.py[0m(101)[0;36mprocess_epoch[0;34m()[0m
[0;32m     99 [0;31m                [0mx_cont[0m [0;34m=[0m [0mx_cont[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m                [0my[0m [0;34m=[0m [0my[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 101 [0;31m            [0my_list[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0my[0m[0;34m.[0m[0mdetach[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    102 [0;31m            [0;31m# maybe autocast goes here?[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    103 [0;31m            [0;32mif[0m [0mamp[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> ll
[1;32m     59 [0mdef process_epoch(
[1;32m     60 [0m    [0mdataloader[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[1;3

ipdb> a
dataloader = <merlin.dataloader.torch.Loader object at 0x7f61103b3ee0>
model = Model(
  (initial_cat_layer): ConcatenatedEmbeddings(
    (embedding_layers): ModuleList(
      (0): Embedding(162542, 512)
      (1): Embedding(56635, 512)
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (mh_cat_layer): MultiHotEmbeddings(
    (embedding_layers): ModuleList(
      (0): EmbeddingBag(21, 16, mode=sum)
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (initial_cont_layer): BatchNorm1d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=1040, out_features=128, bias=True)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): Dropout(p=0.0, inplace=False)
    )
    (1): Sequential(
      (0): Linear(in_features=128, out_features=128, bias=True)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(128, eps=1

In [None]:
%debug

> [0;32m/usr/local/lib/python3.8/dist-packages/nvtabular/framework_utils/torch/utils.py[0m(101)[0;36mprocess_epoch[0;34m()[0m
[0;32m     99 [0;31m                [0mx_cont[0m [0;34m=[0m [0mx_cont[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m                [0my[0m [0;34m=[0m [0my[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mdevice[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 101 [0;31m            [0my_list[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0my[0m[0;34m.[0m[0mdetach[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    102 [0;31m            [0;31m# maybe autocast goes here?[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    103 [0;31m            [0;32mif[0m [0mamp[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> y_list
[]
ipdb> y
ipdb> ll
[1;32m     59 [0mdef process_epoch(
[1;32m     60 [0m    [0mdataloader[0m[0;34m,[0m[0;34m

ipdb> batch


In [18]:
%%time
from time import time
EPOCHS = 1
for epoch in range(EPOCHS):
    start = time()
    train_loss, y_pred, y = process_epoch(train_loader,
                                          model,
                                          train=True,
                                          optimizer=optimizer)
    valid_loss, y_pred, y = process_epoch(valid_loader,
                                          model,
                                          train=False)
    print(f"Epoch {epoch:02d}. Train loss: {train_loss:.4f}. Valid loss: {valid_loss:.4f}.")
t_final = time() - start
total_rows = train_dataset.num_rows_processed + valid_dataset.num_rows_processed
print(
    f"run_time: {t_final} - rows: {total_rows * EPOCHS} - epochs: {EPOCHS} - dl_thru: {(total_rows * EPOCHS) / t_final}"
)

Total batches: 610
Total batches: 152
Epoch 00. Train loss: 0.1909. Valid loss: 0.1693.
run_time: 17.438128232955933 - rows: 2292 - epochs: 1 - dl_thru: 131.43612487425113
CPU times: user 16.9 s, sys: 310 ms, total: 17.2 s
Wall time: 17.4 s
