In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
import os
from time import time
import re
import glob
import warnings

# tools for data preproc/loading
import torch
import rmm
import nvtabular as nvt
from nvtabular.ops import Normalize,  Categorify,  LogOp, FillMissing, Clip, get_embedding_sizes
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader
from nvtabular.utils import device_mem_size

# tools for training

Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('https://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice/.

For more information about alternatives visit: ('https://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')


In [3]:
def env_flag(name, default):
    """Read a boolean switch from the environment.

    Parameters
    ----------
    name : str
        Environment variable to look up.
    default : bool
        Value returned when the variable is unset or blank.

    Returns
    -------
    bool
        True only when the value (case-insensitive, surrounding whitespace
        ignored) is one of "1", "true", "yes", "y" or "on".
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    # Compare against whole tokens; a substring test such as `value in "true"`
    # would also accept "", "t", "rue", ...
    return raw.strip().lower() in ("1", "true", "yes", "y", "on")


# define some information about where to get our data
INPUT_DATA_DIR = os.environ.get('INPUT_DATA_DIR', '/raid/criteo/tests/crit_int_pq')
OUTPUT_DATA_DIR = os.environ.get('OUTPUT_DATA_DIR', '/raid/criteo/tests/test_dask')  # where we'll save our processed data to
BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 32768))
AMP = env_flag("AMP", True)
PARTS_PER_CHUNK = int(os.environ.get('PARTS_PER_CHUNK', 2))
# Parsed to a real bool: the raw environment string "False" (or "0") is truthy
# and would otherwise switch shuffling on.
SHUFFLE = env_flag("SHUFFLE", False)
NUM_TRAIN_DAYS = 23 # number of days worth of data to use for training, the rest will be used for validation

# define our dataset schema
CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)]     # I1 .. I13
CATEGORICAL_COLUMNS =  ['C' + str(x) for x in range(1,27)]   # C1 .. C26
LABEL_COLUMNS = ['label']
COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS + LABEL_COLUMNS

In [4]:
# Directories holding the train / validation parquet files.
output_train_dir = os.path.join(OUTPUT_DATA_DIR, 'train/')
output_valid_dir = os.path.join(OUTPUT_DATA_DIR, 'valid/')
# Create them from Python rather than via an unquoted `! mkdir -p $var`
# shell-out, which breaks on paths containing spaces or shell metacharacters.
os.makedirs(output_train_dir, exist_ok=True)
os.makedirs(output_valid_dir, exist_ok=True)

In [5]:
# rmm.reinitialize(pool_allocator=True, initial_pool_size=0.2 * device_mem_size(kind='free'))

In [6]:
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# that slices such as `train_paths[:1]` select the same file on every run.
train_paths = sorted(glob.glob(os.path.join(output_train_dir, "*.parquet")))
valid_paths = sorted(glob.glob(os.path.join(output_valid_dir, "*.parquet")))


In [7]:
# TorchAsyncItr is already imported in the notebook's import cell, so it is
# not re-imported here.

def _make_dataset(paths):
    """Wrap the given parquet files in an `nvt.Dataset`.

    The partition memory fraction is 0.04 divided by PARTS_PER_CHUNK.
    """
    return nvt.Dataset(paths, engine="parquet", part_mem_fraction=0.04/PARTS_PER_CHUNK)


def _make_loader(dataset):
    """Build a `TorchAsyncItr` over `dataset` using the notebook-wide settings.

    Train and validation loaders share every option (batch size, column
    roles, parts per chunk, shuffle flag); only the dataset differs.
    """
    return TorchAsyncItr(
        dataset,
        batch_size=BATCH_SIZE,
        cats=CATEGORICAL_COLUMNS,
        conts=CONTINUOUS_COLUMNS,
        labels=LABEL_COLUMNS,
        parts_per_chunk=PARTS_PER_CHUNK,
        shuffle=SHUFFLE,
    )


# Only the first file of each split is loaded.
train_data = _make_dataset(train_paths[:1])
valid_data = _make_dataset(valid_paths[:1])
train_loader = _make_loader(train_data)
valid_loader = _make_loader(valid_data)


In [8]:
# Model / optimisation hyperparameters.
EMBEDDING_DROPOUT_RATE = 0.04
DROPOUT_RATES = [0.001, 0.01]
HIDDEN_DIMS = [1000, 500]
LEARNING_RATE = 0.001
EPOCHS = 1

# Per-column embedding table shapes, keyed by categorical column name.
# Presumably each tuple is (number of rows, embedding width) -- TODO confirm
# against the Model implementation.
# Entries are kept in lexicographic key order (C1, C10, C11, ..., C9).
embeddings = dict(
    C1=(7_599_500, 16),
    C10=(5_345_303, 16),
    C11=(561_810, 16),
    C12=(242_827, 16),
    C13=(11, 6),
    C14=(2_209, 16),
    C15=(10_616, 16),
    C16=(100, 16),
    C17=(4, 3),
    C18=(968, 16),
    C19=(15, 7),
    C2=(33_521, 16),
    C20=(7_838_519, 16),
    C21=(2_580_502, 16),
    C22=(6_878_028, 16),
    C23=(298_771, 16),
    C24=(11_951, 16),
    C25=(97, 16),
    C26=(35, 12),
    C3=(17_022, 16),
    C4=(7_339, 16),
    C5=(20_046, 16),
    C6=(4, 3),
    C7=(7_068, 16),
    C8=(1_377, 16),
    C9=(63, 16),
)

In [9]:
from nvtabular.framework_utils.torch.models import Model
from nvtabular.framework_utils.torch.utils import process_epoch
# Instantiate the network from the hyperparameters defined above ...
model = Model(
    embedding_table_shapes=embeddings,
    num_continuous=len(CONTINUOUS_COLUMNS),
    emb_dropout=EMBEDDING_DROPOUT_RATE,
    layer_hidden_dims=HIDDEN_DIMS,
    layer_dropout_rates=DROPOUT_RATES,
)
# ... then place it on the CUDA device.
model = model.to('cuda')

In [10]:
# Adam over all model parameters, stepping at LEARNING_RATE.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
def rmspe_func(y_pred, y):
    """Return y_pred and y to non-log space and compute RMSPE.

    Both inputs are mapped back with ``expm1`` (i.e. ``exp(x) - 1``), then the
    root mean squared percentage error ``sqrt(mean(((y_pred - y) / y) ** 2))``
    is computed.

    Entries whose target is exactly zero after the inverse transform are
    excluded: their percentage error is undefined, and dividing by them would
    turn the whole metric into inf/NaN.

    Parameters
    ----------
    y_pred : torch.Tensor
        Predictions in log1p space.
    y : torch.Tensor
        Targets in log1p space; must be broadcastable with ``y_pred``.

    Returns
    -------
    torch.Tensor
        Zero-dimensional tensor holding the RMSPE. It is NaN when every
        target is zero (or the inputs are empty), because the mean of an
        empty selection is NaN.
    """
    # expm1 computes exp(x) - 1 without the cancellation error of the naive
    # form for values near zero.
    y_pred, y = torch.expm1(y_pred), torch.expm1(y)
    pct_var = (y_pred - y) / y
    # Drop positions with a zero denominator; expand_as keeps the mask valid
    # when the two inputs were broadcast against each other.
    keep = (y != 0).expand_as(pct_var)
    pct_var = pct_var[keep]
    return (pct_var**2).mean().pow(0.5)

In [11]:
def batch_transform(batch, cont_cols=CONTINUOUS_COLUMNS, cat_cols=CATEGORICAL_COLUMNS, label_cols=LABEL_COLUMNS):
    """Split a dataframe-like batch into CUDA tensors.

    Returns a 3-tuple ``(x_cat, x_cont, y)``:
    - ``x_cat``: the categorical columns, taken in sorted name order, as a
      long tensor;
    - ``x_cont``: the continuous columns, in the given order, as a float
      tensor;
    - ``y``: the first label column as a float tensor.

    Assumes ``batch`` supports column selection with a list of names and
    exposes ``.values`` -- TODO confirm the concrete dataframe type.
    """
    def _as_cuda(values, tensor_type):
        # Build the tensor, cast it to the requested type, then move it to GPU.
        return torch.tensor(values).type(tensor_type).cuda()

    categorical = _as_cuda(batch[sorted(cat_cols)].values, torch.LongTensor)
    continuous = _as_cuda(batch[cont_cols].values, torch.FloatTensor)
    target = _as_cuda(batch[label_cols[0]].values, torch.FloatTensor)
    return categorical, continuous, target

In [12]:
def _run_timed_epoch(loader, train):
    """Run one `process_epoch` pass over `loader` and time it.

    Parameters
    ----------
    loader
        Batch iterator handed to `process_epoch`.
    train : bool
        Forwarded as `process_epoch(train=...)`.

    Returns
    -------
    tuple
        ``(loss, rmspe, seconds)``, where ``seconds`` covers both the
        `process_epoch` call and the RMSPE computation.

    The prediction / target tensors returned by `process_epoch` stay local to
    this function, so they are released as soon as it returns.
    """
    started = time()
    loss, y_pred, y = process_epoch(
        loader,
        model,
        train=train,
        optimizer=optimizer,
        # Mixed precision is explicitly off here; the AMP config flag is not
        # consulted.
        amp=False,
    )
    rmspe = rmspe_func(y_pred, y)
    return loss, rmspe, time() - started


for epoch in range(EPOCHS):
    train_loss, train_rmspe, train_time = _run_timed_epoch(train_loader, train=True)
    valid_loss, valid_rmspe, valid_time = _run_timed_epoch(valid_loader, train=False)
    print(f"Train:{train_time} + Valid:{valid_time} = EpochTotal:{train_time + valid_time}")
    # Report the metrics as well; they were previously computed and discarded.
    print(
        f"Epoch {epoch:02d}. Train loss: {train_loss}. Train RMSPE: {train_rmspe}. "
        f"Valid loss: {valid_loss}. Valid RMSPE: {valid_rmspe}."
    )

> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(97)[0;36m_to_tensor[0;34m()[0m
[0;32m     95 [0;31m    [0;32mdef[0m [0m_to_tensor[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mgdf[0m[0;34m,[0m [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     96 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 97 [0;31m        [0;32mif[0m [0mgdf[0m[0;34m.[0m[0mempty[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     98 [0;31m            [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(99)[0;36m_to_tensor[0;34m()[0m
[0;32m     97 [0;31m        [0;32mif[0m [0mgdf[0m[0;34m.[0m[0mempty[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     98 [0;31m            [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m        [0mtens[0m [0;34m=[0m [0mfrom_dlpack[0m[0;34m([0m[0mdl_pack[0m[0;34m)[0m[0;34m.[0m[0mtype[0m[0;34m([0m[0mdtype[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    101 [0;31m        [0;32mreturn[0m [0mtens[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(100)[0;36m_to_tensor[0;34m()[0m
[0;32m     98 [0;31m            [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 100 [0;31m        [0mtens[0m [0;34m=[0m [0mfrom_dlpack[0m[0;34m([0m[0mdl_pack[0m[0;34m)[0m[0;34m.[0m[0mtype[0m[0;34m([0m[0mdtype[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    101 [0;31m        [0;32mreturn[0m [0mtens[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    102 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(101)[0;36m_to_tensor[0;34m()[0m
[0;32m     99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m        [0mtens[0m [0;34m=[0m [0mfrom_dlpack[0m[0;34m([0m[0mdl_pack[0m[0;34m)[0m[0;34m.[0m[0mtype[0m[0;34m([0m[0mdtype[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 101 [0;31m        [0;32mreturn[0m [0mtens[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    102 [0;31m[0;34m[0m[0m
[0m[0;32m    103 [0;31m    [0;31m# TODO: do we need casting or can we replace this with[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


--Return--
tensor([[1353...vice='cuda:0')
> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(101)[0;36m_to_tensor[0;34m()[0m
[0;32m     99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m        [0mtens[0m [0;34m=[0m [0mfrom_dlpack[0m[0;34m([0m[0mdl_pack[0m[0;34m)[0m[0;34m.[0m[0mtype[0m[0;34m([0m[0mdtype[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 101 [0;31m        [0;32mreturn[0m [0mtens[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    102 [0;31m[0;34m[0m[0m
[0m[0;32m    103 [0;31m    [0;31m# TODO: do we need casting or can we replace this with[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  c


> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(97)[0;36m_to_tensor[0;34m()[0m
[0;32m     95 [0;31m    [0;32mdef[0m [0m_to_tensor[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mgdf[0m[0;34m,[0m [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     96 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 97 [0;31m        [0;32mif[0m [0mgdf[0m[0;34m.[0m[0mempty[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     98 [0;31m            [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(99)[0;36m_to_tensor[0;34m()[0m
[0;32m     97 [0;31m        [0;32mif[0m [0mgdf[0m[0;34m.[0m[0mempty[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     98 [0;31m            [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m        [0mtens[0m [0;34m=[0m [0mfrom_dlpack[0m[0;34m([0m[0mdl_pack[0m[0;34m)[0m[0;34m.[0m[0mtype[0m[0;34m([0m[0mdtype[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    101 [0;31m        [0;32mreturn[0m [0mtens[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(100)[0;36m_to_tensor[0;34m()[0m
[0;32m     98 [0;31m            [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 100 [0;31m        [0mtens[0m [0;34m=[0m [0mfrom_dlpack[0m[0;34m([0m[0mdl_pack[0m[0;34m)[0m[0;34m.[0m[0mtype[0m[0;34m([0m[0mdtype[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    101 [0;31m        [0;32mreturn[0m [0mtens[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    102 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(101)[0;36m_to_tensor[0;34m()[0m
[0;32m     99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m        [0mtens[0m [0;34m=[0m [0mfrom_dlpack[0m[0;34m([0m[0mdl_pack[0m[0;34m)[0m[0;34m.[0m[0mtype[0m[0;34m([0m[0mdtype[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 101 [0;31m        [0;32mreturn[0m [0mtens[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    102 [0;31m[0;34m[0m[0m
[0m[0;32m    103 [0;31m    [0;31m# TODO: do we need casting or can we replace this with[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


--Return--
<Tensor instance at 0x7f4640a4d230>
> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(101)[0;36m_to_tensor[0;34m()[0m
[0;32m     99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m        [0mtens[0m [0;34m=[0m [0mfrom_dlpack[0m[0;34m([0m[0mdl_pack[0m[0;34m)[0m[0;34m.[0m[0mtype[0m[0;34m([0m[0mdtype[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 101 [0;31m        [0;32mreturn[0m [0mtens[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    102 [0;31m[0;34m[0m[0m
[0m[0;32m    103 [0;31m    [0;31m# TODO: do we need casting or can we replace this with[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  c


> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(97)[0;36m_to_tensor[0;34m()[0m
[0;32m     95 [0;31m    [0;32mdef[0m [0m_to_tensor[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mgdf[0m[0;34m,[0m [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     96 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 97 [0;31m        [0;32mif[0m [0mgdf[0m[0;34m.[0m[0mempty[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     98 [0;31m            [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(99)[0;36m_to_tensor[0;34m()[0m
[0;32m     97 [0;31m        [0;32mif[0m [0mgdf[0m[0;34m.[0m[0mempty[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     98 [0;31m            [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m        [0mtens[0m [0;34m=[0m [0mfrom_dlpack[0m[0;34m([0m[0mdl_pack[0m[0;34m)[0m[0;34m.[0m[0mtype[0m[0;34m([0m[0mdtype[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    101 [0;31m        [0;32mreturn[0m [0mtens[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


MemoryError: std::bad_alloc: CUDA error at: ../include/rmm/mr/device/cuda_memory_resource.hpp:68: cudaErrorIllegalAddress an illegal memory access was encountered
> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(99)[0;36m_to_tensor[0;34m()[0m
[0;32m     97 [0;31m        [0;32mif[0m [0mgdf[0m[0;34m.[0m[0mempty[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     98 [0;31m            [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m        [0mtens[0m [0;34m=[0m [0mfrom_dlpack[0m[0;34m([0m[0mdl_pack[0m[0;34m)[0m[0;34m.[0m[0mtype[0m[0;34m([0m[0mdtype[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    101 [0;31m        [0;32mreturn[0m [0mtens[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


--Return--
None
> [0;32m/jp_docs/nvtabular/nvtabular/loader/torch.py[0m(99)[0;36m_to_tensor[0;34m()[0m
[0;32m     97 [0;31m        [0;32mif[0m [0mgdf[0m[0;34m.[0m[0mempty[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     98 [0;31m            [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 99 [0;31m        [0mdl_pack[0m [0;34m=[0m [0mgdf[0m[0;34m.[0m[0mto_dlpack[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    100 [0;31m        [0mtens[0m [0;34m=[0m [0mfrom_dlpack[0m[0;34m([0m[0mdl_pack[0m[0;34m)[0m[0;34m.[0m[0mtype[0m[0;34m([0m[0mdtype[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    101 [0;31m        [0;32mreturn[0m [0mtens[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  q


BdbQuit: 