In [1]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
# Dask
from dask_cloudprovider.gcp import GCPCluster
from dask.distributed import Client

import boto3

import os

import nvtabular as nvt
from nvtabular.ops import Normalize,  Categorify,  LogOp, FillMissing, Clip

from nvtabular.utils import get_rmm_size

# Amazon Web Service - NVTabular Criteo Example 

## Dependencies and Authentication

Make sure dependencies are installed: `pip install s3fs`, `pip install dask-cloudprovider[aws]`
    
Configure credentials (`awsconfigure`)

## Starting Cluster

Cluster configuration

In [3]:
cluster_config = {
    # AWS config options
    "region": "us-east-1",
    "instance_type": "p3.2xlarge",  # Has 1 V100s
    "filesystem_size": 1000,
    
    # RAPIDS config options
    "docker_image": "nvcr.io/nvidia/nvtabular:0.3",
    "worker_class": "dask_cuda.CUDAWorker",
    
    # Dask/Python options
    "n_workers": 1,
    "env_vars": {"EXTRA_PIP_PACKAGES": "s3fs", "EXTRA_CONDA_PACKAGES": "distributed"},
}

Start the cluster and wait until the 2 GPUs are ready

In [None]:
cluster = GCPCluster(**cluster_config)
cluster

Launching cluster with the following configuration: 
  Source Image: projects/ubuntu-os-cloud/global/images/ubuntu-minimal-1804-bionic-v20201014 
  Docker Image: nvcr.io/nvidia/nvtabular:0.3 
  Machine Type: n1-standard-8 
  Filesytsem Size: 10000 
  N-GPU Type: 1 nvidia-tesla-v100
  Zone: us-east1-c 
Creating scheduler instance
dask-759aa49e-scheduler
	Internal IP: 10.142.0.14
	External IP: 104.196.12.26
Waiting for scheduler to run


In [None]:
client = Client(cluster)
client.wait_for_workers(1) # because one GPU workers per node
client

## Running NVTabular Criteo Benchmark

In [None]:
def setup_rmm_pool(client, pool_size):
    # Initialize an RMM pool allocator.
    # Note: RMM may require the pool size to be a multiple of 256.
    pool_size = get_rmm_size(pool_size)
    client.run(rmm.reinitialize, pool_allocator=True, initial_pool_size=pool_size)
    return None

### Dataset and Data schema

In [9]:
# Input dataset located in GCS
input_path = "s3://merlin-datasets/crit_int_pq/"
output_path = "s3://merlin-datasets/output"

# Output data paths
BASE_DIR = "output"
dask_workdir = os.path.join(BASE_DIR, "test_dask/workdir")
out_path = os.path.join(BASE_DIR, "test_dask/output")
stats_path = os.path.join(BASE_DIR, "test_dask/stats")

s3 = boto3.resource('s3')
bucket = s3.Bucket('merlin-datasets')

# Make sure we have a clean worker space for Dask
for key in bucket.list(prefix=dask_workdir):
    key.delete()

# Make sure we have a clean stats space for Dask
for key in bucket.list(prefix=stats_path):
    key.delete()

# Make sure we have a clean output path
for key in bucket.list(prefix=out_path):
    key.delete()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError


In [None]:
# number of days worth of data to use for training, the rest will be used for validation
NUM_DAYS = 24
NUM_TRAIN_DAYS = 23

# define our dataset schema
CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1,14)]
CATEGORICAL_COLUMNS =  ['C' + str(x) for x in range(1,27)]
LABEL_COLUMNS = ['label']
COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS + LABEL_COLUMNS

In [None]:
# Training and validation files
fname = 'day_{}.parquet'
train_paths = [os.path.join(input_path, fname.format(day)) for day in range(NUM_TRAIN_DAYS)]
valid_paths = [os.path.join(input_path, fname.format(day)) for day in range(NUM_TRAIN_DAYS, NUM_DAYS)]

### Create and run Workflow

In [None]:
# conts and cats operators
cat_features = CATEGORICAL_COLUMNS >> Categorify(freq_threshold=15, out_path=stats_path)
cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(min_value=0) >> LogOp() >> Normalize()

# Create Workflow
workflow = nvt.Workflow(cat_features + cont_features + LABEL_COLUMNS, client=client)

In [None]:
# Create data iterators
train_dataset = nvt.Dataset(train_paths, engine='parquet', part_size=part_size)
valid_dataset = nvt.Dataset(valid_paths, engine='parquet', part_size=part_size)
# Create output dirs
output_train_dir = os.path.join(output_path, 'train/')
output_valid_dir = os.path.join(output_path, 'valid/')

In [None]:
%%time
workflow.transform(train_dataset).to_parquet(output_path=output_train_dir,
                                         shuffle=nvt.io.Shuffle.PER_PARTITION, 
                                         out_files_per_proc=5)

In [None]:
%%time
workflow.transform(valid_dataset).to_parquet(output_path=output_valid_dir)

## Stop Cluster

Close the client and cluster so resources in GCP are releasd

In [None]:
client.close()
cluster.close()