In [1]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [1]:
# Standard library
import os
import shutil

# Dask
from dask_cloudprovider.aws import EC2Cluster
from dask.distributed import Client

# NVTabular (referenced below as `nvt` and `ops`)
import nvtabular as nvt
from nvtabular import ops

# Amazon Web Services (AWS) - NVTabular Criteo Example

## Dependencies and Authentication

Make sure dependencies are installed: `pip install s3fs`, `pip install dask-cloudprovider[aws]`
    
Configure credentials (`aws configure`)

## Starting Cluster

Cluster configuration

In [2]:
# Options unpacked into EC2Cluster(**cluster_config) when the cluster is started.
cluster_config = dict(
    # --- AWS ---
    region="us-east-1",
    instance_type="p3.2xlarge",  # one V100 GPU per instance
    filesystem_size=1000,
    # --- RAPIDS ---
    docker_image="rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.8",
    worker_class="dask_cuda.CUDAWorker",
    # --- Dask / Python ---
    n_workers=1,
    # NOTE(review): presumably the RAPIDS image pip-installs these at startup -- confirm
    env_vars=dict(EXTRA_PIP_PACKAGES="s3fs"),
)

Start the cluster and wait until the GPU worker is ready

In [3]:
# Launch the EC2-backed Dask cluster, unpacking cluster_config as keyword arguments
cluster = EC2Cluster(**cluster_config)
# Bare expression as the last line so the notebook displays the cluster object
cluster

Creating scheduler instance
Created instance i-09b0f8bd001ea2feb as dask-7958ff92-scheduler
Waiting for scheduler to run
Scheduler is running


  next(self.gen)


Creating worker instance
Created instance i-0fa8b91d2fb063975 as dask-7958ff92-worker-5f7d4d80


VBox(children=(HTML(value='<h2>EC2Cluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .d…

In [None]:
# Connect a Dask client to the cluster's scheduler
client = Client(cluster)
# Block until the configured number of workers has registered. The count is read
# from cluster_config instead of being hard-coded so it cannot drift from the
# cluster definition (the configured p3.2xlarge nodes run one GPU worker each).
client.wait_for_workers(cluster_config["n_workers"])
# Bare expression as the last line so the notebook displays the client summary
client


+-------------+--------+-----------+---------+
| Package     | client | scheduler | workers |
+-------------+--------+-----------+---------+
| distributed | 2.30.0 | 2.30.1    | None    |
| tornado     | 6.0.4  | 6.1       | None    |
+-------------+--------+-----------+---------+


## Running NVTabular Criteo Benchmark

### Dataset and Data schema

In [None]:
# Location of the input parquet dataset (a GCS URL).
# NOTE(review): only s3fs is installed by this notebook's setup, while a gs:// URL
# presumably needs gcsfs -- confirm the intended bucket for the AWS run.
input_path = "gs://merlin-datasets/crit_int_pq/"

# Local directories for Dask scratch space, processed output and category stats
BASE_DIR = "/raid/criteo/tests/"
dask_workdir, output_path, stats_path = (
    os.path.join(BASE_DIR, rel)
    for rel in ("test_dask/workdir", "test_dask/output", "test_dask/stats")
)

# Days 0 .. NUM_TRAIN_DAYS-1 are used for training; the remaining days up to
# NUM_DAYS are used for validation
NUM_DAYS = 24
NUM_TRAIN_DAYS = 23

# Dataset schema: I1..I13 continuous, C1..C26 categorical, plus the label column
CONTINUOUS_COLUMNS = ['I{}'.format(idx) for idx in range(1, 14)]
CATEGORICAL_COLUMNS = ['C{}'.format(idx) for idx in range(1, 27)]
LABEL_COLUMNS = ['label']
COLUMNS = [*CONTINUOUS_COLUMNS, *CATEGORICAL_COLUMNS, *LABEL_COLUMNS]

def _reset_dir(path):
    """Remove `path` if it is an existing directory, then recreate it empty.

    os.makedirs is used so that missing parent directories are created too;
    each directory can therefore be reset independently of the others.
    """
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)


# Make sure we have a clean Dask worker space, stats space and output path
for _dir in (dask_workdir, stats_path, output_path):
    _reset_dir(_dir)

In [None]:
# One parquet file per day: the first NUM_TRAIN_DAYS days form the training set,
# the days from NUM_TRAIN_DAYS up to NUM_DAYS form the validation set
fname = 'day_{}.parquet'
train_paths = [
    os.path.join(input_path, name)
    for name in map(fname.format, range(NUM_TRAIN_DAYS))
]
valid_paths = [
    os.path.join(input_path, name)
    for name in map(fname.format, range(NUM_TRAIN_DAYS, NUM_DAYS))
]
print(train_paths, valid_paths, sep="\n")

### Create and run Workflow

In [None]:
# Build the NVTabular workflow over the Criteo schema, executing on the Dask client
proc = nvt.Workflow(
    client=client,
    label_name=LABEL_COLUMNS,
    cont_names=CONTINUOUS_COLUMNS,
    cat_names=CATEGORICAL_COLUMNS,
)

# Continuous columns: FillMissing, then Clip with a lower bound of 0, then LogOp
proc.add_cont_feature(
    [
        ops.FillMissing(),
        ops.Clip(min_value=0),
        ops.LogOp(),
    ]
)

# Categorical columns: Categorify, with out_path pointing at stats_path
# NOTE(review): freq_threshold=15 presumably groups rare categories -- confirm
proc.add_cat_preprocess(
    ops.Categorify(out_path=stats_path, freq_threshold=15)
)

In [None]:
# Partition size handed to nvt.Dataset. This name was referenced but never
# defined, so the cell raised NameError on a fresh kernel.
# NOTE(review): "1GB" is a conservative placeholder -- tune it to the memory of
# the worker GPU.
part_size = "1GB"

# Create data iterators
train_dataset = nvt.Dataset(train_paths, engine='parquet', part_size=part_size)
valid_dataset = nvt.Dataset(valid_paths, engine='parquet', part_size=part_size)

# Create output dirs (pure-Python equivalent of `mkdir -p`)
output_train_dir = os.path.join(output_path, 'train/')
output_valid_dir = os.path.join(output_path, 'valid/')
os.makedirs(output_train_dir, exist_ok=True)
os.makedirs(output_valid_dir, exist_ok=True)

In [None]:
%%time
proc.apply(train_dataset, shuffle=nvt.io.Shuffle.PER_PARTITION, output_path=output_train_dir, out_files_per_proc=5)

In [None]:
%%time
proc.apply(train_dataset, shuffle=nvt.io.Shuffle.PER_PARTITION, output_path=output_train_dir, out_files_per_proc=5)

## Stop Cluster

Close the client and cluster so resources in AWS are released

In [None]:
# Close the client connection first, then close the cluster itself
client.close()
cluster.close()