In [1]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [7]:
# Standard library
import os
import shutil

# Dask
from dask.distributed import Client
from dask_cloudprovider.gcp import GCPCluster

# NVTabular (used by the workflow cells below)
import nvtabular as nvt
from nvtabular.ops import Categorify, Clip, FillMissing, LogOp, Normalize

ModuleNotFoundError: No module named 'nvtabular'

# Google Cloud Platform - NVTabular Criteo Example 

## Dependencies and Authentication

Make sure dependencies are installed: `pip install gcsfs`, `pip install dask-cloudprovider[gcp]`
    
Configure credentials (`gcloud auth login`) and set project (`gcloud config set project <project>`).

## Starting Cluster

Cluster configuration

In [3]:
# Keyword arguments that are unpacked into GCPCluster(**cluster_config).
cluster_config = dict(
    # --- GCP instance settings ---
    projectid="merlin-295819",
    zone="us-east1-c",
    machine_type="n1-standard-8",
    gpu_type="nvidia-tesla-v100",
    ngpus=1,
    filesystem_size=10000,
    # --- Container image and worker class (RAPIDS / NVTabular) ---
    docker_image="nvcr.io/nvidia/nvtabular:0.3",
    worker_class="dask_cuda.CUDAWorker",
    # --- Dask / Python settings ---
    n_workers=1,
    # NOTE(review): only s3fs is requested here although the input data is
    # read from a gs:// bucket -- confirm the image already provides gcsfs.
    env_vars=dict(EXTRA_PIP_PACKAGES="s3fs", EXTRA_CONDA_PACKAGES="distributed"),
)

Start the cluster and wait until the GPU worker is ready (the configuration above requests one worker with one GPU)

In [4]:
# Launch the cluster described by cluster_config; the recorded output shows a
# scheduler instance and a worker instance being created.
# NOTE(review): this presumably starts billable GCP VMs -- make sure the
# cluster is closed when the run is finished.
cluster = GCPCluster(**cluster_config)
# Bare expression so the notebook renders the cluster object.
cluster

Launching cluster with the following configuration: 
  Source Image: projects/ubuntu-os-cloud/global/images/ubuntu-minimal-1804-bionic-v20201014 
  Docker Image: rapidsai/rapidsai:cuda11.0-runtime-ubuntu18.04-py3.8 
  Machine Type: n1-standard-8 
  Filesytsem Size: 10000 
  N-GPU Type: 1 nvidia-tesla-v100
  Zone: us-east1-c 
Creating scheduler instance
dask-407e2fe8-scheduler
	Internal IP: 10.142.0.12
	External IP: 35.243.231.24
Waiting for scheduler to run
Scheduler is running


  next(self.gen)


Creating worker instance
dask-407e2fe8-worker-90d99513
	Internal IP: 10.142.0.13
	External IP: 34.75.31.97


VBox(children=(HTML(value='<h2>GCPCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .d…

In [5]:
# Connect a Dask client to the cluster created above.
client = Client(cluster)
# Block until one worker has registered: cluster_config requests n_workers=1
# with ngpus=1, i.e. a single GPU worker.
client.wait_for_workers(1)
# Bare expression so the notebook renders the client summary.
client


+-------------+--------+-----------+---------+
| Package     | client | scheduler | workers |
+-------------+--------+-----------+---------+
| distributed | 2.30.0 | 2.30.1    | None    |
| tornado     | 6.0.4  | 6.1       | None    |
+-------------+--------+-----------+---------+


0,1
Client  Scheduler: tcp://35.243.231.24:8786  Dashboard: http://35.243.231.24:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 31.56 GB


## Running NVTabular Criteo Benchmark

### Dataset and Data schema

In [9]:
# Input dataset located in GCS
input_path = "gs://merlin-datasets/crit_int_pq/"

# Output data paths
# NOTE(review): these are local filesystem paths; with remote GCP workers,
# confirm which machine's /tmp the workflow actually reads and writes.
BASE_DIR = "/tmp/"
dask_workdir = os.path.join(BASE_DIR, "test_dask/workdir")
output_path = os.path.join(BASE_DIR, "test_dask/output")
stats_path = os.path.join(BASE_DIR, "test_dask/stats")

# Number of days worth of data to use for training; the rest is used for validation
NUM_DAYS = 24
NUM_TRAIN_DAYS = 23

# Dataset schema: continuous columns I1..I13, categorical columns C1..C26, one label
CONTINUOUS_COLUMNS = ['I' + str(x) for x in range(1, 14)]
CATEGORICAL_COLUMNS = ['C' + str(x) for x in range(1, 27)]
LABEL_COLUMNS = ['label']
COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS + LABEL_COLUMNS

# Start from clean, empty Dask work / stats / output directories.
# os.makedirs (rather than os.mkdir) is used for every directory so that none
# of them depends on another one having created the shared parent first.
for _scratch_dir in (dask_workdir, stats_path, output_path):
    if os.path.isdir(_scratch_dir):
        shutil.rmtree(_scratch_dir)
    os.makedirs(_scratch_dir)

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError


In [None]:
# One parquet file per day: the first NUM_TRAIN_DAYS days are used for
# training, the remaining days up to NUM_DAYS for validation.
fname = 'day_{}.parquet'
train_days = range(NUM_TRAIN_DAYS)
valid_days = range(NUM_TRAIN_DAYS, NUM_DAYS)

train_paths = [os.path.join(input_path, fname.format(d)) for d in train_days]
valid_paths = [os.path.join(input_path, fname.format(d)) for d in valid_days]

# Show both lists, one per line
print(train_paths, valid_paths, sep="\n")

### Create and run Workflow

In [None]:
# Categorical columns are encoded with Categorify; out_path points at the
# stats directory prepared earlier in the notebook (stats_path).
cat_features = CATEGORICAL_COLUMNS >> Categorify(freq_threshold=15, out_path=stats_path)

# Continuous columns: fill missing values, clip at 0, log-transform, normalize.
cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(min_value=0) >> LogOp() >> Normalize()

# Final output = processed categorical + processed continuous + untouched label.
features = cat_features + cont_features + LABEL_COLUMNS

# Create the Workflow and hand it the Dask client so the work is submitted to
# the GCP cluster started above rather than being executed locally.
workflow = nvt.Workflow(features, client=client)

In [None]:
# Size of each Dask partition read from the parquet files.
# NOTE(review): "1GB" is a conservative starting value, not a tuned one --
# adjust it to the memory of the GPU type selected in cluster_config.
part_size = "1GB"

# Create data iterators
train_dataset = nvt.Dataset(train_paths, engine='parquet', part_size=part_size)
valid_dataset = nvt.Dataset(valid_paths, engine='parquet', part_size=part_size)

# Create output dirs (pure Python instead of shelling out to `mkdir -p`)
output_train_dir = os.path.join(output_path, 'train/')
output_valid_dir = os.path.join(output_path, 'valid/')
os.makedirs(output_train_dir, exist_ok=True)
os.makedirs(output_valid_dir, exist_ok=True)

In [None]:
%%time
workflow.transform(train_dataset).to_parquet(output_path=output_train_dir,
                                         shuffle=nvt.io.Shuffle.PER_PARTITION, 
                                         out_files_per_proc=5)

In [None]:
%%time
# Transform the validation files and write the result as parquet into
# output_valid_dir (no shuffle or file-count options are passed here).
# NOTE(review): this assumes the workflow has already been fitted
# (workflow.fit) on the training data -- confirm before running.
workflow.transform(valid_dataset).to_parquet(output_path=output_valid_dir)

## Stop Cluster

Close the client and the cluster so that the resources in GCP are released

In [None]:
# Disconnect the client first, then shut down the cluster it was attached to.
# NOTE(review): cluster.close() is expected to release the GCP instances --
# verify in the cloud console that nothing is left running.
client.close()
cluster.close()