In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

### Conversion Script for Criteo Dataset (CSV-to-Parquet) 

__Step 1__: Import libraries

In [1]:
import os
import glob

import numpy as np
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

import nvtabular as nvt
from nvtabular.utils import device_mem_size, get_rmm_size

__Step 2__: Specify options

Specify the input and output paths, unless the `INPUT_DATA_DIR` and `OUTPUT_DATA_DIR` environment variables are already set. For multi-GPU systems, check that the `CUDA_VISIBLE_DEVICES` environment variable includes all desired device IDs.

In [2]:
# Dataset locations: each environment variable, when set, overrides the default path.
INPUT_PATH = os.getenv("INPUT_DATA_DIR", "/datasets/criteo/crit_orig")
OUTPUT_PATH = os.getenv("OUTPUT_DATA_DIR", "/raid/criteo/tests/demo_out")

# GPU device IDs handed to the Dask cluster; falls back to "0" when the variable is unset.
CUDA_VISIBLE_DEVICES = os.getenv("CUDA_VISIBLE_DEVICES", "0")

# Value passed as `part_mem_fraction` when the NVTabular Dataset is created.
frac_size = 0.10

__Step 3__: Start a Dask cluster (optional; if you skip this step, also remove the `client=client` argument from the `nvt.Dataset` call below)

In [3]:
# RMM pool size: 0.8 times the value reported by device_mem_size(),
# passed through get_rmm_size before being handed to the cluster.
rmm_pool_bytes = get_rmm_size(0.8 * device_mem_size())

# Dask worker scratch directory, kept under the output path.
dask_workdir = os.path.join(OUTPUT_PATH, "dask-space")

# Launch the local CUDA cluster on the configured devices and connect a client to it.
cluster = LocalCUDACluster(
    CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICES,
    rmm_pool_size=rmm_pool_bytes,
    local_directory=dask_workdir,
)
client = Client(cluster)

__Step 5__: Convert original data to an NVTabular Dataset

In [4]:
# Specify column names: "I1".."I13" (continuous) and "C1".."C26" (categorical),
# preceded by the "label" column.
cont_names = ["I" + str(x) for x in range(1, 14)]
cat_names = ["C" + str(x) for x in range(1, 27)]
cols = ["label"] + cont_names + cat_names

# Specify column dtypes. Note that "hex" means that
# the values will be hexadecimal strings that should
# be converted to int32
dtypes = {"label": np.int32}
dtypes.update({name: np.int32 for name in cont_names})
dtypes.update({name: "hex" for name in cat_names})

# Collect the input CSV files. glob.glob returns matches in arbitrary order,
# so sort them to keep the file order reproducible from run to run.
# (The sort is lexicographic, e.g. "day_10" precedes "day_2"; it is used
# only for determinism.)
file_list = sorted(glob.glob(os.path.join(INPUT_PATH, "day_*")))
if not file_list:
    # Fail early with an actionable message instead of an obscure reader error.
    raise FileNotFoundError(
        "No files matching 'day_*' were found in " + repr(INPUT_PATH)
        + "; check INPUT_PATH / the INPUT_DATA_DIR environment variable."
    )

# Create an NVTabular Dataset from the tab-separated CSV files
dataset = nvt.Dataset(
    file_list,
    engine="csv",
    names=cols,
    part_mem_fraction=frac_size,
    sep='\t',
    dtypes=dtypes,
    client=client,
)

__Step 6__: Write Dataset to Parquet

In [6]:
%%time 

dataset.to_parquet(
    os.path.join(OUTPUT_PATH, "criteo"),
    preserve_files=True,
)

CPU times: user 54 s, sys: 7.09 s, total: 1min 1s
Wall time: 14min 13s
