In [1]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# Preprocessing the Criteo Dataset
We want to benchmark the NVTabular data loader and compare its performance with the TensorFlow "native" data loader based on tf.records. First, we need to preprocess the dataset with NVTabular to normalize the continuous features and categorify the categorical ones. Afterwards, we convert the resulting parquet files into tf.records.<br><br>
The input for this notebook is based on [optimize_criteo.ipynb](https://github.com/NVIDIA/NVTabular/blob/main/examples/optimize_criteo.ipynb).

In [1]:
import os, time
# Expose only GPU 0 to this process. This cell runs before the cell that
# imports the GPU libraries (torch, rmm, nvtabular).
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import os
from time import time
import re
import glob
import warnings

# tools for data preproc/loading
import torch
import rmm
import nvtabular as nvt
from nvtabular.ops import Normalize,  Categorify,  LogOp, FillMissing, Clip, get_embedding_sizes
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader
from nvtabular.utils import device_mem_size

import multiprocessing as mp
from itertools import repeat
from tqdm.notebook import trange

We define multiple helper functions.<br><br>
*preproces_criteo* defines an NVTabular workflow to preprocess the data. For the continuous features, it fills missing values, clips them and applies the logarithm function. Finally, the continuous features are normalized and the categorical features are categorified. For more details, take a look at the [Criteo example](https://github.com/NVIDIA/NVTabular/blob/main/examples/criteo-example.ipynb).<br><br>
*transform_tfrecords* loads the NVTabular output (parquet files) and transforms it into the tf.records format. The function orchestrates the different components: the TFRecordWriter, serializing the data and multi-processing.

In [3]:
### Helper Function

def preproces_criteo():
    """Preprocess the Criteo parquet files with an NVTabular workflow.

    Reads ``day_0.parquet`` (training) and ``day_2.parquet`` (validation) from
    ``INPUT_DATA_DIR``. Continuous columns go through FillMissing,
    Clip(min_value=0) and LogOp and are then normalized; categorical columns
    are categorified with ``freq_threshold=15``.

    Side effects:
        * ``OUTPUT_DATA_DIR`` is deleted and re-created, so any previous
          output is lost.
        * The processed, per-partition shuffled datasets are written to
          ``output_train_dir`` and ``output_valid_dir``.
        * The workflow statistics are saved to
          ``OUTPUT_DATA_DIR + '/stats_and_workflow'``.

    Relies on the notebook-level names INPUT_DATA_DIR, OUTPUT_DATA_DIR,
    output_train_dir, output_valid_dir, CATEGORICAL_COLUMNS,
    CONTINUOUS_COLUMNS and LABEL_COLUMNS being defined before it is called.
    """
    # Local import: only needed here, for clearing the output directory.
    import shutil

    fname = 'day_{}.parquet'
    # Only a small, fixed subset of days is used: day 0 for training and
    # day 2 for validation.
    train_paths = [os.path.join(INPUT_DATA_DIR, fname.format(day)) for day in range(1)]
    valid_paths = [os.path.join(INPUT_DATA_DIR, fname.format(day)) for day in [2]]

    proc = nvt.Workflow(
        cat_names=CATEGORICAL_COLUMNS,
        cont_names=CONTINUOUS_COLUMNS,
        label_name=LABEL_COLUMNS
    )

    proc.add_cont_feature([FillMissing(), Clip(min_value=0), LogOp()])
    proc.add_cont_preprocess(Normalize())
    proc.add_cat_preprocess(Categorify(freq_threshold=15, out_path=OUTPUT_DATA_DIR))

    train_dataset = nvt.Dataset(train_paths, engine='parquet', part_mem_fraction=0.15)
    valid_dataset = nvt.Dataset(valid_paths, engine='parquet', part_mem_fraction=0.15)

    # Start from a clean output tree. Using shutil/os instead of shell strings
    # keeps this working when the (env-configurable) path contains spaces or
    # shell metacharacters.
    shutil.rmtree(OUTPUT_DATA_DIR, ignore_errors=True)
    os.makedirs(output_train_dir, exist_ok=True)
    os.makedirs(output_valid_dir, exist_ok=True)

    proc.apply(train_dataset,
               shuffle=nvt.io.Shuffle.PER_PARTITION,
               output_path=output_train_dir,
               out_files_per_proc=20
              )

    # record_stats=False: presumably reuses the statistics collected on the
    # training data instead of recomputing them -- confirm against NVTabular docs.
    proc.apply(valid_dataset,
               record_stats=False,
               shuffle=nvt.io.Shuffle.PER_PARTITION,
               output_path=output_valid_dir,
               out_files_per_proc=20
              )

    proc.save_stats(OUTPUT_DATA_DIR + '/stats_and_workflow')

def transform_tfrecords(parquet_files, TFRECORDS, num_workers=8):
    """Convert preprocessed parquet files into tf.records files.

    Args:
        parquet_files: Glob pattern matching the parquet files to convert.
        TFRECORDS: Glob pattern of the tf.records files to produce. Existing
            paths matching it are deleted first; its directory part is where
            the new files are written.
        num_workers: Number of worker processes that serialize examples.

    At most ``BATCH_SIZE * STEPS`` examples are written. A new output file is
    started every ``EXAMPLES_PER_RECORD`` examples.

    Relies on the notebook-level names BATCH_SIZE, STEPS, EXAMPLES_PER_RECORD,
    CONTINUOUS_COLUMNS, CATEGORICAL_COLUMNS and LABEL_COLUMNS, and on the
    helpers get_writer, pool_initializer and build_and_serialize_example.
    """
    # Local import: only used to clear stale directories below.
    import shutil

    # Remove the outputs of a previous run. TFRECORDS is a glob pattern, so it
    # is expanded here rather than passed unquoted to a shell.
    for stale_path in glob.glob(TFRECORDS):
        if os.path.isdir(stale_path) and not os.path.islink(stale_path):
            shutil.rmtree(stale_path)
        else:
            os.remove(stale_path)

    write_dir = os.path.dirname(TFRECORDS)
    if write_dir:
        os.makedirs(write_dir, exist_ok=True)

    max_examples = BATCH_SIZE * STEPS
    # Two column groups, in the order pool_initializer expects them:
    # continuous columns first, then categorical columns plus the label.
    column_names = [CONTINUOUS_COLUMNS, CATEGORICAL_COLUMNS+[LABEL_COLUMNS[0]]]

    file_idx, example_idx, written = 0, 0, 0
    writer = get_writer(write_dir, file_idx)
    try:
        with mp.Pool(num_workers, pool_initializer, column_names) as pool:
            fnames = glob.glob(parquet_files)
            ds_iterator = nvt.Dataset(fnames, part_mem_fraction=0.1)
            pbar = trange(max_examples)

            limit_reached = False
            for df in ds_iterator.to_iter():
                data = []
                for col_names in column_names:
                    if len(col_names) == 0:
                        # Empty group: feed None so the serializer skips it.
                        data.append(repeat(None))
                    else:
                        data.append(df[col_names].to_pandas().values)
                # One (numeric_values, categorical_values) tuple per row.
                rows = zip(*data)

                record_map = pool.imap(build_and_serialize_example, rows, chunksize=200)
                for record in record_map:
                    writer.write(record)
                    example_idx += 1
                    written += 1

                    if example_idx == EXAMPLES_PER_RECORD:
                        # Current file is full: roll over to the next one.
                        writer.close()
                        file_idx += 1
                        writer = get_writer(write_dir, file_idx)
                        example_idx = 0
                    pbar.update(1)
                    # Count explicitly instead of reading the progress bar's
                    # internal counter to decide when to stop.
                    if written == max_examples:
                        limit_reached = True
                        break
                if limit_reached:
                    break
    finally:
        # Close the open file even if reading or serializing raised.
        writer.close()

def pool_initializer(num_cols, cat_cols):
    """Store the column-name lists as module globals.

    Passed to ``mp.Pool`` as its initializer; build_and_serialize_example
    reads the two globals set here.

    Args:
        num_cols: Names of the numeric columns.
        cat_cols: Names of the categorical columns.
    """
    global numeric_columns, categorical_columns
    numeric_columns, categorical_columns = num_cols, cat_cols

def build_and_serialize_example(data):
    """Build a ``tf.train.Example`` from one row and return it serialized.

    Args:
        data: A ``(numeric_values, categorical_values)`` pair. Either element
            may be ``None``, in which case that group is left out.

    Values are paired by position with the globals ``numeric_columns`` and
    ``categorical_columns`` set by pool_initializer. Numeric values become
    float features, categorical values become int64 features.

    Returns:
        The result of ``SerializeToString()`` on the built example.
    """
    numeric_values, categorical_values = data
    feature = {}

    if numeric_values is not None:
        for name, value in zip(numeric_columns, numeric_values):
            float_list = tf.train.FloatList(value=[value])
            feature[name] = tf.train.Feature(float_list=float_list)

    if categorical_values is not None:
        for name, value in zip(categorical_columns, categorical_values):
            int64_list = tf.train.Int64List(value=[value])
            feature[name] = tf.train.Feature(int64_list=int64_list)

    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example.SerializeToString()

def get_writer(write_dir, file_idx):
    """Open a TFRecordWriter for the ``file_idx``-th output file.

    The file is named with the index zero-padded to five characters, e.g.
    ``00003.tfrecords``, and placed inside ``write_dir``.
    """
    record_path = os.path.join(
        write_dir, '{}.tfrecords'.format(str(file_idx).zfill(5))
    )
    return tf.io.TFRecordWriter(record_path)

First, we define the directory structure: the base directory and the input and output directories for the .parquet and tf.records files.

In [4]:
# Locations of the raw input and of everything this notebook writes.
INPUT_DIR = '/raid/data/criteo/input/'
OUTPUT_DIR = '/raid/data/criteo/'
INPUT_DATA_DIR = INPUT_DIR

# Both output locations can be overridden through environment variables.
OUTPUT_DATA_DIR = os.getenv('OUTPUT_DATA_DIR', os.path.join(OUTPUT_DIR, 'output'))  # processed parquet data
TFRECORD_DIR = os.getenv("TFRECORD_DIR", os.path.join(OUTPUT_DIR, 'tfrecords'))

# Glob patterns for the tf.records files of each split.
TFRECORDS_TRAIN = os.path.join(TFRECORD_DIR, 'train', '*.tfrecords')
TFRECORDS_VALID = os.path.join(TFRECORD_DIR, 'valid', '*.tfrecords')

# NVTabular output directories (trailing slash kept) and the glob patterns
# used to read the parquet files back for the tf.records conversion.
output_train_dir = os.path.join(OUTPUT_DATA_DIR, 'train/')
output_valid_dir = os.path.join(OUTPUT_DATA_DIR, 'valid/')
tf_input_train_dir = os.path.join(output_train_dir, '*.parquet')
tf_input_valid_dir = os.path.join(output_valid_dir, '*.parquet')

2288

We need to define the data schema: which columns are continuous, which are categorical and which are labels.

In [None]:
# Dataset schema: 13 continuous columns (I1..I13), 26 categorical columns
# (C1..C26) and a single label column.
CONTINUOUS_COLUMNS = ['I{}'.format(idx) for idx in range(1, 14)]
CATEGORICAL_COLUMNS = ['C{}'.format(idx) for idx in range(1, 27)]
LABEL_COLUMNS = ['label']
COLUMNS = [*CONTINUOUS_COLUMNS, *CATEGORICAL_COLUMNS, *LABEL_COLUMNS]

For tf.records, we need to define some hyperparameters in advance: how many examples are stored per tf.record file and which batch size we use for training. Unfortunately, these cannot be changed afterwards.

In [None]:
# Training batch size; can be overridden through the BATCH_SIZE env variable.
BATCH_SIZE = int(os.getenv('BATCH_SIZE', 64 * 1024))
# Number of examples stored in each tf.records file.
EXAMPLES_PER_RECORD = 20 * 1000 * 1000
# Max. number of steps per epoch (tf.records allows only full batches):
# the number of full batches that fit into 150 million examples.
STEPS = int(150 * 1000 * 1000 / BATCH_SIZE)

We execute the NVTabular workflow to preprocess the dataset.

In [6]:
%%time

# Runs the full NVTabular workflow. Note that it deletes and re-creates
# OUTPUT_DATA_DIR before writing the processed train/valid parquet files.
preproces_criteo()

CPU times: user 1min 26s, sys: 1min 1s, total: 2min 28s
Wall time: 3min 2s


In [8]:
import time

import tensorflow as tf

We convert the parquet files to tf.records.

In [None]:
# Convert each split: read the processed parquet files matching the first
# pattern and write tf.records files into the directory of the second pattern.
transform_tfrecords(tf_input_train_dir, TFRECORDS_TRAIN)
transform_tfrecords(tf_input_valid_dir, TFRECORDS_VALID)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=149946368.0), HTML(value='')))