In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">

# TensorFlow: Convert TFRecords to Parquet files

## TFRecords

[TFRecords](https://www.tensorflow.org/tutorials/load_data/tfrecord) are a popular file format to store data for deep learning training with TensorFlow. It is a "simple format for storing a sequence of binary records". In many cases the dataset is too large for the host memory and the dataset is converted into (multiple) tfrecords file to disk. TensorFlow's ecosystem enables to stream the tfrecords from disk to train the model without requiring to load the full dataset.<br><br>
That sounds great, but there are some disadvantages when working with tabular dataset. TFRecords stores the dataset as key, values. In other domains, such as computer vision, this representation is efficient as the key is `image` and the values are a the pixels. For an RGB image with 200x200 resoultion, there are 120000 (200x200x3) values. In a tabular dataset, a feature is often a single number and therefore, there is a significant overhead for using a key in each example. **In some of our experiments, we experienced that tfrecords can be ~4-5x larger than `parquet` files for the same dataset.**
<br><br>
[Parquet](https://en.wikipedia.org/wiki/Apache_Parquet) is another file format to store data. It is a free and open-source data storage format in the Hadoop ecosystem. Many popular systems, such as Spark or Pandas, support to read and write parquet files. 
<br><br>
We developed [NVTabular Data Loaders](https://nvidia.github.io/NVTabular/main/training/index.html) as a customized data loader, fully operating on the GPU. It reads the data from disk into the GPU memory and prepares the next batch on the GPU. Therefore, we do not have any CPU-GPU communication. Our data loader leverages parquet files to reduce the disk pressure. **In our experiments, we experienced that the native data loader is the bottleneck in training tabular deep learning models and by changing the native data loader to NVTabular Data Loader, we saw a 8-9x speed-up.**

### Convert TFRecords to Parquet files
That is a lot of background information. In many cases, we saw that users have their dataset stored as tfrecords files. In this notebook, we provide a tfrecords to parquet examples. Users can transform their dataset to parquet and be able to experiment with NVTabular data loader.

We leverage the library pandas-tfrecords.

In [2]:
!pip install pandas-tfrecords==0.1.5

distutils: /usr/local/lib/python3.8/dist-packages
sysconfig: /usr/lib/python3.8/site-packages[0m
distutils: /usr/local/lib/python3.8/dist-packages
sysconfig: /usr/lib/python3.8/site-packages[0m
distutils: /usr/local/include/python3.8/UNKNOWN
sysconfig: /usr/include/python3.8/UNKNOWN[0m
distutils: /usr/local/bin
sysconfig: /usr/bin[0m
distutils: /usr/local
sysconfig: /usr[0m
user = False
home = None
root = None
prefix = None[0m
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


## Create a Synthetic Dataset

First, we will create a synthetic dataset. Afterwards, we will convert the synthetic data to a tfrecord file. The synthetic dataset contains `continuous features`, `categorical features`, `continuous features in a list with variable length`, `categorical features in a list with variable length` and the `label`.<br><br>
The features of a list have variable length, which are often used in session-based recommender systems. For example, the last page views in a session and sessions have different lengths.

In [3]:
import numpy as np
import pandas as pd

In [4]:
def create_synthetic_df(
    N_CONT_FEATURES, N_CAT_FEATURES, N_CONT_LIST_FEATURES, N_CAT_LIST_FEATURES, N_ROWS
):
    dict_features = {}
    for icont in range(N_CONT_FEATURES):
        dict_features["cont" + str(icont)] = np.random.uniform(-1, 1, size=N_ROWS)
    for icat in range(N_CAT_FEATURES):
        dict_features["cat" + str(icat)] = np.random.choice(list(range(10)), size=N_ROWS)
    for icontlist in range(N_CONT_LIST_FEATURES):
        feature_list = []
        for irow in range(N_ROWS):
            n_elements = np.random.choice(list(range(20)))
            feature_list.append(np.random.uniform(-1, 1, size=n_elements).tolist())
        dict_features["cont_list" + str(icontlist)] = feature_list
    for icatlist in range(N_CAT_LIST_FEATURES):
        feature_list = []
        for irow in range(N_ROWS):
            n_elements = np.random.choice(list(range(20)))
            feature_list.append(np.random.choice(list(range(10)), size=n_elements).tolist())
        dict_features["cat_list" + str(icatlist)] = feature_list
    dict_features["label"] = np.random.choice(list(range(2)), size=N_ROWS)
    df = pd.DataFrame(dict_features)
    return df

We can configure the size of the dataset and numbers of features of the different type. As this is just a example, we use only 20,000 rows.

In [5]:
N_ROWS = 20000
N_CONT_FEATURES = 5
N_CAT_FEATURES = 7
N_CONT_LIST_FEATURES = 2
N_CAT_LIST_FEATURES = 3

In [6]:
df = create_synthetic_df(
    N_CONT_FEATURES, N_CAT_FEATURES, N_CONT_LIST_FEATURES, N_CAT_LIST_FEATURES, N_ROWS
)

We can take a look on the dataset.

In [7]:
df.head()

Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cont_list0,cont_list1,cat_list0,cat_list1,cat_list2,label
0,0.495186,-0.156929,0.924507,0.831834,-0.562554,9,0,5,8,9,5,3,"[-0.08782642389819229, 0.8463009855676584, -0....","[0.7467091963784316, 0.6435026054221511, -0.14...","[1, 6, 3]","[4, 7, 9, 6, 4, 7, 6, 6, 1, 0, 2, 7, 6, 9]","[8, 6, 7, 4, 1, 2, 2]",0
1,-0.868427,-0.867583,0.225373,0.649436,0.544803,4,5,3,1,9,4,6,"[0.038860353982820284, 0.19487974734607683, -0...","[0.23944333485883984, -0.9628970811058808, -0....","[4, 1, 2]","[7, 5, 5, 8, 9, 0, 4, 2, 9, 0, 5]","[5, 5, 7]",1
2,0.972268,-0.650685,-0.689674,-0.780836,0.677578,1,4,0,3,0,3,4,"[0.48108993016978885, -0.49063530543434286, 0....",[0.9539496388151523],[],"[6, 5, 6, 2, 1, 2, 0, 1, 1, 7, 7, 8, 4, 1]","[3, 1, 4, 0, 2, 8, 5, 3, 8, 4, 7, 7, 7, 5, 7]",1
3,0.982108,0.806546,0.1501,0.488589,0.353447,3,8,6,1,4,1,3,"[-0.3854225587686282, 0.5811189366242433, 0.08...","[0.8969990392704366, -0.958170926962973, 0.622...","[6, 2, 5, 5, 9, 9, 8, 2, 6, 4, 5, 4]",[8],"[6, 6, 8, 4, 0, 8]",1
4,-0.0449,0.653093,0.775169,0.230217,-0.280215,6,9,3,7,9,5,9,[],"[0.9450709782415434, -0.07168300021759921]",[2],"[7, 2, 3, 0, 6, 8, 4, 8, 5, 1, 9, 5, 8, 6, 6, 9]","[3, 2, 8, 0, 6, 2, 3, 2, 0, 9, 1, 7]",0


In [8]:
CONTINUOUS_COLUMNS = ["cont" + str(i) for i in range(N_CONT_FEATURES)]
CATEGORICAL_COLUMNS = ["cat" + str(i) for i in range(N_CAT_FEATURES)]
CONTINUOUS_LIST_COLUMNS = ["cont_list" + str(i) for i in range(N_CONT_LIST_FEATURES)]
CATEGORICAL_LIST_COLUMNS = ["cat_list" + str(i) for i in range(N_CAT_LIST_FEATURES)]
LABEL_COLUMNS = ["label"]

## Convert the Synthetic Dataset into TFRecords

After we created the synthetic dataset, we store it to tfrecords.

In [9]:
import tensorflow as tf

2021-09-10 18:30:07.714270: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [10]:
import os
import multiprocessing as mp
from itertools import repeat


def transform_tfrecords(
    df,
    PATH,
    CONTINUOUS_COLUMNS,
    CATEGORICAL_COLUMNS,
    CONTINUOUS_LIST_COLUMNS,
    CATEGORICAL_LIST_COLUMNS,
    LABEL_COLUMNS,
):
    write_dir = os.path.dirname(PATH)
    if not os.path.exists(write_dir):
        os.makedirs(write_dir)
    file_idx, example_idx = 0, 0
    writer = get_writer(write_dir, file_idx)
    column_names = [
        CONTINUOUS_COLUMNS,
        CATEGORICAL_COLUMNS + LABEL_COLUMNS,
        CONTINUOUS_LIST_COLUMNS,
        CATEGORICAL_LIST_COLUMNS,
    ]
    with mp.Pool(8, pool_initializer, column_names) as pool:
        data = []
        for col_names in column_names:
            if len(col_names) == 0:
                data.append(repeat(None))
            else:
                data.append(df[col_names].values)
        data = zip(*data)
        record_map = pool.imap(build_and_serialize_example, data, chunksize=200)
        for record in record_map:
            writer.write(record)
            example_idx += 1
    writer.close()


def pool_initializer(num_cols, cat_cols, num_list_cols, cat_list_cols):
    global numeric_columns
    global categorical_columns
    global numeric_list_columns
    global categorical_list_columns
    numeric_columns = num_cols
    categorical_columns = cat_cols
    numeric_list_columns = num_list_cols
    categorical_list_columns = cat_list_cols


def build_and_serialize_example(data):
    numeric_values, categorical_values, numeric_list_values, categorical_list_values = data
    feature = {}
    if numeric_values is not None:
        feature.update(
            {
                col: tf.train.Feature(float_list=tf.train.FloatList(value=[val]))
                for col, val in zip(numeric_columns, numeric_values)
            }
        )
    if categorical_values is not None:
        feature.update(
            {
                col: tf.train.Feature(int64_list=tf.train.Int64List(value=[val]))
                for col, val in zip(categorical_columns, categorical_values)
            }
        )
    if numeric_list_values is not None:
        feature.update(
            {
                col: tf.train.Feature(float_list=tf.train.FloatList(value=val))
                for col, val in zip(numeric_list_columns, numeric_list_values)
            }
        )
    if categorical_list_values is not None:
        feature.update(
            {
                col: tf.train.Feature(int64_list=tf.train.Int64List(value=val))
                for col, val in zip(categorical_list_columns, categorical_list_values)
            }
        )
    return tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()


def get_writer(write_dir, file_idx):
    filename = str(file_idx).zfill(5) + ".tfrecords"
    return tf.io.TFRecordWriter(os.path.join(write_dir, filename))

We define the output path.

In [11]:
PATH = "/raid/tfrecord-test/"

In [12]:
!rm -rf $PATH
!mkdir $PATH

In [13]:
transform_tfrecords(
    df,
    PATH,
    CONTINUOUS_COLUMNS,
    CATEGORICAL_COLUMNS,
    CONTINUOUS_LIST_COLUMNS,
    CATEGORICAL_LIST_COLUMNS,
    LABEL_COLUMNS,
)

We can check the file.

In [14]:
!ls $PATH

00000.tfrecords


## Convert TFRecords to parquet files

Now, we have a dataset in the tfrecords format. Let's use the `convert_tfrecords_to_parquet` function to convert a tfrecord file into parquet.

In [15]:
import glob

from nvtabular.framework_utils.tensorflow.tfrecords_to_parquet import convert_tfrecords_to_parquet

Let's select all TFRecords in the folder.

In [16]:
filenames = glob.glob(PATH + "/*.tfrecords")

Let's call the `convert_tfrecords_to_parquet`.<br><br>
Some details about the parameters:
* `compression_type` is the compression type of the tfrecords. Options: `""` (no compression), `"ZLIB"`, or `"GZIP"`
* `chunks` defines how many data points per `parquet` file should be saved. It splits a tfrecords into multiple parquet files.
* `convert_lists` defines, if feature lists should be converted into muliple feature columns. Even single dataframe series are 1 dimensional arrays when converted back from tfrecords to parquet.   

In [17]:
filenames

['/raid/tfrecord-test/00000.tfrecords']

In [18]:
!rm -r /raid/tfrecord-test/00000.parquet

rm: cannot remove '/raid/tfrecord-test/00000.parquet': No such file or directory


In [19]:
convert_tfrecords_to_parquet(
    filenames=filenames, output_dir=PATH, compression_type="", chunks=1000, convert_lists=True
)

2021-09-10 18:30:22.756211: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-09-10 18:30:22.758160: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:0b:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2021-09-10 18:30:22.758196: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-09-10 18:30:22.758265: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-09-10 18:30:22.758308: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2021-09-10 18:30:22.758350: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.

## Let's take a look

We can see that `convert_tfrecords_to_parquet` created multiple files per `tfrecord` depending on the chunk size.

In [20]:
filenames = glob.glob(PATH + "/*.parquet")
filenames

['/raid/tfrecord-test/00000.parquet']

If we load the first file, we cann see, that it has the same structure as our original synthetic dataset.

In [22]:
import pandas as pd

df = pd.read_parquet(filenames[0])
df.head()

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat_list0,cat_list1,cat_list2,cont0,cont1,cont2,cont3,cont4,cont_list0,cont_list1,label
0,9,0,5,8,9,5,3,"[1, 6, 3]","[4, 7, 9, 6, 4, 7, 6, 6, 1, 0, 2, 7, 6, 9]","[8, 6, 7, 4, 1, 2, 2]",0.495186,-0.156929,0.924507,0.831834,-0.562554,"[-0.08782642, 0.84630096, -0.34404242, -0.7675...","[0.74670917, 0.6435026, -0.14159574, -0.590101...",0
1,4,5,3,1,9,4,6,"[4, 1, 2]","[7, 5, 5, 8, 9, 0, 4, 2, 9, 0, 5]","[5, 5, 7]",-0.868427,-0.867583,0.225373,0.649436,0.544803,"[0.038860355, 0.19487974, -0.63031155, 0.36691...","[0.23944333, -0.96289706, -0.7723948, 0.347194...",1
2,1,4,0,3,0,3,4,[],"[6, 5, 6, 2, 1, 2, 0, 1, 1, 7, 7, 8, 4, 1]","[3, 1, 4, 0, 2, 8, 5, 3, 8, 4, 7, 7, 7, 5, 7]",0.972268,-0.650685,-0.689674,-0.780836,0.677578,"[0.48108992, -0.4906353, 0.5207957, -0.5258586...",[0.95394963],1
3,3,8,6,1,4,1,3,"[6, 2, 5, 5, 9, 9, 8, 2, 6, 4, 5, 4]",[8],"[6, 6, 8, 4, 0, 8]",0.982108,0.806546,0.1501,0.488589,0.353447,"[-0.38542256, 0.58111894, 0.08629591, -0.63986...","[0.89699906, -0.95817095, 0.62256795, 0.141688...",1
4,6,9,3,7,9,5,9,[2],"[7, 2, 3, 0, 6, 8, 4, 8, 5, 1, 9, 5, 8, 6, 6, 9]","[3, 2, 8, 0, 6, 2, 3, 2, 0, 9, 1, 7]",-0.0449,0.653093,0.775169,0.230217,-0.280215,[],"[0.945071, -0.071683]",0


In [23]:
df.shape

(20000, 18)