In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======

# ETL with NVTabular

In this notebook we are going to generate synthetic data and then create sequential features with [NVTabular](https://github.com/NVIDIA/NVTabular). Such data will be used in the next notebook to train a session-based recommendation model.

NVTabular is a feature engineering and preprocessing library for tabular data designed to quickly and easily manipulate terabyte scale datasets used to train deep learning based recommender systems. It provides a high level abstraction to simplify code and accelerates computation on the GPU using the RAPIDS cuDF library.

### Import required libraries

In [2]:
import os
import glob

import torch 
import numpy as np
import pandas as pd

import cudf
import cupy as cp
import nvtabular as nvt

### Define Input/Output Path

In [3]:
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/workspace/data/")

## Create a Synthetic Input Data

In [4]:
NUM_ROWS = 100000
long_tailed_item_distribution = np.clip(np.random.lognormal(3., 1., NUM_ROWS).astype(np.int32), 1, 50000)

# generate random item interaction features 
df = pd.DataFrame(np.random.randint(70000, 80000, NUM_ROWS), columns=['session_id'])
df['item_id'] = long_tailed_item_distribution

# generate category mapping for each item-id
df['category'] = pd.cut(df['item_id'], bins=334, labels=np.arange(1, 335)).astype(np.int32)
df['timestamp/age_days'] = np.random.uniform(0, 1, NUM_ROWS)
df['timestamp/weekday/sin']= np.random.uniform(0, 1, NUM_ROWS)

# generate day mapping for each session 
map_day = dict(zip(df.session_id.unique(), np.random.randint(1, 10, size=(df.session_id.nunique()))))
df['day'] =  df.session_id.map(map_day)

## Feature Engineering with NVTabular

Deep Learning models require dense input features. Categorical features are sparse, and need to be represented by dense embeddings in the model. To allow for that, categorical features need first to be encoded as contiguous integers `(0, ..., |C|)`, where `|C|` is the feature cardinality (number of unique values), so that their embeddings can be efficiently stored in embedding layers.  
We will use NVTabular to preprocess the categorical features, so that all categorical columns are encoded as contiguous integers. 

Here our goal is to create sequential features.  In this cell, we are creating temporal features and grouping them together at the session level, sorting the interactions by time. Note that we also trim each feature sequence in a  session to a certain length. Here, we use the NVTabular library so that we can easily preprocess and create features on GPU with a few lines.

In [5]:
# Categorify categorical features
categ_feats = ['session_id', 'item_id', 'category'] >> nvt.ops.Categorify(start_index=1)

# Define Groupby Workflow
groupby_feats = categ_feats + ['day', 'timestamp/age_days', 'timestamp/weekday/sin']

# Groups interaction features by session and sorted by timestamp
groupby_features = groupby_feats >> nvt.ops.Groupby(
    groupby_cols=["session_id"], 
    aggs={
        "item_id": ["list", "count"],
        "category": ["list"],     
        "day": ["first"],
        "timestamp/age_days": ["list"],
        'timestamp/weekday/sin': ["list"],
        },
    name_sep="-")

# Select and truncate the sequential features
sequence_features_truncated = (groupby_features['category-list', 'item_id-list', 
                                          'timestamp/age_days-list', 'timestamp/weekday/sin-list']) >> \
                            nvt.ops.ListSlice(0,20) >> nvt.ops.Rename(postfix = '_trim')

# Filter out sessions with length 1 (not valid for next-item prediction training and evaluation)
MINIMUM_SESSION_LENGTH = 2
selected_features = groupby_features['item_id-count', 'day-first', 'session_id'] + sequence_features_truncated
filtered_sessions = selected_features >> nvt.ops.Filter(f=lambda df: df["item_id-count"] >= MINIMUM_SESSION_LENGTH)


workflow = nvt.Workflow(filtered_sessions)
dataset = nvt.Dataset(df, cpu=False)
# Generating statistics for the features
workflow.fit(dataset)
# Applying the preprocessing and returning an NVTabular dataset
sessions_ds = workflow.transform(dataset)
# Converting the NVTabular dataset to a Dask cuDF dataframe (`to_ddf()`) and then to cuDF dataframe (`.compute()`)
sessions_gdf = sessions_ds.to_ddf().compute()

In [6]:
sessions_gdf.head(3)

Unnamed: 0,item_id-count,day-first,session_id,category-list_trim,item_id-list_trim,timestamp/age_days-list_trim,timestamp/weekday/sin-list_trim
0,25,9,2,"[2, 2, 5, 3, 6, 4, 10, 4, 2, 2, 66, 5, 7, 4, 3...","[3, 3, 25, 22, 28, 12, 52, 15, 10, 11, 413, 20...","[0.7136986303272446, 0.18120407184950815, 0.46...","[0.46347800502005154, 0.3510890235812454, 0.34..."
1,24,5,3,"[9, 4, 2, 4, 2, 3, 4, 6, 2, 2, 9, 4, 5, 2, 4, ...","[48, 15, 11, 12, 7, 4, 12, 28, 5, 11, 50, 15, ...","[0.8330846988133308, 0.6339836255483956, 0.818...","[0.7267929363174592, 0.6891924543129208, 0.611..."
2,23,6,4,"[3, 4, 4, 6, 5, 6, 2, 3, 12, 7, 3, 4, 4, 3, 17...","[13, 18, 16, 30, 19, 28, 8, 2, 65, 38, 6, 17, ...","[0.6549519454565732, 0.6760938177961002, 0.626...","[0.6696162303967736, 0.19386208479337608, 0.96..."


It is possible to save the preprocessing workflow. That is useful to apply the same preprocessing to other data (with the same schema) and also to deploy the session-based recommendation pipeline to Triton Inference Server.

In [7]:
workflow.save('workflow_etl')

## Export pre-processed data by day

In [8]:
OUTPUT_FOLDER = os.environ.get("OUTPUT_FOLDER",os.path.join(INPUT_DATA_DIR, "sessions_by_day"))
!mkdir -p $OUTPUT_FOLDER

In [9]:
# requires cudf + cupy + nvtabular + dask_cudf
from transformers4rec.utils.gpu_preprocessing import save_time_based_splits
save_time_based_splits(data=nvt.Dataset(sessions_gdf),
                       output_dir= OUTPUT_FOLDER,
                       partition_col='day-first',
                       timestamp_col='session_id', 
                      )

Creating time-based splits: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 10.88it/s]


## Checking the preprocessed outputs

In [10]:
TRAIN_PATHS = sorted(glob.glob(os.path.join(OUTPUT_FOLDER, "1", "train.parquet")))

In [11]:
gdf = cudf.read_parquet(TRAIN_PATHS[0])
gdf.head()

Unnamed: 0,item_id-count,session_id,category-list_trim,item_id-list_trim,timestamp/age_days-list_trim,timestamp/weekday/sin-list_trim
0,21,8,"[3, 5, 3, 4, 5, 3, 9, 10, 2, 2, 4, 5, 16, 4, 1...","[6, 21, 4, 12, 20, 13, 50, 52, 11, 5, 15, 23, ...","[0.9719553842908865, 0.8821611053413697, 0.948...","[0.06927941994677334, 0.6817721908946835, 0.47..."
1,21,10,"[12, 7, 4, 6, 2, 12, 2, 5, 12, 2, 3, 6, 17, 6,...","[65, 35, 16, 27, 5, 63, 11, 21, 63, 11, 22, 28...","[0.7991976473964882, 0.3562884637379887, 0.689...","[0.8428343992126212, 0.9319406181028398, 0.341..."
2,20,19,"[4, 17, 4, 9, 8, 4, 3, 3, 3, 3, 4, 6, 7, 18, 2...","[14, 92, 14, 46, 39, 17, 4, 2, 13, 2, 18, 27, ...","[0.9461446273589291, 0.2636703158463797, 0.189...","[0.3645245443096057, 0.06704003381301427, 0.94..."
4,20,24,"[8, 8, 3, 7, 6, 18, 6, 2, 4, 7, 23, 2, 12, 4, ...","[40, 40, 6, 34, 26, 106, 29, 11, 17, 38, 131, ...","[0.24059078789801147, 0.4823239789692144, 0.16...","[0.41404942555231794, 0.4220584940326747, 0.91..."
5,20,26,"[6, 9, 3, 6, 4, 9, 4, 4, 18, 15, 3, 7, 5, 2, 1...","[29, 49, 13, 31, 16, 45, 12, 16, 96, 85, 13, 3...","[0.05377278816585762, 0.6957004354733369, 0.10...","[0.8547741682395857, 0.3023164938900591, 0.114..."
