In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======

# ETL with NVTabular

In this notebook we are going to generate synthetic data and then create session-based features with [NVTabular](https://github.com/NVIDIA/NVTabular) in order to train a session-based recommendation model for the next-item prediction task.

NVTabular is a feature engineering and preprocessing library for tabular data, designed to quickly and easily manipulate terabyte-scale datasets used to train deep-learning-based recommender systems. It provides a high-level abstraction to simplify code and accelerates computation on the GPU using the RAPIDS cuDF library.

- Import required libraries

In [2]:
import os
import glob

import torch 
import numpy as np
import pandas as pd

import cudf
import nvtabular as nvt

- Define Input/Output Path

In [3]:
# Root directory for the notebook's data; override it through the
# INPUT_DATA_DIR environment variable, otherwise the default below is used.
INPUT_DATA_DIR = os.getenv("INPUT_DATA_DIR", "/workspace/data/")

## Create Synthetic Input Data

In [4]:
NUM_ROWS = 100000
NUM_CATEGORIES = 334
SEED = 42

# Seed NumPy's global generator so that the synthetic data set (and therefore
# every output further down the notebook) is identical on each
# Restart & Run All.
np.random.seed(SEED)

# generate random item interaction features:
#   - session_id: uniform random integers in [70000, 80000)
#   - item_id: log-normally distributed integers, clipped to [1, 50000]
df = pd.DataFrame(np.random.randint(70000, 80000, NUM_ROWS), columns=['session_id'])
df['item_id'] = np.clip(np.random.lognormal(3., 1., NUM_ROWS).astype(np.int32), 1, 50000)

# generate category mapping for each item-id: the item-id range is cut into
# NUM_CATEGORIES equal-width bins labelled 1..NUM_CATEGORIES
df['category'] = pd.cut(
    df['item_id'],
    bins=NUM_CATEGORIES,
    labels=np.arange(1, NUM_CATEGORIES + 1),
).astype(np.int32)

# synthetic continuous features, drawn uniformly from [0, 1)
df['timestamp/age_days'] = np.random.uniform(0, 1, NUM_ROWS)
df['timestamp/weekday/sin'] = np.random.uniform(0, 1, NUM_ROWS)

# generate day mapping for each session: every session id is assigned one
# random day in 1..9, so all rows of a session share the same day
session_ids = df.session_id.unique()
map_day = dict(zip(session_ids, np.random.randint(1, 10, size=len(session_ids))))
df['day'] = df.session_id.map(map_day)

In [5]:
# convert to cudf
# `df` is rebound from a pandas to a cuDF DataFrame here, so guard the
# conversion: re-running this cell must not hand an already-converted frame
# back to `cudf.from_pandas`.
if isinstance(df, pd.DataFrame):
    df = cudf.from_pandas(df)
df.head()

Unnamed: 0,session_id,item_id,category,timestamp/age_days,timestamp/weekday/sin,day
0,77390,71,16,0.701629,0.206774,4
1,79251,49,11,0.502681,0.699902,6
2,76285,105,24,0.632214,0.468025,4
3,73328,139,31,0.872418,0.119704,6
4,70384,6,2,0.415906,0.554932,3


## Feature Engineering with NVTabular

Deep Learning models require the input features in a specific format. Categorical features need to be contiguous integers (0, ..., |C|) to be used with an embedding layer. We will use NVTabular to preprocess the categorical features, so that all categorical columns are encoded as contiguous integers. Note that we also add `1` after we categorify the categorical columns: we want the encoded null values to start from `1` instead of `0`, because we reserve `0` for padding the sequence features.

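Here our goal is to create session-based features. In this cell, we take the temporal features and group them together at the session level, collecting each feature into a per-session list. With real interaction logs you would also sort each session's interactions by time (the `Groupby` op accepts a `sort_cols` argument for this); the synthetic data has no event timestamp, so no sorting is applied below. Note that we also trim each feature sequence in a session to a certain length. Here, we use the NVTabular library so that we can easily preprocess and create features in a couple of lines on a GPU.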

In [6]:
# Categorify the categorical features, then shift every encoded id up by one
# so that 0 stays free for padding the sequence features.
cat_columns = ['session_id', 'item_id', 'category']
shift_ids_by_one = nvt.ops.LambdaOp(lambda col: col + 1)
categ_feats = cat_columns >> nvt.ops.Categorify() >> shift_ids_by_one

# Columns fed into the session-level aggregation: the encoded categoricals
# plus the remaining raw columns.
passthrough_columns = ['day', 'timestamp/age_days', 'timestamp/weekday/sin']
groupby_feats = categ_feats + passthrough_columns

# One output row per session; output columns are named "<column>-<aggregation>".
session_aggregations = {
    "item_id": ["list", "count"],
    "category": ["list"],
    "day": ["first"],
    "timestamp/age_days": ["list"],
    'timestamp/weekday/sin': ["list"],
}
groupby_features = groupby_feats >> nvt.ops.Groupby(
    groupby_cols=["session_id"],
    aggs=session_aggregations,
    name_sep="-",
)

# Trim every list feature with ListSlice(0, 20) and tag the result with a
# "_trim" suffix.
list_features = groupby_features[
    'category-list',
    'item_id-list',
    'timestamp/age_days-list',
    'timestamp/weekday/sin-list',
]
trim_lists = nvt.ops.ListSlice(0, 20)
add_trim_suffix = nvt.ops.Rename(postfix='_trim')
groupby_features_trim = list_features >> trim_lists >> add_trim_suffix

# Keep the per-session scalar columns next to the trimmed list columns.
MINIMUM_SESSION_LENGTH = 2
session_scalars = groupby_features['item_id-count', 'day-first', 'session_id']
selected_features = session_scalars + groupby_features_trim

# Drop sessions with fewer than MINIMUM_SESSION_LENGTH interactions.
keep_long_sessions = nvt.ops.Filter(
    f=lambda sessions: sessions["item_id-count"] >= MINIMUM_SESSION_LENGTH
)
filtered_sessions = selected_features >> keep_long_sessions

# Fit the workflow on the GPU dataset, then compute the transformed sessions
# into a single dataframe.
workflow = nvt.Workflow(filtered_sessions)
dataset = nvt.Dataset(df, cpu=False)
workflow.fit(dataset)
transformed_sessions = workflow.transform(dataset)
sessions_gdf = transformed_sessions.to_ddf().compute()

In [7]:
# Pull a single row of the input dataset to inspect the dtype of each column.
first_row = dataset.head(1)
first_row.dtypes

session_id                 int64
item_id                    int32
category                   int32
timestamp/age_days       float64
timestamp/weekday/sin    float64
day                        int64
dtype: object

In [8]:
# Preview the first three session-level rows produced by the workflow.
sessions_gdf.head(n=3)

Unnamed: 0,item_id-count,day-first,session_id,category-list_trim,item_id-list_trim,timestamp/age_days-list_trim,timestamp/weekday/sin-list_trim
0,25,1,2,"[3, 3, 16, 4, 17, 10, 5, 2, 10, 7, 10, 5, 21, ...","[6, 6, 62, 14, 78, 36, 13, 7, 37, 31, 37, 13, ...","[0.9100894916250084, 0.756645604699169, 0.2188...","[0.7847419421279545, 0.5945018653647263, 0.347..."
1,24,4,3,"[4, 2, 11, 5, 12, 3, 3, 2, 4, 9, 5, 3, 18, 6, ...","[4, 10, 53, 16, 48, 3, 2, 9, 33, 39, 13, 2, 71...","[0.4661804466282633, 0.7722383727614037, 0.620...","[0.4865002133887476, 0.645986420766564, 0.9069..."
2,24,5,4,"[4, 5, 6, 6, 20, 5, 2, 8, 5, 5, 11, 7, 2, 19, ...","[33, 15, 21, 21, 82, 17, 11, 24, 17, 15, 49, 3...","[0.7705694252382204, 0.13105883648010097, 0.04...","[0.5391275527607033, 0.5765740803225676, 0.865..."


- We can save our fitted workflow to disk

In [9]:
# Persist the fitted workflow under a path relative to the current working
# directory.
workflow_dir = 'workflow_etl'
workflow.save(workflow_dir)

## Export pre-processed data by day

In [10]:
# Destination for the per-day exports; override it with the OUTPUT_FOLDER
# environment variable. The default is built with os.path.join so it is
# correct whether or not INPUT_DATA_DIR ends with a path separator (plain
# string concatenation would yield e.g. "/datasessions_by_day").
OUTPUT_FOLDER = os.environ.get(
    "OUTPUT_FOLDER", os.path.join(INPUT_DATA_DIR, "sessions_by_day")
)

# Create the folder (and any missing parents) from Python rather than via an
# unquoted shell `mkdir -p`, so paths containing spaces work and the cell does
# not depend on a POSIX shell. An already existing folder is not an error.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [11]:
from transformers4rec.utils.gpu_preprocessing import save_time_based_splits

# Wrap the session dataframe in an NVTabular dataset and export it to
# OUTPUT_FOLDER, partitioned on the session's day ('day-first').
# NOTE(review): 'session_id' is passed as the timestamp column, presumably
# because the synthetic data has no real event timestamp -- confirm against
# the save_time_based_splits documentation.
sessions_dataset = nvt.Dataset(sessions_gdf)
save_time_based_splits(
    data=sessions_dataset,
    output_dir=OUTPUT_FOLDER,
    partition_col='day-first',
    timestamp_col='session_id',
)

Creating time-based splits: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 10.86it/s]


## Checking the pre-processing outputs

In [12]:
# Collect the training parquet file(s) exported under partition "1", in
# sorted order.
day_one_pattern = os.path.join(OUTPUT_FOLDER, "1", "train.parquet")
TRAIN_PATHS = sorted(glob.glob(day_one_pattern))

In [13]:
# Fail with an actionable message when the export step produced no training
# file, instead of a bare "IndexError: list index out of range".
if not TRAIN_PATHS:
    raise FileNotFoundError(
        "No train.parquet file was found for the first day; "
        "run the export cells above before inspecting the outputs."
    )

# Load the first exported training file onto the GPU and preview it.
gdf = cudf.read_parquet(TRAIN_PATHS[0])
gdf.head()

Unnamed: 0,item_id-count,session_id,category-list_trim,item_id-list_trim,timestamp/age_days-list_trim,timestamp/weekday/sin-list_trim
0,25,2,"[3, 3, 16, 4, 17, 10, 5, 2, 10, 7, 10, 5, 21, ...","[6, 6, 62, 14, 78, 36, 13, 7, 37, 31, 37, 13, ...","[0.9100894916250084, 0.756645604699169, 0.2188...","[0.7847419421279545, 0.5945018653647263, 0.347..."
1,22,8,"[2, 2, 14, 9, 8, 9, 8, 3, 8, 13, 27, 10, 2, 3,...","[11, 10, 51, 40, 25, 38, 26, 6, 24, 60, 117, 3...","[0.9145827797627815, 0.4003500935746652, 0.167...","[0.2016445226211706, 0.7262655850656101, 0.561..."
2,22,11,"[2, 6, 2, 2, 6, 2, 6, 5, 13, 2, 10, 5, 59, 2, ...","[12, 22, 9, 9, 22, 11, 22, 16, 57, 7, 36, 16, ...","[0.07175881656991034, 0.3889615998114859, 0.64...","[0.37966564527196056, 0.1359808551571674, 0.13..."
4,21,15,"[7, 4, 6, 8, 8, 35, 12, 8, 28, 39, 6, 6, 2, 10...","[30, 33, 18, 26, 26, 138, 48, 25, 120, 165, 21...","[0.10024194264474551, 0.7061059702958917, 0.12...","[0.5244424597811311, 0.04773987690523407, 0.67..."
5,20,36,"[8, 5, 11, 2, 21, 2, 4, 3, 15, 8, 25, 3, 4, 3,...","[25, 17, 53, 7, 96, 12, 14, 2, 67, 25, 113, 2,...","[0.6060663634476966, 0.8895306586594811, 0.337...","[0.6516966658517545, 0.3694457918420432, 0.373..."
