In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======

# ETL with NVTabular

In this notebook we are going to generate synthetic data and then create session-based features with [NVTabular](https://github.com/NVIDIA/NVTabular) to train a session-based recommendation model for the next-item prediction task.

NVTabular is a feature engineering and preprocessing library for tabular data designed to quickly and easily manipulate terabyte scale datasets used to train deep learning based recommender systems. It provides a high level abstraction to simplify code and accelerates computation on the GPU using the RAPIDS cuDF library.

- Import required libraries

In [2]:
import os
import glob

import torch 
import numpy

import cudf
import cupy as cp
import nvtabular as nvt

- Define Input/Output Path

In [3]:
# Base data directory; taken from the INPUT_DATA_DIR environment variable when it is set.
INPUT_DATA_DIR = os.getenv("INPUT_DATA_DIR", "/workspace/data/")

## Create Synthetic Input Data

In [4]:
# Number of synthetic interaction rows to generate.
NUM_ROWS = 100000
# Fixed seed so that "Restart Kernel -> Run All" regenerates the same synthetic data.
SEED = 42

# A dedicated Generator makes the data reproducible without touching NumPy's
# global random state.
rng = numpy.random.default_rng(SEED)

# `Generator.integers` excludes the upper bound, just like `numpy.random.randint`,
# so the value ranges are the same as with the legacy API.
inputs = {
    'session_id': rng.integers(70000, 80000, NUM_ROWS),
    'day': rng.integers(1, 10, NUM_ROWS),
    'item_id': rng.integers(1, 51996, NUM_ROWS),
    'category': rng.integers(0, 332, NUM_ROWS),
    'timestamp/age_days': rng.uniform(0, 1, NUM_ROWS),
    'timestamp/weekday/sin': rng.uniform(0, 1, NUM_ROWS),
}
# Move the host arrays into a GPU dataframe and preview the first rows.
df = cudf.DataFrame(inputs)
df.head()

Unnamed: 0,session_id,day,item_id,category,timestamp/age_days,timestamp/weekday/sin
0,71620,3,12267,94,0.985003,0.430432
1,74490,4,10727,55,0.488696,0.208056
2,78665,2,6525,171,0.70352,0.429791
3,78260,8,38001,234,0.503243,0.036423
4,74709,7,17191,13,0.43543,0.023882


## Feature Engineering with NVTabular

Deep learning models require the input features in a specific format. Categorical features need to be encoded as contiguous integers (0, ..., |C|) so that they can be used with an embedding layer. We will use NVTabular to preprocess the categorical features, so that all categorical columns are encoded as contiguous integers. Note that we also add `1` to the categorified columns: we want the encoded null values to start from `1` instead of `0`, because we reserve `0` for padding the sequence features.

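Here our goal is to create session-based features. In this cell, we group the interaction features, including the temporal ones, together at the session level. On real data the interactions should also be sorted by time within each session (e.g. with the `sort_cols` argument of `nvt.ops.Groupby`); the `Groupby` call below does not set it, since the synthetic rows carry no real event timestamp. Note that we also trim each feature sequence in a session to a certain length. Here, we use the NVTabular library so that we can easily preprocess and create features in a couple of lines on a GPU.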

In [5]:
# Sessions with fewer interactions than this are dropped by the final Filter op.
MINIMUM_SESSION_LENGTH = 2

# Encode the categorical columns as integer ids, then shift every id by one
categ_feats = (
    ['session_id', 'item_id', 'category']
    >> nvt.ops.Categorify()
    >> nvt.ops.LambdaOp(lambda col: col + 1)
)

# Columns that take part in the session-level aggregation
groupby_feats = categ_feats + ['day', 'timestamp/age_days', 'timestamp/weekday/sin']

# Per-column aggregations; with name_sep="-" the outputs are named "<column>-<agg>"
session_aggs = {
    "item_id": ["list", "count"],
    "category": ["list"],
    "day": ["first"],
    "timestamp/age_days": ["list"],
    'timestamp/weekday/sin': ["list"],
}
groupby_features = groupby_feats >> nvt.ops.Groupby(
    groupby_cols=["session_id"],
    aggs=session_aggs,
    name_sep="-",
)

# Aggregated columns that are not lists (everything without "-list" in its name)
groupby_features_nonlist = list(
    filter(lambda name: '-list' not in name, groupby_features.output_columns.names)
)

# Slice every list column to its first 20 entries and append "_trim" to its name
list_features = groupby_features - groupby_features_nonlist
groupby_features_trim = (
    list_features
    >> nvt.ops.ListSlice(0, 20)
    >> nvt.ops.Rename(postfix='_trim')
)

# Final column set: the non-list aggregates plus the trimmed list columns
selected_features = groupby_features[groupby_features_nonlist] + groupby_features_trim

# Keep only sessions with at least MINIMUM_SESSION_LENGTH interactions
filtered_sessions = selected_features >> nvt.ops.Filter(
    f=lambda sessions: sessions["item_id-count"] >= MINIMUM_SESSION_LENGTH
)

# Fit the workflow on the GPU dataset and materialize the transformed sessions
workflow = nvt.Workflow(filtered_sessions)
dataset = nvt.Dataset(df, cpu=False)
workflow.fit(dataset)
sessions_gdf = workflow.transform(dataset).to_ddf().compute()



In [6]:
# Preview the first three rows of the transformed, session-level dataframe
sessions_gdf.head(3)

Unnamed: 0,item_id-count,session_id,day-first,item_id-list_trim,category-list_trim,timestamp/weekday/sin-list_trim,timestamp/age_days-list_trim
0,21,2,9,"[840, 2951, 25526, 40443, 24932, 14475, 17653,...","[140, 150, 263, 295, 266, 106, 39, 322, 147, 1...","[0.028410835321902583, 0.9691826933247304, 0.6...","[0.2689724161281237, 0.16797748135413082, 0.21..."
1,21,3,8,"[26650, 6982, 41154, 6679, 30911, 3117, 4867, ...","[219, 130, 22, 308, 264, 208, 47, 103, 212, 28...","[0.9923099491428694, 0.7445460561955058, 0.130...","[0.43645313554838305, 0.6568401890098505, 0.13..."
2,21,4,3,"[26817, 11285, 12128, 37192, 38653, 23904, 110...","[120, 25, 18, 16, 129, 333, 164, 78, 4, 195, 2...","[0.990841258040863, 0.24163118628953406, 0.254...","[0.2304644324273546, 0.42607898450166937, 0.52..."


## Export pre-processed data by day

In [7]:
# Root directory of the hive-partitioned export. `os.path.join` keeps the default
# path valid even when INPUT_DATA_DIR is supplied without a trailing slash.
OUTPUT_DATA_DIR = os.environ.get("OUTPUT_DATA_DIR", os.path.join(INPUT_DATA_DIR, "output"))

# Aggregated column used as the partition key of the export.
PARTITION_COL = 'day-first'

# Run the fitted workflow on the dataset and write hive-partitioned parquet data to disk
workflow.transform(dataset).to_parquet(OUTPUT_DATA_DIR, partition_on=[PARTITION_COL])

In [8]:
# Destination of the per-day train/valid/test splits. It is controlled by its own
# OUTPUT_FOLDER environment variable: reading OUTPUT_DATA_DIR here would make the
# splits and the hive-partitioned export share one directory whenever that
# variable is overridden.
OUTPUT_FOLDER = os.environ.get("OUTPUT_FOLDER", os.path.join(INPUT_DATA_DIR, "sessions_by_day"))
# Pure-Python equivalent of `mkdir -p`; also safe for paths containing spaces.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [9]:
# Hive partition folders are named "<PARTITION_COL>=<value>"; derive the prefix from
# the constant instead of repeating the column name as a literal.
partition_prefix = PARTITION_COL + '='
days_folders = [f for f in sorted(os.listdir(OUTPUT_DATA_DIR)) if f.startswith(partition_prefix)]
for day_folder in days_folders:
    # Use a dedicated name so the raw `df` that feeds the NVTabular dataset is not
    # overwritten; re-running the workflow cell after this one then still works.
    day_sessions = cudf.read_parquet(os.path.join(OUTPUT_DATA_DIR, day_folder))
    # The output sub-folder is named after the partition value only (e.g. "1").
    out_folder = os.path.join(OUTPUT_FOLDER, day_folder.replace(partition_prefix, ''))
    os.makedirs(out_folder, exist_ok=True)
    # Re-seeding for every day makes each day's split reproducible on its own.
    cp.random.seed(1)
    random_values = cp.random.rand(len(day_sessions))
    # Extracts 80%, 10% and 10% for train, valid and test set, respectively.
    # The three masks are disjoint and together cover every row.
    train_set = day_sessions[random_values <= 0.80]
    train_set.to_parquet(os.path.join(out_folder, 'train.parquet'))

    valid_set = day_sessions[(random_values > 0.80) & (random_values < 0.90)]
    valid_set.to_parquet(os.path.join(out_folder, 'valid.parquet'))

    test_set = day_sessions[random_values >= 0.90]
    test_set.to_parquet(os.path.join(out_folder, 'test.parquet'))

## Checking the pre-processing outputs

In [10]:
# Path pattern of the training split written for day "1"; sorting keeps the order deterministic
first_day_train_pattern = os.path.join(OUTPUT_FOLDER, "1", "train.parquet")
TRAIN_PATHS = sorted(glob.glob(first_day_train_pattern))

In [12]:
# Load the first matched training file and preview its first rows as a sanity check
gdf = cudf.read_parquet(TRAIN_PATHS[0])
gdf.head()

Unnamed: 0,item_id-count,session_id,item_id-list_trim,category-list_trim,timestamp/weekday/sin-list_trim,timestamp/age_days-list_trim
0,20,9,"[2554, 15667, 96, 1133, 1429, 735, 22710, 3014...","[50, 33, 43, 244, 192, 76, 218, 7, 167, 280, 3...","[0.3991574138561549, 0.2659848395499488, 0.289...","[0.9929986679944129, 0.4762350664816781, 0.491..."
1,19,31,"[18616, 6953, 13177, 38148, 28445, 14767, 2285...","[100, 63, 185, 253, 73, 233, 110, 34, 76, 189,...","[0.6111753639455862, 0.37635135545573795, 0.24...","[0.04225290336699772, 0.5073988359077798, 0.99..."
2,19,43,"[29532, 41644, 43485, 15190, 1155, 15879, 3009...","[182, 163, 190, 274, 160, 217, 145, 174, 124, ...","[0.3086618798967069, 0.6947151316284855, 0.918...","[0.8120340338721659, 0.6867329960142494, 0.039..."
4,18,79,"[13583, 3940, 10587, 11814, 44121, 10095, 2397...","[90, 92, 194, 38, 174, 148, 97, 96, 140, 204, ...","[0.868993975355618, 0.9613542980653605, 0.3258...","[0.029050675830974204, 0.29788855062538133, 0...."
5,18,83,"[6660, 42762, 17279, 17557, 38180, 39654, 2729...","[91, 16, 76, 68, 121, 70, 70, 33, 2, 170, 25, ...","[0.11905301048777017, 0.2247034473227557, 0.65...","[0.8323905850084493, 0.011296206141988008, 0.3..."
