# Read, Parse, Process E-Commerce data with NVTabular
eCommerce dataset: https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store

## Data Download from Kaggle

In [None]:
# !pip install kaggle --upgrade

In [None]:
# # NOTE: first to get kaggle api tiken from account page in Kaggle. Place it at ~/.kaggle/kaggle.json
# !mkdir -p ~/.kaggle/ && cp /mount/workspace/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
# !mkdir -p ~/data
# !cd ~/data && kaggle datasets download mkechinov/ecommerce-behavior-data-from-multi-category-store
# !cd ~/data && unzip ecommerce-behavior-data-from-multi-category-store.zip

### Downloading additional months from Google Drive

In [None]:
# !pip install gdown
# !cd ~/data 

In [None]:
# !gdown https://drive.google.com/uc?id=1qZIwMbMgMmgDC5EoMdJ8aI9lQPsWA3-P -O .
# !echo "Unziping" && gunzip 2019-Dec.csv.gz

In [None]:
# !cd ~/data && gdown https://drive.google.com/uc?id=1x5ohrrZNhWQN4Q-zww0RmXOwctKHH9PT
# !cd ~/data && echo "Unziping" && gunzip 2020-Jan.csv.gz

In [None]:
# !cd ~/data && gdown https://drive.google.com/uc?id=1-Rov9fFtGJqb7_ePc6qH-Rhzxn0cIcKB
# !cd ~/data && echo "Unziping" && gunzip 2020-Feb.csv.gz

In [None]:
# !cd ~/data && gdown https://drive.google.com/uc?id=1zr_RXpGvOWN2PrWI6itWL8HnRsCpyqz8
# !cd ~/data && echo "Unziping" && gunzip 2020-Mar.csv.gz

In [None]:
# !cd ~/data && gdown https://drive.google.com/uc?id=1g5WoIgLe05UMdREbxAjh0bEFgVCjA1UL
# !cd ~/data && echo "Unziping" && gunzip 2020-Apr.csv.gz

## Configurations

In [9]:
import os

In [10]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from dask_cuda import LocalCUDACluster
from dask.distributed import Client

import rmm

import glob

import cudf, dask_cudf
import cupy
import nvtabular as nvt
from nvtabular import ColumnGroup

import pandas as pd
import numpy as np
import shutil

## Set up Dask Cuda Cluster

In [11]:
# define some information about where to get our data
BASE_DIR = os.environ.get("BASE_DIR", "/workspace/")
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", BASE_DIR + "ecommerce/")
dask_workdir = os.path.join(BASE_DIR, "test_dask/workdir")
OUTPUT_DATA_DIR = os.environ.get("OUTPUT_DATA_DIR", BASE_DIR + "/ecommerce/output")
stats_path = os.path.join(BASE_DIR, "test_dask/stats")

# Make sure we have a clean worker space for Dask
if os.path.isdir(dask_workdir):
    shutil.rmtree(dask_workdir)
os.makedirs(dask_workdir)

# Make sure we have a clean stats space for Dask
if os.path.isdir(stats_path):
    shutil.rmtree(stats_path)
os.mkdir(stats_path)

# Make sure we have a clean output path
if os.path.isdir(OUTPUT_DATA_DIR):
    shutil.rmtree(OUTPUT_DATA_DIR)
os.mkdir(OUTPUT_DATA_DIR)

In [12]:
# Dask dashboard
from nvtabular.utils import _pynvml_mem_size, device_mem_size
dashboard_port = "8787"

# Deploy a Single-Machine Multi-GPU Cluster
protocol = "tcp"  # "tcp" or "ucx"
NUM_GPUS = [0, 1] # list here the number of gpus available on your system.
visible_devices = ",".join([str(n) for n in NUM_GPUS])  # Delect devices to place workers
device_limit_frac = 0.7  # Spill GPU-Worker memory to host at this limit.
device_pool_frac = 0.8
part_mem_frac = 0.15

# Use total device size to calculate args.device_limit_frac
device_size = device_mem_size(kind="total")
device_limit = int(device_limit_frac * device_size)
device_pool_size = int(device_pool_frac * device_size)
part_size = int(part_mem_frac * device_size)

# Check if any device memory is already occupied
for dev in visible_devices.split(","):
    fmem = _pynvml_mem_size(kind="free", index=int(dev))
    used = (device_size - fmem) / 1e9
    if used > 1.0:
        warnings.warn(f"BEWARE - {used} GB is already occupied on device {int(dev)}!")

cluster = None  # (Optional) Specify existing scheduler port
if cluster is None:
    cluster = LocalCUDACluster(
        protocol=protocol,
        n_workers=len(visible_devices.split(",")),
        CUDA_VISIBLE_DEVICES=visible_devices,
        device_memory_limit=device_limit,
        local_directory=dask_workdir,
        dashboard_address=":" + dashboard_port,
    )

# Create the distributed client
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:36817  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 100.00 GiB


In [13]:
NUM_MONTHS_TO_PREPROCESS = 1 #For the eCommerce dataset there are up to 7 months (2019-Oct to 2020-Apr)
KEEP_REPEATED_USER_INTERACTIONS = False

In [14]:
MONTHS_FILES = ["2019-Oct.csv", "2019-Nov.csv", "2019-Dec.csv", "2020-Jan.csv", "2020-Feb.csv", "2020-Mar.csv", "2020-Apr.csv"]

In [15]:
selected_months = MONTHS_FILES[:NUM_MONTHS_TO_PREPROCESS]
selected_months

['2019-Oct.csv']

In [16]:
files_paths = [os.path.join(INPUT_DATA_DIR, file) for file in selected_months]
files_paths

['/workspace/ecommerce/2019-Oct.csv']

#### Read through Dask-cudf from CSV

In [17]:
%%time
raw_df = dask_cudf.read_csv(files_paths, inferSchema = True) 
raw_df.head()

CPU times: user 846 ms, sys: 420 ms, total: 1.27 s
Wall time: 6.47 s


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


## Convert timestamp from datetime

In [18]:
raw_df['event_time_dt'] = raw_df['event_time'].astype('datetime64[s]')
raw_df['event_time_ts']= raw_df['event_time_dt'].astype('int')
raw_df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_time_dt,event_time_ts
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,2019-10-01 00:00:00,1569888000
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,2019-10-01 00:00:00,1569888000
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,2019-10-01 00:00:01,1569888001
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,2019-10-01 00:00:01,1569888001
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,2019-10-01 00:00:04,1569888004


In [19]:
# check out the columns with nulls
raw_df.isnull().any().compute()

event_time       False
event_type       False
product_id       False
category_id      False
category_code     True
brand             True
price            False
user_id          False
user_session      True
event_time_dt    False
event_time_ts    False
dtype: bool

In [20]:
# Remove rows where `user_session` is null.

raw_df = raw_df[raw_df['user_session'].isnull()==False]
len(raw_df)

42448762

In [21]:
raw_df.dtypes

event_time              object
event_type              object
product_id               int64
category_id              int64
category_code           object
brand                   object
price                  float64
user_id                  int64
user_session            object
event_time_dt    datetime64[s]
event_time_ts            int64
dtype: object

## Removing repeated (user,item) interactions

### Categorify `user_session` column

In [22]:
cols = list(raw_df.columns)
cols.remove('user_session')
cols

['event_time',
 'event_type',
 'product_id',
 'category_id',
 'category_code',
 'brand',
 'price',
 'user_id',
 'event_time_dt',
 'event_time_ts']

In [23]:
#  load data 
df_event = nvt.Dataset(raw_df) 
# categorify features 
cat_feats = ['user_session'] >> nvt.ops.Categorify()

workflow = nvt.Workflow(cols + cat_feats)
workflow.fit(df_event)
df = workflow.transform(df_event).to_ddf()

In [24]:
df = df.drop(['event_time'],  axis=1)

In [25]:
df.head()

Unnamed: 0,user_session,event_type,product_id,category_id,category_code,brand,price,user_id,event_time_dt,event_time_ts
0,4147851,view,44600062,2103807459595387724,,shiseido,35.79,541312140,2019-10-01 00:00:00,1569888000
1,5316339,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,2019-10-01 00:00:00,1569888000
2,3120154,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,2019-10-01 00:00:01,1569888001
3,4499322,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,2019-10-01 00:00:01,1569888001
4,7176697,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,2019-10-01 00:00:04,1569888004


In [26]:
df.dtypes

user_session     float64
event_type       float64
product_id       float64
category_id      float64
category_code    float64
brand            float64
price            float64
user_id          float64
event_time_dt    float64
event_time_ts    float64
dtype: object

In [27]:
df['user_session'] = df['user_session'].astype('int64')
df['event_time_ts'] = df['event_time_ts'].astype('int64')
df['product_id'] = df['product_id'].astype('int64')
df['category_id'] = df['category_id'].astype('int64')
df['category_code'] = df['category_id'].astype(object)
df['brand'] = df['brand'].astype(object)
df['event_type'] = df['event_type'].astype(object)
df['event_time_dt'] = df['event_time_dt'].astype('datetime64[s]')

In [28]:
df.dtypes

user_session             int64
event_type              object
product_id               int64
category_id              int64
category_code           object
brand                   object
price                  float64
user_id                float64
event_time_dt    datetime64[s]
event_time_ts            int64
dtype: object

In [29]:
df = df.sort_values(['user_session', 'event_time_ts'])

In [30]:
#Keeping only the first user interaction with an item (ignores all future repeated interactions)
if not KEEP_REPEATED_USER_INTERACTIONS:
    df_first_user_item_interaction_df = df.groupby(['user_id', 'product_id']).agg({'event_time_ts': 'min'})
    df_first_user_item_interaction_df = df_first_user_item_interaction_df.compute().reset_index().rename(columns={'event_time_ts': 'first_user_item_event_time_ts'})
    df = df.merge(df_first_user_item_interaction_df, how='inner', left_on=['user_id', 'product_id', 'event_time_ts'], right_on=['user_id', 'product_id','first_user_item_event_time_ts'])
    df = df.drop(columns=['first_user_item_event_time_ts'])

#Keeps repeated interactions on the same items, removing only consecutive interactions, because it might be due to browser tab refreshes or different interaction types (e.g. click, add-to-card, purchase)
else:
    print("Count with in-session repeated interactions: {}".format(len(df)))
    # Sorts the dataframe by session and timestamp, to remove consective repetitions
    df['product_id_past'] = df['product_id'].shift(1)
    df['session_id_past'] = df['user_session'].shift(1)
    #Keeping only no consecutive repeated in session interactions
    df = df[~((df['user_session'] == df['session_id_past']) & \
                 (df['product_id'] == df['product_id_past']))]
    print("Count after removed in-session repeated interactions: {}".format(len(df)))
    del(df['product_id_past'])
    del(df['session_id_past'])

In [31]:
df.head()

Unnamed: 0,user_session,event_type,product_id,category_id,category_code,brand,price,user_id,event_time_dt,event_time_ts
0,354747,view,12709169,2053013553559896355,2053013553559896355,michelin,67.44,561510226,2019-10-18 09:56:13,1571392573
1,161997,view,12600014,2053013554751078769,2053013554751078769,clatronic,40.67,533888320,2019-10-17 18:53:25,1571338405
2,414407,view,1004839,2053013555631882655,2053013555631882655,oppo,179.92,545041406,2019-10-25 13:39:41,1572010781
3,159643,view,51800038,2135658542386905834,2135658542386905834,tomfarr,152.64,514826663,2019-10-31 08:43:46,1572511426
4,66646,view,4802400,2053013554658804075,2053013554658804075,samsung,45.75,547206054,2019-10-03 16:16:05,1570119365


**Full Dataset (7 months) Stats - Number of interactions**
- No filter: 411709736
- Removed iser consecutive repeated interactions in the same items: 261390136
- Removing all user repeated interactions with the same items: 204098003

**1 Months Stats - Number of interactions**
- No filter: 42448764
- Removed iser consecutive repeated interactions in the same items: 26565608 
- Removing all user repeated interactions with the same items: 23312920

### Include the item first time seen feature (for recency calculation)¶

In [32]:
item_first_interaction_df = df.groupby('product_id').agg({'event_time_ts': 'min'}).reset_index().rename(columns={'event_time_ts': 'prod_first_event_time_ts'})
item_first_interaction_df['prod_first_event_time_ts'] = item_first_interaction_df['prod_first_event_time_ts'].astype('datetime64[s]')
item_first_interaction_df.head()

Unnamed: 0,product_id,prod_first_event_time_ts
0,32403886,2019-10-18 15:47:15
1,5801301,2019-10-01 11:21:27
2,12719653,2019-10-01 04:17:46
3,13400621,2019-10-01 05:52:12
4,27800167,2019-10-01 05:44:26


In [33]:
df = df.merge(item_first_interaction_df, on=['product_id'], how='left')

In [34]:
# # save df as parquet files on disk
# df.to_parquet('./NVTabular/input_df')

### Categorical features encoding


In [35]:
cols= ['event_time_dt', 'event_time_ts', 'user_session']

In [36]:
# categorify features 

## filll NA sttrings with unknown.
# fill missing product_actions
fill_na_categs = ['category_code', 'brand'] >> nvt.ops.FillMissing(fill_val='unknown')
cat_feats = fill_na_categs + ['user_id', 'product_id', 'category_id', 'event_type'] >> nvt.ops.Rename(postfix = '_idx') >> nvt.ops.Categorify()

In [37]:
cat_feats.columns

['category_code_idx',
 'brand_idx',
 'user_id_idx',
 'product_id_idx',
 'category_id_idx',
 'event_type_idx']

### Extract Temporal Features

In [38]:
# calculate item recency 
# create custom op
from nvtabular.ops import Operator

class ItemRecency(Operator):
    def transform(self, columns, gdf):
        for column in columns:
            col = gdf[column]
            item_first_timestamp = gdf['prod_first_event_time_ts']
            delta_days = (col - item_first_timestamp).dt.days
            gdf[column + "_age_days"] = delta_days * (delta_days >=0)
        return gdf
            
    def output_column_names(self, columns):
        return [column + "_age_days" for column in columns]
            
    def dependencies(self):
        return ["prod_first_event_time_ts"]

In [39]:
# create time features
sessionTime = ['event_time_dt']

sessionTime_hour = (
    sessionTime >> 
    nvt.ops.LambdaOp(lambda col: col.dt.hour) >> 
    nvt.ops.Rename(name = 'et_hour')
)
sessionTime_weekday = (
    sessionTime >> 
    nvt.ops.LambdaOp(lambda col: col.dt.weekday) >> 
    nvt.ops.Rename(name ='et_dayofweek')
)
sessionTime_day = (
    sessionTime >> 
    nvt.ops.LambdaOp(lambda col: col.dt.day) >> 
    nvt.ops.Rename(name ="et_dayofmonth")
)
sessionTime_month = (
    sessionTime >> 
    nvt.ops.LambdaOp(lambda col: col.dt.month) >> 
    nvt.ops.Rename(name ="et_month")
)

In [40]:
def get_cycled_feature_value_sin(col, max_value):
    value_scaled = (col + 0.000001) / max_value
    value_sin = np.sin(2*np.pi*value_scaled)
    return value_sin

def get_cycled_feature_value_cos(col, max_value):
    value_scaled = (col + 0.000001) / max_value
    value_cos = np.cos(2*np.pi*value_scaled)
    return value_cos

In [41]:
hour_sin = sessionTime_hour >> (lambda col: get_cycled_feature_value_sin(col, 24)) >> nvt.ops.Rename(name='et_hour_sin')
hour_cos = sessionTime_hour >> (lambda col: get_cycled_feature_value_cos(col, 24)) >> nvt.ops.Rename(name ='et_hour_cos')
weekday_sin = sessionTime_weekday >> (lambda col: get_cycled_feature_value_sin(col+1, 7)) >> nvt.ops.Rename(name = 'et_dayofweek_sin')
weekday_cos= sessionTime_weekday >> (lambda col: get_cycled_feature_value_cos(col+1, 7)) >> nvt.ops.Rename(name = 'et_dayofweek_cos')

dayofmonth_sin = sessionTime_hour >> (lambda col: get_cycled_feature_value_sin(col, 31)) >> nvt.ops.Rename(name ='et_dayofmonth_sin')
dayofmonth_cos = sessionTime_hour >> (lambda col: get_cycled_feature_value_cos(col, 31)) >> nvt.ops.Rename(name='et_dayofmonth_cos')
month_sin = sessionTime_weekday >> (lambda col: get_cycled_feature_value_sin(col, 12)) >> nvt.ops.Rename(name='et_month_sin')
month_cos= sessionTime_weekday >> (lambda col: get_cycled_feature_value_cos(col, 12)) >> nvt.ops.Rename(name = 'et_month_cos')

In [42]:
cycled_features = hour_sin + hour_cos + weekday_sin + weekday_cos + dayofmonth_sin + dayofmonth_cos + month_sin + month_cos

In [43]:
recency_features = ["event_time_dt"] >> ItemRecency() >>  nvt.ops.Rename(name='product_recency_days')
recency_features_norm = recency_features >> nvt.ops.LogOp() >> nvt.ops.Normalize() >> nvt.ops.Rename(name='product_recency_days_log_norm')

In [44]:
time_features = (
    sessionTime_hour +
    sessionTime_day + 
    sessionTime_month + 
    sessionTime_weekday +
    recency_features +
    recency_features_norm + 
    cycled_features
)
#time_features.graph

### Computing elapsed time since last interaction (on non-repeated items)

In [45]:
# compute the delta in timestamp for each users session
diff_features = (["event_time_ts"] >> nvt.ops.DifferenceLag(partition_cols=["user_session"]) >> (lambda col: col.astype("float32"))
                                   >> nvt.ops.FillMissing(fill_val=0)
                                   >> nvt.ops.Rename(name="delta_event_secs")
                )
diff_features_log_norm = diff_features >> nvt.ops.LogOp() >> nvt.ops.Normalize() >> nvt.ops.Rename(name='delta_event_secs_log_norm')

### Normalize Continuous Features¶

In [46]:
#Smoothing price long-tailed distribution
price_log = ['price'] >> nvt.ops.LogOp() >> nvt.ops.Normalize() >> nvt.ops.Rename(name='price_log_norm')

In [47]:
# Relative Price to the average price for the category_id
def relative_price_to_avg_categ(col, gdf):
    col = (gdf['price'] - col) / col
    return col
    
avg_category_id_pr = ['category_id'] >> nvt.ops.JoinGroupby(cont_cols =['price'], stats=["mean"]) >> nvt.ops.Rename(name='avg_category_id_price')
relative_price_to_avg_category = avg_category_id_pr >> nvt.ops.LambdaOp(relative_price_to_avg_categ, dependency=['price']) >> nvt.ops.Rename(name="relative_price_to_avg_categ_id")

### Grouping interactions into sessions¶

In [48]:
groupby_feats = cols + cat_feats + time_features + avg_category_id_pr + price_log + relative_price_to_avg_category + diff_features + diff_features_log_norm

In [49]:
groupby_feats.columns

['category_code_idx',
 'brand_idx',
 'user_id_idx',
 'product_id_idx',
 'category_id_idx',
 'event_type_idx',
 'event_time_dt',
 'event_time_ts',
 'user_session',
 'et_hour',
 'et_dayofmonth',
 'et_month',
 'et_dayofweek',
 'product_recency_days',
 'product_recency_days_log_norm',
 'et_hour_sin',
 'et_hour_cos',
 'et_dayofweek_sin',
 'et_dayofweek_cos',
 'et_dayofmonth_sin',
 'et_dayofmonth_cos',
 'et_month_sin',
 'et_month_cos',
 'avg_category_id_price',
 'price_log_norm',
 'relative_price_to_avg_categ_id',
 'delta_event_secs',
 'delta_event_secs_log_norm']

#### Aggregate by session id (create sequence as type of array)

In [50]:
# Define Groupby Workflow
groupby_features = groupby_feats  >> nvt.ops.Groupby(
    groupby_cols=["user_session"], 
    sort_cols=["event_time_ts"],
    aggs={
        'user_id_idx': ['first'],
        'product_id_idx': ["list", "count"],
        'category_code_idx': ["list"],  
        'event_type_idx': ["list"], 
        'brand_idx': ["list"], 
        'category_id_idx': ["list"], 
        'event_time_ts': ["list", "first", "last"],
        'event_time_dt': ["first"],
        'et_month': ["list"],
        'et_hour': ["list"],
        'et_dayofmonth': ["list"],
        'et_dayofweek': ["list"],
        'product_recency_days': ["list"],
        'product_recency_days_log_norm': ["list"],
        'et_hour_sin': ["list"],
        'et_hour_cos': ["list"],
        'et_dayofweek_sin': ["list"],
        'et_dayofweek_cos': ["list"],
        'et_dayofmonth_sin': ["list"],
        'et_dayofmonth_cos': ["list"],
        'et_month_sin': ["list"],
        'et_month_cos': ["list"], 
        'avg_category_id_price': ["list"], 
        'relative_price_to_avg_categ_id': ["list"], 
        'price_log_norm': ["list"], 
        'delta_event_secs': ["list"], 
        'delta_event_secs_log_norm': ["list"], 
        },
    name_sep="-")

In [51]:
groupby_features.columns

['et_dayofmonth-list',
 'price_log_norm-list',
 'product_recency_days-list',
 'product_id_idx-list',
 'et_hour-list',
 'et_dayofmonth_cos-list',
 'user_id_idx-first',
 'relative_price_to_avg_categ_id-list',
 'product_recency_days_log_norm-list',
 'et_dayofweek_sin-list',
 'category_id_idx-list',
 'et_dayofweek_cos-list',
 'event_time_dt-first',
 'et_dayofweek-list',
 'event_time_ts-list',
 'category_code_idx-list',
 'delta_event_secs_log_norm-list',
 'product_id_idx-count',
 'user_session',
 'et_dayofmonth_sin-list',
 'et_month_cos-list',
 'event_time_ts-first',
 'et_month-list',
 'event_type_idx-list',
 'et_hour_cos-list',
 'brand_idx-list',
 'event_time_ts-last',
 'avg_category_id_price-list',
 'delta_event_secs-list',
 'et_hour_sin-list',
 'et_month_sin-list']

In [52]:
groupby_features_nonlist = [x for x in groupby_features.columns if '-list' not in x]
groupby_features_nonlist

['user_id_idx-first',
 'event_time_dt-first',
 'product_id_idx-count',
 'user_session',
 'event_time_ts-first',
 'event_time_ts-last']

In [53]:
SESSIONS_MAX_LENGTH = 20 
MINIMUM_SESSION_LENGTH = 2

In [54]:
groupby_features_trim = ((groupby_features - groupby_features_nonlist)) >> nvt.ops.ListSlice(0,SESSIONS_MAX_LENGTH) >> nvt.ops.Rename(postfix = '_trim')

In [55]:
groupby_features_trim.columns

['et_dayofmonth-list_trim',
 'price_log_norm-list_trim',
 'product_recency_days-list_trim',
 'product_id_idx-list_trim',
 'et_hour-list_trim',
 'et_dayofmonth_cos-list_trim',
 'relative_price_to_avg_categ_id-list_trim',
 'product_recency_days_log_norm-list_trim',
 'et_dayofweek_sin-list_trim',
 'category_id_idx-list_trim',
 'et_dayofweek_cos-list_trim',
 'et_dayofweek-list_trim',
 'event_time_ts-list_trim',
 'category_code_idx-list_trim',
 'delta_event_secs_log_norm-list_trim',
 'et_dayofmonth_sin-list_trim',
 'et_month_cos-list_trim',
 'et_month-list_trim',
 'event_type_idx-list_trim',
 'et_hour_cos-list_trim',
 'brand_idx-list_trim',
 'avg_category_id_price-list_trim',
 'delta_event_secs-list_trim',
 'et_hour_sin-list_trim',
 'et_month_sin-list_trim']

In [56]:
# calculate session day index based on 'timestamp-first' column
remaining_columns = [x for x in groupby_features.columns if x!= 'event_time_dt-first']
day_index = ((groupby_features - remaining_columns)  >> 
    nvt.ops.LambdaOp(lambda col: (col - col.min()).dt.days +1) >> 
    nvt.ops.Rename(f = lambda col: "day_index")
)

In [57]:
day_idx_padded = day_index >> (lambda col: col.astype(str).str.pad(4,fillchar='0')) >> nvt.ops.Rename(f = lambda col: "day_idx_padded")

In [58]:
rename_cols = {"product_id_idx-count": "session_size"} 
groupby_features = groupby_features >> nvt.ops.Rename(lambda col: rename_cols.get(col, col))

In [59]:
filtered_sessions = (groupby_features + groupby_features_trim + day_index + day_idx_padded) >> \
                     nvt.ops.Filter(f=lambda df: df["session_size"] >= MINIMUM_SESSION_LENGTH)

In [60]:
# define the path of the saved parquet files 
path = '/workspace/Transformers4Rec/datasets/ecommerce_rees46/preprocessing/NVTabular/input_df/'
input_paths = glob.glob(os.path.join(path, '*.parquet'))
#input_paths

In [61]:
dataset = nvt.Dataset(input_paths, part_size="500MB")
workflow = nvt.Workflow(filtered_sessions, client=client)
workflow.fit(dataset)
df_indexed = workflow.transform(dataset).to_ddf()

In [62]:
cast_dtypes_cols =  [x for x in df_indexed.columns if ('-list' not in x) & (x!= 'event_time_dt-first') & (x!= 'day_idx_padded')]
cast_dtypes_cols

['user_id_idx-first',
 'session_size',
 'user_session',
 'event_time_ts-first',
 'event_time_ts-last',
 'day_index']

In [63]:
for col in cast_dtypes_cols:
    df_indexed[col] = df_indexed[col].astype('int64')

In [64]:
df_indexed.dtypes

et_dayofmonth-list                          float64
price_log_norm-list                         float64
product_recency_days-list                   float64
product_id_idx-list                         float64
et_hour-list                                float64
et_dayofmonth_cos-list                      float64
user_id_idx-first                             int64
relative_price_to_avg_categ_id-list         float64
product_recency_days_log_norm-list          float64
et_dayofweek_sin-list                       float64
category_id_idx-list                        float64
et_dayofweek_cos-list                       float64
event_time_dt-first                         float64
et_dayofweek-list                           float64
event_time_ts-list                          float64
category_code_idx-list                      float64
delta_event_secs_log_norm-list              float64
session_size                                  int64
user_session                                  int64
et_dayofmont

In [65]:
df_indexed.head()

Unnamed: 0,et_dayofmonth-list,price_log_norm-list,product_recency_days-list,product_id_idx-list,et_hour-list,et_dayofmonth_cos-list,user_id_idx-first,relative_price_to_avg_categ_id-list,product_recency_days_log_norm-list,et_dayofweek_sin-list,...,et_month-list_trim,event_type_idx-list_trim,et_hour_cos-list_trim,brand_idx-list_trim,avg_category_id_price-list_trim,delta_event_secs-list_trim,et_hour_sin-list_trim,et_month_sin-list_trim,day_index,day_idx_padded
0,"[31, 31, 31, 31, 31, 31, 31]","[1.4872231, 1.7600029, -1.2381928, -0.07776256...","[30, 30, 30, 30, 30, 30, 21]","[1127, 1117, 19138, 926, 1116, 153, 1223]","[6, 6, 6, 6, 6, 6, 6]","[0.3473051, 0.3473051, 0.3473051, 0.3473051, 0...",219023,"[0.8999288074661269, 1.6823296038282975, -0.22...","[1.1646589, 1.1646589, 1.1646589, 1.1646589, 1...","[-0.43388462, -0.43388462, -0.43388462, -0.433...",...,"[10, 10, 10, 10, 10, 10, 10]","[3, 3, 3, 3, 3, 3, 3]","[-2.8212997e-07, -2.8212997e-07, -2.8212997e-0...","[166, 166, 3359, 2701, 166, 166, 2701]","[503.0925349643877, 503.0925349643877, 38.0222...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",31,31
1,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]","[0.4324528, 0.038939036, 1.6355649, 0.53558946...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]","[849, 1110, 1086, 935, 839, 766, 1042, 212, 11...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-0.6121061, -0.6121061, -0.6121061, -0.612106...",1597284,"[-0.5001515973243357, -0.6967158337763834, 1.2...","[-0.710463, -0.710463, -0.710463, -0.710463, -...","[1.1285199e-06, 1.1285199e-06, 1.1285199e-06, ...",...,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]","[-0.96592593, -0.96592593, -0.96592593, -0.965...","[2701, 2701, 2701, 2701, 2701, 2701, 2701, 335...","[503.0925349643877, 503.0925349643877, 503.092...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.2588187, 0.2588187, 0.2588187, 0.2588187, 0...","[-5.6425995e-07, -5.6425995e-07, -5.6425995e-0...",6,6
2,"[27, 27]","[0.5510725, 0.6363106]","[26, 26]","[8611, 8673]","[12, 12]","[-0.7587582, -0.7587582]",1811923,"[-0.3850561413909375, -0.3149102941005784]","[1.0069168, 1.0069168]","[1.1285199e-06, 1.1285199e-06]",...,"[10, 10]","[3, 3]","[-1.0, -1.0]","[1449, 1449]","[475.2954207252451, 475.2954207252451]","[0.0, 0.0]","[-5.6425995e-07, -5.6425995e-07]","[-5.6425995e-07, -5.6425995e-07]",27,27
3,"[14, 14]","[1.8049271, 1.9645911]","[13, 13]","[1117, 1147]","[13, 13]","[-0.87434673, -0.87434673]",1412504,"[1.8390602140441328, 2.473913601448517]","[0.25699535, 0.25699535]","[0.7818321, 0.7818321]",...,"[10, 10]","[3, 3]","[-0.96592575, -0.96592575]","[166, 166]","[503.0925349643877, 503.0925349643877]","[0.0, 0.0]","[-0.2588193, -0.2588193]","[5.2359877e-07, 5.2359877e-07]",14,14
4,"[13, 13, 13, 13, 13, 13]","[1.25176, -0.11356461, 1.057411, 0.2789118, 0....","[12, 12, 12, 12, 12, 12]","[122348, 120741, 122296, 122900, 118720, 122608]","[7, 7, 7, 7, 7, 7]","[0.15142754, 0.15142754, 0.15142754, 0.1514275...",1064821,"[0.8467763701904679, -0.3728416757390223, 0.44...","[0.1723775, 0.1723775, 0.1723775, 0.1723775, 0...","[1.1285199e-06, 1.1285199e-06, 1.1285199e-06, ...",...,"[10, 10, 10, 10, 10, 10]","[3, 3, 3, 3, 3, 3]","[-0.25881937, -0.25881937, -0.25881937, -0.258...","[1846, 2848, 1846, 1846, 3163, 3163]","[384.2750056016842, 200.3800238928269, 384.275...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.96592575, 0.96592575, 0.96592575, 0.9659257...","[-5.6425995e-07, -5.6425995e-07, -5.6425995e-0...",13,13


In [66]:
cat_feats.op.get_embedding_sizes(cat_feats.columns)

{'category_code_idx': (625, 59),
 'brand_idx': (3445, 153),
 'user_id_idx': (3022291, 512),
 'product_id_idx': (166795, 512),
 'category_id_idx': (625, 59),
 'event_type_idx': (4, 16)}

# Exporting data

In [67]:
OUTPUT_FOLDER = "/workspace/ecommerce-rees/preproc_sessions_by_day_ts"
!mkdir -p $OUTPUT_FOLDER

In [79]:
SELECTED_COLS = ['user_id_idx-first', 'user_session', 'event_time_ts-first', 'product_id_idx-list_trim', 'event_time_ts-list_trim', 'event_type_idx-list_trim',
                 'category_id_idx-list_trim', 'category_code_idx-list_trim', 'brand_idx-list_trim', 'price_log_norm-list_trim',  'delta_event_secs-list_trim',
                 'delta_event_secs_log_norm-list_trim', 'product_recency_days-list_trim', 'product_recency_days_log_norm-list_trim', 'relative_price_to_avg_categ_id-list_trim',
                 'et_hour_sin-list_trim', 'et_hour_cos-list_trim', 'et_month_sin-list_trim', 'et_month_cos-list_trim', 'et_dayofweek_sin-list_trim', 'et_dayofweek_cos-list_trim',
                 'et_dayofmonth_sin-list_trim', 'et_dayofmonth_cos-list_trim', 'session_size', 'day_idx_padded']


In [69]:
sessions_gdf = df_indexed[SELECTED_COLS]
# sessions_gdf.head()

In [70]:
#sessions_gdf.groupby('day_idx_padded').size().compute().sort_index()

#### Export dataset to parquet, partioned by the session day_idx

In [73]:
PARTITION_COL = 'day_idx_padded'

In [74]:
# Convert to a Dataset and write out hive-partitioned data to disk
nvt_output_path_tmp ='./output_nvt_tmp/'
nvt.Dataset(sessions_gdf).to_parquet(nvt_output_path_tmp, partition_on=[PARTITION_COL])

In [101]:
!ls $nvt_output_path_tmp/

 _file_list.txt		'day_idx_padded2=0010'	'day_idx_padded2=0022'
 _metadata		'day_idx_padded2=0011'	'day_idx_padded2=0023'
 _metadata.json		'day_idx_padded2=0012'	'day_idx_padded2=0024'
'day_idx_padded2=0001'	'day_idx_padded2=0013'	'day_idx_padded2=0025'
'day_idx_padded2=0002'	'day_idx_padded2=0014'	'day_idx_padded2=0026'
'day_idx_padded2=0003'	'day_idx_padded2=0015'	'day_idx_padded2=0027'
'day_idx_padded2=0004'	'day_idx_padded2=0016'	'day_idx_padded2=0028'
'day_idx_padded2=0005'	'day_idx_padded2=0017'	'day_idx_padded2=0029'
'day_idx_padded2=0006'	'day_idx_padded2=0018'	'day_idx_padded2=0030'
'day_idx_padded2=0007'	'day_idx_padded2=0019'	'day_idx_padded2=0031'
'day_idx_padded2=0008'	'day_idx_padded2=0020'
'day_idx_padded2=0009'	'day_idx_padded2=0021'


In [75]:
rename_columns = ['user_idx', 'user_session', 'session_start_ts', 'sess_pid_seq', 'sess_etime_seq', 'sess_etype_seq', 'sess_csid_seq', 'sess_ccid_seq', 'sess_bid_seq',
            'sess_price_log_norm_seq', 'sess_dtime_secs_seq','sess_dtime_secs_log_norm_seq', 'sess_prod_recency_days_seq','sess_prod_recency_days_log_norm_seq',
            'sess_relative_price_to_avg_category_seq', 'sess_et_hour_sin_seq', 'sess_et_hour_cos_seq', 'sess_et_month_sin_seq', 'sess_et_month_cos_seq', 
            'sess_et_dayofweek_sin_seq', 'sess_et_dayofweek_cos_seq', 'sess_et_dayofmonth_sin_seq', 'sess_et_dayofmonth_cos_seq', 'session_size']

### Converting to the HF4Rec dir structure and splitting dataset¶
- move to pandas to be able to save as required file structure to load back in the model.

In [84]:
days_folders = [f for f in sorted(os.listdir(nvt_output_path_tmp)) if f.startswith(PARTITION_COL)]
for day_folder in days_folders:
    df = pd.read_parquet(os.path.join(nvt_output_path_tmp, day_folder))
    df = df.sort_values('event_time_ts-first')
    df.columns = rename_columns
    print("train_set:", len(df))
    out_folder = os.path.join(OUTPUT_FOLDER, day_folder.replace('day_idx_padded=', ''))
    os.makedirs(out_folder, exist_ok=True)
    
    df.to_parquet(os.path.join(out_folder, 'train.parquet'), engine='pyarrow', row_group_size=10000)
    
    random_values = np.random.rand(len(df))
    
    #Extracts 10% for valid and test set. Those sessions are also in the train set, but as evaluation
    #happens only for the subsequent day of training, that is not an issue, and we can keep the train set larger.
    valid_set = df[random_values <= 0.10]
    valid_set.to_parquet(os.path.join(out_folder, 'valid.parquet'))
    print("valid_set:", len(valid_set))
    test_set = df[random_values >= 0.90]
    print("test_set:", len(test_set))
    test_set.to_parquet(os.path.join(out_folder, 'test.parquet'))

train_set: 133105
valid_set: 13377
test_set: 13314
train_set: 121623
valid_set: 12160
test_set: 12070
train_set: 109534
valid_set: 10911
test_set: 10927
train_set: 137280
valid_set: 13707
test_set: 13826
train_set: 125959
valid_set: 12435
test_set: 12572
train_set: 123606
valid_set: 12215
test_set: 12343
train_set: 114435
valid_set: 11287
test_set: 11522
train_set: 137286
valid_set: 13847
test_set: 13753
train_set: 130990
valid_set: 13024
test_set: 13116
train_set: 119762
valid_set: 12038
test_set: 12116
train_set: 141178
valid_set: 14108
test_set: 14124
train_set: 136945
valid_set: 13656
test_set: 13600
train_set: 149435
valid_set: 14853
test_set: 14881
train_set: 131733
valid_set: 13235
test_set: 13103
train_set: 143179
valid_set: 14265
test_set: 14354
train_set: 140935
valid_set: 14039
test_set: 14380
train_set: 127738
valid_set: 12916
test_set: 12833
train_set: 135473
valid_set: 13500
test_set: 13543
train_set: 127588
valid_set: 12871
test_set: 12698
train_set: 132022
valid_set: 13

#### Check exported dataset

In [86]:
# Check one file
df = cudf.read_parquet(os.path.join(OUTPUT_FOLDER, '0001/train.parquet'))
df.head()

Unnamed: 0,user_idx,user_session,session_start_ts,sess_pid_seq,sess_etime_seq,sess_etype_seq,sess_csid_seq,sess_ccid_seq,sess_bid_seq,sess_price_log_norm_seq,...,sess_relative_price_to_avg_category_seq,sess_et_hour_sin_seq,sess_et_hour_cos_seq,sess_et_month_sin_seq,sess_et_month_cos_seq,sess_et_dayofweek_sin_seq,sess_et_dayofweek_cos_seq,sess_et_dayofmonth_sin_seq,sess_et_dayofmonth_cos_seq,session_size
15877,1744966,4499322,1569888001,"[2451, 2142, 1827, 2733, 1983]","[1569888001, 1569888019, 1569888065, 156988816...","[3, 3, 3, 3, 3]","[159, 159, 159, 159, 159]","[159, 159, 159, 159, 159]","[1778, 1387, 1387, 13, 13]","[0.43329915, 1.0934848, 1.8503833, 0.45006597,...",...,"[-0.6638423781795151, -0.2243163544160583, 1.0...","[2.6179939e-07, 2.6179939e-07, 2.6179939e-07, ...","[1.0, 1.0, 1.0, 1.0, 1.0]","[0.5000004, 0.5000004, 0.5000004, 0.5000004, 0...","[0.86602515, 0.86602515, 0.86602515, 0.8660251...","[0.9749277, 0.9749277, 0.9749277, 0.9749277, 0...","[-0.22252177, -0.22252177, -0.22252177, -0.222...","[2.0268341e-07, 2.0268341e-07, 2.0268341e-07, ...","[1.0, 1.0, 1.0, 1.0, 1.0]",5
116692,676889,3120154,1569888001,"[75671, 69217, 69041, 69226, 69055, 69159, 691...","[1569888001, 1569888069, 1569888086, 156988820...","[3, 3, 3, 3, 3, 3, 3, 3, 3]","[180, 107, 107, 107, 107, 107, 107, 107, 107]","[180, 107, 107, 107, 107, 107, 107, 107, 107]","[3163, 3163, 502, 3163, 3163, 3163, 3163, 3163...","[1.0403255, 0.45006597, 0.9571255, 0.9735843, ...",...,"[-0.14619387999067904, -0.5671669560450453, -0...","[2.6179939e-07, 2.6179939e-07, 2.6179939e-07, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]","[0.5000004, 0.5000004, 0.5000004, 0.5000004, 0...","[0.86602515, 0.86602515, 0.86602515, 0.8660251...","[0.9749277, 0.9749277, 0.9749277, 0.9749277, 0...","[-0.22252177, -0.22252177, -0.22252177, -0.222...","[2.0268341e-07, 2.0268341e-07, 2.0268341e-07, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",9
54304,1220411,7176697,1569888004,"[525, 1147]","[1569888004, 1569888019]","[3, 3]","[85, 85]","[85, 85]","[166, 166]","[1.5852594, 1.9646319]",...,"[1.1506580296934636, 2.474092494979518]","[2.6179939e-07, 2.6179939e-07]","[1.0, 1.0]","[0.5000004, 0.5000004]","[0.86602515, 0.86602515]","[0.9749277, 0.9749277]","[-0.22252177, -0.22252177]","[2.0268341e-07, 2.0268341e-07]","[1.0, 1.0]",2
6679,78430,471501,1569888005,"[3278, 3367, 3385, 3169, 24684]","[1569888005, 1569888022, 1569888093, 156988810...","[3, 3, 3, 3, 3]","[211, 211, 211, 211, 52]","[211, 211, 211, 211, 52]","[2502, 2502, 2502, 2502, 1201]","[1.4471576, 1.4582802, 1.5095989, 1.5360643, 0...",...,"[0.10425298946218896, 0.11989400107802217, 0.1...","[2.6179939e-07, 2.6179939e-07, 2.6179939e-07, ...","[1.0, 1.0, 1.0, 1.0, 1.0]","[0.5000004, 0.5000004, 0.5000004, 0.5000004, 0...","[0.86602515, 0.86602515, 0.86602515, 0.8660251...","[0.9749277, 0.9749277, 0.9749277, 0.9749277, 0...","[-0.22252177, -0.22252177, -0.22252177, -0.222...","[2.0268341e-07, 2.0268341e-07, 2.0268341e-07, ...","[1.0, 1.0, 1.0, 1.0, 1.0]",5
123065,1780029,3557184,1569888008,"[145429, 145489, 145566]","[1569888008, 1569888027, 1569888044]","[3, 3, 3]","[131, 131, 131]","[131, 131, 131]","[1849, 3163, 1849]","[-0.98448616, 0.10355003, -0.096378]",...,"[-0.6828770962654591, 0.2761962530269521, -0.0...","[2.6179939e-07, 2.6179939e-07, 2.6179939e-07]","[1.0, 1.0, 1.0]","[0.5000004, 0.5000004, 0.5000004]","[0.86602515, 0.86602515, 0.86602515]","[0.9749277, 0.9749277, 0.9749277]","[-0.22252177, -0.22252177, -0.22252177]","[2.0268341e-07, 2.0268341e-07, 2.0268341e-07]","[1.0, 1.0, 1.0]",3


In [87]:
df.shape

(133105, 24)