In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =====

# Training and Deploying Multi-Stage Recommender Systems

In this notebook, we are going to use a subset of a publicly available eCommerce dataset. The full dataset contains 7 months data (from October 2019 to April 2020) from a large multi-category online store. Each row in the file represents an event. All events are related to products and users. Each event is like many-to-many relation between products and users. Data collected by Open CDP project and the source of the dataset is [REES46 Marketing Platform](https://rees46.com/).

We use csv files from 2019-Oct to 2020-April for training and validating our models, so you can visit this site and download the csv files: https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store.

**Learning Objectives**

- Preliminary pre-processing and cleaning of the 7 months of Ecom-REES46 datasets.

In [2]:
import os
import pandas as pd
import glob
import cudf
import numpy as np
import gc

In [3]:
list_files = glob.glob('/workspace/data/ecom/*.csv')

In [4]:
list_files

['/workspace/data/ecom/2019-Dec.csv',
 '/workspace/data/ecom/2020-Jan.csv',
 '/workspace/data/ecom/2020-Feb.csv',
 '/workspace/data/ecom/2019-Oct.csv',
 '/workspace/data/ecom/2020-Mar.csv',
 '/workspace/data/ecom/2020-Apr.csv',
 '/workspace/data/ecom/2019-Nov.csv']

Next, we process a single .csv file and extract/filter the rows

In [5]:
# def process_files(file):
#     df_tmp = pd.read_csv(file)
#     df_tmp['session_purchase'] =  df_tmp['user_session'] + '_' + df_tmp['product_id'].astype(str)
#     df_purchase = df_tmp[df_tmp['event_type']=='purchase']
#     df_cart = df_tmp[df_tmp['event_type']=='cart']
#     df_purchase = df_purchase[df_purchase['session_purchase'].isin(df_cart['session_purchase'])]
#     df_cart = df_cart[~(df_cart['session_purchase'].isin(df_purchase['session_purchase']))]
#     df_cart['target'] = 0
#     df_purchase['target'] = 1
#     df = pd.concat([df_cart, df_purchase])
#     df = df.drop('category_id', axis=1)
#     df = df.drop('session_purchase', axis=1)
#     df[['cat_0', 'cat_1', 'cat_2', 'cat_3']] = df['category_code'].str.split("\.", n = 3, expand = True).fillna('NA')
#     df['brand'] = df['brand'].fillna('NA')
#     df = df.drop('category_code', axis=1)
#     df['timestamp'] = pd.to_datetime(df['event_time'].str.replace(' UTC', ''))
#     df['ts_month'] = df['timestamp'].dt.month
#     df['event_time_ts']= df['timestamp'].astype('int')

#     df.to_csv('./' + file.replace('/workspace/data/ecom', ''), index=False)

In [6]:
# for file in list_files:
#     print(file)
#     process_files(file)

In [7]:
lp = []
list_files = glob.glob('./*.csv')

In [8]:
for file in list_files:
    lp.append(cudf.read_csv(file))

In [9]:
df = cudf.concat(lp)

Cast dtypes. Be sure the timestamp column is a datetime object.

In [10]:
df['timestamp'] = df['timestamp'].astype('datetime64[s]')
df['user_id'] = df['user_id'].astype('int32')
df['product_id'] = df['product_id'].astype('int32')
df['target'] = df['target'].astype('int32')
df['ts_month'] = df['ts_month'].astype('int32')
df['price'] = df['price'].astype('float32')

In [11]:
df.shape

(16695562, 15)

Add day and hour columns

In [12]:
df.head(2)

Unnamed: 0,event_time,event_type,product_id,brand,price,user_id,user_session,target,cat_0,cat_1,cat_2,cat_3,timestamp,ts_month,event_time_ts
0,2019-12-01 00:00:28 UTC,cart,17800342,zeta,66.900002,550465671,22650a62-2d9c-4151-9f41-2674ec6d32d5,0,computers,desktop,,,2019-12-01 00:00:28,12,1575158428000000000
1,2019-12-01 00:00:39 UTC,cart,3701309,polaris,89.32,543733099,a65116f4-ac53-4a41-ad68-6606788e674c,0,appliances,environment,vacuum,,2019-12-01 00:00:39,12,1575158439000000000


In [13]:
# Remove repeated interactions within the same hour, keeping only the last time the user has viewed a product
df = df.groupby(['user_id', 'product_id']).last().reset_index()

In [14]:
df.shape

(9279526, 15)

In [15]:
# check if there are any duplicated (user, item) pairs
df.to_pandas().duplicated(subset=['user_id', 'product_id']).sum()

0

In [16]:
df['timestamp'].min(), df['timestamp'].max()

(numpy.datetime64('2019-10-01T00:05:14'),
 numpy.datetime64('2020-04-30T23:59:55'))

In [17]:
#df.to_parquet('/workspace/data/ecom/df_Oct_Apr.parquet')

In [18]:
df_train = df[(df['ts_month']!=4)]
df_valid = df[(df['ts_month']==4)]

In [19]:
df_train.shape, df_valid.shape

((7613270, 15), (1666256, 15))

In [20]:
df_train['event_type'].value_counts()

cart        4222002
purchase    3391268
Name: event_type, dtype: int32

In [21]:
df =None
del df
gc.collect()

33

### Select users with minimum interactions in both train and valid sets

In [22]:
df_train.user_id.nunique(), df_train.product_id.nunique()

(2969453, 186755)

In [23]:
df_train.groupby('user_id').size().describe(percentiles=np.arange(0.0, 1.05, 0.05))

count    2.969453e+06
mean     2.563863e+00
std      3.508274e+00
min      1.000000e+00
0%       1.000000e+00
5%       1.000000e+00
10%      1.000000e+00
15%      1.000000e+00
20%      1.000000e+00
25%      1.000000e+00
30%      1.000000e+00
35%      1.000000e+00
40%      1.000000e+00
45%      1.000000e+00
50%      1.000000e+00
55%      2.000000e+00
60%      2.000000e+00
65%      2.000000e+00
70%      2.000000e+00
75%      3.000000e+00
80%      3.000000e+00
85%      4.000000e+00
90%      5.000000e+00
95%      8.000000e+00
100%     4.750000e+02
max      4.750000e+02
dtype: float64

In [24]:
df_train.groupby('product_id').size().describe(percentiles=np.arange(0.0, 1.05, 0.05))

count    186755.000000
mean         40.766084
std         859.462202
min           1.000000
0%            1.000000
5%            1.000000
10%           1.000000
15%           1.000000
20%           1.000000
25%           1.000000
30%           2.000000
35%           2.000000
40%           2.000000
45%           3.000000
50%           3.000000
55%           4.000000
60%           5.000000
65%           6.000000
70%           8.000000
75%          11.000000
80%          15.000000
85%          22.000000
90%          37.000000
95%          83.000000
100%     170297.000000
max      170297.000000
dtype: float64

In [25]:
df_train.user_id.nunique(), df_train.product_id.nunique()

(2969453, 186755)

Avoid heavy users- cap users' high interactions keeping only the last 50 (max) interactions

In [26]:
df_train = df_train.sort_values(['user_id', 'event_time_ts'], ascending=False)
df_train["ones"] = 1
df_train['cumsum'] = df_train.groupby('user_id')['ones'].cumsum()
df_train = df_train[df_train['cumsum'] <= 50]

In [27]:
df_train.groupby('user_id').size().describe(percentiles=np.arange(0.0, 1.05, 0.05))

count    2.969453e+06
mean     2.553940e+00
std      3.278977e+00
min      1.000000e+00
0%       1.000000e+00
5%       1.000000e+00
10%      1.000000e+00
15%      1.000000e+00
20%      1.000000e+00
25%      1.000000e+00
30%      1.000000e+00
35%      1.000000e+00
40%      1.000000e+00
45%      1.000000e+00
50%      1.000000e+00
55%      2.000000e+00
60%      2.000000e+00
65%      2.000000e+00
70%      2.000000e+00
75%      3.000000e+00
80%      3.000000e+00
85%      4.000000e+00
90%      5.000000e+00
95%      8.000000e+00
100%     5.000000e+01
max      5.000000e+01
dtype: float64

In [28]:
def filter_by_freq(df_to_filter: cudf.DataFrame, df_for_stats: cudf.DataFrame, column: str, min_freq: int) -> cudf.DataFrame:
    # Frequencies of each value in the column.
    freq = df_for_stats[column].value_counts()
    # Select frequent values. Value is in the index.
    frequent_values = freq[freq >= min_freq].index
    # Return only rows with value frequency above threshold.
    return df_to_filter[df_to_filter[column].isin(frequent_values)]

In [29]:
NUM_ROUNDS_MIN_FREQ_FILTERING=10
MIN_USER_FREQ=5
MIN_ITEM_FREQ=5

In [30]:
print('Before filtering: ', len(df_train))
for r in range(NUM_ROUNDS_MIN_FREQ_FILTERING):
    print(f'Round #{r}')
    df_train = filter_by_freq(df_to_filter=df_train, df_for_stats=df_train, 
                                      column='user_id', min_freq=MIN_USER_FREQ)
    print('After filtering users: ',len(df_train))
    df_train = filter_by_freq(df_to_filter=df_train, df_for_stats=df_train, 
                                      column='product_id', min_freq=MIN_ITEM_FREQ)
    print('After filtering items: ',len(df_train))


Before filtering:  7583804
Round #0
After filtering users:  3302384
After filtering items:  3118824
Round #1
After filtering users:  3031772
After filtering items:  3020691
Round #2
After filtering users:  3013789
After filtering items:  3012746
Round #3
After filtering users:  3011940
After filtering items:  3011844
Round #4
After filtering users:  3011772
After filtering items:  3011754
Round #5
After filtering users:  3011746
After filtering items:  3011746
Round #6
After filtering users:  3011746
After filtering items:  3011746
Round #7
After filtering users:  3011746
After filtering items:  3011746
Round #8
After filtering users:  3011746
After filtering items:  3011746
Round #9
After filtering users:  3011746
After filtering items:  3011746


In [31]:
df_train.user_id.nunique(), df_train.product_id.nunique()

(351049, 51424)

In [32]:
gc.collect()

174

In [33]:
df_train.isnull().any()

user_id          False
product_id       False
event_time       False
event_type       False
brand             True
price            False
user_session     False
target           False
cat_0             True
cat_1             True
cat_2             True
cat_3             True
timestamp        False
ts_month         False
event_time_ts    False
ones             False
cumsum           False
dtype: bool

In [34]:
df_train = df_train.drop(['ones', 'cumsum'], axis=1)

In [35]:
df_train.head(2)

Unnamed: 0,user_id,product_id,event_time,event_type,brand,price,user_session,target,cat_0,cat_1,cat_2,cat_3,timestamp,ts_month,event_time_ts
8850950,635096898,26205398,2020-03-31 20:00:17 UTC,purchase,,178.380005,27282c23-cf25-436f-87f9-b1fefa8ecee3,1,construction,components,faucet,,2020-03-31 20:00:17,3,1585684817000000000
8850949,635096898,26205378,2020-03-31 19:58:21 UTC,purchase,,263.070007,27282c23-cf25-436f-87f9-b1fefa8ecee3,1,construction,components,faucet,,2020-03-31 19:58:21,3,1585684701000000000


In [36]:
df_train.user_id.nunique(), df_train.product_id.nunique()

(351049, 51424)

In [37]:
df_train = df_train.reset_index(drop=True)

In [38]:
df_train.shape

(3011746, 15)

In [39]:
df_train.to_parquet('/workspace/data/ecom/train.parquet')

### Valid and Test Sets

Be sure that test set does not have OOV.

In [40]:
df_valid.shape

(1666256, 15)

In [41]:
df_valid['ts_month'].value_counts()

4    1666256
Name: ts_month, dtype: int32

Be sure valid dataset does not have unseen user_id and product_id

In [42]:
df_valid = df_valid[df_valid['user_id'].isin(df_train['user_id'].unique()) & df_valid['product_id'].isin(df_train['product_id'].unique())]
len(df_valid)

181502

In [43]:
df_valid['ts_month'].value_counts()

4    181502
Name: ts_month, dtype: int32

In [44]:
df_valid.groupby('user_id').size().describe(percentiles=np.arange(0.0, 1.05, 0.05))

count    81827.000000
mean         2.218119
std          2.263735
min          1.000000
0%           1.000000
5%           1.000000
10%          1.000000
15%          1.000000
20%          1.000000
25%          1.000000
30%          1.000000
35%          1.000000
40%          1.000000
45%          1.000000
50%          1.000000
55%          2.000000
60%          2.000000
65%          2.000000
70%          2.000000
75%          3.000000
80%          3.000000
85%          4.000000
90%          4.000000
95%          6.000000
100%       132.000000
max        132.000000
dtype: float64

In [45]:
df_valid.groupby('product_id').size().describe(percentiles=np.arange(0.0, 1.05, 0.05))

count    20405.000000
mean         8.894977
std         52.264027
min          1.000000
0%           1.000000
5%           1.000000
10%          1.000000
15%          1.000000
20%          1.000000
25%          1.000000
30%          1.000000
35%          1.000000
40%          2.000000
45%          2.000000
50%          2.000000
55%          3.000000
60%          3.000000
65%          4.000000
70%          5.000000
75%          6.000000
80%          7.000000
85%          9.000000
90%         14.000000
95%         26.000000
100%      3373.000000
max       3373.000000
dtype: float64

Avoid heavy users- cap users' high interactions keeping only the last 50 (max) interactions

In [46]:
valid = df_valid.sort_values(['user_id', 'event_time_ts'], ascending=False)
valid["ones"] = 1
valid['cumsum'] = valid.groupby('user_id')['ones'].cumsum()
valid = valid[valid['cumsum'] <= 50]

In [47]:
valid.groupby('user_id').size().describe(percentiles=np.arange(0.0, 1.05, 0.05))

count    81827.000000
mean         2.215858
std          2.189142
min          1.000000
0%           1.000000
5%           1.000000
10%          1.000000
15%          1.000000
20%          1.000000
25%          1.000000
30%          1.000000
35%          1.000000
40%          1.000000
45%          1.000000
50%          1.000000
55%          2.000000
60%          2.000000
65%          2.000000
70%          2.000000
75%          3.000000
80%          3.000000
85%          4.000000
90%          4.000000
95%          6.000000
100%        50.000000
max         50.000000
dtype: float64

In [48]:
valid = valid.drop(['ones', 'cumsum'], axis=1)

In [49]:
valid['ts_month'].value_counts()

4    181317
Name: ts_month, dtype: int32

In [50]:
valid.head(2)

Unnamed: 0,user_id,product_id,event_time,event_type,brand,price,user_session,target,cat_0,cat_1,cat_2,cat_3,timestamp,ts_month,event_time_ts
8850948,635096898,1005243,2020-04-15 11:45:02 UTC,purchase,vivo,411.829987,52322b04-c196-46c0-b924-a06fff38727d,1,electronics,smartphone,,,2020-04-15 11:45:02,4,1586951102000000000
8850955,635096898,100023409,2020-04-15 11:43:07 UTC,purchase,lenovo,491.649994,52322b04-c196-46c0-b924-a06fff38727d,1,construction,tools,light,,2020-04-15 11:43:07,4,1586950987000000000


In [51]:
df_valid=None
del df_valid
gc.collect()

14

In [52]:
valid = valid.reset_index(drop=True)
valid.to_parquet('/workspace/data/ecom/valid.parquet')

In [53]:
valid=None
del valid
gc.collect()

85