In [None]:
#
# The MIT License (MIT)

# Copyright (c) 2021, NVIDIA CORPORATION

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

**N.B: This preproc-V1 generates interactions and sessions without removing repetitions** 

In [1]:
from IPython.display import display
import gc
import glob
import os
from functools import partial

import cudf
import cupy
import json
import numpy as np
import nvtabular as nvt
import pandas as pd
from tqdm import tqdm
import pickle

pd.options.display.max_columns = None
tqdm.pandas()

  from pandas import Panel


In [2]:
nvt.__version__

'0.5.3'

In [3]:
# Set data and output paths 
DATA_FOLDER = "/workspace/"
FILENAME_PATTERN_BROWSING = 'browsing_train.csv'
FILENAME_PATTERN_SEARCH = 'search_train.csv'
DATA_PATH_BROWSING = os.path.join(DATA_FOLDER, FILENAME_PATTERN_BROWSING)
DATA_PATH_SEARCH = os.path.join(DATA_FOLDER, FILENAME_PATTERN_SEARCH)
OUTPUT_DIR = '/workspace/coveo_task2_v1_phase2/sessions_with_repetitions/'
# Create the output folder up front: every section of the notebook writes parquet
# files / workflows under OUTPUT_DIR and would fail if the folder did not exist.
os.makedirs(OUTPUT_DIR, exist_ok=True)

* The objective of this notebook is to create sequential features for user sessions and generate classification features: 
    - The product id of first AC event 
    - The purchase of this AC (binary label)  
    - nb_after_add 

* Four independent sections that create different parquet files : 

   - Feature engineering with Pandas to merge browsing, search and test data and create purchase and AC features: <a href ='#pandas_proc'> Section 1 </a> 

   - Preprocess row interactions to encode categoricals and normalize numerical features using NVTabular :  <a href='#row_workflow'> Section 2 </a>
   
   - Group by interactions to create sessions table using NVTabular:  <a href='#session_workflow'> Section 3 </a>
   
   - Duplicate sessions in train and validation data by truncating the sequence of interactions to different number of actions (0, 2, 4, 6, 8, 10) after the AC : <a href='#session_duplicate'> Section 4 </a>

<h1> <center> <a id='pandas_proc'> Section 1 : Create event table </a></center></h1>

- Merge test and browsing data 

In [4]:
# Load the train browsing events
browsing = pd.read_csv(DATA_PATH_BROWSING, sep=',')
# Add columns 'is_search' and 'is_test'
browsing['is_search'] = 0 
browsing['is_test'] = 0 
# Load test data (path derived from DATA_FOLDER, like the other input files)
with open(os.path.join(DATA_FOLDER, 'intention_test_phase_2.json')) as json_file:
    # read the test cases from the provided file
    test_queries = json.load(json_file)
# Flatten the test cases: one row per event of each 'query' list,
# with the 'nb_after_add' of the enclosing test case repeated on every row
test_df = pd.json_normalize(test_queries, 'query', 'nb_after_add')
test_df['is_test'] = 1
print("There is %s unique sessions in Test table" %test_df.session_id_hash.nunique())
# Keep only the browsing (non-search) events of the test data
test_browsing = test_df[['session_id_hash', 'event_type', 'product_action', 'product_sku_hash',
       'server_timestamp_epoch_ms', 'hashed_url', 'is_search', 'is_test', 'nb_after_add']]
test_browsing = test_browsing[test_browsing.is_search==False]
# Concat train test browsing data to create event table 
event_df = pd.concat([browsing, test_browsing])
event_df.reset_index(drop=True, inplace=True)
# The train frame is no longer needed once it is part of event_df
del browsing

There is 47711 unique sessions in Test table


- Process duplicated events: which are defined as interactions that occur in the same session and at the same time 

In [5]:
# A row is a repetition when an earlier row has the same (session, timestamp) pair;
# among those repetitions only the 'pageview' rows are removed.
is_pageview = event_df.event_type == 'pageview'
is_repetition = event_df.duplicated(['session_id_hash' , 'server_timestamp_epoch_ms'])
event_df = event_df.drop(event_df[is_pageview & is_repetition].index)

- Keep the mapping of test sessions to their nb_after_add before merging with search table 

In [6]:
test_session_mapping= dict(zip(test_df.session_id_hash, test_df.nb_after_add))

- Generate search table from train and test data 

In [7]:
import ast
# helper function to convert string to list object
def convert_str_to_list(x):
    """Parse the string representation of a Python literal (e.g. "['a', 'b']").

    Null values (None / NaN) are returned unchanged; any other value is handed
    to ``ast.literal_eval``.
    """
    return x if pd.isnull(x) else ast.literal_eval(x)

In [8]:
# Load search data (DATA_PATH_SEARCH is defined in the configuration cell)
search = pd.read_csv(DATA_PATH_SEARCH, sep=',')
# Add column event_type 
search['event_type'] = 'search'
# Add column 'is_search'
search['is_search'] = 1
search['is_test'] = 0
# Drop 123 rows where: (clicked_skus_hash != NaN) and (product_skus_hash == NaN)
condition = (search['product_skus_hash'].isnull()) & (~search['clicked_skus_hash'].isnull())
# .copy(): the column assignments below must write to an independent frame,
# not to a slice of the raw table (avoids SettingWithCopyWarning)
search = search.loc[~condition].copy()
# Convert strings to list object 
for col in ['product_skus_hash', 'clicked_skus_hash', 'query_vector']: 
    search[col] = search[col].progress_apply(convert_str_to_list)
# Add search events from test data
test_search = test_df[['session_id_hash', 'query_vector', 'clicked_skus_hash',
       'product_skus_hash', 'server_timestamp_epoch_ms', 'event_type',
       'is_search', 'is_test']]
test_search = test_search[test_search.is_search==True]
# Concat test and train search data
search = pd.concat([search, test_search])
# NOTE(review): without drop=True the previous row labels are kept as an extra
# 'index' column, which is then written to search.parquet — confirm it is wanted.
search.reset_index(inplace=True)
# Compute number of returned and clicked items (0 when the list is missing)
search['impression_size'] = search.product_skus_hash.str.len().fillna(0)
search['clicks_size'] = search.clicked_skus_hash.str.len().fillna(0)
# Compute number of search queries per session 
tmp = search.groupby('session_id_hash').size().reset_index()
tmp.columns = ['session_id_hash', 'nb_queries']
search = search.merge(tmp, on='session_id_hash', how='left')
# Update list of impressions by the clicked item when it is missing
def add_clicked(x): 
    """Return the impressions of a search row, completed with its clicked items.

    When both ``x.product_skus_hash`` and ``x.clicked_skus_hash`` are lists, the
    result holds every distinct item of the two lists: impressions first, in
    their original order, followed by the clicked items that were not part of
    the impressions. The order is deterministic, whereas a ``set`` union of
    strings is ordered differently from one interpreter run to the next.
    In every other case ``x.product_skus_hash`` is returned unchanged.
    """
    if isinstance(x.clicked_skus_hash, list) and isinstance(x.product_skus_hash, list):
        # dict keys are unique and keep insertion order
        return list(dict.fromkeys(x.product_skus_hash + x.clicked_skus_hash))
    return x.product_skus_hash
search['updated_product_skus_hash'] = search.progress_apply(add_clicked, axis=1)

100%|██████████| 819393/819393 [00:33<00:00, 24499.01it/s]
100%|██████████| 819393/819393 [00:04<00:00, 164598.00it/s]
100%|██████████| 819393/819393 [02:46<00:00, 4909.77it/s]
100%|██████████| 834938/834938 [00:28<00:00, 29056.09it/s]


In [9]:
def add_clicked(x): 
    """Return the impressions of a search row, completed with its clicked items.

    NOTE(review): this cell repeats the definition (and the apply) of the
    previous cell; one of the two can probably be removed.

    When both ``x.product_skus_hash`` and ``x.clicked_skus_hash`` are lists, the
    result holds every distinct item of the two lists: impressions first, in
    their original order, followed by the clicked items that were not part of
    the impressions. The order is deterministic, whereas a ``set`` union of
    strings is ordered differently from one interpreter run to the next.
    In every other case ``x.product_skus_hash`` is returned unchanged.
    """
    if isinstance(x.clicked_skus_hash, list) and isinstance(x.product_skus_hash, list):
        # dict keys are unique and keep insertion order
        return list(dict.fromkeys(x.product_skus_hash + x.clicked_skus_hash))
    return x.product_skus_hash
search['updated_product_skus_hash'] = search.progress_apply(add_clicked, axis=1)

100%|██████████| 834938/834938 [00:31<00:00, 26235.30it/s]


- Define the session search as a sequence of search queries and the  interacted items 

In [10]:
def all_products(x):
    """Flatten a Series of item lists into one list, skipping null entries.

    Returns ``['missing']`` when no item is left after flattening.
    """
    flattened = [item for items in x.dropna() for item in items]
    return flattened if flattened else ['missing']

def _clicked_flags(row):
    """Return one 0/1 flag per item of 'flat_product_skus_hash': 1 when the item
    also appears in 'flat_clicked_skus_hash' of the same session."""
    # set membership instead of scanning the clicked list for every impression
    clicked = set(row['flat_clicked_skus_hash'])
    return [int(sku in clicked) for sku in row['flat_product_skus_hash']]

# One row per session: queries are ordered by time, then their vectors,
# impressions and clicks are flattened into session-level lists
session_search = search.sort_values(['session_id_hash',
                                     'server_timestamp_epoch_ms']).groupby('session_id_hash').agg({'query_vector': lambda x: list(np.concatenate(x.values)),
                                                                                                                    'updated_product_skus_hash': all_products,
                                                                                                                    'clicked_skus_hash': all_products,
                                                                                                                    'impression_size': list,
                                                                                                                    'clicks_size': list,
                                                                                                                    'nb_queries': 'last'
                                                                                                                  })
session_search.columns = ['flat_query_vector', 'flat_product_skus_hash', 'flat_clicked_skus_hash', 'impressions_size', 'clicks_size', 'nb_queries']
session_search['clicked-flag'] = session_search.progress_apply(_clicked_flags, axis=1)
session_search = session_search.reset_index()

100%|██████████| 560394/560394 [00:58<00:00, 9543.72it/s] 


- Save search tables 

In [11]:
session_search.to_parquet(os.path.join(OUTPUT_DIR, "session_search.parquet"))
search.to_parquet(os.path.join(OUTPUT_DIR, "search.parquet"))

- Add search clicks as an additional product_action in the event_table 

In [12]:
# select search events with clicks
use_cols = ['session_id_hash', 'clicked_skus_hash',
            'server_timestamp_epoch_ms', 'event_type',
            'is_search']
# .copy(): the columns added below must not write into a slice of `search`
search_clicks = search.loc[search.clicks_size>0, use_cols].copy()
print("There are %s search sessions that generate a click" %search_clicks.session_id_hash.nunique())
# Create new event-type and product-action
search_clicks['event_type'] = 'search'
search_clicks['product_action'] = 'click'
search_clicks['is_test'] = search_clicks['session_id_hash'].isin(test_df.session_id_hash.unique()).astype(int)
# Unstack the list of clicked items to multiple rows : each row is a single clicked item.
# The clicked item becomes the 'product_sku_hash' of the new 'click' event
# (renamed by name rather than by column position).
search_clicks = (search_clicks
                 .explode('clicked_skus_hash')
                 .rename(columns={'clicked_skus_hash': 'product_sku_hash'})
                 .reset_index(drop=True))
# add nb_after_add for click events from the task mapping 
search_clicks['nb_after_add'] = search_clicks.session_id_hash.map(test_session_mapping)
# Add search clicks to event table 
event_df = pd.concat([event_df, search_clicks])
event_df.event_type.value_counts()

There are 157493 search sessions that generate a click


pageview         20967984
event_product    10671295
search             402589
Name: event_type, dtype: int64

- Fill missing product ids of pageview events with the url of the page : new column 'product_url_hash' is created

In [13]:
event_df['product_url_hash'] = event_df['product_sku_hash'].fillna(event_df['hashed_url'])

-  Keep sessions with at least one 'add' product_action 

In [14]:
# Boolean Series indexed by session id: True when the session has at least one
# 'add' product_action. Computed with a vectorized groupby-any instead of a
# Python-level apply over every session.
sessions_with_add_mask = event_df.product_action.eq('add').groupby(event_df.session_id_hash).any()
print(" \n There are %s sessions with at least one add-to-cart (AC) event" %sessions_with_add_mask.sum())
sessions_with_add = sessions_with_add_mask[sessions_with_add_mask == True].index.tolist()
event_df =event_df[event_df.session_id_hash.isin(sessions_with_add)]

100%|██████████| 4982436/4982436 [06:46<00:00, 12242.56it/s]


 
 There are 262395 sessions with at least one add-to-cart (AC) event


-  Add product information

In [15]:
# Product catalogue, read from DATA_FOLDER like the other input files
product_info = pd.read_csv(os.path.join(DATA_FOLDER, 'sku_to_content.csv'))
def product_main_category(x):
    """Return the part of a '/'-separated category path located before the first '/'.

    Null values are returned unchanged.
    """
    return x if pd.isna(x) else x.partition('/')[0]

# Main category = first level of the category path
product_info['main_category'] = product_info['category_hash'].progress_apply(product_main_category)

# Average price bucket per main category and per full category path
main_price = (product_info.groupby('main_category')['price_bucket']
              .mean()
              .rename('mean_price_main')
              .reset_index())
hierarchy_price = (product_info.groupby('category_hash')['price_bucket']
                   .mean()
                   .rename('mean_price_hierarchy')
                   .reset_index())
product_info = (product_info
                .merge(main_price, on=['main_category'], how='left')
                .merge(hierarchy_price, on=['category_hash'], how='left'))

# Attach the product attributes to every event (events without a known sku get NaN)
product_cols = ['product_sku_hash', 'main_category', 'category_hash',
                'price_bucket', 'mean_price_hierarchy', 'mean_price_main' ]
event_df = event_df.merge(product_info[product_cols], on='product_sku_hash', how='left')

100%|██████████| 66386/66386 [00:00<00:00, 439230.37it/s]


- Create features related to the first purchase event within a session : 
             'first_purchase_id', 'first_purchase_position', 'is_purchased', 'purchase_timestamp'

In [16]:
def get_purchase_index(x):
    """Describe the first 'purchase' row of a session frame.

    Returns (sku, row position, 1, timestamp) of the first row whose
    product_action is 'purchase'. When the session has no purchase, returns
    ['no_purchase', number of rows, 0, largest timestamp of the session].
    "First" means first in the row order of ``x``.
    """
    actions = x.product_action.values.tolist()
    timestamps = x.server_timestamp_epoch_ms.values
    try:
        position = actions.index('purchase')
    except ValueError:
        # no purchase in this session
        return ['no_purchase', len(x), 0, timestamps.max()]
    return (x.product_sku_hash.values.tolist()[position], position, 1, timestamps.tolist()[position])
# get_purchase_index takes the first 'purchase' row in row order, so every session
# is sorted chronologically first (same ordering as the first-AC features):
# event_df holds the search clicks appended after the browsing events.
purchase_event = event_df.sort_values(['session_id_hash',
                                       'server_timestamp_epoch_ms']).groupby('session_id_hash').progress_apply(get_purchase_index)
# Expand the per-session result into one column per purchase feature
purchase_event = pd.DataFrame(purchase_event.tolist(), index= purchase_event.index).reset_index()
purchase_event.columns = ['session_id_hash', 'first_purchase_id',
                          'first_purchase_position', 'is_purchased',
                          'purchase_timestamp']
# merge purchase features and event table 
event_df = event_df.merge(purchase_event, on='session_id_hash', how='left')
print("There are %s purchase events" %event_df.drop_duplicates('session_id_hash').is_purchased.sum())

100%|██████████| 262395/262395 [00:48<00:00, 5410.34it/s]


There are 46138 purchase events


* Keep only the interactions that happened before the first purchase events 

In [17]:
event_df = event_df[event_df['server_timestamp_epoch_ms'] <= event_df['purchase_timestamp']]

- Drop sessions that lost the AC event after filtering out interactions happening after the first purchase

In [18]:
# Boolean Series indexed by session id: True when the session still has at least
# one 'add' product_action. Computed with a vectorized groupby-any instead of a
# Python-level apply over every session.
sessions_with_add_mask = event_df.product_action.eq('add').groupby(event_df.session_id_hash).any()
print(" \n We keep %s sessions with at least one add-to-cart (AC) event before purchase" %sessions_with_add_mask.sum())
sessions_with_add = sessions_with_add_mask[sessions_with_add_mask == True].index.tolist()
event_df =event_df[event_df.session_id_hash.isin(sessions_with_add)]

100%|██████████| 262395/262395 [00:23<00:00, 10962.63it/s]


 
 We keep 262296 sessions with at least one add-to-cart (AC) event before purchase


- Create features related to the first AC event : 
            'AC_position', 'first_AC_id', 'original_nb_after_add'

In [19]:
def first_ac_product(x):
    """Return [sku of the first 'add' row, number of rows located after it].

    "First" means first in the row order of ``x``. Raises ValueError when the
    session frame has no 'add' product_action.
    """
    actions = x.product_action.values.tolist()
    add_index = actions.index('add')
    first_added = x['product_sku_hash'].values.tolist()[add_index]
    return [first_added, len(x) - (add_index + 1)]
# Sessions are ordered by time so that first_ac_product sees the events chronologically
add_event = (
    event_df
    .sort_values(['session_id_hash', 'server_timestamp_epoch_ms'])
    .groupby('session_id_hash')
    .progress_apply(first_ac_product)
)
# One column per returned value, session id taken back from the index
add_event = pd.DataFrame(
    add_event.tolist(),
    index=add_event.index,
    columns=['first_AC_id', 'original_nb_after_add'],
).reset_index()
add_event.columns = ['session_id_hash', 'first_AC_id', 'original_nb_after_add']
event_df = event_df.merge(add_event, on='session_id_hash', how='left')

100%|██████████| 262296/262296 [00:47<00:00, 5551.90it/s]


- Fill missing actions with 'view'

In [20]:
event_df.product_action = event_df.product_action.fillna('view')

- The final event table is : 

In [21]:
display(event_df)

Unnamed: 0,session_id_hash,event_type,product_action,product_sku_hash,server_timestamp_epoch_ms,hashed_url,is_search,is_test,nb_after_add,product_url_hash,main_category,category_hash,price_bucket,mean_price_hierarchy,mean_price_main,first_purchase_id,first_purchase_position,is_purchased,purchase_timestamp,first_AC_id,original_nb_after_add
0,8256e7aaf7ade5e68787f3118077de97ae8eec18f47f97...,event_product,detail,293231f99098fc9552a19111d8ed7b6188bf0721b1a501...,1551545503415,4e8321996f16f2734a5af4b4c780b4c4669fb316cf24ad...,0,0,,293231f99098fc9552a19111d8ed7b6188bf0721b1a501...,115a6a7017ee55752b8487c77dfde92b0d501d10a2e69c...,115a6a7017ee55752b8487c77dfde92b0d501d10a2e69c...,7.0,6.621057,6.258815,36e8246f800db10613eef89c81513df909ec8171a875f0...,8,1,1551546521904,36e8246f800db10613eef89c81513df909ec8171a875f0...,4
1,8256e7aaf7ade5e68787f3118077de97ae8eec18f47f97...,pageview,view,,1551545510494,744ac6435f90719fdec0276541a5c481138cc1f2f51106...,0,0,,744ac6435f90719fdec0276541a5c481138cc1f2f51106...,,,,,,36e8246f800db10613eef89c81513df909ec8171a875f0...,8,1,1551546521904,36e8246f800db10613eef89c81513df909ec8171a875f0...,4
2,8256e7aaf7ade5e68787f3118077de97ae8eec18f47f97...,pageview,view,,1551545514312,15c3f072fa3c3caa710d9cc114fa017f37deb117ec158c...,0,0,,15c3f072fa3c3caa710d9cc114fa017f37deb117ec158c...,,,,,,36e8246f800db10613eef89c81513df909ec8171a875f0...,8,1,1551546521904,36e8246f800db10613eef89c81513df909ec8171a875f0...,4
3,8256e7aaf7ade5e68787f3118077de97ae8eec18f47f97...,event_product,detail,36e8246f800db10613eef89c81513df909ec8171a875f0...,1551545514312,15c3f072fa3c3caa710d9cc114fa017f37deb117ec158c...,0,0,,36e8246f800db10613eef89c81513df909ec8171a875f0...,06fa312761d4b39e2f649781514ac69a4c1505c221fc46...,06fa312761d4b39e2f649781514ac69a4c1505c221fc46...,8.0,7.859416,5.692630,36e8246f800db10613eef89c81513df909ec8171a875f0...,8,1,1551546521904,36e8246f800db10613eef89c81513df909ec8171a875f0...,4
4,8256e7aaf7ade5e68787f3118077de97ae8eec18f47f97...,event_product,add,36e8246f800db10613eef89c81513df909ec8171a875f0...,1551545686640,83b4fdad686c1be4eba335f70d23ae202b84b6153e109e...,0,0,,36e8246f800db10613eef89c81513df909ec8171a875f0...,06fa312761d4b39e2f649781514ac69a4c1505c221fc46...,06fa312761d4b39e2f649781514ac69a4c1505c221fc46...,8.0,7.859416,5.692630,36e8246f800db10613eef89c81513df909ec8171a875f0...,8,1,1551546521904,36e8246f800db10613eef89c81513df909ec8171a875f0...,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5830489,ffe097c27c9dc7714be4170e5e4ed614b5ec7d5e2d2256...,search,click,ad3e11b11865f20a9bcc672698e01b3e961d4776229503...,1556580676511,,1,1,8,ad3e11b11865f20a9bcc672698e01b3e961d4776229503...,115a6a7017ee55752b8487c77dfde92b0d501d10a2e69c...,115a6a7017ee55752b8487c77dfde92b0d501d10a2e69c...,9.0,6.621057,6.258815,no_purchase,17,0,1556582292621,ad3e11b11865f20a9bcc672698e01b3e961d4776229503...,8
5830490,ffecd521a45cf3a43952a5f4cc2f5d2f1a95b3e30befcc...,search,click,237bd4d65e9a9ae281b4994767e6897fd6e8a14e9e6184...,1557449040003,,1,1,2,237bd4d65e9a9ae281b4994767e6897fd6e8a14e9e6184...,06fa312761d4b39e2f649781514ac69a4c1505c221fc46...,06fa312761d4b39e2f649781514ac69a4c1505c221fc46...,3.0,3.247863,5.692630,no_purchase,12,0,1557449386923,e2d05c62db7057b26232d57e49645d53f397a031ee09ff...,2
5830491,fff39b2d9e9595285f3fe5578c863881b0bdfb78b8e5f6...,search,click,046cf76660fe7d9f2a47272112c42d1adc642e76a60dfd...,1557697901706,,1,1,0,046cf76660fe7d9f2a47272112c42d1adc642e76a60dfd...,0665a81d19c89281cc00e7f7d779ded2ed42c933838602...,0665a81d19c89281cc00e7f7d779ded2ed42c933838602...,4.0,5.009017,5.009099,no_purchase,8,0,1557697912685,046cf76660fe7d9f2a47272112c42d1adc642e76a60dfd...,0
5830492,fff637fac2778b4cc545d1ab7af42596f85bf652c49ce9...,search,click,4d4b50e8de27416fb2cdce41fdd7be611fd7b0fcf2e333...,1557403643669,,1,1,0,4d4b50e8de27416fb2cdce41fdd7be611fd7b0fcf2e333...,115a6a7017ee55752b8487c77dfde92b0d501d10a2e69c...,115a6a7017ee55752b8487c77dfde92b0d501d10a2e69c...,4.0,6.621057,6.258815,no_purchase,6,0,1557403716699,b082c9fb875817897d7250a95810ee1519157968b269d7...,0


- Save to parquet file with 10 partitions 

In [22]:
# Assign each row to one of 10 partitions. A seeded generator makes the
# row -> partition assignment (and so the written files) reproducible across runs.
event_df['parquet_split'] =  np.random.RandomState(42).randint(0, 10, size=event_df.shape[0])
event_df.to_parquet(os.path.join(OUTPUT_DIR, "event_table"), partition_cols=['parquet_split'])

In [23]:
del event_df
gc.collect()

86

<h1> <center> <a id='row_workflow'> Define the preprocessed row interactions table </a> </center></h1>

In [4]:
files = glob.glob(OUTPUT_DIR + '/event_table/parquet_split*/*.parquet')

-  Workflow: Fill missing values, encode categorical variables and normalize numericals

In [5]:
# Load the partitioned event table written by Section 1
df_event = nvt.Dataset(files, part_size="1GB") 

# Convert the epoch-millisecond timestamp to a datetime column named 'timestamp'
to_datetime = ["server_timestamp_epoch_ms"] >> nvt.ops.LambdaOp(lambda col: cudf.to_datetime(col, unit='ms')) >> nvt.ops.Rename( f = lambda x: 'timestamp')

# Fill missing product ids with the string 'missing' before Categorify, to keep id '0' for padding
missing_ids = ['product_sku_hash','hashed_url'] >> nvt.ops.FillMissing(fill_val='missing')
cat_product_ids = missing_ids >> nvt.ops.Categorify()

# Jointly encode the product/url id, the first purchased id and the first add-to-cart id
# (columns passed as one nested list to a single Categorify)
cat_joint_product_ids = [['product_url_hash', 'first_purchase_id', 'first_AC_id']] >> nvt.ops.Categorify()

# Encode the remaining categorical features, each column on its own
categ_feats = ['session_id_hash', 'product_action',  'event_type', 'price_bucket', 'main_category', 'category_hash']
cat_feats = categ_feats >> nvt.ops.Categorify()

# Fill missing values with the median, then normalize the numerical features
cont_feats = ['mean_price_hierarchy', 'mean_price_main'] >> nvt.ops.FillMedian()
continuous_feats = cont_feats >> nvt.ops.Normalize()

# Keep original value of the remaining features 
other_feats =   ['is_purchased', 'purchase_timestamp', 'original_nb_after_add']

# Define and fit the workflow ('nb_after_add', 'is_search' and 'is_test' are passed through as-is)
workflow = nvt.Workflow(['nb_after_add', 'is_search', 'is_test'] + cat_feats  + cat_product_ids + cat_joint_product_ids + \
                        other_feats + to_datetime  + continuous_feats)
workflow.fit(df_event)

# Transform the event table and materialize the result as a single frame
new_gdf = workflow.transform(df_event).to_ddf().compute()

# Include the item first time seen feature (for recency calculation) : Using product_url_hash column 
# 'itemid_ts_first' = earliest timestamp at which each product_url_hash appears in the table
items_first_ts_df = new_gdf.groupby('product_url_hash').agg({'timestamp': 'min'}).reset_index().rename(columns={'timestamp': 'itemid_ts_first'})
interactions_merged_df = new_gdf.merge(items_first_ts_df, on=['product_url_hash'], how='left')

In [8]:
use_cols = ['session_id_hash', 'timestamp',  'event_type', 'product_action',
            
            'product_sku_hash', 'hashed_url', 'product_url_hash',
            
            'main_category', 'category_hash', 'price_bucket', 'mean_price_hierarchy', 'mean_price_main', 
            
            'itemid_ts_first',  
            
            'first_AC_id', 
            
            'first_purchase_id', 'is_purchased', 'purchase_timestamp',
            
            'nb_after_add', 'original_nb_after_add', 'is_search', 'is_test'] 

interactions_merged_df[use_cols].head(3)

Unnamed: 0,session_id_hash,timestamp,event_type,product_action,product_sku_hash,hashed_url,product_url_hash,main_category,category_hash,price_bucket,mean_price_hierarchy,mean_price_main,itemid_ts_first,first_AC_id,first_purchase_id,is_purchased,purchase_timestamp,nb_after_add,original_nb_after_add,is_search,is_test
0,64241,2019-02-26 23:54:37.013,1,3,46528,172162,146875,1,4,9,2.292797,-2.520677,2019-01-15 20:02:55.620,146875,176371,0,1551226394839,,6,0,0
1,64241,2019-02-27 00:07:05.406,2,6,55897,172162,170452,0,0,0,0.010558,0.023351,2019-01-15 19:24:01.522,146875,176371,0,1551226394839,,6,0,0
2,64241,2019-02-27 00:07:05.406,1,3,46528,172162,146875,1,4,9,2.292797,-2.520677,2019-01-15 20:02:55.620,146875,176371,0,1551226394839,,6,0,0


- Save resulting table and nvtabular workflow 

In [9]:
# save the workflow : 
workflow.save(os.path.join(OUTPUT_DIR, "categorify_workflow"))

# Save the parquet table 
interactions_merged_df[use_cols].to_parquet(os.path.join(OUTPUT_DIR , 'row_interactions_task2_preproc_v2.parquet'))

<h1> <center> <a id='session_workflow'>Preprocessing of session table  </a> </center></h1>


In [46]:
cont_feats = ['mean_price_hierarchy', 'mean_price_main'] 
purchase_feat = ['first_AC_id', 'first_purchase_id', 'is_purchased', 'purchase_timestamp']
data_info = ['is_search', 'is_test'] 

In [47]:
interactions_merged_df = pd.read_parquet(os.path.join(OUTPUT_DIR , 'row_interactions_task2_preproc_v2.parquet'))

In [48]:
interactions_merged_df.head(3)

Unnamed: 0,session_id_hash,timestamp,event_type,product_action,product_sku_hash,hashed_url,product_url_hash,main_category,category_hash,price_bucket,mean_price_hierarchy,mean_price_main,itemid_ts_first,first_AC_id,first_purchase_id,is_purchased,purchase_timestamp,nb_after_add,original_nb_after_add,is_search,is_test
0,64241,2019-02-26 23:54:37.013,1,3,46528,172162,146875,1,4,9,2.292797,-2.520677,2019-01-15 20:02:55.620,146875,176371,0,1551226394839,,6,0,0
1,64241,2019-02-27 00:07:05.406,2,6,55897,172162,170452,0,0,0,0.010558,0.023351,2019-01-15 19:24:01.522,146875,176371,0,1551226394839,,6,0,0
2,64241,2019-02-27 00:07:05.406,1,3,46528,172162,146875,1,4,9,2.292797,-2.520677,2019-01-15 20:02:55.620,146875,176371,0,1551226394839,,6,0,0


In [49]:
# convert booleans to int 
interactions_merged_df[data_info ] = interactions_merged_df[ data_info ].astype('int32')

In [50]:
interactions_merged_df[interactions_merged_df.is_test==1].nb_after_add.isnull().sum()

0

In [51]:
# create time features
sessionTime = ['timestamp']

# Hour of the interaction -> 'timestamp_hour'
sessionTime_hour = (
    sessionTime >> 
    nvt.ops.LambdaOp(lambda col: col.dt.hour) >> 
    nvt.ops.Rename(postfix = '_hour')
)
# Weekday of the interaction -> 'timestamp_wd'
sessionTime_weekday = (
    sessionTime >> 
    nvt.ops.LambdaOp(lambda col: col.dt.weekday) >> 
    nvt.ops.Rename(postfix = '_wd')
)
# Day of the month of the interaction -> 'timestamp_day'
sessionTime_day = (
    sessionTime >> 
    nvt.ops.LambdaOp(lambda col: col.dt.day) >> 
    nvt.ops.Rename(postfix="_day")
)

# Integer timestamp 'ts', used to sort the interactions of a session.
# The integer view of the datetime (presumably nanoseconds — the displayed 'ts'
# values are 13-digit epoch milliseconds) is divided by 1e6 with an integer
# division: a float division is not exact for int64 values above 2**53 and can
# truncate to the previous unit.
sessionTime_timestamp = (
    sessionTime >> 
    nvt.ops.LambdaOp(lambda col: col.astype(int) // 1000000) >> 
    nvt.ops.Rename(f = lambda col: "ts")
)

# compute cycled features 
def get_cycled_feature_value_sin(col, max_value):
    """Sine encoding of a cyclical feature.

    ``col`` is shifted by 1e-6, scaled by ``max_value`` (the period of the cycle)
    and mapped to ``sin(2*pi*scaled)``.
    """
    fraction = (col + 0.000001) / max_value
    return np.sin(2*np.pi*fraction)

def get_cycled_feature_value_cos(col, max_value):
    """Cosine encoding of a cyclical feature.

    ``col`` is shifted by 1e-6, scaled by ``max_value`` (the period of the cycle)
    and mapped to ``cos(2*pi*scaled)``.
    """
    fraction = (col + 0.000001) / max_value
    return np.cos(2*np.pi*fraction)
# Cyclical encodings: the hour uses a period of 24; the weekday is shifted by +1
# before being encoded with a period of 7. Outputs are named after their input
# column with a '_sin' / '_cos' postfix.
hour_sin = sessionTime_hour >> (lambda col: get_cycled_feature_value_sin(col, 24)) >> nvt.ops.Rename(postfix = '_sin')
hour_cos = sessionTime_hour >> (lambda col: get_cycled_feature_value_cos(col, 24)) >> nvt.ops.Rename(postfix = '_cos')
weekday_sin = sessionTime_weekday >> (lambda col: get_cycled_feature_value_sin(col+1, 7)) >> nvt.ops.Rename(postfix = '_sin')
weekday_cos= sessionTime_weekday >> (lambda col: get_cycled_feature_value_cos(col+1, 7)) >> nvt.ops.Rename(postfix = '_cos')
# All four cyclical columns in one column group
cycled_features = hour_sin + hour_cos + weekday_sin + weekday_cos


# calculate item recency 
from nvtabular.ops import Operator
class ItemRecency(Operator):
    """NVTabular operator computing the age of an item, in days, at interaction time.

    For every input column ``c`` a column ``c + "_age_days"`` is added to the frame:
    the number of whole days between ``c`` and 'itemid_ts_first', with negative
    values replaced by 0.
    """

    def transform(self, columns, gdf):
        for name in columns:
            elapsed_days = (gdf[name] - gdf['itemid_ts_first']).dt.days
            # multiplying by the boolean mask zeroes the negative ages
            gdf[name + "_age_days"] = elapsed_days * (elapsed_days >=0)
        return gdf

    def output_column_names(self, columns):
        # one '<column>_age_days' output per input column
        return [name + "_age_days" for name in columns]

    def dependencies(self):
        # extra input column read by transform
        return ["itemid_ts_first"]
# Item age in days at interaction time -> 'timestamp_age_days'
recency_features = ["timestamp"] >> ItemRecency() 
# Log-transformed then normalized item age -> 'timestamp_age_days_norm'
recency_features_norm = recency_features >> nvt.ops.LogOp() >> nvt.ops.Normalize() >> nvt.ops.Rename(postfix = '_norm')

# Every time-derived column, gathered in one column group
time_features = (
    sessionTime_timestamp +
    sessionTime + 
    sessionTime_hour +
    sessionTime_day + 
    sessionTime_weekday +
    recency_features +
    recency_features_norm + 
    cycled_features)

In [52]:
time_features.columns

['ts',
 'timestamp',
 'timestamp_hour',
 'timestamp_day',
 'timestamp_wd',
 'timestamp_age_days',
 'timestamp_age_days_norm',
 'timestamp_hour_sin',
 'timestamp_hour_cos',
 'timestamp_wd_sin',
 'timestamp_wd_cos']

- Grouping interactions into sessions

In [53]:
# Define Groupby Workflow: search columns are not used
# N.B: Add the op ListSlice when upgrading nvt 0.5.1 to 0.6 
# Keep only the rows whose encoded product_sku_hash is different from 0.
# NOTE(review): missing skus were filled with 'missing' before Categorify in the row
# workflow, so they presumably do not carry the id 0 and may not be removed here — TODO confirm.
filter_nan_products = (interactions_merged_df.columns >> nvt.ops.Filter(f=lambda df: df['product_sku_hash'] != 0))

# Per session: list and count of product_sku_hash over the filtered rows, sorted by 'ts'.
# The raw 'timestamp' column is taken out and replaced by the time_features columns.
groupby_only_product = filter_nan_products - ['timestamp']  + time_features  >> nvt.ops.Groupby(
    groupby_cols=["session_id_hash"], 
    sort_cols=["ts"],
    aggs={
       "product_sku_hash": ["list", "count"], 
    }
)
    
# Per session: list and count of product_url_hash over all rows, sorted by 'ts'
groupby_product_url = ['session_id_hash', 'product_url_hash']  + time_features >> nvt.ops.Groupby(
    groupby_cols=["session_id_hash"], 
    sort_cols=["ts"],
    aggs={
       "product_url_hash": ["list", "count"]
    }
)


# Per session aggregation of every feature other than the product id sequences.
# Interactions are sorted by 'ts' inside each session; output columns are named
# '<column>-<aggregation>' (name_sep="-").
# The aggs only list columns that are actually produced upstream: the entries
# 'timestamp_month' and the '*_sin_norm' / '*_cos_norm' variants referred to
# columns that are never created and were therefore never part of the output.
groupby_other_features = ['session_id_hash', 'product_action', 'event_type', 'first_AC_id', 'original_nb_after_add',
                          'price_bucket', 'category_hash', 'main_category', 'nb_after_add' ] + data_info  + purchase_feat + cont_feats + time_features >> \
    nvt.ops.Groupby(
    groupby_cols=["session_id_hash"], 
    sort_cols=["ts"],
    aggs={
        # sequences of interaction-level features
        "product_action": ["list"],     
        "event_type": ["list"],    
        "price_bucket": ["list"],
        'main_category': ["list"],
        "category_hash": ["list"],
        'mean_price_hierarchy':["list"],
        'mean_price_main':["list"],
        "ts": ["list", "first", "last"],

        # session-level values: taken from the last interaction of the session
        "is_test": ["last"],
        "is_search": ["last"],
        'nb_after_add': ["last"],
        "original_nb_after_add": ["last"],
        'first_AC_id': ["last"],
        'is_purchased': ["last"],
        'purchase_timestamp':["last"],

        # time features
        "timestamp": ["first"],
        'timestamp_day': ["list"],
        'timestamp_hour': ["list"],
        'timestamp_wd': ["list"],
        'timestamp_age_days': ["list"],
        'timestamp_age_days_norm': ["list"],
        'timestamp_hour_sin': ["list"],
        'timestamp_hour_cos': ["list"],
        'timestamp_wd_sin': ["list"],
        'timestamp_wd_cos': ["list"],
        },
    name_sep="-")

- workflow 1 : group the features other than the user interaction sequences 

In [54]:
# Select 'timestamp-first' alone by removing every other output column of the groupby
remaining_columns = [x for x in groupby_other_features.columns if x!= 'timestamp-first']
# day_index = 1 + number of whole days between the session start and the latest session start.
# NOTE(review): col.max() is evaluated on the frame handed to the LambdaOp; if the
# dataset is processed in several partitions the reference date would presumably
# differ from one partition to the next — TODO confirm.
day_index = ((groupby_other_features - remaining_columns)  >> 
    nvt.ops.LambdaOp(lambda col: (col.max() - col).dt.days + 1) >> 
    nvt.ops.Rename(f = lambda col: "day_index")
)              

In [55]:
workflow = nvt.Workflow(groupby_other_features + day_index)
dataset = nvt.Dataset(interactions_merged_df, cpu=False)
workflow.fit(dataset)
new_gdf_other = workflow.transform(dataset).to_ddf().compute()
len(new_gdf_other)

262296

In [56]:
new_gdf_other.head(2)

Unnamed: 0,timestamp_hour-list,first_AC_id-last,ts-last,timestamp_age_days-list,timestamp_hour_cos-list,timestamp_wd_cos-list,ts-first,timestamp-first,timestamp_day-list,session_id_hash,category_hash-list,main_category-list,is_test-last,mean_price_main-list,is_purchased-last,event_type-list,timestamp_hour_sin-list,timestamp_wd_sin-list,purchase_timestamp-last,is_search-last,price_bucket-list,timestamp_wd-list,original_nb_after_add-last,product_action-list,ts-list,mean_price_hierarchy-list,timestamp_age_days_norm-list,nb_after_add-last,day_index
0,"[20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 2...",143199,1552426869735,"[52, 52, 52, 52, 52, 52, 56, 56, 56, 56, 56, 5...","[0.5000007, 0.5000007, 0.5000007, 0.5000007, 0...","[-0.22252177, -0.22252177, -0.22252177, -0.222...",1552423391039,2019-03-12 20:43:11.039,"[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1...",1,"[86, 0, 86, 0, 86, 86, 0, 0, 0, 0, 0, 0, 0, 0,...","[2, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[0.02335137, 0.02335137, 0.02335137, 0.0233513...",0,"[1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[-0.866025, -0.866025, -0.866025, -0.866025, -...","[0.9749277, 0.9749277, 0.9749277, 0.9749277, 0...",1552426869735,0,"[10, 0, 10, 0, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",13,"[3, 0, 3, 6, 1, 1, 6, 0, 0, 6, 6, 0, 0, 6, 0, ...","[1552423391039, 1552423391039, 1552423391039, ...","[2.6224637, 0.010557701, 2.6224637, 0.01055770...","[0.66833013, 0.66833013, 0.66833013, 0.6683301...",,63
1,"[16, 16, 16, 16, 16, 16]",77197,1557247744887,"[112, 112, 111, 111, 111, 111]","[-0.4999995, -0.4999995, -0.4999995, -0.499999...","[-0.22252177, -0.22252177, -0.22252177, -0.222...",1557247655055,2019-05-07 16:47:35.055,"[7, 7, 7, 7, 7, 7]",2,"[60, 60, 60, 60, 60, 60]","[2, 2, 2, 2, 2, 2]",1,"[0.02335137, 0.02335137, 0.02335137, 0.0233513...",0,"[1, 1, 1, 1, 1, 1]","[-0.8660257, -0.8660257, -0.8660257, -0.866025...","[0.9749277, 0.9749277, 0.9749277, 0.9749277, 0...",1557247744887,0,"[8, 8, 8, 8, 8, 8]","[1, 1, 1, 1, 1, 1]",0,"[3, 3, 3, 3, 1, 1]","[1557247655055, 1557247655055, 1557247660313, ...","[1.1459051, 1.1459051, 1.1459051, 1.1459051, 1...","[1.2716198, 1.2716198, 1.2645365, 1.2645365, 1...",0.0,7


- workflow 2 : create the sequence of product interactions and pageviews

In [57]:
# Workflow 2: per-session sequence built from the product_url_hash column.
workflow = nvt.Workflow(groupby_product_url)
dataset = nvt.Dataset(interactions_merged_df, cpu=False)
workflow.fit(dataset)
# Materialise the transformed dataset as a single in-memory frame.
new_gdf_sku_url = (
    workflow.transform(dataset)
    .to_ddf()
    .compute()
)
# Number of rows in the result.
new_gdf_sku_url.shape[0]

262296

In [58]:
# Preview the first rows produced by workflow 2 (sequence, its length, session id).
new_gdf_sku_url.head()

Unnamed: 0,product_url_hash_list,product_url_hash_count,session_id_hash
0,"[143199, 7298, 143199, 7298, 143199, 143199, 7...",32,1
1,"[130669, 130669, 77197, 77197, 77197, 77197]",6,2
2,"[53641, 53641, 91938, 91938, 165669, 165669, 1...",136,3
3,"[170620, 170620, 170620, 170620, 107009, 10700...",22,4
4,"[46772, 46772, 121609, 121609, 123318, 63033, ...",244,5


- workflow 3 : create sequence with only product interactions

In [59]:
# Workflow 3: per-session sequence restricted to product interactions.
workflow = nvt.Workflow(groupby_only_product)
dataset = nvt.Dataset(interactions_merged_df, cpu=False)
workflow.fit(dataset)
# Materialise the transformed dataset as a single in-memory frame.
new_gdf_prod_only = (
    workflow.transform(dataset)
    .to_ddf()
    .compute()
)
# Number of rows in the result.
new_gdf_prod_only.shape[0]

262296

- Merge the three resulting frames

In [60]:
# Join the three per-session frames on the encoded session id: the url
# sequences and the other aggregated features are inner-joined, then the
# product-only sequences are attached with a left join.
sessions_gdf = (
    new_gdf_sku_url
    .merge(new_gdf_other, on='session_id_hash', how='inner')
    .merge(new_gdf_prod_only, on='session_id_hash', how='left')
)

In [61]:
# Sanity check: (rows, columns) of the merged session table.
sessions_gdf.shape

(262296, 33)

In [62]:
# List the columns available after the merge.
sessions_gdf.columns

Index(['product_url_hash_list', 'product_url_hash_count', 'session_id_hash',
       'timestamp_hour-list', 'first_AC_id-last', 'ts-last',
       'timestamp_age_days-list', 'timestamp_hour_cos-list',
       'timestamp_wd_cos-list', 'ts-first', 'timestamp-first',
       'timestamp_day-list', 'category_hash-list', 'main_category-list',
       'is_test-last', 'mean_price_main-list', 'is_purchased-last',
       'event_type-list', 'timestamp_hour_sin-list', 'timestamp_wd_sin-list',
       'purchase_timestamp-last', 'is_search-last', 'price_bucket-list',
       'timestamp_wd-list', 'original_nb_after_add-last',
       'product_action-list', 'ts-list', 'mean_price_hierarchy-list',
       'timestamp_age_days_norm-list', 'nb_after_add-last', 'day_index',
       'product_sku_hash_count', 'product_sku_hash_list'],
      dtype='object')

- Display resulting session table 

In [63]:
# Columns kept in the final session table, in output order and grouped by role.
SELECTED_COLS = [
    # session key, recency index and the interaction sequence
    'session_id_hash', 'day_index', 'product_url_hash_list',
    'event_type-list', 'product_action-list',
    # category and price features
    'category_hash-list', 'main_category-list',
    'price_bucket-list', 'mean_price_hierarchy-list', 'mean_price_main-list',
    # product-only sequence and sequence lengths
    'product_sku_hash_count', 'product_sku_hash_list',
    'product_url_hash_count',
    # raw timestamps and temporal encodings
    'ts-first', 'ts-last', 'ts-list',
    'timestamp_hour_cos-list', 'timestamp_hour_sin-list',
    'timestamp_wd_sin-list', 'timestamp_wd_cos-list',
    'timestamp_age_days-list', 'timestamp_age_days_norm-list',
    # session-level fields
    'first_AC_id-last', 'is_purchased-last', 'purchase_timestamp-last', 'nb_after_add-last',
    'is_search-last', 'is_test-last',
]

# Keep only the selected columns, then show the last two sessions.
sessions_gdf = sessions_gdf[SELECTED_COLS]
sessions_gdf.tail(2)

Unnamed: 0,session_id_hash,day_index,product_url_hash_list,event_type-list,product_action-list,category_hash-list,main_category-list,price_bucket-list,mean_price_hierarchy-list,mean_price_main-list,product_sku_hash_count,product_sku_hash_list,product_url_hash_count,ts-first,ts-last,ts-list,timestamp_hour_cos-list,timestamp_hour_sin-list,timestamp_wd_sin-list,timestamp_wd_cos-list,timestamp_age_days-list,timestamp_age_days_norm-list,first_AC_id-last,is_purchased-last,purchase_timestamp-last,nb_after_add-last,is_search-last,is_test-last
262294,250375,64,"[46772, 46772, 45686, 45686, 122704, 122704, 4...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[0, 6, 0, 6, 6, 0, 6, 0, 6, 0, 0, 6, 0, 6, 6, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.010557701, 0.010557701, 0.010557701, 0.0105...","[0.02335137, 0.02335137, 0.02335137, 0.0233513...",24,"[55897, 55897, 55897, 55897, 55897, 55897, 558...",24,1552353672164,1552354551465,"[1552353672164, 1552353672164, 1552353688549, ...","[0.96592575, 0.96592575, 0.96592575, 0.9659257...","[0.25881928, 0.25881928, 0.25881928, 0.2588192...","[0.9749277, 0.9749277, 0.9749277, 0.9749277, 0...","[-0.22252177, -0.22252177, -0.22252177, -0.222...","[55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 5...","[0.7122043, 0.7122043, 0.7122043, 0.7122043, 0...",111417,0,1552354551465,,0,0
262295,250376,103,"[155647, 155647, 170790, 170790, 170790, 170790]","[1, 1, 1, 1, 1, 1]","[3, 3, 3, 3, 1, 1]","[46, 46, 46, 46, 46, 46]","[2, 2, 2, 2, 2, 2]","[7, 7, 8, 8, 8, 8]","[0.72179425, 0.72179425, 0.72179425, 0.7217942...","[0.02335137, 0.02335137, 0.02335137, 0.0233513...",6,"[49338, 49338, 54109, 54109, 54109, 54109]",6,1548970153891,1548970245921,"[1548970153891, 1548970153891, 1548970168089, ...","[0.707107, 0.707107, 0.707107, 0.707107, 0.707...","[-0.70710653, -0.70710653, -0.70710653, -0.707...","[-0.43388462, -0.43388462, -0.43388462, -0.433...","[-0.90096843, -0.90096843, -0.90096843, -0.900...","[16, 16, 16, 16, 16, 16]","[-0.23774733, -0.23774733, -0.23774733, -0.237...",170790,0,1548970245922,,0,0


- Un-hash session id 

In [64]:
# Load the stored unique session ids. After reset_index() the first column is
# the former index (presumably the encoded id -- confirm) and the second the
# stored original value.
# The path is built with os.path.join, like the other cells of this notebook,
# instead of string concatenation (OUTPUT_DIR already ends with a slash).
session_map = cudf.read_parquet(
    os.path.join(OUTPUT_DIR, 'categorify_workflow/categories/unique.session_id_hash.parquet')
).reset_index()
session_map.columns = ['session_id_hash', 'original_session_id_hash']
# Attach the original session id to every session row.
sessions_gdf = sessions_gdf.merge(session_map, on=['session_id_hash'], how='left')

- Get index of AC product in the session 

In [65]:
# Convert the session table to a pandas DataFrame; the following cells use
# row-wise pandas applies (progress_apply).
sessions_gdf = sessions_gdf.to_pandas()

In [66]:
# Position of the first occurrence of the add-to-cart product id
# ('first_AC_id-last') inside the session's 'product_url_hash_list'.
# list.index raises ValueError when the id is not present in the sequence.
sessions_gdf['first_add_index'] = sessions_gdf.progress_apply(
    lambda row: row['product_url_hash_list'].tolist().index(row['first_AC_id-last']),
    axis=1,
)

100%|██████████| 262296/262296 [00:06<00:00, 39551.86it/s]


- Compute the actual number of interactions (repetitions included) that follow the first occurrence of the AC product in the session

In [67]:
def get_nb_after_add(x):
    """Count the items that follow position ``first_add_index`` in the session.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``'first_add_index'`` (an integer position) and
        ``'product_url_hash_list'`` (a sized sequence).

    Returns
    -------
    int
        ``len(product_url_hash_list) - first_add_index - 1``.
    """
    first_add_position = x['first_add_index']
    items_from_add = len(x['product_url_hash_list']) - first_add_position
    # Exclude the item at the add position itself.
    return items_from_add - 1
# Row-wise apply (with a tqdm progress bar) of get_nb_after_add defined just above.
sessions_gdf['original_nb_after_add'] = sessions_gdf.progress_apply(get_nb_after_add, axis=1)

100%|██████████| 262296/262296 [00:04<00:00, 60838.29it/s]


- Compute an artificial `nb_after_add` for train and validation sessions: select the value in (0, 2, 4, 6, 8, 10) that is closest to the actual number of events after the first AC

In [68]:
# Allowed values for the artificial nb_after_add.
NB_AFTER_ADD = [0, 2, 4, 6, 8, 10]
# Only non-test sessions are overwritten; test sessions keep their given value.
train_rows = sessions_gdf['is_test-last'] == 0
# Snap each actual count to the closest allowed value. On a tie min() keeps
# the first (smaller) candidate; counts above 10 map to 10.
sessions_gdf.loc[train_rows, 'nb_after_add-last'] = (
    sessions_gdf.loc[train_rows, 'original_nb_after_add']
    .progress_apply(
        lambda actual: min(NB_AFTER_ADD, key=lambda candidate: abs(candidate - actual))
    )
)

100%|██████████| 214585/214585 [00:00<00:00, 304277.85it/s]


- Create cross-validation folds : 
     - Define random 5 folds column 
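     - Reserve the most recent train sessions for validation (`day_index <= 50`, i.e. sessions that started within the 50 days preceding the latest session)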

In [69]:
from sklearn.model_selection import GroupKFold  # NOTE(review): not used in this cell

# Fixed seed so that "Restart & Run All" reproduces the same fold assignment.
FOLD_SEED = 42
N_FOLDS = 5
# Non-test sessions whose day_index is at most this value are flagged as validation.
VALID_MAX_DAY_INDEX = 50

# Dedicated generator: the global NumPy random state is left untouched.
fold_rng = np.random.RandomState(FOLD_SEED)
# Random fold id in 1..N_FOLDS for every session (randint's upper bound is exclusive).
sessions_gdf['fold'] = fold_rng.randint(1, N_FOLDS + 1, sessions_gdf.shape[0])
sessions_gdf['is_valid'] = 0
sessions_gdf.loc[
    (sessions_gdf['is_test-last'] == 0) & (sessions_gdf['day_index'] <= VALID_MAX_DAY_INDEX),
    'is_valid',
] = 1

### Save session table 

- Save the whole session table 

In [70]:
# Persist the full session table; it is read back in the "Load data" cell of
# the duplicated-sessions section below.
sessions_gdf.to_parquet(os.path.join(OUTPUT_DIR, 'session_interactions_task2_preproc2.parquet'))

-  Save unique product sku mapping from updated product_url_hash encoded column

In [71]:
# Encoded ids that appear in events with event_type == 2 (presumably the
# pageview urls -- confirm against the event_type encoding).
urls_ids = interactions_merged_df[interactions_merged_df.event_type==2]['product_url_hash'].unique()
# Paths are built with os.path.join, like the cell that reloads this mapping,
# instead of string concatenation (OUTPUT_DIR already ends with a slash).
mapping = pd.read_parquet(os.path.join(
    OUTPUT_DIR,
    'categorify_workflow/categories/unique.product_url_hash_first_purchase_id_first_AC_id.parquet'))
# Drop every encoded id that belongs to urls_ids; what is left is kept as the product skus.
mask = mapping.reset_index()['index'].isin(urls_ids)
mapping_prod = mapping[~mask].reset_index()
mapping_prod.columns =  ['encoded_product_sku', 'original_product_sku']
print("There are %s unique product" %mapping_prod.shape[0])
mapping_prod.to_parquet(os.path.join(OUTPUT_DIR, 'mapping_product_sku_without_urls_task2_v2.parquet'))

There are 55898 unique product


- Save the product embedding matrices (description and image vectors), indexed by encoded product id

In [72]:
# Read the product content table from the configured data folder (DATA_FOLDER
# is set in the configuration cell) instead of a hard-coded absolute path.
product_info = pd.read_csv(
    os.path.join(DATA_FOLDER, 'sku_to_content.csv'),
    usecols=['product_sku_hash', 'description_vector', 'image_vector'],
)
# convert strings to list object 
import ast
def convert_str_to_list(x):
    """Parse the string form of a Python literal, passing null values through.

    Parameters
    ----------
    x : str or null
        Text such as ``"[0.1, 0.2]"``, or a missing value (None / NaN).

    Returns
    -------
    object
        ``x`` itself when ``pd.isnull(x)`` is true, otherwise the result of
        ``ast.literal_eval(x)``.
    """
    return x if pd.isnull(x) else ast.literal_eval(x)
for col in ['description_vector', 'image_vector']:
    product_info[col] = product_info[col].progress_apply(convert_str_to_list)
# Rename by name rather than by position: read_csv returns the `usecols`
# columns in file order, so a positional rename could silently mislabel them.
product_info = product_info.rename(columns={'product_sku_hash': 'original_product_sku'})

### Merge product embeddings and mapping_prod
embeddings_table = mapping_prod.merge(product_info, on=['original_product_sku'], how='left')

# Length of each embedding vector, as assumed by the reshape calls below.
EMBEDDING_DIM = 50

# Fill missing embeddings with vector of zeros
for col in ['description_vector', 'image_vector']:
    missing = embeddings_table[col].isnull()
    embeddings_table.loc[missing, col] = pd.Series([np.zeros(EMBEDDING_DIM)] * missing.sum()).values

# Create Numpy matrix with the image vectors of the products
image_matrix = np.concatenate(embeddings_table.image_vector.values).reshape(-1, EMBEDDING_DIM)
# Create Numpy matrix with the description vectors of the products
desc_matrix = np.concatenate(embeddings_table.description_vector.values).reshape(-1, EMBEDDING_DIM)
# Define a dictionary to map the encoded product_sku to the position in the embedding matrices
mapping_id_sku_emb_position = dict(zip(embeddings_table.encoded_product_sku, embeddings_table.index))
# Saving the objects (pickle is already imported in the notebook's first cell):
with open(os.path.join(OUTPUT_DIR, 'embedding_data_v2.pkl'), 'wb') as f:
    pickle.dump([desc_matrix, image_matrix, mapping_id_sku_emb_position], f)

100%|██████████| 66386/66386 [00:06<00:00, 11021.03it/s]
100%|██████████| 66386/66386 [00:05<00:00, 12523.92it/s]


<h2> <center> <a id='session_duplicate'>  Duplicated train sessions with different split points after the AC event </a></center></h2>

- Load data 

In [79]:
# load session browsing data 
data = pd.read_parquet(os.path.join(OUTPUT_DIR, "session_interactions_task2_preproc2.parquet"))
#load product embeddings
# The file is opened in a `with` block so the handle is closed after loading.
# pickle.load can execute arbitrary code: only load the file written by this notebook.
with open(os.path.join(OUTPUT_DIR, "embedding_data_v2.pkl"), "rb") as f:
    desc_matrix, image_matrix, mapping_id_sku_emb_position = pickle.load(f)
#load encoded product-ids
mapping = pd.read_parquet(os.path.join(OUTPUT_DIR,
                                       "categorify_workflow/categories/unique.product_url_hash_first_purchase_id_first_AC_id.parquet"))
#load session search data
search_session = pd.read_parquet(os.path.join(OUTPUT_DIR, "session_search.parquet"))
# Positional rename -- assumes the stored frame has exactly these 8 columns in this order.
search_session.columns = ['original_session_id_hash', 'flat_query_vector', 'flat_product_skus_hash',
       'flat_clicked_skus_hash', 'impressions_size', 'clicks_size',
       'nb_queries', 'clicked-flag']

- Repeat rows of sessions with different number of actions after the AC event : Only for train and validation datasets

In [80]:
def truncate_at_different_position(x, feature_list):
    """Cut the list features of a session at several points after the first add.

    The add position is the index of the first ``1`` in ``x['product_action-list']``
    (the add-to-cart event, per the surrounding notebook). One cut is produced
    for every even offset ``0, 2, ..., int(nb_after_add)``; each cut keeps the
    elements up to and including position ``add_index + offset``.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``'product_action-list'`` (object with ``.tolist()``),
        ``'nb_after_add-last'`` (number) and every column in ``feature_list``
        (sliceable sequences).
    feature_list : list of str
        Names of the list columns to truncate.

    Returns
    -------
    list of list
        One entry per cut; each entry holds the truncated sequences in
        ``feature_list`` order.

    Raises
    ------
    ValueError
        If no action equals 1, or if ``nb_after_add-last`` is NaN.
    """
    add_index = x['product_action-list'].tolist().index(1)
    last_offset = int(x['nb_after_add-last'])
    # range(0, last_offset + 2, 2) yields just [0] when last_offset is 0, so
    # the single-cut case needs no separate branch.
    return [
        [x[col][:add_index + offset + 1] for col in feature_list]
        for offset in range(0, last_offset + 2, 2)
    ]

# Split the columns into sequence ("list") features and scalar features.
feature_list = [col for col in data.columns if 'list' in col]
non_list_features = [col for col in data.columns if 'list' not in col]

# Only train/validation sessions (is_test-last == 0) are duplicated.
train_valid_sessions = data[data['is_test-last'] == 0]
list_frame = train_valid_sessions[non_list_features].copy()
list_frame['dynamic_truncated_lists'] = train_valid_sessions.progress_apply(
    partial(truncate_at_different_position, feature_list=feature_list), axis=1)

# Unstack the truncated versions to multiple rows: each row is one truncated
# copy of the session, with the scalar columns repeated accordingly.
lst_col = 'dynamic_truncated_lists'
nb_cuts = list_frame[lst_col].str.len()
scalar_columns = list_frame.columns.difference([lst_col])
duplicated_sessions = pd.DataFrame(
    {col: np.repeat(list_frame[col].values, nb_cuts) for col in scalar_columns}
).assign(
    **{lst_col: [cut for cuts in list_frame[lst_col].values for cut in cuts]}
)[list_frame.columns.tolist()]
# Expand each truncated entry into one column per list feature.
t = pd.DataFrame(duplicated_sessions[lst_col].to_list(), columns=feature_list)
duplicated_frame = pd.concat([duplicated_sessions, t], axis=1).drop(lst_col, axis=1)

100%|██████████| 214585/214585 [01:44<00:00, 2046.38it/s]


- Update nb_after_add-last column 

In [81]:
def get_nb_after_add(x):
    """Count the items after the first occurrence of the add-to-cart product.

    This redefines the ``get_nb_after_add`` of the earlier cell: the position
    is looked up here from ``'first_AC_id-last'`` instead of being read from a
    precomputed column.

    NOTE(review): ``truncate_at_different_position`` locates the add through
    ``product_action-list == 1`` whereas this uses the first occurrence of the
    product id in ``product_url_hash_list``; the two positions can differ when
    the product appears before it is added -- confirm this is intended.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``'first_AC_id-last'`` and ``'product_url_hash_list'``
        (sized object with ``.tolist()``).

    Returns
    -------
    int
        ``len(product_url_hash_list) - first_position - 1``.

    Raises
    ------
    ValueError
        If the product id is not in the sequence.
    """
    ac_product = x['first_AC_id-last']
    url_sequence = x['product_url_hash_list']
    first_position = url_sequence.tolist().index(ac_product)
    return len(url_sequence) - first_position - 1
# Recompute the count on the truncated sequences, one value per duplicated row.
duplicated_frame['updated_original_nb_after_add'] = duplicated_frame.progress_apply(get_nb_after_add, axis=1)

100%|██████████| 1173855/1173855 [00:30<00:00, 38105.89it/s]


In [82]:
# Allowed values for the artificial nb_after_add.
NB_AFTER_ADD = [0, 2, 4, 6, 8, 10]
# Rows with is_test-last == 0 get their nb_after_add-last refreshed.
train_rows = duplicated_frame['is_test-last'] == 0
# Snap each recomputed count to the closest allowed value. On a tie min()
# keeps the first (smaller) candidate; counts above 10 map to 10.
duplicated_frame.loc[train_rows, 'nb_after_add-last'] = (
    duplicated_frame.loc[train_rows, 'updated_original_nb_after_add']
    .progress_apply(
        lambda actual: min(NB_AFTER_ADD, key=lambda candidate: abs(candidate - actual))
    )
)

100%|██████████| 1173855/1173855 [00:03<00:00, 342219.81it/s]


- Merge back with test sessions 

In [83]:
# Test sessions are taken unchanged from `data` and stacked on top of the
# duplicated train/validation rows. Columns present on one side only are
# filled with NaN, and the original index labels of both frames are kept.
test_sessions = data[data['is_test-last'] == 1]
duplicated_frame = pd.concat([test_sessions, duplicated_frame])

- Save dataset 

In [84]:
# Persist the test sessions plus the duplicated train/validation sessions.
duplicated_frame.to_parquet(os.path.join(OUTPUT_DIR, 'duplicated_sessions_with_different_nb_after_add_cuts.parquet'))