In [None]:
#
# The MIT License (MIT)

# Copyright (c) 2021, NVIDIA CORPORATION

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

In [31]:
from IPython.display import display
import gc
import glob
import os

import cudf
import cupy
from functools import partial
import json
import numpy as np
import nvtabular as nvt
import pandas as pd
from tqdm import tqdm
import pickle 

from generate_features import generate_xgb_feats
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
tqdm.pandas()

- Load session data, product vectors and encoded mappings 

In [32]:
DATA_FOLDER = '/workspace/coveo_task2_v1_phase2/sessions_with_repetitions/'
# load session browsing data 
data = pd.read_parquet(DATA_FOLDER + "/session_interactions_task2_preproc2.parquet")
#load product embeddings 
desc_matrix, image_matrix, mapping_id_sku_emb_position = pickle.load(open(DATA_FOLDER + "/embedding_data_v2.pkl", "rb"))
#load encoded product-ids
mapping = pd.read_parquet(os.path.join(DATA_FOLDER,
                                       "categorify_workflow/categories/unique.product_url_hash_first_purchase_id_first_AC_id.parquet"))
#load session search data 
search_session = pd.read_parquet(DATA_FOLDER + "/session_search.parquet")
search_session.columns = ['original_session_id_hash', 'flat_query_vector', 'flat_product_skus_hash',
       'flat_clicked_skus_hash', 'impressions_size', 'clicks_size',
       'nb_queries', 'clicked-flag']
# merge browsing and search sessions
data = data.merge(search_session, on='original_session_id_hash', how='left')
data['add_product_id'] = data.progress_apply(lambda x: x['product_url_hash_list'][x['product_action-list'].tolist().index(1)], axis=1)

100%|██████████| 262296/262296 [00:05<00:00, 51428.84it/s]


In [33]:
from functools import partial
def truncate_session(x, col): 
    nb_after_add = x['nb_after_add-last']
    add_index = x['product_action-list'].tolist().index(1)
    return x[col][0:int(add_index+nb_after_add+1)]

In [34]:
feature_list = [col for col in data.columns if 'list' in col]
feature_list

['product_url_hash_list',
 'event_type-list',
 'product_action-list',
 'category_hash-list',
 'main_category-list',
 'price_bucket-list',
 'mean_price_hierarchy-list',
 'mean_price_main-list',
 'product_sku_hash_list',
 'ts-list',
 'timestamp_hour_cos-list',
 'timestamp_hour_sin-list',
 'timestamp_wd_sin-list',
 'timestamp_wd_cos-list',
 'timestamp_age_days-list',
 'timestamp_age_days_norm-list']

<h2> <center> First XGB set </center></h2> 

   <center>Train sessions are kept with their original length:  i.e without truncating to artificial nb_after_add events </center>

- Truncate Validation data to mimic Test set

In [35]:
for col in feature_list:
     data.loc[data.is_valid==1, col] = data.loc[data.is_valid==1, :].progress_apply(partial(truncate_session, col=col), axis=1)

100%|██████████| 40012/40012 [00:00<00:00, 46192.54it/s]
100%|██████████| 40012/40012 [00:01<00:00, 37929.59it/s]
100%|██████████| 40012/40012 [00:00<00:00, 42260.83it/s]
100%|██████████| 40012/40012 [00:00<00:00, 42824.97it/s]
100%|██████████| 40012/40012 [00:00<00:00, 44924.34it/s]
100%|██████████| 40012/40012 [00:00<00:00, 45309.28it/s]
100%|██████████| 40012/40012 [00:00<00:00, 44591.04it/s]
100%|██████████| 40012/40012 [00:00<00:00, 45410.46it/s]
100%|██████████| 40012/40012 [00:00<00:00, 47314.78it/s]
100%|██████████| 40012/40012 [00:00<00:00, 44193.24it/s]
100%|██████████| 40012/40012 [00:00<00:00, 40072.47it/s]
100%|██████████| 40012/40012 [00:00<00:00, 44690.37it/s]
100%|██████████| 40012/40012 [00:00<00:00, 42230.29it/s]
100%|██████████| 40012/40012 [00:01<00:00, 38822.61it/s]
100%|██████████| 40012/40012 [00:00<00:00, 43692.51it/s]
100%|██████████| 40012/40012 [00:00<00:00, 40092.89it/s]


In [36]:
xgboost_frame = generate_xgb_feats(data, search_session, mapping, image_matrix, desc_matrix, mapping_id_sku_emb_position)

  0%|          | 558/560394 [00:00<02:09, 4339.51it/s]

Number of products present in search and not in browsing data is: 3155


100%|██████████| 560394/560394 [00:05<00:00, 95404.40it/s] 
100%|██████████| 560394/560394 [00:03<00:00, 170182.53it/s]
100%|██████████| 262296/262296 [00:18<00:00, 14544.51it/s]
100%|██████████| 262296/262296 [00:10<00:00, 25529.50it/s]
100%|██████████| 262296/262296 [00:10<00:00, 25536.02it/s]
100%|██████████| 262296/262296 [02:40<00:00, 1638.03it/s]


In [37]:
booleans = [col for col in xgboost_frame.columns if 'has_' in col ]
xgboost_frame[booleans] = xgboost_frame[booleans].astype(int)

In [38]:
display(xgboost_frame)

Unnamed: 0,original_session_id_hash,session_id_hash,is_purchased-last,nb_after_add-last,is_test-last,is_valid,fold,product_url_id_list-0,product_url_id_list-1,product_url_id_list-2,product_url_id_list-3,product_url_id_list-4,event_type_list-0,event_type_list-1,event_type_list-2,event_type_list-3,event_type_list-4,product_action_list-0,product_action_list-1,product_action_list-2,product_action_list-3,product_action_list-4,category_list-0,category_list-1,category_list-2,category_list-3,category_list-4,price_list-0,price_list-1,price_list-2,price_list-3,price_list-4,relative_price_list-0,relative_price_list-1,relative_price_list-2,relative_price_list-3,relative_price_list-4,add_product_id,add_nb_interactions,add_has_been_detailed,add_has_been_removed,add_has_been_searched,add_has_been_clicked,add_category_hash,add_price,add_relative_price,session_length,nb_before_add,nb_unique_interactions,nb_queries
0,425942e274cbb9d78931fafd6caa6b2a2257176b7486c6...,68033,0,6.0,1,0,2,45686,84229.0,84229.0,45686.0,45686.0,2,2.0,2.0,2.0,2.0,6,0.0,6.0,6.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.010558,0.010558,0.010558,0.010558,0.010558,147516,6,0,0,0,0,112,4,-0.593181,16,2,4,1.0
1,4259a9cc59e6b0c0074a8c9eee1752b46a101fbe74967b...,68034,0,0.0,1,0,4,39550,65571.0,65571.0,65571.0,65571.0,2,1.0,1.0,1.0,1.0,0,3.0,3.0,1.0,1.0,0,86.0,86.0,86.0,86.0,0,10.0,10.0,10.0,10.0,0.010558,2.622464,2.622464,2.622464,2.622464,65571,4,0,0,0,0,86,10,2.622464,6,4,2,0.0
2,425beea309519760c9ff70dc4456f65de12ecf42a2112c...,68035,0,10.0,0,0,4,35894,69490.0,69490.0,124002.0,124002.0,1,2.0,2.0,2.0,2.0,3,6.0,0.0,6.0,0.0,32,0.0,0.0,0.0,0.0,10,0.0,0.0,0.0,0.0,2.654426,0.010558,0.010558,0.010558,0.010558,165310,4,0,0,0,0,34,9,-1.467394,32,4,10,0.0
3,425cd4a276a2909a7e0d39ef82f87a5c712b28ba6172f4...,68036,0,10.0,0,0,1,120429,21482.0,120429.0,21482.0,21482.0,2,1.0,2.0,1.0,1.0,6,3.0,0.0,3.0,3.0,0,137.0,0.0,137.0,137.0,0,9.0,0.0,9.0,9.0,0.010558,0.010558,0.010558,0.010558,0.010558,21482,8,0,0,0,0,137,9,0.010558,38,28,13,0.0
4,425d6c7c7fb987e308e12dec4fb9e28cfa9bdd3b71b6d3...,68037,1,10.0,0,0,5,7449,7449.0,7449.0,37510.0,37510.0,2,2.0,2.0,1.0,1.0,0,0.0,6.0,4.0,4.0,0,0.0,0.0,137.0,137.0,0,0.0,0.0,7.0,7.0,0.010558,0.010558,0.010558,0.010558,0.010558,37510,6,1,1,0,0,137,7,0.010558,38,8,10,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262291,ff97453ea3082fc58d55cf00fc1cf330ad5ac2a4a1e4f6...,261892,1,10.0,0,0,3,153319,163502.0,18473.0,163502.0,130435.0,1,1.0,1.0,1.0,1.0,4,4.0,4.0,4.0,4.0,31,0.0,34.0,0.0,28.0,7,0.0,1.0,0.0,6.0,0.154242,0.010558,-1.467394,0.010558,0.324840,130435,6,0,1,0,0,28,6,0.324840,122,10,19,0.0
262292,ff975de8b4917a1bb8b7024c523148caa41d26a28da046...,261893,0,0.0,1,0,4,81282,81282.0,81282.0,81282.0,0.0,1,1.0,1.0,1.0,0.0,3,3.0,1.0,1.0,0.0,11,11.0,11.0,11.0,0.0,9,9.0,9.0,9.0,0.0,2.469104,2.469104,2.469104,2.469104,-0.000199,81282,4,0,0,0,0,11,9,2.469104,4,2,1,0.0
262293,ff976608f29a478ee90c31096368579dc3bdfd9017f965...,261894,0,0.0,1,0,5,7143,140857.0,140857.0,140857.0,140857.0,2,1.0,1.0,1.0,1.0,0,3.0,3.0,1.0,1.0,0,137.0,137.0,137.0,137.0,0,8.0,8.0,8.0,8.0,0.010558,0.010558,0.010558,0.010558,0.010558,140857,12,0,0,0,0,137,8,0.010558,24,22,3,0.0
262294,ff97f3f9238113091ca6e6bcd1b14d093e7d1db8fba1f5...,261895,0,8.0,0,1,5,103519,45686.0,45686.0,45686.0,45686.0,1,2.0,2.0,2.0,2.0,1,0.0,6.0,0.0,6.0,46,0.0,0.0,0.0,0.0,7,0.0,0.0,0.0,0.0,0.721794,0.010558,0.010558,0.010558,0.010558,103519,4,0,0,0,0,46,7,0.721794,10,4,3,1.0


- Save data for training 

In [43]:
pd.read_parquet("/workspace/coveo_task2_v1_phase2/sessions_with_repetitions/categorify_workflow/categories/unique.product_action.parquet")

Unnamed: 0,product_action,product_action_count
0,,0
1,add,765314
2,click,190612
3,detail,3043570
4,purchase,128728
5,remove,590246
6,view,3471259


In [39]:
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, 'xgboost_data/without_truncation')
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
for fold in range(1, 6): 
    xgboost_frame.loc[(xgboost_frame['is_test-last']==1) & (xgboost_frame.fold==fold)].to_parquet(
        os.path.join(OUTPUT_FOLDER, 'test-%s.parquet'%fold))
    
    xgboost_frame.loc[(xgboost_frame['is_valid']==1) & (xgboost_frame.fold==fold)].to_parquet(
        os.path.join(OUTPUT_FOLDER, 'valid-%s.parquet'%fold))
    
    xgboost_frame.loc[(xgboost_frame['is_valid']==0) & (xgboost_frame['is_test-last']==0) &
                          (xgboost_frame.fold==fold)].to_parquet(os.path.join(OUTPUT_FOLDER, 'train-%s.parquet'%fold))

In [40]:
xgboost_frame[xgboost_frame['is_test-last']==1].to_parquet(os.path.join(OUTPUT_FOLDER, 'test-full.parquet'))

<h2> <center> Second XGB set </center></h2> 

   <center>  - Train sessions are truncated with respect to the artificial nb_after_add feature </center>
    
   <center> - Keep the original distribution of validation (nb_after_add different from the test set)  </center>  

- Truncate Train sessions to nb_after_add event 

In [10]:
for col in feature_list:
     data.loc[(data.is_valid==0) & (data['is_test-last']==0), col] = \
        data.loc[(data.is_valid==0) & (data['is_test-last']==0), :].progress_apply(partial(truncate_session, col=col), axis=1)

100%|██████████| 174573/174573 [00:04<00:00, 43154.38it/s]
100%|██████████| 174573/174573 [00:04<00:00, 36104.96it/s]
100%|██████████| 174573/174573 [00:04<00:00, 42765.53it/s]
100%|██████████| 174573/174573 [00:04<00:00, 40553.91it/s]
100%|██████████| 174573/174573 [00:04<00:00, 42043.93it/s]
100%|██████████| 174573/174573 [00:04<00:00, 40387.32it/s]
100%|██████████| 174573/174573 [00:04<00:00, 41896.66it/s]
100%|██████████| 174573/174573 [00:04<00:00, 36615.13it/s]
100%|██████████| 174573/174573 [00:04<00:00, 38417.15it/s]
100%|██████████| 174573/174573 [00:04<00:00, 38606.89it/s]
100%|██████████| 174573/174573 [00:04<00:00, 39196.66it/s]
100%|██████████| 174573/174573 [00:04<00:00, 42419.14it/s]
100%|██████████| 174573/174573 [00:04<00:00, 41030.36it/s]
100%|██████████| 174573/174573 [00:04<00:00, 41488.04it/s]
100%|██████████| 174573/174573 [00:04<00:00, 39380.16it/s]
100%|██████████| 174573/174573 [00:04<00:00, 43208.65it/s]


- Generate xgb features

In [11]:
xgboost_frame = generate_xgb_feats(data)

100%|██████████| 262296/262296 [00:14<00:00, 17770.28it/s]
100%|██████████| 262296/262296 [00:06<00:00, 40301.78it/s]
100%|██████████| 262296/262296 [00:10<00:00, 24293.92it/s]


- Save data 

In [12]:
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, 'xgboost_data/with_truncation_orignial_nb_distribution')
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
for fold in range(1, 6): 
    xgboost_frame.loc[(xgboost_frame['is_test-last']==1) & (xgboost_frame.fold==fold)].to_parquet(
        os.path.join(OUTPUT_FOLDER, 'test-%s.parquet'%fold))
    
    xgboost_frame.loc[(xgboost_frame['is_valid']==1) & (xgboost_frame.fold==fold)].to_parquet(
        os.path.join(OUTPUT_FOLDER, 'valid-%s.parquet'%fold))
    
    xgboost_frame.loc[(xgboost_frame['is_valid']==0) & (xgboost_frame['is_test-last']==0) &
                          (xgboost_frame.fold==fold)].to_parquet(os.path.join(OUTPUT_FOLDER, 'train-%s.parquet'%fold))

In [13]:
xgboost_frame[xgboost_frame['is_test-last']==1].to_parquet(os.path.join(OUTPUT_FOLDER, 'test-full.parquet'))

<h2> <center> Third XGB set </center></h2> 

   <center>  - Train/Validation sessions are truncated with respect to the artificial nb_after_add feature </center>
    
   <center>  - Resample validation to have the same distribution of nb_after_add as in test set  </center>  

 

In [14]:
valid_mask = data['is_valid']==1
test_mask = data['is_test-last']==1
train_mask = (data['is_valid']==0) & (data['is_test-last']==0)

- Relative distribution of nb_after_add

In [15]:
frames = []
frames.append(data[train_mask]['nb_after_add-last'].value_counts().reset_index(name='train_nb_after_add').set_index('index') / train_mask.sum())
frames.append(data[test_mask]['nb_after_add-last'].value_counts().reset_index(name='test_nb_after_add').set_index('index') / test_mask.sum())
frames.append(data[valid_mask]['nb_after_add-last'].value_counts().reset_index(name='valid_nb_after_add').set_index('index') / valid_mask.sum())
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.007057,0.564251,0.006023
2.0,0.0376,0.173,0.037289
4.0,0.055713,0.099034,0.060182
6.0,0.056509,0.069816,0.062231
8.0,0.05915,0.051938,0.06568
10.0,0.78397,0.041961,0.768594


- Truncate validation sessions with nb_after_add > 0 to the AC event (nb_after_add = 0)

In [16]:
from functools import partial
def truncate_session_to_AC(x, col, nb_add_sample=0): 
    add_index = x['product_action-list'].tolist().index(1)
    return x[col][0:int(add_index+nb_add_sample+1)]
def resample_sessions(data, feature_list, nb_add_sample=0):
    for col in feature_list:
        data[col] = data.apply(partial(truncate_session_to_AC, col=col, nb_add_sample=nb_add_sample), axis=1)
    return data

In [17]:
data_resample_valid = data.copy()

In [18]:
for nb_after_add in [6, 8, 10]: 
    nb_to_resample = abs(int( (relative_nb_distribution.loc[nb_after_add, 'valid_nb_after_add'] - relative_nb_distribution.loc[nb_after_add, 'test_nb_after_add']) * valid_mask.sum()))
    print("Truncate %s random sessions with nb_after_add == %s" %(nb_to_resample, nb_after_add))
    valid_session_nb = data_resample_valid[valid_mask & (data_resample_valid['nb_after_add-last']==nb_after_add)].session_id_hash.values
    sample = np.random.choice(valid_session_nb, nb_to_resample, replace=False)
    
    if nb_after_add != 10:
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample), 'product_url_hash_list'] = resample_sessions(
            data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample), :],
            feature_list)
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample), 'nb_after_add-last'] = 0
    else: 
        nb_sample_2 =  abs(int((relative_nb_distribution.loc[2, 'valid_nb_after_add'] - relative_nb_distribution.loc[2, 'test_nb_after_add']) * valid_mask.sum()))
        nb_sample_4 =  abs(int((relative_nb_distribution.loc[4, 'valid_nb_after_add'] - relative_nb_distribution.loc[4, 'test_nb_after_add']) * valid_mask.sum()))
        sample_2 = sample[:nb_sample_2]
        sample_4 = sample[nb_sample_2:nb_sample_2+nb_sample_4]
        sample_0 = sample[nb_sample_2+nb_sample_4:]
        
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_0), 'product_url_hash_list'] = resample_sessions(
            data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_0), :],
            feature_list)
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_0), 'nb_after_add-last'] = 0
        
        
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_2), 'product_url_hash_list'] = resample_sessions(
            data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_2), :],
            feature_list,  nb_add_sample=2)
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_2), 'nb_after_add-last'] = 2
        
        
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_4), 'product_url_hash_list'] = resample_sessions(
            data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_4), :],
            feature_list,  nb_add_sample=4)
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_4), 'nb_after_add-last'] = 4

Truncate 303 random sessions with nb_after_add == 6
Truncate 549 random sessions with nb_after_add == 8
Truncate 29074 random sessions with nb_after_add == 10


In [19]:
frames = []
frames.append(data_resample_valid[train_mask]['nb_after_add-last'].value_counts().reset_index(name='train_nb_after_add').set_index('index') / train_mask.sum())
frames.append(data_resample_valid[test_mask]['nb_after_add-last'].value_counts().reset_index(name='test_nb_after_add').set_index('index') / test_mask.sum())
frames.append(data_resample_valid[valid_mask]['nb_after_add-last'].value_counts().reset_index(name='valid_nb_after_add').set_index('index') / valid_mask.sum())
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.007057,0.564251,0.579401
2.0,0.0376,0.173,0.172998
4.0,0.055713,0.099034,0.09902
6.0,0.056509,0.069816,0.054659
8.0,0.05915,0.051938,0.051959
10.0,0.78397,0.041961,0.041962


- Generate xgb features 

In [20]:
xgboost_frame = generate_xgb_feats(data_resample_valid)

100%|██████████| 262296/262296 [00:16<00:00, 15864.82it/s]
100%|██████████| 262296/262296 [00:06<00:00, 41267.21it/s]
100%|██████████| 262296/262296 [00:11<00:00, 23561.32it/s]


In [21]:
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, 'xgboost_data/with_truncation_same_valid_test_nb_distribution')
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
for fold in range(1, 6): 
    xgboost_frame.loc[(xgboost_frame['is_test-last']==1) & (xgboost_frame.fold==fold)].to_parquet(
        os.path.join(OUTPUT_FOLDER, 'test-%s.parquet'%fold))
    
    xgboost_frame.loc[(xgboost_frame['is_valid']==1) & (xgboost_frame.fold==fold)].to_parquet(
        os.path.join(OUTPUT_FOLDER, 'valid-%s.parquet'%fold))
    
    xgboost_frame.loc[(xgboost_frame['is_valid']==0) & (xgboost_frame['is_test-last']==0) &
                          (xgboost_frame.fold==fold)].to_parquet(os.path.join(OUTPUT_FOLDER, 'train-%s.parquet'%fold))

In [22]:
xgboost_frame[xgboost_frame['is_test-last']==1].to_parquet(os.path.join(OUTPUT_FOLDER, 'test-full.parquet'))

<h2> <center> Fourth XGB set </center></h2> 

   <center>  - Train/Validation sessions are truncated with respect to the artificial nb_after_add feature </center>
    
   <center>  - Resample Validation and Train to have the same distribution of nb_after_add as in test set  </center>  


In [23]:
frames = []
frames.append(data_resample_valid[train_mask]['nb_after_add-last'].value_counts().reset_index(name='train_nb_after_add').set_index('index') / train_mask.sum())
frames.append(data_resample_valid[test_mask]['nb_after_add-last'].value_counts().reset_index(name='test_nb_after_add').set_index('index') / test_mask.sum())
frames.append(data_resample_valid[valid_mask]['nb_after_add-last'].value_counts().reset_index(name='valid_nb_after_add').set_index('index') / valid_mask.sum())
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.007057,0.564251,0.579401
2.0,0.0376,0.173,0.172998
4.0,0.055713,0.099034,0.09902
6.0,0.056509,0.069816,0.054659
8.0,0.05915,0.051938,0.051959
10.0,0.78397,0.041961,0.041962


In [24]:
for nb_after_add in [6, 8, 10]: 
    nb_to_resample = abs(int( (relative_nb_distribution.loc[nb_after_add, 'train_nb_after_add'] - relative_nb_distribution.loc[nb_after_add, 'test_nb_after_add']) * train_mask.sum()))
    print("Truncate %s random sessions with nb_after_add == %s" %(nb_to_resample, nb_after_add))
    train_session_nb = data_resample_valid[train_mask & (data_resample_valid['nb_after_add-last']==nb_after_add)].session_id_hash.values
    sample = np.random.choice(train_session_nb, nb_to_resample, replace=False)
    
    if nb_after_add != 10:
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample), 'product_url_hash_list'] = resample_sessions(
            data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample), :],
            feature_list)
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample), 'nb_after_add-last'] = 0
    else: 
        nb_sample_2 =  abs(int((relative_nb_distribution.loc[2, 'train_nb_after_add'] - relative_nb_distribution.loc[2, 'test_nb_after_add']) * train_mask.sum()))
        nb_sample_4 =  abs(int((relative_nb_distribution.loc[4, 'train_nb_after_add'] - relative_nb_distribution.loc[4, 'test_nb_after_add']) * train_mask.sum()))
        sample_2 = sample[:nb_sample_2]
        sample_4 = sample[nb_sample_2:nb_sample_2+nb_sample_4]
        sample_0 = sample[nb_sample_2+nb_sample_4:]
        
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_0), 'product_url_hash_list'] = resample_sessions(
            data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_0), :],
            feature_list)
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_0), 'nb_after_add-last'] = 0
        
        
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_2), 'product_url_hash_list'] = resample_sessions(
            data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_2), :],
            feature_list,  nb_add_sample=2)
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_2), 'nb_after_add-last'] = 2
        
        
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_4), 'product_url_hash_list'] = resample_sessions(
            data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_4), :],
            feature_list,  nb_add_sample=4)
        data_resample_valid.loc[data_resample_valid.session_id_hash.isin(sample_4), 'nb_after_add-last'] = 4

Truncate 2323 random sessions with nb_after_add == 6
Truncate 1259 random sessions with nb_after_add == 8
Truncate 129534 random sessions with nb_after_add == 10


In [25]:
frames = []
frames.append(data_resample_valid[train_mask]['nb_after_add-last'].value_counts().reset_index(name='train_nb_after_add').set_index('index') / train_mask.sum())
frames.append(data_resample_valid[test_mask]['nb_after_add-last'].value_counts().reset_index(name='test_nb_after_add').set_index('index') / test_mask.sum())
frames.append(data_resample_valid[valid_mask]['nb_after_add-last'].value_counts().reset_index(name='valid_nb_after_add').set_index('index') / valid_mask.sum())
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.590865,0.564251,0.579401
2.0,0.172999,0.173,0.172998
4.0,0.09903,0.099034,0.09902
6.0,0.043203,0.069816,0.054659
8.0,0.051938,0.051938,0.051959
10.0,0.041965,0.041961,0.041962


- Generate xgb features 

In [27]:
xgboost_frame = generate_xgb_feats(data_resample_valid)

100%|██████████| 262296/262296 [00:14<00:00, 17799.00it/s]
100%|██████████| 262296/262296 [00:06<00:00, 40234.23it/s]
100%|██████████| 262296/262296 [00:11<00:00, 23369.92it/s]


- Save data 

In [29]:
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, 'xgboost_data/with_truncation_same_train_valid_test_nb_distribution')
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
for fold in range(1, 6): 
    xgboost_frame.loc[(xgboost_frame['is_test-last']==1) & (xgboost_frame.fold==fold)].to_parquet(
        os.path.join(OUTPUT_FOLDER, 'test-%s.parquet'%fold))
    
    xgboost_frame.loc[(xgboost_frame['is_valid']==1) & (xgboost_frame.fold==fold)].to_parquet(
        os.path.join(OUTPUT_FOLDER, 'valid-%s.parquet'%fold))
    
    xgboost_frame.loc[(xgboost_frame['is_valid']==0) & (xgboost_frame['is_test-last']==0) &
                          (xgboost_frame.fold==fold)].to_parquet(os.path.join(OUTPUT_FOLDER, 'train-%s.parquet'%fold))

In [30]:
xgboost_frame[xgboost_frame['is_test-last']==1].to_parquet(os.path.join(OUTPUT_FOLDER, 'test-full.parquet'))

<h2> <center> 6th XGB set - Same sessions with different split point after AC </center> </h2>

In [4]:
data = pd.read_parquet(os.path.join(DATA_FOLDER, 'duplicated_sessions_with_different_nb_after_add_cuts.parquet'))

- Check the new distribution 

In [5]:
valid_mask = data['is_valid']==1
test_mask = data['is_test-last']==1
train_mask = (data['is_valid']==0) & (data['is_test-last']==0)
frames = []
frames.append(data[train_mask]['nb_after_add-last'].value_counts().reset_index(name='train_nb_after_add').set_index('index') / train_mask.sum())
frames.append(data[test_mask]['nb_after_add-last'].value_counts().reset_index(name='test_nb_after_add').set_index('index') / test_mask.sum())
frames.append(data[valid_mask]['nb_after_add-last'].value_counts().reset_index(name='valid_nb_after_add').set_index('index') / valid_mask.sum())
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.013082,0.564251,0.005031
2.0,0.095794,0.173,0.095847
4.0,0.123067,0.099034,0.126571
6.0,0.127046,0.069816,0.132239
8.0,0.129072,0.051938,0.134069
10.0,0.511939,0.041961,0.506243


- Generate xgb features

In [6]:
xgboost_frame = generate_xgb_feats(data)

100%|██████████| 1221566/1221566 [01:14<00:00, 16502.83it/s]
100%|██████████| 1221566/1221566 [00:31<00:00, 39000.77it/s]
100%|██████████| 1221566/1221566 [00:51<00:00, 23920.66it/s]


In [7]:
display(xgboost_frame.head())

Unnamed: 0,original_session_id_hash,session_id_hash,is_purchased-last,nb_after_add-last,is_test-last,is_valid,fold,product_url_id_list-0,product_url_id_list-1,product_url_id_list-2,product_url_id_list-3,product_url_id_list-4,event_type_list-0,event_type_list-1,event_type_list-2,event_type_list-3,event_type_list-4,product_action_list-0,product_action_list-1,product_action_list-2,product_action_list-3,product_action_list-4,category_list-0,category_list-1,category_list-2,category_list-3,category_list-4,price_list-0,price_list-1,price_list-2,price_list-3,price_list-4,relative_price_list-0,relative_price_list-1,relative_price_list-2,relative_price_list-3,relative_price_list-4,add_product_id,add_nb_interactions,add_has_been_detailed,add_has_been_removed,add_category_hash,add_price,add_relative_price,session_length,nb_unique_interactions
0,425942e274cbb9d78931fafd6caa6b2a2257176b7486c6...,68033,0,6.0,1,0,2,45686,84229.0,84229.0,45686.0,45686.0,2,2.0,2.0,2.0,2.0,6,0.0,6.0,6.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.010558,0.010558,0.010558,0.010558,0.010558,147516,6,False,False,112,4,-0.593181,16,4
1,4259a9cc59e6b0c0074a8c9eee1752b46a101fbe74967b...,68034,0,0.0,1,0,4,39550,65571.0,65571.0,65571.0,65571.0,2,1.0,1.0,1.0,1.0,0,3.0,3.0,1.0,1.0,0,86.0,86.0,86.0,86.0,0,10.0,10.0,10.0,10.0,0.010558,2.622464,2.622464,2.622464,2.622464,65571,4,False,False,86,10,2.622464,6,2
2,425e34aba2ab0b5d5537645638aea41f14b4edb19b927e...,68039,0,0.0,1,0,2,161499,105204.0,105204.0,105204.0,105204.0,2,1.0,1.0,1.0,1.0,0,3.0,3.0,1.0,1.0,0,60.0,60.0,60.0,60.0,0,8.0,8.0,8.0,8.0,0.010558,1.145905,1.145905,1.145905,1.145905,105204,4,False,False,60,8,1.145905,26,8
3,425e404aaaf6d7ead16cc2c20bcbd563de0b85e0bb0a45...,68040,0,6.0,1,0,2,23721,103480.0,103480.0,103480.0,103480.0,2,1.0,1.0,1.0,1.0,6,3.0,3.0,1.0,1.0,0,137.0,137.0,137.0,137.0,0,7.0,7.0,7.0,7.0,0.010558,0.010558,0.010558,0.010558,0.010558,103480,12,False,False,137,7,0.010558,22,3
4,425e6d56007d0b30cf84dca1cba494d184a277b94b1674...,68042,0,0.0,1,0,3,140307,140307.0,140307.0,140307.0,140307.0,1,1.0,1.0,1.0,1.0,3,3.0,3.0,1.0,1.0,113,113.0,113.0,113.0,113.0,7,7.0,7.0,7.0,7.0,-2.792637,-2.792637,-2.792637,-2.792637,-2.792637,140307,6,False,False,113,7,-2.792637,6,1


In [10]:
xgboost_frame.columns

Index(['original_session_id_hash', 'session_id_hash', 'is_purchased-last',
       'nb_after_add-last', 'is_test-last', 'is_valid', 'fold',
       'product_url_id_list-0', 'product_url_id_list-1',
       'product_url_id_list-2', 'product_url_id_list-3',
       'product_url_id_list-4', 'event_type_list-0', 'event_type_list-1',
       'event_type_list-2', 'event_type_list-3', 'event_type_list-4',
       'product_action_list-0', 'product_action_list-1',
       'product_action_list-2', 'product_action_list-3',
       'product_action_list-4', 'category_list-0', 'category_list-1',
       'category_list-2', 'category_list-3', 'category_list-4', 'price_list-0',
       'price_list-1', 'price_list-2', 'price_list-3', 'price_list-4',
       'relative_price_list-0', 'relative_price_list-1',
       'relative_price_list-2', 'relative_price_list-3',
       'relative_price_list-4', 'add_product_id', 'add_nb_interactions',
       'add_has_been_detailed', 'add_has_been_removed', 'add_category_hash',
    

In [8]:
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, 'xgboost_data/duplicated_sessions_with_truncation')
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
for fold in range(1, 6): 
    xgboost_frame.loc[(xgboost_frame['is_test-last']==1) & (xgboost_frame.fold==fold)].to_parquet(
        os.path.join(OUTPUT_FOLDER, 'test-%s.parquet'%fold))
    
    xgboost_frame.loc[(xgboost_frame['is_valid']==1) & (xgboost_frame.fold==fold)].to_parquet(
        os.path.join(OUTPUT_FOLDER, 'valid-%s.parquet'%fold))
    
    xgboost_frame.loc[(xgboost_frame['is_valid']==0) & (xgboost_frame['is_test-last']==0) &
                          (xgboost_frame.fold==fold)].to_parquet(os.path.join(OUTPUT_FOLDER, 'train-%s.parquet'%fold))

In [9]:
xgboost_frame[xgboost_frame['is_test-last']==1].to_parquet(os.path.join(OUTPUT_FOLDER, 'test-full.parquet'))