In [None]:
#
# The MIT License (MIT)

# Copyright (c) 2021, NVIDIA CORPORATION

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

In [2]:
from IPython.display import display
import gc
import glob
import os

import cudf
import cupy
from functools import partial
import json
import numpy as np
import nvtabular as nvt
import pandas as pd
from tqdm import tqdm
import pickle 

from generate_features import generate_xgb_feats
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
tqdm.pandas()

- Load session data, product vectors and encoded mappings 

In [3]:
DATA_FOLDER = '/workspace/coveo_task2_v1_phase2/sessions_wo_repetitions/'

# Session browsing data. Every input path is built from DATA_FOLDER so that
# relocating the dataset only requires changing the constant above.
data = pd.read_parquet(os.path.join(DATA_FOLDER, "session_interactions_task2_preproc2.parquet"))

# Product embeddings: description matrix, image matrix and the mapping that is
# later handed to generate_xgb_feats as `mapping_id_sku_emb_position`.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open(os.path.join(DATA_FOLDER, "embedding_data_v2.pkl"), "rb") as embedding_file:
    desc_matrix, image_matrix, mapping_id_sku_emb_position = pickle.load(embedding_file)

# Encoded product ids.
mapping = pd.read_parquet(os.path.join(DATA_FOLDER,
                                       "categorify_workflow/categories/unique.product_url_hash_first_purchase_id_first_AC_id.parquet"))

# Session search data; the columns are renamed positionally, so the order below
# must match the column order stored in the parquet file.
search_session = pd.read_parquet(os.path.join(DATA_FOLDER, "session_search.parquet"))
search_session.columns = ['original_session_id_hash', 'flat_query_vector', 'flat_product_skus_hash',
       'flat_clicked_skus_hash', 'impressions_size', 'clicks_size',
       'nb_queries', 'clicked-flag']

In [4]:
from functools import partial
def truncate_session(x, col):
    """Cut one list-valued field of a session row shortly after the add-to-cart.

    Parameters
    ----------
    x : mapping-like row
        Must provide 'first_AC_id-last', 'nb_after_add-last',
        'product_url_hash_list' (an object exposing ``tolist()``) and ``col``.
    col : str
        Name of the list-valued field to truncate.

    Returns
    -------
    The leading slice of ``x[col]`` that ends ``nb_after_add-last`` positions
    after the first occurrence of 'first_AC_id-last' in 'product_url_hash_list'.

    Raises
    ------
    ValueError
        If the add-to-cart product id is not present in 'product_url_hash_list'.
    """
    ac_product = x['first_AC_id-last']
    extra_events = x['nb_after_add-last']
    # list.index returns the position of the first match.
    ac_position = x['product_url_hash_list'].tolist().index(ac_product)
    stop = int(ac_position + extra_events + 1)
    return x[col][:stop]
# Every column whose name contains 'list' is treated as a per-session sequence
# that has to be truncated together with the others.
feature_list = [name for name in data.columns if 'list' in name]

<h2> <center> First XGB set </center></h2> 

   <center>Train sessions keep their original length, i.e., they are not truncated to the artificial nb_after_add events </center>

- Truncate Validation data to mimic Test set

In [5]:
# Only validation rows are shortened in this set; all other rows keep their lists.
valid_rows = data.is_valid == 1
for col in feature_list:
    truncate_col = partial(truncate_session, col=col)
    data.loc[valid_rows, col] = data.loc[valid_rows, :].progress_apply(truncate_col, axis=1)

100%|██████████| 40012/40012 [00:01<00:00, 27340.25it/s]
100%|██████████| 40012/40012 [00:01<00:00, 25136.79it/s]
100%|██████████| 40012/40012 [00:01<00:00, 29432.15it/s]
100%|██████████| 40012/40012 [00:01<00:00, 34153.18it/s]
100%|██████████| 40012/40012 [00:01<00:00, 32512.55it/s]
100%|██████████| 40012/40012 [00:01<00:00, 32697.77it/s]
100%|██████████| 40012/40012 [00:01<00:00, 33819.20it/s]
100%|██████████| 40012/40012 [00:01<00:00, 32317.93it/s]
100%|██████████| 40012/40012 [00:01<00:00, 32995.96it/s]
100%|██████████| 40012/40012 [00:01<00:00, 28509.68it/s]
100%|██████████| 40012/40012 [00:01<00:00, 30325.39it/s]
100%|██████████| 40012/40012 [00:01<00:00, 29905.03it/s]
100%|██████████| 40012/40012 [00:01<00:00, 32036.27it/s]
100%|██████████| 40012/40012 [00:01<00:00, 32751.38it/s]
100%|██████████| 40012/40012 [00:01<00:00, 28525.46it/s]
100%|██████████| 40012/40012 [00:01<00:00, 31111.78it/s]
100%|██████████| 40012/40012 [00:01<00:00, 31197.07it/s]
100%|██████████| 40012/40012 [0

In [7]:
# First set: features from `data` with only the validation sessions truncated.
# generate_xgb_feats is imported from generate_features (implementation not shown here).
xgboost_frame = generate_xgb_feats(
    data,
    search_session,
    mapping,
    image_matrix,
    desc_matrix,
    mapping_id_sku_emb_position,
)

  0%|          | 496/560394 [00:00<02:50, 3282.19it/s]

Number of products present in search and not in browsing data is: 3155


100%|██████████| 560394/560394 [00:07<00:00, 76132.36it/s] 
100%|██████████| 560394/560394 [00:03<00:00, 171474.87it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate before/after count statistics


100%|██████████| 262296/262296 [00:33<00:00, 7909.84it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate add-to-cart (AC) features


100%|██████████| 262296/262296 [00:18<00:00, 14431.02it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate session-level features


100%|██████████| 262296/262296 [00:12<00:00, 21482.99it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate p first/last interactions


100%|██████████| 262296/262296 [00:22<00:00, 11908.05it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate similarity features


100%|██████████| 262296/262296 [04:25<00:00, 988.74it/s] 


Create columns of first and last interactions
Merge all features


In [8]:
# Rich preview of the generated feature frame. pandas elides the middle rows;
# every column is rendered because display.max_columns was set to None above.
display(xgboost_frame)

Unnamed: 0,original_session_id_hash,session_id_hash,is_purchased-last,nb_after_add-last,is_test-last,is_valid,fold,product_url_id_list_after-0,product_url_id_list_after-1,product_url_id_list_after-2,product_url_id_list_after-3,product_url_id_list_after-4,event_type_list_after-0,event_type_list_after-1,event_type_list_after-2,event_type_list_after-3,event_type_list_after-4,product_action_list_after-0,product_action_list_after-1,product_action_list_after-2,product_action_list_after-3,product_action_list_after-4,category_list_after-0,category_list_after-1,category_list_after-2,category_list_after-3,category_list_after-4,price_list_after-0,price_list_after-1,price_list_after-2,price_list_after-3,price_list_after-4,relative_price_list_after-0,relative_price_list_after-1,relative_price_list_after-2,relative_price_list_after-3,relative_price_list_after-4,product_url_id_list_before-0,product_url_id_list_before-1,product_url_id_list_before-2,product_url_id_list_before-3,product_url_id_list_before-4,event_type_list_before-0,event_type_list_before-1,event_type_list_before-2,event_type_list_before-3,event_type_list_before-4,product_action_list_before-0,product_action_list_before-1,product_action_list_before-2,product_action_list_before-3,product_action_list_before-4,category_list_before-0,category_list_before-1,category_list_before-2,category_list_before-3,category_list_before-4,price_list_before-0,price_list_before-1,price_list_before-2,price_list_before-3,price_list_before-4,relative_price_list_before-0,relative_price_list_before-1,relative_price_list_before-2,relative_price_list_before-3,relative_price_list_before-4,add_product_id,add_nb_interactions,add_has_been_detailed,add_has_been_removed,add_has_been_viewed,add_has_been_searched,add_has_been_clicked,add_category_hash,add_main_category,add_price,add_relative_price,add_relative_price_main,session_length,nb_unique_interactions,nb_queries,nb_add_before,nb_add_after,nb_detail_before,nb_detail_after,nb_remove_before,nb_remove
_after,nb_view_before,nb_view_after,nb_click_before,nb_click_after,mean_sim_desc,std_sim_desc,mean_sim_img,std_sim_img,mean_sim_desc_before,std_sim_desc_before,mean_sim_img_before,std_sim_img_before,mean_sim_desc_after,std_sim_desc_after,mean_sim_img_after,std_sim_img_after,main_category_similarity_general,main_category_similarity_add
0,4ddbca430251bfa7dc72992b0fdde75e71dba86a589386...,79745,0,0.0,1,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,53887,2,1,0,0,0,0,137,3,3,0.028997,2.215501,2,1,0.0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000
1,4ddbee1f69ec15f1e62ea52e9ccb92ebb19dd2188db950...,79746,0,6.0,0,0,2,61592.0,109345.0,76105.0,45567.0,169861.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,33.0,33.0,0.0,0.0,0.0,8.0,10.0,0.028997,0.028997,0.028997,2.713712,2.713712,46772.0,160280.0,28459.0,43248.0,154652.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,10.0,0.028997,0.028997,0.028997,0.028997,3.056660,135325,2,1,0,0,0,0,29,2,10,3.056660,0.039367,18,13,0.0,0,0,1,2,0,0,0,0,0,0,1.0,0.0,1.0,0.0,0.293978,0.000000,0.211881,0.0,1.045129,0.038483,0.838781,0.002330,0.153846,0.307692
2,4ddc1abd0d0f3d9b6a2d094ccce4554ef4b4d9cc980d6f...,79747,0,8.0,1,0,2,4943.0,15772.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,39994,8,1,0,0,0,0,2,1,3,-1.460671,-2.587784,10,3,0.0,0,0,0,0,0,1,0,0,0,0,1.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.666667,0.333333
3,4ddc320fb590ea67dd8ac3d5a94af425f97a399546fe37...,79748,0,10.0,0,0,4,160548.0,45686.0,57309.0,35737.0,30793.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,10.0,0.028997,0.028997,0.028997,0.028997,0.884480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,2092,4,1,0,0,0,0,30,2,10,0.884480,0.039367,22,11,1.0,0,0,0,2,0,0,0,0,0,0,1.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.0,0.234273,0.234273,0.534890,0.332422,0.181818,0.272727
4,4ddc4b1139366c4c89be5547e6593722a2e43e11f8179b...,79749,0,0.0,1,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,146875,2,1,0,0,0,0,4,1,9,2.329340,-2.587784,2,1,0.0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262291,fa2207eae886fa5312a4f9224fc3f030ac2758cca79824...,256300,1,0.0,0,0,1,7449.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,65701,3,1,0,0,0,0,129,3,9,0.669083,2.215501,4,2,0.0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,0.500000
262292,fa2259a3d7e280179a81a8c233f7dbe0d0e9f4fdd4b171...,256301,0,4.0,0,0,4,39541.0,154599.0,52013.0,87960.0,82550.0,2.0,1.0,1.0,1.0,2.0,0.0,3.0,3.0,3.0,0.0,0.0,137.0,137.0,137.0,0.0,0.0,8.0,8.0,8.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,46772.0,115404.0,19683.0,62892.0,34492.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,137.0,137.0,0.0,0.0,0.0,9.0,8.0,0.028997,0.028997,0.028997,0.028997,0.028997,11875,3,1,0,0,0,0,137,3,8,0.028997,2.215501,42,20,1.0,0,0,6,3,0,0,0,0,0,0,1.0,0.0,1.0,0.0,0.099280,0.070202,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.100000,0.500000
262293,fa22cc6b0ea7317fd8bf3827b96d98642545c0ff83c2d3...,256302,0,0.0,0,0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,7449.0,165955.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,84131,3,1,0,0,0,0,137,3,6,0.028997,2.215501,7,3,0.0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.666667,0.333333
262294,fa23cfe849f188989afdb94e7761ecc923e0f36bb2ce40...,256303,0,0.0,0,1,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,173918.0,138830.0,47944.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,1.173350,0.028997,0.028997,0.028997,0.028997,156798,2,1,0,0,0,0,60,2,9,1.173350,0.039367,5,4,0.0,0,0,1,0,0,0,0,0,0,0,1.0,0.0,1.0,0.0,0.096042,0.000000,0.082167,0.0,0.000000,0.000000,0.000000,0.000000,0.500000,0.500000


- Save data for training 

In [9]:
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, 'xgboost_data/without_truncation')
# Create the target directory up front so the parquet writes below do not fail
# on a fresh checkout where it does not exist yet.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Split membership is fold-independent, so the masks are computed once.
is_test = xgboost_frame['is_test-last'] == 1
is_valid = xgboost_frame['is_valid'] == 1
is_train = (xgboost_frame['is_valid'] == 0) & (xgboost_frame['is_test-last'] == 0)

# One file per (split, fold): test-<fold>, valid-<fold>, train-<fold>.
for fold in range(1, 6):
    in_fold = xgboost_frame.fold == fold
    for split_name, split_mask in (('test', is_test), ('valid', is_valid), ('train', is_train)):
        xgboost_frame.loc[split_mask & in_fold].to_parquet(
            os.path.join(OUTPUT_FOLDER, '%s-%s.parquet' % (split_name, fold)))

In [10]:
# All test sessions, regardless of fold, go into one file.
test_full_path = os.path.join(OUTPUT_FOLDER, 'test-full.parquet')
xgboost_frame.loc[xgboost_frame['is_test-last'] == 1].to_parquet(test_full_path)

<h2> <center> Second XGB set </center></h2> 

   <center>  - Train/Validation sessions are truncated with respect to the artificial nb_after_add feature </center>
    
   <center> - Keep the original distribution of validation (nb_after_add different from the test set)  </center>  

- Truncate every session to the nb_after_add events that follow the add-to-cart 

In [7]:
# Second set: every session (train, validation and test) is truncated.
for col in feature_list:
    truncate_col = partial(truncate_session, col=col)
    data[col] = data.progress_apply(truncate_col, axis=1)

100%|██████████| 262296/262296 [00:07<00:00, 33835.68it/s]
100%|██████████| 262296/262296 [00:07<00:00, 36222.09it/s]
100%|██████████| 262296/262296 [00:07<00:00, 34109.76it/s]
100%|██████████| 262296/262296 [00:07<00:00, 34970.57it/s]
100%|██████████| 262296/262296 [00:07<00:00, 35704.22it/s]
100%|██████████| 262296/262296 [00:07<00:00, 35556.87it/s]
100%|██████████| 262296/262296 [00:07<00:00, 34681.66it/s]
100%|██████████| 262296/262296 [00:07<00:00, 35724.44it/s]
100%|██████████| 262296/262296 [00:06<00:00, 37592.02it/s]
100%|██████████| 262296/262296 [00:07<00:00, 35874.67it/s]
100%|██████████| 262296/262296 [00:07<00:00, 35248.85it/s]
100%|██████████| 262296/262296 [00:07<00:00, 36051.61it/s]
100%|██████████| 262296/262296 [00:07<00:00, 35842.46it/s]
100%|██████████| 262296/262296 [00:07<00:00, 34429.00it/s]
100%|██████████| 262296/262296 [00:06<00:00, 37613.73it/s]
100%|██████████| 262296/262296 [00:07<00:00, 35612.90it/s]
100%|██████████| 262296/262296 [00:07<00:00, 35684.40it/

- Generate xgb features

In [8]:
# Second set: features from `data` after all sessions were truncated.
xgboost_frame = generate_xgb_feats(
    data,
    search_session,
    mapping,
    image_matrix,
    desc_matrix,
    mapping_id_sku_emb_position,
)

  0%|          | 502/560394 [00:00<03:06, 3008.12it/s]

Number of products present in search and not in browsing data is: 31136


100%|██████████| 560394/560394 [00:05<00:00, 106601.40it/s]
100%|██████████| 560394/560394 [00:01<00:00, 292272.07it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate before/after count statistics


100%|██████████| 262296/262296 [00:32<00:00, 7997.10it/s] 
  0%|          | 0/262296 [00:00<?, ?it/s]

generate add-to-cart (AC) features


100%|██████████| 262296/262296 [00:18<00:00, 14025.36it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate session-level features


100%|██████████| 262296/262296 [00:12<00:00, 21195.99it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate p first/last interactions


100%|██████████| 262296/262296 [00:23<00:00, 10986.07it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate similarity features


100%|██████████| 262296/262296 [03:56<00:00, 1108.85it/s]


- Save data 

In [9]:
# NOTE(review): 'orignial' is misspelled, but the folder name is kept as-is
# because other code may read from this exact path.
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, 'xgboost_data/with_truncation_orignial_nb_distribution')
# Create the target directory up front so the parquet writes below do not fail
# on a fresh checkout where it does not exist yet.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Split membership is fold-independent, so the masks are computed once.
is_test = xgboost_frame['is_test-last'] == 1
is_valid = xgboost_frame['is_valid'] == 1
is_train = (xgboost_frame['is_valid'] == 0) & (xgboost_frame['is_test-last'] == 0)

# One file per (split, fold): test-<fold>, valid-<fold>, train-<fold>.
for fold in range(1, 6):
    in_fold = xgboost_frame.fold == fold
    for split_name, split_mask in (('test', is_test), ('valid', is_valid), ('train', is_train)):
        xgboost_frame.loc[split_mask & in_fold].to_parquet(
            os.path.join(OUTPUT_FOLDER, '%s-%s.parquet' % (split_name, fold)))

In [10]:
# All test sessions, regardless of fold, go into one file.
test_full_path = os.path.join(OUTPUT_FOLDER, 'test-full.parquet')
xgboost_frame.loc[xgboost_frame['is_test-last'] == 1].to_parquet(test_full_path)

<h2> <center> Third XGB set </center></h2> 

   <center>  - Train/Validation sessions are truncated with respect to the artificial nb_after_add feature </center>
    
   <center>  - Resample validation to have the same distribution of nb_after_add as in test set  </center>  

 

In [3]:
# Boolean row masks for the three splits, indexed like `data`.
valid_mask = data['is_valid'].eq(1)
test_mask = data['is_test-last'].eq(1)
# Train = neither validation nor test.
train_mask = data['is_valid'].eq(0) & data['is_test-last'].eq(0)

- Absolute distribution of nb_after_add 

In [12]:
# Number of sessions per nb_after_add value, one column per split.
# rename()/rename_axis('index') replaces reset_index(name=...).set_index('index'):
# the latter relies on the reset column being labelled 'index', which is no
# longer the case from pandas 2.0 on (value_counts names its index there).
frames = [
    data.loc[split_mask, 'nb_after_add-last']
        .value_counts()
        .rename(column_name)
        .rename_axis('index')
        .to_frame()
    for column_name, split_mask in (
        ('train_nb_after_add', train_mask),
        ('test_nb_after_add', test_mask),
        ('valid_nb_after_add', valid_mask),
    )
]
pd.concat(frames, axis=1)

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,29590,26921,7237
2.0,38750,8254,9568
4.0,27850,4725,6476
6.0,19016,3331,4214
8.0,13272,2478,2900
10.0,46095,2002,9617


- Relative distribution of nb_after_add

In [13]:
# Share of sessions per nb_after_add value within each split (counts divided by
# the number of rows in the split).
# rename()/rename_axis('index') replaces reset_index(name=...).set_index('index'),
# which raises KeyError on pandas >= 2.0 where the reset column is not 'index'.
frames = [
    (data.loc[split_mask, 'nb_after_add-last'].value_counts() / split_mask.sum())
        .rename(column_name)
        .rename_axis('index')
        .to_frame()
    for column_name, split_mask in (
        ('train_nb_after_add', train_mask),
        ('test_nb_after_add', test_mask),
        ('valid_nb_after_add', valid_mask),
    )
]
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.169499,0.564251,0.180871
2.0,0.22197,0.173,0.239128
4.0,0.159532,0.099034,0.161851
6.0,0.108929,0.069816,0.105318
8.0,0.076026,0.051938,0.072478
10.0,0.264044,0.041961,0.240353


- Truncate validation sessions with nb_after_add > 0 to the AC event (nb_after_add = 0)

In [14]:
from functools import partial
def truncate_session_to_AC(x, col):
    """Cut one list-valued field of a session row right at the add-to-cart.

    Returns the leading slice of ``x[col]`` up to and including the position of
    the first occurrence of ``x['first_AC_id-last']`` in
    ``x['product_url_hash_list']``.

    Raises ValueError if the product id is not present in that list.
    """
    product_id = x['first_AC_id-last']
    add_index = x['product_url_hash_list'].tolist().index(product_id)
    return x[col][0:int(add_index + 1)]


def resample_sessions(data, feature_list):
    """Return a copy of ``data`` whose ``feature_list`` columns end at the add-to-cart.

    Parameters
    ----------
    data : pandas.DataFrame
        Session rows providing 'first_AC_id-last', 'product_url_hash_list' and
        every column named in ``feature_list``.
    feature_list : iterable of str
        List-valued columns to truncate.

    Returns
    -------
    pandas.DataFrame
        A new frame; the argument is left untouched. The previous version wrote
        into ``data`` itself, which silently modified the caller's frame (or a
        temporary ``.loc`` slice, the SettingWithCopy situation).
    """
    truncated = data.copy()
    for col in feature_list:
        # Each column is cut relative to the add-to-cart position, which stays
        # the same even after 'product_url_hash_list' itself has been shortened.
        truncated[col] = truncated.apply(partial(truncate_session_to_AC, col=col), axis=1)
    return truncated

In [15]:
# Work on a separate frame so `data` keeps the original nb_after_add values.
data_resample_valid = data.copy(deep=True)

In [16]:
for nb_after_add in [2, 4, 6, 8, 10]: 
    # Surplus of validation sessions at this nb_after_add compared with the test share.
    nb_to_resample = int((relative_nb_distribution.loc[nb_after_add, 'valid_nb_after_add'] - relative_nb_distribution.loc[nb_after_add, 'test_nb_after_add']) * valid_mask.sum())
    print("Truncate %s random sessions with nb_after_add == %s" %(nb_to_resample, nb_after_add))
    valid_session_nb = data_resample_valid[valid_mask & (data_resample_valid['nb_after_add-last']==nb_after_add)].session_id_hash.values
    # NOTE(review): the draw is unseeded, so the selected sessions differ between runs.
    sample = np.random.choice(valid_session_nb, nb_to_resample, replace=False)
    sample_mask = data_resample_valid.session_id_hash.isin(sample)

    truncated = resample_sessions(data_resample_valid.loc[sample_mask, :], feature_list)
    # Write back every truncated list column. The earlier code assigned the
    # result to 'product_url_hash_list' only, so the remaining list columns kept
    # their full length while nb_after_add-last was reset to 0.
    for col in feature_list:
        data_resample_valid.loc[sample_mask, col] = truncated[col]

    data_resample_valid.loc[sample_mask, 'nb_after_add-last'] = 0

Truncate 2645 random sessions with nb_after_add == 2
Truncate 2513 random sessions with nb_after_add == 4
Truncate 1420 random sessions with nb_after_add == 6
Truncate 821 random sessions with nb_after_add == 8
Truncate 7938 random sessions with nb_after_add == 10


In [17]:
# Share of sessions per nb_after_add value within each split, after the
# validation sessions were resampled.
# rename()/rename_axis('index') replaces reset_index(name=...).set_index('index'),
# which raises KeyError on pandas >= 2.0 where the reset column is not 'index'.
frames = [
    (data_resample_valid.loc[split_mask, 'nb_after_add-last'].value_counts() / split_mask.sum())
        .rename(column_name)
        .rename_axis('index')
        .to_frame()
    for column_name, split_mask in (
        ('train_nb_after_add', train_mask),
        ('test_nb_after_add', test_mask),
        ('valid_nb_after_add', valid_mask),
    )
]
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.169499,0.564251,0.564181
2.0,0.22197,0.173,0.173023
4.0,0.159532,0.099034,0.099045
6.0,0.108929,0.069816,0.069829
8.0,0.076026,0.051938,0.051959
10.0,0.264044,0.041961,0.041962


- Generate xgb features 

In [18]:
# Third set: features from the frame whose validation sessions were resampled.
xgboost_frame = generate_xgb_feats(
    data_resample_valid,
    search_session,
    mapping,
    image_matrix,
    desc_matrix,
    mapping_id_sku_emb_position,
)

  0%|          | 556/560394 [00:00<02:30, 3730.35it/s]

Number of products present in search and not in browsing data is: 31136


100%|██████████| 560394/560394 [00:03<00:00, 145585.41it/s]
100%|██████████| 560394/560394 [00:02<00:00, 231263.02it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate before/after count statistics


100%|██████████| 262296/262296 [00:30<00:00, 8468.48it/s] 
  0%|          | 0/262296 [00:00<?, ?it/s]

generate add-to-cart (AC) features


100%|██████████| 262296/262296 [00:19<00:00, 13364.27it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate session-level features


100%|██████████| 262296/262296 [00:12<00:00, 20266.54it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate p first/last interactions


100%|██████████| 262296/262296 [00:22<00:00, 11579.98it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate similarity features


100%|██████████| 262296/262296 [03:52<00:00, 1127.92it/s]


In [19]:
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, 'xgboost_data/with_truncation_same_valid_test_nb_distribution')
# Create the target directory up front so the parquet writes below do not fail
# on a fresh checkout where it does not exist yet.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Split membership is fold-independent, so the masks are computed once.
is_test = xgboost_frame['is_test-last'] == 1
is_valid = xgboost_frame['is_valid'] == 1
is_train = (xgboost_frame['is_valid'] == 0) & (xgboost_frame['is_test-last'] == 0)

# One file per (split, fold): test-<fold>, valid-<fold>, train-<fold>.
for fold in range(1, 6):
    in_fold = xgboost_frame.fold == fold
    for split_name, split_mask in (('test', is_test), ('valid', is_valid), ('train', is_train)):
        xgboost_frame.loc[split_mask & in_fold].to_parquet(
            os.path.join(OUTPUT_FOLDER, '%s-%s.parquet' % (split_name, fold)))

In [20]:
# All test sessions, regardless of fold, go into one file.
test_full_path = os.path.join(OUTPUT_FOLDER, 'test-full.parquet')
xgboost_frame.loc[xgboost_frame['is_test-last'] == 1].to_parquet(test_full_path)

<h2> <center> Fourth XGB set </center></h2> 

   <center>  - Train/Validation sessions are truncated with respect to the artificial nb_after_add feature </center>
    
   <center>  - Resample Validation and Train to have the same distribution of nb_after_add as in test set  </center>  


In [21]:
# Share of sessions per nb_after_add value within each split, before the
# train sessions are resampled.
# rename()/rename_axis('index') replaces reset_index(name=...).set_index('index'),
# which raises KeyError on pandas >= 2.0 where the reset column is not 'index'.
frames = [
    (data_resample_valid.loc[split_mask, 'nb_after_add-last'].value_counts() / split_mask.sum())
        .rename(column_name)
        .rename_axis('index')
        .to_frame()
    for column_name, split_mask in (
        ('train_nb_after_add', train_mask),
        ('test_nb_after_add', test_mask),
        ('valid_nb_after_add', valid_mask),
    )
]
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.169499,0.564251,0.564181
2.0,0.22197,0.173,0.173023
4.0,0.159532,0.099034,0.099045
6.0,0.108929,0.069816,0.069829
8.0,0.076026,0.051938,0.051959
10.0,0.264044,0.041961,0.041962


In [22]:
for nb_after_add in [2, 4, 6, 8, 10]: 
    # Surplus of train sessions at this nb_after_add compared with the test share.
    nb_to_resample = int((relative_nb_distribution.loc[nb_after_add, 'train_nb_after_add'] - relative_nb_distribution.loc[nb_after_add, 'test_nb_after_add']) * train_mask.sum())
    print("Truncate %s random sessions with nb_after_add == %s" %(nb_to_resample, nb_after_add))
    train_session_nb = data_resample_valid[train_mask & (data_resample_valid['nb_after_add-last']==nb_after_add)].session_id_hash.values
    # NOTE(review): the draw is unseeded, so the selected sessions differ between runs.
    sample = np.random.choice(train_session_nb, nb_to_resample, replace=False)
    sample_mask = data_resample_valid.session_id_hash.isin(sample)

    truncated = resample_sessions(data_resample_valid.loc[sample_mask, :], feature_list)
    # Write back every truncated list column. The earlier code assigned the
    # result to 'product_url_hash_list' only, so the remaining list columns kept
    # their full length while nb_after_add-last was reset to 0.
    for col in feature_list:
        data_resample_valid.loc[sample_mask, col] = truncated[col]
    data_resample_valid.loc[sample_mask, 'nb_after_add-last'] = 0

Truncate 8548 random sessions with nb_after_add == 2
Truncate 10561 random sessions with nb_after_add == 4
Truncate 6827 random sessions with nb_after_add == 6
Truncate 4205 random sessions with nb_after_add == 8
Truncate 38769 random sessions with nb_after_add == 10


In [23]:
# Share of sessions per nb_after_add value within each split, after the train
# sessions were resampled as well.
# rename()/rename_axis('index') replaces reset_index(name=...).set_index('index'),
# which raises KeyError on pandas >= 2.0 where the reset column is not 'index'.
frames = [
    (data_resample_valid.loc[split_mask, 'nb_after_add-last'].value_counts() / split_mask.sum())
        .rename(column_name)
        .rename_axis('index')
        .to_frame()
    for column_name, split_mask in (
        ('train_nb_after_add', train_mask),
        ('test_nb_after_add', test_mask),
        ('valid_nb_after_add', valid_mask),
    )
]
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.564234,0.564251,0.564181
2.0,0.173005,0.173,0.173023
4.0,0.099036,0.099034,0.099045
6.0,0.069822,0.069816,0.069829
8.0,0.051938,0.051938,0.051959
10.0,0.041965,0.041961,0.041962


- Generate xgb features 

In [24]:
# Fourth set: features from the frame whose validation and train sessions were resampled.
xgboost_frame = generate_xgb_feats(
    data_resample_valid,
    search_session,
    mapping,
    image_matrix,
    desc_matrix,
    mapping_id_sku_emb_position,
)

  0%|          | 481/560394 [00:00<02:46, 3362.64it/s]

Number of products present in search and not in browsing data is: 31136


100%|██████████| 560394/560394 [00:04<00:00, 140021.63it/s]
100%|██████████| 560394/560394 [00:02<00:00, 241047.94it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate before/after count statistics


100%|██████████| 262296/262296 [00:32<00:00, 8054.16it/s] 
  0%|          | 0/262296 [00:00<?, ?it/s]

generate add-to-cart (AC) features


100%|██████████| 262296/262296 [00:18<00:00, 13820.12it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate session-level features


100%|██████████| 262296/262296 [00:12<00:00, 21644.64it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate p first/last interactions


100%|██████████| 262296/262296 [00:22<00:00, 11649.38it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate similarity features


100%|██████████| 262296/262296 [03:30<00:00, 1245.27it/s]


- Save data 

In [25]:
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, 'xgboost_data/with_truncation_same_train_valid_test_nb_distribution')
# Create the target directory up front so the parquet writes below do not fail
# on a fresh checkout where it does not exist yet.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Split membership is fold-independent, so the masks are computed once.
is_test = xgboost_frame['is_test-last'] == 1
is_valid = xgboost_frame['is_valid'] == 1
is_train = (xgboost_frame['is_valid'] == 0) & (xgboost_frame['is_test-last'] == 0)

# One file per (split, fold): test-<fold>, valid-<fold>, train-<fold>.
for fold in range(1, 6):
    in_fold = xgboost_frame.fold == fold
    for split_name, split_mask in (('test', is_test), ('valid', is_valid), ('train', is_train)):
        xgboost_frame.loc[split_mask & in_fold].to_parquet(
            os.path.join(OUTPUT_FOLDER, '%s-%s.parquet' % (split_name, fold)))

In [26]:
# All test sessions, regardless of fold, go into one file.
test_full_path = os.path.join(OUTPUT_FOLDER, 'test-full.parquet')
xgboost_frame.loc[xgboost_frame['is_test-last'] == 1].to_parquet(test_full_path)

<h2> <center> Fifth XGB set </center></h2> 

   <center>  - Train/Validation sessions are truncated with respect to the artificial nb_after_add feature </center>
    
   <center>  - Resample Validation so that its nb_after_add distribution lies halfway between the train and test distributions  </center>  


-  Create a balanced validation data taking into account Train and Test

In [None]:
# Recompute the per-split shares from the un-resampled `data`, so the balancing
# below starts from the original validation distribution.
# rename()/rename_axis('index') replaces reset_index(name=...).set_index('index'),
# which raises KeyError on pandas >= 2.0 where the reset column is not 'index'.
frames = [
    (data.loc[split_mask, 'nb_after_add-last'].value_counts() / split_mask.sum())
        .rename(column_name)
        .rename_axis('index')
        .to_frame()
    for column_name, split_mask in (
        ('train_nb_after_add', train_mask),
        ('test_nb_after_add', test_mask),
        ('valid_nb_after_add', valid_mask),
    )
]
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

In [None]:
# Separate frame for the balanced-validation variant; `data` stays unchanged.
balanced_valid = data.copy(deep=True)

In [28]:
for nb_after_add in [2, 4, 6, 8, 10]: 
    # Target share: midpoint between the train and test shares for this nb_after_add.
    balanced_percentage = (relative_nb_distribution.loc[nb_after_add, 'train_nb_after_add'] + relative_nb_distribution.loc[nb_after_add, 'test_nb_after_add']) / 2
    nb_to_resample = int((relative_nb_distribution.loc[nb_after_add, 'valid_nb_after_add'] - balanced_percentage) * valid_mask.sum())
    print("Truncate %s random sessions with nb_after_add == %s" %(nb_to_resample, nb_after_add))
    valid_session_nb = balanced_valid[valid_mask & (balanced_valid['nb_after_add-last']==nb_after_add)].session_id_hash.values
    # NOTE(review): the draw is unseeded, so the selected sessions differ between runs.
    sample = np.random.choice(valid_session_nb, nb_to_resample, replace=False)
    sample_mask = balanced_valid.session_id_hash.isin(sample)

    truncated = resample_sessions(balanced_valid.loc[sample_mask, :], feature_list)
    # Write back every truncated list column. The earlier code assigned the
    # result to 'product_url_hash_list' only, so the remaining list columns kept
    # their full length while nb_after_add-last was reset to 0.
    for col in feature_list:
        balanced_valid.loc[sample_mask, col] = truncated[col]

    balanced_valid.loc[sample_mask, 'nb_after_add-last'] = 0

Truncate 1666 random sessions with nb_after_add == 2
Truncate 1303 random sessions with nb_after_add == 4
Truncate 638 random sessions with nb_after_add == 6
Truncate 339 random sessions with nb_after_add == 8
Truncate 3495 random sessions with nb_after_add == 10


In [29]:
# Share of sessions per nb_after_add value within each split, after balancing
# the validation sessions.
# rename()/rename_axis('index') replaces reset_index(name=...).set_index('index'),
# which raises KeyError on pandas >= 2.0 where the reset column is not 'index'.
frames = [
    (balanced_valid.loc[split_mask, 'nb_after_add-last'].value_counts() / split_mask.sum())
        .rename(column_name)
        .rename_axis('index')
        .to_frame()
    for column_name, split_mask in (
        ('train_nb_after_add', train_mask),
        ('test_nb_after_add', test_mask),
        ('valid_nb_after_add', valid_mask),
    )
]
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.169499,0.564251,0.36684
2.0,0.22197,0.173,0.197491
4.0,0.159532,0.099034,0.129286
6.0,0.108929,0.069816,0.089373
8.0,0.076026,0.051938,0.064006
10.0,0.264044,0.041961,0.153004


- Generate xgboost features 

In [30]:
# Build the xgboost feature frame from the session data, the search sessions,
# the encoded mappings and the product embedding matrices.
# NOTE(review): the preceding cells truncate sessions in `balanced_valid`, while
# this call passes `data` - confirm both names refer to the same frame (not a copy).
xgboost_frame = generate_xgb_feats(
    data,
    search_session,
    mapping,
    image_matrix,
    desc_matrix,
    mapping_id_sku_emb_position,
)

  0%|          | 607/560394 [00:00<02:10, 4295.55it/s]

Number of products present in search and not in browsing data is: 31136


100%|██████████| 560394/560394 [00:03<00:00, 171581.98it/s]
100%|██████████| 560394/560394 [00:02<00:00, 229835.02it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate before/after count statistics


100%|██████████| 262296/262296 [00:32<00:00, 8131.96it/s] 
  0%|          | 0/262296 [00:00<?, ?it/s]

generate add-to-cart (AC) features


100%|██████████| 262296/262296 [00:18<00:00, 14033.20it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate session-level features


100%|██████████| 262296/262296 [00:12<00:00, 20432.57it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate p first/last interactions


100%|██████████| 262296/262296 [00:23<00:00, 11072.05it/s]
  0%|          | 0/262296 [00:00<?, ?it/s]

generate similarity features


100%|██████████| 262296/262296 [03:48<00:00, 1148.87it/s]


- Save data

In [31]:
# Write one test / valid / train parquet file per fold (folds 1 to 5).
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, 'xgboost_data/with_truncation_balanced_valid_nb_distribution')
# `to_parquet` does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Split selectors do not depend on the fold, so build them once.
# Train rows are those flagged neither as validation nor as test.
xgb_split_masks = [
    ('test', xgboost_frame['is_test-last']==1),
    ('valid', xgboost_frame['is_valid']==1),
    ('train', (xgboost_frame['is_valid']==0) & (xgboost_frame['is_test-last']==0)),
]
for fold in range(1, 6):
    in_fold = xgboost_frame.fold==fold
    for split_name, split_rows in xgb_split_masks:
        xgboost_frame.loc[split_rows & in_fold].to_parquet(
            os.path.join(OUTPUT_FOLDER, '%s-%s.parquet' % (split_name, fold)))

In [32]:
# Persist all test rows (every fold) in a single parquet file.
(
    xgboost_frame
    .loc[xgboost_frame['is_test-last'].eq(1)]
    .to_parquet(os.path.join(OUTPUT_FOLDER, 'test-full.parquet'))
)

<h2> <center> 6th XGB set - Same sessions with different split point after AC </center> </h2>

In [30]:
# Load the duplicated-sessions frame used for the 6th XGB set.
data = pd.read_parquet(
    os.path.join(
        DATA_FOLDER,
        'duplicated_sessions_with_different_nb_after_add_cuts.parquet',
    )
)

In [31]:
# Row selectors for the three splits of `data`; train rows are those flagged
# neither as validation nor as test.
valid_mask = data['is_valid']==1
test_mask = data['is_test-last']==1
train_mask = (data['is_valid']==0) & (data['is_test-last']==0)

# Per split, compute the share of sessions for every `nb_after_add-last` value,
# one column per split.
frames = []
for column_name, split_mask in [
    ('train_nb_after_add', train_mask),
    ('test_nb_after_add', test_mask),
    ('valid_nb_after_add', valid_mask),
]:
    split_counts = data[split_mask]['nb_after_add-last'].value_counts()
    # Name the column and the index explicitly instead of relying on
    # `reset_index(name=...)` producing a column called 'index': that name
    # depends on the pandas version (the index is named after the source
    # column from pandas 2.0 on).
    split_share = split_counts.rename(column_name).rename_axis('index').to_frame() / split_mask.sum()
    frames.append(split_share)
relative_nb_distribution = pd.concat(frames, axis=1)
relative_nb_distribution

Unnamed: 0_level_0,train_nb_after_add,test_nb_after_add,valid_nb_after_add
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.286357,0.564251,0.296695
2.0,0.23782,0.173,0.243032
4.0,0.174257,0.099034,0.172083
6.0,0.128574,0.069816,0.124063
8.0,0.097381,0.051938,0.092815
10.0,0.075611,0.041961,0.071312


- Generate xgboost features for the duplicated sessions

In [12]:
# Build the xgboost feature frame for the duplicated sessions.
# The call previously passed `duplicated_frame`, a name that no cell of this
# section defines; the section loads the duplicated sessions into `data` and
# derives its split masks from it, so `data` is used here as well.
# NOTE(review): confirm `data` holds the same sessions `duplicated_frame` did.
xgboost_frame = generate_xgb_feats(data, search_session, mapping, image_matrix, desc_matrix, mapping_id_sku_emb_position)

  0%|          | 625/560394 [00:00<02:14, 4170.19it/s]

Number of products present in search and not in browsing data is: 3155


100%|██████████| 560394/560394 [00:08<00:00, 66778.91it/s] 
100%|██████████| 560394/560394 [00:03<00:00, 181296.66it/s]
  0%|          | 0/792204 [00:00<?, ?it/s]

generate before/after count statistics


100%|██████████| 792204/792204 [01:33<00:00, 8510.80it/s] 
  0%|          | 0/792204 [00:00<?, ?it/s]

generate add-to-cart (AC) features


100%|██████████| 792204/792204 [00:54<00:00, 14492.83it/s]
  0%|          | 0/792204 [00:00<?, ?it/s]

generate session-level features


100%|██████████| 792204/792204 [00:33<00:00, 23643.17it/s]
  0%|          | 0/792204 [00:00<?, ?it/s]

generate p first/last interactions


100%|██████████| 792204/792204 [01:09<00:00, 11344.98it/s]
  0%|          | 0/792204 [00:00<?, ?it/s]

generate similarity features


100%|██████████| 792204/792204 [10:41<00:00, 1234.42it/s]


Create columns of first and last interactions
Merge all features


In [13]:
# Preview the first five rows of the generated feature frame.
display(xgboost_frame.head(5))

Unnamed: 0,original_session_id_hash,session_id_hash,is_purchased-last,nb_after_add-last,is_test-last,is_valid,fold,product_url_id_list_after-0,product_url_id_list_after-1,product_url_id_list_after-2,product_url_id_list_after-3,product_url_id_list_after-4,event_type_list_after-0,event_type_list_after-1,event_type_list_after-2,event_type_list_after-3,event_type_list_after-4,product_action_list_after-0,product_action_list_after-1,product_action_list_after-2,product_action_list_after-3,product_action_list_after-4,category_list_after-0,category_list_after-1,category_list_after-2,category_list_after-3,category_list_after-4,price_list_after-0,price_list_after-1,price_list_after-2,price_list_after-3,price_list_after-4,relative_price_list_after-0,relative_price_list_after-1,relative_price_list_after-2,relative_price_list_after-3,relative_price_list_after-4,product_url_id_list_before-0,product_url_id_list_before-1,product_url_id_list_before-2,product_url_id_list_before-3,product_url_id_list_before-4,event_type_list_before-0,event_type_list_before-1,event_type_list_before-2,event_type_list_before-3,event_type_list_before-4,product_action_list_before-0,product_action_list_before-1,product_action_list_before-2,product_action_list_before-3,product_action_list_before-4,category_list_before-0,category_list_before-1,category_list_before-2,category_list_before-3,category_list_before-4,price_list_before-0,price_list_before-1,price_list_before-2,price_list_before-3,price_list_before-4,relative_price_list_before-0,relative_price_list_before-1,relative_price_list_before-2,relative_price_list_before-3,relative_price_list_before-4,add_product_id,add_nb_interactions,add_has_been_detailed,add_has_been_removed,add_has_been_viewed,add_has_been_searched,add_has_been_clicked,add_category_hash,add_main_category,add_price,add_relative_price,add_relative_price_main,session_length,nb_unique_interactions,nb_queries,nb_add_before,nb_add_after,nb_detail_before,nb_detail_after,nb_remove_before,nb_remove
_after,nb_view_before,nb_view_after,nb_click_before,nb_click_after,mean_sim_desc,std_sim_desc,mean_sim_img,std_sim_img,mean_sim_desc_before,std_sim_desc_before,mean_sim_img_before,std_sim_img_before,mean_sim_desc_after,std_sim_desc_after,mean_sim_img_after,std_sim_img_after,main_category_similarity_general,main_category_similarity_add
0,4ddbca430251bfa7dc72992b0fdde75e71dba86a589386...,79745,0,0.0,1,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,53887,2,1,0,0,0,0,137,3,3,0.028997,2.215501,2,1,0.0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,4ddc1abd0d0f3d9b6a2d094ccce4554ef4b4d9cc980d6f...,79747,0,8.0,1,0,2,4943.0,15772.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,39994,8,1,0,0,0,0,2,1,3,-1.460671,-2.587784,10,3,0.0,0,0,0,0,0,1,0,0,0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.333333
2,4ddc4b1139366c4c89be5547e6593722a2e43e11f8179b...,79749,0,0.0,1,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,146875,2,1,0,0,0,0,4,1,9,2.32934,-2.587784,2,1,0.0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,4ddde879ce326d402bbcfc5411c0009cd663fc0de824aa...,79757,0,0.0,1,0,5,112854.0,46772.0,45686.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,103316,3,1,0,0,0,1,137,3,8,0.028997,2.215501,6,4,1.0,0,0,0,0,0,1,0,0,0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.25
4,4ddece6df88b7f61dcc54b9de7f35bec99bf6c048f12b3...,79762,0,0.0,1,0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028997,0.028997,0.028997,0.028997,0.028997,14374,2,1,0,0,0,0,137,3,6,0.028997,2.215501,2,1,0.0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [14]:
# Write one test / valid / train parquet file per fold (folds 1 to 5).
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, 'xgboost_data/duplicated_sessions_with_truncation')
# `to_parquet` does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Split selectors do not depend on the fold, so build them once.
# Train rows are those flagged neither as validation nor as test.
xgb_split_masks = [
    ('test', xgboost_frame['is_test-last']==1),
    ('valid', xgboost_frame['is_valid']==1),
    ('train', (xgboost_frame['is_valid']==0) & (xgboost_frame['is_test-last']==0)),
]
for fold in range(1, 6):
    in_fold = xgboost_frame.fold==fold
    for split_name, split_rows in xgb_split_masks:
        xgboost_frame.loc[split_rows & in_fold].to_parquet(
            os.path.join(OUTPUT_FOLDER, '%s-%s.parquet' % (split_name, fold)))

In [15]:
# Persist all test rows (every fold) in a single parquet file.
(
    xgboost_frame
    .loc[xgboost_frame['is_test-last'].eq(1)]
    .to_parquet(os.path.join(OUTPUT_FOLDER, 'test-full.parquet'))
)