In [17]:
import pandas as pd
import numpy as np
import glob
import os
import itertools
from itertools import permutations 
from tqdm import tqdm
from collections import defaultdict, Counter
from copy import deepcopy
import gc

In [7]:
import pyarrow
import pyarrow.parquet as pq

In [8]:
INPUT_PARQUET_PATH = "/home/gmoreira/dataset/ecommerce_preproc_2019-*/ecommerce_preproc.parquet/session_start_date=*"
#OUTPUT_NEG_SAMPLES_PARQUET_PATH = "/home/gmoreira/dataset/neg_samples.parquet"

In [9]:
FIRST_N_SESSIONS_PER_DAY = 10000

In [10]:
UNIFORM_SAMPLING = 'uniform'
RECENCY_SAMPLING = 'recency'
RECENT_POPULARITY_SAMPLING = 'popularity'
COOCURRENCE_SAMPLING = 'cooccurrence'

In [11]:
NEGATIVE_SAMPLING_STRATEGY = COOCURRENCE_SAMPLING

In [12]:
BATCH_SIZE = 1000
BATCHES_TO_UPDATE_ITEM_STATS = 3
BATCHES_TO_APPEND_ROWS_WITH_NEG_SAMPLES = 5
ITEM_STATS_KEEP_LAST_N_DAYS = 1.0
SEQUENCE_LENGTH = 20
NUM_NEG_SAMPLES = 50

In [13]:
input_parquet_files = sorted(glob.glob(INPUT_PARQUET_PATH+'*'))
input_parquet_files

['/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-01',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-02',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-03',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-04',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-05',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-06',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-07',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-08',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-09',
 '/home/gmoreira/da

In [14]:
def get_output_path_parquet_neg_samples(input_parquet_filename):
    return input_parquet_filename \
        .replace('ecommerce_preproc.parquet', 'ecommerce_preproc_neg_samples_{}_strategy_{}.parquet' \
                     .format(NUM_NEG_SAMPLES, NEGATIVE_SAMPLING_STRATEGY)) + '.parquet'

In [58]:
'''
def get_files(data_paths):
    paths = [['file://' + p for p in glob.glob(path + "/*.parquet")] for path in data_paths]
    return list(itertools.chain.from_iterable(paths))
input_parquet_files = get_files([INPUT_PARQUET_PATH])
'''
pass

In [10]:
'''
#Works but cannot be used now because the preprocessed sessions are not sorted by timestamp
def read_parquet_generator(filenames, batch_size=128):
    for filename in filenames:
        for batch in pq.read_table(filename).to_batches(batch_size):
            yield batch.to_pandas()
            
parquet_reader = read_parquet_generator([INPUT_PARQUET_PATH], batch_size=BATCH_SIZE)            
'''
pass

In [15]:
def split_dataframe_into_chuncks_generator(df, chunk_size): 
    number_chunks = len(df) // chunk_size + 1
    for i in range(number_chunks):
        yield df[i*chunk_size:(i+1)*chunk_size]

In [16]:
def insert_update_session_items_metadata(row):
    #Uses the session start as the event timestamp (as the sess_etime_seq might sometimes be many days before because of outlier sessions with more than 120 min duration (< 1%))
    etime = row['session_start_ts']
    #For each session
    for pid, csid, ccid, bid, price, relative_price, prod_recency in zip(
                                                        #row['sess_etime_seq'],
                                                        row['sess_pid_seq'], 
                                                        row['sess_csid_seq'],
                                                        row['sess_ccid_seq'],
                                                        row['sess_bid_seq'],
                                                        row['sess_price_seq'],
                                                        row['sess_relative_price_to_avg_category_seq'],
                                                        row['sess_product_recency_seq']):

        #If this item was not processed before
        if pid != 0:
            if pid in items_df.index:
                curr_row = items_df.loc[pid]

                first_ts = curr_row['first_ts']
                last_ts = curr_row['last_ts']
                if etime > last_ts:
                    last_ts = etime
            else:
                first_ts = etime
                last_ts = etime

            #Including or updating the item metadata
            items_df.loc[pid] = pd.Series({'csid': csid,
                                           'ccid': ccid,
                                           'bid': bid,
                                           'price': price,
                                           'relative_price_to_avg_category': relative_price,
                                           'product_recency': prod_recency,
                                           'first_ts': first_ts,
                                           'last_ts': last_ts})

In [17]:
session_cooccurrences_log_list = []

def append_session_coocurrences_log(row):
    global session_cooccurrences_log_list
    min_ts = min([t for t in row['sess_etime_seq'] if t != 0])
    valid_pids = list(set(list([p for p in row['sess_pid_seq'] if p != 0])))
    
    if len(valid_pids) > 1:
        items_permutations = permutations(valid_pids, 2)        
        new_coo_df = pd.DataFrame(items_permutations, columns=['pid_a', 'pid_b'])
        new_coo_df['ts'] = min_ts
        #This flag is used for counting unique values from this table to compute popularity
        new_coo_df['count_flag'] = ([1] + [0]*(len(valid_pids)-2))*len(valid_pids)
        
        session_cooccurrences_log_list.append(new_coo_df)

def concat_sessions_coocurrences_log():
    global items_coocurrence_df, session_cooccurrences_log_list
    items_coocurrence_df = pd.concat([items_coocurrence_df] + session_cooccurrences_log_list)
    session_cooccurrences_log_list = []

In [18]:
def remove_old_interactions(keep_last_n_days):
    global items_coocurrence_df
    last_ts = items_coocurrence_df['ts'].max()
    keep_last_n_secs = keep_last_n_days * 24 * 60 * 60
    items_coocurrence_df = items_coocurrence_df[items_coocurrence_df['ts'] >= (last_ts - keep_last_n_secs)]

In [19]:
def update_items_temporal_relevance_decay():    
    global items_temporal_relev_df
    max_reference_ts = items_df['first_ts'].max()
    prods_days_age = (max_reference_ts - items_df['first_ts']) / (60 * 60 * 24)

    time_relev_by_item_series = prod_relevance_decay(prods_days_age)
    items_temporal_relev_df = time_relev_by_item_series / time_relev_by_item_series.sum()

In [20]:
def update_items_coocurrences_counts():
    global items_coocurence_counts_df
    items_coocurence_counts_df = items_coocurrence_df.groupby(['pid_a','pid_b']).size().to_frame('count') \
                                    .reset_index(level=[1])

In [21]:
def update_items_recent_popularity():
    global items_recent_pop_df
    items_recent_pop_df = items_coocurrence_df[items_coocurrence_df['count_flag'] == True] \
            .groupby(['pid_a']).size().to_frame('count')
    items_recent_pop_df['prob'] = items_recent_pop_df['count'] / items_recent_pop_df['count'].sum()

In [22]:
# (83% of relevance in one quarter, 70% in one semester, 50% in one year and 23% in two years)
DAYS_DECAY_FACTOR = 0.002

def prod_relevance_decay(days_age):
    return np.exp(-days_age*DAYS_DECAY_FACTOR)

In [23]:
# (83% of relevance in one quarter, 70% in one semester, 50% in one year and 23% in two years)
DAYS_DECAY_FACTOR = 0.002
# Simulating 2 year of decay on relevance of a product 
for i in np.arange(0,365*2,30):    
    print(i, prod_relevance_decay(i))

0 1.0
30 0.9417645335842487
60 0.8869204367171575
90 0.835270211411272
120 0.7866278610665535
150 0.7408182206817179
180 0.697676326071031
210 0.6570468198150567
240 0.6187833918061408
270 0.5827482523739896
300 0.5488116360940264
330 0.5168513344916992
360 0.4867522559599717
390 0.4584060113052235
420 0.43171052342907973
450 0.4065696597405991
480 0.38289288597511206
510 0.3605949401730783
540 0.3395955256449391
570 0.31981902181630384
600 0.30119421191220214
630 0.2836540264997704
660 0.26713530196585034
690 0.25157855305975646
720 0.23692775868212176


In [24]:
items_df = None
items_coocurrence_df = None
items_coocurence_counts_df = None
items_recent_pop_df = None
items_temporal_relev_df = None

def reset_item_logs_and_statistics():
    global items_df, items_coocurrence_df, items_coocurence_counts_df, items_recent_pop_df, items_temporal_relev_df
    
    items_df = pd.DataFrame(columns={'pid': np.int64,
                                 'csid': np.int32,
                                 'ccid': np.int32,
                                 'bid': np.int32,
                                 'price': np.float,
                                 'relative_price_to_avg_category': np.float,
                                 'product_recency': np.float,
                                 'first_ts': np.int,
                                 'last_ts': np.int
                                }).set_index('pid')
    
    items_coocurrence_df = pd.DataFrame(columns={'pid_a': np.int64, 
                                                 'pid_b': np.int64, 
                                                 'ts': np.int32, 
                                                 'count_flag': np.int16})
    
    items_coocurence_counts_df = None
    items_recent_pop_df = None
    items_temporal_relev_df = None

In [26]:
def get_uniform_sampling_item_ids(n_samples):
    return np.random.choice(items_df.index, min(n_samples, len(items_df)), replace=False)

In [27]:
def get_popularity_sampling_item_ids(n_samples):
    return np.random.choice(items_recent_pop_df.index, min(n_samples, len(items_recent_pop_df)), replace=False, 
                            p=items_recent_pop_df['prob']).tolist()

In [28]:
def get_coocurrence_sampling_item_ids(pid, n_samples):
    samples = []
    if pid in items_coocurence_counts_df.index:
        coocurrent_df = items_coocurence_counts_df.loc[pid]
        #Dealing with cases when there is only one co-occurrent item (loc() returns a Series)
        if type(coocurrent_df) is pd.Series:
            coocurrent_df = coocurrent_df.to_frame().T
        coocurrent_df['probs'] = coocurrent_df['count'] / coocurrent_df['count'].sum()
        samples = np.random.choice(coocurrent_df['pid_b'], min(n_samples, len(coocurrent_df)), replace=False, 
                                   p=coocurrent_df['probs']).tolist()
    return samples

In [29]:
def get_recency_sampling_item_ids(n_samples):
    samples = np.random.choice(items_temporal_relev_df.index, min(n_samples, len(items_temporal_relev_df)), replace=False, 
                               p=items_temporal_relev_df.values).tolist()
    return samples

In [30]:
def get_candidate_samples_item_ids(pid, n_samples, strategy, ignore_list=None):
    #To ensure that after removing sessions from the current session we have the required number of samples
    SAMPLES_MULITPLIER = 2
    if strategy == UNIFORM_SAMPLING:
        samples = get_uniform_sampling_item_ids(n_samples*SAMPLES_MULITPLIER)
    elif strategy == RECENCY_SAMPLING:
        samples = get_recency_sampling_item_ids(n_samples*SAMPLES_MULITPLIER)
    elif strategy == RECENT_POPULARITY_SAMPLING:
        samples = get_popularity_sampling_item_ids(n_samples*SAMPLES_MULITPLIER)    
    elif strategy == COOCURRENCE_SAMPLING:
        samples = get_coocurrence_sampling_item_ids(pid, n_samples*SAMPLES_MULITPLIER)
    
        #Completing the list of samples based on global popularity
        if len(samples) < n_samples:
            samples += get_popularity_sampling_item_ids(n_samples - len(samples))
    else:
        raise Exception('Not a valid strategy. Should be: (uniform|recency|popularity|cooccurrence)')
        
    #Removing repeated entries
    samples = list(set(samples))
    #Removing samples from the ignore list
    if ignore_list is not None:
        samples = list([i for i in samples if i not in ignore_list])

    return samples[:n_samples]

In [31]:
#get_candidate_samples_item_ids(63246, 10, strategy=COOCURRENCE_SAMPLING, ignore_list=[2010])

In [33]:
def get_features_for_item_ids(pids):
    return items_df.loc[pids][['csid', 'ccid', 'bid', 'price', 'relative_price_to_avg_category', 'product_recency']] \
    .to_dict(orient='list')

In [28]:
##%%time
#l= np.random.choice(items_df.index, 50)
#l= np.ones(50, dtype='int')*10
#l = [10] * 50
#l = [33284, 6, 21, 2588, 23069, 1570, 8230, 552, 50, 4152, 55864, 24636, 113213, 65601, 9283, 1097, 1104, 90, 2651, 607, 13920, 7785, 619, 8821, 9337, 4221, 8330, 7307, 8848, 19089, 112784, 2197, 15002, 3234, 3252, 1207, 2745, 40128, 30918, 4299, 61646, 23247, 213, 6871, 8921, 15582, 6367, 15077, 8935, 3820, 1266, 1790, 256, 258, 3844, 1808, 14109, 1311, 8481, 3362, 1318, 2348, 301, 29996, 19245, 1331, 3891, 10553, 830, 1855, 1867, 3415, 2933, 8571, 21371, 390, 903, 1425, 404, 5527, 1958, 23477, 6071, 59321, 7100, 4545, 1479, 28628, 10200, 16857, 3544, 479, 480, 6628, 35812, 2032, 3059, 28153, 2046]
#for i in range(100):
#    for j in range(10):
#        x = get_features_for_item_ids(l)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs


In [34]:
def padarray(A, size):
    if len(A) > size:
        A = A[:size]
    t = size - len(A)
    return np.pad(A, pad_width=(0, t), mode='constant')

In [35]:
padarray([1,2,3], 4)

array([1, 2, 3, 0])

In [36]:
def generate_neg_samples(session_pids, user_past_pids, n_samples, strategy):
    neg_samples_dict = defaultdict(list)
    
    #Ignores session items and also recently interacted items
    ignore_ids = set(np.hstack([session_pids, user_past_pids]))
    
    for pid in session_pids:
        if pid != 0:
            #Sampling item idds
            neg_item_ids = get_candidate_samples_item_ids(pid, n_samples, 
                                                          ignore_list=ignore_ids,
                                                         strategy=strategy
                                                         )      
            #Retrieving item features
            neg_item_features_dict = get_features_for_item_ids(neg_item_ids)

            
            pids_padded = padarray(neg_item_ids, n_samples).astype(int)
            neg_samples_dict['sess_neg_pids'].append(pids_padded)
            
            for k, v in neg_item_features_dict.items():
                values = padarray(v, n_samples)
                values = values.astype(int) if k in ['csid', 'ccid', 'bid'] else values.astype(float)
                neg_samples_dict['sess_neg_{}'.format(k)].append(values)
            
        
        else:
            #Creating padding neg samples for each padding interactions
            missing_padding_neg_samples = len(session_pids) - len(neg_samples_dict['sess_neg_pids'])                        
            for k in neg_samples_dict:
                neg_samples = neg_samples_dict[k]
                neg_samples_zeros = np.zeros_like(neg_samples[0])
                for p in range(missing_padding_neg_samples):
                    #Copying shape and dtype from the neg samples of the first interaction
                    neg_samples.append(neg_samples_zeros)
         
    #Concatenating neg. samples of all session interactions because Petastorm data loader 
    #does not support lists of lists. It will require reshaping neg. samples features inside the Pytorch model
    for k in neg_samples_dict:  
        neg_samples_dict[k] = np.hstack(neg_samples_dict[k])        

    return neg_samples_dict

In [32]:
#generate_neg_samples([10,20,30, 0, 0], n_samples=2, strategy='popularity')

In [38]:
def append_new_rows_to_parquet(new_rows_df, path):
    global pq_writer
    new_rows_pa = pyarrow.Table.from_pandas(new_rows_df)
    if pq_writer is None:
        #Creating parent folder recursively
        parent_folder = os.path.dirname(os.path.abspath(path))
        if not os.path.exists(parent_folder):
            os.makedirs(parent_folder)
        #Creating parquet file
        pq_writer = pq.ParquetWriter(path, new_rows_pa.schema) 
    pq_writer.write_table(new_rows_pa)

## Generates neg. samples for all sessions and creates new parquet files

In [None]:
FIRST_N_SESSIONS_PER_DAY

In [39]:
pq_writer = None

reset_item_logs_and_statistics()
try:
    #For each file (day)
    for idx_day, input_file in enumerate(input_parquet_files):
        print('='*40)
        print('[Day {}] Loading sessions from parquet: {}'.format(idx_day, input_file))
        output_filename = get_output_path_parquet_neg_samples(input_file)
        
        if os.path.exists(output_filename):
            raise Exception('Output parquet file already exists')
        
        #Loading parquet file and sorting sessions by timestamp
        sessions_df = pd.read_parquet(input_file)
        sessions_df.sort_values('session_start_ts', inplace=True)
        
        #TEMP: Limiting the number of negative samples per day for faster processing
        sessions_df = sessions_df[:FIRST_N_SESSIONS_PER_DAY]        
                
        new_rows = []
        
        print('Processing batches')
        #For each batch
        for batch_id, batch in tqdm(enumerate(split_dataframe_into_chuncks_generator(sessions_df, 
                                                                                chunk_size = BATCH_SIZE))):
            print('batch_id', batch_id)            
            #For each row (session)
            for i, row in batch.iterrows():
                insert_update_session_items_metadata(row)
                append_session_coocurrences_log(row)
                
                
                #Ignoring first batch (not computing neg. samples nor saving to parquet)
                if batch_id > 0:   
                    #Generating neg. samples for each interaction in the session
                    session_neg_samples_by_pid_dict = generate_neg_samples(row['sess_pid_seq'], 
                                                                           row['user_pid_seq_bef_sess'],
                                                                           NUM_NEG_SAMPLES, 
                                                                           strategy=NEGATIVE_SAMPLING_STRATEGY
                                                                          )
                    #Merging user and session features with neg samples for the session
                    new_row_with_neg_samples_dict = {**row.to_dict(), **session_neg_samples_by_pid_dict}
                    new_rows.append(new_row_with_neg_samples_dict)
                    
            
            #Each N batches updates item statistics (popularity, recency, co-occurrence)
            #Ps. Do the update for all the first five batches of the first file , for better sampling
            if (batch_id % BATCHES_TO_UPDATE_ITEM_STATS == BATCHES_TO_UPDATE_ITEM_STATS-1) or \
               (idx_day == 0 and batch_id < 5):
                print('[Batch {}] Updating item stats'.format(batch_id))
                remove_old_interactions(ITEM_STATS_KEEP_LAST_N_DAYS)
                if NEGATIVE_SAMPLING_STRATEGY in [RECENT_POPULARITY_SAMPLING, COOCURRENCE_SAMPLING]:
                    concat_sessions_coocurrences_log()
                    update_items_coocurrences_counts()
                    if RECENT_POPULARITY_SAMPLING:
                        update_items_recent_popularity()
                if NEGATIVE_SAMPLING_STRATEGY == RECENCY_SAMPLING:
                    update_items_temporal_relevance_decay()
                
            #Each N batches appends the new rows with neg. samples to parquet file
            if batch_id % BATCHES_TO_APPEND_ROWS_WITH_NEG_SAMPLES == BATCHES_TO_APPEND_ROWS_WITH_NEG_SAMPLES-1: 
                print('[Batch {}] Appending new rows with neg samples to parquet: {}'.format(batch_id,output_filename))
                append_new_rows_to_parquet(pd.DataFrame(new_rows), output_filename)
                del(new_rows)
                new_rows = []
            
               
        #Save pending rows
        if len(new_rows) > 0:
            print('[Batch {}] Appending new rows with neg samples to parquet: {}'.format(batch_id,output_filename))
            append_new_rows_to_parquet(pd.DataFrame(new_rows), output_filename)
            del(new_rows)
            new_rows = []
            
        #Flushing and releasing the current parquet file and proceeding for the new date
        pq_writer.close()
        pq_writer = None
                
        del(sessions_df)
        gc.collect()
finally:
    if pq_writer:
        pq_writer.close()

[Day 0] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-01


0it [00:00, ?it/s]

Processing batches
batch_id 0
[Batch 0] Updating item stats


1it [00:09,  9.00s/it]

batch_id 1
[Batch 1] Updating item stats


2it [00:31, 12.92s/it]

batch_id 2
[Batch 2] Updating item stats


3it [00:50, 14.91s/it]

batch_id 3
[Batch 3] Updating item stats


4it [01:12, 16.98s/it]

batch_id 4
[Batch 4] Updating item stats
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-01.parquet


5it [01:41, 20.58s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:05, 21.67s/it]

batch_id 6


7it [02:27, 21.67s/it]

batch_id 7


8it [02:48, 21.48s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:12, 22.24s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-01.parquet


11it [03:41, 20.16s/it]


batch_id 10
[Day 1] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-02


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:08,  8.92s/it]

batch_id 1


2it [00:29, 12.29s/it]

batch_id 2
[Batch 2] Updating item stats


3it [00:55, 16.42s/it]

batch_id 3


4it [01:17, 18.07s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-02.parquet


5it [01:40, 19.64s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:05, 21.31s/it]

batch_id 6


7it [02:25, 20.83s/it]

batch_id 7


8it [02:47, 21.32s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:13, 22.51s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-02.parquet


11it [03:37, 19.77s/it]


batch_id 10
[Day 2] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-03


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:09,  9.35s/it]

batch_id 1


2it [00:33, 13.81s/it]

batch_id 2
[Batch 2] Updating item stats


3it [01:01, 18.13s/it]

batch_id 3


4it [01:26, 20.26s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-03.parquet


5it [01:55, 22.86s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:27, 25.55s/it]

batch_id 6


7it [02:50, 24.80s/it]

batch_id 7


8it [03:18, 25.64s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:49, 27.27s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-03.parquet


11it [04:14, 23.10s/it]


batch_id 10
[Day 3] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-04


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:07,  7.93s/it]

batch_id 1


2it [00:34, 13.39s/it]

batch_id 2
[Batch 2] Updating item stats


3it [01:02, 17.94s/it]

batch_id 3


4it [01:20, 17.98s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-04.parquet


5it [01:43, 19.29s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:06, 20.56s/it]

batch_id 6


7it [02:26, 20.35s/it]

batch_id 7


8it [02:47, 20.47s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:07, 20.41s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-04.parquet


11it [03:29, 19.01s/it]


batch_id 10
[Day 4] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-05


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:08,  8.52s/it]

batch_id 1


2it [00:30, 12.45s/it]

batch_id 2
[Batch 2] Updating item stats


3it [00:52, 15.44s/it]

batch_id 3


4it [01:14, 17.44s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-05.parquet


5it [01:37, 18.99s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:01, 20.67s/it]

batch_id 6


7it [02:22, 20.74s/it]

batch_id 7


8it [02:47, 22.06s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:11, 22.42s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-05.parquet


11it [03:34, 19.51s/it]


batch_id 10
[Day 5] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-06


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:07,  7.71s/it]

batch_id 1


2it [00:31, 12.48s/it]

batch_id 2
[Batch 2] Updating item stats


3it [00:57, 16.50s/it]

batch_id 3


4it [01:21, 18.91s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-06.parquet


5it [01:46, 20.79s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:14, 22.85s/it]

batch_id 6


7it [02:38, 23.14s/it]

batch_id 7


8it [03:04, 24.14s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:29, 24.15s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-06.parquet


11it [04:01, 21.99s/it]


batch_id 10
[Day 6] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-07


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:10, 10.02s/it]

batch_id 1


2it [00:30, 13.11s/it]

batch_id 2
[Batch 2] Updating item stats


3it [00:52, 15.93s/it]

batch_id 3


4it [01:14, 17.64s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-07.parquet


5it [01:37, 19.27s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:00, 20.24s/it]

batch_id 6


7it [02:21, 20.57s/it]

batch_id 7


8it [02:45, 21.56s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:06, 21.54s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-07.parquet


11it [03:30, 19.15s/it]


batch_id 10
[Day 7] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-08


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:06,  6.84s/it]

batch_id 1


2it [00:30, 11.98s/it]

batch_id 2
[Batch 2] Updating item stats


3it [00:54, 15.41s/it]

batch_id 3


4it [01:19, 18.52s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-08.parquet


5it [01:46, 20.92s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:12, 22.48s/it]

batch_id 6


7it [02:35, 22.66s/it]

batch_id 7


8it [03:00, 23.25s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:29, 24.99s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-08.parquet


11it [04:01, 21.97s/it]


batch_id 10
[Day 8] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-09


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:09,  9.11s/it]

batch_id 1


2it [00:39, 15.49s/it]

batch_id 2
[Batch 2] Updating item stats


3it [01:07, 19.10s/it]

batch_id 3


4it [01:34, 21.73s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-09.parquet


5it [02:08, 25.41s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:34, 25.41s/it]

batch_id 6


7it [02:58, 25.04s/it]

batch_id 7


8it [03:26, 26.04s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:50, 25.31s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-09.parquet


11it [04:16, 23.30s/it]


batch_id 10
[Day 9] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-10


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:08,  8.17s/it]

batch_id 1


2it [00:31, 12.71s/it]

batch_id 2
[Batch 2] Updating item stats


3it [00:59, 17.26s/it]

batch_id 3


4it [01:24, 19.70s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-10.parquet


5it [01:52, 22.21s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:25, 25.26s/it]

batch_id 6


7it [02:53, 26.23s/it]

batch_id 7


8it [03:24, 27.75s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:53, 27.98s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-10.parquet


11it [04:14, 23.16s/it]


batch_id 10
[Day 10] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-11


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:09,  9.33s/it]

batch_id 1


2it [00:31, 13.17s/it]

batch_id 2
[Batch 2] Updating item stats


3it [00:56, 16.59s/it]

batch_id 3


4it [01:14, 17.16s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-11.parquet


5it [01:41, 20.05s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:05, 21.18s/it]

batch_id 6


7it [02:27, 21.48s/it]

batch_id 7


8it [02:46, 20.64s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:11, 22.08s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-11.parquet


11it [03:34, 19.47s/it]


batch_id 10
[Day 11] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-12


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:08,  8.64s/it]

batch_id 1


2it [00:38, 14.94s/it]

batch_id 2
[Batch 2] Updating item stats


3it [01:06, 18.97s/it]

batch_id 3


4it [01:34, 21.52s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-12.parquet


5it [01:57, 22.22s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:27, 24.49s/it]

batch_id 6


7it [02:51, 24.18s/it]

batch_id 7


8it [03:16, 24.47s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:43, 25.40s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-12.parquet


11it [04:08, 22.59s/it]


batch_id 10
[Day 12] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-13


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:09,  9.73s/it]

batch_id 1


2it [00:36, 14.99s/it]

batch_id 2
[Batch 2] Updating item stats


3it [01:11, 20.75s/it]

batch_id 3


4it [01:42, 23.93s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-13.parquet


5it [02:16, 27.02s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:51, 29.31s/it]

batch_id 6


7it [03:24, 30.50s/it]

batch_id 7


8it [03:54, 30.17s/it]

batch_id 8
[Batch 8] Updating item stats


9it [04:19, 28.82s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-13.parquet


11it [04:46, 26.06s/it]


batch_id 10
[Day 13] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-14


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:07,  7.89s/it]

batch_id 1


2it [00:29, 12.03s/it]

batch_id 2
[Batch 2] Updating item stats


3it [00:51, 15.08s/it]

batch_id 3


4it [01:13, 16.99s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-14.parquet


5it [01:36, 18.92s/it]

batch_id 5
[Batch 5] Updating item stats


6it [01:59, 19.98s/it]

batch_id 6


7it [02:20, 20.30s/it]

batch_id 7


8it [02:43, 21.11s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:06, 21.67s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-14.parquet


11it [03:32, 19.28s/it]


batch_id 10
[Day 14] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-15


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:08,  8.68s/it]

batch_id 1


2it [00:29, 12.22s/it]

batch_id 2
[Batch 2] Updating item stats


3it [00:51, 15.35s/it]

batch_id 3


4it [01:12, 16.95s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-15.parquet


5it [01:37, 19.22s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:07, 22.74s/it]

batch_id 6


7it [02:36, 24.38s/it]

batch_id 7


8it [03:05, 25.92s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:35, 27.07s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-15.parquet


11it [04:16, 23.29s/it]


batch_id 10
[Day 15] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-16


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:12, 12.05s/it]

batch_id 1


2it [00:38, 16.35s/it]

batch_id 2
[Batch 2] Updating item stats


3it [01:13, 22.02s/it]

batch_id 3


4it [01:48, 25.89s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-16.parquet


5it [02:19, 27.25s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:55, 30.03s/it]

batch_id 6


7it [03:23, 29.34s/it]

batch_id 7


8it [03:46, 27.53s/it]

batch_id 8
[Batch 8] Updating item stats


9it [04:12, 27.11s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-16.parquet


11it [04:36, 25.10s/it]


batch_id 10
[Day 16] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-17


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:08,  8.01s/it]

batch_id 1


2it [00:28, 11.85s/it]

batch_id 2
[Batch 2] Updating item stats


3it [00:55, 16.26s/it]

batch_id 3


4it [01:21, 19.27s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-17.parquet


5it [01:46, 21.08s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:11, 22.11s/it]

batch_id 6


7it [02:35, 22.62s/it]

batch_id 7


8it [02:59, 23.21s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:22, 23.14s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-17.parquet


11it [03:48, 20.76s/it]


batch_id 10
[Day 17] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-18


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:07,  7.31s/it]

batch_id 1


2it [00:33, 12.90s/it]

batch_id 2
[Batch 2] Updating item stats


3it [01:02, 17.71s/it]

batch_id 3


4it [01:25, 19.47s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-18.parquet


5it [01:53, 22.05s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:19, 23.04s/it]

batch_id 6


7it [02:45, 24.04s/it]

batch_id 7


8it [03:05, 22.74s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:29, 23.29s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-18.parquet


11it [03:54, 21.32s/it]


batch_id 10
[Day 18] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-19


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:09,  9.54s/it]

batch_id 1


2it [00:37, 15.02s/it]

batch_id 2
[Batch 2] Updating item stats


3it [01:04, 18.53s/it]

batch_id 3


4it [01:29, 20.50s/it]

batch_id 4
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-19.parquet


5it [01:57, 22.79s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:27, 24.94s/it]

batch_id 6


7it [02:53, 25.18s/it]

batch_id 7


8it [03:19, 25.48s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:45, 25.59s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-19.parquet


11it [04:14, 23.12s/it]


batch_id 10
[Day 19] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-20


0it [00:00, ?it/s]

Processing batches
batch_id 0


1it [00:07,  7.27s/it]

batch_id 1


2it [00:32, 12.60s/it]

batch_id 2
[Batch 2] Updating item stats


3it [01:02, 17.77s/it]

batch_id 3


4it [01:28, 20.38s/it]

batch_id 4


KeyboardInterrupt: 

## Loading the parquet with Negative samples with Petastorm

In [48]:
from petastorm.pytorch import DataLoader
from petastorm import make_batch_reader
from petastorm.unischema import UnischemaField
from petastorm.unischema import Unischema
from petastorm.codecs import NdarrayCodec

In [49]:
input_with_neg_parquet_path = 'file:///home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-01_full.parquet'

In [50]:
recsys_schema_full = [
  UnischemaField('user_idx', np.int, (), None, True),
#   UnischemaField('user_session', str_, (), None, True),
  UnischemaField('sess_seq_len', np.int, (), None, False),
  UnischemaField('session_start_ts', np.int64, (), None, True),
  UnischemaField('user_seq_length_bef_sess', np.int, (), None, False),
  UnischemaField('user_elapsed_days_bef_sess', np.float, (), None, True),
  UnischemaField('user_elapsed_days_log_bef_sess_norm', np.double, (), None, True),
  UnischemaField('sess_pid_seq', np.int64, (None,), None, True),
  UnischemaField('sess_etime_seq', np.int64, (None,), None, True),
  UnischemaField('sess_etype_seq', np.int, (None,), None, True),
  UnischemaField('sess_csid_seq', np.int, (None,), None, True),
  UnischemaField('sess_ccid_seq', np.int, (None,), None, True),
  UnischemaField('sess_bid_seq', np.int, (None,), None, True),
  UnischemaField('sess_price_seq', np.float, (None,), None, True),
  UnischemaField('sess_dtime_seq', np.float, (None,), None, True),
  UnischemaField('sess_product_recency_seq', np.float, (None,), None, True),
  UnischemaField('sess_relative_price_to_avg_category_seq', np.float, (None,), None, True),
  UnischemaField('sess_et_hour_sin_seq', np.float, (None,), None, True),
  UnischemaField('sess_et_hour_cos_seq', np.float, (None,), None, True),
  UnischemaField('sess_et_month_sin_seq', np.float, (None,), None, True),
  UnischemaField('sess_et_month_cos_seq', np.float, (None,), None, True),
  UnischemaField('sess_et_dayofweek_sin_seq', np.float, (None,), None, True),
  UnischemaField('sess_et_dayofweek_cos_seq', np.float, (None,), None, True),
  UnischemaField('sess_et_dayofmonth_sin_seq', np.float, (None,), None, True),
  UnischemaField('sess_et_dayofmonth_cos_seq', np.float, (None,), None, True),
  UnischemaField('user_pid_seq_bef_sess', np.int64, (None,), None, True),
  UnischemaField('user_etime_seq_bef_sess', np.int64, (None,), None, True),
  UnischemaField('user_etype_seq_bef_sess', np.int, (None,), None, True),
  UnischemaField('user_csid_seq_bef_sess', np.int, (None,), None, True),
  UnischemaField('user_ccid_seq_bef_sess', np.int, (None,), None, True),
  UnischemaField('user_bid_seq_bef_sess', np.int, (None,), None, True),
  UnischemaField('user_price_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_dtime_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_product_recency_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_relative_price_to_avg_category_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_et_hour_sin_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_et_hour_cos_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_et_month_sin_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_et_month_cos_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_et_dayofweek_sin_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_et_dayofweek_cos_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_et_dayofmonth_sin_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_et_dayofmonth_cos_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('user_et_dayofmonth_cos_seq_bef_sess', np.float, (None,), None, True),
  UnischemaField('sess_neg_pids', np.int, (None,), None, True),
  UnischemaField('sess_neg_csid', np.int, (None,), None, True),
  UnischemaField('sess_neg_ccid', np.int, (None,), None, True),
  UnischemaField('sess_neg_bid', np.int, (None,), None, True),
  UnischemaField('sess_neg_price', np.float, (None,), None, True),
  UnischemaField('sess_neg_relative_price_to_avg_category', np.float, (None,), None, True),
  UnischemaField('sess_neg_product_recency', np.float, (None,), None, True),
]

In [51]:
with DataLoader(
    make_batch_reader(input_with_neg_parquet_path, 
                num_epochs=1,
                # transform_spec=transform
                schema_fields=recsys_schema_full,
    ), batch_size=2) as train_loader:
    for i, batch in enumerate(train_loader):
        print("i:{}".format(i))
        print(batch)
        print(batch['sess_neg_product_recency'].shape)
        break

  'Unischema').format(f.name))
  column_as_pandas = column.data.chunks[0].to_pandas()


i:0
{'user_idx': tensor([550479254, 547827437]), 'sess_seq_len': tensor([21,  3]), 'session_start_ts': tensor([1569921441, 1569921441]), 'user_seq_length_bef_sess': tensor([0, 0]), 'user_elapsed_days_bef_sess': tensor([nan, nan], dtype=torch.float64), 'user_elapsed_days_log_bef_sess_norm': tensor([nan, nan], dtype=torch.float64), 'sess_pid_seq': tensor([[125685,   5713,   3066,  11869,  10670,   4312,  19814,  63599,   8931,
           6909,   7571,   3390,   1697,   3616,  21046,   2666,   7386,   3976,
          23679,  28513],
        [   515,     57,    331,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0]]), 'sess_etime_seq': tensor([[1569921441, 1569921527, 1569921549, 1569921628, 1569921649, 1569921670,
         1569921712, 1569921752, 1569921793, 1569921955, 1569922051, 1569922061,
         1569922096, 1569922146, 1569922168, 1569922273, 1569922283, 1569922301,
         1569

KeyboardInterrupt: 

In [51]:
df = pd.read_parquet('/home/gmoreira/dataset/ecommerce_preproc_with_neg_samples/ecommerce_preproc_neg_samples_50_strategy_uniform/session_start_date=2019-10-01-train.parquet')

In [52]:
len(df)

1000

In [53]:
df.columns

Index(['user_idx', 'user_session', 'sess_seq_len', 'session_start_ts',
       'user_seq_length_bef_sess', 'user_elapsed_days_bef_sess',
       'user_elapsed_days_log_bef_sess_norm', 'sess_pid_seq', 'sess_etime_seq',
       'sess_etype_seq', 'sess_csid_seq', 'sess_ccid_seq', 'sess_bid_seq',
       'sess_price_seq', 'sess_dtime_seq', 'sess_product_recency_seq',
       'sess_relative_price_to_avg_category_seq', 'sess_et_hour_sin_seq',
       'sess_et_hour_cos_seq', 'sess_et_month_sin_seq',
       'sess_et_month_cos_seq', 'sess_et_dayofweek_sin_seq',
       'sess_et_dayofweek_cos_seq', 'sess_et_dayofmonth_sin_seq',
       'sess_et_dayofmonth_cos_seq', 'user_pid_seq_bef_sess',
       'user_etime_seq_bef_sess', 'user_etype_seq_bef_sess',
       'user_csid_seq_bef_sess', 'user_ccid_seq_bef_sess',
       'user_bid_seq_bef_sess', 'user_price_seq_bef_sess',
       'user_dtime_seq_bef_sess', 'user_product_recency_seq_bef_sess',
       'user_relative_price_to_avg_category_seq_bef_sess',
       'us

In [34]:
df.dtypes

user_idx                                              int64
user_session                                         object
sess_seq_len                                          int64
session_start_ts                                      int64
user_seq_length_bef_sess                              int64
user_elapsed_days_bef_sess                          float64
user_elapsed_days_log_bef_sess_norm                 float64
sess_pid_seq                                         object
sess_etime_seq                                       object
sess_etype_seq                                       object
sess_csid_seq                                        object
sess_ccid_seq                                        object
sess_bid_seq                                         object
sess_price_seq                                       object
sess_dtime_seq                                       object
sess_product_recency_seq                             object
sess_relative_price_to_avg_category_seq 

In [55]:
df['sess_neg_sess_bid_seq'].values[0]

array([ 527,   23,    2,   84,   22,   20,   53,  461,  779,  343,  751,
       4305, 4305, 4305, 4305,    2, 4305,  273,   33,   18, 4305,    8,
       4305, 4305,  603,   78,   18,  224, 4305,    7,    4, 4305,    6,
       4305,   30,   53,  122,  309,  238,  138, 4305,  110,    5, 4305,
         18,  394,    2,    7, 4305, 2326,   27,    3,  549,  207,  603,
        104,  326,  100,  175,    2, 4305, 1052,  527,  114, 4305,    2,
         49,   44, 4305, 1021,  173, 4305,  187, 4305,    8, 4305, 4305,
       4305, 1390,  148,    3,    8,    3,    3,    2, 4305,   21,    7,
       4305,   40,   29,   53,  338,  152,    8,  165,  102,    8,   53,
         47,    3, 4305, 4305,   27, 4305, 4305,   31,   10, 4305,  384,
       4305,  603, 4305, 4305, 4305,  319,   79,  219,    3,    3,    3,
         43, 4305,   11,   13, 4305,   21,  167,   62, 4305,    4,    9,
        110,  561,  359,   86,   20, 4305,   40,   43,  170,  829,   66,
       4305, 4305,    6,  301, 4305,   68, 4305,  2

In [37]:
df['sess_neg_sess_bid_seq'].values[0][155]

4305.0

In [56]:
df

Unnamed: 0,user_idx,user_session,sess_seq_len,session_start_ts,user_seq_length_bef_sess,user_elapsed_days_bef_sess,user_elapsed_days_log_bef_sess_norm,sess_pid_seq,sess_etime_seq,sess_etype_seq,...,user_et_dayofweek_cos_seq_bef_sess,user_et_dayofmonth_sin_seq_bef_sess,user_et_dayofmonth_cos_seq_bef_sess,sess_neg_pids,sess_neg_sess_csid_seq,sess_neg_sess_ccid_seq,sess_neg_sess_bid_seq,sess_neg_sess_price_seq,sess_neg_sess_relative_price_to_avg_category_seq,sess_neg_sess_product_recency_seq
0,555648057,98499c8f-ad2d-45bd-99c9-d9947ad4784e,5,1569938266,0,,,"[3995, 168, 957, 337, 1726, 0, 0, 0, 0, 0, 0, ...","[1569938266, 1569938319, 1569938353, 156993839...","[2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[5976, 11966, 48, 30747, 2786, 5564, 60760, 22...","[146, 12, 4, 3, 34, 15, 31, 12, 74, 52, 314, 7...","[48, 3, 4, 131, 23, 131, 13, 3, 13, 131, 86, 1...","[527, 23, 2, 84, 22, 20, 53, 461, 779, 343, 75...","[-1.2072219848632812, -0.9228723049163818, 0.9...","[-0.8343976736068726, -0.8237003684043884, 0.0...","[-3.5283703804016113, -3.647977113723755, -3.4..."
1,555698781,ef1abf6d-1b24-4cb8-b678-c9550bc33d7d,5,1569945681,0,,,"[4097, 10341, 6030, 11568, 8499, 0, 0, 0, 0, 0...","[1569945681, 1569945825, 1569945848, 156994588...","[2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[41122, 3497, 1254, 135, 29631, 29708, 38921, ...","[235, 37, 19, 2, 74, 152, 235, 91, 16, 2, 380,...","[131, 131, 12, 2, 13, 131, 131, 131, 131, 2, 1...","[1139, 61, 160, 3, 176, 4305, 503, 1934, 4305,...","[-2.5214123725891113, 0.11688324809074402, -0....","[-0.5392028093338013, -0.3629242181777954, -0....","[-3.857556104660034, -3.5666608810424805, -3.3..."
2,529338550,875a7800-5550-4aa3-85fe-7915b5f04023,6,1569965750,2,0.291377,-0.702407,"[35528, 7536, 22620, 19725, 27659, 71909, 0, 0...","[1569965750, 1569965775, 1569965821, 156996585...","[2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[-0.90096885, -0.90096885, 0.0, 0.0, 0.0, 0.0,...","[0.20129852, 0.20129852, 0.0, 0.0, 0.0, 0.0, 0...","[0.9795299, 0.9795299, 0.0, 0.0, 0.0, 0.0, 0.0...","[7914, 11007, 833, 45487, 11343, 5887, 39782, ...","[3, 27, 4, 134, 72, 38, 303, 198, 110, 302, 24...","[131, 14, 4, 55, 49, 17, 131, 131, 131, 131, 9...","[84, 29, 2, 24, 589, 4305, 4305, 197, 4305, 43...","[-0.474873423576355, -0.36009541153907776, 2.0...","[0.32676804065704346, -0.5733218789100647, 2.9...","[-3.857556104660034, -3.6964993476867676, -3.4..."
3,555725229,cd2621a6-61e4-465b-8aba-57fe2359a33c,5,1569950568,0,,,"[267, 14, 130, 115, 74, 0, 0, 0, 0, 0, 0, 0, 0...","[1569950568, 1569950635, 1569950718, 156995100...","[2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[51246, 1148, 2590, 67453, 163803, 382, 4598, ...","[196, 10, 196, 23, 95, 24, 230, 2, 15, 125, 10...","[131, 3, 131, 11, 53, 9, 88, 2, 131, 54, 3, 5,...","[121, 3, 73, 16, 531, 4305, 4305, 4, 20, 4305,...","[-2.759387969970703, 1.171530842781067, 1.3147...","[-0.9487777352333069, 0.9699004888534546, 8.01...","[-3.857556104660034, -3.6995019912719727, -3.8..."
4,522023225,dcb6e777-0bdf-4598-9c1b-e219499ef73f,2,1569943802,0,,,"[2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1569943802, 1569943852, 0, 0, 0, 0, 0, 0, 0, ...","[2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1024, 18499, 1874, 40394, 4486, 2320, 2355, 4...","[186, 54, 24, 9, 33, 24, 128, 2, 57, 207, 8, 1...","[131, 131, 9, 131, 22, 9, 65, 2, 131, 41, 8, 5...","[10, 4305, 19, 4305, 4305, 4305, 4305, 2, 17, ...","[0.7368451356887817, -0.6964377164840698, 0.58...","[0.5878480672836304, -0.6419907808303833, 0.20...","[-3.4588608741760254, -3.8255484104156494, -3...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,555652474,b808ab70-23a7-405f-a9be-0ddae7e05a92,3,1569937871,0,,,"[22, 78, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1569937871, 1569937963, 1569937984, 0, 0, 0, ...","[2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[5259, 51167, 170, 5271, 12821, 124, 767, 5527...","[7, 58, 2, 27, 184, 14, 3, 83, 49, 2, 14, 22, ...","[10, 131, 2, 14, 131, 131, 131, 41, 28, 2, 131...","[19, 4305, 9, 48, 4, 44, 67, 4305, 20, 3, 8, 6...","[0.49252572655677795, 1.1661357879638672, 0.75...","[-0.16759197413921356, 0.17139066755771637, -0...","[-3.6690099239349365, -3.7491071224212646, -3...."
996,522138637,437d65f0-d3da-41c1-99df-4157cd95f82d,2,1569953816,0,,,"[55, 81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1569953816, 1569954009, 0, 0, 0, 0, 0, 0, 0, ...","[2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[67613, 37629, 2535, 77213, 3748, 1047, 1978, ...","[25, 109, 2, 13, 81, 4, 2, 2, 8, 196, 20, 3, 3...","[15, 131, 2, 131, 131, 4, 2, 2, 8, 131, 131, 1...","[4305, 4305, 4, 39, 42, 268, 4, 3, 47, 4305, 2...","[-0.43555590510368347, -1.1716824769973755, -0...","[-0.6772119402885437, 0.016083568334579468, -0...","[-3.857556104660034, -3.857556104660034, -3.53..."
997,514316046,d9a8115b-d69d-4e71-99c7-0a1347e11695,4,1569931744,0,,,"[6162, 18467, 16081, 61097, 0, 0, 0, 0, 0, 0, ...","[1569931744, 1569931786, 1569931815, 156993183...","[2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[40408, 80057, 126, 82433, 6840, 122140, 8642,...","[30, 182, 2, 18, 2, 28, 16, 2, 21, 4, 103, 4, ...","[7, 131, 2, 7, 2, 18, 131, 2, 16, 4, 52, 4, 7,...","[4305, 4305, 2, 247, 3, 4305, 112, 4, 82, 2, 4...","[-0.45678481459617615, -0.22726204991340637, 1...","[0.04079359769821167, -0.2926800847053528, 0.3...","[-3.7937638759613037, -3.637340545654297, -3.5..."
998,551450418,1abd871b-71c7-44ea-bd8a-b9eb0f2e1034,5,1569945137,1,0.009375,-0.947112,"[265, 373, 572, 588, 990, 0, 0, 0, 0, 0, 0, 0,...","[1569945137, 1569945148, 1569945179, 156994519...","[2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[-0.90096885, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0.20129852, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.9795299, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[97982, 16640, 17849, 2010, 17409, 1101, 2126,...","[317, 369, 15, 33, 12, 39, 2, 2, 23, 235, 85, ...","[24, 131, 131, 22, 3, 25, 2, 2, 11, 131, 131, ...","[1367, 547, 65, 107, 4305, 110, 98, 4305, 11, ...","[-1.5836411714553833, -3.095940351486206, 0.03...","[-0.5201225876808167, -0.9587899446487427, -0....","[-3.470968008041382, -3.3620927333831787, -3.5..."
