In [1]:
import pandas as pd
import numpy as np
import glob
import os
import itertools
from itertools import permutations 
from tqdm import tqdm
from collections import defaultdict, Counter
from copy import deepcopy
import gc

In [2]:
import pyarrow
import pyarrow.parquet as pq

In [3]:
# Glob pattern for the per-day partition folders of the preprocessed sessions dataset
INPUT_PARQUET_PATH = "/home/gmoreira/dataset/ecommerce_preproc_2019-*/ecommerce_preproc.parquet/session_start_date=*"
# NOTE(review): not referenced elsewhere in the visible notebook; output paths are derived from
# each input path by get_output_path_parquet_neg_samples() - confirm whether this is still needed
OUTPUT_NEG_SAMPLES_PARQUET_PATH = "/home/gmoreira/dataset/neg_samples.parquet"

In [4]:
# Identifiers of the supported negative sampling strategies
# (get_candidate_samples_item_ids dispatches on these values)
UNIFORM_SAMPLING = 'uniform'
RECENCY_SAMPLING = 'recency'
RECENT_POPULARITY_SAMPLING = 'popularity'
COOCURRENCE_SAMPLING = 'cooccurrence'

In [5]:
# Strategy used by the generation loop; it is also embedded in the output parquet path
NEGATIVE_SAMPLING_STRATEGY = COOCURRENCE_SAMPLING

In [6]:
# Number of sessions (rows) per processing batch
BATCH_SIZE = 1000
# Item statistics are refreshed once every this many batches
BATCHES_TO_UPDATE_ITEM_STATS = 3
# Rows with neg. samples are flushed to the parquet file once every this many batches
BATCHES_TO_APPEND_ROWS_WITH_NEG_SAMPLES = 5
# Time window (in days) of interactions kept in the co-occurrence log
ITEM_STATS_KEEP_LAST_N_DAYS = 1.0
# NOTE(review): not referenced in the visible part of the notebook - confirm it is still needed
SEQUENCE_LENGTH = 20
# Number of negative samples requested for each session interaction
NUM_NEG_SAMPLES = 50

In [7]:
# Expand the glob pattern and sort the matches by path, so partitions are visited in name order
input_parquet_files = sorted(glob.glob(INPUT_PARQUET_PATH+'*'))
input_parquet_files

['/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-01',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-02',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-03',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-04',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-05',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-06',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-07',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-08',
 '/home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-09',
 '/home/gmoreira/da

In [8]:
def get_output_path_parquet_neg_samples(input_parquet_filename):
    """Derive the output parquet path for a given input partition path.

    The 'ecommerce_preproc.parquet' component is replaced by a name that embeds
    NUM_NEG_SAMPLES and NEGATIVE_SAMPLING_STRATEGY, and '.parquet' is appended
    to the resulting path.
    """
    output_dataset_name = 'ecommerce_preproc_neg_samples_{}_strategy_{}.parquet'.format(
        NUM_NEG_SAMPLES, NEGATIVE_SAMPLING_STRATEGY)
    renamed_path = input_parquet_filename.replace('ecommerce_preproc.parquet', output_dataset_name)
    return renamed_path + '.parquet'

In [9]:
'''
def get_files(data_paths):
    paths = [['file://' + p for p in glob.glob(path + "/*.parquet")] for path in data_paths]
    return list(itertools.chain.from_iterable(paths))
input_parquet_files = get_files([INPUT_PARQUET_PATH])
'''
pass

In [10]:
'''
#Works but cannot be used now because the preprocessed sessions are not sorted by timestamp
def read_parquet_generator(filenames, batch_size=128):
    for filename in filenames:
        for batch in pq.read_table(filename).to_batches(batch_size):
            yield batch.to_pandas()
            
parquet_reader = read_parquet_generator([INPUT_PARQUET_PATH], batch_size=BATCH_SIZE)            
'''
pass

In [11]:
def split_dataframe_into_chuncks_generator(df, chunk_size):
    """Yield consecutive slices of `df` holding at most `chunk_size` rows each.

    Only non-empty chunks are produced: no trailing empty chunk when len(df) is
    an exact multiple of `chunk_size`, and nothing at all for an empty input.

    Args:
        df: any sliceable object with a length (e.g. a pandas DataFrame).
        chunk_size: maximum number of rows per chunk (positive integer).
    """
    for start in range(0, len(df), chunk_size):
        yield df[start:start + chunk_size]

In [12]:
def insert_update_session_items_metadata(row):
    """Insert or refresh the metadata in `items_df` of every item of a session.

    For each non-padding item the latest feature values are stored, together
    with the first and the last timestamps at which the item was seen.

    Args:
        row: a session record exposing 'session_start_ts' and the 'sess_*_seq'
            sequences read below.
    """
    #Uses the session start as the event timestamp (as the sess_etime_seq might sometimes be many days before because of outlier sessions with more than 120 min duration (< 1%))
    session_ts = row['session_start_ts']

    session_items = zip(row['sess_pid_seq'],
                        row['sess_csid_seq'],
                        row['sess_ccid_seq'],
                        row['sess_bid_seq'],
                        row['sess_price_seq'],
                        row['sess_relative_price_to_avg_category_seq'],
                        row['sess_product_recency_seq'])

    for pid, csid, ccid, bid, price, relative_price, prod_recency in session_items:
        #Product id 0 is skipped (it is used for padding positions)
        if pid == 0:
            continue

        if pid in items_df.index:
            #Known item: keeps its first timestamp and moves the last one forward if needed
            known_item = items_df.loc[pid]
            first_ts = known_item['first_ts']
            last_ts = max(known_item['last_ts'], session_ts)
        else:
            #New item: first and last timestamps are the current session start
            first_ts = session_ts
            last_ts = session_ts

        #Including or updating the item metadata
        item_metadata = {'csid': csid,
                         'ccid': ccid,
                         'bid': bid,
                         'price': price,
                         'relative_price_to_avg_category': relative_price,
                         'product_recency': prod_recency,
                         'first_ts': first_ts,
                         'last_ts': last_ts}
        items_df.loc[pid] = pd.Series(item_metadata)

In [13]:
#Per-session co-occurrence frames waiting to be merged by concat_sessions_coocurrences_log()
session_cooccurrences_log_list = []

def append_session_coocurrences_log(row):
    """Log every ordered pair of distinct items of a session.

    A frame with columns pid_a, pid_b, ts and count_flag is appended to
    `session_cooccurrences_log_list`. Sessions with fewer than two distinct
    non-padding items produce no pairs and are ignored; their timestamps are
    not inspected, so an all-padding session no longer raises ValueError.

    Args:
        row: a session record exposing 'sess_pid_seq' and 'sess_etime_seq'
            (0 marks a padding position in both sequences).
    """
    valid_pids = list(set(p for p in row['sess_pid_seq'] if p != 0))
    n_items = len(valid_pids)
    if n_items < 2:
        return

    #The earliest non-padding event time stamps all pairs of the session
    min_ts = min(t for t in row['sess_etime_seq'] if t != 0)

    new_coo_df = pd.DataFrame(list(permutations(valid_pids, 2)), columns=['pid_a', 'pid_b'])
    new_coo_df['ts'] = min_ts
    #This flag is used for counting unique values from this table to compute popularity:
    #permutations() emits the (n_items - 1) pairs of each pid_a contiguously, so flagging
    #the first pair of each group marks every item of the session exactly once
    new_coo_df['count_flag'] = ([1] + [0]*(n_items-2))*n_items

    session_cooccurrences_log_list.append(new_coo_df)

def concat_sessions_coocurrences_log():
    """Merge the pending per-session frames into `items_coocurrence_df` and empty the pending list."""
    global items_coocurrence_df, session_cooccurrences_log_list
    frames_to_merge = [items_coocurrence_df, *session_cooccurrences_log_list]
    items_coocurrence_df = pd.concat(frames_to_merge)
    #A new list is bound, so the frames that were just merged are released
    session_cooccurrences_log_list = []

In [14]:
def remove_old_interactions(keep_last_n_days):
    """Drop from `items_coocurrence_df` the rows older than a time window.

    The window ends at the largest 'ts' currently in the log and spans
    `keep_last_n_days` days ('ts' is handled as a number of seconds).
    """
    global items_coocurrence_df
    window_secs = keep_last_n_days * 24 * 60 * 60
    newest_ts = items_coocurrence_df['ts'].max()
    is_recent = items_coocurrence_df['ts'] >= (newest_ts - window_secs)
    items_coocurrence_df = items_coocurrence_df[is_recent]

In [15]:
def update_items_temporal_relevance_decay():
    """Recompute `items_temporal_relev_df`: one sampling weight per item, decaying with item age.

    The age of an item is the distance in days between its 'first_ts' and the
    largest 'first_ts' in `items_df`. Weights are normalized to sum to 1.
    """
    global items_temporal_relev_df
    first_seen_ts = items_df['first_ts']
    age_in_secs = first_seen_ts.max() - first_seen_ts
    prods_days_age = age_in_secs / (60 * 60 * 24)

    relevance_by_item = prod_relevance_decay(prods_days_age)
    items_temporal_relev_df = relevance_by_item / relevance_by_item.sum()

In [16]:
def update_items_coocurrences_counts():
    """Rebuild `items_coocurence_counts_df` from the co-occurrence log.

    The result is indexed by pid_a and has one row per (pid_a, pid_b) pair,
    with columns 'pid_b' and 'count' (number of logged rows of the pair).
    """
    global items_coocurence_counts_df
    pair_counts = items_coocurrence_df.groupby(['pid_a', 'pid_b']).size().to_frame('count')
    #Moving pid_b out of the index leaves pid_a as the only lookup key
    items_coocurence_counts_df = pair_counts.reset_index(level='pid_b')

In [17]:
def update_items_recent_popularity():
    """Rebuild `items_recent_pop_df`: per-item counts and sampling probabilities.

    Only the rows of the co-occurrence log with count_flag set are counted.
    The result is indexed by pid_a, with columns 'count' and 'prob'
    (the count divided by the sum of all counts).
    """
    global items_recent_pop_df
    flagged_rows = items_coocurrence_df[items_coocurrence_df['count_flag'] == True]
    popularity = flagged_rows.groupby(['pid_a']).size().to_frame('count')
    popularity['prob'] = popularity['count'] / popularity['count'].sum()
    items_recent_pop_df = popularity

In [18]:
# Exponential decay rate per day of age
# (83% of relevance in one quarter, 70% in one semester, 50% in one year and 23% in two years)
DAYS_DECAY_FACTOR = 0.002

def prod_relevance_decay(days_age):
    """Return the relevance exp(-days_age * DAYS_DECAY_FACTOR) of an item aged `days_age` days.

    Accepts whatever supports negation and multiplication by a float and is
    accepted by np.exp (the notebook passes scalars and a pandas Series).
    """
    scaled_age = days_age * DAYS_DECAY_FACTOR
    return np.exp(-scaled_age)

In [19]:
# (83% of relevance in one quarter, 70% in one semester, 50% in one year and 23% in two years)
DAYS_DECAY_FACTOR = 0.002
# Simulating 2 years of decay on the relevance of a product, one sample every 30 days
for days_elapsed in np.arange(0, 365*2, 30):
    print(days_elapsed, prod_relevance_decay(days_elapsed))

0 1.0
30 0.9417645335842487
60 0.8869204367171575
90 0.835270211411272
120 0.7866278610665535
150 0.7408182206817179
180 0.697676326071031
210 0.6570468198150567
240 0.6187833918061408
270 0.5827482523739896
300 0.5488116360940264
330 0.5168513344916992
360 0.4867522559599717
390 0.4584060113052235
420 0.43171052342907973
450 0.4065696597405991
480 0.38289288597511206
510 0.3605949401730783
540 0.3395955256449391
570 0.31981902181630384
600 0.30119421191220214
630 0.2836540264997704
660 0.26713530196585034
690 0.25157855305975646
720 0.23692775868212176


In [38]:
# Item metadata, indexed by pid
items_df = None
# Log of item pairs seen in the same session
items_coocurrence_df = None
# Statistics derived from the tables above (rebuilt by the update_items_* functions)
items_coocurence_counts_df = None
items_recent_pop_df = None
items_temporal_relev_df = None

def reset_item_logs_and_statistics():
    """Re-create the empty item tables and clear every derived statistic.

    The declared dtypes are applied with astype(): a dict passed as `columns=`
    only contributes its keys, so the dtypes used to be ignored. The removed
    NumPy aliases np.float / np.int are replaced by explicit 64-bit types.
    The list of pending per-session co-occurrence frames is emptied as well,
    so leftovers of an interrupted run are not merged into the next one.
    """
    global items_df, items_coocurrence_df, items_coocurence_counts_df, items_recent_pop_df, items_temporal_relev_df
    global session_cooccurrences_log_list

    items_schema = {'pid': np.int64,
                    'csid': np.int32,
                    'ccid': np.int32,
                    'bid': np.int32,
                    'price': np.float64,
                    'relative_price_to_avg_category': np.float64,
                    'product_recency': np.float64,
                    'first_ts': np.int64,
                    'last_ts': np.int64}
    items_df = pd.DataFrame(columns=list(items_schema)).astype(items_schema).set_index('pid')

    coocurrence_schema = {'pid_a': np.int64,
                          'pid_b': np.int64,
                          'ts': np.int32,
                          'count_flag': np.int16}
    items_coocurrence_df = pd.DataFrame(columns=list(coocurrence_schema)).astype(coocurrence_schema)

    session_cooccurrences_log_list = []

    items_coocurence_counts_df = None
    items_recent_pop_df = None
    items_temporal_relev_df = None

In [21]:
def get_uniform_sampling_item_ids(n_samples):
    """Draw up to `n_samples` distinct item ids from `items_df`, all with the same probability.

    Returns a numpy array holding min(n_samples, number of known items) ids.
    """
    n_known_items = len(items_df)
    sample_size = n_samples if n_samples < n_known_items else n_known_items
    return np.random.choice(items_df.index, sample_size, replace=False)

In [22]:
def get_popularity_sampling_item_ids(n_samples):
    """Draw up to `n_samples` distinct item ids, weighted by the 'prob' column of `items_recent_pop_df`.

    Returns a list holding min(n_samples, number of items with popularity stats) ids.
    """
    n_candidates = len(items_recent_pop_df)
    sample_size = n_samples if n_samples < n_candidates else n_candidates
    sampled_ids = np.random.choice(items_recent_pop_df.index, sample_size, replace=False,
                                   p=items_recent_pop_df['prob'])
    return sampled_ids.tolist()

In [40]:
def get_coocurrence_sampling_item_ids(pid, n_samples):
    """Draw up to `n_samples` distinct items that co-occurred with `pid`.

    Candidates are the 'pid_b' values stored for `pid` in
    `items_coocurence_counts_df`; each is drawn with a probability proportional
    to its 'count'.

    Returns:
        A list of item ids, empty when `pid` has no co-occurrence statistics.
    """
    if pid not in items_coocurence_counts_df.index:
        return []

    #A list selector always returns a DataFrame, even when a single row matches,
    #so the one-co-occurrent-item case needs no special handling
    coocurrent_df = items_coocurence_counts_df.loc[[pid]]
    #Probabilities are kept in a local variable: adding a column to a slice of the
    #statistics table is a chained assignment (SettingWithCopyWarning)
    probs = coocurrent_df['count'] / coocurrent_df['count'].sum()
    sample_size = min(n_samples, len(coocurrent_df))
    return np.random.choice(coocurrent_df['pid_b'].values, sample_size, replace=False,
                            p=probs.values.astype(float)).tolist()

In [24]:
def get_recency_sampling_item_ids(n_samples):
    """Draw up to `n_samples` distinct item ids, weighted by the values of `items_temporal_relev_df`.

    Returns a list holding min(n_samples, number of items with a relevance weight) ids.
    """
    n_candidates = len(items_temporal_relev_df)
    sample_size = n_samples if n_samples < n_candidates else n_candidates
    sampled_ids = np.random.choice(items_temporal_relev_df.index, sample_size, replace=False,
                                   p=items_temporal_relev_df.values)
    return sampled_ids.tolist()

In [25]:
def get_candidate_samples_item_ids(pid, n_samples, strategy, ignore_list=None):
    """Return up to `n_samples` candidate negative item ids for an interaction.

    Candidates are oversampled, de-duplicated, filtered against `ignore_list`
    and truncated. The order in which candidates were drawn is preserved, so
    the truncation keeps the first draws instead of an arbitrary hash-ordered
    subset.

    Args:
        pid: item id of the interaction (only used by the co-occurrence strategy).
        n_samples: maximum number of ids to return.
        strategy: one of UNIFORM_SAMPLING, RECENCY_SAMPLING,
            RECENT_POPULARITY_SAMPLING or COOCURRENCE_SAMPLING.
        ignore_list: optional collection of item ids that must not be returned.

    Returns:
        A list with at most `n_samples` distinct item ids.

    Raises:
        ValueError: if `strategy` is not one of the supported values.
    """
    #To ensure that after removing the ignored items we still have the required number of samples
    SAMPLES_MULTIPLIER = 2
    n_candidates = n_samples * SAMPLES_MULTIPLIER

    if strategy == UNIFORM_SAMPLING:
        samples = get_uniform_sampling_item_ids(n_candidates)
    elif strategy == RECENCY_SAMPLING:
        samples = get_recency_sampling_item_ids(n_candidates)
    elif strategy == RECENT_POPULARITY_SAMPLING:
        samples = get_popularity_sampling_item_ids(n_candidates)
    elif strategy == COOCURRENCE_SAMPLING:
        samples = get_coocurrence_sampling_item_ids(pid, n_candidates)

        #Completing the list of samples based on global popularity
        if len(samples) < n_samples:
            samples = samples + get_popularity_sampling_item_ids(n_samples - len(samples))
    else:
        raise ValueError('Not a valid strategy. Should be: (uniform|recency|popularity|cooccurrence)')

    ignored_ids = set(ignore_list) if ignore_list is not None else set()
    #dict.fromkeys() removes repeated entries while keeping the first occurrence order
    samples = [item_id for item_id in dict.fromkeys(samples) if item_id not in ignored_ids]

    return samples[:n_samples]

In [26]:
#get_candidate_samples_item_ids(63246, 10, strategy=COOCURRENCE_SAMPLING, ignore_list=[2010])

In [27]:
def get_features_for_item_ids(pids):
    """Return the feature columns of `items_df` for the given item ids, one row per id in `pids`."""
    feature_columns = ['csid', 'ccid', 'bid', 'price', 'relative_price_to_avg_category', 'product_recency']
    return items_df.loc[pids, feature_columns]

In [28]:
%%time
#l= np.random.choice(items_df.index, 50)
#l= np.ones(50, dtype='int')*10
#l = [10] * 50
#l = [33284, 6, 21, 2588, 23069, 1570, 8230, 552, 50, 4152, 55864, 24636, 113213, 65601, 9283, 1097, 1104, 90, 2651, 607, 13920, 7785, 619, 8821, 9337, 4221, 8330, 7307, 8848, 19089, 112784, 2197, 15002, 3234, 3252, 1207, 2745, 40128, 30918, 4299, 61646, 23247, 213, 6871, 8921, 15582, 6367, 15077, 8935, 3820, 1266, 1790, 256, 258, 3844, 1808, 14109, 1311, 8481, 3362, 1318, 2348, 301, 29996, 19245, 1331, 3891, 10553, 830, 1855, 1867, 3415, 2933, 8571, 21371, 390, 903, 1425, 404, 5527, 1958, 23477, 6071, 59321, 7100, 4545, 1479, 28628, 10200, 16857, 3544, 479, 480, 6628, 35812, 2032, 3059, 28153, 2046]
#for i in range(100):
#    for j in range(10):
#        x = get_features_for_item_ids(l)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs


In [29]:
def padarray(A, size):
    """Return `A` as a numpy array of exactly `size` elements.

    Longer inputs are truncated; shorter ones are right-padded with zeros.
    """
    clipped = A if len(A) <= size else A[:size]
    n_missing = size - len(clipped)
    return np.pad(clipped, pad_width=(0, n_missing), mode='constant')

In [30]:
padarray([1,2,3], 4)

array([1, 2, 3, 0])

In [31]:
def generate_neg_samples(session_pids, user_past_pids, n_samples, strategy):
    """Generate the negative samples (ids and features) for every position of a session.

    The sampled ids and features used to be computed and then discarded,
    because the code collecting them was disabled: the function always returned
    an empty dict. They are now collected again.

    Each session position gets `n_samples` values per output key: the sampled
    values right-padded with zeros for a real interaction, only zeros for a
    padding position (pid == 0). The per-position arrays are concatenated into
    one flat array per key, of length len(session_pids) * n_samples, because
    the Petastorm data loader does not support lists of lists. It will require
    reshaping the neg. samples features inside the Pytorch model.

    Args:
        session_pids: item ids of the session, 0 marking padding positions.
        user_past_pids: item ids the user interacted with before the session.
        n_samples: number of negative samples per interaction.
        strategy: sampling strategy, forwarded to get_candidate_samples_item_ids.

    Returns:
        A dict-like object with key 'sess_neg_pids' plus one 'sess_neg_<feature>'
        key per feature returned by get_features_for_item_ids. It is empty when
        the session holds no real interaction.
    """
    int_features = ('csid', 'ccid', 'bid')

    #Ignores session items and also recently interacted items
    ignore_ids = set(np.hstack([session_pids, user_past_pids]))

    #One entry per session position: a dict of fixed-length arrays for a real
    #interaction, None for a padding position
    samples_by_position = []
    for pid in session_pids:
        if pid == 0:
            samples_by_position.append(None)
            continue

        neg_item_ids = get_candidate_samples_item_ids(pid, n_samples,
                                                      ignore_list=ignore_ids,
                                                      strategy=strategy)
        neg_item_features = get_features_for_item_ids(neg_item_ids)

        position_samples = {'sess_neg_pids': padarray(neg_item_ids, n_samples).astype(int)}
        #.items() yields (feature name, values) for a DataFrame as well as for a dict of lists
        for feature_name, feature_values in neg_item_features.items():
            padded_values = padarray(feature_values, n_samples)
            feature_dtype = int if feature_name in int_features else float
            position_samples['sess_neg_{}'.format(feature_name)] = padded_values.astype(feature_dtype)
        samples_by_position.append(position_samples)

    neg_samples_dict = defaultdict(list)

    #Keys, shapes and dtypes of the padding positions are copied from the first real interaction
    template = next((s for s in samples_by_position if s is not None), None)
    if template is None:
        return neg_samples_dict

    for position_samples in samples_by_position:
        for key, template_values in template.items():
            if position_samples is None:
                neg_samples_dict[key].append(np.zeros_like(template_values))
            else:
                neg_samples_dict[key].append(position_samples[key])

    #Concatenating neg. samples of all session interactions into one flat array per key
    for key in neg_samples_dict:
        neg_samples_dict[key] = np.hstack(neg_samples_dict[key])

    return neg_samples_dict

In [32]:
#generate_neg_samples([10,20,30, 0, 0], n_samples=2, strategy='popularity')

In [33]:
def append_new_rows_to_parquet(new_rows_df, path):
    """Append the rows of `new_rows_df` to the parquet file being written.

    The writer is kept in the global `pq_writer`. When it is None, the parent
    folder of `path` is created if missing and a new writer is opened on
    `path`, using the schema of this first table.
    """
    global pq_writer
    rows_table = pyarrow.Table.from_pandas(new_rows_df)

    if pq_writer is None:
        #Creating parent folder recursively
        output_folder = os.path.dirname(os.path.abspath(path))
        folder_is_missing = not os.path.exists(output_folder)
        if folder_is_missing:
            os.makedirs(output_folder)
        #Creating parquet file
        pq_writer = pq.ParquetWriter(path, rows_table.schema)

    pq_writer.write_table(rows_table)

## Generate negative samples for all sessions and write them to new parquet files

In [None]:
#Writer of the parquet file currently being produced (managed by append_new_rows_to_parquet)
pq_writer = None

reset_item_logs_and_statistics()
try:
    #For each file (day)
    for idx_day, input_file in enumerate(input_parquet_files):
        print('='*40)
        print('[Day {}] Loading sessions from parquet: {}'.format(idx_day, input_file))
        output_filename = get_output_path_parquet_neg_samples(input_file)
        
        #Never overwrites the output of a previous run
        if os.path.exists(output_filename):
            raise Exception('Output parquet file already exists')
        
        #Loading parquet file and sorting sessions by timestamp
        sessions_df = pd.read_parquet(input_file)
        sessions_df.sort_values('session_start_ts', inplace=True)
                
        new_rows = []
        
        print('Processing batches')
        #For each batch
        for batch_id, batch in tqdm(enumerate(split_dataframe_into_chuncks_generator(sessions_df, 
                                                                                chunk_size = BATCH_SIZE))):
            print('batch_id', batch_id)            
            #For each row (session)
            for _, row in batch.iterrows():
                insert_update_session_items_metadata(row)
                append_session_coocurrences_log(row)
                
                #Ignoring first batch (not computing neg. samples nor saving to parquet),
                #because no item statistics are available yet
                if batch_id > 0:   
                    #Generating neg. samples for each interaction in the session
                    session_neg_samples_by_pid_dict = generate_neg_samples(row['sess_pid_seq'], 
                                                                           row['user_pid_seq_bef_sess'],
                                                                           NUM_NEG_SAMPLES, 
                                                                           strategy=NEGATIVE_SAMPLING_STRATEGY
                                                                          )
                    #Merging user and session features with neg samples for the session
                    new_row_with_neg_samples_dict = {**row.to_dict(), **session_neg_samples_by_pid_dict}
                    new_rows.append(new_row_with_neg_samples_dict)
                    
            
            #Each N batches updates item statistics (popularity, recency, co-occurrence)
            #Ps. Do the update for all the first five batches of the first file , for better sampling
            if (batch_id % BATCHES_TO_UPDATE_ITEM_STATS == BATCHES_TO_UPDATE_ITEM_STATS-1) or \
               (idx_day == 0 and batch_id < 5):
                print('[Batch {}] Updating item stats'.format(batch_id))
                remove_old_interactions(ITEM_STATS_KEEP_LAST_N_DAYS)
                if NEGATIVE_SAMPLING_STRATEGY in [RECENT_POPULARITY_SAMPLING, COOCURRENCE_SAMPLING]:
                    concat_sessions_coocurrences_log()
                    update_items_coocurrences_counts()
                    #Popularity is needed by both strategies: the co-occurrence sampling falls back
                    #to it to complete its samples. (This used to be guarded by
                    #`if RECENT_POPULARITY_SAMPLING:`, a non-empty string that is always true.)
                    update_items_recent_popularity()
                if NEGATIVE_SAMPLING_STRATEGY == RECENCY_SAMPLING:
                    update_items_temporal_relevance_decay()
                
            #Each N batches appends the new rows with neg. samples to parquet file
            #(skipped when there is nothing to write: an empty frame has no columns and
            #would not match the schema of the parquet file)
            if batch_id % BATCHES_TO_APPEND_ROWS_WITH_NEG_SAMPLES == BATCHES_TO_APPEND_ROWS_WITH_NEG_SAMPLES-1 \
               and len(new_rows) > 0: 
                print('[Batch {}] Appending new rows with neg samples to parquet: {}'.format(batch_id,output_filename))
                append_new_rows_to_parquet(pd.DataFrame(new_rows), output_filename)
                del(new_rows)
                new_rows = []
            
               
        #Save pending rows
        if len(new_rows) > 0:
            print('[Batch {}] Appending new rows with neg samples to parquet: {}'.format(batch_id,output_filename))
            append_new_rows_to_parquet(pd.DataFrame(new_rows), output_filename)
            del(new_rows)
            new_rows = []
            
        #Flushing and releasing the current parquet file and proceeding for the new date
        #(no writer exists when the file produced no rows, e.g. a single batch of sessions)
        if pq_writer is not None:
            pq_writer.close()
            pq_writer = None
                
        del(sessions_df)
        gc.collect()
        
        #NOTE(review): this stops after the first file (day) although the loop is written
        #for all of them - confirm whether it is a debugging leftover
        break
finally:
    #Closes the file being written if the loop was interrupted
    if pq_writer:
        pq_writer.close()

[Day 0] Loading sessions from parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-01


0it [00:00, ?it/s]

Processing batches
batch_id 0
[Batch 0] Updating item stats


1it [00:16, 16.17s/it]

batch_id 1
[Batch 1] Updating item stats


2it [00:42, 19.33s/it]

batch_id 2
[Batch 2] Updating item stats


3it [01:07, 20.77s/it]

batch_id 3
[Batch 3] Updating item stats


4it [01:40, 24.69s/it]

batch_id 4
[Batch 4] Updating item stats
[Batch 4] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-01.parquet


5it [02:10, 26.05s/it]

batch_id 5
[Batch 5] Updating item stats


6it [02:30, 24.48s/it]

batch_id 6


7it [02:53, 23.78s/it]

batch_id 7


8it [03:12, 22.60s/it]

batch_id 8
[Batch 8] Updating item stats


9it [03:35, 22.61s/it]

batch_id 9
[Batch 9] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-01.parquet


10it [03:57, 22.36s/it]

batch_id 10


11it [04:21, 22.83s/it]

batch_id 11
[Batch 11] Updating item stats


12it [04:40, 21.90s/it]

batch_id 12


13it [04:59, 20.81s/it]

batch_id 13


14it [05:17, 20.17s/it]

batch_id 14
[Batch 14] Updating item stats
[Batch 14] Appending new rows with neg samples to parquet: /home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-01.parquet


15it [05:39, 20.69s/it]

batch_id 15


16it [05:57, 19.90s/it]

batch_id 16


17it [06:20, 20.79s/it]

batch_id 17
[Batch 17] Updating item stats


18it [06:43, 21.35s/it]

batch_id 18


In [2]:
items_df

NameError: name 'items_df' is not defined

In [None]:
items_coocurrence_df

In [None]:
items_coocurence_counts_df

In [3]:
items_recent_pop_df

NameError: name 'items_recent_pop_df' is not defined

In [None]:
items_temporal_relev_df

In [None]:
#Re-creates the empty item tables and clears the derived statistics.
#This cell was an inline copy of the body of reset_item_logs_and_statistics() with
#inconsistent indentation (an IndentationError); the helper is called instead.
reset_item_logs_and_statistics()

## Loading the parquet file with negative samples using Petastorm

In [None]:
from petastorm.pytorch import DataLoader
from petastorm import make_batch_reader
from petastorm.unischema import UnischemaField
from petastorm.unischema import Unischema
from petastorm.codecs import NdarrayCodec

In [None]:
input_with_neg_parquet_path = 'file:///home/gmoreira/dataset/ecommerce_preproc_2019-10/ecommerce_preproc_neg_samples_50_strategy_cooccurrence.parquet/session_start_date=2019-10-01.parquet'

In [None]:
# Fields requested from the parquet file with negative samples.
# The NumPy aliases np.int / np.float were removed in NumPy 1.24; explicit 64-bit
# types are used instead. A repeated 'user_et_dayofmonth_cos_seq_bef_sess' entry
# was dropped.
recsys_schema_full = [
  UnischemaField('user_idx', np.int64, (), None, True),
#   UnischemaField('user_session', str_, (), None, True),
  UnischemaField('sess_seq_len', np.int64, (), None, False),
  UnischemaField('session_start_ts', np.int64, (), None, True),
  UnischemaField('user_seq_length_bef_sess', np.int64, (), None, False),
  UnischemaField('user_elapsed_days_bef_sess', np.float64, (), None, True),
  UnischemaField('user_elapsed_days_log_bef_sess_norm', np.double, (), None, True),
  # Session sequences
  UnischemaField('sess_pid_seq', np.int64, (None,), None, True),
  UnischemaField('sess_etime_seq', np.int64, (None,), None, True),
  UnischemaField('sess_etype_seq', np.int64, (None,), None, True),
  UnischemaField('sess_csid_seq', np.int64, (None,), None, True),
  UnischemaField('sess_ccid_seq', np.int64, (None,), None, True),
  UnischemaField('sess_bid_seq', np.int64, (None,), None, True),
  UnischemaField('sess_price_seq', np.float64, (None,), None, True),
  UnischemaField('sess_dtime_seq', np.float64, (None,), None, True),
  UnischemaField('sess_product_recency_seq', np.float64, (None,), None, True),
  UnischemaField('sess_relative_price_to_avg_category_seq', np.float64, (None,), None, True),
  UnischemaField('sess_et_hour_sin_seq', np.float64, (None,), None, True),
  UnischemaField('sess_et_hour_cos_seq', np.float64, (None,), None, True),
  UnischemaField('sess_et_month_sin_seq', np.float64, (None,), None, True),
  UnischemaField('sess_et_month_cos_seq', np.float64, (None,), None, True),
  UnischemaField('sess_et_dayofweek_sin_seq', np.float64, (None,), None, True),
  UnischemaField('sess_et_dayofweek_cos_seq', np.float64, (None,), None, True),
  UnischemaField('sess_et_dayofmonth_sin_seq', np.float64, (None,), None, True),
  UnischemaField('sess_et_dayofmonth_cos_seq', np.float64, (None,), None, True),
  # Sequences of the user interactions before the session
  UnischemaField('user_pid_seq_bef_sess', np.int64, (None,), None, True),
  UnischemaField('user_etime_seq_bef_sess', np.int64, (None,), None, True),
  UnischemaField('user_etype_seq_bef_sess', np.int64, (None,), None, True),
  UnischemaField('user_csid_seq_bef_sess', np.int64, (None,), None, True),
  UnischemaField('user_ccid_seq_bef_sess', np.int64, (None,), None, True),
  UnischemaField('user_bid_seq_bef_sess', np.int64, (None,), None, True),
  UnischemaField('user_price_seq_bef_sess', np.float64, (None,), None, True),
  UnischemaField('user_dtime_seq_bef_sess', np.float64, (None,), None, True),
  UnischemaField('user_product_recency_seq_bef_sess', np.float64, (None,), None, True),
  UnischemaField('user_relative_price_to_avg_category_seq_bef_sess', np.float64, (None,), None, True),
  UnischemaField('user_et_hour_sin_seq_bef_sess', np.float64, (None,), None, True),
  UnischemaField('user_et_hour_cos_seq_bef_sess', np.float64, (None,), None, True),
  UnischemaField('user_et_month_sin_seq_bef_sess', np.float64, (None,), None, True),
  UnischemaField('user_et_month_cos_seq_bef_sess', np.float64, (None,), None, True),
  UnischemaField('user_et_dayofweek_sin_seq_bef_sess', np.float64, (None,), None, True),
  UnischemaField('user_et_dayofweek_cos_seq_bef_sess', np.float64, (None,), None, True),
  UnischemaField('user_et_dayofmonth_sin_seq_bef_sess', np.float64, (None,), None, True),
  UnischemaField('user_et_dayofmonth_cos_seq_bef_sess', np.float64, (None,), None, True),
  # Negative samples
  UnischemaField('sess_neg_pids', np.int64, (None,), None, True),
  UnischemaField('sess_neg_csid', np.int64, (None,), None, True),
  UnischemaField('sess_neg_ccid', np.int64, (None,), None, True),
  UnischemaField('sess_neg_bid', np.int64, (None,), None, True),
  UnischemaField('sess_neg_price', np.float64, (None,), None, True),
  UnischemaField('sess_neg_relative_price_to_avg_category', np.float64, (None,), None, True),
  UnischemaField('sess_neg_product_recency', np.float64, (None,), None, True),
]

In [None]:
# Smoke test: reads the parquet file with neg. samples through a Petastorm batch reader
# wrapped in a DataLoader, and prints the first batch
with DataLoader(
    make_batch_reader(input_with_neg_parquet_path, 
                num_epochs=1,
                # transform_spec=transform
                schema_fields=recsys_schema_full,
    ), batch_size=2) as train_loader:
    for i, batch in enumerate(train_loader):
        print("i:{}".format(i))
        print(batch)
        # Shape of one of the neg. samples features of the batch
        print(batch['sess_neg_product_recency'].shape)
        # Only the first batch is inspected
        break

In [None]:
#TODO: Identify bottleneck as items_df and items_coocurrence_df become larger