In [122]:
import os
import pandas as pd
import numpy as np

In [123]:
# Folder whose entries are listed and read one by one with pd.read_parquet
# (sessions that already carry the negative-sample columns).
INPUT_PATH = '/home/gmoreira/dataset/ecommerce_preproc_with_neg_samples/ecommerce_preproc_neg_samples_50_strategy_session_cooccurrence'
# Folder that receives one converted (binary classification) file per input file.
# NOTE(review): both paths are absolute and machine-specific; adjust before re-running elsewhere.
OUTPUT_PATH = '/home/gmoreira/dataset/ecommerce_preproc_binary_class_with_neg_samples/ecommerce_preproc_binary_classif_neg_samples_50_strategy_session_cooccurrence'

In [124]:
# Keep only the Parquet inputs: os.listdir() returns every entry of the folder
# (hidden files, markers, checkpoints, ...) and pd.read_parquet would fail on those.
# Sorted so that files are processed in a deterministic (chronological by name) order.
parquet_files = sorted(
    file_name for file_name in os.listdir(INPUT_PATH) if file_name.endswith('.parquet')
)

In [125]:
# Columns that identify a session; they are used as the index while the
# sequence columns are exploded. (Currently left out: 'sess_seq_len', 'session_start_ts')
SESSION_COMMON_COLS = ['user_idx', 'user_session']

# Per-event sequence columns, carried over to both positive and negative samples.
# Author's note: sess_etime_seq was fixed by the script that adds the negative samples,
# but sess_dtime_seq and the cyclical time features were not fixed yet.
SESSION_EVENT_COLS = [
    'sess_etime_seq',
    'sess_etype_seq',
    'sess_dtime_seq',
    'sess_et_hour_sin_seq',
    'sess_et_hour_cos_seq',
    'sess_et_month_sin_seq',
    'sess_et_month_cos_seq',
    'sess_et_dayofweek_sin_seq',
    'sess_et_dayofweek_cos_seq',
    'sess_et_dayofmonth_sin_seq',
    'sess_et_dayofmonth_cos_seq',
]

# Item feature sequences of the interacted (positive) items.
SESSION_POSITIVE_SAMPLES_COLS = [
    'sess_pid_seq',
    'sess_csid_seq',
    'sess_ccid_seq',
    'sess_bid_seq',
    'sess_price_seq',
    'sess_relative_price_to_avg_category_seq',
    'sess_product_recency_seq',
]

# Item feature sequences of the negative samples, listed in the same order as
# SESSION_POSITIVE_SAMPLES_COLS so the two lists pair up element by element.
SESSION_NEGATIVE_SAMPLES_COLS = [
    'sess_neg_pids',
    'sess_neg_sess_csid_seq',
    'sess_neg_sess_ccid_seq',
    'sess_neg_sess_bid_seq',
    'sess_neg_sess_price_seq',
    'sess_neg_sess_relative_price_to_avg_category_seq',
    'sess_neg_sess_product_recency_seq',
]

# Negative-sample column name -> name of the corresponding positive-sample column.
NEG_POS_ITEM_COLS_MAPPING = dict(zip(SESSION_NEGATIVE_SAMPLES_COLS, SESSION_POSITIVE_SAMPLES_COLS))

In [126]:
# Number of positions per session sequence: used to number positions 1..SEQ_LEN
# and as the first dimension when reshaping the negative-sample arrays.
SEQ_LEN = 20
# Number of negative samples per session position (second dimension of the reshape).
N_NEG_SAMPLES = 50
# (seq_len, n_neg_samples) shape of one session's negative samples.
# NOTE(review): only referenced by the commented-out analysis cell at the end of the notebook.
NEG_SAMPLES_SHAPE = (SEQ_LEN, N_NEG_SAMPLES)

In [127]:
def get_pos_samples_from_sessions(df):
    """Explode session rows into one row per interaction, labeled as positive samples.

    Args:
        df: DataFrame with the columns listed in SESSION_COMMON_COLS,
            SESSION_EVENT_COLS and SESSION_POSITIVE_SAMPLES_COLS. The event and
            positive-sample cells hold sequences; they are assumed to have
            exactly SEQ_LEN elements each so that all columns explode in lockstep.

    Returns:
        A new DataFrame with the common columns, the exploded event and
        positive-sample columns, plus:
          * ``session_pos``: 1-based position of the interaction in its session,
          * ``label``: always 1,
          * ``neg_sample_idx``: always -1 (a positive row is not a negative sample).
        Positions whose ``sess_etime_seq`` value is 0 are dropped (presumably
        sequence padding -- confirm against the preprocessing script).
    """
    # set_index() already returns a fresh frame holding exactly the event and
    # positive-sample columns, so it can be extended without touching `df`.
    sequences_df = df[SESSION_COMMON_COLS + SESSION_EVENT_COLS + SESSION_POSITIVE_SAMPLES_COLS] \
        .set_index(SESSION_COMMON_COLS)

    # One [1, 2, ..., SEQ_LEN] array per session; exploded together with the other columns.
    sequences_df['session_pos'] = list(np.tile(np.arange(SEQ_LEN), (len(sequences_df), 1)) + 1)

    # Column-wise explode: every sequence column yields one row per position.
    interactions_df = sequences_df.apply(pd.Series.explode).reset_index()

    has_event = interactions_df['sess_etime_seq'] != 0
    # assign() builds a new frame instead of writing into the filtered slice,
    # which avoids pandas' SettingWithCopyWarning (chained assignment).
    return interactions_df[has_event].assign(label=1, neg_sample_idx=-1)


def get_neg_samples_from_sessions(df):
    """Explode session rows into one row per (interaction, negative sample) pair.

    Args:
        df: DataFrame with the columns listed in SESSION_COMMON_COLS,
            SESSION_EVENT_COLS and SESSION_NEGATIVE_SAMPLES_COLS. Event cells are
            assumed to hold SEQ_LEN-element sequences and negative-sample cells
            arrays already reshaped to (SEQ_LEN, N_NEG_SAMPLES).

    Returns:
        A new DataFrame with the common columns, the exploded event columns and:
          * ``session_pos``: 1-based position of the interaction in its session,
          * ``label``: always 0,
          * ``neg_sample_idx``: 1-based index of the negative sample at that position,
          * the negative-sample feature columns, renamed to their positive-sample
            names through NEG_POS_ITEM_COLS_MAPPING.
        Positions whose ``sess_etime_seq`` value is 0 are dropped (presumably
        sequence padding -- confirm against the preprocessing script).
    """
    # set_index() already returns a fresh frame holding exactly the event and
    # negative-sample columns, so it can be extended without touching `df`.
    sequences_df = df[SESSION_COMMON_COLS + SESSION_EVENT_COLS + SESSION_NEGATIVE_SAMPLES_COLS] \
        .set_index(SESSION_COMMON_COLS)

    # One [1, 2, ..., SEQ_LEN] array per session; exploded together with the other columns.
    sequences_df['session_pos'] = list(np.tile(np.arange(SEQ_LEN), (len(sequences_df), 1)) + 1)

    # First explode: one row per session position; each negative-sample cell now
    # holds the samples of that single position.
    per_position_df = sequences_df.apply(pd.Series.explode).reset_index()
    per_position_df = per_position_df[per_position_df['sess_etime_seq'] != 0]

    # assign() builds a new frame instead of writing into the filtered slice,
    # which avoids pandas' SettingWithCopyWarning (chained assignment).
    per_position_df = per_position_df.assign(
        neg_sample_idx=list(np.tile(np.arange(N_NEG_SAMPLES), (len(per_position_df), 1)) + 1),
        label=0,
    )

    # Second explode: everything that is scalar per position goes into the index,
    # leaving only the per-negative columns (features + neg_sample_idx) to explode.
    per_negative_df = per_position_df \
        .set_index(SESSION_COMMON_COLS + SESSION_EVENT_COLS + ['session_pos', 'label']) \
        .apply(pd.Series.explode) \
        .reset_index()

    # Use the positive-sample column names so both sample kinds can be concatenated.
    per_negative_df.columns = [NEG_POS_ITEM_COLS_MAPPING.get(col, col) for col in per_negative_df.columns]
    return per_negative_df


def convert_sessions_with_negs_to_binary_classification_dataset(df):
    """Turn session rows with negative samples into a long binary-classification table.

    Args:
        df: DataFrame with one row per session. Each cell of the columns in
            SESSION_NEGATIVE_SAMPLES_COLS must be reshapeable to
            (SEQ_LEN, N_NEG_SAMPLES). The frame is NOT modified.

    Returns:
        The positive rows (from get_pos_samples_from_sessions) and the negative
        rows (from get_neg_samples_from_sessions) concatenated and sorted by
        ``user_session`` and ``session_pos``.
    """
    # Reshape the negative samples to (seq_len, n_neg_samples) in a new frame:
    # assign() returns a copy, so the caller's columns are not overwritten.
    reshaped_neg_cols = {
        col: df[col].apply(lambda x: np.asarray(x).reshape(SEQ_LEN, N_NEG_SAMPLES))
        for col in SESSION_NEGATIVE_SAMPLES_COLS
    }
    reshaped_df = df.assign(**reshaped_neg_cols)

    pos_samples_df = get_pos_samples_from_sessions(reshaped_df)
    neg_samples_df = get_neg_samples_from_sessions(reshaped_df)
    merged_df = pd.concat([pos_samples_df, neg_samples_df]).sort_values(['user_session', 'session_pos'])
    return merged_df

## Converting to a binary classification dataset

In [129]:
print('Input folder: {}'.format(INPUT_PATH))
print('Output folder: {}'.format(OUTPUT_PATH))
print()

# Make sure the destination exists; to_parquet() does not create missing folders.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Convert each input file independently and write one output file per input.
for parquet_file in parquet_files:
    print('Processing "{}"'.format(parquet_file))
    sessions_df = pd.read_parquet(os.path.join(INPUT_PATH, parquet_file))
    sessions_with_negs_for_binary_classification_df = convert_sessions_with_negs_to_binary_classification_dataset(sessions_df)

    # The output is a Parquet file, so keep the input's ".parquet" extension
    # (the previous version appended ".csv", producing misleading "*.parquet.csv" names).
    out_filename = parquet_file.replace('session_', 'session_binary_')
    sessions_with_negs_for_binary_classification_df.to_parquet(os.path.join(OUTPUT_PATH, out_filename))

print('Finished conversion')

Input folder: /home/gmoreira/dataset/ecommerce_preproc_with_neg_samples/ecommerce_preproc_neg_samples_50_strategy_session_cooccurrence
Output folder: /home/gmoreira/dataset/ecommerce_preproc_binary_class_with_neg_samples/ecommerce_preproc_binary_classif_neg_samples_50_strategy_session_cooccurrence

Processing "session_start_date=2019-10-01-test.parquet"
Processing "session_start_date=2019-10-01-train.parquet"
Processing "session_start_date=2019-10-01.parquet"
Processing "session_start_date=2019-10-02-test.parquet"
Processing "session_start_date=2019-10-02-train.parquet"
Processing "session_start_date=2019-10-02.parquet"
Processing "session_start_date=2019-10-03-test.parquet"
Processing "session_start_date=2019-10-03-train.parquet"
Processing "session_start_date=2019-10-03.parquet"
Processing "session_start_date=2019-10-04-test.parquet"
Processing "session_start_date=2019-10-04-train.parquet"
Processing "session_start_date=2019-10-04.parquet"
Processing "session_start_date=2019-10-05-te

Processing "session_start_date=2019-11-18-train.parquet"
Processing "session_start_date=2019-11-18.parquet"
Processing "session_start_date=2019-11-19-test.parquet"
Processing "session_start_date=2019-11-19-train.parquet"
Processing "session_start_date=2019-11-19.parquet"
Processing "session_start_date=2019-11-20-test.parquet"
Processing "session_start_date=2019-11-20-train.parquet"
Processing "session_start_date=2019-11-20.parquet"
Processing "session_start_date=2019-11-21-test.parquet"
Processing "session_start_date=2019-11-21-train.parquet"
Processing "session_start_date=2019-11-21.parquet"
Processing "session_start_date=2019-11-22-test.parquet"
Processing "session_start_date=2019-11-22-train.parquet"
Processing "session_start_date=2019-11-22.parquet"
Processing "session_start_date=2019-11-23-test.parquet"
Processing "session_start_date=2019-11-23-train.parquet"
Processing "session_start_date=2019-11-23.parquet"
Processing "session_start_date=2019-11-24-test.parquet"
Processing "sess

### Analyzing negative samples

In [9]:
# Checking the rate of repeated negative samples for the session co-occurrence strategy,
# which didn't ignore previously sampled items in the samplers chain: (session co-occurrence, recent popularity)
# The bug was already fixed, and the dataset needs to be reprocessed
#from collections import Counter
#counts = Counter(np.hstack(sessions_df['sess_neg_pids'].apply(lambda x: [i for i in [len(set(row))for row in x.reshape(NEG_SAMPLES_SHAPE)] if i > 1]).values))
'''Counter({50: 41392, 48: 372, 49: 2361, 47: 66, 46: 11, 45: 2})'''

'Counter({50: 41392, 48: 372, 49: 2361, 47: 66, 46: 11, 45: 2})'

In [130]:
#sessions_df['sess_neg_pids'].values[0].reshape(20,50)