In [11]:
import pandas as pd
import os
import gc

In [12]:
INPUT_PATH = '/mount/results/repeated_interactions=False/total_months=1/ecommerce_preproc.parquet'
OUTPUT_PATH = '/mount/results/repeated_interactions=False/total_months=1/ecommerce_preproc-split'
!mkdir -p $OUTPUT_PATH

In [13]:
def fix_event_timestamp(event_ts_sequence):
    MAXIMUM_SESSION_DURATION_SECS = 60 * 120
    DEFAULT_ELAPSED_SECS_BETWEEN_INTERACTIONS = 60
    min_event_ts = min([e for e in event_ts_sequence if e != 0])
    max_event_ts = max(event_ts_sequence)

    if max_event_ts <= min_event_ts + MAXIMUM_SESSION_DURATION_SECS:
        return event_ts_sequence

    else:
        last_ts = None

        result = []
        for etime in event_ts_sequence:
            if etime > min_event_ts + MAXIMUM_SESSION_DURATION_SECS:
                etime = last_ts + DEFAULT_ELAPSED_SECS_BETWEEN_INTERACTIONS
            last_ts = etime
            result.append(etime)

        return result

In [14]:
def split_train_eval_parquet_file(sessions_df, parquet_path, perc_valid_set, perc_test_set):
    #sessions_df = pd.read_parquet(parquet_path)

    # Shuffling the order of sesions
    sessions_df = sessions_df.sample(frac=1).reset_index(drop=True)

    dataset_size = len(sessions_df)

    valid_set_size = int(dataset_size * perc_valid_set)
    test_set_size = int(dataset_size * perc_test_set)
    train_set_limit = dataset_size - valid_set_size - test_set_size

    train_df = sessions_df[:train_set_limit]
    valid_df = sessions_df[train_set_limit:train_set_limit+valid_set_size]
    test_df = sessions_df[train_set_limit+valid_set_size:dataset_size]

    try:

        # Sorting the sessions by start time
        train_df.sort_values("session_start_ts", inplace=True)
        valid_df.sort_values("session_start_ts", inplace=True)
        test_df.sort_values("session_start_ts", inplace=True)

        output_train_path = parquet_path.replace("-full.parquet", "-train.parquet")
        output_valid_path = parquet_path.replace("-full.parquet", "-valid.parquet")
        output_test_path = parquet_path.replace("-full.parquet", "-test.parquet")

        print('Saving train set to {}'.format(output_train_path))
        train_df.to_parquet(output_train_path)
        
        print('Saving valid set to {}'.format(output_valid_path))
        valid_df.to_parquet(output_valid_path)
        
        print('Saving test set to {}'.format(output_test_path))
        test_df.to_parquet(output_test_path)

    finally:
        del sessions_df
        del train_df
        del valid_df
        del test_df
        gc.collect()

In [15]:
for sessions_day_dir in sorted(os.listdir(INPUT_PATH)):
    if os.path.isdir(os.path.join(INPUT_PATH,sessions_day_dir)):
        print('Processing day: {}'.format(sessions_day_dir))
        
        dir_files = os.listdir(os.path.join(INPUT_PATH,sessions_day_dir))
        #Ignoring .crc hidden file
        parquet_name = list([f for f in dir_files if not f.startswith('.')])[0]
        parquet_name_full_path = os.path.join(INPUT_PATH, sessions_day_dir, parquet_name)
        print('Reading original file: '+parquet_name_full_path)
        pd_full = pd.read_parquet(parquet_name_full_path)
                
        # Temporary, as the event timestamps from some rare items of the pre-processed dataset are many days after the session start
        #But, fixing timestamp at this point really is not considered for training
        #pd_full["sess_etime_seq"] = pd_full["sess_etime_seq"].apply(fix_event_timestamp)
        
        print('Saving original file...')
        output_parquet_file = os.path.join(OUTPUT_PATH, sessions_day_dir+'-full.parquet')
        pd_full.to_parquet(output_parquet_file)
        
        print('Spliting train and test files...')
        split_train_eval_parquet_file(pd_full, output_parquet_file, perc_valid_set=0.1, perc_test_set=0.1)  
        
        print('')

Processing day: session_start_date=2019-10-01
Reading original file: /mount/results/repeated_interactions=False/total_months=1/ecommerce_preproc.parquet/session_start_date=2019-10-01/part-00066-bca5d158-0ed7-49a5-8413-2de6d87aff02.c000.snappy.parquet
Saving original file...
Spliting train and test files...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Saving train set to /mount/results/repeated_interactions=False/total_months=1/ecommerce_preproc-split/session_start_date=2019-10-01-train.parquet
Saving valid set to /mount/results/repeated_interactions=False/total_months=1/ecommerce_preproc-split/session_start_date=2019-10-01-valid.parquet
Saving test set to /mount/results/repeated_interactions=False/total_months=1/ecommerce_preproc-split/session_start_date=2019-10-01-test.parquet

Processing day: session_start_date=2019-10-02
Reading original file: /mount/results/repeated_interactions=False/total_months=1/ecommerce_preproc.parquet/session_start_date=2019-10-02/part-00149-bca5d158-0ed7-49a5-8413-2de6d87aff02.c000.snappy.parquet
Saving original file...
Spliting train and test files...
Saving train set to /mount/results/repeated_interactions=False/total_months=1/ecommerce_preproc-split/session_start_date=2019-10-02-train.parquet
Saving valid set to /mount/results/repeated_interactions=False/total_months=1/ecommerce_preproc-split/session

# Checking Lengths

In [16]:
import pandas as pd

In [17]:
df = pd.read_parquet('/mount/results/repeated_interactions=True/total_months=1/ecommerce_preproc-split/session_start_date=2019-10-18-train.parquet')

In [18]:
len(df)

116441

In [19]:
df.columns

Index(['user_idx', 'user_session', 'session_start_ts', 'sess_seq_len',
       'bef_sess_seq_length', 'sess_pid_seq', 'sess_etime_seq',
       'sess_etype_seq', 'sess_csid_seq', 'sess_ccid_seq', 'sess_bid_seq',
       'sess_price_log_norm_seq', 'sess_dtime_secs_seq',
       'sess_dtime_secs_log_norm_seq', 'sess_prod_recency_days_seq',
       'sess_prod_recency_days_log_norm_seq',
       'sess_relative_price_to_avg_category_seq', 'sess_et_hour_sin_seq',
       'sess_et_hour_cos_seq', 'sess_et_month_sin_seq',
       'sess_et_month_cos_seq', 'sess_et_dayofweek_sin_seq',
       'sess_et_dayofweek_cos_seq', 'sess_et_dayofmonth_sin_seq',
       'sess_et_dayofmonth_cos_seq', 'sess_session_reversed_order_seq',
       'bef_sess_pid_seq', 'bef_sess_etime_seq', 'bef_sess_etype_seq',
       'bef_sess_csid_seq', 'bef_sess_ccid_seq', 'bef_sess_bid_seq',
       'bef_sess_price_log_seq', 'bef_sess_dtime_secs_seq',
       'bef_sess_dtime_secs_log_norm_seq', 'bef_sess_prod_recency_days_seq',
       'bef_

In [20]:
df['bef_sess_pid_seq'].apply(len).describe()

count    116441.000000
mean          9.579426
std           8.413303
min           0.000000
25%           1.000000
50%           8.000000
75%          20.000000
max          20.000000
Name: bef_sess_pid_seq, dtype: float64