In [1]:
import pandas as pd
import os
import gc

In [2]:
INPUT_PATH = '/home/gmoreira/dataset/ecommerce_preproc/ecommerce_preproc_2019-10/ecommerce_preproc.parquet'
OUTPUT_PATH = '/home/gmoreira/dataset/ecommerce_preproc_split'

In [3]:
def fix_event_timestamp(event_ts_sequence):
    MAXIMUM_SESSION_DURATION_SECS = 60 * 120
    DEFAULT_ELAPSED_SECS_BETWEEN_INTERACTIONS = 60
    min_event_ts = min([e for e in event_ts_sequence if e != 0])
    max_event_ts = max(event_ts_sequence)

    if max_event_ts <= min_event_ts + MAXIMUM_SESSION_DURATION_SECS:
        return event_ts_sequence

    else:
        last_ts = None

        result = []
        for etime in event_ts_sequence:
            if etime > min_event_ts + MAXIMUM_SESSION_DURATION_SECS:
                etime = last_ts + DEFAULT_ELAPSED_SECS_BETWEEN_INTERACTIONS
            last_ts = etime
            result.append(etime)

        return result

In [4]:
def split_train_eval_parquet_file(sessions_df, parquet_path, perc_valid_set):
    #sessions_df = pd.read_parquet(parquet_path)

    # Shuffling the order of sesions
    sessions_df = sessions_df.sample(frac=1).reset_index(drop=True)

    dataset_size = len(sessions_df)

    train_set_limit = dataset_size - int(dataset_size * perc_valid_set)

    train_df = sessions_df[:train_set_limit]
    valid_df = sessions_df[train_set_limit:dataset_size]

    try:

        # Sorting the sessions by start time
        train_df.sort_values("session_start_ts", inplace=True)
        valid_df.sort_values("session_start_ts", inplace=True)

        output_train_path = parquet_path.replace(".parquet", "-train.parquet")
        output_test_path = parquet_path.replace(".parquet", "-test.parquet")

        train_df.to_parquet(output_train_path)
        valid_df.to_parquet(output_test_path)

    finally:
        del sessions_df
        del train_df
        del valid_df
        gc.collect()

In [5]:
for sessions_day_dir in sorted(os.listdir(INPUT_PATH)):
    if os.path.isdir(os.path.join(INPUT_PATH,sessions_day_dir)):
        print('Processing day: {}'.format(sessions_day_dir))
        
        dir_files = os.listdir(os.path.join(INPUT_PATH,sessions_day_dir))
        #Ignoring .crc hidden file
        parquet_name = list([f for f in dir_files if not f.startswith('.')])[0]
        parquet_name_full_path = os.path.join(INPUT_PATH, sessions_day_dir, parquet_name)
        print('Reading original file: '+parquet_name_full_path)
        pd_full = pd.read_parquet(parquet_name_full_path)
                
        # Temporary, as the event timestamps from some rare items of the pre-processed dataset are many days after the session start
        pd_full["sess_etime_seq"] = pd_full["sess_etime_seq"].apply(fix_event_timestamp)
        
        print('Saving original file...')
        output_parquet_file = os.path.join(OUTPUT_PATH, sessions_day_dir+'.parquet')
        pd_full.to_parquet(output_parquet_file)
        
        print('Spliting train and test files...')
        split_train_eval_parquet_file(pd_full, output_parquet_file, perc_valid_set=0.1)  
        
        print('')

Processing day: session_start_date=2019-10-01
Reading original file: /home/gmoreira/dataset/ecommerce_preproc/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-01/part-00066-1fb5299f-9048-47a3-bb5d-1af042cb871c.c000.snappy.parquet
Saving original file...
Spliting train and test files...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Processing day: session_start_date=2019-10-02
Reading original file: /home/gmoreira/dataset/ecommerce_preproc/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-02/part-00149-1fb5299f-9048-47a3-bb5d-1af042cb871c.c000.snappy.parquet
Saving original file...
Spliting train and test files...

Processing day: session_start_date=2019-10-03
Reading original file: /home/gmoreira/dataset/ecommerce_preproc/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-03/part-00026-1fb5299f-9048-47a3-bb5d-1af042cb871c.c000.snappy.parquet
Saving original file...
Spliting train and test files...

Processing day: session_start_date=2019-10-04
Reading original file: /home/gmoreira/dataset/ecommerce_preproc/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-04/part-00045-1fb5299f-9048-47a3-bb5d-1af042cb871c.c000.snappy.parquet
Saving original file...
Spliting train and test files...

Processing day: session_start_date=2019-10-05

Saving original file...
Spliting train and test files...

Processing day: session_start_date=2019-10-28
Reading original file: /home/gmoreira/dataset/ecommerce_preproc/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-28/part-00190-1fb5299f-9048-47a3-bb5d-1af042cb871c.c000.snappy.parquet
Saving original file...
Spliting train and test files...

Processing day: session_start_date=2019-10-29
Reading original file: /home/gmoreira/dataset/ecommerce_preproc/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-29/part-00115-1fb5299f-9048-47a3-bb5d-1af042cb871c.c000.snappy.parquet
Saving original file...
Spliting train and test files...

Processing day: session_start_date=2019-10-30
Reading original file: /home/gmoreira/dataset/ecommerce_preproc/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-30/part-00010-1fb5299f-9048-47a3-bb5d-1af042cb871c.c000.snappy.parquet
Saving original file...
Spliting train and tes

In [None]:
df_temp = pd.read_parquet('/home/gmoreira/dataset/ecommerce_preproc/ecommerce_preproc_2019-10/ecommerce_preproc.parquet/session_start_date=2019-10-04/part-00045-1fb5299f-9048-47a3-bb5d-1af042cb871c.c000.snappy.parquet')