### Script to create a Spark instance on NGC

```
ngc batch run --name "gmoreira-jupyterlab" --preempt RUNONCE --min-timeslice 0s --total-runtime 0s --ace nv-us-west-2 --instance dgx1v.16g.8.norm --commandline "jupyter lab --ip=0.0.0.0 --allow-root --no-browser --NotebookApp.token='recsys' --notebook-dir=/ --NotebookApp.allow_origin='*' & date; sleep 200000h" --result /mount/results --image "nvidian/rapids/spark2_on_ngc_test" --org nvidian --team prj-recsys --workspace gmoreira-wksp:/mount/workspace:RW --datasetid 62351:/mount/data/ecommerce_raw --port 6006 --port 7000 --port 8888
```

# Read, Parse, Process E-Commerce data on PySpark
eCommerce dataset: https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store

## Data Download from Kaggle

In [1]:
!pip install kaggle --upgrade

Collecting kaggle
  Downloading kaggle-1.5.10.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 8.6 MB/s  eta 0:00:01
Collecting python-slugify
  Downloading python-slugify-4.0.1.tar.gz (11 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 11.2 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: kaggle, python-slugify
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.10-py3-none-any.whl size=73269 sha256=084c829b6290fdcf6c96b77f1048d1c4e7b832e1b996ebe2ff5eae1ac6e5ecc5
  Stored in directory: /home/jovyan/.cache/pip/wheels/ea/c5/fe/7e7fb5b3d1f150fac96188949b3d83d375a4c9df16ba557e52
  Building wheel for python-slugify (setup.py) ... [?25ldone
[?25h  Created wheel for python-slugify: filename=python_slugify-4.0.1-py2.py3-none-any.whl size=6767 sha256=660b8ac4542b8c65aea8d54ba947c67fd9f3de0d1bcc57ab829bad563ef7be

In [2]:
# NOTE: first to get kaggle api tiken from account page in Kaggle. Place it at ~/.kaggle/kaggle.json
!mkdir -p ~/.kaggle/ && cp /mount/workspace/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
!mkdir -p ~/data
!cd ~/data && kaggle datasets download mkechinov/ecommerce-behavior-data-from-multi-category-store
!cd ~/data && unzip ecommerce-behavior-data-from-multi-category-store.zip

Downloading ecommerce-behavior-data-from-multi-category-store.zip to /home/jovyan/data
100%|█████████████████████████████████████▉| 4.28G/4.29G [01:17<00:00, 49.2MB/s]
100%|██████████████████████████████████████| 4.29G/4.29G [01:17<00:00, 59.6MB/s]
Archive:  ecommerce-behavior-data-from-multi-category-store.zip
  inflating: 2019-Nov.csv            
  inflating: 2019-Oct.csv            


### Downloading additional months from Google Drive

In [10]:
!pip install gdown
!cd ~/data 



In [11]:
!cd ~/data && gdown https://drive.google.com/uc?id=1qZIwMbMgMmgDC5EoMdJ8aI9lQPsWA3-P
!cd ~/data && echo "Unziping" && gunzip 2019-Dec.csv.gz

Downloading...
From: https://drive.google.com/uc?id=1qZIwMbMgMmgDC5EoMdJ8aI9lQPsWA3-P
To: /home/jovyan/data/2019-Dec.csv.gz
2.95GB [00:51, 57.0MB/s]
Unziping


In [12]:
!cd ~/data && gdown https://drive.google.com/uc?id=1x5ohrrZNhWQN4Q-zww0RmXOwctKHH9PT
!cd ~/data && echo "Unziping" && gunzip 2020-Jan.csv.gz

Downloading...
From: https://drive.google.com/uc?id=1x5ohrrZNhWQN4Q-zww0RmXOwctKHH9PT
To: /home/jovyan/data/2020-Jan.csv.gz
2.39GB [00:48, 49.5MB/s]
Unziping


In [13]:
!cd ~/data && gdown https://drive.google.com/uc?id=1-Rov9fFtGJqb7_ePc6qH-Rhzxn0cIcKB
!cd ~/data && echo "Unziping" && gunzip 2020-Feb.csv.gz

Downloading...
From: https://drive.google.com/uc?id=1-Rov9fFtGJqb7_ePc6qH-Rhzxn0cIcKB
To: /home/jovyan/data/2020-Feb.csv.gz
2.35GB [00:37, 62.0MB/s]
Unziping


In [14]:
!cd ~/data && gdown https://drive.google.com/uc?id=1zr_RXpGvOWN2PrWI6itWL8HnRsCpyqz8
!cd ~/data && echo "Unziping" && gunzip 2020-Mar.csv.gz

Downloading...
From: https://drive.google.com/uc?id=1zr_RXpGvOWN2PrWI6itWL8HnRsCpyqz8
To: /home/jovyan/data/2020-Mar.csv.gz
2.42GB [00:27, 88.8MB/s]
Unziping


In [15]:
!cd ~/data && gdown https://drive.google.com/uc?id=1g5WoIgLe05UMdREbxAjh0bEFgVCjA1UL
!cd ~/data && echo "Unziping" && gunzip 2020-Apr.csv.gz

Downloading...
From: https://drive.google.com/uc?id=1g5WoIgLe05UMdREbxAjh0bEFgVCjA1UL
To: /home/jovyan/data/2020-Apr.csv.gz
2.93GB [00:55, 53.0MB/s]
Unziping


## Configurations

In [79]:
NUM_MONTHS_TO_PREPROCESS = 1 #For the eCommerce dataset there are up to 7 months (2019-Oct to 2020-Apr)
KEEP_REPEATED_USER_INTERACTIONS = False

## Setup Spark

In [4]:
import os

In [5]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS
from pyspark.sql import Window
from pyspark.ml.linalg import Vectors, Vector, DenseVector, SparseVector, VectorUDT
from pyspark.ml.stat import Summarizer

In [6]:
from pyspark.sql import SparkSession

# Setup for 400GB Mem machine
config = pyspark.SparkConf().setAll([('spark.executor.memory', '350g'),
                                     ('spark.executor.instances', '1'),
                                     ('spark.cores.max', '64'),
                                     #('spark.cores.max', '3'),
                                     ('spark.executor.cores', '64'),
                                     ('spark.driver.memory','30g')
                                    ])
spark = SparkSession.builder.config(conf=config).appName("JobName").getOrCreate()

# Setup for 200GB Mem machine
# config = pyspark.SparkConf().setAll([('spark.executor.cores', '4'),
#                                      ('spark.executor.instances', '2'),
#                                      ('spark.executor.memory', '12g'), 
#                                      ('spark.executor.memoryOverhead', '2g'), 
#                                      #('spark.driver.cores', '4'),
#                                      #('spark.driver.memory', '12g'),
#                                      ('spark.dynamicAllocation.enabled', 'false'),
#                                      ('spark.io.compression.codec', 'snappy')
#                                     ])

# spark = SparkSession.builder.config(conf=config).appName("Recsys-Transformer-Research").getOrCreate()

## Read Data

In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [8]:
DATA_PATH = '/home/jovyan/data'

In [9]:
MONTHS_FILES = ["2019-Oct.csv", "2019-Nov.csv", "2019-Dec.csv", "2020-Jan.csv", "2020-Feb.csv", "2020-Mar.csv", "2020-Apr.csv"]

In [10]:
selected_months = MONTHS_FILES[:NUM_MONTHS_TO_PREPROCESS]
selected_months

['2019-Oct.csv']

In [11]:
files_paths = [os.path.join(DATA_PATH, file) for file in selected_months]
files_paths

['/home/jovyan/data/2019-Oct.csv']

#### Read through Spark from CSV

In [12]:
%%time
raw_df = spark.read.csv(files_paths, inferSchema = True, header = True) #.limit(1000)
raw_df.printSchema()

root
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)

CPU times: user 2.04 ms, sys: 4.34 ms, total: 6.38 ms
Wall time: 10.4 s


In [13]:
%%time
raw_df.head(2)

CPU times: user 6.14 ms, sys: 0 ns, total: 6.14 ms
Wall time: 240 ms


[Row(event_time='2019-10-01 00:00:00 UTC', event_type='view', product_id=44600062, category_id=2103807459595387724, category_code=None, brand='shiseido', price=35.79, user_id=541312140, user_session='72d76fde-8bb3-4e00-8c23-a032dfed738c'),
 Row(event_time='2019-10-01 00:00:00 UTC', event_type='view', product_id=3900821, category_id=2053013552326770905, category_code='appliances.environment.water_heater', brand='aqua', price=33.2, user_id=554748717, user_session='9333dfbd-b87a-4708-9857-6336556b0fcc')]

## Convert timestamp from datetime

In [14]:
raw_df = raw_df.withColumn('event_time_dt', F.to_timestamp('event_time','yyyy-MM-dd HH:mm:ss')) \
           .withColumn('event_time_ts', F.unix_timestamp('event_time_dt')) \

In [15]:
raw_df.show(5)

+--------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+-------------------+-------------+
|          event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|      event_time_dt|event_time_ts|
+--------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+-------------------+-------------+
|2019-10-01 00:00:...|      view|  44600062|2103807459595387724|                null|shiseido|  35.79|541312140|72d76fde-8bb3-4e0...|2019-10-01 00:00:00|   1569888000|
|2019-10-01 00:00:...|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua|   33.2|554748717|9333dfbd-b87a-470...|2019-10-01 00:00:00|   1569888000|
|2019-10-01 00:00:...|      view|  17200506|2053013559792632471|furniture.living_...|    null|  543.1|519107250|566511c2-e2e3-422...|2019-10-01 00:00:01|   1569

## Removing repeated (user,item) interactions

In [81]:
df = raw_df

In [82]:
%%time
#df.count()

CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 6.68 µs


In [83]:
#Keeping only the first user interaction with an item (ignores all future repeated interactions)
if not KEEP_REPEATED_USER_INTERACTIONS:
    df_first_user_item_interaction_df = df.groupBy(['user_id', 'product_id']).agg(F.min('event_time_ts').alias('first_user_item_event_time_ts'))
    df = df.join(df_first_user_item_interaction_df, how='inner',
                                              on=((df['user_id'] == df_first_user_item_interaction_df['user_id'])) &
                                                  (df['product_id'] == df_first_user_item_interaction_df['product_id']) &
                                                  (df['event_time_ts'] == df_first_user_item_interaction_df['first_user_item_event_time_ts'])) \
                                .drop(df_first_user_item_interaction_df['user_id']) \
                                .drop(df_first_user_item_interaction_df['product_id']) \
                                .drop(df_first_user_item_interaction_df['first_user_item_event_time_ts'])
#Keeps repeated interactions on the same items, removing only consecutive interactions, because it might be due to browser tab refreshes or different interaction types (e.g. click, add-to-card, purchase)
else:
    user_id_window = Window.partitionBy('user_id').orderBy('event_time_ts')
    df = df.withColumn('last_product_id_interacted_by_user', F.lag('product_id').over(user_id_window)) \
           .where(F.col('last_product_id_interacted_by_user') != F.col('product_id')) \
           .drop(F.col('last_product_id_interacted_by_user'))    

In [84]:
%%time
#print(df.count())

CPU times: user 1 µs, sys: 2 µs, total: 3 µs
Wall time: 5.72 µs


**Full Dataset (7 months) Stats - Number of interactions**
- No filter: 411709736
- Removed iser consecutive repeated interactions in the same items: 261390136
- Removing all user repeated interactions with the same items: 204098003

**1 Months Stats - Number of interactions**
- No filter: 42448764
- Removed iser consecutive repeated interactions in the same items: 26565608 
- Removing all user repeated interactions with the same items: 23312920

## Categorical features encoding

API: https://spark.apache.org/docs/latest/api/python/pyspark.ml.html?pyspark.ml.feature.StringIndexer#pyspark.ml.feature.StringIndexer

How to apply:
https://stackoverflow.com/questions/36942233/apply-stringindexer-to-several-columns-in-a-pyspark-dataframe

In [85]:
%%time
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

#Null values are treated by handleInvalid="keep" with a special bucket
#userid_idxer = StringIndexer(inputCol="user_id", outputCol="user_idx", handleInvalid="keep", stringOrderType="frequencyDesc")
product_idxer = StringIndexer(inputCol="product_id", outputCol="product_idx", handleInvalid="keep", stringOrderType="frequencyDesc")
category_id_idxer = StringIndexer(inputCol="category_id", outputCol="category_sub_idx", handleInvalid="keep", stringOrderType="frequencyDesc")
category_code_idxer = StringIndexer(inputCol="category_code", outputCol="category_code_idx", handleInvalid="keep", stringOrderType="frequencyDesc")
brand_idxer = StringIndexer(inputCol="brand", outputCol="brand_idx", handleInvalid="keep", stringOrderType="frequencyDesc")
event_type_idxer = StringIndexer(inputCol="event_type", outputCol="event_type_idx", handleInvalid="keep", stringOrderType="frequencyDesc")


#NOTE: userid_idxer gets error!

indexers = [product_idxer, category_id_idxer, category_code_idxer, brand_idxer, event_type_idxer]   #,userid_idxer
indexing_pipeline = Pipeline(stages = indexers)
         
indexing_model = indexing_pipeline.fit(df)
df_indexed = indexing_model.transform(df)

CPU times: user 61.4 ms, sys: 47.9 ms, total: 109 ms
Wall time: 2min 54s


### Add two to indexed columns

In [86]:
start_idx = 2

df_indexed = df_indexed.withColumn('product_idx', (F.col('product_idx')+start_idx).cast(IntegerType())) \
                       .withColumn('category_sub_idx', (F.col('category_sub_idx')+start_idx).cast(IntegerType())) \
                       .withColumn('category_code_idx', (F.col('category_code_idx')+start_idx).cast(IntegerType())) \
                       .withColumn('brand_idx', (F.col('brand_idx')+start_idx).cast(IntegerType())) \
                       .withColumn('event_type_idx', (F.col('event_type_idx')+start_idx).cast(IntegerType())) \
                       .withColumn('user_idx', F.col('user_id')+start_idx) #Keeps the original user id as it is (because it causes error on StringIndexer, due to the high cardinality)

#.withColumn('user_idx', (F.col('user_idx')+start_idx).cast(IntegerType())) \

In [87]:
df_indexed.columns

['event_time',
 'event_type',
 'category_id',
 'category_code',
 'brand',
 'price',
 'user_session',
 'event_time_dt',
 'event_time_ts',
 'user_id',
 'product_id',
 'product_idx',
 'category_sub_idx',
 'category_code_idx',
 'brand_idx',
 'event_type_idx',
 'user_idx']

### Analyze categorical statistics

In [88]:
from pyspark.sql.functions import countDistinct

In [89]:
#gr = df_indexed.agg(F.countDistinct("user_id"))
#gr.show()

In [90]:
%%time
#gr = df_indexed.agg(F.countDistinct("user_session"))
#gr.show()

CPU times: user 1e+03 ns, sys: 2 µs, total: 3 µs
Wall time: 7.15 µs


In [91]:
%%time
#gr = df_indexed.groupBy('event_type_idx').count()
#gr.show()

CPU times: user 0 ns, sys: 2 µs, total: 2 µs
Wall time: 4.29 µs


In [92]:
%%time
#df_indexed.agg(F.max('product_idx'), F.max('category_sub_idx'), F.max('category_code_idx'), 
#               F.max('brand_idx'), F.max('event_type_idx')).show()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


## Extract temporal features

In [93]:
df_indexed = df_indexed.withColumn('et_hour', F.hour('event_time_dt'))\
                     .withColumn('et_dayofweek', F.dayofweek('event_time_dt'))\
                     .withColumn('et_dayofmonth', F.dayofmonth('event_time_dt'))\
                     .withColumn('et_month', F.month('event_time_dt'))                

In [94]:
def cyclical_feature(value, func, max_value):
    if func == 'sin':
        f = np.sin
    elif func == 'cos':
        f = np.cos
    else:
        raise Exception('Invalid func (expected: sin|cos)')
        
    value_scaled = (value + 1e-8) / max_value
    return float(f(2.*np.pi*value_scaled))

@udf(returnType=FloatType())
def cyclical_feature_udf(value, func, max_value):
    return cyclical_feature(value, func, max_value)

In [95]:
# Generating cyclical features to model continuity on temporal features
df_indexed = df_indexed.withColumn('et_hour_sin', cyclical_feature_udf(F.col('et_hour'), F.lit('sin'), F.lit(24))) \
                   .withColumn('et_hour_cos', cyclical_feature_udf(F.col('et_hour'), F.lit('cos'), F.lit(24))) \
                   .withColumn('et_dayofweek_sin', cyclical_feature_udf(F.col('et_dayofweek'), F.lit('sin'), F.lit(7))) \
                   .withColumn('et_dayofweek_cos', cyclical_feature_udf(F.col('et_dayofweek'), F.lit('cos'), F.lit(7))) \
                   .withColumn('et_dayofmonth_sin', cyclical_feature_udf(F.col('et_dayofmonth'), F.lit('sin'), F.lit(31))) \
                   .withColumn('et_dayofmonth_cos', cyclical_feature_udf(F.col('et_dayofmonth'), F.lit('cos'), F.lit(31))) \
                   .withColumn('et_month_sin', cyclical_feature_udf(F.col('et_month'), F.lit('sin'), F.lit(12))) \
                   .withColumn('et_month_cos', cyclical_feature_udf(F.col('et_month'), F.lit('cos'), F.lit(12)))

In [96]:
# Recency feature
item_first_interaction_df = df_indexed.groupBy('product_id').agg(F.min('event_time_ts').alias('prod_first_event_time_ts'))
df_indexed = df_indexed.join(item_first_interaction_df, on='product_id', how='inner') \
            .withColumn('product_recency_days', (F.col('event_time_ts') - F.col('prod_first_event_time_ts')) / (60*60*24)) \
            .withColumn('product_recency_days_log', F.log1p('product_recency_days'))

In [97]:
#df.head(100)

In [98]:
#Smoothing price long-tailed distribution
df_indexed = df_indexed.withColumn('price_log', F.log1p('price'))

In [99]:
#Relative Price to the average price for the category_id
avg_category_id_prices_df = df_indexed.groupBy('category_id').agg(F.mean('price').alias('avg_category_id_price'))
df_indexed = df_indexed.join(avg_category_id_prices_df, on='category_id', how='inner') \
        .withColumn('relative_price_to_avg_category_id', (F.col('price') - F.col('avg_category_id_price')) / F.col('avg_category_id_price'))

In [100]:
#_df_sb_a.groupBy('product_id').agg(F.stddev('price').alias('std')).where(~F.isnan('std')).agg(F.mean('std')).show()

In [101]:
#_df_sb_a.groupBy('category_id').agg(F.stddev('price').alias('std')).where(~F.isnan('std')).agg(F.mean('std')).show()

In [102]:
#_df_sb_a.groupBy('category_code').agg(F.stddev('price').alias('std')).agg(F.mean('std')).show()

In [103]:
df_indexed.dtypes

[('category_id', 'bigint'),
 ('product_id', 'int'),
 ('event_time', 'string'),
 ('event_type', 'string'),
 ('category_code', 'string'),
 ('brand', 'string'),
 ('price', 'double'),
 ('user_session', 'string'),
 ('event_time_dt', 'timestamp'),
 ('event_time_ts', 'bigint'),
 ('user_id', 'int'),
 ('product_idx', 'int'),
 ('category_sub_idx', 'int'),
 ('category_code_idx', 'int'),
 ('brand_idx', 'int'),
 ('event_type_idx', 'int'),
 ('user_idx', 'int'),
 ('et_hour', 'int'),
 ('et_dayofweek', 'int'),
 ('et_dayofmonth', 'int'),
 ('et_month', 'int'),
 ('et_hour_sin', 'float'),
 ('et_hour_cos', 'float'),
 ('et_dayofweek_sin', 'float'),
 ('et_dayofweek_cos', 'float'),
 ('et_dayofmonth_sin', 'float'),
 ('et_dayofmonth_cos', 'float'),
 ('et_month_sin', 'float'),
 ('et_month_cos', 'float'),
 ('prod_first_event_time_ts', 'bigint'),
 ('product_recency_days', 'double'),
 ('product_recency_days_log', 'double'),
 ('price_log', 'double'),
 ('avg_category_id_price', 'double'),
 ('relative_price_to_avg_cate

### Normalize Continuous Features

#### Price (log)

In [104]:
%%time
price_log_mean, price_log_std = tuple(df_indexed.agg(F.mean('price_log'), F.stddev('price_log')).take(1)[0])
print(price_log_mean, price_log_std)

4.985035180576169 1.2631603275977517
CPU times: user 10.4 ms, sys: 9.86 ms, total: 20.2 ms
Wall time: 1min 35s


In [105]:
#Z-norm
df_indexed = df_indexed.withColumn('price_log_norm', (F.col('price_log') - price_log_mean) / price_log_std)

#### Elapsed days (log)

In [106]:
%%time
product_recency_days_log_mean, product_recency_days_log_std = tuple(df_indexed.agg(F.mean('product_recency_days_log'), F.stddev('product_recency_days_log')).take(1)[0])
print(product_recency_days_log_mean, product_recency_days_log_std)

2.4707174040997737 0.8161943934923657
CPU times: user 1.96 ms, sys: 18.1 ms, total: 20 ms
Wall time: 1min 20s


In [107]:
#Z-norm
df_indexed = df_indexed.withColumn('product_recency_days_log_norm', (F.col('product_recency_days_log') - product_recency_days_log_mean) / product_recency_days_log_std)

In [108]:
df_indexed.count()

23312920

In [109]:
'''
df_first_user_item_interaction_df = df_indexed.groupBy(['user_idx', 'product_idx']).agg(F.min('event_time_ts').alias('first_user_item_event_time_ts'))

df_first_user_item_interaction_df.head(10)

df_indexed_not_repeated = df_indexed.join(df_first_user_item_interaction_df, how='inner',
                                          on=((df_indexed['user_idx'] == df_first_user_item_interaction_df['user_idx'])) &
                                              (df_indexed['product_idx'] == df_first_user_item_interaction_df['product_idx']) &
                                              (df_indexed['event_time_ts'] == df_first_user_item_interaction_df['first_user_item_event_time_ts'])) \
                            .drop(df_first_user_item_interaction_df['user_idx']) \
                            .drop(df_first_user_item_interaction_df['product_idx']) \
                            .drop(df_first_user_item_interaction_df['first_user_item_event_time_ts'])

df_indexed_not_repeated.count()
'''

"\ndf_first_user_item_interaction_df = df_indexed.groupBy(['user_idx', 'product_idx']).agg(F.min('event_time_ts').alias('first_user_item_event_time_ts'))\n\ndf_first_user_item_interaction_df.head(10)\n\ndf_indexed_not_repeated = df_indexed.join(df_first_user_item_interaction_df, how='inner',\n                                          on=((df_indexed['user_idx'] == df_first_user_item_interaction_df['user_idx'])) &\n                                              (df_indexed['product_idx'] == df_first_user_item_interaction_df['product_idx']) &\n                                              (df_indexed['event_time_ts'] == df_first_user_item_interaction_df['first_user_item_event_time_ts']))                             .drop(df_first_user_item_interaction_df['user_idx'])                             .drop(df_first_user_item_interaction_df['product_idx'])                             .drop(df_first_user_item_interaction_df['first_user_item_event_time_ts'])\n\ndf_indexed_not_repeated.count()\n"

## Computing elapsed time since last interaction (on non-repeated items)

In [110]:
user_window = Window.partitionBy('user_idx').orderBy('event_time_ts')

In [111]:
df_indexed = df_indexed.withColumn('prev_event_time_ts', F.lag('event_time_ts').over(user_window)) \
                            .withColumn('delta_event_secs',F.when(F.isnull(F.col('prev_event_time_ts')), 0) \
                                                            .otherwise(F.col('event_time_ts') - F.col('prev_event_time_ts'))) \
                            .withColumn('delta_event_secs_log', F.log1p('delta_event_secs'))

In [112]:
%%time
session_delta_time_mean, session_delta_time_std = tuple(df_indexed.agg(
                            F.mean('delta_event_secs_log').alias('delta_event_secs_log_mean'),
                            F.stddev('delta_event_secs_log').alias('delta_event_secs_log_std')).take(1)[0])
print(session_delta_time_mean, session_delta_time_std)

#Z-norm
df_indexed = df_indexed.withColumn('delta_event_secs_log_norm', (F.col('delta_event_secs_log') - session_delta_time_mean) / session_delta_time_std)

4.783448763418383 3.492059719833784
CPU times: user 3.88 ms, sys: 16.6 ms, total: 20.5 ms
Wall time: 1min 21s


In [113]:
df_indexed.printSchema()

root
 |-- category_id: long (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_session: string (nullable = true)
 |-- event_time_dt: timestamp (nullable = true)
 |-- event_time_ts: long (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- product_idx: integer (nullable = true)
 |-- category_sub_idx: integer (nullable = true)
 |-- category_code_idx: integer (nullable = true)
 |-- brand_idx: integer (nullable = true)
 |-- event_type_idx: integer (nullable = true)
 |-- user_idx: integer (nullable = true)
 |-- et_hour: integer (nullable = true)
 |-- et_dayofweek: integer (nullable = true)
 |-- et_dayofmonth: integer (nullable = true)
 |-- et_month: integer (nullable = true)
 |-- et_hour_sin: float (nullable = true)
 |-- et_hour_cos: float (nullable = true)
 |--

## Processing sessions sequences

#### Aggregate by session id (create sequence as type of array)

In [114]:
session_window = Window.partitionBy('user_session').orderBy('event_time_ts')

In [115]:
def get_non_repeated_items(values):
    #Returns unique items, keep the order of their first occurence
    result = []
    for v in values:
        if v not in result:
            result.append(v)
    return result

@udf(returnType=ArrayType(IntegerType()))
def get_non_repeated_items_integer_udf(values):
    result = get_non_repeated_items(values)
    result = list([int(x) for x in result])
    return result

#########################################

def get_non_repeated_additional_items(item_ids, additional_feature_values):
    #Returns unique items, keep the order of their first occurence
    ids = []
    result = []
    for i, v in zip(item_ids, additional_feature_values):
        if i not in ids:
            ids.append(i)
            result.append(v)
    return result

@udf(returnType=ArrayType(IntegerType()))
def get_non_repeated_additional_items_integer_udf(item_ids, additional_feature_values):
    result = get_non_repeated_additional_items(item_ids, additional_feature_values)
    result = list([int(x) for x in result])
    return result

@udf(returnType=ArrayType(FloatType()))
def get_non_repeated_additional_items_float_udf(item_ids, additional_feature_values):
    result =  get_non_repeated_additional_items(item_ids, additional_feature_values)
    result = list([float(x) for x in result])
    return result

In [116]:
SESSIONS_MAX_LENGTH = 20

In [117]:
df_sb_grouped_df = df_indexed \
                .select('user_idx', 'user_session', 
                        F.first('event_time_ts').over(session_window).alias('session_start_ts'),
                        F.last('event_time_ts').over(session_window).alias('session_end_ts'),
                        F.slice(F.collect_list('product_idx').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('pid_seq_als'), 
                        F.slice(F.collect_list('event_time_ts').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('etime_seq_als'),
                        F.slice(F.collect_list('event_type_idx').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('event_type_seq_als'), 
                        F.slice(F.collect_list('category_sub_idx').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('csid_seq_als'),
                        F.slice(F.collect_list('category_code_idx').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('ccid_seq_als'),
                        F.slice(F.collect_list('brand_idx').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('bid_seq_als'),
                        F.slice(F.collect_list('price_log_norm').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('price_log_norm_seq_als'),                    
                        F.slice(F.collect_list('delta_event_secs').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('dtime_secs_seq_als'),
                        F.slice(F.collect_list('delta_event_secs_log_norm').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('dtime_secs_log_norm_seq_als'),                     
                        F.slice(F.collect_list('product_recency_days').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('prod_recency_days_als'),
                        F.slice(F.collect_list('product_recency_days_log_norm').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('prod_recency_days_log_norm_als'),
                        F.slice(F.collect_list('relative_price_to_avg_category_id').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('relative_price_to_avg_category_als'),
                        F.slice(F.collect_list('et_hour_sin').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('et_hour_sin_seq_als'),
                        F.slice(F.collect_list('et_hour_cos').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('et_hour_cos_seq_als'),
                        F.slice(F.collect_list('et_month_sin').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('et_month_sin_seq_als'),
                        F.slice(F.collect_list('et_month_cos').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('et_month_cos_seq_als'),
                        F.slice(F.collect_list('et_dayofweek_sin').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('et_dayofweek_sin_seq_als'),
                        F.slice(F.collect_list('et_dayofweek_cos').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('et_dayofweek_cos_seq_als'),
                        F.slice(F.collect_list('et_dayofmonth_sin').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('et_dayofmonth_sin_seq_als'),
                        F.slice(F.collect_list('et_dayofmonth_cos').over(session_window), 1, SESSIONS_MAX_LENGTH).alias('et_dayofmonth_cos_seq_als'),                        
                       )\
                .groupBy('user_idx', 'user_session').agg(
                    F.min('session_start_ts').alias('session_start_ts'),
                    F.max('session_end_ts').alias('session_end_ts'),
                    F.max('pid_seq_als').alias('sess_pid_seq'),
                    F.max('etime_seq_als').alias('sess_etime_seq'),
                    F.max('event_type_seq_als').alias('sess_etype_seq'),    
                    F.max('csid_seq_als').alias('sess_csid_seq'),
                    F.max('ccid_seq_als').alias('sess_ccid_seq'),
                    F.max('bid_seq_als').alias('sess_bid_seq'),
                    F.max('price_log_norm_seq_als').alias('sess_price_log_norm_seq'),        
                    F.max('dtime_secs_seq_als').alias('sess_dtime_secs_seq'),
                    F.max('dtime_secs_log_norm_seq_als').alias('sess_dtime_secs_log_norm_seq'),                    
                    F.max('prod_recency_days_als').alias('sess_prod_recency_days_seq'),
                    F.max('prod_recency_days_log_norm_als').alias('sess_prod_recency_days_log_norm_seq'),
                    F.max('relative_price_to_avg_category_als').alias('sess_relative_price_to_avg_category_seq'),
                    F.max('et_hour_sin_seq_als').alias('sess_et_hour_sin_seq'),
                    F.max('et_hour_cos_seq_als').alias('sess_et_hour_cos_seq'),
                    F.max('et_month_sin_seq_als').alias('sess_et_month_sin_seq'),
                    F.max('et_month_cos_seq_als').alias('sess_et_month_cos_seq'),
                    F.max('et_dayofweek_sin_seq_als').alias('sess_et_dayofweek_sin_seq'),
                    F.max('et_dayofweek_cos_seq_als').alias('sess_et_dayofweek_cos_seq'),
                    F.max('et_dayofmonth_sin_seq_als').alias('sess_et_dayofmonth_sin_seq'),
                    F.max('et_dayofmonth_cos_seq_als').alias('sess_et_dayofmonth_cos_seq'),               
                        )

In [118]:
'''
df_sb_grouped_nr_df = df_sb_grouped_df.select('user_idx', 'user_session', 'session_start_ts', 'session_end_ts',
                        get_non_repeated_items_integer_udf('sess_pid_seq').alias('sess_pid_seq'),
                        get_non_repeated_additional_items_integer_udf(F.col("sess_pid_seq"), F.col('sess_etime_seq')).alias('sess_etime_seq'),
                        get_non_repeated_additional_items_integer_udf(F.col("sess_pid_seq"), F.col('sess_etype_seq')).alias('sess_etype_seq'),                        
                        get_non_repeated_additional_items_integer_udf(F.col("sess_pid_seq"), F.col('sess_csid_seq')).alias('sess_csid_seq'),
                        get_non_repeated_additional_items_integer_udf(F.col("sess_pid_seq"), F.col('sess_ccid_seq')).alias('sess_ccid_seq'),
                        get_non_repeated_additional_items_integer_udf(F.col("sess_pid_seq"), F.col('sess_bid_seq')).alias('sess_bid_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_price_log_norm_seq')).alias('sess_price_log_norm_seq'),                                              
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_dtime_secs_seq')).alias('sess_dtime_secs_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_dtime_secs_log_norm_seq')).alias('sess_dtime_secs_log_norm_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_prod_recency_days_seq')).alias('sess_prod_recency_days_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_prod_recency_days_log_norm_seq')).alias('sess_prod_recency_days_log_norm_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_relative_price_to_avg_category_seq')).alias('sess_relative_price_to_avg_category_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_et_hour_sin_seq')).alias('sess_et_hour_sin_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_et_hour_cos_seq')).alias('sess_et_hour_cos_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_et_month_sin_seq')).alias('sess_et_month_sin_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_et_month_cos_seq')).alias('sess_et_month_cos_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_et_dayofweek_sin_seq')).alias('sess_et_dayofweek_sin_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_et_dayofweek_cos_seq')).alias('sess_et_dayofweek_cos_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_et_dayofmonth_sin_seq')).alias('sess_et_dayofmonth_sin_seq'),
                        get_non_repeated_additional_items_float_udf(F.col("sess_pid_seq"), F.col('sess_et_dayofmonth_cos_seq')).alias('sess_et_dayofmonth_cos_seq'),
                       ) \
            .where(F.size('sess_pid_seq') >= 2) \
            .withColumn('sess_seq_len', F.size('sess_pid_seq'))
'''            


df_sb_grouped_nr_df = df_sb_grouped_df.select('user_idx', 'user_session', 'session_start_ts', 'session_end_ts',
                        'sess_pid_seq', 'sess_etime_seq', 'sess_etype_seq', 'sess_csid_seq', 'sess_ccid_seq', 'sess_bid_seq', 
                        'sess_price_log_norm_seq', 'sess_dtime_secs_seq', 'sess_dtime_secs_log_norm_seq',
                        'sess_prod_recency_days_seq', 'sess_prod_recency_days_log_norm_seq', 'sess_relative_price_to_avg_category_seq',
                        'sess_et_hour_sin_seq', 'sess_et_hour_cos_seq', 'sess_et_month_sin_seq', 'sess_et_month_cos_seq',
                        'sess_et_dayofweek_sin_seq', 'sess_et_dayofweek_cos_seq', 'sess_et_dayofmonth_sin_seq', 'sess_et_dayofmonth_cos_seq',
                       ) \
            .where(F.size('sess_pid_seq') >= 2) \
            .withColumn('sess_seq_len', F.size('sess_pid_seq'))

In [119]:
df_sb_grouped_nr_df.printSchema()

root
 |-- user_idx: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- session_start_ts: long (nullable = true)
 |-- session_end_ts: long (nullable = true)
 |-- sess_pid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_etime_seq: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- sess_etype_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_csid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_ccid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_bid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_price_log_norm_seq: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- sess_dtime_secs_seq: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- sess_dtime_secs_log_norm_seq: array (nullable = true)
 |    |-

In [120]:
percentiles = np.arange(0., 1.1, 0.1).tolist()+[0.95,0.99,0.999]

In [121]:
'''
session_sizes_pdf = pd.DataFrame(zip(percentiles, 
                                          df_sb_grouped_nr_df.approxQuantile('sess_seq_len', probabilities=percentiles, relativeError=0.00001)),
                                         columns=['percentile', 'session_sizes']).sort_values('percentile')
session_sizes_pdf
'''

"\nsession_sizes_pdf = pd.DataFrame(zip(percentiles, \n                                          df_sb_grouped_nr_df.approxQuantile('sess_seq_len', probabilities=percentiles, relativeError=0.00001)),\n                                         columns=['percentile', 'session_sizes']).sort_values('percentile')\nsession_sizes_pdf\n"

In [122]:
df_sb_grouped_nr_df.printSchema()

root
 |-- user_idx: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- session_start_ts: long (nullable = true)
 |-- session_end_ts: long (nullable = true)
 |-- sess_pid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_etime_seq: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- sess_etype_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_csid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_ccid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_bid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_price_log_norm_seq: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- sess_dtime_secs_seq: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- sess_dtime_secs_log_norm_seq: array (nullable = true)
 |    |-

In [123]:
##Generating an example parquet file with sequences with different length (sparse) to test with NVT + PyTorch pipeline
#df_sb_grouped_nr_df.withColumn('session_start_date', F.date_trunc('day', F.to_timestamp(F.col('session_start_ts'))).cast('string').substr(0,10).alias()) \
#            .repartition(F.col('session_start_date')) \
#                .write.partitionBy('session_start_date').parquet('/mount/workspace/transformers/data_sparse_example')

## Processing user sequences

In [124]:
MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION = 20

#### Delta time between user clicks (secs)

In [125]:
df_user_seq_grouped_df = df_indexed \
                .select('user_idx',  
                        F.collect_list('user_session').over(user_window).alias('session_seq_als'),
                        F.collect_list('product_idx').over(user_window).alias('pid_seq_als'), 
                        F.collect_list('event_time_ts').over(user_window).alias('etime_seq_als'),
                        F.collect_list('event_type_idx').over(user_window).alias('event_type_seq_als'), 
                        F.collect_list('category_sub_idx').over(user_window).alias('csid_seq_als'),
                        F.collect_list('category_code_idx').over(user_window).alias('ccid_seq_als'),
                        F.collect_list('brand_idx').over(user_window).alias('bid_seq_als'),
                        F.collect_list('price_log_norm').over(user_window).alias('price_log_seq_als'),
                        
                        F.collect_list('delta_event_secs').over(session_window).alias('dtime_secs_seq_als'),
                        F.collect_list('delta_event_secs_log_norm').over(session_window).alias('dtime_secs_log_norm_seq_als'),                     
                        F.collect_list('product_recency_days').over(session_window).alias('prod_recency_days_als'),
                        F.collect_list('product_recency_days_log_norm').over(session_window).alias('prod_recency_days_log_norm_als'),
                                        
                        F.collect_list('relative_price_to_avg_category_id').over(user_window).alias('relative_price_to_avg_category_als'),
                        F.collect_list('et_hour_sin').over(user_window).alias('et_hour_sin_seq_als'),
                        F.collect_list('et_hour_cos').over(user_window).alias('et_hour_cos_seq_als'),
                        F.collect_list('et_month_sin').over(user_window).alias('et_month_sin_seq_als'),
                        F.collect_list('et_month_cos').over(user_window).alias('et_month_cos_seq_als'),
                        F.collect_list('et_dayofweek_sin').over(user_window).alias('et_dayofweek_sin_seq_als'),
                        F.collect_list('et_dayofweek_cos').over(user_window).alias('et_dayofweek_cos_seq_als'),
                        F.collect_list('et_dayofmonth_sin').over(user_window).alias('et_dayofmonth_sin_seq_als'),
                        F.collect_list('et_dayofmonth_cos').over(user_window).alias('et_dayofmonth_cos_seq_als'),                     
                       )\
                .groupBy('user_idx').agg(
                    F.max('session_seq_als').alias('user_session_seq'),
                    F.max('pid_seq_als').alias('user_pid_seq'),
                    F.max('etime_seq_als').alias('user_etime_seq'),
                    F.max('event_type_seq_als').alias('user_etype_seq'),    
                    F.max('csid_seq_als').alias('user_csid_seq'),
                    F.max('ccid_seq_als').alias('user_ccid_seq'),
                    F.max('bid_seq_als').alias('user_bid_seq'),
                    F.max('price_log_seq_als').alias('user_price_log_seq'),  
    
                    F.max('dtime_secs_seq_als').alias('user_dtime_secs_seq'),
                    F.max('dtime_secs_log_norm_seq_als').alias('user_dtime_secs_log_norm_seq'),
                    F.max('prod_recency_days_als').alias('user_prod_recency_days_seq'),
                    F.max('prod_recency_days_log_norm_als').alias('user_prod_recency_days_log_norm_seq'),
    
                    F.max('relative_price_to_avg_category_als').alias('user_relative_price_to_avg_category_seq'),
                    F.max('et_hour_sin_seq_als').alias('user_et_hour_sin_seq'),
                    F.max('et_hour_cos_seq_als').alias('user_et_hour_cos_seq'),
                    F.max('et_month_sin_seq_als').alias('user_et_month_sin_seq'),
                    F.max('et_month_cos_seq_als').alias('user_et_month_cos_seq'),
                    F.max('et_dayofweek_sin_seq_als').alias('user_et_dayofweek_sin_seq'),
                    F.max('et_dayofweek_cos_seq_als').alias('user_et_dayofweek_cos_seq'),
                    F.max('et_dayofmonth_sin_seq_als').alias('user_et_dayofmonth_sin_seq'),
                    F.max('et_dayofmonth_cos_seq_als').alias('user_et_dayofmonth_cos_seq'),               
                        )

In [126]:
df_user_seq_grouped_df.printSchema()

root
 |-- user_idx: integer (nullable = true)
 |-- user_session_seq: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- user_pid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- user_etime_seq: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- user_etype_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- user_csid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- user_ccid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- user_bid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- user_price_log_seq: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- user_dtime_secs_seq: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- user_dtime_secs_log_norm_seq: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |--

In [127]:
df_sb_grouped_nr_df.columns

['user_idx',
 'user_session',
 'session_start_ts',
 'session_end_ts',
 'sess_pid_seq',
 'sess_etime_seq',
 'sess_etype_seq',
 'sess_csid_seq',
 'sess_ccid_seq',
 'sess_bid_seq',
 'sess_price_log_norm_seq',
 'sess_dtime_secs_seq',
 'sess_dtime_secs_log_norm_seq',
 'sess_prod_recency_days_seq',
 'sess_prod_recency_days_log_norm_seq',
 'sess_relative_price_to_avg_category_seq',
 'sess_et_hour_sin_seq',
 'sess_et_hour_cos_seq',
 'sess_et_month_sin_seq',
 'sess_et_month_cos_seq',
 'sess_et_dayofweek_sin_seq',
 'sess_et_dayofweek_cos_seq',
 'sess_et_dayofmonth_sin_seq',
 'sess_et_dayofmonth_cos_seq',
 'sess_seq_len']

### Joining users and sessions sequences

In [128]:
df_user_seq_grouped_df = df_user_seq_grouped_df.withColumnRenamed('user_idx', 'user_idx2')
users_and_session_seq_joined_df = df_sb_grouped_nr_df \
                .join(df_user_seq_grouped_df,
                      on=(df_sb_grouped_nr_df['user_idx'] == df_user_seq_grouped_df['user_idx2']), how='inner') \
                .drop(F.col('user_idx2'))

In [129]:
users_and_session_seq_joined_df.printSchema()

root
 |-- user_idx: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- session_start_ts: long (nullable = true)
 |-- session_end_ts: long (nullable = true)
 |-- sess_pid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_etime_seq: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- sess_etype_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_csid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_ccid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_bid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_price_log_norm_seq: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- sess_dtime_secs_seq: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- sess_dtime_secs_log_norm_seq: array (nullable = true)
 |    |-

In [130]:
def user_seq_feature_before_session(session_start, user_ts_seq, user_feature_seq, limit):
    new_seq = list(map(lambda y: y[1], filter(lambda x: x[0] < session_start, zip(user_ts_seq, user_feature_seq))))[-limit:]
    return new_seq

@udf(returnType=ArrayType(IntegerType()))
def user_seq_before_session_integer_udf(session_start, user_ts_seq, user_feature_seq, limit):
    result = user_seq_feature_before_session(session_start, user_ts_seq, user_feature_seq, limit)
    return list([int(x) for x in result])

@udf(returnType=ArrayType(FloatType()))
def user_seq_before_session_float_udf(session_start, user_ts_seq, user_feature_seq, limit):
    result = user_seq_feature_before_session(session_start, user_ts_seq, user_feature_seq, limit)
    return list([float(x) for x in result])

@udf(returnType=ArrayType(StringType()))
def user_seq_before_session_str_udf(session_start, user_ts_seq, user_feature_seq, limit):
    result = user_seq_feature_before_session(session_start, user_ts_seq, user_feature_seq, limit)
    return list([str(x) for x in result])

In [131]:
def sessions_reversed_order(session_ids):
    last_session_id = ""
    counter = 1
    sessions_orders = []
    for session_id in reversed(session_ids):
        if session_id != last_session_id:
            counter += 1
            last_session_id = session_id
        sessions_orders.append(counter)
    return list(reversed(sessions_orders))


@udf(returnType=ArrayType(IntegerType()))
def sessions_reversed_order_udf(session_ids):
    result = sessions_reversed_order(session_ids)
    return list([int(x) for x in result])

In [132]:
@udf(returnType=ArrayType(IntegerType()))
def get_repeated_values_array_int_udf(value, repeat_times):
    return [value] * repeat_times

In [133]:
sessions_reversed_order(["a", "a", "a", "a", "b", "c", "c", "d"])

[5, 5, 5, 5, 4, 3, 3, 2]

In [134]:
session_seq_and_prev_users_seq_df = users_and_session_seq_joined_df \
            .withColumn('bef_sess_pid_seq', user_seq_before_session_integer_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_pid_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_etime_seq', user_seq_before_session_integer_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_etime_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_etype_seq', user_seq_before_session_integer_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_etype_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_csid_seq', user_seq_before_session_integer_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_csid_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_ccid_seq', user_seq_before_session_integer_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_ccid_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_bid_seq', user_seq_before_session_integer_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_bid_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_price_log_norm_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_price_log_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_dtime_secs_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_dtime_secs_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_dtime_secs_log_norm_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_dtime_secs_log_norm_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_prod_recency_days_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_prod_recency_days_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_prod_recency_days_log_norm_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_prod_recency_days_log_norm_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_relative_price_to_avg_category_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_relative_price_to_avg_category_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_et_hour_sin_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_et_hour_sin_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_et_hour_cos_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_et_hour_cos_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_et_month_sin_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_et_month_sin_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_et_month_cos_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_et_month_cos_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_et_dayofweek_sin_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_et_dayofweek_sin_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_et_dayofweek_cos_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_et_dayofweek_cos_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_et_dayofmonth_sin_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_et_dayofmonth_sin_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_et_dayofmonth_cos_seq', user_seq_before_session_float_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_et_dayofmonth_cos_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION))) \
            .withColumn('bef_sess_session_reversed_order_seq', sessions_reversed_order_udf(user_seq_before_session_str_udf(F.col('session_start_ts'), F.col('user_etime_seq'),  F.col('user_session_seq'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION)))) \
            .withColumn('sess_session_reversed_order_seq', get_repeated_values_array_int_udf(F.lit(1), F.lit(SESSIONS_MAX_LENGTH))) \
            .withColumn('bef_sess_seq_length', F.size('bef_sess_pid_seq'))

# Exporting data

#### Option A: Pad zeros to make all sequences same length
NOTE: it would be not memory-efficient. But currently it seems no way to utilize Petastorm (parquet -> dataloader) without this option.

In [135]:
from pyspark.sql.types import ArrayType, IntegerType

def pad_array(values, expected_length, trunc_start, dtype=int):
    #if dtype is float:
    #    value_to_pad = [0.]
    #else:
    #    value_to_pad = [0]
    #value_to_pad = [dtype(0)]
        
    res = list([dtype(x) for x in values]) + ([dtype(0)] * (expected_length-len(values)))
    
    # Pick first N
    if trunc_start:
        res = res[:expected_length]
    #Pick last N
    else:
        res = res[-expected_length:]
        
    return res


@udf(returnType=ArrayType(IntegerType()))
def pad_array_int(values, expected_length, trunc_start):
    return pad_array(values, expected_length, trunc_start)

@udf(returnType=ArrayType(LongType()))
def pad_array_long(values, expected_length, trunc_start):
    return pad_array(values, expected_length, trunc_start)

@udf(returnType=ArrayType(FloatType()))
def pad_array_float(values, expected_length, trunc_start):
    return pad_array(values, expected_length, trunc_start, dtype=float)

In [136]:
'''
#Export padded sequences for compatibility with PetaStorm data loader. NVTabular and PyArrow supports list columns with different lengths
sessions_users_seqs_to_export_df = \
        session_seq_and_prev_users_seq_df.select('user_idx', 'user_session', 'sess_seq_len', 
                                                 'session_start_ts',  
                                                 F.date_trunc('day', F.to_timestamp(F.col('session_start_ts'))).cast('string').substr(0,10).alias('session_start_date'), 
                                                 'user_seq_length_bef_sess', 'user_elapsed_days_bef_sess', 'user_elapsed_days_log_bef_sess_norm',
                        # Session sequences (first N interactions)
                        pad_array_long(F.col('sess_pid_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_pid_seq'),
                        pad_array_long(F.col('sess_etime_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_etime_seq'),
                        pad_array_int(F.col('sess_etype_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_etype_seq'),
                        pad_array_int(F.col('sess_csid_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_csid_seq'),
                        pad_array_int(F.col('sess_ccid_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_ccid_seq'),
                        pad_array_int(F.col('sess_bid_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_bid_seq'),
                        pad_array_float(F.col('sess_price_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_price_seq'),
                        pad_array_float(F.col('sess_dtime_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_dtime_seq'),
                        pad_array_float(F.col('sess_product_recency_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_product_recency_seq'),
                        pad_array_float(F.col('sess_relative_price_to_avg_category_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_relative_price_to_avg_category_seq'),
                        pad_array_float(F.col('sess_et_hour_sin_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_et_hour_sin_seq'), 
                        pad_array_float(F.col('sess_et_hour_cos_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_et_hour_cos_seq'),
                        pad_array_float(F.col('sess_et_month_sin_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_et_month_sin_seq'),
                        pad_array_float(F.col('sess_et_month_cos_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_et_month_cos_seq'),
                        pad_array_float(F.col('sess_et_dayofweek_sin_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_et_dayofweek_sin_seq'),
                        pad_array_float(F.col('sess_et_dayofweek_cos_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_et_dayofweek_cos_seq'),
                        pad_array_float(F.col('sess_et_dayofmonth_sin_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_et_dayofmonth_sin_seq'),
                        pad_array_float(F.col('sess_et_dayofmonth_cos_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_et_dayofmonth_cos_seq'),
                         # Users sequences before session (last M interactions) 
                         pad_array_long(F.col('user_pid_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_pid_seq_bef_sess'),
                         pad_array_long(F.col('user_etime_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_etime_seq_bef_sess'),
                         pad_array_int(F.col('user_etype_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_etype_seq_bef_sess'),
                         pad_array_int(F.col('user_csid_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_csid_seq_bef_sess'),
                         pad_array_int(F.col('user_ccid_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_ccid_seq_bef_sess'),
                         pad_array_int(F.col('user_bid_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_bid_seq_bef_sess'),
                         pad_array_float(F.col('user_price_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_price_seq_bef_sess'),
                         pad_array_float(F.col('user_dtime_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_dtime_seq_bef_sess'),
                         pad_array_float(F.col('user_product_recency_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_product_recency_seq_bef_sess'),
                         pad_array_float(F.col('user_relative_price_to_avg_category_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_relative_price_to_avg_category_seq_bef_sess'),
                         pad_array_float(F.col('user_et_hour_sin_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_et_hour_sin_seq_bef_sess'),
                         pad_array_float(F.col('user_et_hour_cos_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_et_hour_cos_seq_bef_sess'),
                         pad_array_float(F.col('user_et_month_sin_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_et_month_sin_seq_bef_sess'),
                         pad_array_float(F.col('user_et_month_cos_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_et_month_cos_seq_bef_sess'),
                         pad_array_float(F.col('user_et_dayofweek_sin_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_et_dayofweek_sin_seq_bef_sess'),
                         pad_array_float(F.col('user_et_dayofweek_cos_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_et_dayofweek_cos_seq_bef_sess'),
                         pad_array_float(F.col('user_et_dayofmonth_sin_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_et_dayofmonth_sin_seq_bef_sess'),
                         pad_array_float(F.col('user_et_dayofmonth_cos_seq_bef_sess'), F.lit(MAX_LENGTH_USER_SEQUENCE_BEFORE_SESSION), F.lit(False)).alias('user_et_dayofmonth_cos_seq_bef_sess'),
                    )
'''                    

"\n#Export padded sequences for compatibility with PetaStorm data loader. NVTabular and PyArrow supports list columns with different lengths\nsessions_users_seqs_to_export_df =         session_seq_and_prev_users_seq_df.select('user_idx', 'user_session', 'sess_seq_len', \n                                                 'session_start_ts',  \n                                                 F.date_trunc('day', F.to_timestamp(F.col('session_start_ts'))).cast('string').substr(0,10).alias('session_start_date'), \n                                                 'user_seq_length_bef_sess', 'user_elapsed_days_bef_sess', 'user_elapsed_days_log_bef_sess_norm',\n                        # Session sequences (first N interactions)\n                        pad_array_long(F.col('sess_pid_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_pid_seq'),\n                        pad_array_long(F.col('sess_etime_seq'), F.lit(SESSIONS_MAX_LENGTH), F.lit(True)).alias('sess_etime_seq'),\n            

In [137]:
sessions_users_seqs_to_export_df = \
        session_seq_and_prev_users_seq_df.select(
                        'user_idx', 'user_session', 'session_start_ts', 'sess_seq_len',  'bef_sess_seq_length',  
                        F.date_trunc('day', F.to_timestamp(F.col('session_start_ts'))).cast('string').substr(0,10).alias('session_start_date'), 
                        # Session sequences (first N interactions)
                        'sess_pid_seq',
                        'sess_etime_seq',
                        'sess_etype_seq',
                        'sess_csid_seq',
                        'sess_ccid_seq',
                        'sess_bid_seq',
                        'sess_price_log_norm_seq',
                        'sess_dtime_secs_seq',
                        'sess_dtime_secs_log_norm_seq',
                        'sess_prod_recency_days_seq',
                        'sess_prod_recency_days_log_norm_seq',
                        'sess_relative_price_to_avg_category_seq',
                        'sess_et_hour_sin_seq', 
                        'sess_et_hour_cos_seq',
                        'sess_et_month_sin_seq',
                        'sess_et_month_cos_seq',
                        'sess_et_dayofweek_sin_seq',
                        'sess_et_dayofweek_cos_seq',
                        'sess_et_dayofmonth_sin_seq',
                        'sess_et_dayofmonth_cos_seq',
                        'sess_session_reversed_order_seq',
                         # Users sequences before session (last M interactions) 
                         'bef_sess_pid_seq',
                         'bef_sess_etime_seq',
                         'bef_sess_etype_seq',
                         'bef_sess_csid_seq',
                         'bef_sess_ccid_seq',
                         'bef_sess_bid_seq',
                         'bef_sess_price_log_norm_seq',
                         'bef_sess_dtime_secs_seq',
                         'bef_sess_dtime_secs_log_norm_seq',                         
                         'bef_sess_prod_recency_days_seq',
                         'bef_sess_prod_recency_days_log_norm_seq',   
                         'bef_sess_relative_price_to_avg_category_seq',
                         'bef_sess_et_hour_sin_seq',
                         'bef_sess_et_hour_cos_seq',
                         'bef_sess_et_month_sin_seq',
                         'bef_sess_et_month_cos_seq',
                         'bef_sess_et_dayofweek_sin_seq',
                         'bef_sess_et_dayofweek_cos_seq',
                         'bef_sess_et_dayofmonth_sin_seq',
                         'bef_sess_et_dayofmonth_cos_seq',
                         'bef_sess_session_reversed_order_seq',
                    )

In [138]:
sessions_users_seqs_to_export_df.printSchema()

root
 |-- user_idx: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- session_start_ts: long (nullable = true)
 |-- sess_seq_len: integer (nullable = false)
 |-- bef_sess_seq_length: integer (nullable = false)
 |-- session_start_date: string (nullable = true)
 |-- sess_pid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_etime_seq: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- sess_etype_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_csid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_ccid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_bid_seq: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- sess_price_log_norm_seq: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- sess_dtime_secs_seq: array (nullable = true)
 |    |-

In [139]:
#sessions_users_seqs_to_export_df.show()

#### Export dataset to parquet, partioned by the session date

In [140]:
%%time
OUTPUT_PATH = '/mount/results/repeated_interactions={}/total_months={}'.format(KEEP_REPEATED_USER_INTERACTIONS, NUM_MONTHS_TO_PREPROCESS)
sessions_users_seqs_to_export_df.repartition(F.col('session_start_date')) \
                .write.partitionBy('session_start_date') \
                .parquet(os.path.join(OUTPUT_PATH, 'ecommerce_preproc.parquet'))

CPU times: user 241 ms, sys: 217 ms, total: 457 ms
Wall time: 15min 20s


#### Check exported dataset

In [141]:
check_df = spark.read.parquet(os.path.join(OUTPUT_PATH, 'ecommerce_preproc.parquet'))

In [142]:
check_df.take(10)

[Row(user_idx=503175767, user_session='a0bd23ae-2ee8-4b0f-b628-437b9137ccc9', session_start_ts=1570938717, sess_seq_len=6, bef_sess_seq_length=1, sess_pid_seq=[19764, 23487, 20743, 30416, 19282, 48790], sess_etime_seq=[1570938717, 1570938775, 1570938826, 1570938875, 1570938890, 1570938930], sess_etype_seq=[2, 2, 2, 2, 2, 2], sess_csid_seq=[59, 59, 59, 59, 59, 59], sess_ccid_seq=[38, 38, 38, 38, 38, 38], sess_bid_seq=[80, 80, 80, 80, 80, 80], sess_price_log_norm_seq=[-0.051558973702671464, -0.19539975681449054, 0.22817432520591252, -0.187058017060996, -0.30135946042794687, 0.42666204410908015], sess_dtime_secs_seq=[63846, 58, 51, 49, 15, 40], sess_dtime_secs_log_norm_seq=[1.7985935567321982, -0.20214755077162974, -0.23831352027294844, -0.24954491844478394, -0.5758378156471834, -0.3063741122860863], sess_prod_recency_days_seq=[12.038055555555555, 11.815636574074073, 11.812222222222223, 11.476759259259259, 11.99837962962963, 11.054872685185185], sess_prod_recency_days_log_norm_seq=[0.1190

In [143]:
check_df.count()

3929658