In [None]:
#required installation

! pip install kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#download dataset from kaggle website

! kaggle datasets download mkechinov/ecommerce-events-history-in-cosmetics-shop
! unzip /content/ecommerce-events-history-in-cosmetics-shop.zip

Downloading ecommerce-events-history-in-cosmetics-shop.zip to /content
100% 430M/430M [00:07<00:00, 64.5MB/s]
100% 430M/430M [00:07<00:00, 59.9MB/s]
Archive:  /content/ecommerce-events-history-in-cosmetics-shop.zip
  inflating: 2019-Dec.csv            
  inflating: 2019-Nov.csv            
  inflating: 2019-Oct.csv            
  inflating: 2020-Feb.csv            
  inflating: 2020-Jan.csv            


In [None]:
#imports and drive mount

import pandas as pd
import numpy as np
from tqdm import tqdm
from google.colab import drive
from collections import defaultdict
import json
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# mount Google Drive at /content/drive; later cells save and load files under /content/drive/MyDrive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# concat data from 5 months to one dataframe
# (files are read in the listed order; each file's own row index is kept)

monthly_csv_paths = [
    "/content/2019-Dec.csv",
    "/content/2019-Nov.csv",
    "/content/2019-Oct.csv",
    "/content/2020-Jan.csv",
    "/content/2020-Feb.csv",
]
df = pd.concat([pd.read_csv(csv_path) for csv_path in monthly_csv_paths])

In [None]:
# show dataframe (the bare last expression renders the concatenated frame as cell output)

df

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-12-01 00:00:00 UTC,remove_from_cart,5712790,1487580005268456287,,f.o.x,6.27,576802932,51d85cb0-897f-48d2-918b-ad63965c12dc
1,2019-12-01 00:00:00 UTC,view,5764655,1487580005411062629,,cnd,29.05,412120092,8adff31e-2051-4894-9758-224bfa8aec18
2,2019-12-01 00:00:02 UTC,cart,4958,1487580009471148064,,runail,1.19,494077766,c99a50e8-2fac-4c4d-89ec-41c05f114554
3,2019-12-01 00:00:05 UTC,view,5848413,1487580007675986893,,freedecor,0.79,348405118,722ffea5-73c0-4924-8e8f-371ff8031af4
4,2019-12-01 00:00:07 UTC,view,5824148,1487580005511725929,,,5.56,576005683,28172809-7e4a-45ce-bab0-5efa90117cd5
...,...,...,...,...,...,...,...,...,...
4156677,2020-02-29 23:59:32 UTC,view,5885416,1487580005092295511,,grattol,6.27,622082947,fb29909b-6ef5-4662-b4ee-288e73e5dc10
4156678,2020-02-29 23:59:39 UTC,cart,5550686,1487580008145748965,,,1.11,459705611,05d2add3-01f7-47ee-8364-27341673227f
4156679,2020-02-29 23:59:45 UTC,view,5850628,1602943681873052386,,grattol,5.24,622090043,ab7d349f-db5d-4790-8ab1-31e5c894459d
4156680,2020-02-29 23:59:54 UTC,view,5716351,1487580010872045658,,irisk,0.79,619841242,18af673b-7fb9-4202-a66d-5c855bc0fd2d


In [None]:
# Clean the raw events and order them session by session:
#   1. delete events without a session id
#   2. delete unnecessary columns
#   3. change session id from string to an integer category code
#   4. change event time to a timezone-aware (UTC) datetime
#   5. sort rows by session and then by event time
# The steps are chained so every step works on a new frame; this avoids
# assigning columns on a filtered slice (SettingWithCopyWarning).
# An explicit format replaces the deprecated `infer_datetime_format` flag;
# the raw values look like "2019-12-01 00:00:00 UTC".
df = (
    df[df['user_session'].notna()]
    .drop(columns=['category_code', 'brand'])
    .assign(
        user_session=lambda d: d['user_session'].astype('category').cat.codes,
        event_time=lambda d: pd.to_datetime(
            d['event_time'], format='%Y-%m-%d %H:%M:%S UTC', utc=True
        ),
    )
    .sort_values(by=['user_session', 'event_time'])
)

In [None]:
# save results to "/content/drive/MyDrive/purchase_prediction_dataset_raw.csv"
# (written through a text handle opened with utf-8-sig encoding; the index is included)

path = '/content/drive/MyDrive/purchase_prediction_dataset_raw.csv'
with open(path, mode='w', encoding='utf-8-sig') as csv_file:
    df.to_csv(csv_file)

In [None]:
# load raw dataframe from "/content/drive/MyDrive/purchase_prediction_dataset_raw.csv"
# index_col=[0] uses the first CSV column as the index instead of loading it as data

df = pd.read_csv("/content/drive/MyDrive/purchase_prediction_dataset_raw.csv", index_col=[0])
df

Unnamed: 0,event_time,event_type,product_id,category_id,price,user_id,user_session
1942595,2020-01-16 03:30:41+00:00,view,5560754,1487580006300255120,194.44,539262914,0
3194678,2020-01-24 22:22:20+00:00,view,5618277,1487580006434472855,25.71,605114412,1
522763,2019-11-05 07:57:05+00:00,view,5829298,1487580011585077370,0.79,556321594,2
3008486,2019-10-23 09:07:38+00:00,view,5859210,1487580010872045658,0.48,405771061,3
3009077,2019-10-23 09:12:50+00:00,view,30195,1487580010922377308,0.38,405771061,3
...,...,...,...,...,...,...,...
1274327,2020-02-09 12:52:44+00:00,purchase,5785422,1487580007256556476,3.16,606966882,4535940
1274328,2020-02-09 12:52:44+00:00,purchase,5802162,1487580013069861041,23.81,606966882,4535940
1274329,2020-02-09 12:52:44+00:00,purchase,5832415,1487580007256556476,2.38,606966882,4535940
1274330,2020-02-09 12:52:44+00:00,purchase,5846442,2151191071378375538,19.52,606966882,4535940


In [None]:
# label data
# A session is cut at its first purchase: events up to and including that
# purchase are kept, later events of the same session are deleted.
# Sessions with fewer than MIN_SESSION_EVENTS kept events are deleted entirely.
#   label == True  -> the session contains no purchase
#   label == False -> the session contains a purchase
# Rows are expected to be grouped by session (consecutive rows share a session id).

MIN_SESSION_EVENTS = 10


def _close_session(kept_flags, no_purchase, keep_mask, labels):
    """Append one finished session's keep flags and labels to the outputs.

    kept_flags  -- per-event booleans, True up to and including the first purchase
    no_purchase -- True when the session contained no purchase event
    keep_mask   -- list extended with one boolean per event of the session
    labels      -- list extended with one label per kept event
    """
    n_kept = sum(kept_flags)
    if n_kept < MIN_SESSION_EVENTS:
        # session too short: drop every one of its events, emit no labels
        keep_mask.extend([False] * len(kept_flags))
    else:
        keep_mask.extend(kept_flags)
        labels.extend([no_purchase] * n_kept)


ls = []        # keep mask, one boolean per row of df
labels = []    # one label per kept row
cur_session = -1
mask = True    # True until the current session has seen a purchase
cur_ls = []    # keep flags of the session currently being scanned

# Only two columns are needed, so iterate over them directly instead of
# building a Series per row with DataFrame.iterrows.
for session_id, event_type in tqdm(zip(df['user_session'], df['event_type']), total=len(df)):
    if session_id != cur_session:
        _close_session(cur_ls, mask, ls, labels)
        cur_session = session_id
        mask = True
        cur_ls = []

    # the purchase event itself is still kept; everything after it is not
    cur_ls.append(mask)
    if event_type == 'purchase':
        mask = False

# flush the last session
_close_session(cur_ls, mask, ls, labels)

# .copy() yields an independent frame, so adding a column afterwards does not
# raise SettingWithCopyWarning
df = df.loc[np.asarray(ls, dtype=bool)].copy()
df['label'] = labels

20688242it [20:37, 16719.63it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# reset index for better visualization
# (drop=True discards the old index instead of inserting it as a column)

df = df.reset_index(drop=True)

In [None]:
# show current data frame (filtered rows, with the 'label' column added by the labelling cell)

df

Unnamed: 0,event_time,event_type,product_id,category_id,price,user_id,user_session,label
0,2019-10-23 09:07:38+00:00,view,5859210,1487580010872045658,0.48,405771061,3,False
1,2019-10-23 09:12:50+00:00,view,30195,1487580010922377308,0.38,405771061,3,False
2,2019-10-23 09:44:43+00:00,cart,30195,1487580010922377308,0.38,405771061,3,False
3,2019-10-23 09:57:19+00:00,remove_from_cart,5817690,1487580010872045658,0.79,405771061,3,False
4,2019-10-23 09:57:21+00:00,remove_from_cart,30195,1487580010922377308,0.38,405771061,3,False
...,...,...,...,...,...,...,...,...
11936678,2019-10-12 05:03:13+00:00,view,5815662,1487580006317032337,0.92,394410022,4535936,True
11936679,2019-10-17 22:42:31+00:00,remove_from_cart,5854827,1602943681873052386,5.24,394410022,4535936,True
11936680,2019-10-17 22:42:31+00:00,remove_from_cart,5854827,1602943681873052386,5.24,394410022,4535936,True
11936681,2019-10-17 22:42:32+00:00,remove_from_cart,5854828,1602943681873052386,5.24,394410022,4535936,True


Save dataframe

In [None]:
# save results to "/content/drive/MyDrive/purchase_prediction_dataset.csv"
# (written through a text handle opened with utf-8-sig encoding; the index is included)

path = '/content/drive/MyDrive/purchase_prediction_dataset.csv'
with open(path, mode='w', encoding='utf-8-sig') as csv_file:
    df.to_csv(csv_file)

In [None]:
# load labeled dataframe from "/content/drive/MyDrive/purchase_prediction_dataset.csv"
# index_col=[0] uses the first CSV column as the index instead of loading it as data

df = pd.read_csv("/content/drive/MyDrive/purchase_prediction_dataset.csv", index_col=[0])
df

Unnamed: 0,event_time,event_type,product_id,category_id,price,user_id,user_session,label
0,2019-10-23 09:07:38+00:00,view,5859210,1487580010872045658,0.48,405771061,3,False
1,2019-10-23 09:12:50+00:00,view,30195,1487580010922377308,0.38,405771061,3,False
2,2019-10-23 09:44:43+00:00,cart,30195,1487580010922377308,0.38,405771061,3,False
3,2019-10-23 09:57:19+00:00,remove_from_cart,5817690,1487580010872045658,0.79,405771061,3,False
4,2019-10-23 09:57:21+00:00,remove_from_cart,30195,1487580010922377308,0.38,405771061,3,False
...,...,...,...,...,...,...,...,...
11936678,2019-10-12 05:03:13+00:00,view,5815662,1487580006317032337,0.92,394410022,4535936,True
11936679,2019-10-17 22:42:31+00:00,remove_from_cart,5854827,1602943681873052386,5.24,394410022,4535936,True
11936680,2019-10-17 22:42:31+00:00,remove_from_cart,5854827,1602943681873052386,5.24,394410022,4535936,True
11936681,2019-10-17 22:42:32+00:00,remove_from_cart,5854828,1602943681873052386,5.24,394410022,4535936,True


In [None]:
# show a single user session for better understanding
# (selects the rows whose integer session code equals 4535936)

df[df['user_session'] == 4535936]

Unnamed: 0,event_time,event_type,product_id,category_id,price,user_id,user_session,label
11936649,2019-10-03 11:25:15+00:00,remove_from_cart,5809910,1602943681873052386,5.24,394410022,4535936,True
11936650,2019-10-03 11:25:15+00:00,remove_from_cart,5809910,1602943681873052386,5.24,394410022,4535936,True
11936651,2019-10-03 11:26:55+00:00,cart,5756076,1526733091857498510,4.44,394410022,4535936,True
11936652,2019-10-03 11:29:28+00:00,cart,5752480,1487580005511725929,4.27,394410022,4535936,True
11936653,2019-10-03 11:31:03+00:00,view,5752480,1487580005511725929,4.27,394410022,4535936,True
11936654,2019-10-03 11:31:50+00:00,view,5756076,1526733091857498510,4.44,394410022,4535936,True
11936655,2019-10-03 11:32:04+00:00,remove_from_cart,5756076,1526733091857498510,4.44,394410022,4535936,True
11936656,2019-10-03 11:32:05+00:00,remove_from_cart,5756076,1526733091857498510,4.44,394410022,4535936,True
11936657,2019-10-09 04:30:44+00:00,remove_from_cart,5886768,1487580006317032337,1.59,394410022,4535936,True
11936658,2019-10-09 04:31:00+00:00,remove_from_cart,5773374,1487580005134238553,2.78,394410022,4535936,True


In [None]:
# print information about labels of dataset
# label False marks rows of sessions with a purchase, label True rows of sessions without one;
# counts are numbers of distinct sessions, not rows

session_ids = df['user_session']
purchased_sessions = session_ids[df['label'] == False]
not_purchased_sessions = session_ids[df['label']]

print("Purchased:     ", purchased_sessions.nunique())
print("Not Purchased: ", not_purchased_sessions.nunique())
print("Total:         ", session_ids.nunique())

Purchased:      80631
Not Purchased:  336514
Total:          417145


In [None]:
# feature extraction function

# features:
    # "number of previous user actions on website during the session": num_page,
    # "number of distinct categories before current category in session": len(cat_dict), 
    # "maximum number of times same category is visited before current category in session": cat_dict[max_cat], 
    # "number of distinct products before current product in session": len(product_dict), 
    # "maximum number of times same product is visited before current product in session": product_dict[max_product], 
    # "current action type": row['event_type'], 
    # "number of times cart type has happened until current action": type_dict['cart'],
    # "number of times remove_from_cart type has happened until current action": type_dict['remove_from_cart'],
    # "number of times view type has happened until current action": type_dict['view'],
    # "current product price": row['price'],
    # "maximum price until current action": max_price,
    # "minimum price until current action": min_price,
    # "mean of prices until current action": mean_price 
    # "session label": label 
    
# encoding of the current action type: cart = 0, remove_from_cart = 1, view = 2, any other type = 4 (purchase rows are skipped and produce no feature row)

def get_sess_features(cur_ls):
    """Build one cumulative feature vector per non-purchase event of a session.

    Args:
        cur_ls: iterable of the session's events, in order. Each event is
            indexed by key and must provide 'event_type', 'category_id',
            'product_id', 'price' and 'label'.

    Returns:
        A list holding one 14-element list for every event whose
        'event_type' is not "purchase" (purchase events are skipped and do
        not update any counter). Positions of each feature list:
            0  number of events so far, current one included
            1  number of distinct categories so far
            2  highest visit count reached by a single category so far
            3  number of distinct products so far
            4  highest visit count reached by a single product so far
            5  code of the current event type
               (cart=0, remove_from_cart=1, view=2, anything else=4)
            6  number of 'cart' events so far
            7  number of 'remove_from_cart' events so far
            8  number of 'view' events so far
            9  price of the current event
            10 maximum price so far
            11 minimum price so far
            12 mean price so far
            13 int(event['label'])
        An empty input yields an empty list.
    """
    event_codes = {"cart": 0, "remove_from_cart": 1, "view": 2}
    other_event_code = 4

    sess_features = []
    num_page = 0

    cat_dict = defaultdict(int)
    product_dict = defaultdict(int)
    type_dict = defaultdict(int)

    # Counts only ever grow by one for the current key, so tracking the running
    # maximum is equivalent to remembering which key currently holds it.
    max_cat_visits = 0
    max_product_visits = 0

    max_price = None
    min_price = None
    price_total = 0  # named to avoid shadowing the builtin `sum`

    for row in cur_ls:
        event_type = row['event_type']
        if event_type == "purchase":
            continue

        num_page += 1

        cat_dict[row['category_id']] += 1
        if cat_dict[row['category_id']] > max_cat_visits:
            max_cat_visits = cat_dict[row['category_id']]

        product_dict[row['product_id']] += 1
        if product_dict[row['product_id']] > max_product_visits:
            max_product_visits = product_dict[row['product_id']]

        type_dict[event_type] += 1

        price = row['price']
        price_total += price
        mean_price = float(price_total) / num_page
        # explicit comparisons (not max()/min()) keep the original handling of
        # values that do not compare, e.g. NaN
        if max_price is None or max_price < price:
            max_price = price
        if min_price is None or min_price > price:
            min_price = price

        sess_features.append(
            [
                num_page,
                len(cat_dict),
                max_cat_visits,
                len(product_dict),
                max_product_visits,
                event_codes.get(event_type, other_event_code),
                type_dict['cart'],
                type_dict['remove_from_cart'],
                type_dict['view'],
                price,
                max_price,
                min_price,
                mean_price,
                int(row['label']),
            ]
        )

    return sess_features

In [None]:
# call get_sess_features for every action in session
# Rows are buffered until the session id changes; the finished buffer is then
# turned into feature rows. Sessions that yield no feature rows are not stored.

features = []
cur_ls = []
cur_session = -1

for index, row in tqdm(df.iterrows()):
    session_changed = row['user_session'] != cur_session
    if session_changed:
        cur_session = row['user_session']
        sess_features = get_sess_features(cur_ls)
        if sess_features:
            features.append(sess_features)
        cur_ls = []

    cur_ls.append(row)

# the loop only flushes on a session change, so handle the last buffered session here
sess_features = get_sess_features(cur_ls)
if sess_features:
    features.append(sess_features)

11936683it [22:02, 9027.18it/s] 


In [None]:
# save results to '/content/drive/MyDrive/purchase_prediction_features.pickle'
# (binary pickle written with the highest protocol available)

path = '/content/drive/MyDrive/purchase_prediction_features.pickle'
with open(path, mode='wb') as pickle_file:
    pickle.dump(features, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# load featured dataset from '/content/drive/MyDrive/purchase_prediction_features.pickle'
# NOTE: pickle.load can execute arbitrary code while loading, so only load files
# that this notebook itself wrote

path = '/content/drive/MyDrive/purchase_prediction_features.pickle'
with open(path, 'rb') as handle:
    features = pickle.load(handle)

In [None]:
# show one session features
# (features[0] is the first stored session: one 14-value row per non-purchase action)

features[0]

[[1, 1, 1, 1, 1, 2, 0, 0, 1, 0.48, 0.48, 0.48, 0.48, 0],
 [2, 2, 1, 2, 1, 2, 0, 0, 2, 0.38, 0.48, 0.38, 0.43, 0],
 [3, 2, 2, 2, 2, 0, 1, 0, 2, 0.38, 0.48, 0.38, 0.41333333333333333, 0],
 [4, 2, 2, 3, 2, 1, 1, 1, 2, 0.79, 0.79, 0.38, 0.5075000000000001, 0],
 [5, 2, 3, 3, 3, 1, 1, 2, 2, 0.38, 0.79, 0.38, 0.48200000000000004, 0],
 [6, 3, 3, 4, 3, 1, 1, 3, 2, 0.79, 0.79, 0.38, 0.5333333333333333, 0],
 [7, 4, 3, 5, 3, 1, 1, 4, 2, 0.63, 0.79, 0.38, 0.5471428571428572, 0],
 [8, 5, 3, 6, 3, 1, 1, 5, 2, 5.56, 5.56, 0.38, 1.17375, 0],
 [9, 5, 3, 7, 3, 1, 1, 6, 2, 0.38, 5.56, 0.38, 1.0855555555555556, 0],
 [10, 6, 3, 8, 3, 1, 1, 7, 2, 2.06, 5.56, 0.38, 1.1830000000000003, 0],
 [11, 6, 3, 9, 3, 1, 1, 8, 2, 2.06, 5.56, 0.38, 1.262727272727273, 0]]

In [None]:
# robot detection threshold: sessions with th_bot or more feature rows are discarded
th_bot = 300

# pruning threshold: only the last th_prune feature rows of a session are kept
th_prune = 100

non_bot_sessions = (session for session in features if len(session) < th_bot)
features = [session[-th_prune:] for session in non_bot_sessions]

In [None]:
# longest and shortest session (in feature rows) after filtering and pruning
session_lengths = [len(f) for f in features]
print("max length of a page view:", np.max(session_lengths))
print("min length of a page view:", np.min(session_lengths))

max length of a page view: 100
min length of a page view: 9


In [None]:
# add padding for sessions with less than 100 actions
# padding="post" appends the padding rows after the real actions; no maxlen is
# passed, so every session is padded to the length of the longest one.
# The result is a single float32 array instead of a list of lists.

features = tf.keras.preprocessing.sequence.pad_sequences(
    features, padding="post", dtype='float32'
)

In [None]:
# after padding every session should report the same length
padded_lengths = [len(f) for f in features]
print("max length of a page view:", np.max(padded_lengths))
print("min length of a page view:", np.min(padded_lengths))
print("shape of features:", features.shape)

max length of a page view: 100
min length of a page view: 100
shape of features: (416452, 100, 14)


In [None]:
# show one session features with padding
# (trailing all-zero rows are the padding added after the real actions)

print(features[0])

[[1.         1.         1.         ... 0.48       0.48       0.        ]
 [2.         2.         1.         ... 0.38       0.43       0.        ]
 [3.         2.         2.         ... 0.38       0.41333333 0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
# shuffle dataset
# Sessions are permuted in place along the first axis. A fixed seed makes the
# order of the saved dataset reproducible across runs.

SHUFFLE_SEED = 42
np.random.default_rng(SHUFFLE_SEED).shuffle(features)

In [None]:
# save dataset to '/content/drive/MyDrive/purchase_prediction_features.pickle'
# NOTE(review): this is the same path the unpadded per-session feature lists were
# pickled to earlier in this notebook, so that file is overwritten here with the
# padded, shuffled array — confirm this is intended.

path = '/content/drive/MyDrive/purchase_prediction_features.pickle'
with open(path, 'wb') as f:
    pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL)