In [1]:
import pandas as pd
import numpy as np

In [2]:
df_view = pd.read_csv('view_ecommerce.dat', delimiter='\t')

In [3]:
df_view.head(20)

Unnamed: 0,visitorid,itemid,event
0,1,1,1
1,1,2,1
2,1,3,1
3,1,4,1
4,1,5,1
5,1,6,1
6,1,7,1
7,1,8,1
8,1,9,1
9,1,10,1


In [4]:
df_view.describe()

Unnamed: 0,visitorid,itemid,event
count,78371.0,78371.0,78371.0
mean,1672.27528,3128.034579,1.0
std,985.121303,2248.049066,0.0
min,1.0,1.0,1.0
25%,805.0,1185.0,1.0
50%,1679.0,2742.0,1.0
75%,2518.0,4773.0,1.0
max,3431.0,8885.0,1.0


In [5]:
df_cart = pd.read_csv('add_to_cart_ecommerce.dat', delimiter='\t')

In [6]:
df_cart.head()

Unnamed: 0,visitorid,itemid,event
0,1,1,1
1,1,16,1
2,2,21,1
3,4,60,1
4,7,103,1


In [7]:
df_cart.describe()

Unnamed: 0,visitorid,itemid,event
count,9028.0,9028.0,9028.0
mean,1682.938857,3099.533451,1.0
std,983.557954,2210.969939,0.0
min,1.0,1.0,1.0
25%,819.0,1227.75,1.0
50%,1729.0,2649.0,1.0
75%,2541.0,4703.25,1.0
max,3429.0,8869.0,1.0


In [8]:
df_purchase = pd.read_csv('purchase_ecommerce.dat', delimiter='\t')

In [9]:
df_purchase.head()

Unnamed: 0,visitorid,itemid,event
0,1,1,1
1,1,16,1
2,7,103,1
3,7,105,1
4,7,106,1


In [10]:
df_purchase.describe()

Unnamed: 0,visitorid,itemid,event
count,5088.0,5088.0,5088.0
mean,1664.824489,3028.611046,1.0
std,978.161166,2186.526289,0.0
min,1.0,1.0,1.0
25%,805.0,1195.0,1.0
50%,1690.0,2534.0,1.0
75%,2584.0,4607.25,1.0
max,3429.0,8852.0,1.0


In [11]:
df_purchase.loc[df_view['visitorid']==69]

Unnamed: 0,visitorid,itemid,event
1044,579,3892,1
1045,579,3979,1
1046,579,3981,1
1047,579,4208,1
1048,579,4233,1
1049,579,4296,1
1050,579,4333,1
1051,579,4340,1


# Two main ideas:

1. Consider only the visitors that made a purchase and build recommendations in this pool.
2. Use the full list of visitors and items assigning the rating to the relationship value: 1 is viewed, 2 is added to cart, 3 is purchased.

## Purchased only

In [12]:
#create a sparse matrix
from scipy.sparse import csr_matrix
row = df_purchase['visitorid'].astype(int)
col = df_purchase['itemid'].astype(int)
data = df_purchase['event'].astype(int)
purchase_grid = csr_matrix((data, (row, col)), shape=None)

In [13]:
purchase_grid

<3430x8853 sparse matrix of type '<class 'numpy.int64'>'
	with 5088 stored elements in Compressed Sparse Row format>

## All visitors and items

In [14]:
all_visitors = np.concatenate(
    (df_view['visitorid'].unique().astype(int),
     df_cart['visitorid'].unique().astype(int),
     df_purchase['visitorid'].unique().astype(int)
    ), axis=0)

In [15]:
len(all_visitors)

5804

In [16]:
all_items = np.concatenate(
    (df_view['itemid'].unique().astype(int),
     df_cart['itemid'].unique().astype(int),
     df_purchase['itemid'].unique().astype(int)),
    axis=0)

In [17]:
len(all_items)

16402

In [18]:
# Use the full list of visitors and items.
# Personalisation of the items per visitor is done by assigning the rating to each of the items per visitor.
# Assuming that:
# 1 point - item was viewed by visitor;
# 2 points - item was added to cart;
# 3 points - item was purchased.
# Since the dataset consists of three separate tables and goal is purchase, the highest of the ratings will be used.

In [19]:
# df_view dataframe values in column 'event' are '1' which fits our needs.
event_view = df_view
event_view.head()

Unnamed: 0,visitorid,itemid,event
0,1,1,1
1,1,2,1
2,1,3,1
3,1,4,1
4,1,5,1


In [20]:
# df_cart dataframe values in column 'event' are '1' which needs to be replaced by '2'.
event_cart = df_cart
event_cart['event'].replace(1,2, inplace=True)
event_cart.head()

Unnamed: 0,visitorid,itemid,event
0,1,1,2
1,1,16,2
2,2,21,2
3,4,60,2
4,7,103,2


In [21]:
# df_purchase dataframe values in column 'event' are '1' which needs to be replaced by '3'.
event_purchase = df_purchase
event_purchase['event'].replace(1,3, inplace=True)
event_purchase.head()

Unnamed: 0,visitorid,itemid,event
0,1,1,3
1,1,16,3
2,7,103,3
3,7,105,3
4,7,106,3


In [23]:
# Concatenate all three dataframes in one.
all_visitors_items = pd.concat([event_view, event_cart, event_purchase], ignore_index=True)

In [24]:
all_visitors_items.describe()

Unnamed: 0,visitorid,itemid,event
count,92487.0,92487.0,92487.0
mean,1672.9063,3119.782878,1.20764
std,984.583543,2241.210952,0.52398
min,1.0,1.0,1.0
25%,807.0,1190.0,1.0
50%,1688.0,2723.0,1.0
75%,2527.0,4751.0,1.0
max,3431.0,8885.0,3.0


In [31]:
# Remove duplicates keeping the highest value in 'event' column.
all_visitors_items.sort_values('event').drop_duplicates(
    subset=['visitorid', 'itemid'], 
    keep='last').reset_index(drop=True)

Unnamed: 0,visitorid,itemid,event
0,2210,3576,1
1,2210,2541,1
2,2210,2123,1
3,2210,2015,1
4,2210,1831,1
...,...,...,...
79590,1130,1490,3
79591,1130,1177,3
79592,1129,7304,3
79593,1130,3221,3


In [32]:
#create a sparse matrix
from scipy.sparse import csr_matrix
row = all_visitors_items['visitorid'].astype(int)
col = all_visitors_items['itemid'].astype(int)
data = all_visitors_items['event'].astype(int)
all_visitors_items_grid = csr_matrix((data, (row, col)), shape=None)

In [33]:
all_visitors_items_grid

<3432x8886 sparse matrix of type '<class 'numpy.int64'>'
	with 79595 stored elements in Compressed Sparse Row format>