# Preprocessing

In [124]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.sparse import csr_matrix

### Import data

In [80]:
# Importing Events data and sorting by timestamp column which corresponds the historical order of events.
df_events = pd.read_csv("events.csv")
df_events = df_events.sort_values(by=['timestamp'], ascending=True).reset_index(drop=True)

In [81]:
df_events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1430622004384,693516,addtocart,297662,
1,1430622011289,829044,view,60987,
2,1430622013048,652699,view,252860,
3,1430622024154,1125936,view,33661,
4,1430622026228,693516,view,297662,


In [82]:
df_events.event.unique()

array(['addtocart', 'view', 'transaction'], dtype=object)

In [83]:
# Events types “view”, “addtocart”, “transaction” are the implicit customer feedback.
# They can be considered as rating and will be transformed from categorical to numerical format.

# The weights are subject to tuning together with hyperparameters to achieve better performance.
# Initial weights: view=1, add to cart=2, purchase=3.

weight_view = 1
weight_addtocart = 2
weight_transaction = 3

df_events.event.replace(to_replace=dict(
    view=weight_view, addtocart=weight_addtocart, transaction=weight_transaction), inplace=True)

In [86]:
# Now the events replaced with corresponding weights.
df_events.event.unique()

array([2, 1, 3])

In [106]:
df_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
timestamp        int64
visitorid        int64
event            int64
itemid           int64
transactionid    float64
dtypes: float64(1), int64(4)
memory usage: 105.1 MB


In [5]:
# Import Categories data

df_categories = pd.read_csv("category_tree.csv")
df_categories.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [107]:
df_categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1669 entries, 0 to 1668
Data columns (total 2 columns):
categoryid    1669 non-null int64
parentid      1644 non-null float64
dtypes: float64(1), int64(1)
memory usage: 26.2 KB


In [6]:
# Import Properties data

df_properties1 = pd.DataFrame(pd.read_csv("item_properties_part1.csv"))
df_properties2 = pd.DataFrame(pd.read_csv("item_properties_part2.csv"))
df_properties = pd.concat([df_properties1, df_properties2])

df_properties.head(10)

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
5,1436065200000,285026,available,0
6,1434250800000,89534,213,1121373
7,1431831600000,264312,6,319724
8,1433646000000,229370,202,1330310
9,1434250800000,98113,451,1141052 n48.000


In [108]:
df_properties.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20275902 entries, 0 to 9275902
Data columns (total 4 columns):
timestamp    int64
itemid       int64
property     object
value        object
dtypes: int64(2), object(2)
memory usage: 773.5+ MB


### Train / Test split. Sparse matrices

In [101]:
# Use timestamps for split which mimics the real-life case as the events are sorted in historical order.
# Split ratio is 80% for train set, and 20% for test set.

split_point = int(np.ceil(len(df_events)*0.8))
df_events_train = df_events.loc[0:split_point]
df_events_test = df_events.loc[split_point+1:]

In [111]:
df_events_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2204882 entries, 0 to 2204881
Data columns (total 5 columns):
timestamp        int64
visitorid        int64
event            int64
itemid           int64
transactionid    float64
dtypes: float64(1), int64(4)
memory usage: 84.1 MB


In [113]:
df_events_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551219 entries, 2204882 to 2756100
Data columns (total 5 columns):
timestamp        551219 non-null int64
visitorid        551219 non-null int64
event            551219 non-null int64
itemid           551219 non-null int64
transactionid    4593 non-null float64
dtypes: float64(1), int64(4)
memory usage: 21.0 MB


In [120]:
# Train set. Create sparse matrix of visitor-item iterations.
# The events are coded as numbers reflecting the level of implicit feedback (the higher the better).\
# For the interaction matrix the highest feedback will be used.


df_events_train = df_events_train.sort_values('event').drop_duplicates(
    subset=['visitorid', 'itemid'], 
    keep='last').sort_values('timestamp').reset_index(drop=True)

In [121]:
df_events_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1713176 entries, 0 to 1713175
Data columns (total 5 columns):
timestamp        int64
visitorid        int64
event            int64
itemid           int64
transactionid    float64
dtypes: float64(1), int64(4)
memory usage: 65.4 MB


In [126]:
#create a sparse matrix
row = df_events_train['visitorid'].astype(int)
col = df_events_train['itemid'].astype(int)
data = df_events_train['event'].astype(int)
interactions_grid_train = csr_matrix((data, (row, col)), shape=None)

In [127]:
interactions_grid_train

<1407580x466868 sparse matrix of type '<class 'numpy.int64'>'
	with 1713176 stored elements in Compressed Sparse Row format>

In [128]:
# Test set. Same approach.

df_events_test = df_events_test.sort_values('event').drop_duplicates(
    subset=['visitorid', 'itemid'], 
    keep='last').sort_values('timestamp').reset_index(drop=True)

#create a sparse matrix
row = df_events_test['visitorid'].astype(int)
col = df_events_test['itemid'].astype(int)
data = df_events_test['event'].astype(int)
interactions_grid_test = csr_matrix((data, (row, col)), shape=None)

In [129]:
df_events_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438134 entries, 0 to 438133
Data columns (total 5 columns):
timestamp        438134 non-null int64
visitorid        438134 non-null int64
event            438134 non-null int64
itemid           438134 non-null int64
transactionid    4382 non-null float64
dtypes: float64(1), int64(4)
memory usage: 16.7 MB


In [130]:
interactions_grid_test

<1407572x466865 sparse matrix of type '<class 'numpy.int64'>'
	with 438134 stored elements in Compressed Sparse Row format>