# Preprocessing

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time

from scipy.sparse import csr_matrix

from lightfm import LightFM
from lightfm.evaluation import auc_score
from sklearn.metrics import roc_auc_score
from lightfm.data import Dataset



### Import data

In [2]:
# Importing Events data and sorting by timestamp column which corresponds the historical order of events.

df_events = pd.read_csv("events.csv")
df_events = df_events.sort_values(by=['timestamp'], ascending=True).reset_index(drop=True)

In [3]:
df_events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1430622004384,693516,addtocart,297662,
1,1430622011289,829044,view,60987,
2,1430622013048,652699,view,252860,
3,1430622024154,1125936,view,33661,
4,1430622026228,693516,view,297662,


In [4]:
df_events.event.unique()

array(['addtocart', 'view', 'transaction'], dtype=object)

In [5]:
# Events types “view”, “addtocart”, “transaction” are the implicit customer feedback.
# They can be considered as rating and will be transformed from categorical to numerical format.

# The weights are subject to tuning together with hyperparameters to achieve better performance.
# Initial weights: view=1, add to cart=2, purchase=3.

weight_view = 1
weight_addtocart = 2
weight_transaction = 3

df_events.event.replace(to_replace=dict(
    view=weight_view, addtocart=weight_addtocart, transaction=weight_transaction), inplace=True)

In [6]:
# Now the events replaced with corresponding weights.
df_events.event.unique()

array([2, 1, 3])

In [7]:
df_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
timestamp        int64
visitorid        int64
event            int64
itemid           int64
transactionid    float64
dtypes: float64(1), int64(4)
memory usage: 105.1 MB


In [8]:
# Import Categories data

df_categories = pd.read_csv("category_tree.csv")
df_categories.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [9]:
df_categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1669 entries, 0 to 1668
Data columns (total 2 columns):
categoryid    1669 non-null int64
parentid      1644 non-null float64
dtypes: float64(1), int64(1)
memory usage: 26.2 KB


In [10]:
# Import Properties data

df_properties1 = pd.DataFrame(pd.read_csv("item_properties_part1.csv"))
df_properties2 = pd.DataFrame(pd.read_csv("item_properties_part2.csv"))
df_properties = pd.concat([df_properties1, df_properties2])

# data to be sorted by timestamp to reflect the historical change log.
df_properties = df_properties.sort_values(by=['timestamp'], ascending=True).reset_index(drop=True)

df_properties.head(10)

Unnamed: 0,timestamp,itemid,property,value
0,1431226800000,317951,790,n32880.000
1,1431226800000,422842,480,1133979
2,1431226800000,310185,776,103591
3,1431226800000,110973,112,679677
4,1431226800000,179597,available,0
5,1431226800000,260136,available,1
6,1431226800000,138592,764,1285872
7,1431226800000,216269,364,336749
8,1431226800000,299944,764,1285872
9,1431226800000,146103,112,679677


In [11]:
df_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20275902 entries, 0 to 20275901
Data columns (total 4 columns):
timestamp    int64
itemid       int64
property     object
value        object
dtypes: int64(2), object(2)
memory usage: 618.8+ MB


### Train / Test split

In [12]:
# Use timestamps for split which mimics the real-life case as the events are sorted in historical order.
# Split ratio is 80% for train set, and 20% for test set.

split_point = int(np.ceil(len(df_events)*0.8))
df_events_train = df_events.loc[0:split_point]
df_events_test = df_events.loc[split_point+1:]

In [13]:
df_events_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2204882 entries, 0 to 2204881
Data columns (total 5 columns):
timestamp        int64
visitorid        int64
event            int64
itemid           int64
transactionid    float64
dtypes: float64(1), int64(4)
memory usage: 84.1 MB


In [14]:
df_events_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551219 entries, 2204882 to 2756100
Data columns (total 5 columns):
timestamp        551219 non-null int64
visitorid        551219 non-null int64
event            551219 non-null int64
itemid           551219 non-null int64
transactionid    4593 non-null float64
dtypes: float64(1), int64(4)
memory usage: 21.0 MB


### Interaction matrices

In [15]:
# Train set. Create sparse matrix of visitor-item iterations.
# The events are coded as numbers reflecting the level of implicit feedback (the higher the better).\
# For the interaction matrix the highest feedback will be used.

df_events_train = df_events_train.sort_values('event').drop_duplicates(
    subset=['visitorid', 'itemid'], 
    keep='last').sort_values('timestamp').reset_index(drop=True)

In [16]:
df_events_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1713176 entries, 0 to 1713175
Data columns (total 5 columns):
timestamp        int64
visitorid        int64
event            int64
itemid           int64
transactionid    float64
dtypes: float64(1), int64(4)
memory usage: 65.4 MB


In [17]:
# Create a sparse matrix for Train set interactions.

row = df_events_train['visitorid'].astype(int)
col = df_events_train['itemid'].astype(int)
data = df_events_train['event'].astype(int)
interactions_grid_train = csr_matrix((data, (row, col)), shape=None)

In [18]:
interactions_grid_train

<1407580x466868 sparse matrix of type '<class 'numpy.int64'>'
	with 1713176 stored elements in Compressed Sparse Row format>

In [19]:
# Test set. Same approach.

df_events_test = df_events_test.sort_values('event').drop_duplicates(
    subset=['visitorid', 'itemid'], 
    keep='last').sort_values('timestamp').reset_index(drop=True)

#create a sparse matrix
row = df_events_test['visitorid'].astype(int)
col = df_events_test['itemid'].astype(int)
data = df_events_test['event'].astype(int)
interactions_grid_test = csr_matrix((data, (row, col)), shape=None)

In [20]:
df_events_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438134 entries, 0 to 438133
Data columns (total 5 columns):
timestamp        438134 non-null int64
visitorid        438134 non-null int64
event            438134 non-null int64
itemid           438134 non-null int64
transactionid    4382 non-null float64
dtypes: float64(1), int64(4)
memory usage: 16.7 MB


In [21]:
interactions_grid_test

<1407572x466865 sparse matrix of type '<class 'numpy.int64'>'
	with 438134 stored elements in Compressed Sparse Row format>

### Item features matrix

In [22]:
len(df_properties.property.unique())

1104

# LightFM model training

In [25]:
# !!!! dd item_features to have a hybrid model.
# Otherwise, it is item based recommender, which is not solving the cold start problem.

start_time = time.time()

model = LightFM(no_components=30, loss='warp')
model.fit(interactions_grid_train, epochs=20, num_threads=4)

print('Model trained in: ', round((time.time()-start_time)/60, 2), " minutes")

Model trained in:  2.18  minutes


start_time = time.time()
auc_train = auc_score(model, interactions_grid_train).mean()
print("Train AUC score: ", auc_train)
print('Calculated in: ', round((time.time()-start_time)/60, 2), " minutes")

start_time = time.time()
auc_test = auc_score(model, interactions_grid_test).mean()
print("Test AUC score: ", auc_test)
print('Calculated in: ', round((time.time()-start_time)/60, 2), " minutes")