# Preprocessing

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import coo_matrix  # LightFM fit method requires coo matrix format as input.

from lightfm import LightFM
from lightfm.evaluation import auc_score
from lightfm.evaluation import precision_at_k
from sklearn.metrics import roc_auc_score
from lightfm.data import Dataset

### Data Import and Cleaning - Events

In [None]:
# Load the raw events and order them chronologically: the timestamp column
# corresponds to the historical order in which the events happened.

df_events = (
    pd.read_csv("events.csv")
    .sort_values(by=['timestamp'], ascending=True)
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the chronologically sorted events frame.
df_events.head()

In [None]:
# Bar chart with the number of rows per event type, to see how the
# different kinds of events are balanced against each other.
sns.countplot(data=df_events, x='event')

In [None]:
# List the distinct values present in the event column.
df_events.event.unique()

In [None]:
# Event types "view", "addtocart" and "transaction" are implicit customer feedback.
# They are treated as ratings and converted from categorical to numerical form.

# The weights are subject to tuning together with the hyperparameters to achieve better performance.
# Initial weights: view=1, add to cart=2, purchase=3.

weight_view = 1
weight_addtocart = 2
weight_transaction = 3

event_weights = {
    'view': weight_view,
    'addtocart': weight_addtocart,
    'transaction': weight_transaction,
}

# Assign the converted column back to the frame instead of calling
# replace(inplace=True) on df_events.event: an in-place method on a column
# accessed this way is chained assignment, which pandas warns about and which
# leaves df_events untouched once Copy-on-Write is active.
# Values that are not a known event name (including the numeric weights left by
# a previous run of this cell) pass through unchanged, so re-running is harmless.
df_events['event'] = df_events['event'].map(
    lambda event: event_weights.get(event, event)
)

In [None]:
# The event names should now be replaced with their corresponding weights;
# list the distinct values to confirm.
df_events.event.unique()

In [None]:
# Preview the first rows of the events frame.
df_events.head()

In [None]:
# Print the frame summary (columns, non-null counts, dtypes, memory usage).
df_events.info()

In [None]:
# NOTE: optional filter, currently disabled.
# Uncomment the line below to select only the rows which are PURCHASE events.
# df_events = df_events.loc[df_events['event'] == weight_transaction].reset_index(drop=True)

In [None]:
# List the distinct values currently present in the event column.
df_events.event.unique()

In [None]:
# Print the frame summary (row count, columns, dtypes, memory usage).
df_events.info()

In [None]:
# Keep the visitorid column (one entry per event row, duplicates included)
# under its own name.
# NOTE(review): not referenced in the visible part of the notebook - confirm it is used later.
all_users = df_events['visitorid']

In [None]:
# Preview the first rows of the events frame before splitting it.
df_events.head()

### Train / Test split

In [None]:
# Chronological split: the frame is already sorted by timestamp, so cutting it
# by row position mimics the real-life case of training on the past and
# testing on the future. Roughly 80% of the rows go to the train set and the
# remaining 20% to the test set.

split_point = int(np.ceil(0.8 * len(df_events)))  # Row label at which the frame is cut.
split_point_time = int(df_events.loc[split_point, 'timestamp'])  # Timestamp of that row.

# .loc slices by label and includes both end points, so the row labelled
# split_point itself belongs to the train set and the test set starts right after it.
df_events_train = df_events.loc[:split_point]
df_events_test = df_events.loc[split_point + 1:]

In [None]:
# Print the summary of the train split (row count, columns, dtypes).
df_events_train.info()

In [None]:
# Print the summary of the test split (row count, columns, dtypes).
df_events_test.info()

In [None]:
### Train / Test split

# The events are sorted in historical order, so a positional cut on the sorted
# frame gives a time-based split that mimics the real-life case.
# Split ratio: about 80% of the rows for the train set, 20% for the test set.

split_point = int(np.ceil(len(df_events) * 0.8))  # Index of split point.
split_row = df_events.loc[split_point]
split_point_time = int(split_row['timestamp'])  # Timestamp of split point.

# Label-based slicing is inclusive on both ends: train ends on split_point,
# test begins on the following row.
df_events_train = df_events.loc[0:split_point]
df_events_test = df_events.loc[split_point + 1:]

# Summaries of both parts, separated by an empty line.
df_events_train.info()
print()
df_events_test.info()

### Building LightFM datasets

In [None]:
# For model evaluation purposes (auc_score), the train and test interaction matrices must have the same dimensionality.
# To achieve this, we need to create a mapping that covers all users and all items.
# The interactions are then filled in separately for the train set and for the test set.



In [None]:
# Create the user/item mappings.

# The fit method of class Dataset takes the collection of all visitors and all
# items; its implementation ignores duplicates. Both mappings are fitted on the
# FULL events frame (not on the split parts) so that the train and test
# interaction matrices share the same dimensions.

# Deduplicate once up front. unique() keeps the order of first appearance, so
# the ids are presented in the same order as with the raw columns, but each
# Dataset only has to walk the distinct ids instead of every event row.
all_visitor_ids = df_events['visitorid'].unique()
all_item_ids = df_events['itemid'].unique()

# Train set mapping.
dataset_train = Dataset()
dataset_train.fit(all_visitor_ids, all_item_ids)

# Test set mapping (fitted on the same ids as the train mapping).
dataset_test = Dataset()
dataset_test.fit(all_visitor_ids, all_item_ids)

In [None]:
# Transform interactions into the required format.

# Dataset.build_interactions (used in a later cell) expects an iterable of
# (visitorid, itemid, weight) tuples, so each split is converted to such a list.

start_time = time.time()


def _as_interaction_tuples(events):
    """Return the rows of *events* as a list of (visitorid, itemid, event) tuples.

    The columns are zipped together as plain Python values instead of looping
    with iterrows(): iterrows builds a Series per row (slow on millions of
    rows) and upcasts every value of a row to one common dtype.
    """
    return list(zip(
        events['visitorid'].astype('int64').tolist(),
        events['itemid'].astype('int64').tolist(),
        events['event'].tolist(),
    ))


# Train set interactions transformed.
df_events_train_interactions = _as_interaction_tuples(df_events_train)

# Test set interactions transformed.
df_events_test_interactions = _as_interaction_tuples(df_events_test)

print('Finished in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
# Sanity check: each split should yield exactly one interaction tuple per
# event row, so the two numbers printed on each line are expected to match.
print('Check original VS transformed length TRAIN: ',
      len(df_events_train), ' / ', len(df_events_train_interactions))

print('Check original VS transformed length TEST: ',
      len(df_events_test), ' / ', len(df_events_test_interactions))

In [None]:
# Turn the (visitorid, itemid, weight) tuples of both splits into matrices and
# time the step. build_interactions returns a pair, unpacked here into the
# interactions matrix and the weights matrix.

start_time = time.time()

interactions_train, weights_train = dataset_train.build_interactions(
    df_events_train_interactions
)
interactions_test, weights_test = dataset_test.build_interactions(
    df_events_test_interactions
)

print('Finished in: ', round((time.time() - start_time) / 60, 2), " minutes")

In [None]:
# Display the weights matrix built for the train set.
weights_train

In [None]:
# Display the weights matrix built for the test set.
weights_test

# LightFM model training

In [None]:
# Build the model without item features first, i.e. purely collaborative predictions.

start_time = time.time()

model = LightFM(no_components=500, loss='warp')

# The first argument of fit() only tells the model which (user, item) pairs are
# positives; the per-interaction strength has to be passed as sample_weight.
# Passing the weights matrix as the interactions (as before) made every event
# count equally, so the view/addtocart/transaction weights had no effect.
# Both matrices come from the same build_interactions call, which keeps their
# row/col layout aligned as sample_weight requires.
model.fit(interactions_train, sample_weight=weights_train, epochs=20, num_threads=4)

print('Model trained in: ', round((time.time()-start_time)/60, 2), " minutes")

# Model evaluation (auc_score, precision_at_k)

In [None]:
# Score the fitted model on the train matrix and report the mean of the
# returned AUC values together with the time the evaluation took.
start_time = time.time()

train_auc = np.mean(auc_score(model, test_interactions=weights_train))

print('Train AUC score: ', train_auc)
print('Calculated in: ', round((time.time() - start_time) / 60, 2), " minutes")

In [None]:
# Score the fitted model on the held-out test matrix and report the mean of
# the returned AUC values together with the time the evaluation took.
start_time = time.time()

test_auc = np.mean(auc_score(model, test_interactions=weights_test))

print('Test AUC score: ', test_auc)
print('Calculated in: ', round((time.time() - start_time) / 60, 2), " minutes")

In [None]:
# Mean precision at k=10 on the train matrix, with wall-clock timing.
start_time = time.time()
train_precision = np.mean(precision_at_k(model, test_interactions=weights_train, k=10))
print('Train precision for k=10: ', train_precision)
print('Calculated in: ', round((time.time() - start_time) / 60, 2), " minutes")

In [None]:
# Mean precision at k=10 on the held-out test matrix, with wall-clock timing.
start_time = time.time()
test_precision = np.mean(precision_at_k(model, test_interactions=weights_test, k=10))
print('Test precision for k=10: ', test_precision)
print('Calculated in: ', round((time.time() - start_time) / 60, 2), " minutes")