# Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import json
from itertools import islice

from scipy.sparse import coo_matrix  # LightFM fit method requires coo matrix format as input.
from scipy.sparse import csr_matrix

from lightfm import LightFM
from lightfm.evaluation import auc_score
from lightfm.evaluation import precision_at_k
from sklearn.metrics import roc_auc_score
from lightfm.data import Dataset

# Preprocessing
## Data Import and Cleaning - Events

In [None]:
# Importing Events data and sorting by timestamp column which corresponds the historical order of events.

df_events = pd.read_csv("events.csv")
df_events = df_events.sort_values(by=['timestamp'], ascending=True).reset_index(drop=True)

In [None]:
df_events.head()

In [None]:
df_events.info()

In [None]:
# View on the ratio between different types of events.
sns.countplot(x='event', data=df_events)

In [None]:
# Events types “view”, “addtocart”, “transaction” are the implicit customer feedback.
# They can be considered as rating and will be transformed from categorical to numerical format.

# The weights are subject to tuning together with hyperparameters to achieve better performance.

weight_view = 1
weight_addtocart = 2
weight_transaction = 3

df_events.event.replace(to_replace=dict(
    view=weight_view, addtocart=weight_addtocart, transaction=weight_transaction), inplace=True)

In [None]:
# Now the events replaced with corresponding weights.
df_events.event.unique()

In [None]:
df_events.head()

In [None]:
# The user may have interacted with item multiple times which is now stored in dataframe.
# For the purpose of recommendation we're interested in the highest level of user interest to the item.
# Therefore, the data can be further cleaned.

df_events = df_events.sort_values('event').drop_duplicates(
    subset=['visitorid', 'itemid'], 
    keep='last').sort_values(by=['timestamp'], ascending=True).reset_index(drop=True)

In [None]:
df_events.info()

In [None]:
# Select which types of interaction to be used: weight_view / weight_addtocart / weight_transaction.

df_events = df_events.loc[df_events['event'].isin(
    [weight_addtocart, 
     weight_transaction])].reset_index(drop=True)

df_events.info()

In [None]:
# Users with low activity are removed from the data.
# Given the number of users and items, and with no user features available, predictions for such users
# cannot be evaluated meaningfully.
# Therefore, it makes sense to delete these users, which also removes noise and saves computational cost.
# Items with few interactions, on the other hand, can be kept because item features data is available for them.

# !!!
# Before production the model should be additionally trained on the interactions of removed users.

In [None]:
# Count activities by user.
users_activity = df_events.groupby('visitorid').visitorid.count().to_frame(name='activity_count')
users_activity.head()

In [None]:
users_activity.loc[users_activity['activity_count'] > 10].head()

In [None]:
df_events.loc[df_events['visitorid'] == 627]

In [None]:
# See the ratio of users having only low level of interaction.

interaction_threshold = 3

users_activity_low = users_activity.loc[users_activity['activity_count'] <= interaction_threshold]
ratio = len(users_activity_low) / len(users_activity)
print('Users with <=', interaction_threshold, 'interactions represent', round(ratio*100,2),
      '% of total users and are to be removed.')

In [None]:
# Create list of users that need to be removed from events data.
users_to_remove = users_activity_low.index.tolist()
len(users_to_remove)

In [None]:
# Remove users with only 1 interaction from df_events dataframe.
df_events = df_events[~df_events.visitorid.isin(users_to_remove)].reset_index(drop=True)

In [None]:
df_events.info()

In [None]:
# Distinct items and users left for model training and testing.

qty_all_items = df_events['itemid'].nunique(dropna=False)
print('Cleaned dataset number of items: ', qty_all_items)
print()
qty_all_users = df_events['visitorid'].nunique(dropna=False)
print('Cleaned dataset number of users: ', qty_all_users)

### Train / Test split

In [None]:
# Use timestamps for the split, which mimics the real-life case as the events are sorted in historical order.
# Split ratio is 80% for train set, and 20% for test set.

n_events = len(df_events)
if n_events == 0:
    raise ValueError('df_events is empty: nothing to split into train and test sets.')

split_point = int(np.ceil(n_events*0.8))  # Number of rows that go to the train set.
# Timestamp of the row at the split position (clamped to the last row for very small inputs).
split_point_time = int(df_events['timestamp'].iloc[min(split_point, n_events - 1)])

# Positional slicing keeps the split at exactly `split_point` train rows.
# Label-based `.loc[0:split_point]` includes its end label (one extra train row) and
# `.loc[split_point]` raises KeyError when split_point equals the number of rows.
df_events_train = df_events.iloc[:split_point]
df_events_test = df_events.iloc[split_point:]


# !!!!!!
# Exclude from the test set those users and items that are not included in the train set.
# This avoids facing the cold start problem in the evaluation phase.
user_seen_in_train = df_events_test['visitorid'].isin(df_events_train['visitorid'])
item_seen_in_train = df_events_test['itemid'].isin(df_events_train['itemid'])
df_events_test = df_events_test[user_seen_in_train & item_seen_in_train]


df_events_train.info()
print()
df_events_test.info()

### Transforming interactions data into the format acceptable by lightFM model

In [None]:
# Dataset class of LightFM package has method build_interactions that allows to fill in the interactions matrix.
# As the input for this method need to pass the list of tuples (visitorid, itemid, weight).

start_time = time.time()

# Train set interactions transformed.
df_events_train_interactions = []
for index, row in df_events_train.iterrows():
    df_events_train_interactions.append((int(row['visitorid']), int(row['itemid']), int(row['event'])))
    
# Test set interactions transformed.
df_events_test_interactions = []
for index, row in df_events_test.iterrows():
    df_events_test_interactions.append((int(row['visitorid']), int(row['itemid']), int(row['event'])))
    
print('Finished in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
# Check original VS transformed length, should be equal.

print('Check original VS transformed length TRAIN: ', 
     len(df_events_train),
     '/',
     len(df_events_train_interactions))

print('Check original VS transformed length TEST: ', 
     len(df_events_test),
     '/',
     len(df_events_test_interactions))

# Preprocessing
## Data Import and Cleaning - Item Properties

In [None]:
# Import Properties

df_properties1 = pd.DataFrame(pd.read_csv("item_properties_part1.csv"))
df_properties2 = pd.DataFrame(pd.read_csv("item_properties_part2.csv"))
df_properties = pd.concat([df_properties1, df_properties2])

# data to be sorted by timestamp to reflect the historical change log.
df_properties = df_properties.sort_values(by=['timestamp'], ascending=True).reset_index(drop=True)

df_properties.head(10)

In [None]:
df_properties.info()

In [None]:
df_properties_len_orig = len(df_properties)

In [None]:
# All the categories names and values are hashed excepting "categoryid" and "available".

df_properties.loc[df_properties['itemid'] == 216269].head()

In [None]:
# The model is unable to process the historical log so there's a need to trim the properties data.
# The latest properties data is considered to be the best to describe items assuming the ecommerce team was 
# constantly improving the catalogue.
# This action should lead to decrease of the dataframe size.

df_properties = df_properties.sort_values(by=['timestamp'], ascending=True).drop_duplicates(
    subset=['itemid', 'property'], 
    keep='last').reset_index(drop=True)

df_properties.info()

In [None]:
# Additionally, we can get rid of the 'available' property completely.
# It won't make sense to consider any value as fixed (in stock or not in stock) for trainig purposes.
# In production this property can be used in real time to filter out unavailable items from prediction.

df_properties = df_properties[~df_properties.property.isin(['available'])].reset_index(drop=True)

df_properties.info()

In [None]:
# As the next step the items which are not present in the cleaned dataframe can also be removed.

df_properties = df_properties[df_properties.itemid.isin(df_events['itemid'])].reset_index(drop=True)

df_properties.info()

In [None]:
# Timestamp column can also be removed as it is redundant.

df_properties = df_properties.drop(['timestamp'], axis=1)

In [None]:
# Sort dataframe by itemid.

df_properties = df_properties.sort_values(by=['itemid'], ascending=True).reset_index(drop=True)

In [None]:
print('Cleaned properties dataframe represents ',
      round(100*len(df_properties)/df_properties_len_orig,2),
     ' % of original dataframe.')

In [None]:
qty_all_properties = len(df_properties['property'].unique())
print('Cleaned dataset number of properties: ', qty_all_properties)
print('Cleaned dataset number of properties values: ', len(df_properties))

print('Cleaned dataset number of items: ', qty_all_items)
print('Cleaned dataset number of users: ', qty_all_users)
print('Cleaned dataset number of interactions: ', len(df_events))

In [None]:
df_properties.head(60)

### Transforming item features data into the format acceptable by lightFM model.

In [None]:
# The item features information should be passed to the LightFM model as a CSR matrix.
# This matrix must contain a mapping for all items involved in the train/test process and
# for all features available in the cleaned properties dataframe.
# The matrix is then filled in with the properties values.
# The LightFM model is then able to create a latent features vector for each item.


In [None]:
df_properties.loc[df_properties['itemid'] == 332253].head()

In [None]:
# Dataset class has the method build_item_features that allows to fill in the properties data using created mapping.
# Add feature mapping to the existing dataset using fit_partial method.
# Data needs to be transformed to the acceptable format.
# https://github.com/lyst/lightfm/issues/393#issuecomment-438237971


# Transform item features list to the format required by Dataset class:
# ['property:value']
item_features_mapping = []
for index, row in df_properties.iterrows():
    item_features_mapping.append(str(row['property']) + ':' + str(row['value']))
    
print('Properties mapping has:', len(item_features_mapping), 'records')

In [None]:
# As the next step need to remove duplicates in this list.
# So basically the feature mapping lenght is not equal to the number of features, 
# but is equal to the number of combinations ['property:value'] available in the dataframe.
# There's no need to "create" all possible combinations mapping as this is redundant info.
# Just need to create mapping for existing combinations.
# Additionally, the LightFM package allows to add weight to this combination if needed.

item_features_mapping = list( dict.fromkeys(item_features_mapping) )
print('Properties mapping has:', len(item_features_mapping), 'records')

In [None]:
# Transform item features values to the format required by Dataset class:
# [ (itemid_1, ['property_1:value_1', 'property_2:value_2']) ]

start_time = time.time()

item_features_values = []
current_item = df_properties.itemid[0]
current_item_features = []
for index, row in df_properties.iterrows():
    if row['itemid'] == current_item:
        current_item_features.append(str(row['property']) + ':' + str(row['value']))
    else:
        item_features_values.append((current_item, current_item_features))
        current_item = row['itemid']
        current_item_features = [str(row['property']) + ':' + str(row['value'])]
item_features_values.append((current_item, current_item_features))

print('Finished in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
item_features_values[:2]

# Preprocessing

## Create LightFM dataset mapping

In [None]:
# For model evaluation purposes (auc_score) dimensionality of train/test interaction matrices should be the same.
# In order to achieve this, need to create mapping for all users and all items.
# Then separately for train and test - the interactions will be filled in.

In [None]:
# Build the id mappings for users, items and item features.

# Dataset.fit receives every visitor id and item id found in the cleaned events data
# (train and test together); repeated ids are tolerated by the implementation.

# One shared mapping covering all users and items.
dataset = Dataset()
dataset.fit(
    users=iter(df_events['visitorid']),
    items=iter(df_events['itemid']),
    user_features=None,
    item_features=item_features_mapping,
)

# Train and test use the very same Dataset object (these are aliases, not copies).
dataset_train = dataset_test = dataset

## Populate LightFM dataset with data

In [None]:
# Populate interactions matrix for train and test sets.

start_time = time.time()

(interactions_train, weights_train) = dataset_train.build_interactions(df_events_train_interactions)
(interactions_test, weights_test) = dataset_test.build_interactions(df_events_test_interactions)

print('Finished in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
# Populate item features matrix with values.

start_time = time.time()

item_features = dataset.build_item_features(item_features_values)

print('Finished in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
# Compare the sizes reported by the Dataset object with the sizes derived from the dataframes.
print('Dataset class cross-check.')
print()

interactions_shape = dataset.interactions_shape()
num_users, num_items = interactions_shape
print('All users expected:', qty_all_users)
print('Actual number of users:', num_users)
print()
print('All items expected:', qty_all_items)
print('Actual number of items:', num_items)
print()

# Expected width: one column per item plus one per 'property:value' feature
# (presumably the extra per-item columns are LightFM's item identity features).
expected_feature_columns = qty_all_items+len(item_features_mapping)
print('Item features matrix is expected to be of size: (', qty_all_items, ',',
      expected_feature_columns, ')')
print('Actual size is:', dataset.item_features_shape())
print()

# Expected stored values: one per properties row plus one extra entry per item.
expected_feature_values = len(df_properties)+qty_all_items
print('Item features matrix number of values is expected to be:', expected_feature_values)
print('Actual number of values is:', item_features.getnnz())
print()

print('Interactions matrix is expected to be of the size: (', qty_all_users, ',', qty_all_items, ')')
print('Actual size is:', interactions_shape)
print()

expected_interactions = len(df_events_train) + len(df_events_test)
actual_interactions = weights_train.getnnz() + weights_test.getnnz()
print('Interactions matrix number of values is expected to be:', expected_interactions)
print('Actual number of values is:', actual_interactions)
print()

# LightFM model training (with item features)

In [None]:
# Update the model with item features. This will mean hybrid predictions.

start_time = time.time()

model_hybrid = LightFM(no_components=100, loss='warp', random_state=2020)

model_hybrid.fit(weights_train, 
                 item_features=item_features, 
                 epochs=100, 
                 num_threads=4)

print('Model trained in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
model_hybrid.get_params()

In [None]:
# Model evaluation (auc_score, precision_at_k)

In [None]:
# Since the recommendation engine is a ranking problem, AUC ROC and Precision at K will be used.
# Both of them are measuring the ranking quality.


In [None]:
start_time = time.time()

train_auc = auc_score(model_hybrid, 
                      weights_train, 
                      item_features=item_features, 
                      num_threads=4).mean()

print('Train AUC score: ', train_auc)
print('Calculated in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
start_time = time.time()

# Train interactions fill be also passed to avoid model re-recommending items to users.
test_auc = auc_score(model_hybrid,
                     weights_test, 
                     item_features=item_features, 
                     train_interactions = weights_train, 
                     num_threads=4).mean()

print('Test AUC score: ', test_auc)
print('Calculated in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
# Set parameter "k" value to check precision at "k"

k = 10

In [None]:
start_time = time.time()

train_precision = precision_at_k(model_hybrid, 
                                 weights_train,
                                 item_features=item_features, 
                                 num_threads=4, 
                                 k=k).mean()

print('Train precision at k: ', train_precision)
print('Calculated in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
start_time = time.time()

test_precision = precision_at_k(model_hybrid, 
                                weights_test, 
                                item_features=item_features, 
                                train_interactions = weights_train, 
                                num_threads=4, 
                                k=k).mean()

print('Test precision at k: ', test_precision)
print('Calculated in: ', round((time.time()-start_time)/60, 2), " minutes")

# Hyperparameter tuning

In [None]:
# Split train set into train_train and validate subsets.
from lightfm.cross_validation import random_train_test_split

# A fixed random state makes the split, and therefore the outcome of the hyperparameter
# search, reproducible between runs, in line with the fixed random_state of the models.
(train_train, validate) = random_train_test_split(weights_train,
                                                  test_percentage=0.2,
                                                  random_state=np.random.RandomState(2020))

In [None]:
import itertools

import numpy as np

from lightfm import LightFM
from lightfm.evaluation import precision_at_k


def sample_hyperparameters(random_state=None):
    """
    Yield an endless stream of randomly sampled hyperparameter choices.

    Parameters
    ----------

    random_state: np.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so seeding with ``np.random.seed`` keeps working.

    Returns
    -------

    generator of dict
        Each dict holds the keyword arguments for the LightFM constructor
        plus "num_epochs". All sampled values are plain Python ``int`` /
        ``float`` / ``str`` objects rather than NumPy scalars, so the dict
        prints cleanly and can be serialized.

    """

    rng = np.random if random_state is None else random_state

    while True:
        # The draw order is fixed, so a given seed always yields the same sequence.
        yield {
            "no_components": int(rng.randint(10, 200)),
            "learning_schedule": str(rng.choice(["adagrad", "adadelta"])),
            "loss": str(rng.choice(["bpr", "warp"])),
            "learning_rate": float(rng.exponential(0.05)),
            "item_alpha": float(rng.exponential(1e-10)),
            "max_sampled": int(rng.randint(5, 15)),
            "num_epochs": int(rng.randint(10, 100)),
            "random_state": 2020,
        }


def random_search(train, validate, num_samples=20, k=10, num_threads=4, item_features=None):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the validation set.

    Parameters
    ----------

    train: np.float32 coo_matrix of shape [n_users, n_items]
        Training data.
    validate: np.float32 coo_matrix of shape [n_users, n_items]
        Validation data.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.
    k: int, optional
        Cut-off used for the precision at k score.
    num_threads: int, optional
        Number of threads used for fitting and for evaluation.
    item_features: csr_matrix of shape [n_items, n_item_features], optional
        Item features passed to both fitting and evaluation. When omitted,
        the candidates are fitted and scored without item features.


    Returns
    -------

    generator of (precision_at_k, hyperparameter dict, fitted model)

    """

    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        # "num_epochs" is an argument of fit(), not of the LightFM constructor.
        num_epochs = hyperparams.pop("num_epochs")

        model_tuned = LightFM(**hyperparams)
        model_tuned.fit(train,
                        item_features=item_features,
                        epochs=num_epochs,
                        num_threads=num_threads)

        # Train interactions are passed so that known positives are excluded from the ranking.
        ranking_score = precision_at_k(model_tuned, validate,
                                       train_interactions=train,
                                       item_features=item_features,
                                       num_threads=num_threads,
                                       k=k).mean()

        # Put the number of epochs back so the yielded dict describes the full configuration.
        hyperparams["num_epochs"] = num_epochs

        yield (ranking_score, hyperparams, model_tuned)





start_time = time.time()

if __name__ == "__main__":

    # Keep the candidate with the highest ranking score (first element of each result tuple).
    best_candidate = max(random_search(train_train, validate),
                         key=lambda candidate: candidate[0])
    ranking_score, hyperparams, model_tuned = best_candidate

    print('Best presicion ranking score {} at {}'.format(ranking_score, hyperparams))
    print('Calculated in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
# Tuned LightFM model training (with item features)

In [None]:
# Use the tuned model.

start_time = time.time()

# model_hybrid is an alias of model_tuned, so the best candidate object itself is refitted.
model_hybrid = model_tuned

# The binary interactions matrix goes in as `interactions` and the event weights as
# `sample_weight`. If the weights matrix is passed as `interactions`, its values only
# mark positive interactions and every interaction is trained with weight 1.0, so the
# configured event weights have no effect.
# Both matrices come from the same build_interactions call, so their row/col arrays match.
model_hybrid.fit(interactions_train,
                 sample_weight=weights_train,
                 item_features=item_features,
                 epochs=hyperparams['num_epochs'],
                 num_threads=4)

print('Model trained in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
# Tuned model evaluation (auc_score, precision_at_k)

In [None]:
start_time = time.time()

train_auc = auc_score(model_hybrid, 
                      weights_train, 
                      item_features=item_features, 
                      num_threads=4).mean()

print('Train AUC score: ', train_auc)
print('Calculated in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
start_time = time.time()

# Train interactions fill be also passed to avoid model re-recommending items to users.
test_auc = auc_score(model_hybrid,
                     weights_test, 
                     item_features=item_features, 
                     train_interactions = weights_train, 
                     num_threads=4).mean()

print('Test AUC score: ', test_auc)
print('Calculated in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
# Set parameter "k" value to check precision at "k"

k = 10

In [None]:
start_time = time.time()

train_precision = precision_at_k(model_hybrid, 
                                 weights_train,
                                 item_features=item_features, 
                                 num_threads=4, 
                                 k=k).mean()

print('Train precision at k: ', train_precision)
print('Calculated in: ', round((time.time()-start_time)/60, 2), " minutes")

In [None]:
start_time = time.time()

test_precision = precision_at_k(model_hybrid, 
                                weights_test, 
                                item_features=item_features, 
                                train_interactions = weights_train, 
                                num_threads=4, 
                                k=k).mean()

print('Test precision at k: ', test_precision)
print('Calculated in: ', round((time.time()-start_time)/60, 2), " minutes")