First we set up the system and the library imports.

In [4]:
import sys

import itertools
import logging
import os

import numpy as np
import pandas as pd
import papermill as pm

from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.sar import SAR

from pandas.io.formats.format import DataFrameFormatter

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))

System version: 3.9.16 (main, Dec  7 2022, 01:11:51) 
[GCC 9.4.0]
Pandas version: 1.3.5


In the code blocks below we define the top k items to recommend as 10, read the dataset, drop duplicate rows, convert the 'Interaction Type' column to float32 data type and convert the 'Interaction Timestamp' column to a datetime object followed by a numerical value (timestamp in seconds).

In [13]:
TOP_K = 10

data = pd.read_csv('supermarket_data.csv')
data = data.drop_duplicates()
data.head()

Unnamed: 0,User ID,Product ID,Interaction Type,Interaction Timestamp,category_numerical,Age,Gender,Occupation,Location,Product Name,Expiration Date
0,769,67,3,2023-03-09 17:26:53,3,56,Male,Student,Phoenix,photo/film,2024-04-03
1,869,67,4,2023-03-09 17:26:53,4,27,Male,Teacher,Chicago,photo/film,2024-04-03
2,289,67,3,2023-03-09 17:26:53,3,34,Male,Businessman,Houston,photo/film,2024-04-03
3,289,67,4,2023-03-09 17:26:53,4,34,Male,Businessman,Houston,photo/film,2024-04-03
4,963,67,2,2023-03-09 17:26:53,2,53,Female,Engineer,New York,photo/film,2024-04-03


In [14]:
data.loc[:, 'Interaction Type'] = data['Interaction Type'].astype(np.float32)


data["Interaction Timestamp"] = pd.to_datetime(data["Interaction Timestamp"])


data['unix_timestamp'] = data['Interaction Timestamp'].astype(int)

data.head()


  data['unix_timestamp'] = data['Interaction Timestamp'].astype(int)


Unnamed: 0,User ID,Product ID,Interaction Type,Interaction Timestamp,category_numerical,Age,Gender,Occupation,Location,Product Name,Expiration Date,unix_timestamp
0,769,67,3.0,2023-03-09 17:26:53,3,56,Male,Student,Phoenix,photo/film,2024-04-03,1678382813000000000
1,869,67,4.0,2023-03-09 17:26:53,4,27,Male,Teacher,Chicago,photo/film,2024-04-03,1678382813000000000
2,289,67,3.0,2023-03-09 17:26:53,3,34,Male,Businessman,Houston,photo/film,2024-04-03,1678382813000000000
3,289,67,4.0,2023-03-09 17:26:53,4,34,Male,Businessman,Houston,photo/film,2024-04-03,1678382813000000000
4,963,67,2.0,2023-03-09 17:26:53,2,53,Female,Engineer,New York,photo/film,2024-04-03,1678382813000000000


Below we create a header dictionary, used to specify the columns for the train and test datasets. We split the data into training and testing sets by using the python_stratified_split method from the recommenders.datasets.python_splitters package.

In [15]:
header = {
    "col_user": "User ID",
    "col_item": "Product ID",
    "col_rating": "Interaction Type",
    "col_timestamp": "unix_timestamp",
    "col_prediction": "Prediction",
}

In [16]:
train, test = python_stratified_split(data, ratio = 0.75, col_user=header["col_user"], col_item=header["col_item"], seed = 42)

 Here we fit a SAR (Smart Adaptive Recommendation) model on the training data.

In [17]:
logging.basicConfig(level = logging.DEBUG, 
                    format = '%(asctime)s %(levelname)-8s %(message)s')

model = SAR(
    similarity_type = "jaccard", 
    time_decay_coefficient = 30, 
    time_now = None, 
    timedecay_formula = True, 
    **header
)

model.fit(train)

top_k = model.recommend_k_items(test, top_k = TOP_K, remove_seen = True)

In [21]:
top_k_with_titles = (top_k.join(data[['Product ID', 'Product Name']].drop_duplicates().set_index('Product ID'), 
                                on='Product ID', 
                                how='inner').sort_values(by=['User ID', 'Prediction'], ascending=False))
display(top_k_with_titles.head(10))

Unnamed: 0,User ID,Product ID,Prediction,Product Name
9990,999,106,32.462301,liver loaf
9991,999,63,32.400738,UHT-milk
9992,999,139,31.888214,tidbits
9993,999,102,31.746948,semi-finished bread
9994,999,116,31.688878,rum
9995,999,44,31.661939,curd
9996,999,136,31.526966,male cosmetics
9997,999,62,31.47782,liquor (appetizer)
9998,999,0,31.003124,tropical fruit
9999,999,80,30.790492,Instant food products


Here we evaluate the model's performance using different ranking metrics with the same arguments: MAP, NDCG, Precision@K and Recall@K for the test data and generated recommendations.

In [22]:
args = [test, top_k]
kwargs = dict(col_user = 'User ID', 
              col_item = 'Product ID', 
              col_rating = 'Interaction Type', 
              col_prediction = 'Prediction', 
              relevancy_method = 'top_k', 
              k = TOP_K)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

In [23]:
print(f"Model:",
      f"Top K:\t\t {TOP_K}",
      f"MAP:\t\t {eval_map:f}",
      f"NDCG:\t\t {eval_ndcg:f}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}", sep='\n')

Model:
Top K:		 10
MAP:		 0.005236
NDCG:		 0.045171
Precision@K:	 0.055900
Recall@K:	 0.023893


Below we generate the top recommendations for a particular user by calling the recommend_for_user function with the model, test data, user_id, and num_recs parameters and print the output. We first obtain the user's interactions from the test dataset, then generate the top-k recommendations, join with the product names and return the reccomendations.

In [24]:
def recommend_for_user(model, test_data, user_id, num_recs):
    
    user_interactions = test_data[test_data['User ID'] == user_id]
    
    
    top_k = model.recommend_k_items(user_interactions, top_k=num_recs, remove_seen = True)
    
    
    top_k_with_titles = top_k.join(data[['Product ID', 'Product Name']].drop_duplicates().set_index('Product ID'), 
                                    on = 'Product ID', 
                                    how = 'inner').sort_values(by='Prediction', ascending = False)
    
    
    return top_k_with_titles.head(num_recs)


Finally, we generate the top-k recommendations for a specific user using the recommend_for_user function.

In [25]:
user_id = 100
num_recs = 3
top_recs = recommend_for_user(model, test, user_id, num_recs)
print(top_recs)

   User ID  Product ID  Prediction Product Name
0      100          67   48.912530   photo/film
1      100          63   47.465668     UHT-milk
2      100          60   46.634788       liquor
