In [1]:
import pandas as pd
import numpy as np

In [2]:
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.scenarios import Timed
from recpack.algorithms import Random, Popularity, ItemKNN, EASE
from recpack.pipelines import PipelineBuilder, GridSearchInfo
from recpack.metrics import RecallK

# Testing various algorithms

In the following cells, I set up a recpack pipeline with the H&M dataset and test different algorithms. The algorithms are evaluated on recall@10/25/50.

Algorithm | R@10	 | R@25		| R@50   
----------|----------|----------|---------
Popularity | 0.015959 | 0.03930	| 0.068584
ItemKNN   | 0.047625 | 0.076670	| 0.106593 
EASE      | 0.046306 | 0.074782	| 0.103866
SVD       | 0.030904 | 0.051831	| 0.078366
NMF       | 0.027480 | 0.04821	| 0.072477
Prod2Vec  | 0.004908 | 0.010384	| 0.017368 
SLIM      | 0.000000 | 0.000000 | 0.000000

## Load data

The dataset is preprocessed by removing items with fewer than 100 customers to reduce the number of items to a more manageable number. Only the last 10 weeks of transactions are used.

In [3]:
# Root of the data directory and the sub-folder holding the parquet exports.
BASE_PATH = '../../data/'
DATA_PATH = BASE_PATH + 'parquet/'
# DATA_PATH = BASE_PATH + 'sample_0.05/'

# Read the three tables in one pass; the order of the file names matches the
# order of the variables on the left-hand side.
transactions, customers, articles = (
    pd.read_parquet(DATA_PATH + filename)
    for filename in (
        'transactions_train.parquet',
        'customers.parquet',
        'articles.parquet',
    )
)

In [6]:
# Only transactions from weeks after week 94 are fed to the preprocessor.
recent_weeks_mask = transactions.week > 94

# Preprocessor keyed on article/customer ids, with the week number as timestamp.
proc = DataFramePreprocessor(item_ix='article_id', user_ix='customer_id', timestamp_ix='week')

# Drop rarely bought articles (threshold 100) before building the matrix.
min_users_filter = MinUsersPerItem(100, item_ix='article_id', user_ix='customer_id')
proc.add_filter(min_users_filter)

interaction_matrix = proc.process(transactions.loc[recent_weeks_mask])

  0%|          | 0/2163645 [00:00<?, ?it/s]

  0%|          | 0/2163645 [00:00<?, ?it/s]

## Create scenario

A timed scenario is used with the final week as test week and the previous 4 weeks for training. For KNN a different scenario with validation enabled is used.

In [8]:
# Timed split at week 104 with delta_in=4. Per the notebook text this is meant
# to test on the final week and train on the 4 weeks before it.
scenario = Timed(104, delta_in=4)
# Alternative with a validation split (the notebook text says this variant is
# used for KNN); swap it in for the line above when needed.
# scenario = Timed(104, 103, delta_in=3, validation=True)
# Apply the split to the preprocessed interaction matrix.
scenario.split(interaction_matrix)

## Experiments

After fitting and evaluating a few algorithms, ItemKNN was chosen as it performed very well and took by far the least amount of fitting time.

In [9]:
# Build an evaluation pipeline on top of the split scenario.
builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

# Algorithms to evaluate. Only Popularity is active; the commented-out lines
# below are the other algorithms from the results table and can be re-enabled
# one at a time.
builder.add_algorithm('Popularity')
# ItemKNN with a hyper-parameter grid search.
# NOTE(review): presumably requires the scenario variant with validation=True — confirm.
# builder.add_algorithm('ItemKNN', optimisation_info=GridSearchInfo({
#     'K': [50, 100, 200, 500],
#     'similarity': ['cosine', 'conditional_probability'],
#     'pop_discount': [None, 0.5],
#     'normalize_X': [True, False],
#     'normalize_sim': [True, False],
# }))
# builder.add_algorithm("EASE")
# builder.add_algorithm("SVD")
# builder.add_algorithm("NMF")
# builder.add_algorithm("Prod2Vec")
# builder.add_algorithm("SLIM")

# Recall@10 is the optimisation metric; recall@10/25/50 are the reported metrics.
builder.set_optimisation_metric('RecallK', K=10)
builder.add_metric('RecallK', K=[10, 25, 50])
pipeline = builder.build()

In [10]:
# Run the pipeline for the algorithms configured on the builder.
pipeline.run()

  0%|          | 0/1 [00:00<?, ?it/s]

2023-12-27 17:46:02,064 - base - recpack - INFO - Fitting Popularity complete - Took 0.121s


In [11]:
# Display the RecallK_10/25/50 table for the algorithms that were run.
pipeline.get_metrics()

Unnamed: 0,RecallK_10,RecallK_25,RecallK_50
Popularity(K=200),0.017484,0.037025,0.068299


In [129]:
# Per-configuration scores of the hyper-parameter search.
# NOTE(review): the saved output of this cell has execution count 129 and lists
# ItemKNN grid-search rows, so it stems from an earlier run with the ItemKNN
# grid search enabled, not from the Popularity-only pipeline configured above.
# Re-run with ItemKNN enabled to reproduce it.
pipeline.optimisation_results

Unnamed: 0,algorithm,identifier,params,RecallK_10
0,ItemKNN,"ItemKNN(K=50,normalize_X=True,normalize_sim=Tr...","{'K': 50, 'normalize_X': True, 'normalize_sim'...",0.025129
1,ItemKNN,"ItemKNN(K=50,normalize_X=True,normalize_sim=Tr...","{'K': 50, 'normalize_X': True, 'normalize_sim'...",0.047095
2,ItemKNN,"ItemKNN(K=50,normalize_X=True,normalize_sim=Tr...","{'K': 50, 'normalize_X': True, 'normalize_sim'...",0.025129
3,ItemKNN,"ItemKNN(K=50,normalize_X=True,normalize_sim=Tr...","{'K': 50, 'normalize_X': True, 'normalize_sim'...",0.033812
4,ItemKNN,"ItemKNN(K=50,normalize_X=True,normalize_sim=Fa...","{'K': 50, 'normalize_X': True, 'normalize_sim'...",0.025121
...,...,...,...,...
59,ItemKNN,"ItemKNN(K=500,normalize_X=False,normalize_sim=...","{'K': 500, 'normalize_X': False, 'normalize_si...",0.030181
60,ItemKNN,"ItemKNN(K=500,normalize_X=False,normalize_sim=...","{'K': 500, 'normalize_X': False, 'normalize_si...",0.034381
61,ItemKNN,"ItemKNN(K=500,normalize_X=False,normalize_sim=...","{'K': 500, 'normalize_X': False, 'normalize_si...",0.045535
62,ItemKNN,"ItemKNN(K=500,normalize_X=False,normalize_sim=...","{'K': 500, 'normalize_X': False, 'normalize_si...",0.034381


# Creating similarity lookup inspired by itemKNN

In the following cells, I attempt to get the top-100 similar items for each item inside a dataframe. This way, candidate generation code used in the other notebooks can just use the dataframe and doesn't need to concern itself with sparse matrices and rescaled user and item id spaces.

The code is mostly copied from the DataFramePreprocessor and ItemKNN classes in recpack. In a better world, I would have extracted the required data from a recpack pipeline, but this was faster at the moment (issues getting recpack to install on certain python environments). It also allowed me to better understand what was happening under the hood.

In [22]:
from recpack.util import get_top_K_ranks
from recpack.preprocessing.util import rescale_id_space
from recpack.algorithms.util import invert

from scipy.sparse import diags
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import Normalizer

## Load H&M dataset

Load the dataset as an interaction matrix, and remember the mappings between the original and new user/item ids.

In [18]:
# Build a user x item interaction matrix from the transactions, keeping track of
# the mappings between original ids and matrix row/column indices.

# Keep only articles that occur in more than 100 transaction rows (value_counts
# counts rows, so repeat purchases by the same customer count multiple times).
# No week filter is applied here: all rows of `transactions` are used.
# NOTE(review): `> 100` is strict, whereas the recpack pipeline earlier in this
# notebook uses MinUsersPerItem(100) — confirm which threshold is intended.
a = transactions.article_id.value_counts()
a = set(a[a > 100].index)

# Filter first and copy once: the id columns added below then go to an
# independent frame (no SettingWithCopyWarning) and the full `transactions`
# frame is never duplicated.
df = transactions[transactions.article_id.isin(a)].copy()

item_ids = list(df.article_id.unique())
user_ids = list(df.customer_id.unique())

# Mappings from original id to new id; presumably contiguous 0..n-1 indices,
# as they are used directly as matrix coordinates below.
user_id_mapping = rescale_id_space(user_ids)
item_id_mapping = rescale_id_space(item_ids)

# Passing the dict itself to .map is vectorised; every id is present in the
# mapping because the mappings were built from this very frame.
df['uid'] = df.customer_id.map(user_id_mapping)
df['iid'] = df.article_id.map(item_id_mapping)

# One entry of value 1 per transaction row; duplicate (uid, iid) pairs are
# summed by the csr_matrix constructor, so cells hold purchase counts.
values = np.ones(df.shape[0])
indices = (df['uid'].to_numpy(), df['iid'].to_numpy())
shape = (len(user_ids), len(item_ids))
matrix = csr_matrix((values, indices), shape=shape, dtype=np.int32)

## Fit ItemKNN

In [20]:
# Item-item cosine similarity, computed on the transposed (item x user) matrix
# and kept sparse.
item_cosine_similarities = cosine_similarity(matrix.T, dense_output=False)
# An item should never be recommended as its own neighbour.
item_cosine_similarities.setdiag(0)

# L1-normalise every row of the similarity matrix.
# NOTE(review): copy=False presumably normalises `item_cosine_similarities`
# in place, so that name no longer holds raw cosine values afterwards — confirm.
transformer = Normalizer(norm="l1", copy=False)
item_similarities = transformer.transform(item_cosine_similarities)

# Alternative similarity: conditional probability. Swap it in for the cosine
# block above to produce the other lookup table.
# X_binary = matrix.astype(bool).astype(matrix.dtype)
# co_mat = X_binary.T @ matrix
# A = invert(diags(X_binary.sum(axis=0).A[0]).tocsr())
# item_cond_prob_similarities = A @ co_mat
# item_cond_prob_similarities.setdiag(0)
# item_similarities = transformer.transform(item_cond_prob_similarities)

  self._set_arrayXarray(i, j, x)


In [23]:
# Rank matrix for the 100 best neighbours of every item, turned into a 0/1 mask
# by setting every positive rank to 1.
top_K_ranks = get_top_K_ranks(item_similarities, 100)
top_K_ranks[top_K_ranks > 0] = 1

# Element-wise product with the mask keeps only the top-100 similarities per item.
item_similarities = top_K_ranks.multiply(item_similarities)

## Create similarity dataframe

For each item, record the top-100 most similar items. Requires converting the article ids back to the original ids.

In [26]:
# Turn the sparse top-K similarity matrix into a long dataframe with one row per
# (article, similar article) pair, expressed in the original article ids.

# Extract all stored entries in one vectorised pass instead of one scalar sparse
# lookup per pair (very slow for millions of non-zeros). Filtering on
# `data != 0` keeps exactly the pairs that `.nonzero()` would report, in the
# same order.
sim_coo = item_similarities.tocoo()
sim_coo.sum_duplicates()  # no-op for canonical matrices; matches scalar lookups otherwise
nonzero_mask = sim_coo.data != 0
result = pd.DataFrame({
    'article_id': sim_coo.row[nonzero_mask],
    'similar_article_id': sim_coo.col[nonzero_mask],
    'score': sim_coo.data[nonzero_mask],
})

# Invert the original-id -> matrix-index mapping to translate indices back.
item_id_mapping_rev = {i1: i2 for (i2, i1) in item_id_mapping.items()}
result['article_id'] = result.article_id.map(item_id_mapping_rev)
result['similar_article_id'] = result.similar_article_id.map(item_id_mapping_rev)

# Per article, list the most similar articles first.
result = (
    result
    .sort_values(['article_id', 'score'], ascending=[True, False])
    .reset_index(drop=True)
)

In [27]:
# Persist the similarity lookup table to the current working directory.
# The commented-out line is the file name for the conditional-probability variant.
result.to_parquet('sim_cosine.parquet')
# result.to_parquet('sim_cond_prob.parquet')