In [1]:
import tqdm as notebook_tqdm
import functions as f
import pandas as pd
import numpy as np

from recpack.datasets import MovieLens25M
from recpack.scenarios import WeakGeneralization
from recpack.pipelines import PipelineBuilder
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Relative path to the raw transactions CSV
transactions_path = '../00 - Data/transactions_train/transactions_train.csv'
# Shrink the dataset with the project helper; its return value is used below as the path of the CSV to load.
# NOTE(review): the second argument (2) is presumably the number of months to keep — the printed
# date range spans two months; confirm in functions.shorten_transactions
short_transactions_path = f.shorten_transactions(transactions_path,2)
# Load the shortened transactions into a DataFrame
transactions = pd.read_csv(short_transactions_path)

Number of elements in database before : 31788324
Number of elements in database after : 2522561
Database contains recrods from : 2020-09-15 00:00:00 to : 2020-07-15 00:00:00


In [5]:
# Turn the transactions DataFrame into a user x item interaction matrix:
#        item1   item2   item3
# usr1     x               x
# usr2             x       x
ITEM_COL = 'article_id'
USER_COL = 'customer_id'
TIMESTAMP_COL = 't_dat'
MIN_INTERACTIONS = 2  # threshold shared by both filters below

proc = DataFramePreprocessor(item_ix=ITEM_COL, user_ix=USER_COL, timestamp_ix=TIMESTAMP_COL)
# Keep only items that were bought by at least MIN_INTERACTIONS users
proc.add_filter(MinUsersPerItem(MIN_INTERACTIONS, item_ix=ITEM_COL, user_ix=USER_COL))
# Keep only users who bought at least MIN_INTERACTIONS items
proc.add_filter(MinItemsPerUser(MIN_INTERACTIONS, item_ix=ITEM_COL, user_ix=USER_COL))
# NOTE(review): filters presumably run in the order they were added, so removing users in the
# second step may push some items back below the item-level threshold — confirm in the RecPack docs.

interaction_matrix = proc.process(transactions)

100%|██████████| 2429824/2429824 [00:02<00:00, 929027.67it/s] 
100%|██████████| 2429824/2429824 [00:02<00:00, 1043313.75it/s]


In [6]:
# Weak generalization split: per the RecPack docs, each user's interactions are divided so that
# 75% go to training and the remaining 25% to test. With validation=True the training part is
# split once more to obtain the validation data used for hyperparameter optimisation.
# WeakGeneralization is already imported in the first cell, so it is not re-imported here.
SPLIT_SEED = 42  # fixed seed so the random split (and the metrics below) are reproducible
scenario = WeakGeneralization(0.75, validation=True, seed=SPLIT_SEED)
scenario.split(interaction_matrix)

# The pipeline builder takes its train / validation / test data from the scenario
builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

338534it [01:25, 3956.89it/s]
338534it [01:26, 3907.99it/s]


To compute NDCG@K, you first calculate the Discounted Cumulative Gain (DCG) at K, which is the sum of the relevance scores of the top-K recommended items, each discounted by its position in the list. Relevance scores are often binary (relevant or not relevant) or graded (e.g., on a scale from 1 to 5).
Then, you calculate the Ideal DCG (IDCG) at K, which represents the best possible DCG score if the recommendations were perfectly relevant.
Finally, you compute NDCG@K as the ratio of DCG@K to IDCG@K, normalizing the score to be between 0 and 1. A higher NDCG@K indicates better recommendations.
Coverage@K:

Coverage is a metric that measures how diverse or comprehensive a recommendation system is in terms of the items it suggests.
The "@K" in this metric signifies that it is calculated for the top K recommendations.
The idea is to assess the ability of the system to cover a wide range of items in its recommendations, not just focusing on a few popular items.
The Coverage@K metric can be calculated in various ways, but a common approach is to count the unique items that appear in the top-K recommendations across all users and divide that count by the total number of items, which gives a value between 0 and 1. A higher Coverage@K indicates that the recommendations cover a larger variety of items.

In [8]:
# Hyperparameter grid for ItemKNN: neighbourhood size K crossed with two similarity measures
item_knn_grid = {
    'K': [100, 200, 500],
    'similarity': ['cosine', 'conditional_probability'],
}

# Register the algorithms to compare; Popularity is a baseline and is not given a grid to search
builder.add_algorithm('Popularity')
builder.add_algorithm('ItemKNN', grid=item_knn_grid)

# Hyperparameters are selected on NDCG@10
# (NDCGK = Normalized Discounted Cumulative Gain over the top-K recommendations)
builder.set_optimisation_metric('NDCGK', K=10)

# Metrics reported in the final evaluation, each at several cutoffs K
ndcg_cutoffs = [10, 20, 50]
coverage_cutoffs = [10, 20]
builder.add_metric('NDCGK', K=ndcg_cutoffs)
builder.add_metric('CoverageK', K=coverage_cutoffs)



In [11]:
# Build the pipeline from everything configured on the builder
pipeline = builder.build()
# Run it: the log below shows Popularity and ItemKNN being fitted; the recorded run took about five minutes
pipeline.run()

  0%|          | 0/2 [00:00<?, ?it/s]

2023-11-07 18:09:15,216 - base - recpack - INFO - Fitting Popularity complete - Took 0.541s


  self._set_arrayXarray(i, j, x)


2023-11-07 18:10:00,681 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.62s


  self._set_arrayXarray(i, j, x)


2023-11-07 18:10:18,900 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.92s


  self._set_arrayXarray(i, j, x)


2023-11-07 18:10:35,394 - base - recpack - INFO - Fitting ItemKNN complete - Took 3.55s


  self._set_arrayXarray(i, j, x)


2023-11-07 18:11:05,859 - base - recpack - INFO - Fitting ItemKNN complete - Took 4.05s


  self._set_arrayXarray(i, j, x)


2023-11-07 18:11:34,932 - base - recpack - INFO - Fitting ItemKNN complete - Took 6.26s


  self._set_arrayXarray(i, j, x)


2023-11-07 18:12:36,402 - base - recpack - INFO - Fitting ItemKNN complete - Took 6.52s


  self._set_arrayXarray(i, j, x)


2023-11-07 18:13:23,469 - base - recpack - INFO - Fitting ItemKNN complete - Took 3.28s


100%|██████████| 2/2 [04:55<00:00, 147.68s/it]


For the ItemKNN recommender, the top-10 recommendations cover a large share of the item catalogue (Coverage@10 ≈ 0.77),
but its NDCG@10 (≈ 0.095) shows that it is not very good at ranking relevant items near the top. This leads me to believe that I am recommending too many novel items.


In [14]:
# Show the final evaluation metrics, one row per algorithm
pipeline.get_metrics()


Unnamed: 0,NDCGK_10,NDCGK_20,NDCGK_50,CoverageK_10,CoverageK_20
Popularity(K=200),0.005446,0.007678,0.011997,0.000489,0.000848
"ItemKNN(K=100,normalize_X=False,normalize_sim=False,pop_discount=None,similarity=conditional_probability)",0.095057,0.10507,0.117266,0.773613,0.878079


In [13]:
# Show the optimisation metric (NDCG@10) obtained by each ItemKNN grid combination
pipeline.optimisation_results


Unnamed: 0,algorithm,identifier,params,NDCGK_10
0,ItemKNN,"ItemKNN(K=100,normalize_X=False,normalize_sim=...","{'K': 100, 'similarity': 'cosine'}",0.066662
1,ItemKNN,"ItemKNN(K=100,normalize_X=False,normalize_sim=...","{'K': 100, 'similarity': 'conditional_probabil...",0.08127
2,ItemKNN,"ItemKNN(K=200,normalize_X=False,normalize_sim=...","{'K': 200, 'similarity': 'cosine'}",0.067496
3,ItemKNN,"ItemKNN(K=200,normalize_X=False,normalize_sim=...","{'K': 200, 'similarity': 'conditional_probabil...",0.081221
4,ItemKNN,"ItemKNN(K=500,normalize_X=False,normalize_sim=...","{'K': 500, 'similarity': 'cosine'}",0.068564
5,ItemKNN,"ItemKNN(K=500,normalize_X=False,normalize_sim=...","{'K': 500, 'similarity': 'conditional_probabil...",0.081145
