In [1]:
import numpy as np
import pandas as pd
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.scenarios import Timed
from recpack.pipelines import PipelineBuilder

# import utils file from previous lecture
import sys
sys.path.append('../lecture4')
from utils import DATA_PATH, customer_hex_id_to_int

In [2]:
# Read the training transactions from the parquet file located under DATA_PATH.
transactions_path = f'{DATA_PATH}/transactions_train.parquet'
transactions = pd.read_parquet(transactions_path)
# The customers and articles tables are not loaded here; uncomment if needed:
# customers = pd.read_parquet(f'{DATA_PATH}/customers.parquet')
# articles = pd.read_parquet(f'{DATA_PATH}/articles.parquet')

In [3]:
# The largest value in the `week` column is used as the test week further down.
test_week = transactions['week'].max()
# Keep only the rows whose week is strictly greater than `test_week - 20`
# (i.e. the last 20 week values, assuming weeks are consecutive integers).
oldest_excluded_week = test_week - 20
recent_mask = transactions['week'] > oldest_excluded_week
transactions = transactions.loc[recent_mask]

In [4]:
# Report how many distinct customers and articles are present in `transactions`.
n_customers = transactions["customer_id"].nunique()
n_articles = transactions["article_id"].nunique()
print(f'Unique customers: {n_customers}')
print(f'Unique articles: {n_articles}')

Unique customers: 659008
Unique articles: 48548


## Preprocessing

In [5]:
# Column names shared by the preprocessor and by both filters.
id_columns = {'item_ix': 'article_id', 'user_ix': 'customer_id'}

# The `week` column serves as the timestamp of each interaction.
proc = DataFramePreprocessor(timestamp_ix='week', **id_columns)

# Filters are registered in this order: MinUsersPerItem first, then MinItemsPerUser,
# each constructed with 20 as its first argument (presumably the minimum count required).
for filter_cls in (MinUsersPerItem, MinItemsPerUser):
    proc.add_filter(filter_cls(20, **id_columns))

interaction_matrix = proc.process(transactions)

  0%|          | 0/2223746 [00:00<?, ?it/s]

  0%|          | 0/2223746 [00:00<?, ?it/s]

In [6]:
# Time-based split: `t` is the test week and `t_validation` is the week right before it.
validation_week = test_week - 1
scenario = Timed(
    t=test_week,
    t_validation=validation_week,
    delta_out=None,
    delta_in=None,
    validation=True,
)
scenario.split(interaction_matrix)

In [7]:
builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

# Result recorded from an earlier, wider search:
# [30, 400] => best: ItemKNN(K=80,normalize_X=False,normalize_sim=True,pop_discount=None,similarity=cosine), Recall12=0.013781
item_knn_grid = {
    'K': list(range(50, 100, 10)),
    'similarity': ['cosine'],
    'normalize_X': [True, False],
    'normalize_sim': [True],
}
builder.add_algorithm('ItemKNN', grid=item_knn_grid)

# Result recorded from an earlier, wider search:
# [50, 800] => best: TARSItemKNN(K=720,fit_decay=0.05,predict_decay=0.3333333333333333,similarity=cosine), Recall12=0.026913
tars_item_knn_grid = {
    'K': list(range(700, 800, 20)),
    'similarity': ['cosine'],
    'fit_decay': [1/20],
    'predict_decay': [1/3],
}
builder.add_algorithm('TARSItemKNN', grid=tars_item_knn_grid)

# Register precision first, then recall, both at the same four cutoffs.
for metric_name in ('PrecisionK', 'RecallK'):
    builder.add_metric(metric_name, K=[12, 20, 30, 40])

# RecallK at K=12 is the metric the grid search optimises.
builder.set_optimisation_metric('RecallK', K=12)

In [8]:
import warnings
from scipy.sparse import SparseEfficiencyWarning

# Silence both warning categories before running the pipeline.
# Note: these filters are installed globally and stay active for the rest of the session.
for ignored_category in (UserWarning, SparseEfficiencyWarning):
    warnings.simplefilter("ignore", ignored_category)

# Build the pipeline from the configured builder and execute it.
pipeline = builder.build()
pipeline.run()

  0%|          | 0/2 [00:00<?, ?it/s]

2022-11-22 11:31:00,748 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.23s
2022-11-22 11:31:05,655 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.88s
2022-11-22 11:31:09,554 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.18s
2022-11-22 11:31:14,179 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.46s
2022-11-22 11:31:19,610 - base - recpack - INFO - Fitting ItemKNN complete - Took 3.39s
2022-11-22 11:31:24,880 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.44s
2022-11-22 11:31:30,662 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.51s
2022-11-22 11:31:36,092 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.54s
2022-11-22 11:31:41,589 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.68s
2022-11-22 11:31:47,483 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.73s
2022-11-22 11:31:53,161 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.68s
2022-11-22 11:32:19,828 - base -

In [9]:
# Show the hyperparameter-search table: one row per evaluated configuration with its
# identifier, params and score on the optimisation metric (RecallK at K=12).
# NOTE(review): presumably scored on the validation split — confirm in the recpack docs.
pipeline.optimisation_results

Unnamed: 0,identifier,params,recallk_12
0,"ItemKNN(K=50,normalize_X=True,normalize_sim=Tr...","{'K': 50, 'normalize_X': True, 'normalize_sim'...",0.012669
1,"ItemKNN(K=50,normalize_X=False,normalize_sim=T...","{'K': 50, 'normalize_X': False, 'normalize_sim...",0.014281
2,"ItemKNN(K=60,normalize_X=True,normalize_sim=Tr...","{'K': 60, 'normalize_X': True, 'normalize_sim'...",0.012623
3,"ItemKNN(K=60,normalize_X=False,normalize_sim=T...","{'K': 60, 'normalize_X': False, 'normalize_sim...",0.013894
4,"ItemKNN(K=70,normalize_X=True,normalize_sim=Tr...","{'K': 70, 'normalize_X': True, 'normalize_sim'...",0.012633
5,"ItemKNN(K=70,normalize_X=False,normalize_sim=T...","{'K': 70, 'normalize_X': False, 'normalize_sim...",0.014195
6,"ItemKNN(K=80,normalize_X=True,normalize_sim=Tr...","{'K': 80, 'normalize_X': True, 'normalize_sim'...",0.012375
7,"ItemKNN(K=80,normalize_X=False,normalize_sim=T...","{'K': 80, 'normalize_X': False, 'normalize_sim...",0.014299
8,"ItemKNN(K=90,normalize_X=True,normalize_sim=Tr...","{'K': 90, 'normalize_X': True, 'normalize_sim'...",0.012451
9,"ItemKNN(K=90,normalize_X=False,normalize_sim=T...","{'K': 90, 'normalize_X': False, 'normalize_sim...",0.014236


In [10]:
# Show the final metrics table: one row per selected algorithm configuration and one
# column per registered metric/cutoff pair (PrecisionK and RecallK at 12, 20, 30, 40).
pipeline.get_metrics()

Unnamed: 0,precisionk_12,precisionk_20,precisionk_30,precisionk_40,recallk_12,recallk_20,recallk_30,recallk_40
"ItemKNN(K=80,normalize_X=False,normalize_sim=True,pop_discount=None,similarity=cosine)",0.004519,0.003898,0.00354,0.00327,0.013781,0.020367,0.027977,0.034636
"TARSItemKNN(K=720,fit_decay=0.05,predict_decay=0.3333333333333333,similarity=cosine)",0.007849,0.006455,0.005455,0.004818,0.026913,0.036027,0.044954,0.051914
