In [1]:
import sys
import warnings

import numpy as np
import pandas as pd
from scipy.sparse import SparseEfficiencyWarning

from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.scenarios import Timed
from recpack.pipelines import PipelineBuilder
from recpack.algorithms import ItemKNN, KUNN
from recpack.metrics import RecallK, PrecisionK, NDCGK

# import utils file from previous lecture
sys.path.append('../lecture4')
from utils import DATA_PATH, customer_hex_id_to_int

In [2]:
# Load the transaction log. Only transactions are read here; the customers and
# articles tables (customers.parquet / articles.parquet under DATA_PATH) are
# left unloaded because none of the visible cells use them.
transactions = pd.read_parquet(
    path=f'{DATA_PATH}/transactions_train.parquet',
)

In [3]:
# The most recent week in the data; it is used below as the test split point.
test_week = transactions['week'].max()
# Keep rows whose week is strictly greater than test_week - 10, i.e. the last
# ten weeks including test_week (assuming integer week numbers — TODO confirm).
transactions = transactions.loc[transactions['week'].gt(test_week - 10)]

In [4]:
# Report the number of distinct customers and articles left after the week filter.
print(
    f'Unique customers: {transactions["customer_id"].nunique()}',
    f'Unique articles: {transactions["article_id"].nunique()}',
    sep='\n',
)

Unique customers: 437365
Unique articles: 38331


## Preprocessing

In [5]:
# Turn the transactions frame into an interaction matrix keyed by article
# (item), customer (user) and week (timestamp).
proc = DataFramePreprocessor(
    item_ix='article_id',
    user_ix='customer_id',
    timestamp_ix='week',
)

# Register the item-side filter first, then the user-side filter, both with a
# minimum count of 10.
for min_count_filter in (MinUsersPerItem, MinItemsPerUser):
    proc.add_filter(
        min_count_filter(10, item_ix='article_id', user_ix='customer_id')
    )

interaction_matrix = proc.process(transactions)

  0%|          | 0/1228106 [00:00<?, ?it/s]

  0%|          | 0/1228106 [00:00<?, ?it/s]

In [6]:
# Time-based split on the 'week' timestamp: test_week is the test boundary and
# the week before it is the validation boundary.
# delta_in / delta_out are passed as None — presumably "no window limit";
# confirm against the RecPack Timed documentation.
scenario = Timed(
    t=test_week,
    t_validation=test_week - 1,
    delta_in=None,
    delta_out=None,
    validation=True,
)
scenario.split(interaction_matrix)

In [7]:
# Configure the experiment: data splits come from the scenario, two algorithms
# are tuned over a hyperparameter grid, and RecallK@12 selects the best config.
builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

# ItemKNN: K in 50..140 (step 10), with and without normalize_X.
# Recorded result (the "[50, 600]" label presumably refers to an earlier, wider
# sweep over K — TODO confirm):
#   best: ItemKNN(K=90,normalize_X=False,normalize_sim=True,pop_discount=None,similarity=cosine), Recall12=0.024356
builder.add_algorithm(
    'ItemKNN',
    grid=dict(
        K=list(range(50, 150, 10)),
        similarity=['cosine'],
        normalize_X=[True, False],
        normalize_sim=[True],
    ),
)

# TARSItemKNN: K in 570..670 (step 10) crossed with three fit and three
# predict decay values.
# Recorded result (same "[50, 600]" label as above — TODO confirm its meaning):
#   best: TARSItemKNN(K=580,fit_decay=0.1,predict_decay=0.3333333333333333,similarity=cosine), Recall12=0.028117
builder.add_algorithm(
    'TARSItemKNN',
    grid=dict(
        K=list(range(570, 680, 10)),
        similarity=['cosine'],
        fit_decay=[1/2, 1/5, 1/10],
        predict_decay=[1/3, 1/5, 1/10],
    ),
)

# Report precision first, then recall, at the same four cut-offs.
for metric_name in ('PrecisionK', 'RecallK'):
    builder.add_metric(metric_name, K=[12, 20, 30, 40])

builder.set_optimisation_metric('RecallK', K=12)

In [8]:
# Build the configured pipeline and run it (grid search plus final evaluation).
#
# UserWarning and SparseEfficiencyWarning are silenced only while the pipeline
# is built and run. catch_warnings() restores the previous warning filters on
# exit, so later cells still surface warnings instead of inheriting a
# kernel-wide "ignore" filter.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    warnings.simplefilter("ignore", SparseEfficiencyWarning)

    pipeline = builder.build()
    pipeline.run()

  0%|          | 0/2 [00:00<?, ?it/s]

2022-11-22 08:44:12,974 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.873s
2022-11-22 08:44:14,781 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.883s
2022-11-22 08:44:16,693 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.96s
2022-11-22 08:44:18,764 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.979s
2022-11-22 08:44:20,874 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.05s
2022-11-22 08:44:23,233 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.08s
2022-11-22 08:44:25,539 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.13s
2022-11-22 08:44:28,202 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.21s
2022-11-22 08:44:30,856 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.28s
2022-11-22 08:44:33,674 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.31s
2022-11-22 08:44:36,631 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.38s
2022-11-22 08:44:39,670 - bas

In [9]:
# Hyperparameter search results: one row per evaluated configuration
# (identifier and params) with its score on the optimisation metric
# (recallk_12). Presumably scored on the validation split — TODO confirm.
pipeline.optimisation_results

Unnamed: 0,identifier,params,recallk_12
0,"ItemKNN(K=50,normalize_X=True,normalize_sim=Tr...","{'K': 50, 'normalize_X': True, 'normalize_sim'...",0.015098
1,"ItemKNN(K=50,normalize_X=False,normalize_sim=T...","{'K': 50, 'normalize_X': False, 'normalize_sim...",0.022547
2,"ItemKNN(K=60,normalize_X=True,normalize_sim=Tr...","{'K': 60, 'normalize_X': True, 'normalize_sim'...",0.015055
3,"ItemKNN(K=60,normalize_X=False,normalize_sim=T...","{'K': 60, 'normalize_X': False, 'normalize_sim...",0.022636
4,"ItemKNN(K=70,normalize_X=True,normalize_sim=Tr...","{'K': 70, 'normalize_X': True, 'normalize_sim'...",0.014897
...,...,...,...
114,"TARSItemKNN(K=670,fit_decay=0.2,predict_decay=...","{'K': 670, 'fit_decay': 0.2, 'predict_decay': ...",0.024195
115,"TARSItemKNN(K=670,fit_decay=0.2,predict_decay=...","{'K': 670, 'fit_decay': 0.2, 'predict_decay': ...",0.023652
116,"TARSItemKNN(K=670,fit_decay=0.1,predict_decay=...","{'K': 670, 'fit_decay': 0.1, 'predict_decay': ...",0.025925
117,"TARSItemKNN(K=670,fit_decay=0.1,predict_decay=...","{'K': 670, 'fit_decay': 0.1, 'predict_decay': ...",0.025667


In [10]:
# Final metric table: one row per algorithm (its selected configuration) with
# PrecisionK and RecallK at K = 12, 20, 30 and 40.
pipeline.get_metrics()

Unnamed: 0,precisionk_12,precisionk_20,precisionk_30,precisionk_40,recallk_12,recallk_20,recallk_30,recallk_40
"ItemKNN(K=90,normalize_X=False,normalize_sim=True,pop_discount=None,similarity=cosine)",0.007474,0.006124,0.005108,0.004458,0.024356,0.033519,0.040821,0.047116
"TARSItemKNN(K=580,fit_decay=0.1,predict_decay=0.3333333333333333,similarity=cosine)",0.00838,0.00671,0.005677,0.005007,0.028117,0.036982,0.045927,0.053186
