In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.scenarios import Timed
from recpack.pipelines import PipelineBuilder

# Directory holding the input data; the transactions CSV is read from
# f'{DATA_PATH}/transactions_train.csv'. Being a relative path, it resolves
# against the kernel's current working directory.
DATA_PATH = '../../data'

In [2]:
# Load the transaction log. Only the three columns this notebook keeps are read,
# and `t_dat` is parsed to datetime by `read_csv` itself, so no second
# `pd.to_datetime` pass is needed.
df = pd.read_csv(
    f'{DATA_PATH}/transactions_train.csv',
    usecols=['article_id', 'customer_id', 't_dat'],
    parse_dates=['t_dat'],
)

# `ts` = whole seconds since the Unix epoch.
# Dividing the offset from the epoch by a one-second Timedelta gives the same
# result whatever datetime resolution pandas chose for `t_dat`; casting to int64
# and dividing by 10**9 is only correct when the column is nanosecond-resolution.
# int32 keeps the column compact (it can hold epoch seconds up to January 2038).
df['ts'] = (
    (df['t_dat'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
).astype(np.int32)

# Fix the column order used by the rest of the notebook.
df = df[['article_id', 'customer_id', 'ts', 't_dat']]

# Candidate Generation: hyperparameter tuning

In [3]:
# Column roles shared by the preprocessor and by both filters.
id_columns = {'item_ix': 'article_id', 'user_ix': 'customer_id'}

proc = DataFramePreprocessor(timestamp_ix='ts', **id_columns)

# Registration order is kept: MinUsersPerItem(5) first, then MinItemsPerUser(50).
for interaction_filter in (
    MinUsersPerItem(5, **id_columns),
    MinItemsPerUser(50, **id_columns),
):
    proc.add_filter(interaction_filter)

# Run the registered filters over the transactions and build the matrix
# that the scenario split consumes.
interaction_matrix = proc.process(df)

  0%|          | 0/15795409 [00:00<?, ?it/s]

  0%|          | 0/15795409 [00:00<?, ?it/s]

In [4]:
# Both cut-offs are measured back from the most recent transaction date in `df`,
# and converted to POSIX seconds so they are comparable with the `ts` column.
last_transaction_day = df["t_dat"].max()
four_weeks_ago = (last_transaction_day - timedelta(weeks=4)).timestamp()
two_weeks_ago = (last_transaction_day - timedelta(weeks=2)).timestamp()

# Time-based split: `t` is set two weeks before the last date and
# `t_validation` four weeks before it; no window limits on either side.
scenario = Timed(
    t=two_weeks_ago,
    t_validation=four_weeks_ago,
    delta_out=None,
    delta_in=None,
    validation=True,
)
scenario.split(interaction_matrix)

In [5]:
# Values tried for ItemKNN's `K` and `similarity` parameters during optimisation.
item_knn_grid = dict(K=[10, 20, 30, 40], similarity=['cosine'])
# Cut-offs at which CoverageK is reported in the final metrics table
# (passed to the metric, separately from the ItemKNN `K` grid above).
reported_cutoffs = [10, 20, 30, 40]

builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

# Popularity is added without a grid; ItemKNN is tuned over `item_knn_grid`.
builder.add_algorithm('Popularity')
builder.add_algorithm('ItemKNN', grid=item_knn_grid)

# NOTE(review): CoverageK is the only metric used both for selecting the
# ItemKNN setting and for reporting — confirm that optimising coverage
# (rather than an accuracy metric) is what is intended here.
builder.set_optimisation_metric('CoverageK', K=12)
builder.add_metric('CoverageK', K=reported_cutoffs)

In [6]:
# Materialise the pipeline configured on `builder`, then execute it.
# The recorded log below shows one Popularity fit and five ItemKNN fits for a
# four-value grid (presumably one fit per grid value plus a refit of the
# selected setting — TODO confirm against the recpack pipeline docs).
pipeline = builder.build()
pipeline.run()

  0%|          | 0/2 [00:00<?, ?it/s]

2022-11-10 16:12:39,004 - base - recpack - INFO - Fitting Popularity complete - Took 1.92s


  self._set_arrayXarray(i, j, x)


2022-11-10 16:14:13,598 - base - recpack - INFO - Fitting ItemKNN complete - Took 89.2s


  self._set_arrayXarray(i, j, x)


2022-11-10 16:15:28,311 - base - recpack - INFO - Fitting ItemKNN complete - Took 70.6s


  self._set_arrayXarray(i, j, x)


2022-11-10 16:16:45,841 - base - recpack - INFO - Fitting ItemKNN complete - Took 70.4s


  self._set_arrayXarray(i, j, x)


2022-11-10 16:18:13,646 - base - recpack - INFO - Fitting ItemKNN complete - Took 77.7s


  self._set_arrayXarray(i, j, x)


2022-11-10 16:19:35,417 - base - recpack - INFO - Fitting ItemKNN complete - Took 68.5s




In [7]:
# Display the evaluation metrics table produced by the pipeline run
# (one row per algorithm, one column per reported CoverageK cut-off).
pipeline.get_metrics()

Unnamed: 0,coveragek_10,coveragek_20,coveragek_30,coveragek_40
Popularity(K=200),0.000242,0.000385,0.000539,0.000682
"ItemKNN(K=10,normalize_X=False,normalize_sim=False,pop_discount=None,similarity=cosine)",0.248823,0.37676,0.470455,0.541623


In [8]:
# Display the optimisation-metric score recorded for each ItemKNN grid
# configuration that was tried.
pipeline.optimisation_results

Unnamed: 0,identifier,params,coveragek_12
0,"ItemKNN(K=10,normalize_X=False,normalize_sim=F...","{'K': 10, 'similarity': 'cosine'}",0.282317
1,"ItemKNN(K=20,normalize_X=False,normalize_sim=F...","{'K': 20, 'similarity': 'cosine'}",0.271878
2,"ItemKNN(K=30,normalize_X=False,normalize_sim=F...","{'K': 30, 'similarity': 'cosine'}",0.261143
3,"ItemKNN(K=40,normalize_X=False,normalize_sim=F...","{'K': 40, 'similarity': 'cosine'}",0.251694
