In [9]:
import pandas as pd
import numpy as np
from datetime import timedelta
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.scenarios import Timed
from recpack.pipelines import PipelineBuilder
# Directory containing the input data files; a relative path, so it is resolved
# against the notebook's working directory.
DATA_PATH = "../data"

In [10]:
# Load the transaction log and derive `ts`: the purchase date expressed as whole
# seconds since the Unix epoch (used as the timestamp column further down).
df = pd.read_parquet(f'{DATA_PATH}/transactions_train.parquet')
# Floor-dividing the offset from the epoch by one second is independent of the
# resolution the datetimes are stored in. A raw int64 cast followed by // 10**9
# is only correct for nanosecond-resolution data.
epoch_offset = pd.to_datetime(df['t_dat']) - pd.Timestamp('1970-01-01')
df['ts'] = (epoch_offset // pd.Timedelta(seconds=1)).astype(np.int32)

# Look up the most recent week number once; it is needed twice below.
last_week = df.week.max()
test_week = last_week + 1
# Keep only the 10 most recent week numbers and the columns needed afterwards.
df = df[df.week > last_week - 10]
df = df[['article_id', 'customer_id', 'ts', 't_dat']]

# Candidate Generation: hyperparameter tuning

In [11]:
# Column names shared by the preprocessor and by both filters.
id_columns = {'item_ix': 'article_id', 'user_ix': 'customer_id'}

proc = DataFramePreprocessor(timestamp_ix='ts', **id_columns)
# Register both minimum-count filters with a threshold of 10,
# item-side filter first, then the user-side filter.
for filter_cls in (MinUsersPerItem, MinItemsPerUser):
    proc.add_filter(filter_cls(10, **id_columns))

# Run the configured preprocessing over the transactions frame.
interaction_matrix = proc.process(df)

  0%|          | 0/420382 [00:00<?, ?it/s]

  0%|          | 0/420382 [00:00<?, ?it/s]

  0%|          | 0/420382 [00:00<?, ?it/s]

  0%|          | 0/420382 [00:00<?, ?it/s]

In [12]:
# Split points as POSIX timestamps, counted back from the latest transaction date.
last_day = df["t_dat"].max()
two_weeks_ago, four_weeks_ago = (
    (last_day - timedelta(weeks=n_weeks)).timestamp() for n_weeks in (2, 4)
)

# Timed scenario with validation enabled: `t` is the later boundary and
# `t_validation` the earlier one.
scenario = Timed(
    t=two_weeks_ago,
    t_validation=four_weeks_ago,
    delta_out=None,
    delta_in=None,
    validation=True,
)
scenario.split(interaction_matrix)

In [13]:
# Configure the experiment on top of the already-split scenario.
builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

# Popularity is added without a grid; KUNN gets the same candidate values
# for both of its grid parameters.
kunn_grid = {param: [10, 20, 30, 40] for param in ('Ku', 'Ki')}
builder.add_algorithm('Popularity')
builder.add_algorithm('KUNN', grid=kunn_grid)

# CoverageK at K=12 is the optimisation metric; CoverageK at several
# cut-offs is added as a reported metric.
coverage_cutoffs = [10, 20, 30, 40]
builder.set_optimisation_metric('CoverageK', K=12)
builder.add_metric('CoverageK', K=coverage_cutoffs)

In [None]:
# Assemble the pipeline from the builder configuration, then execute it.
pipeline = builder.build()
pipeline.run()

In [15]:
# Display the metric table gathered by the pipeline run (one row per reported algorithm).
pipeline.get_metrics()

Unnamed: 0,coveragek_10,coveragek_20,coveragek_30,coveragek_40
Popularity(K=200),0.001141,0.001879,0.002684,0.003422
"KUNN(Ki=10,Ku=10)",0.792511,0.916588,0.943632,0.950409


Unnamed: 0,coveragek_10,coveragek_20,coveragek_30,coveragek_40
Popularity(K=200),0.001141,0.001879,0.002684,0.003422
"KUNN(Ki=10,Ku=10)",0.792511,0.916588,0.943632,0.950409


In [16]:
# Display the optimisation-metric score recorded for each KUNN hyperparameter combination.
pipeline.optimisation_results

Unnamed: 0,identifier,params,coveragek_12
0,"KUNN(Ki=10,Ku=10)","{'Ki': 10, 'Ku': 10}",0.800899
1,"KUNN(Ki=10,Ku=20)","{'Ki': 10, 'Ku': 20}",0.785264
2,"KUNN(Ki=10,Ku=30)","{'Ki': 10, 'Ku': 30}",0.772916
3,"KUNN(Ki=10,Ku=40)","{'Ki': 10, 'Ku': 40}",0.765736
4,"KUNN(Ki=20,Ku=10)","{'Ki': 20, 'Ku': 10}",0.782647
5,"KUNN(Ki=20,Ku=20)","{'Ki': 20, 'Ku': 20}",0.755335
6,"KUNN(Ki=20,Ku=30)","{'Ki': 20, 'Ku': 30}",0.745001
7,"KUNN(Ki=20,Ku=40)","{'Ki': 20, 'Ku': 40}",0.737686
8,"KUNN(Ki=30,Ku=10)","{'Ki': 30, 'Ku': 10}",0.777949
9,"KUNN(Ki=30,Ku=20)","{'Ki': 30, 'Ku': 20}",0.747282


Unnamed: 0,identifier,params,coveragek_12
0,"KUNN(Ki=10,Ku=10)","{'Ki': 10, 'Ku': 10}",0.800899
1,"KUNN(Ki=10,Ku=20)","{'Ki': 10, 'Ku': 20}",0.785264
2,"KUNN(Ki=10,Ku=30)","{'Ki': 10, 'Ku': 30}",0.772916
3,"KUNN(Ki=10,Ku=40)","{'Ki': 10, 'Ku': 40}",0.765736
4,"KUNN(Ki=20,Ku=10)","{'Ki': 20, 'Ku': 10}",0.782647
5,"KUNN(Ki=20,Ku=20)","{'Ki': 20, 'Ku': 20}",0.755335
6,"KUNN(Ki=20,Ku=30)","{'Ki': 20, 'Ku': 30}",0.745001
7,"KUNN(Ki=20,Ku=40)","{'Ki': 20, 'Ku': 40}",0.737686
8,"KUNN(Ki=30,Ku=10)","{'Ki': 30, 'Ku': 10}",0.777949
9,"KUNN(Ki=30,Ku=20)","{'Ki': 30, 'Ku': 20}",0.747282
