In [2]:
import os
import sys

# Make a local RecTools checkout importable from this notebook.
# The location can be overridden with the RECTOOLS_PATH environment variable;
# the default is the path originally hardcoded in this notebook.
RECTOOLS_PATH = os.environ.get("RECTOOLS_PATH", "/Users/dmtikhonov/git_project/blondered/RecTools")

# The guard keeps the cell idempotent: re-running it does not pile up duplicate sys.path entries.
if RECTOOLS_PATH not in sys.path:
    sys.path.append(RECTOOLS_PATH)

# `TwoStageModel` user guide

In [18]:
from rectools.models import PopularModel, ImplicitItemKNNWrapperModel
from implicit.nearest_neighbours import CosineRecommender
from rectools.models.rerank import TwoStageModel, CandidateGenerator, RerankerBase
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset
from sklearn.linear_model import RidgeClassifier
from pathlib import Path
import pandas as pd
from rectools import Columns

In [7]:
%%time
# Download the KION dataset archive; the URL points at a fixed commit hash of the KION_DATASET repository.
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
# Unpack the archive, overwriting existing files without prompting (-o). This produces the data_original/ directory.
!unzip -o data_original.zip
# Delete the archive once its contents have been extracted.
!rm data_original.zip

Archive:  data_original.zip
   creating: data_original/
  inflating: data_original/interactions.csv  
  inflating: __MACOSX/data_original/._interactions.csv  
  inflating: data_original/users.csv  
  inflating: __MACOSX/data_original/._users.csv  
  inflating: data_original/items.csv  
  inflating: __MACOSX/data_original/._items.csv  
CPU times: user 927 ms, sys: 245 ms, total: 1.17 s
Wall time: 1min 1s


In [9]:
# Prepare dataset: read the KION tables and wrap the interactions in a RecTools `Dataset`.

DATA_PATH = Path("data_original")

# User and item attribute tables. They are loaded here but not referenced by the cells visible in this part of the guide.
users = pd.read_csv(DATA_PATH / "users.csv")
items = pd.read_csv(DATA_PATH / "items.csv")

interactions = (
    pd.read_csv(DATA_PATH / "interactions.csv", parse_dates=["last_watch_dt"])
    # Rename the timestamp column to the name RecTools expects.
    .rename(columns={"last_watch_dt": Columns.Datetime})
    # Every interaction gets the same weight of 1. The column name comes from `Columns`
    # (same convention as the datetime column above) instead of a hardcoded string.
    .assign(**{Columns.Weight: 1})
)
dataset = Dataset.construct(interactions)

In [13]:
# First-stage (candidate generation) models.
# Each model is wrapped in a CandidateGenerator with the same positional settings (30, True, True).
# NOTE(review): presumably 30 is the number of candidates per user and the two flags keep
# ranks and scores as features - confirm against the CandidateGenerator signature.
first_stage = [
    CandidateGenerator(model, 30, True, True)
    for model in (
        PopularModel(),
        ImplicitItemKNNWrapperModel(CosineRecommender()),
    )
]

# Splitter used to select the reranker's train data. Only one fold is expected!
splitter = TimeRangeSplitter("7D")

In [17]:
# Build the TwoStageModel from the first-stage generators, the splitter and a reranker.
# RerankerBase is only a placeholder: it is not meant for the final pipeline,
# the dedicated reranker class simply has not been written yet.
# A negative sampler can be passed as well; this example relies on the default one.

two_stage = TwoStageModel(
    first_stage,
    splitter,
    RerankerBase(RidgeClassifier()),
)

In [20]:
# Steps performed by this call, per the author's notes:
#   1. Split dataset interactions
#   2. Fit first stage models on the history dataset
#   3. Generate recommendations from the first stage -> get candidates for the reranker
#   4. Add targets to all candidates
#   5. Sample negatives (the default PerUserNegativeSampler is used here)
# TODO: we should probably make a public method to get the data before sampling
candidates = two_stage.get_train_with_targets_for_reranker(dataset)

In [21]:
# This is the train data for boosting or any other reranker. The id columns will be dropped before training.
# Preview the first 20 rows.
candidates.head(20)

Unnamed: 0,user_id,item_id,PopularModel_1_score,PopularModel_1_rank,ImplicitItemKNNWrapperModel_1_score,ImplicitItemKNNWrapperModel_1_rank,target
0,16090,12995,21577.0,18.0,,,0
1,708897,13713,,,0.656411,11.0,0
2,269372,512,,,0.104727,22.0,0
3,452915,8636,34148.0,11.0,0.142324,17.0,0
4,410073,7793,15221.0,29.0,,,0
5,924688,4436,16846.0,26.0,,,0
6,203333,11863,16231.0,29.0,0.081245,11.0,1
7,183613,4563,,,0.042168,12.0,0
8,1025300,15830,,,0.176777,7.0,0
9,336554,14431,20276.0,18.0,,,0
