In [1]:
import sys
# NOTE(review): hard-coded absolute path to one developer's local RecTools checkout.
# Point it at your own checkout (or install rectools into the kernel's environment) before running.
sys.path.append("/home/oirvach1/RecTools")

# `TwoStageModel` user guide

In [2]:
from rectools.models import PopularModel, ImplicitItemKNNWrapperModel
from implicit.nearest_neighbours import CosineRecommender
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset
from sklearn.linear_model import RidgeClassifier
from pathlib import Path
import pandas as pd
import numpy as np
from rectools import Columns
from rectools.models.rerank import TwoStageModel, CandidateGenerator, RerankerBase, CatBoostRerankerWrapper
from lightgbm import LGBMClassifier, LGBMRanker
from catboost import CatBoostClassifier, CatBoostRanker
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
%%time
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
!unzip -o data_original.zip
!rm data_original.zip

Archive:  data_original.zip
   creating: data_original/
  inflating: data_original/interactions.csv  
  inflating: __MACOSX/data_original/._interactions.csv  
  inflating: data_original/users.csv  
  inflating: __MACOSX/data_original/._users.csv  
  inflating: data_original/items.csv  
  inflating: __MACOSX/data_original/._items.csv  
CPU times: user 425 ms, sys: 161 ms, total: 586 ms
Wall time: 36 s


In [4]:
# Load the raw tables and build the interactions dataset

DATA_PATH = Path("data_original")
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')

# Parse the watch date while reading, then expose it under the RecTools datetime column name
interactions = pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
interactions = interactions.rename(columns={"last_watch_dt": Columns.Datetime})
# Every interaction gets the same unit weight
interactions["weight"] = 1
dataset = Dataset.construct(interactions)

In [5]:
# Prepare first stage models
# NOTE(review): the positional arguments are presumably (model, num_candidates, keep_ranks, keep_scores),
# matching the keyword form used in later sections of this guide — confirm against the CandidateGenerator signature
first_stage = [
    CandidateGenerator(PopularModel(), 30, True, True), 
    CandidateGenerator(ImplicitItemKNNWrapperModel(CosineRecommender()), 30, True, True)
]

# Prepare splitter for selecting reranker train. Only one fold is expected!
splitter = TimeRangeSplitter("7D")

In [6]:
# Initialize TwoStageModel
# RerankerBase is only a placeholder here: it is not meant for the final pipeline, the dedicated reranker class simply has not been written yet
# We could also pass a negative sampler, but here we just use the default one

two_stage = TwoStageModel(first_stage, splitter, RerankerBase(RidgeClassifier()))

In [7]:
# Split dataset interactions
# Fit first stage models on history dataset
# Generate recommendations from first stage -> Get candidates for reranker
# Add targets to all candidates
# Sample negatives (here the default PerUserNegativeSampler is used) (we should probably make a public method to get the data before sampling)
candidates = two_stage.get_train_with_targets_for_reranker(dataset)

In [8]:
# This is train data for boosting or any other reranker. id columns will be dropped before training
candidates.head(20)

Unnamed: 0,user_id,item_id,PopularModel_1_score,PopularModel_1_rank,ImplicitItemKNNWrapperModel_1_score,ImplicitItemKNNWrapperModel_1_rank,target
0,870705,8636,34148.0,8.0,0.8683,7.0,0
1,461643,14526,,,0.063427,24.0,0
2,457329,1517,,,0.149156,24.0,0
3,709024,14431,20276.0,12.0,,,1
4,337461,7417,17346.0,20.0,,,0
5,294808,142,42877.0,9.0,,,0
6,359631,3784,,,0.245184,24.0,0
7,1004603,6636,,,0.591062,14.0,0
8,653465,6809,39498.0,10.0,,,1
9,644766,12173,,,0.108739,26.0,0


## What if we want to easily add features to candidates?

### Variant 1: from external source
Other options are:
- Get features from dataset
- Get time-based features using `fold_info` from the splitter
- Combine any of the above

In [9]:
from rectools.models.rerank import CandidatesFeatureCollectorBase
import typing as tp
from rectools.models.base import AnyIds

# Write custom feature collecting funcs for users, items and user/item pairs
class CustomFeatureCollector(CandidatesFeatureCollectorBase):
    """Feature collector that takes user features from the external `users.csv` file."""

    def _get_user_features(
        self, users: AnyIds, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]], external_ids: bool
    ) -> pd.DataFrame:
        """Return the rows of `users.csv` that belong to the requested `users`.

        The file is read from `DATA_PATH` on every call. `dataset`, `fold_info`
        and `external_ids` are not used by this implementation.
        """
        all_user_features = pd.read_csv(DATA_PATH / 'users.csv')
        is_requested = all_user_features[Columns.User].isin(users)
        return all_user_features[is_requested]

In [10]:
# Now we specify our custom feature collector for TwoStageModel

two_stage = TwoStageModel(first_stage, splitter, RerankerBase(RidgeClassifier()), feature_collector=CustomFeatureCollector())

In [11]:
candidates = two_stage.get_train_with_targets_for_reranker(dataset)

In [12]:
# Now our candidates also have features for users
candidates.head(20)

Unnamed: 0,user_id,item_id,PopularModel_1_score,PopularModel_1_rank,ImplicitItemKNNWrapperModel_1_score,ImplicitItemKNNWrapperModel_1_rank,target,age,income,sex,kids_flg
0,979121,7107,16279.0,26.0,,,0,age_65_inf,income_20_40,М,0.0
1,58982,2657,66415.0,5.0,0.394153,7.0,0,age_35_44,income_20_40,М,1.0
2,659982,12995,21577.0,18.0,,,0,age_18_24,income_40_60,М,0.0
3,244104,4740,33831.0,12.0,,,0,age_25_34,income_20_40,М,0.0
4,781714,12995,21577.0,16.0,1.045084,7.0,0,age_35_44,income_20_40,Ж,1.0
5,100203,14120,,,0.084311,27.0,0,age_45_54,income_20_40,М,1.0
6,898376,16166,,,0.219737,6.0,0,age_35_44,income_20_40,Ж,0.0
7,551827,15531,,,0.312845,12.0,0,age_55_64,income_40_60,М,0.0
8,811434,6809,39498.0,9.0,,,0,age_18_24,income_20_40,М,1.0
9,591233,1844,24009.0,15.0,,,0,,,,


## GradientBoostingClassifier guide

In [13]:
# Load the raw tables and build the interactions dataset

DATA_PATH = Path("data_original")
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')

# Parse the watch date while reading, then expose it under the RecTools datetime column name
interactions = pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
interactions = interactions.rename(columns={"last_watch_dt": Columns.Datetime})
# Every interaction gets the same unit weight
interactions["weight"] = 1
dataset = Dataset.construct(interactions)

In [14]:
# Prepare first stage models
# NOTE(review): the fill values below presumably replace scores/ranks for pairs that a generator did not produce.
# 1.01 is described as a "max score", yet the outputs in this guide show PopularModel scores in the
# tens of thousands and KNN scores above 1.01 — confirm that these are the intended fill values.
first_stage = [
    CandidateGenerator(
        model=PopularModel(),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1.01, # when working with the GradientBoostingClassifier, empty scores must be filled in
        ranks_fillna_value=31  # when working with the GradientBoostingClassifier, empty ranks must be filled in (31 = one past num_candidates)
    ), 
    CandidateGenerator(
        model=ImplicitItemKNNWrapperModel(CosineRecommender()),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1.01, # when working with the GradientBoostingClassifier, empty scores must be filled in
        ranks_fillna_value=31  # when working with the GradientBoostingClassifier, empty ranks must be filled in (31 = one past num_candidates)
    )
]

# Prepare splitter for selecting reranker train. Only one fold is expected!
splitter = TimeRangeSplitter("7D")

In [15]:
two_stage = TwoStageModel(first_stage,
                          splitter,
                          RerankerBase(GradientBoostingClassifier())
                         )

In [16]:
two_stage.fit(dataset)

<rectools.models.rerank.TwoStageModel at 0x7f19244538b0>

In [17]:
reco = two_stage.recommend(
                    users=dataset.user_id_map.external_ids, 
                    dataset=dataset,
                    k=10,
                    filter_viewed=True
                )

In [18]:
reco

Unnamed: 0,user_id,item_id,score,rank
13958790,1097557,10440,0.616494,1
13958792,1097557,13865,0.505172,2
13958791,1097557,9728,0.471941,3
13958793,1097557,3734,0.356907,4
13958794,1097557,2657,0.289822,5
...,...,...,...,...
18576,0,142,0.227925,6
18575,0,4880,0.221551,7
18578,0,9996,0.208693,8
18597,0,12173,0.194356,9


In [18]:
# TODO: CrossValidate

## CatBoostClassifier guide

To successfully launch CatBoostClassifier at the ranking stage, it is necessary to **process categorical features**: fill in empty values.

In [19]:
# Load the raw tables

DATA_PATH = Path("data_original")
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')

# Parse the watch date while reading, then expose it under the RecTools datetime column name
interactions = pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
interactions = interactions.rename(columns={"last_watch_dt": Columns.Datetime})
# Every interaction gets the same unit weight
interactions["weight"] = 1

In [20]:
# Any helper functions of your own for working with the loaded data
def encode_and_clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Encode object columns via `encode_cat_cols`, then fill remaining NaNs with column medians.

    Only columns that still contain NaN after encoding are touched; each of
    them gets its own median as the fill value.
    """
    encoded = encode_cat_cols(df)
    has_nan = encoded.isna().any()
    nan_columns = list(encoded.columns[has_nan])
    medians = encoded[nan_columns].median()
    encoded[nan_columns] = encoded[nan_columns].fillna(medians)
    return encoded

def encode_cat_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Replace every object-dtype column with its integer category codes.

    Each object column is converted to a pandas categorical and substituted by
    its codes, stored again as a categorical column. Missing values receive
    the code -1, as pandas assigns for `Categorical.codes`.

    The input frame is not modified; an encoded copy is returned.
    """
    # Work on a copy so that the caller's frame keeps its original values
    encoded = df.copy()
    for col in encoded.select_dtypes(include=['object']).columns:
        encoded[col] = encoded[col].astype('category').cat.codes.astype('category')
    return encoded

In [21]:
# coding categorical data and handling missing values
users = encode_and_clean_data(users)

In [22]:
# we check if all users who have interactions have a feature description (users)
interactions[Columns.User].nunique(), users[Columns.User].nunique()

(962179, 840197)

In [23]:
# Keep only the interactions of users that also have a row in the user feature table.
# Note that `interactions` is overwritten with the filtered frame.
user_ids_with_feature = np.intersect1d(interactions[Columns.User].unique(), users[Columns.User].unique())
interactions = interactions.query(f"{Columns.User} in @user_ids_with_feature")

In [24]:
dataset = Dataset.construct(interactions)

In [25]:
# Prepare first stage models
first_stage = [
    CandidateGenerator(
        model=PopularModel(),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
    ), 
    CandidateGenerator(
        model=ImplicitItemKNNWrapperModel(CosineRecommender()),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
    )
]

# Prepare splitter for selecting reranker train. Only one fold is expected!
splitter = TimeRangeSplitter("7D")

In [26]:
# Write custom feature collecting funcs for users, items and user/item pairs
class CustomFeatureCollectorCatBoost(CandidatesFeatureCollectorBase):
    """Feature collector that serves encoded, NaN-filled user features from `users.csv`."""

    def _get_user_features(
        self, users: AnyIds, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]], external_ids: bool
    ) -> pd.DataFrame:
        """Return encoded feature rows for the requested `users`.

        The whole `users.csv` table is read from `DATA_PATH` and passed through
        `encode_and_clean_data` before the requested rows are selected.
        `dataset`, `fold_info` and `external_ids` are not used by this implementation.
        """
        encoded_features = encode_and_clean_data(pd.read_csv(DATA_PATH / 'users.csv'))
        is_requested = encoded_features[Columns.User].isin(users)
        return encoded_features[is_requested]

In [27]:
# Now we specify our custom feature collector for TwoStageModel
# To pass CatBoostClassifier we use CatBoostRerankerWrapper (for faster work with large amounts of data)

two_stage = TwoStageModel(first_stage,
                          splitter,
                          CatBoostRerankerWrapper(CatBoostClassifier()),
                          feature_collector=CustomFeatureCollectorCatBoost())

In [28]:
candidates = two_stage.get_train_with_targets_for_reranker(dataset)

In [29]:
# Now our candidates also have features for users
# CatBoostClassifier can work with empty values of numeric data
candidates.head(20)

Unnamed: 0,user_id,item_id,PopularModel_1_score,PopularModel_1_rank,ImplicitItemKNNWrapperModel_1_score,ImplicitItemKNNWrapperModel_1_rank,target,age,income,sex,kids_flg
0,1048737,16228,11855.0,21.0,0.210147,29.0,0,-1,-1,-1,0
1,574743,14703,14455.0,21.0,,,0,2,3,1,0
2,681048,11778,,,0.16638,12.0,0,5,2,0,0
3,936296,10440,125533.0,1.0,0.62735,2.0,1,2,3,1,0
4,390427,1844,20398.0,15.0,,,0,4,3,1,0
5,155509,3734,56265.0,4.0,0.53171,4.0,0,2,3,0,1
6,1022018,11502,,,0.120386,15.0,0,3,5,1,0
7,756483,15806,,,0.075165,28.0,0,1,2,1,1
8,763097,7107,14110.0,28.0,,,0,1,2,0,0
9,632618,142,32749.0,7.0,0.610799,7.0,0,2,2,0,0


In [30]:
cat_cols = ['age', 'income', 'sex']

# example parameters for running model training 
# more valid parameters here https://catboost.ai/en/docs/concepts/python-reference_pool
fit_params = {
    'cat_features': cat_cols,
}

In [31]:
two_stage.fit(dataset, **fit_params)

Learning rate set to 0.122961
0:	learn: 0.6228306	total: 138ms	remaining: 2m 17s
1:	learn: 0.5715496	total: 202ms	remaining: 1m 40s
2:	learn: 0.5343433	total: 261ms	remaining: 1m 26s
3:	learn: 0.5066188	total: 313ms	remaining: 1m 18s
4:	learn: 0.4860988	total: 366ms	remaining: 1m 12s
5:	learn: 0.4707388	total: 414ms	remaining: 1m 8s
6:	learn: 0.4587273	total: 463ms	remaining: 1m 5s
7:	learn: 0.4502151	total: 506ms	remaining: 1m 2s
8:	learn: 0.4435678	total: 556ms	remaining: 1m 1s
9:	learn: 0.4385905	total: 603ms	remaining: 59.7s
10:	learn: 0.4348583	total: 655ms	remaining: 58.9s
11:	learn: 0.4309688	total: 702ms	remaining: 57.8s
12:	learn: 0.4280513	total: 748ms	remaining: 56.8s
13:	learn: 0.4255529	total: 793ms	remaining: 55.9s
14:	learn: 0.4237005	total: 844ms	remaining: 55.4s
15:	learn: 0.4216868	total: 890ms	remaining: 54.7s
16:	learn: 0.4202455	total: 938ms	remaining: 54.2s
17:	learn: 0.4193165	total: 988ms	remaining: 53.9s
18:	learn: 0.4187122	total: 1.04s	remaining: 53.5s
19:	le

161:	learn: 0.4057390	total: 13.3s	remaining: 1m 8s
162:	learn: 0.4056988	total: 13.4s	remaining: 1m 8s
163:	learn: 0.4056853	total: 13.4s	remaining: 1m 8s
164:	learn: 0.4056760	total: 13.4s	remaining: 1m 8s
165:	learn: 0.4056583	total: 13.5s	remaining: 1m 7s
166:	learn: 0.4056133	total: 13.5s	remaining: 1m 7s
167:	learn: 0.4055993	total: 13.6s	remaining: 1m 7s
168:	learn: 0.4055774	total: 13.6s	remaining: 1m 7s
169:	learn: 0.4055563	total: 13.7s	remaining: 1m 6s
170:	learn: 0.4055460	total: 13.7s	remaining: 1m 6s
171:	learn: 0.4055233	total: 13.8s	remaining: 1m 6s
172:	learn: 0.4055165	total: 13.8s	remaining: 1m 6s
173:	learn: 0.4055012	total: 13.9s	remaining: 1m 5s
174:	learn: 0.4054941	total: 13.9s	remaining: 1m 5s
175:	learn: 0.4054487	total: 14s	remaining: 1m 5s
176:	learn: 0.4054157	total: 14s	remaining: 1m 5s
177:	learn: 0.4054015	total: 14.1s	remaining: 1m 4s
178:	learn: 0.4053727	total: 14.1s	remaining: 1m 4s
179:	learn: 0.4053459	total: 14.2s	remaining: 1m 4s
180:	learn: 0.40

323:	learn: 0.4026987	total: 21.1s	remaining: 44s
324:	learn: 0.4026838	total: 21.1s	remaining: 43.9s
325:	learn: 0.4026774	total: 21.2s	remaining: 43.8s
326:	learn: 0.4026709	total: 21.2s	remaining: 43.7s
327:	learn: 0.4026532	total: 21.3s	remaining: 43.6s
328:	learn: 0.4026317	total: 21.3s	remaining: 43.5s
329:	learn: 0.4026159	total: 21.4s	remaining: 43.4s
330:	learn: 0.4026120	total: 21.4s	remaining: 43.3s
331:	learn: 0.4025904	total: 21.5s	remaining: 43.2s
332:	learn: 0.4025839	total: 21.5s	remaining: 43.1s
333:	learn: 0.4025646	total: 21.6s	remaining: 43s
334:	learn: 0.4025529	total: 21.6s	remaining: 42.9s
335:	learn: 0.4025291	total: 21.7s	remaining: 42.8s
336:	learn: 0.4025184	total: 21.7s	remaining: 42.7s
337:	learn: 0.4025002	total: 21.8s	remaining: 42.6s
338:	learn: 0.4024951	total: 21.8s	remaining: 42.5s
339:	learn: 0.4024851	total: 21.9s	remaining: 42.5s
340:	learn: 0.4024703	total: 21.9s	remaining: 42.4s
341:	learn: 0.4024453	total: 22s	remaining: 42.2s
342:	learn: 0.4024

482:	learn: 0.4007723	total: 28.5s	remaining: 30.5s
483:	learn: 0.4007539	total: 28.5s	remaining: 30.4s
484:	learn: 0.4007484	total: 28.5s	remaining: 30.3s
485:	learn: 0.4007447	total: 28.6s	remaining: 30.2s
486:	learn: 0.4007260	total: 28.6s	remaining: 30.2s
487:	learn: 0.4007208	total: 28.7s	remaining: 30.1s
488:	learn: 0.4007131	total: 28.7s	remaining: 30s
489:	learn: 0.4006925	total: 28.8s	remaining: 29.9s
490:	learn: 0.4006740	total: 28.8s	remaining: 29.9s
491:	learn: 0.4006670	total: 28.8s	remaining: 29.8s
492:	learn: 0.4006595	total: 28.9s	remaining: 29.7s
493:	learn: 0.4006490	total: 28.9s	remaining: 29.7s
494:	learn: 0.4006344	total: 29s	remaining: 29.6s
495:	learn: 0.4006270	total: 29s	remaining: 29.5s
496:	learn: 0.4006267	total: 29.1s	remaining: 29.4s
497:	learn: 0.4006129	total: 29.1s	remaining: 29.3s
498:	learn: 0.4006085	total: 29.1s	remaining: 29.3s
499:	learn: 0.4005925	total: 29.2s	remaining: 29.2s
500:	learn: 0.4005841	total: 29.2s	remaining: 29.1s
501:	learn: 0.4005

645:	learn: 0.3990254	total: 36.1s	remaining: 19.8s
646:	learn: 0.3990098	total: 36.1s	remaining: 19.7s
647:	learn: 0.3989956	total: 36.2s	remaining: 19.7s
648:	learn: 0.3989901	total: 36.2s	remaining: 19.6s
649:	learn: 0.3989808	total: 36.3s	remaining: 19.5s
650:	learn: 0.3989791	total: 36.3s	remaining: 19.5s
651:	learn: 0.3989583	total: 36.4s	remaining: 19.4s
652:	learn: 0.3989508	total: 36.4s	remaining: 19.4s
653:	learn: 0.3989452	total: 36.5s	remaining: 19.3s
654:	learn: 0.3989279	total: 36.5s	remaining: 19.2s
655:	learn: 0.3989267	total: 36.6s	remaining: 19.2s
656:	learn: 0.3989161	total: 36.6s	remaining: 19.1s
657:	learn: 0.3989006	total: 36.6s	remaining: 19s
658:	learn: 0.3988966	total: 36.7s	remaining: 19s
659:	learn: 0.3988828	total: 36.7s	remaining: 18.9s
660:	learn: 0.3988733	total: 36.8s	remaining: 18.9s
661:	learn: 0.3988628	total: 36.8s	remaining: 18.8s
662:	learn: 0.3988451	total: 36.9s	remaining: 18.7s
663:	learn: 0.3988297	total: 36.9s	remaining: 18.7s
664:	learn: 0.39

807:	learn: 0.3976862	total: 43.5s	remaining: 10.3s
808:	learn: 0.3976827	total: 43.5s	remaining: 10.3s
809:	learn: 0.3976679	total: 43.6s	remaining: 10.2s
810:	learn: 0.3976612	total: 43.6s	remaining: 10.2s
811:	learn: 0.3976569	total: 43.7s	remaining: 10.1s
812:	learn: 0.3976481	total: 43.7s	remaining: 10.1s
813:	learn: 0.3976463	total: 43.8s	remaining: 10s
814:	learn: 0.3976409	total: 43.8s	remaining: 9.94s
815:	learn: 0.3976375	total: 43.9s	remaining: 9.89s
816:	learn: 0.3976330	total: 43.9s	remaining: 9.83s
817:	learn: 0.3976227	total: 43.9s	remaining: 9.78s
818:	learn: 0.3976155	total: 44s	remaining: 9.72s
819:	learn: 0.3976090	total: 44s	remaining: 9.67s
820:	learn: 0.3975984	total: 44.1s	remaining: 9.61s
821:	learn: 0.3975922	total: 44.1s	remaining: 9.55s
822:	learn: 0.3975868	total: 44.2s	remaining: 9.5s
823:	learn: 0.3975786	total: 44.2s	remaining: 9.45s
824:	learn: 0.3975701	total: 44.3s	remaining: 9.39s
825:	learn: 0.3975543	total: 44.3s	remaining: 9.33s
826:	learn: 0.39755

968:	learn: 0.3965205	total: 50.7s	remaining: 1.62s
969:	learn: 0.3965049	total: 50.8s	remaining: 1.57s
970:	learn: 0.3964911	total: 50.8s	remaining: 1.52s
971:	learn: 0.3964858	total: 50.9s	remaining: 1.46s
972:	learn: 0.3964768	total: 50.9s	remaining: 1.41s
973:	learn: 0.3964749	total: 50.9s	remaining: 1.36s
974:	learn: 0.3964694	total: 51s	remaining: 1.31s
975:	learn: 0.3964517	total: 51.1s	remaining: 1.25s
976:	learn: 0.3964385	total: 51.1s	remaining: 1.2s
977:	learn: 0.3964366	total: 51.1s	remaining: 1.15s
978:	learn: 0.3964354	total: 51.2s	remaining: 1.1s
979:	learn: 0.3964272	total: 51.2s	remaining: 1.04s
980:	learn: 0.3964202	total: 51.3s	remaining: 993ms
981:	learn: 0.3964062	total: 51.3s	remaining: 941ms
982:	learn: 0.3963947	total: 51.4s	remaining: 888ms
983:	learn: 0.3963897	total: 51.4s	remaining: 836ms
984:	learn: 0.3963789	total: 51.5s	remaining: 784ms
985:	learn: 0.3963774	total: 51.5s	remaining: 731ms
986:	learn: 0.3963670	total: 51.5s	remaining: 679ms
987:	learn: 0.39

<rectools.models.rerank.TwoStageModel at 0x7f15e45f9cd0>

In [32]:
reco = two_stage.recommend(
                    users=dataset.user_id_map.external_ids, 
                    dataset=dataset,
                    k=10,
                    filter_viewed=True
                )

In [33]:
reco

Unnamed: 0,user_id,item_id,score,rank
11088450,1097557,10440,0.565327,1
11088451,1097557,9728,0.493843,2
11088452,1097557,13865,0.463521,3
11088479,1097557,16228,0.396493,4
11088453,1097557,3734,0.369894,5
...,...,...,...,...
15152,0,4151,0.279995,6
15168,0,7829,0.239109,7
15156,0,142,0.217433,8
15174,0,14703,0.215547,9


In [19]:
# TODO: CrossValidate

## CatBoostRanker guide

To successfully launch CatBoostRanker at the ranking stage, it is necessary to **process categorical features**: fill in empty values.

In [30]:
# Now we specify our custom feature collector for TwoStageModel
# To pass CatBoostRanker we use CatBoostRerankerWrapper (for faster work with large amounts of data)

two_stage = TwoStageModel(first_stage,
                          splitter,
                          CatBoostRerankerWrapper(), # CatBoostRanker is initialized by default
                          feature_collector=CustomFeatureCollectorCatBoost())

In [31]:
cat_cols = ['age', 'income', 'sex']

# example parameters for running model training 
# more valid parameters here https://catboost.ai/en/docs/concepts/python-reference_pool
fit_params = {
    'cat_features': cat_cols,
}

In [32]:
two_stage.fit(dataset, **fit_params)

Groupwise loss function. OneHotMaxSize set to 10
0:	total: 163ms	remaining: 2m 43s
1:	total: 266ms	remaining: 2m 12s
2:	total: 364ms	remaining: 2m 1s
3:	total: 463ms	remaining: 1m 55s
4:	total: 564ms	remaining: 1m 52s
5:	total: 663ms	remaining: 1m 49s
6:	total: 760ms	remaining: 1m 47s
7:	total: 858ms	remaining: 1m 46s
8:	total: 955ms	remaining: 1m 45s
9:	total: 1.05s	remaining: 1m 44s
10:	total: 1.15s	remaining: 1m 43s
11:	total: 1.25s	remaining: 1m 42s
12:	total: 1.34s	remaining: 1m 41s
13:	total: 1.44s	remaining: 1m 41s
14:	total: 1.53s	remaining: 1m 40s
15:	total: 1.65s	remaining: 1m 41s
16:	total: 1.75s	remaining: 1m 40s
17:	total: 1.85s	remaining: 1m 40s
18:	total: 1.95s	remaining: 1m 40s
19:	total: 2.04s	remaining: 1m 40s
20:	total: 2.15s	remaining: 1m 40s
21:	total: 2.24s	remaining: 1m 39s
22:	total: 2.34s	remaining: 1m 39s
23:	total: 2.44s	remaining: 1m 39s
24:	total: 2.53s	remaining: 1m 38s
25:	total: 2.63s	remaining: 1m 38s
26:	total: 2.73s	remaining: 1m 38s
27:	total: 2.82s	

232:	total: 23.1s	remaining: 1m 15s
233:	total: 23.2s	remaining: 1m 15s
234:	total: 23.3s	remaining: 1m 15s
235:	total: 23.4s	remaining: 1m 15s
236:	total: 23.5s	remaining: 1m 15s
237:	total: 23.6s	remaining: 1m 15s
238:	total: 23.7s	remaining: 1m 15s
239:	total: 23.8s	remaining: 1m 15s
240:	total: 23.9s	remaining: 1m 15s
241:	total: 24s	remaining: 1m 15s
242:	total: 24.1s	remaining: 1m 14s
243:	total: 24.2s	remaining: 1m 14s
244:	total: 24.3s	remaining: 1m 14s
245:	total: 24.4s	remaining: 1m 14s
246:	total: 24.5s	remaining: 1m 14s
247:	total: 24.6s	remaining: 1m 14s
248:	total: 24.7s	remaining: 1m 14s
249:	total: 24.8s	remaining: 1m 14s
250:	total: 24.9s	remaining: 1m 14s
251:	total: 25s	remaining: 1m 14s
252:	total: 25.1s	remaining: 1m 13s
253:	total: 25.2s	remaining: 1m 13s
254:	total: 25.3s	remaining: 1m 13s
255:	total: 25.3s	remaining: 1m 13s
256:	total: 25.4s	remaining: 1m 13s
257:	total: 25.5s	remaining: 1m 13s
258:	total: 25.6s	remaining: 1m 13s
259:	total: 25.7s	remaining: 1m 

467:	total: 47.2s	remaining: 53.7s
468:	total: 47.3s	remaining: 53.6s
469:	total: 47.4s	remaining: 53.5s
470:	total: 47.5s	remaining: 53.4s
471:	total: 47.6s	remaining: 53.3s
472:	total: 47.7s	remaining: 53.2s
473:	total: 47.8s	remaining: 53.1s
474:	total: 47.9s	remaining: 53s
475:	total: 48.1s	remaining: 52.9s
476:	total: 48.2s	remaining: 52.8s
477:	total: 48.3s	remaining: 52.7s
478:	total: 48.4s	remaining: 52.6s
479:	total: 48.5s	remaining: 52.5s
480:	total: 48.6s	remaining: 52.4s
481:	total: 48.7s	remaining: 52.3s
482:	total: 48.8s	remaining: 52.2s
483:	total: 48.9s	remaining: 52.1s
484:	total: 49s	remaining: 52s
485:	total: 49.1s	remaining: 51.9s
486:	total: 49.2s	remaining: 51.8s
487:	total: 49.3s	remaining: 51.7s
488:	total: 49.4s	remaining: 51.6s
489:	total: 49.5s	remaining: 51.5s
490:	total: 49.6s	remaining: 51.4s
491:	total: 49.7s	remaining: 51.3s
492:	total: 49.8s	remaining: 51.2s
493:	total: 49.9s	remaining: 51.1s
494:	total: 50s	remaining: 51s
495:	total: 50.1s	remaining: 5

704:	total: 1m 12s	remaining: 30.5s
705:	total: 1m 13s	remaining: 30.4s
706:	total: 1m 13s	remaining: 30.3s
707:	total: 1m 13s	remaining: 30.2s
708:	total: 1m 13s	remaining: 30.1s
709:	total: 1m 13s	remaining: 30s
710:	total: 1m 13s	remaining: 29.9s
711:	total: 1m 13s	remaining: 29.8s
712:	total: 1m 13s	remaining: 29.7s
713:	total: 1m 13s	remaining: 29.6s
714:	total: 1m 14s	remaining: 29.5s
715:	total: 1m 14s	remaining: 29.4s
716:	total: 1m 14s	remaining: 29.3s
717:	total: 1m 14s	remaining: 29.2s
718:	total: 1m 14s	remaining: 29.1s
719:	total: 1m 14s	remaining: 29s
720:	total: 1m 14s	remaining: 28.9s
721:	total: 1m 14s	remaining: 28.8s
722:	total: 1m 14s	remaining: 28.7s
723:	total: 1m 15s	remaining: 28.6s
724:	total: 1m 15s	remaining: 28.5s
725:	total: 1m 15s	remaining: 28.4s
726:	total: 1m 15s	remaining: 28.3s
727:	total: 1m 15s	remaining: 28.2s
728:	total: 1m 15s	remaining: 28.1s
729:	total: 1m 15s	remaining: 28s
730:	total: 1m 15s	remaining: 27.9s
731:	total: 1m 15s	remaining: 27.8

934:	total: 1m 38s	remaining: 6.87s
935:	total: 1m 38s	remaining: 6.76s
936:	total: 1m 39s	remaining: 6.66s
937:	total: 1m 39s	remaining: 6.55s
938:	total: 1m 39s	remaining: 6.45s
939:	total: 1m 39s	remaining: 6.34s
940:	total: 1m 39s	remaining: 6.24s
941:	total: 1m 39s	remaining: 6.13s
942:	total: 1m 39s	remaining: 6.03s
943:	total: 1m 39s	remaining: 5.92s
944:	total: 1m 39s	remaining: 5.82s
945:	total: 1m 40s	remaining: 5.71s
946:	total: 1m 40s	remaining: 5.6s
947:	total: 1m 40s	remaining: 5.5s
948:	total: 1m 40s	remaining: 5.39s
949:	total: 1m 40s	remaining: 5.29s
950:	total: 1m 40s	remaining: 5.18s
951:	total: 1m 40s	remaining: 5.08s
952:	total: 1m 40s	remaining: 4.97s
953:	total: 1m 40s	remaining: 4.87s
954:	total: 1m 41s	remaining: 4.76s
955:	total: 1m 41s	remaining: 4.66s
956:	total: 1m 41s	remaining: 4.55s
957:	total: 1m 41s	remaining: 4.44s
958:	total: 1m 41s	remaining: 4.34s
959:	total: 1m 41s	remaining: 4.23s
960:	total: 1m 41s	remaining: 4.13s
961:	total: 1m 41s	remaining: 

<rectools.models.rerank.TwoStageModel at 0x7f8aa0e3d730>

In [33]:
reco = two_stage.recommend(
                    users=dataset.user_id_map.external_ids, 
                    dataset=dataset,
                    k=10,
                    filter_viewed=True
                )

In [34]:
reco

Unnamed: 0,user_id,item_id,score,rank
11088450,1097557,10440,2.058919,1
11088451,1097557,9728,1.749182,2
11088452,1097557,13865,1.673442,3
11088453,1097557,3734,1.146710,4
11088479,1097557,16228,0.888368,5
...,...,...,...,...
15169,0,4495,0.546036,6
15159,0,9996,0.499036,7
15160,0,7571,0.480380,8
15168,0,7829,0.475391,9


In [35]:
# TODO: CrossValidate

## LGBMClassifier guide

LGBMClassifier **cannot work with missing values**, so we must pre-process the data: 
1. Get rid of gaps in the data we want to work with (in the tutorial we want to use the feature description of users). 
2. Do not allow missing values to appear in the training sample for the second-stage model, obtained by adding the feature description. Therefore, to train the first-stage model, we will use user interactions with an available feature description

In [36]:
# Load the raw tables

DATA_PATH = Path("data_original")
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')

# Parse the watch date while reading, then expose it under the RecTools datetime column name
interactions = pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
interactions = interactions.rename(columns={"last_watch_dt": Columns.Datetime})
# Every interaction gets the same unit weight
interactions["weight"] = 1

In [37]:
# Any helper functions of your own for working with the loaded data
def encode_and_clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Encode object columns via `encode_cat_cols`, then fill remaining NaNs with column medians.

    Only columns that still contain NaN after encoding are touched; each of
    them gets its own median as the fill value.
    """
    encoded = encode_cat_cols(df)
    has_nan = encoded.isna().any()
    nan_columns = list(encoded.columns[has_nan])
    medians = encoded[nan_columns].median()
    encoded[nan_columns] = encoded[nan_columns].fillna(medians)
    return encoded

def encode_cat_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Replace every object-dtype column with its integer category codes.

    Each object column is converted to a pandas categorical and substituted by
    its codes, stored again as a categorical column. Missing values receive
    the code -1, as pandas assigns for `Categorical.codes`.

    The input frame is not modified; an encoded copy is returned.

    NOTE(review): a helper with this name is also defined in the
    CatBoostClassifier section of this notebook; consider defining it once.
    """
    # Work on a copy so that the caller's frame keeps its original values
    encoded = df.copy()
    for col in encoded.select_dtypes(include=['object']).columns:
        encoded[col] = encoded[col].astype('category').cat.codes.astype('category')
    return encoded

In [38]:
# coding categorical data and handling missing values
users = encode_and_clean_data(users)

In [39]:
# we check if all users who have interactions have a feature description (users)
interactions[Columns.User].nunique(), users[Columns.User].nunique()

(962179, 840197)

In [40]:
# Keep only the interactions of users that also have a row in the user feature table.
# Note that `interactions` is overwritten with the filtered frame.
user_ids_with_feature = np.intersect1d(interactions[Columns.User].unique(), users[Columns.User].unique())
interactions = interactions.query(f"{Columns.User} in @user_ids_with_feature")

In [41]:
dataset = Dataset.construct(interactions)

In [42]:
# Prepare first stage models
# NOTE(review): the fill values below presumably replace scores/ranks for pairs that a generator did not produce.
# The two generators use different score fill values (1.01 vs 1), and the outputs in this guide show
# PopularModel scores in the tens of thousands and KNN scores above 1 — confirm that these values are intended.
first_stage = [
    CandidateGenerator(
        model=PopularModel(),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1.01, # when working with the LGBMClassifier, empty scores are filled in
        ranks_fillna_value=31  # when working with the LGBMClassifier, empty ranks are filled in (31 = one past num_candidates)
    ), 
    CandidateGenerator(
        model=ImplicitItemKNNWrapperModel(CosineRecommender()),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1,  # when working with the LGBMClassifier, empty scores are filled in
        ranks_fillna_value=31   # when working with the LGBMClassifier, empty ranks are filled in (31 = one past num_candidates)
    )
]

# Prepare splitter for selecting reranker train. Only one fold is expected!
splitter = TimeRangeSplitter("7D")

In [43]:
# Write custom feature collecting funcs for users, items and user/item pairs
class CustomFeatureCollectorLGBM(CandidatesFeatureCollectorBase):
    """Feature collector that serves encoded, NaN-filled user features from `users.csv`."""

    def _get_user_features(
        self, users: AnyIds, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]], external_ids: bool
    ) -> pd.DataFrame:
        """Return encoded feature rows for the requested `users`.

        The whole `users.csv` table is read from `DATA_PATH` and passed through
        `encode_and_clean_data`, so the user features contain no empty values
        before the requested rows are selected. `dataset`, `fold_info` and
        `external_ids` are not used by this implementation.
        """
        encoded_features = encode_and_clean_data(pd.read_csv(DATA_PATH / 'users.csv'))
        is_requested = encoded_features[Columns.User].isin(users)
        return encoded_features[is_requested]

In [44]:
# select all users who have interactions
# NOTE(review): this rebinds `users`, which held the user feature DataFrame in earlier cells,
# to the dataset's external user ids; earlier cells that index `users` as a DataFrame
# will not behave the same if they are re-run after this one.
users = dataset.user_id_map.external_ids

In [45]:
# Now we specify our custom feature collector for TwoStageModel

two_stage = TwoStageModel(first_stage,
                          splitter,
                          RerankerBase(LGBMClassifier()),
                          feature_collector=CustomFeatureCollectorLGBM())

In [46]:
candidates = two_stage.get_train_with_targets_for_reranker(dataset)

In [47]:
# Now our candidates also have features for users (no empty values)
# LGBMClassifier cannot work with empty values
candidates.head(20)

Unnamed: 0,user_id,item_id,PopularModel_1_score,PopularModel_1_rank,ImplicitItemKNNWrapperModel_1_score,ImplicitItemKNNWrapperModel_1_rank,target,age,income,sex,kids_flg
0,689820,13865,92354.0,4.0,0.209987,5.0,0,1,4,1,1
1,1089243,4436,15115.0,21.0,1.0,31.0,0,0,2,0,1
2,423967,4151,62343.0,3.0,0.63289,2.0,1,0,2,0,0
3,613250,7571,21718.0,16.0,1.0,31.0,1,2,2,0,1
4,307964,4471,1.01,31.0,0.166363,20.0,1,3,3,0,0
5,88022,13865,92354.0,4.0,1.0,31.0,1,0,2,1,0
6,544358,8387,1.01,31.0,0.065108,10.0,0,2,3,0,0
7,944197,7571,21718.0,16.0,0.245627,21.0,0,0,2,1,0
8,39366,9054,1.01,31.0,0.24215,25.0,0,2,4,1,1
9,166229,11312,1.01,31.0,0.060993,10.0,0,2,4,1,0


In [50]:
cat_cols = ['age', 'income', 'sex']

# example parameters for running model training 
# more valid parameters here https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html#lightgbm.LGBMClassifier.fit
fit_params = {
    'categorical_feature': cat_cols,
}

In [51]:
two_stage.fit(dataset, **fit_params)

[LightGBM] [Info] Number of positive: 62765, number of negative: 269784
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 398
[LightGBM] [Info] Number of data points in the train set: 332549, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.188739 -> initscore=-1.458224
[LightGBM] [Info] Start training from score -1.458224


<rectools.models.rerank.TwoStageModel at 0x7f89335d5730>

In [52]:
reco = two_stage.recommend(
                    users=users, 
                    dataset=dataset,
                    k=10,
                    filter_viewed=True
                )

In [53]:
reco.head(10)

Unnamed: 0,user_id,item_id,score,rank
11088450,1097557,10440,0.568291,1
11088451,1097557,9728,0.487937,2
11088452,1097557,13865,0.484011,3
11088453,1097557,3734,0.361763,4
11088454,1097557,4880,0.292965,5
11088479,1097557,16228,0.288336,6
11088456,1097557,142,0.251141,7
11088459,1097557,12192,0.220386,8
28315820,1097557,5658,0.213797,9
11088461,1097557,7571,0.212762,10


In [54]:
# TODO: CrossValidate

## LGBMRanker guide

In [55]:
# Now we specify our custom feature collector for TwoStageModel

two_stage = TwoStageModel(first_stage,
                          splitter,
                          RerankerBase(LGBMRanker()),
                          feature_collector=CustomFeatureCollectorLGBM())

In [56]:
candidates = two_stage.get_train_with_targets_for_reranker(dataset)

In [57]:
# Now our candidates also have features for users (no empty values)
# LGBMRanker cannot work with empty values
candidates.head(20)

Unnamed: 0,user_id,item_id,PopularModel_1_score,PopularModel_1_rank,ImplicitItemKNNWrapperModel_1_score,ImplicitItemKNNWrapperModel_1_rank,target,age,income,sex,kids_flg
0,306574,12324,1.01,31.0,0.311469,18.0,0,1,2,0,0
1,773247,10440,125533.0,1.0,0.318116,2.0,1,3,2,0,0
2,288928,7784,1.01,31.0,0.055205,22.0,0,4,2,0,0
3,94181,11985,1.01,31.0,0.15698,20.0,0,2,2,0,0
4,427974,6774,1.01,31.0,0.301239,15.0,0,2,2,1,1
5,1069352,4495,15845.0,20.0,0.433809,15.0,0,3,2,1,1
6,950223,4436,15115.0,21.0,1.0,31.0,0,1,2,0,0
7,802889,4436,15115.0,22.0,0.164605,20.0,0,0,4,0,0
8,925367,16509,1.01,31.0,0.172052,22.0,0,4,2,0,0
9,495023,15297,118602.0,2.0,1.398827,13.0,1,2,2,0,0


To train a ranker, you need to compose the groups correctly. Here they are passed to the model inside `fit_params`.

Documentation on how to form groups for ranker (read about `group`):
https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRanker.html#lightgbm.LGBMRanker.fit

In [58]:
def get_group(df: pd.DataFrame) -> np.ndarray:
    """Return the number of candidate rows per user, ordered by ascending `user_id`.

    Counts the non-null `item_id` values within each `user_id` group.

    NOTE(review): LightGBM reads `group` as the sizes of consecutive row
    blocks, so these counts only line up with the training data if its rows
    are ordered by `user_id` — confirm for the frame actually used in `fit`.
    """
    per_user_counts = df.groupby('user_id')['item_id'].count()
    return per_user_counts.to_numpy()

In [59]:
cat_cols = ['age', 'income', 'sex']
# NOTE(review): `groups` is computed from the `candidates` frame built in an earlier cell, while
# `fit` below receives only `dataset` and presumably builds its own training frame —
# confirm that the per-user counts and the row order match what the ranker is trained on.
groups = get_group(candidates)

# example parameters for running model training 
# more valid parameters here
# https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRanker.html#lightgbm.LGBMRanker.fit
fit_params = {
    'categorical_feature': cat_cols,
    'group': groups
}

In [60]:
two_stage.fit(dataset, **fit_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002980 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 397
[LightGBM] [Info] Number of data points in the train set: 332549, number of used features: 8


<rectools.models.rerank.TwoStageModel at 0x7f88e12d3640>

In [61]:
reco = two_stage.recommend(
                    users=users, 
                    dataset=dataset,
                    k=10,
                    filter_viewed=True
                )

In [62]:
reco

Unnamed: 0,user_id,item_id,score,rank
11088450,1097557,10440,1.737577,1
11088452,1097557,13865,1.396927,2
11088451,1097557,9728,1.315158,3
11088453,1097557,3734,0.830932,4
11088454,1097557,4880,0.232542,5
...,...,...,...,...
15156,0,142,-0.042750,6
15176,0,12173,-0.096649,7
22337349,0,12324,-0.150246,8
22337350,0,5658,-0.274890,9


In [None]:
# TODO: CrossValidate