# CandidateRankingModel user guide


**Table of Contents**

* Load data: kion
* Initialization of CandidateRankingModel without features
* What if we want to easily add features to candidates?
    * From external source
* Using boostings from well-known libraries as a ranking model
    * CandidateRankingModel with gradient boosting from sklearn
        * Features of constructing model
    * CandidateRankingModel with gradient boosting from catboost
        * Features of constructing model
        * Using CatBoostClassifier
        * Using CatBoostRanker
    * CandidateRankingModel with gradient boosting from lightgbm
        * Features of constructing model
        * Using LGBMClassifier
        * Using LGBMRanker
            * An example of creating a class for a ranker different from those already provided in the implementation of the CandidateRankingModel
* CrossValidate
    * Evaluating the metrics of candidate ranking models and candidate generator models

In [1]:
import sys
# NOTE(review): machine-specific absolute path (presumably a local RecTools checkout) —
# adjust or remove it when running elsewhere or when RecTools is installed as a package
sys.path.append("/home/oirvach1/RecTools")

In [2]:
from rectools.models import PopularModel, ImplicitItemKNNWrapperModel
from implicit.nearest_neighbours import CosineRecommender
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset
from sklearn.linear_model import RidgeClassifier
from pathlib import Path
import pandas as pd
import numpy as np
from rectools import Columns
from lightgbm import LGBMClassifier, LGBMRanker
from catboost import CatBoostClassifier, CatBoostRanker
from sklearn.ensemble import GradientBoostingClassifier
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.model_selection import cross_validate
from rectools.models.candidate_ranking import (
CandidateRankingModel,
CandidateGenerator,
Reranker,
CatBoostReranker, 
CandidatesFeatureCollectorBase,
RankerBase
)
import typing as tp
from rectools.models.base import AnyIds

## Load data: kion

In [None]:
%%time
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
!unzip -o data_original.zip
!rm data_original.zip

In [3]:
# Prepare dataset

DATA_PATH = Path("data_original")

# User and item tables
users = pd.read_csv(DATA_PATH / "users.csv")
items = pd.read_csv(DATA_PATH / "items.csv")

# Interactions: parse the last watch date and rename that column to Columns.Datetime
interactions = pd.read_csv(DATA_PATH / "interactions.csv", parse_dates=["last_watch_dt"])
interactions = interactions.rename(columns={"last_watch_dt": Columns.Datetime})

# Every interaction gets the same weight of 1
interactions["weight"] = 1

In [4]:
dataset = Dataset.construct(interactions)

## Initialization of CandidateRankingModel without features

In [5]:
# Prepare splitter for selecting reranker train. Only one fold is expected!
splitter = TimeRangeSplitter("7D")

In [6]:
# Prepare first stage models: both generators use 30 candidates and keep ranks and scores
first_stage = [
    CandidateGenerator(
        model=PopularModel(),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
    ),
    CandidateGenerator(
        model=ImplicitItemKNNWrapperModel(CosineRecommender()),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
    ),
]

In [7]:
# Initialize CandidateRankingModel
# A negative sampler can be passed as well; here the default one is used

two_stage = CandidateRankingModel(
    candidate_generators=first_stage,
    splitter=splitter,
    reranker=Reranker(RidgeClassifier()),
)

In [8]:
# Split dataset interactions
# Fit first stage models on history dataset
# Generate recommendations from first stage -> Get candidates for reranker
# Add targets to all candidates
# Sample negatives (here the default PerUserNegativeSampler is used)
# TODO: we should probably make a public method to get the data before sampling

candidates = two_stage.get_train_with_targets_for_reranker(dataset)

In [9]:
# This is train data for boosting or any other reranker. id columns will be dropped before training
candidates.head(20)

Unnamed: 0,user_id,item_id,PopularModel_1_score,PopularModel_1_rank,ImplicitItemKNNWrapperModel_1_score,ImplicitItemKNNWrapperModel_1_rank,target
0,802752,2657,66415.0,6.0,0.294856,10.0,1
1,1028865,5658,,,0.173871,14.0,0
2,1073856,14703,16864.0,23.0,0.309493,15.0,0
3,614488,6626,,,0.12764,22.0,0
4,159141,2360,,,0.190745,3.0,0
5,211445,12173,14092.0,29.0,,,0
6,757685,7107,16279.0,12.0,,,0
7,677007,6295,,,0.083656,2.0,0
8,937473,8636,34148.0,7.0,0.84306,7.0,0
9,95012,15297,180487.0,1.0,0.645057,2.0,1


## What if we want to easily add features to candidates?

### From external source
Other options are:
- Get features from dataset
- Get time-based features using fold info from splitter
- Combine any of the above

In [10]:
# Write custom feature collecting funcs for users, items and user/item pairs
class CustomFeatureCollector(CandidatesFeatureCollectorBase):
    """Collect categorical user features from ``users.csv`` for reranker candidates.

    Only the user-level hook is overridden in this example.
    """

    def __init__(self, cat_cols: tp.List[str]) -> None:
        # Names of the user feature columns to load and encode as categories
        self.cat_cols = cat_cols

    # Any helper functions of your own for working with the loaded data
    def _encode_cat_cols(self, df: pd.DataFrame) -> pd.DataFrame:
        """Replace every column listed in ``cat_cols`` by its integer category codes.

        ``df`` is modified in place and returned. Missing values receive the
        code ``-1``; the encoded columns keep the ``category`` dtype.
        """
        names = self.cat_cols
        df[names] = df[names].astype("category")

        for name in names:
            as_category = df[name].astype("category")
            df[name] = as_category.cat.codes.astype("category")
        return df

    def _get_user_features(
        self, users: AnyIds, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]], external_ids: bool
    ) -> pd.DataFrame:
        """Return the encoded categorical features of the requested ``users``.

        Users present in ``dataset`` but absent from ``users.csv`` are appended
        with empty feature values, so their features are encoded as missing.
        ``fold_info`` and ``external_ids`` are not used by this implementation.
        """
        wanted_columns = [*self.cat_cols, Columns.User]
        features = pd.read_csv(DATA_PATH / "users.csv")[wanted_columns]

        # Dataset users that have no row in the features file
        known_ids = features[Columns.User].unique()
        missing_ids = np.setdiff1d(dataset.user_id_map.external_ids, known_ids)
        missing_users = pd.DataFrame(missing_ids, columns=[Columns.User])

        features = pd.concat([features, missing_users], axis=0)
        features = self._encode_cat_cols(features)

        is_requested = features[Columns.User].isin(users)
        return features[is_requested]

In [11]:
# Now we pass our custom feature collector to CandidateRankingModel

two_stage = CandidateRankingModel(
    candidate_generators=first_stage,
    splitter=splitter,
    reranker=Reranker(RidgeClassifier()),
    feature_collector=CustomFeatureCollector(cat_cols=["age", "income", "sex"]),
)

In [12]:
candidates = two_stage.get_train_with_targets_for_reranker(dataset)

In [13]:
# Now our candidates also have features for users
candidates.head(20)

Unnamed: 0,user_id,item_id,PopularModel_1_score,PopularModel_1_rank,ImplicitItemKNNWrapperModel_1_score,ImplicitItemKNNWrapperModel_1_rank,target,age,income,sex
0,1013374,13865,115095.0,3.0,0.457429,7.0,0,2,3,0
1,556537,4043,,,0.144062,21.0,0,2,2,0
2,344559,12995,21577.0,16.0,,,0,3,2,1
3,168400,7107,16279.0,27.0,,,0,1,2,0
4,763387,8707,,,0.113172,2.0,1,2,2,1
5,744933,11778,,,0.291469,16.0,0,2,3,1
6,760596,4495,19571.0,22.0,,,0,2,2,1
7,284815,13865,115095.0,3.0,0.294108,4.0,0,2,2,0
8,716459,2757,,,0.033903,15.0,0,-1,-1,-1
9,92264,9996,35718.0,9.0,0.478863,7.0,0,4,2,1


## Using boostings from well-known libraries as a ranking model

### CandidateRankingModel with gradient boosting from sklearn

**Features of constructing model:**
   - `GradientBoostingClassifier` works correctly with `Reranker`
   - `GradientBoostingClassifier` cannot work with missing values. When initializing CandidateGenerator, specify the parameter values `scores_fillna_value` and `ranks_fillna_value`.

In [14]:
# Prepare first stage models
# GradientBoostingClassifier cannot work with missing values, so both generators
# fill missing scores and ranks with constants.
# NOTE(review): 1.01 lies above the ImplicitItemKNNWrapperModel scores but far below the
# PopularModel scores shown in the candidates tables earlier — confirm that a single
# fill value is intended for both models
first_stage_gbc = [
    CandidateGenerator(
        model=PopularModel(),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1.01, # constant used in place of missing scores
        ranks_fillna_value=31  # constant used in place of missing ranks (one past num_candidates)
    ), 
    CandidateGenerator(
        model=ImplicitItemKNNWrapperModel(CosineRecommender()),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1.01, # constant used in place of missing scores
        ranks_fillna_value=31  # constant used in place of missing ranks (one past num_candidates)
    )
]

In [15]:
# GradientBoostingClassifier is wrapped in the generic Reranker
two_stage_gbc = CandidateRankingModel(
    candidate_generators=first_stage_gbc,
    splitter=splitter,
    reranker=Reranker(GradientBoostingClassifier()),
)

In [16]:
two_stage_gbc.fit(dataset)

<rectools.models.candidate_ranking.CandidateRankingModel at 0x7fc7af868bb0>

In [17]:
reco_gbc = two_stage_gbc.recommend(
                    users=dataset.user_id_map.external_ids, 
                    dataset=dataset,
                    k=10,
                    filter_viewed=True
                )

In [18]:
reco_gbc.head(5)

Unnamed: 0,user_id,item_id,score,rank
13958790,1097557,10440,0.625184,1
13958792,1097557,13865,0.498481,2
13958791,1097557,9728,0.471884,3
13958793,1097557,3734,0.341348,4
13958794,1097557,2657,0.289382,5


### CandidateRankingModel with gradient boosting from catboost

**Features of constructing model:**
- for `CatBoostClassifier` and `CatBoostRanker` it is necessary to process categorical features: fill in empty values (if there are categorical features in the training sample for Rerankers). You can do this with CustomFeatureCollector.

**Using CatBoostClassifier**
- `CatBoostClassifier` works correctly with CatBoostReranker

In [19]:
# Prepare first stage models
# No fillna values are passed here, unlike in the sklearn example above
# (presumably missing scores and ranks stay NaN — confirm against the CandidateGenerator defaults)
first_stage_catboost = [
    CandidateGenerator(
        model=PopularModel(),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
    ), 
    CandidateGenerator(
        model=ImplicitItemKNNWrapperModel(CosineRecommender()),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
    )
]

In [20]:
# User feature columns that are treated as categorical
cat_cols = ["age", "income", "sex"]

# Categorical feature names must be passed through pool_kwargs
pool_kwargs = {"cat_features": cat_cols}

In [21]:
# CatBoostClassifier is wrapped in CatBoostReranker (for faster work with large amounts of data)
# Parameters can also be passed to CatBoostReranker through fit_kwargs and pool_kwargs

two_stage_catboost_classifier = CandidateRankingModel(
                                    candidate_generators=first_stage_catboost,
                                    splitter=splitter,
                                    reranker=CatBoostReranker(CatBoostClassifier(verbose=False), pool_kwargs=pool_kwargs),
                                    feature_collector=CustomFeatureCollector(cat_cols)
                                    )

In [23]:
two_stage_catboost_classifier.fit(dataset)

<rectools.models.candidate_ranking.CandidateRankingModel at 0x7fc7c5b24130>

In [24]:
reco_catboost_classifier = two_stage_catboost_classifier.recommend(
                    users=dataset.user_id_map.external_ids, 
                    dataset=dataset,
                    k=10,
                    filter_viewed=True
                )

In [25]:
reco_catboost_classifier.head(5)

Unnamed: 0,user_id,item_id,score,rank
13958790,1097557,10440,0.628767,1
13958792,1097557,13865,0.483165,2
13958791,1097557,9728,0.460447,3
13958793,1097557,3734,0.338967,4
13958796,1097557,142,0.299403,5


**Using CatBoostRanker**
- `CatBoostRanker` works correctly with CatBoostReranker

In [26]:
# CatBoostRanker is wrapped in CatBoostReranker (for faster work with large amounts of data)

two_stage_catboost_ranker = CandidateRankingModel(
    candidate_generators=first_stage_catboost,
    splitter=splitter,
    # NOTE(review): per the original comment, CatBoostReranker creates a CatBoostRanker
    # by default when no model is given — confirm
    reranker=CatBoostReranker(CatBoostRanker(verbose=False), pool_kwargs=pool_kwargs),
    feature_collector=CustomFeatureCollector(cat_cols),
)

In [27]:
two_stage_catboost_ranker.fit(dataset)

<rectools.models.candidate_ranking.CandidateRankingModel at 0x7fc97a3b2ee0>

In [28]:
reco_catboost_ranker = two_stage_catboost_ranker.recommend(
                    users=dataset.user_id_map.external_ids, 
                    dataset=dataset,
                    k=10,
                    filter_viewed=True
                )

In [29]:
reco_catboost_ranker.head(5)

Unnamed: 0,user_id,item_id,score,rank
13958790,1097557,10440,2.357945,1
13958792,1097557,13865,1.706368,2
13958791,1097557,9728,1.652374,3
13958793,1097557,3734,1.201118,4
13958796,1097557,142,1.051281,5


### CandidateRankingModel with gradient boosting from lightgbm
**Features of constructing model:**
- `LGBMClassifier` and `LGBMRanker` cannot work with missing values

**Using LGBMClassifier**
- `LGBMClassifier` works correctly with `Reranker`

In [30]:
# Prepare first stage models
# Both generators fill missing scores and ranks with constants.
# NOTE(review): the KNN generator fills scores with 1 here, while the sklearn example
# above uses 1.01 for the same model — confirm which value is intended
first_stage_lgbm = [
    CandidateGenerator(
        model=PopularModel(),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1.01, # constant used in place of missing scores
        ranks_fillna_value=31  # constant used in place of missing ranks (one past num_candidates)
    ), 
    CandidateGenerator(
        model=ImplicitItemKNNWrapperModel(CosineRecommender()),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1,  # constant used in place of missing scores
        ranks_fillna_value=31   # constant used in place of missing ranks (one past num_candidates)
    )
]

In [31]:
# User feature columns that are treated as categorical
cat_cols = ["age", "income", "sex"]

# Example parameters for running model training.
# More valid parameters are listed here:
# https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html#lightgbm.LGBMClassifier.fit
fit_params = {"categorical_feature": cat_cols}

In [32]:
# LGBMClassifier is wrapped in the generic Reranker; fit_params is passed as its second argument
two_stage_lgbm_classifier = CandidateRankingModel(
    candidate_generators=first_stage_lgbm,
    splitter=splitter,
    reranker=Reranker(LGBMClassifier(), fit_params),
    feature_collector=CustomFeatureCollector(cat_cols),
)

In [33]:
two_stage_lgbm_classifier.fit(dataset)

[LightGBM] [Info] Number of positive: 78233, number of negative: 330228
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 397
[LightGBM] [Info] Number of data points in the train set: 408461, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.191531 -> initscore=-1.440092
[LightGBM] [Info] Start training from score -1.440092


<rectools.models.candidate_ranking.CandidateRankingModel at 0x7fc9836905e0>

In [34]:
reco_lgbm_classifier = two_stage_lgbm_classifier.recommend(
                    users=dataset.user_id_map.external_ids, 
                    dataset=dataset,
                    k=10,
                    filter_viewed=True
                )

In [35]:
reco_lgbm_classifier.head(5)

Unnamed: 0,user_id,item_id,score,rank
13958790,1097557,10440,0.62283,1
13958792,1097557,13865,0.517122,2
13958791,1097557,9728,0.490347,3
13958793,1097557,3734,0.370404,4
13958796,1097557,142,0.298133,5


**Using LGBMRanker**
- `LGBMRanker` does not work correctly with the plain `Reranker`!

When using LGBMRanker, you need to correctly compose groups. To do this, you can create a class inheriting from `Reranker` and override the method `prepare_fit_kwargs` in it.

Documentation on how to form groups for LGBMRanker (read about `group`):
https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRanker.html#lightgbm.LGBMRanker.fit

An example of creating a class for a ranker different from those already provided in the implementation of the CandidateRankingModel

In [36]:
class LGBMReranker(Reranker):
    """Reranker for ``LGBMRanker`` that adds the per-user ``group`` sizes to the fit arguments.

    Parameters
    ----------
    model : RankerBase, optional
        Ranking model to train. When omitted, a new ``LGBMRanker()`` is created
        for this instance.
    fit_kwargs : dict, optional
        Extra keyword arguments merged into the generated fit arguments. On a
        key collision they override the generated ``X`` / ``y`` / ``group``.
    """

    def __init__(
        self,
        model: tp.Optional[RankerBase] = None,
        fit_kwargs: tp.Optional[tp.Dict[str, tp.Any]] = None,
    ):
        # A default written as ``model=LGBMRanker()`` is evaluated only once, when the
        # class body is executed, so every instance would share one estimator object.
        # Create the default lazily instead.
        if model is None:
            model = LGBMRanker()
        super().__init__(model)
        self.fit_kwargs = fit_kwargs

    def _get_group(self, df: pd.DataFrame) -> np.ndarray:
        """Return the number of rows of each user, ordered by ascending user id."""
        # ``size`` counts every row, so the sizes always sum to ``len(df)``
        return df.groupby(by=[Columns.User]).size().values

    def prepare_fit_kwargs(self, candidates_with_target: pd.DataFrame) -> tp.Dict[str, tp.Any]:
        """Build the keyword arguments for fitting the ranking model.

        Parameters
        ----------
        candidates_with_target : pd.DataFrame
            Candidates with the user/item id columns, the feature columns and
            the target column.

        Returns
        -------
        dict
            ``X``, ``y`` and ``group`` entries, updated with the user-supplied
            ``fit_kwargs`` when those were given.

        Raises
        ------
        ValueError
            If the wrapped model is not a ``RankerBase``.
        """
        if not isinstance(self.model, RankerBase):
            raise ValueError("Got unexpected model_type")

        # Rows of one user must be contiguous and follow the same user order as the group sizes
        ordered = candidates_with_target.sort_values(by=[Columns.User])
        groups = self._get_group(ordered)

        # Id columns are not features
        train = ordered.drop(columns=Columns.UserItem)

        fit_kwargs: tp.Dict[str, tp.Any] = {
            "X": train.drop(columns=Columns.Target),
            "y": train[Columns.Target],
            "group": groups,
        }

        if self.fit_kwargs is not None:
            fit_kwargs.update(self.fit_kwargs)

        return fit_kwargs

In [37]:
# User feature columns that are treated as categorical
cat_cols = ["age", "income", "sex"]

# Example parameters for running model training.
# More valid parameters are listed here:
# https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRanker.html#lightgbm.LGBMRanker.fit
fit_params = {"categorical_feature": cat_cols}

In [38]:
# Now we pass our custom reranker and custom feature collector to CandidateRankingModel

two_stage_lgbm_ranker = CandidateRankingModel(
    candidate_generators=first_stage_lgbm,
    splitter=splitter,
    reranker=LGBMReranker(fit_kwargs=fit_params),
    feature_collector=CustomFeatureCollector(cat_cols),
)

In [39]:
two_stage_lgbm_ranker.fit(dataset)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002516 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 408461, number of used features: 7


<rectools.models.candidate_ranking.CandidateRankingModel at 0x7fc980a79100>

In [40]:
reco_lgbm_ranker = two_stage_lgbm_ranker.recommend(
                    users=dataset.user_id_map.external_ids, 
                    dataset=dataset,
                    k=10,
                    filter_viewed=True
                )

In [41]:
reco_lgbm_ranker.head(5)

Unnamed: 0,user_id,item_id,score,rank
13958790,1097557,10440,2.139239,1
13958792,1097557,13865,1.326478,2
13958791,1097557,9728,1.322309,3
13958793,1097557,3734,0.936341,4
13958794,1097557,2657,0.590274,5


## CrossValidate
### Evaluating the metrics of candidate ranking models and candidate generator models.

In [42]:
# Take a few models to compare: the two first-stage models on their own
# and the two-stage model objects created in the sections above
models = {
    "popular": PopularModel(),
    "cosine_knn": ImplicitItemKNNWrapperModel(CosineRecommender()),
    "two_stage_gbc": two_stage_gbc,
    "two_stage_catboost_classifier": two_stage_catboost_classifier,
    "two_stage_catboost_ranker": two_stage_catboost_ranker,
    "two_stage_lgbm_classifier": two_stage_lgbm_classifier,
    "two_stage_lgbm_ranker": two_stage_lgbm_ranker
}

# We will calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@1": Precision(k=1),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty@10": MeanInvUserFreq(k=10),
    "serendipity@10": Serendipity(k=10),
}

# Value passed as `k` to cross_validate below
K_RECS = 10

In [43]:
%%time

cv_results = cross_validate(
    dataset=dataset,
    splitter=splitter,
    models=models,
    metrics=metrics,
    k=K_RECS,
    filter_viewed=True,
)

[LightGBM] [Info] Number of positive: 73891, number of negative: 310533
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 394
[LightGBM] [Info] Number of data points in the train set: 384424, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192212 -> initscore=-1.435699
[LightGBM] [Info] Start training from score -1.435699
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 395
[LightGBM] [Info] Number of data points in the train set: 384424, number of used features: 7
CPU times: user 2h 41min 10s, sys: 12min 36s, total: 2h 53min 47s
Wall time: 12min 17s


In [44]:
# Average every metric over the splits: one row per model, in order of first appearance
pivot_results = pd.DataFrame(cv_results["metrics"])
pivot_results = pivot_results.drop(columns="i_split")
pivot_results = pivot_results.groupby(["model"], sort=False).agg(["mean"])
pivot_results

Unnamed: 0_level_0,prec@1,prec@10,recall@10,novelty@10,serendipity@10
Unnamed: 0_level_1,mean,mean,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
popular,0.070806,0.032655,0.166089,3.715659,2e-06
cosine_knn,0.079372,0.036757,0.176609,5.75866,0.000189
two_stage_gbc,0.085232,0.039578,0.194186,4.830777,0.000155
two_stage_catboost_classifier,0.085486,0.038244,0.18683,4.897734,0.000152
two_stage_catboost_ranker,0.088675,0.039527,0.193629,4.845282,0.000155
two_stage_lgbm_classifier,0.085823,0.039226,0.192465,4.863045,0.000155
two_stage_lgbm_ranker,0.086795,0.039663,0.194782,4.807097,0.000147
