In [1]:
import os
import sys

# Make a local RecTools checkout importable.
# The location can be overridden with the RECTOOLS_PATH environment variable;
# the fallback is the path this notebook was originally run with.
sys.path.append(os.environ.get("RECTOOLS_PATH", "/home/oirvach1/RecTools"))

# `TwoStageModel` user guide

In [2]:
from rectools.models import PopularModel, ImplicitItemKNNWrapperModel
from implicit.nearest_neighbours import CosineRecommender
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset
from sklearn.linear_model import RidgeClassifier
from pathlib import Path
import pandas as pd
import numpy as np
from rectools import Columns
from rectools.models.rerank import TwoStageModel, CandidateGenerator, RerankerBase, CatBoostReranker
from lightgbm import LGBMClassifier, LGBMRanker
from catboost import CatBoostClassifier, CatBoostRanker
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
%%time
# Download the KION dataset archive (URL pinned to a specific commit), unpack it
# (`-o`: overwrite existing files without asking) and remove the archive afterwards.
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
!unzip -o data_original.zip
!rm data_original.zip

In [3]:
# Load the raw tables

DATA_PATH = Path("data_original")
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')

# Interactions: parse the watch date, expose it under the RecTools datetime column name
# and give every interaction the same weight of 1
interactions = pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
interactions = interactions.rename(columns={"last_watch_dt": Columns.Datetime})
interactions["weight"] = 1

In [4]:
# Check whether all users who have interactions also have a feature description:
# compare the number of unique users in `interactions` with the number in `users`
interactions[Columns.User].nunique(), users[Columns.User].nunique()

(962179, 840197)

In [5]:
# Keep only the interactions of users that have a feature description
user_ids_with_feature = np.intersect1d(interactions[Columns.User].unique(), users[Columns.User].unique())
interactions = interactions[interactions[Columns.User].isin(user_ids_with_feature)]

In [6]:
# Build the RecTools dataset from the interactions table
dataset = Dataset.construct(interactions)

In [7]:
# Prepare the splitter used to select the train data for the reranker. Only one fold is expected!
# NOTE(review): "7D" is passed positionally — presumably the size of the test window; confirm against TimeRangeSplitter docs
splitter = TimeRangeSplitter("7D")

In [8]:
# Prepare first stage models: each generator yields 30 candidates
# and keeps both the ranks and the scores of its model
first_stage = [
    CandidateGenerator(
        model=PopularModel(),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
    ),
    CandidateGenerator(
        model=ImplicitItemKNNWrapperModel(CosineRecommender()),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
    ),
]

In [9]:
# Initialize TwoStageModel from the first stage models, the splitter and a reranker.
# A negative sampler can be passed as well, but here the default one is used.

two_stage = TwoStageModel(
    first_stage,
    splitter,
    RerankerBase(RidgeClassifier()),
)

In [10]:
# Steps performed by this call:
# 1. Split dataset interactions
# 2. Fit first stage models on the history dataset
# 3. Generate recommendations from the first stage -> get candidates for the reranker
# 4. Add targets to all candidates
# 5. Sample negatives (here the default PerUserNegativeSampler is used)
# TODO: we should probably make a public method to get the data before sampling
candidates = two_stage.get_train_with_targets_for_reranker(dataset)

In [11]:
# This is the train data for boosting or any other reranker.
# The id columns will be dropped before training.
candidates.head(20)

Unnamed: 0,user_id,item_id,PopularModel_1_score,PopularModel_1_rank,ImplicitItemKNNWrapperModel_1_score,ImplicitItemKNNWrapperModel_1_rank,target
0,872876,4740,22025.0,15.0,,,0
1,277604,6402,,,0.132395,21.0,0
2,167687,9728,101721.0,3.0,0.236251,2.0,0
3,1073277,1844,20398.0,17.0,,,0
4,57311,3734,56265.0,5.0,,,0
5,152916,7102,14827.0,19.0,0.780292,9.0,0
6,611492,1819,,,0.21557,26.0,0
7,862159,4495,15845.0,16.0,0.950969,9.0,0
8,563584,12192,24217.0,8.0,0.818129,7.0,0
9,937248,3031,,,0.24751,14.0,0


## What if we want to easily add features to candidates?

### Variant 1: from external source
Here the features are loaded from an external source (a CSV file). Other options are:
- Get features from the dataset
- Get time-based features using the fold info from the splitter
- Combine any of the above

In [12]:
from rectools.models.rerank import CandidatesFeatureCollectorBase
import typing as tp
from rectools.models.base import AnyIds

# Write custom feature collecting funcs for users, items and user/item pairs
class CustomFeatureCollector(CandidatesFeatureCollectorBase):
    """Feature collector that adds user features read from ``users.csv`` to the candidates."""

    # Helper functions for working with the loaded data
    def _encode_and_clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode object columns and fill the remaining gaps with column medians.

        Parameters
        ----------
        df : pd.DataFrame
            Raw feature table. It is not modified.

        Returns
        -------
        pd.DataFrame
            New frame in which object columns are replaced by category codes and
            NaN values in the other columns are replaced by the column median.
        """
        df_encode = self._encode_cat_cols(df)
        cols_with_nan = df_encode.columns[df_encode.isna().any()].tolist()
        if cols_with_nan:  # nothing to fill when no column contains NaN
            df_encode[cols_with_nan] = df_encode[cols_with_nan].fillna(df_encode[cols_with_nan].median())
        return df_encode

    def _encode_cat_cols(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with every object column replaced by its category codes.

        Missing values in an object column receive the code -1 (the pandas code for NaN),
        so the encoded columns contain no NaN afterwards.
        """
        df_encoded = df.copy()  # work on a copy: the caller's frame must stay untouched
        for col in df_encoded.select_dtypes(include=['object']).columns:
            df_encoded[col] = df_encoded[col].astype('category').cat.codes.astype('category')
        return df_encoded

    def _get_user_features(
        self, users: AnyIds, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]], external_ids: bool
    ) -> pd.DataFrame:
        """Load user features from ``users.csv`` and keep only the rows of the requested users.

        ``dataset``, ``fold_info`` and ``external_ids`` are part of the signature but are
        not used here: ``users`` is matched against the user id column of the CSV as is.
        """
        user_features = pd.read_csv(DATA_PATH / 'users.csv')
        user_features = self._encode_and_clean_data(user_features)
        return user_features[user_features[Columns.User].isin(users)]

In [13]:
# Plug the custom feature collector into TwoStageModel

two_stage = TwoStageModel(
    first_stage,
    splitter,
    RerankerBase(RidgeClassifier()),
    feature_collector=CustomFeatureCollector(),
)

In [14]:
# Build the reranker train data again, this time with the two-stage model that has a feature collector
candidates = two_stage.get_train_with_targets_for_reranker(dataset)

In [15]:
# Now our candidates also have the user features as extra columns
candidates.head(20)

Unnamed: 0,user_id,item_id,PopularModel_1_score,PopularModel_1_rank,ImplicitItemKNNWrapperModel_1_score,ImplicitItemKNNWrapperModel_1_rank,target,age,income,sex,kids_flg
0,727820,1844,20398.0,16.0,,,0,2,3,0,0
1,170491,8030,,,0.113228,13.0,0,2,3,1,0
2,735251,12995,18300.0,16.0,0.200564,18.0,0,1,2,1,1
3,718660,4151,62343.0,5.0,0.294224,11.0,1,2,4,1,1
4,476388,8636,28619.0,8.0,0.505805,6.0,0,3,2,0,0
5,418051,9728,101721.0,3.0,,,1,4,3,1,1
6,476990,4436,15115.0,24.0,,,0,0,3,0,0
7,609920,14741,18081.0,18.0,,,0,3,3,0,0
8,464805,14703,14455.0,27.0,,,0,3,2,0,0
9,903061,12173,12485.0,30.0,,,0,3,2,1,0


## GradientBoostingClassifier guide

**Pay attention to:**
   - `GradientBoostingClassifier` cannot work with missing values. When initializing CandidateGenerator, specify the parameter values `scores_fillna_value` and `ranks_fillna_value`.

In [16]:
# Prepare first stage models.
# GradientBoostingClassifier cannot work with missing values, so empty scores and ranks
# (e.g. for candidates that only the other generator produced) are replaced by the fill values below.
# NOTE(review): the original comments called 1.01 "max score" and 31 "min rank". In the candidates
# table shown above PopularModel scores are in the tens of thousands, so 1.01 acts as a low
# sentinel there; 31 is one past the last rank for num_candidates=30. Confirm this is intended.
first_stage_gbc = [
    CandidateGenerator(
        model=PopularModel(),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1.01, # fill value for empty scores
        ranks_fillna_value=31  # fill value for empty ranks
    ), 
    CandidateGenerator(
        model=ImplicitItemKNNWrapperModel(CosineRecommender()),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1.01, # fill value for empty scores
        ranks_fillna_value=31  # fill value for empty ranks
    )
]

In [17]:
# Two-stage model with a GradientBoostingClassifier as the reranker
two_stage_gbc = TwoStageModel(
    first_stage_gbc,
    splitter,
    RerankerBase(GradientBoostingClassifier()),
)

In [18]:
# Fit the two-stage model with the GradientBoostingClassifier reranker
two_stage_gbc.fit(dataset)

<rectools.models.rerank.TwoStageModel at 0x7f7d14f4fc40>

In [19]:
# Recommend 10 items to every user of the dataset (external ids), with `filter_viewed` enabled
reco_gbc = two_stage_gbc.recommend(
    users=dataset.user_id_map.external_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [20]:
# Peek at the first recommendations of the GradientBoostingClassifier pipeline
reco_gbc.head(5)

Unnamed: 0,user_id,item_id,score,rank
11088450,1097557,10440,0.578242,1
11088452,1097557,13865,0.521274,2
11088451,1097557,9728,0.466168,3
11088453,1097557,3734,0.340258,4
11088454,1097557,4880,0.25392,5


## CatBoost Reranker guide

**Pay attention to:**
- for `CatBoostClassifier` and `CatBoostRanker` it is necessary to process categorical features: fill in empty values (if there are categorical features in the training sample for Rerankers). You can do this with CustomFeatureCollector.

### CatBoostClassifier

In [21]:
# Write custom feature collecting funcs for users, items and user/item pairs
# NOTE: same logic as `CustomFeatureCollector` defined earlier in this notebook
class CustomFeatureCollectorCatBoost(CandidatesFeatureCollectorBase):
    """Feature collector for the CatBoost rerankers: adds user features read from ``users.csv``."""

    # Helper functions for working with the loaded data
    def _encode_and_clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode object columns and fill the remaining gaps with column medians.

        The input frame is not modified; a new frame is returned in which object
        columns are replaced by category codes and NaN values in the other columns
        are replaced by the column median.
        """
        df_encode = self._encode_cat_cols(df)
        cols_with_nan = df_encode.columns[df_encode.isna().any()].tolist()
        if cols_with_nan:  # nothing to fill when no column contains NaN
            df_encode[cols_with_nan] = df_encode[cols_with_nan].fillna(df_encode[cols_with_nan].median())
        return df_encode

    def _encode_cat_cols(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with every object column replaced by its category codes.

        Missing values in an object column receive the code -1 (the pandas code for NaN),
        so the encoded columns contain no NaN afterwards.
        """
        df_encoded = df.copy()  # work on a copy: the caller's frame must stay untouched
        for col in df_encoded.select_dtypes(include=['object']).columns:
            df_encoded[col] = df_encoded[col].astype('category').cat.codes.astype('category')
        return df_encoded

    def _get_user_features(
        self, users: AnyIds, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]], external_ids: bool
    ) -> pd.DataFrame:
        """Load user features from ``users.csv`` and keep only the rows of the requested users.

        ``dataset``, ``fold_info`` and ``external_ids`` are part of the signature but are
        not used here: ``users`` is matched against the user id column of the CSV as is.
        """
        user_features = pd.read_csv(DATA_PATH / 'users.csv')
        user_features = self._encode_and_clean_data(user_features)
        return user_features[user_features[Columns.User].isin(users)]

In [22]:
# Prepare first stage models.
# No `scores_fillna_value` / `ranks_fillna_value` are passed here, unlike in the
# GradientBoostingClassifier section.
first_stage_catboost = [
    CandidateGenerator(
        model=PopularModel(),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
    ), 
    CandidateGenerator(
        model=ImplicitItemKNNWrapperModel(CosineRecommender()),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
    )
]

In [23]:
# User feature columns passed to CatBoost as categorical features
cat_cols = ["age", "income", "sex"]

# Example parameters for running model training.
# More valid parameters here: https://catboost.ai/en/docs/concepts/python-reference_pool
fit_params = {"cat_features": cat_cols}

In [24]:
# Now we specify our custom feature collector for TwoStageModel.
# CatBoostClassifier is passed through CatBoostReranker (for faster work with large amounts of data).

two_stage_catboost_classifier = TwoStageModel(
    first_stage_catboost,
    splitter,
    CatBoostReranker(CatBoostClassifier(verbose=False), fit_params),
    feature_collector=CustomFeatureCollectorCatBoost(),
)

In [25]:
# Fit the two-stage model with the CatBoostClassifier reranker
two_stage_catboost_classifier.fit(dataset)

<rectools.models.rerank.TwoStageModel at 0x7f7b6a1c9b50>

In [26]:
# Recommend 10 items to every user of the dataset (external ids), with `filter_viewed` enabled
reco_catboost_classifier = two_stage_catboost_classifier.recommend(
    users=dataset.user_id_map.external_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [27]:
# Peek at the first recommendations of the CatBoostClassifier pipeline
reco_catboost_classifier.head(5)

Unnamed: 0,user_id,item_id,score,rank
11088450,1097557,10440,0.57639,1
11088451,1097557,9728,0.546633,2
11088452,1097557,13865,0.498735,3
11088479,1097557,16228,0.382236,4
11088453,1097557,3734,0.366165,5


### CatBoostRanker

In [28]:
# User feature columns passed to CatBoost as categorical features
cat_cols = ["age", "income", "sex"]

# Example parameters for running model training.
# More valid parameters here: https://catboost.ai/en/docs/concepts/python-reference_pool
fit_params = dict(cat_features=cat_cols)

In [29]:
# Now we specify our custom feature collector for TwoStageModel.
# CatBoostRanker is passed through CatBoostReranker (for faster work with large amounts of data);
# according to the original note, CatBoostRanker is also what CatBoostReranker is initialized with by default.

two_stage_catboost_ranker = TwoStageModel(
    first_stage_catboost,
    splitter,
    CatBoostReranker(CatBoostRanker(verbose=False), fit_params),
    feature_collector=CustomFeatureCollectorCatBoost(),
)

In [30]:
# Fit the two-stage model with the CatBoostRanker reranker
two_stage_catboost_ranker.fit(dataset)

<rectools.models.rerank.TwoStageModel at 0x7f7af951a7f0>

In [31]:
# Recommend 10 items to every user of the dataset (external ids), with `filter_viewed` enabled
reco_catboost_ranker = two_stage_catboost_ranker.recommend(
    users=dataset.user_id_map.external_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [32]:
# Peek at the first recommendations of the CatBoostRanker pipeline
reco_catboost_ranker.head(5)

Unnamed: 0,user_id,item_id,score,rank
11088450,1097557,10440,2.395013,1
11088451,1097557,9728,1.730144,2
11088452,1097557,13865,1.660024,3
11088453,1097557,3734,1.249525,4
11088479,1097557,16228,0.939496,5


## LGBM Reranker guide

**Pay attention to:**

- `LGBMClassifier` and `LGBMRanker` cannot work with missing values

So we must pre-process the data, for example: 
1. Get rid of gaps in the data we want to work with (in the tutorial we want to use the feature description of users). 
2. Do not allow missing values to appear in the training sample for the second-stage model, obtained by adding the feature description. Therefore, to train the first-stage model, we will use user interactions with an available feature description

### LGBMClassifier

In [33]:
# Write custom feature collecting funcs for users, items and user/item pairs
# NOTE: same logic as `CustomFeatureCollector` defined earlier in this notebook
class CustomFeatureCollectorLGBM(CandidatesFeatureCollectorBase):
    """Feature collector for the LightGBM rerankers: adds user features read from ``users.csv``."""

    # Helper functions for working with the loaded data
    def _encode_and_clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode object columns and fill the remaining gaps with column medians.

        The input frame is not modified; a new frame is returned in which object
        columns are replaced by category codes and NaN values in the other columns
        are replaced by the column median.
        """
        df_encode = self._encode_cat_cols(df)
        cols_with_nan = df_encode.columns[df_encode.isna().any()].tolist()
        if cols_with_nan:  # nothing to fill when no column contains NaN
            df_encode[cols_with_nan] = df_encode[cols_with_nan].fillna(df_encode[cols_with_nan].median())
        return df_encode

    def _encode_cat_cols(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with every object column replaced by its category codes.

        Missing values in an object column receive the code -1 (the pandas code for NaN),
        so the encoded columns contain no NaN afterwards.
        """
        df_encoded = df.copy()  # work on a copy: the caller's frame must stay untouched
        for col in df_encoded.select_dtypes(include=['object']).columns:
            df_encoded[col] = df_encoded[col].astype('category').cat.codes.astype('category')
        return df_encoded

    def _get_user_features(
        self, users: AnyIds, dataset: Dataset, fold_info: tp.Optional[tp.Dict[str, tp.Any]], external_ids: bool
    ) -> pd.DataFrame:
        """Load user features from ``users.csv`` and keep only the rows of the requested users.

        ``dataset``, ``fold_info`` and ``external_ids`` are part of the signature but are
        not used here: ``users`` is matched against the user id column of the CSV as is.
        """
        user_features = pd.read_csv(DATA_PATH / 'users.csv')
        user_features = self._encode_and_clean_data(user_features)
        return user_features[user_features[Columns.User].isin(users)]

In [34]:
# Prepare first stage models.
# Empty scores and ranks (e.g. for candidates that only the other generator produced)
# are replaced by the fill values below.
# NOTE(review): the two generators use different score fill values (1.01 vs 1), and the
# original comments called them "max score" / "min rank" — confirm the intended values.
first_stage_lgbm = [
    CandidateGenerator(
        model=PopularModel(),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1.01, # fill value for empty scores
        ranks_fillna_value=31  # fill value for empty ranks (one past the last rank for num_candidates=30)
    ), 
    CandidateGenerator(
        model=ImplicitItemKNNWrapperModel(CosineRecommender()),
        num_candidates=30,
        keep_ranks=True,
        keep_scores=True,
        scores_fillna_value=1,  # fill value for empty scores
        ranks_fillna_value=31   # fill value for empty ranks (one past the last rank for num_candidates=30)
    )
]

In [35]:
# User feature columns passed to LightGBM as categorical features
cat_cols = ["age", "income", "sex"]

# Example parameters for running model training.
# More valid parameters here: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html#lightgbm.LGBMClassifier.fit
fit_params = {"categorical_feature": cat_cols}

In [36]:
# Now we specify our custom feature collector for TwoStageModel;
# the LGBMClassifier is wrapped in RerankerBase together with its fit parameters

two_stage_lgbm_classifier = TwoStageModel(
    first_stage_lgbm,
    splitter,
    RerankerBase(LGBMClassifier(), fit_params),
    feature_collector=CustomFeatureCollectorLGBM(),
)

In [37]:
# Fit the two-stage model with the LGBMClassifier reranker
two_stage_lgbm_classifier.fit(dataset)

[LightGBM] [Info] Number of positive: 62765, number of negative: 269784
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 397
[LightGBM] [Info] Number of data points in the train set: 332549, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.188739 -> initscore=-1.458224
[LightGBM] [Info] Start training from score -1.458224


<rectools.models.rerank.TwoStageModel at 0x7f7b383768e0>

In [38]:
# Recommend 10 items to every user of the dataset (external ids), with `filter_viewed` enabled
reco_lgbm_classifier = two_stage_lgbm_classifier.recommend(
    users=dataset.user_id_map.external_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [39]:
# Peek at the first recommendations of the LGBMClassifier pipeline
reco_lgbm_classifier.head(5)

Unnamed: 0,user_id,item_id,score,rank
11088450,1097557,10440,0.582613,1
11088452,1097557,13865,0.523461,2
11088451,1097557,9728,0.497567,3
11088453,1097557,3734,0.360205,4
11088454,1097557,4880,0.278647,5


### LGBMRanker

When using LGBMRanker, you need to correctly compose groups. To do this, you can create a class inheriting from RerankerBase and override method `prepare_fit_kwargs` in it.

Documentation on how to form groups for LGBMRanker (read about `group`):
https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRanker.html#lightgbm.LGBMRanker.fit

In [99]:
from rectools.models.rerank import ClassifierBase, RankerBase

class LGBMReranker(RerankerBase):
    """Reranker for LightGBM models that passes per-user ``group`` sizes to rankers.

    Parameters
    ----------
    model : ClassifierBase or RankerBase, optional
        Model to train. When omitted, a new ``LGBMRanker()`` is created for this instance.
    fit_kwargs : dict, optional
        Extra keyword arguments merged into the arguments built by `prepare_fit_kwargs`;
        on key clashes these values win.
    """

    def __init__(
        self,
        model: tp.Optional[tp.Union[ClassifierBase, RankerBase]] = None,
        fit_kwargs: tp.Optional[tp.Dict[str, tp.Any]] = None,
    ):
        # Build the default ranker per instance. An `LGBMRanker()` written as a default
        # argument is created once, at definition time, and would be shared by every
        # reranker that relies on the default.
        if model is None:
            model = LGBMRanker()
        super().__init__(model)
        self.is_classifier = isinstance(model, ClassifierBase)
        self.fit_kwargs = fit_kwargs

    def _get_group(self, df: pd.DataFrame) -> np.ndarray:
        """Return the number of rows per user, ordered by ascending user id."""
        return df.groupby(Columns.User, sort=True)[Columns.Item].count().to_numpy()

    def prepare_fit_kwargs(self, candidates_with_target: pd.DataFrame) -> tp.Dict[str, tp.Any]:
        """Build the keyword arguments for the model's ``fit`` call.

        Parameters
        ----------
        candidates_with_target : pd.DataFrame
            Candidates with user/item id columns, feature columns and the target column.

        Returns
        -------
        dict
            ``X`` and ``y`` for classifiers; ``X``, ``y`` and ``group`` for rankers;
            updated with the user supplied ``fit_kwargs``.

        Raises
        ------
        ValueError
            If the model is neither a classifier nor a ranker.
        """
        # Rows are sorted by user so that each user's rows are contiguous and appear in
        # the same (ascending user id) order as the group sizes from `_get_group`.
        sorted_candidates = candidates_with_target.sort_values(by=[Columns.User])
        train = sorted_candidates.drop(columns=Columns.UserItem)

        fit_kwargs: tp.Dict[str, tp.Any] = {
            "X": train.drop(columns=Columns.Target),
            "y": train[Columns.Target],
        }

        if not self.is_classifier:
            if not isinstance(self.model, RankerBase):
                raise ValueError("Got unexpected model_type")
            # Group sizes are needed by rankers only; computed before ids are dropped
            fit_kwargs["group"] = self._get_group(sorted_candidates)

        if self.fit_kwargs is not None:
            fit_kwargs.update(self.fit_kwargs)

        return fit_kwargs

In [100]:
# User feature columns passed to LightGBM as categorical features
cat_cols = ["age", "income", "sex"]

# Example parameters for running model training.
# More valid parameters here:
# https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRanker.html#lightgbm.LGBMRanker.fit
fit_params = dict(categorical_feature=cat_cols)

In [101]:
# Now we specify our custom feature collector for TwoStageModel;
# LGBMReranker is created without a model argument, so its default model is used

two_stage_lgbm_ranker = TwoStageModel(
    first_stage_lgbm,
    splitter,
    LGBMReranker(fit_kwargs=fit_params),
    feature_collector=CustomFeatureCollectorLGBM(),
)

In [102]:
# Fit the two-stage model with the LGBMReranker
two_stage_lgbm_ranker.fit(dataset)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 399
[LightGBM] [Info] Number of data points in the train set: 332549, number of used features: 8


<rectools.models.rerank.TwoStageModel at 0x7f787a5979d0>

In [103]:
# Recommend 10 items to every user of the dataset (external ids), with `filter_viewed` enabled
reco_lgbm_ranker = two_stage_lgbm_ranker.recommend(
    users=dataset.user_id_map.external_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [104]:
# Peek at the first recommendations of the LGBMRanker pipeline
reco_lgbm_ranker.head(5)

Unnamed: 0,user_id,item_id,score,rank
11088450,1097557,10440,2.015062,1
11088452,1097557,13865,1.474856,2
11088451,1097557,9728,1.302547,3
11088453,1097557,3734,0.919638,4
11088456,1097557,142,0.449665,5


# CrossValidate

In [95]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.model_selection import cross_validate

In [96]:
# Number of recommendations per user; also the cut-off of the "@10" metrics
K_RECS = 10

# Take a few models to compare
models = {
    "two_stage_gbc": two_stage_gbc,
    "two_stage_catboost_classifier": two_stage_catboost_classifier,
    "two_stage_catboost_ranker": two_stage_catboost_ranker,
    "two_stage_lgbm_classifier": two_stage_lgbm_classifier,
    "two_stage_lgbm_ranker": two_stage_lgbm_ranker,
}

# We will calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@1": Precision(k=1),
    "prec@10": Precision(k=K_RECS),
    "recall@10": Recall(k=K_RECS),
    "novelty@10": MeanInvUserFreq(k=K_RECS),
    "serendipity@10": Serendipity(k=K_RECS),
}

In [97]:
%%time

# Cross-validate every model in `models` with the metrics in `metrics`,
# using the same `splitter` object that was given to the two-stage models
cv_results = cross_validate(
    dataset=dataset,
    splitter=splitter,
    models=models,
    metrics=metrics,
    k=K_RECS,
    filter_viewed=True,
)

[LightGBM] [Info] Number of positive: 59393, number of negative: 255132
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 314525, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.188834 -> initscore=-1.457605
[LightGBM] [Info] Start training from score -1.457605
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 398
[LightGBM] [Info] Number of data points in the train set: 314525, number of used features: 8
CPU times: user 2h 10min 57s, sys: 17min 5s, total: 2h 28min 2s
Wall time: 10min 18s


In [98]:
# Average every metric over the splits; `sort=False` keeps the models in their original order
pivot_results = (
    pd.DataFrame(cv_results["metrics"])
    .drop(columns="i_split")
    .groupby("model", sort=False)
    .mean()
)

# Styled table: worst value per metric in red, best in green
(
    pivot_results.style
    .set_caption("Mean values of metrics")
    .highlight_min(color='lightcoral', axis=0)
    .highlight_max(color='lightgreen', axis=0)
)

Unnamed: 0_level_0,prec@1,prec@10,recall@10,novelty@10,serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two_stage_gbc,0.079063,0.037887,0.186359,5.028077,0.000156
two_stage_catboost_classifier,0.075416,0.037692,0.18552,5.055426,0.000153
two_stage_catboost_ranker,0.083945,0.038354,0.18842,5.035265,0.000157
two_stage_lgbm_classifier,0.076317,0.037998,0.187403,5.058503,0.000157
two_stage_lgbm_ranker,0.081043,0.038042,0.188216,5.04315,0.000154
