Skip to content
Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Extended Theory&Practice RecSys baselines tutorial ([#139](https://github.com/MobileTeleSystems/RecTools/pull/139))
- `MetricsApp` to create plotly scatterplot widgets for metric-to-metric trade-off analysis ([#140](https://github.com/MobileTeleSystems/RecTools/pull/140))
- `Intersection` metric ([#148](https://github.com/MobileTeleSystems/RecTools/pull/148))

### Fixed
- Used the latest version of `lightfm` that allows to install it using `poetry>=1.5.0` ([#141](https://github.com/MobileTeleSystems/RecTools/pull/141))
Expand Down
3 changes: 3 additions & 0 deletions rectools/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
`metrics.AvgRecPopularity`
`metrics.Serendipity`
`metrics.HitRate`
`metrics.Intersection`

Tools
-----
Expand All @@ -50,6 +51,7 @@
SparsePairwiseHammingDistanceCalculator,
)
from .diversity import IntraListDiversity
from .intersection import Intersection
from .novelty import MeanInvUserFreq
from .popularity import AvgRecPopularity
from .ranking import MAP, MRR, NDCG
Expand All @@ -74,4 +76,5 @@
"PairwiseDistanceCalculator",
"PairwiseHammingDistanceCalculator",
"SparsePairwiseHammingDistanceCalculator",
"Intersection",
)
137 changes: 137 additions & 0 deletions rectools/metrics/intersection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
from typing import Dict, Hashable, Optional, Union

import attr
import numpy as np
import pandas as pd

from rectools import Columns
from rectools.metrics.base import MetricAtK
from rectools.metrics.classification import Recall
from rectools.utils import select_by_type


@attr.s(auto_attribs=True)
class Intersection(MetricAtK):
    """
    Metric to measure intersection in user-item pairs between recommendation lists.

    The intersection@k equals the share of ``reco`` that is present in ``ref_reco``.

    This corresponds to the following algorithm:
        1) filter ``reco`` by ``k``
        2) filter ``ref_reco`` by ``ref_k``
        3) calculate the proportion of items in ``reco`` that are also present in ``ref_reco``
    The second and third steps are equivalent to computing Recall@ref_k when:
        - Interactions consist of ``reco`` without the `Columns.Rank` column.
        - Recommendation table is ``ref_reco``

    Parameters
    ----------
    k : int
        Number of items in top of recommendations list that will be used to calculate metric.
    ref_k : int, optional
        Number of items in top of reference recommendations list that will be used to calculate metric.
        If ``ref_k`` is None then ``ref_reco`` will be filtered with ``ref_k = k``. Default: None.
    """

    ref_k: Optional[int] = attr.ib(default=None)

    def calc(self, reco: pd.DataFrame, ref_reco: pd.DataFrame) -> float:
        """
        Calculate metric value.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
        ref_reco : pd.DataFrame
            Reference recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.

        Returns
        -------
        float
            Value of metric (average between users).
        """
        per_user = self.calc_per_user(reco, ref_reco)
        return per_user.mean()

    def calc_per_user(self, reco: pd.DataFrame, ref_reco: pd.DataFrame) -> pd.Series:
        """
        Calculate metric values for all users.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
        ref_reco : pd.DataFrame
            Reference recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.

        Returns
        -------
        pd.Series:
            Values of metric (index - user id, values - metric value for every user).

        Raises
        ------
        KeyError
            If ``ref_reco`` is missing any of the required columns.
        """
        self._check(reco)
        # Explicit validation instead of `assert`: asserts are stripped under `python -O`,
        # which would silently skip this check in optimized runs.
        required_columns = {Columns.User, Columns.Item, Columns.Rank}
        missing = required_columns - set(ref_reco.columns)
        if missing:
            raise KeyError(f"Missing columns in `ref_reco`: {missing}")

        if ref_reco.shape[0] == 0:
            # No reference recommendations -> empty per-user result (no users to compare).
            return pd.Series(index=pd.Series(name=Columns.User, dtype=int), dtype=np.float64)

        if ref_reco is reco:
            # Identical object: intersection is trivially 1.0 for every recommended user,
            # so skip the Recall computation entirely.
            return pd.Series(
                data=1,
                index=pd.Series(data=reco[Columns.User].unique(), name=Columns.User, dtype=int),
                dtype=np.float64,
            )

        filtered_reco = reco[reco[Columns.Rank] <= self.k]

        # BUGFIX: do not mutate `self.ref_k` here. The previous code assigned
        # `self.ref_k = self.k` on first call, permanently changing the metric
        # object's configuration and making later calls (e.g. after changing `k`,
        # or reusing the instance) behave differently. Compute the effective
        # value locally instead.
        effective_ref_k = self.k if self.ref_k is None else self.ref_k
        recall = Recall(k=effective_ref_k)

        # Recall@ref_k with `filtered_reco` playing the role of interactions
        # yields exactly the intersection share described in the class docstring.
        return recall.calc_per_user(ref_reco, filtered_reco[Columns.UserItem])


IntersectionMetric = Intersection


def calc_intersection_metrics(
    metrics: Dict[str, IntersectionMetric],
    reco: pd.DataFrame,
    ref_reco: Union[pd.DataFrame, Dict[Hashable, pd.DataFrame]],
) -> Dict[str, float]:
    """
    Calculate intersection metrics.

    Warning: It is not recommended to use this function directly.
    Use `calc_metrics` instead.

    Parameters
    ----------
    metrics : dict(str -> IntersectionMetric)
        Dict of metric objects to calculate,
        where key is metric name and value is metric object.
    reco : pd.DataFrame
        Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
    ref_reco : Union[pd.DataFrame, Dict[Hashable, pd.DataFrame]]
        Reference recommendations table(s) with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.

    Returns
    -------
    dict(str->float)
        Dictionary where keys are the same as keys in `metrics`
        and values are metric calculation results.
    """
    intersection_metrics: Dict[str, Intersection] = select_by_type(metrics, Intersection)

    # Single reference table: one result per metric, keyed by the metric name.
    if isinstance(ref_reco, pd.DataFrame):
        return {name: metric.calc(reco, ref_reco) for name, metric in intersection_metrics.items()}

    # Multiple reference tables: one result per (metric, reference) pair,
    # keyed as "<metric name>_<reference key>".
    return {
        f"{name}_{ref_key}": metric.calc(reco, single_ref_reco)
        for name, metric in intersection_metrics.items()
        for ref_key, single_ref_reco in ref_reco.items()
    }
25 changes: 23 additions & 2 deletions rectools/metrics/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,20 @@
from .base import Catalog, MetricAtK, merge_reco
from .classification import ClassificationMetric, SimpleClassificationMetric, calc_classification_metrics
from .diversity import DiversityMetric, calc_diversity_metrics
from .intersection import IntersectionMetric, calc_intersection_metrics
from .novelty import NoveltyMetric, calc_novelty_metrics
from .popularity import PopularityMetric, calc_popularity_metrics
from .ranking import RankingMetric, calc_ranking_metrics
from .serendipity import SerendipityMetric, calc_serendipity_metrics


def calc_metrics( # noqa # pylint: disable=too-many-branches
def calc_metrics( # noqa # pylint: disable=too-many-branches,too-many-locals,too-many-statements
Comment thread
blondered marked this conversation as resolved.
metrics: tp.Dict[str, MetricAtK],
reco: pd.DataFrame,
interactions: tp.Optional[pd.DataFrame] = None,
prev_interactions: tp.Optional[pd.DataFrame] = None,
catalog: tp.Optional[Catalog] = None,
ref_reco: tp.Optional[tp.Union[pd.DataFrame, tp.Dict[tp.Hashable, pd.DataFrame]]] = None,
) -> tp.Dict[str, float]:
"""
Calculate metrics.
Expand All @@ -57,6 +59,11 @@ def calc_metrics( # noqa # pylint: disable=too-many-branches
catalog : collection, optional
Collection of unique item ids that could be used for recommendations.
Obligatory only if `ClassificationMetric` or `SerendipityMetric` instances present in `metrics`.
ref_reco : Union[pd.DataFrame, Dict[Hashable, pd.DataFrame]], optional
Reference recommendations table(s) with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
For multiple intersection calculations we can pass multiple models recommendations in a dict:
``ref_reco = {"one": ref_reco_one, "two": ref_reco_two}``
Obligatory only if `IntersectionMetric` instances present in `metrics`.

Returns
-------
Expand Down Expand Up @@ -164,6 +171,20 @@ def calc_metrics( # noqa # pylint: disable=too-many-branches
)
results.update(serendipity_values)

if len(results) < len(metrics):
# Intersection
intersection_metrics = select_by_type(metrics, IntersectionMetric)
intersection_additional_metrics_len = 0
if intersection_metrics:
if not ref_reco:
raise ValueError("For calculating intersection metrics it's necessary to set 'ref_reco'")
intersection_values = calc_intersection_metrics(
intersection_metrics,
reco,
ref_reco,
)
results.update(intersection_values)
intersection_additional_metrics_len += len(intersection_values) - len(intersection_metrics)

if len(results) < len(metrics) + intersection_additional_metrics_len:
warnings.warn("Custom metrics are not supported.")
return results
97 changes: 90 additions & 7 deletions rectools/model_selection/cross_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,60 @@ def _handle_features(features: tp.Optional[Features], id_map: IdMap) -> tp.Tuple
return dataset


def _get_ref_reco(
    ref_models: tp.List[str],
    models: tp.Dict[str, ModelBase],
    test_users: np.ndarray,
    fold_dataset: Dataset,
    k: int,
    filter_viewed: bool,
    internal_item_ids_to_recommend: tp.Optional[np.ndarray] = None,
) -> tp.Dict[str, pd.DataFrame]:
    """
    Build the `ref_reco` argument for the calc_metrics() method: fit each
    reference model on the fold dataset and collect its recommendations.

    Parameters
    ----------
    ref_models: list(str)
        The keys from `models` argument to fit model and get reference recommendations.
    models : dict(str -> ModelBase)
        Dict of initialized model objects from which we select reference models,
        where key is model name and value is model object.
    test_users: np.ndarray
        Array of user ids to recommend for.
    fold_dataset: Dataset
        Dataset that contains 2nd level of internal ids.
    k: int
        Derived number of recommendations for every user.
        For some models actual number of recommendations may be less than `k`.
    filter_viewed: bool
        Whether to filter from recommendations items that user has already interacted with.
    internal_item_ids_to_recommend: np.ndarray, optional
        Whitelist of internal item ids.
        If given, only these items will be used for recommendations.

    Returns
    -------
    dict(str -> pd.DataFrame)
        A dictionary in which the keys are the model names from `ref_models`
        and the values are the recommendations for those models.
    """
    reference_recos: tp.Dict[str, pd.DataFrame] = {}

    for name in ref_models:
        reference_model = models[name]
        # Each reference model is (re)fitted on the current fold before recommending.
        reference_model.fit(fold_dataset)
        reference_recos[name] = reference_model.recommend(
            users=test_users,
            dataset=fold_dataset,
            k=k,
            filter_viewed=filter_viewed,
            items_to_recommend=internal_item_ids_to_recommend,
        )

    return reference_recos


def cross_validate( # pylint: disable=too-many-locals
dataset: Dataset,
splitter: Splitter,
Expand All @@ -61,6 +115,8 @@ def cross_validate( # pylint: disable=too-many-locals
filter_viewed: bool,
items_to_recommend: tp.Optional[ExternalIds] = None,
prefer_warm_inference_over_cold: bool = True,
ref_models: tp.Optional[tp.List[str]] = None,
Comment thread
azatnv marked this conversation as resolved.
validate_ref_models: bool = False,
) -> tp.Dict[str, tp.Any]:
"""
Run cross validation on multiple models with multiple metrics.
Expand All @@ -76,7 +132,7 @@ def cross_validate( # pylint: disable=too-many-locals
where key is metric name and value is metric object.
models : dict(str -> ModelBase)
Dict of initialized model objects to fit and measure quality,
where key is metric name and value is metric object.
where key is model name and value is model object.
k : int
Derived number of recommendations for every user.
For some models actual number of recommendations may be less than `k`.
Expand All @@ -90,6 +146,13 @@ def cross_validate( # pylint: disable=too-many-locals
Set to `True` to enable "warm" recommendations for all applicable models.
Set to `False` to treat all new users and items as "cold" and not to provide features for them.
If new users and items are filtered from test in splitter, this argument has no effect.
ref_models : list(str), optional
The keys from `models` argument to compute intersection metrics. These models
recommendations will be used as `ref_reco` for other models intersection metrics calculation.
Obligatory only if `IntersectionMetric` instances present in `metrics`.
validate_ref_models : bool
If True include models specified in `ref_models` to all metrics calculations
and receive their metrics from cross-validation. Default: False.

Returns
-------
Expand Down Expand Up @@ -137,21 +200,41 @@ def cross_validate( # pylint: disable=too-many-locals
else:
item_ids_to_recommend = None

for model_name, model in models.items():
model.fit(fold_dataset)
reco = model.recommend( # 1x internal
users=test_users,
dataset=fold_dataset,
ref_reco: tp.Dict[str, pd.DataFrame] = {}
if ref_models is not None:
ref_reco = _get_ref_reco(
ref_models=ref_models,
models=models,
test_users=test_users,
fold_dataset=fold_dataset,
k=k,
filter_viewed=filter_viewed,
items_to_recommend=item_ids_to_recommend,
internal_item_ids_to_recommend=item_ids_to_recommend,
)

for model_name, model in models.items():
if model_name in ref_reco and not validate_ref_models:
continue

if model_name in ref_reco:
reco = ref_reco[model_name]
else:
model.fit(fold_dataset)
reco = model.recommend( # 1x internal
users=test_users,
dataset=fold_dataset,
k=k,
filter_viewed=filter_viewed,
items_to_recommend=item_ids_to_recommend,
)

metric_values = calc_metrics(
metrics,
reco=reco,
interactions=interactions_df_test,
prev_interactions=interactions_df_train,
catalog=catalog,
ref_reco=ref_reco,
)
res = {"model": model_name, "i_split": split_info["i_split"]}
res.update(metric_values)
Expand Down
Loading