From ce388b8fbb19e25114ff213aba4c5bbe15b806b6 Mon Sep 17 00:00:00 2001 From: Daria Tikhonovich Date: Sun, 14 Jul 2024 14:26:30 +0300 Subject: [PATCH 1/6] fixed popular in category multiple fit --- CHANGELOG.md | 1 + rectools/models/popular_in_category.py | 10 +++++++++- tests/models/test_popular_in_category.py | 24 +++++++++++++++++++----- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e362580..d0a57018 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Used the latest version of `lightfm` that allows to install it using `poetry>=1.5.0` ([#141](https://github.com/MobileTeleSystems/RecTools/pull/141)) - Added restriction to `pytorch` version for MacOSX + x86_64 that allows to install it on such platforms ([#142](https://github.com/MobileTeleSystems/RecTools/pull/142)) +- `PopularInCategoryModel` fitting for multiple times and `cross_validate` compatibility ## [0.6.0] - 13.05.2024 diff --git a/rectools/models/popular_in_category.py b/rectools/models/popular_in_category.py index 4918fd01..ab34e727 100644 --- a/rectools/models/popular_in_category.py +++ b/rectools/models/popular_in_category.py @@ -175,7 +175,7 @@ def _define_categories_for_analysis(self) -> None: if self.n_categories: if len(self.category_columns) >= self.n_categories: self.n_effective_categories = self.n_categories - relevant_categories = self.category_scores.head(self.n_categories).index + relevant_categories = self.category_scores.head(self.n_categories).index.to_list() self.category_scores = self.category_scores.loc[relevant_categories] self.category_columns = relevant_categories else: @@ -188,6 +188,14 @@ def _define_categories_for_analysis(self) -> None: self.n_effective_categories = len(self.category_columns) def _fit(self, dataset: Dataset) -> None: # type: ignore + + if self.is_fitted: + self.category_columns = [] + self.category_interactions = {} + self.models = {} + self.category_scores = pd.Series() + self.n_effective_categories = 0 + self._check_category_feature(dataset) interactions = self._filter_interactions(dataset.interactions.df) self._calc_category_scores(dataset, interactions) diff --git a/tests/models/test_popular_in_category.py b/tests/models/test_popular_in_category.py index 21c6f53b..b8c510f1 100644 --- a/tests/models/test_popular_in_category.py +++ b/tests/models/test_popular_in_category.py @@ -422,11 +422,25 @@ def test_i2i( actual, ) - def test_second_fit_refits_model(self, dataset: Dataset) -> None: + @pytest.mark.parametrize("popularity", ("mean_weight", "n_users", "n_interactions")) + @pytest.mark.parametrize("category_feature", ("f1", "f2")) + @pytest.mark.parametrize("mixing_strategy", ("group", "rotate")) + @pytest.mark.parametrize("ratio_strategy", ("equal", "proportional")) + @pytest.mark.parametrize("n_categories", (2, None)) + def test_second_fit_refits_model( + self, + dataset: Dataset, + popularity: str, + category_feature: str, + mixing_strategy: str, + ratio_strategy: str, + n_categories: tp.Optional[int] + ) -> None: model = PopularInCategoryModel( - category_feature="f2", - popularity="mean_weight", - mixing_strategy="group", - ratio_strategy="proportional", + category_feature=category_feature, + popularity=popularity, + mixing_strategy=mixing_strategy, + ratio_strategy=ratio_strategy, + n_categories=n_categories ) assert_second_fit_refits_model(model, dataset) From a50cb3a103eb77945dd7227ff12588b8dd56aa39 Mon Sep 17 00:00:00 2001 From: Daria Tikhonovich Date: Sun, 14 Jul 2024 14:58:45 +0300 Subject: [PATCH 2/6] moded to_list --- rectools/models/popular_in_category.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rectools/models/popular_in_category.py b/rectools/models/popular_in_category.py index ab34e727..e4faa24f 100644 --- a/rectools/models/popular_in_category.py +++ b/rectools/models/popular_in_category.py @@ -175,9 +175,9 @@ def _define_categories_for_analysis(self) -> None: if self.n_categories: if len(self.category_columns) >= self.n_categories: self.n_effective_categories = self.n_categories - relevant_categories = self.category_scores.head(self.n_categories).index.to_list() + relevant_categories = self.category_scores.head(self.n_categories).index self.category_scores = self.category_scores.loc[relevant_categories] - self.category_columns = relevant_categories + self.category_columns = relevant_categories.to_list() else: self.n_effective_categories = len(self.category_columns) warnings.warn( From ac4021a6ca600812b80947b2f921d595afc1b448 Mon Sep 17 00:00:00 2001 From: Daria Tikhonovich Date: Tue, 16 Jul 2024 12:45:51 +0300 Subject: [PATCH 3/6] fixed empty category interactions --- CHANGELOG.md | 2 +- rectools/models/popular_in_category.py | 14 +++++++++----- tests/model_selection/test_cross_validate.py | 10 +++++++--- tests/models/test_popular_in_category.py | 8 ++++---- 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0a57018..910c6ed7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Used the latest version of `lightfm` that allows to install it using `poetry>=1.5.0` ([#141](https://github.com/MobileTeleSystems/RecTools/pull/141)) - Added restriction to `pytorch` version for MacOSX + x86_64 that allows to install it on such platforms ([#142](https://github.com/MobileTeleSystems/RecTools/pull/142)) -- `PopularInCategoryModel` fitting for multiple times and `cross_validate` compatibility +- `PopularInCategoryModel` fitting for multiple times, `cross_validate` compatibility, behaviour with empty category interactions ## [0.6.0] - 13.05.2024 diff --git a/rectools/models/popular_in_category.py b/rectools/models/popular_in_category.py index e4faa24f..16bb4e74 100644 --- a/rectools/models/popular_in_category.py +++ b/rectools/models/popular_in_category.py @@ -160,15 +160,19 @@ def _check_category_feature(self, dataset: Dataset) -> None: def _calc_category_scores(self, dataset: Dataset, interactions: pd.DataFrame) -> None: scores_dict = {} + empty_columns = [] for column_num in self.category_columns: item_idx = dataset.item_features.values.getcol(column_num).nonzero()[0] # type: ignore - self.category_interactions[column_num] = interactions[interactions[Columns.Item].isin(item_idx)].copy() + category_interactions = interactions[interactions[Columns.Item].isin(item_idx)] # Category interactions might be empty - if self.category_interactions[column_num].shape[0] == 0: - self.category_columns.remove(column_num) + if category_interactions.shape[0] == 0: + empty_columns.append(column_num) else: + self.category_interactions[column_num] = category_interactions.copy() col, func = self._get_groupby_col_and_agg_func(self.popularity) scores_dict[column_num] = self.category_interactions[column_num][col].apply(func) + if empty_columns: + self.category_columns = [col for col in self.category_columns if col not in empty_columns] self.category_scores = pd.Series(scores_dict).sort_values(ascending=False) def _define_categories_for_analysis(self) -> None: @@ -188,14 +192,14 @@ def _define_categories_for_analysis(self) -> None: self.n_effective_categories = len(self.category_columns) def _fit(self, dataset: Dataset) -> None: # type: ignore - + if self.is_fitted: self.category_columns = [] self.category_interactions = {} self.models = {} self.category_scores = pd.Series() self.n_effective_categories = 0 - + self._check_category_feature(dataset) interactions = self._filter_interactions(dataset.interactions.df) self._calc_category_scores(dataset, interactions) diff --git a/tests/model_selection/test_cross_validate.py b/tests/model_selection/test_cross_validate.py index 213e6ce0..33b02d9c 100644 --- a/tests/model_selection/test_cross_validate.py +++ b/tests/model_selection/test_cross_validate.py @@ -28,7 +28,7 @@ from rectools.metrics.base import MetricAtK from rectools.model_selection import LastNSplitter, cross_validate from rectools.model_selection.cross_validate import _gen_2x_internal_ids_dataset -from rectools.models import ImplicitALSWrapperModel, PopularModel, RandomModel +from rectools.models import ImplicitALSWrapperModel, PopularInCategoryModel, PopularModel, RandomModel from rectools.models.base import ModelBase from tests.testing_utils import assert_sparse_matrix_equal @@ -146,6 +146,7 @@ def setup_method(self) -> None: [14, "f2", 1], [11, "f1", "y"], [11, "f2", 2], + [12, "f1", "y"], ], columns=["id", "feature", "value"], ) @@ -243,10 +244,11 @@ def test_happy_path( @pytest.mark.parametrize("prefer_warm_inference_over_cold", (True, False)) def test_happy_path_with_features(self, prefer_warm_inference_over_cold: bool) -> None: - splitter = LastNSplitter(n=1, n_splits=2, filter_cold_items=False, filter_already_seen=False) + splitter = LastNSplitter(n=1, n_splits=2, filter_cold_items=False, filter_already_seen=False) # 2 splits models: tp.Dict[str, ModelBase] = { "als": ImplicitALSWrapperModel(AlternatingLeastSquares(factors=2, iterations=2, random_state=42)), + "pop_in_cat": PopularInCategoryModel(category_feature="f1", n_categories=2), } actual = cross_validate( @@ -282,7 +284,9 @@ def test_happy_path_with_features(self, prefer_warm_inference_over_cold: bool) - ], "metrics": [ {"model": "als", "i_split": 0, "precision@2": 0.5, "recall@1": 0.0}, - {"model": "als", "i_split": 1, "precision@2": 0.375, "recall@1": 0.25}, + {"model": "pop_in_cat", "i_split": 0, "precision@2": 0.5, "recall@1": 0.5}, + {"model": "als", "i_split": 1, "precision@2": 0.375, "recall@1": 0.0}, + {"model": "pop_in_cat", "i_split": 1, "precision@2": 0.375, "recall@1": 0.25}, ], } diff --git a/tests/models/test_popular_in_category.py b/tests/models/test_popular_in_category.py index b8c510f1..10b274fd 100644 --- a/tests/models/test_popular_in_category.py +++ b/tests/models/test_popular_in_category.py @@ -428,19 +428,19 @@ def test_i2i( @pytest.mark.parametrize("ratio_strategy", ("equal", "proportional")) @pytest.mark.parametrize("n_categories", (2, None)) def test_second_fit_refits_model( - self, + self, dataset: Dataset, popularity: str, category_feature: str, mixing_strategy: str, ratio_strategy: str, - n_categories: tp.Optional[int] - ) -> None: + n_categories: tp.Optional[int], + ) -> None: model = PopularInCategoryModel( category_feature=category_feature, popularity=popularity, mixing_strategy=mixing_strategy, ratio_strategy=ratio_strategy, - n_categories=n_categories + n_categories=n_categories, ) assert_second_fit_refits_model(model, dataset) From 978e77bfb909b451be6e5c1d26c4894b9c472967 Mon Sep 17 00:00:00 2001 From: Daria Tikhonovich Date: Tue, 16 Jul 2024 12:48:28 +0300 Subject: [PATCH 4/6] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 910c6ed7..12dfce5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Used the latest version of `lightfm` that allows to install it using `poetry>=1.5.0` ([#141](https://github.com/MobileTeleSystems/RecTools/pull/141)) - Added restriction to `pytorch` version for MacOSX + x86_64 that allows to install it on such platforms ([#142](https://github.com/MobileTeleSystems/RecTools/pull/142)) -- `PopularInCategoryModel` fitting for multiple times, `cross_validate` compatibility, behaviour with empty category interactions +- `PopularInCategoryModel` fitting for multiple times, `cross_validate` compatibility, behaviour with empty category interactions ([#163](https://github.com/MobileTeleSystems/RecTools/pull/163)) ## [0.6.0] - 13.05.2024 From 8c7f6e7bbe4439542f49b8cdf171af8068796f27 Mon Sep 17 00:00:00 2001 From: Daria Tikhonovich Date: Tue, 16 Jul 2024 12:52:07 +0300 Subject: [PATCH 5/6] added developers board to readme --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5fa488f4..e116d3ae 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,9 @@ Documentation | Examples | Tutorials | - Contribution Guide | - Release Notes + Contributing | + Releases | + Developers Board

RecTools is an easy-to-use Python library which makes the process of building recommendation systems easier, From 2ecedcb1528d25bcac01822d09d4906f1108b0ce Mon Sep 17 00:00:00 2001 From: Daria Tikhonovich Date: Tue, 16 Jul 2024 13:32:19 +0300 Subject: [PATCH 6/6] code style --- rectools/models/popular_in_category.py | 15 +++++++-------- tests/model_selection/test_cross_validate.py | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/rectools/models/popular_in_category.py b/rectools/models/popular_in_category.py index 16bb4e74..f24f5ee3 100644 --- a/rectools/models/popular_in_category.py +++ b/rectools/models/popular_in_category.py @@ -171,8 +171,8 @@ def _calc_category_scores(self, dataset: Dataset, interactions: pd.DataFrame) -> self.category_interactions[column_num] = category_interactions.copy() col, func = self._get_groupby_col_and_agg_func(self.popularity) scores_dict[column_num] = self.category_interactions[column_num][col].apply(func) - if empty_columns: - self.category_columns = [col for col in self.category_columns if col not in empty_columns] + + self.category_columns = [col for col in self.category_columns if col not in empty_columns] self.category_scores = pd.Series(scores_dict).sort_values(ascending=False) def _define_categories_for_analysis(self) -> None: @@ -193,12 +193,11 @@ def _define_categories_for_analysis(self) -> None: def _fit(self, dataset: Dataset) -> None: # type: ignore - if self.is_fitted: - self.category_columns = [] - self.category_interactions = {} - self.models = {} - self.category_scores = pd.Series() - self.n_effective_categories = 0 + self.category_columns = [] + self.category_interactions = {} + self.models = {} + self.category_scores = pd.Series() + self.n_effective_categories = 0 self._check_category_feature(dataset) interactions = self._filter_interactions(dataset.interactions.df) diff --git a/tests/model_selection/test_cross_validate.py b/tests/model_selection/test_cross_validate.py index 33b02d9c..d5d9dd87 100644 --- a/tests/model_selection/test_cross_validate.py +++ b/tests/model_selection/test_cross_validate.py @@ -244,7 +244,7 @@ def test_happy_path( @pytest.mark.parametrize("prefer_warm_inference_over_cold", (True, False)) def test_happy_path_with_features(self, prefer_warm_inference_over_cold: bool) -> None: - splitter = LastNSplitter(n=1, n_splits=2, filter_cold_items=False, filter_already_seen=False) # 2 splits + splitter = LastNSplitter(n=1, n_splits=2, filter_cold_items=False, filter_already_seen=False) models: tp.Dict[str, ModelBase] = { "als": ImplicitALSWrapperModel(AlternatingLeastSquares(factors=2, iterations=2, random_state=42)),