From ce388b8fbb19e25114ff213aba4c5bbe15b806b6 Mon Sep 17 00:00:00 2001
From: Daria Tikhonovich
Date: Sun, 14 Jul 2024 14:26:30 +0300
Subject: [PATCH 1/6] fixed popular in category multiple fit
---
CHANGELOG.md | 1 +
rectools/models/popular_in_category.py | 10 +++++++++-
tests/models/test_popular_in_category.py | 24 +++++++++++++++++++-----
3 files changed, 29 insertions(+), 6 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5e362580..d0a57018 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Used the latest version of `lightfm` that allows to install it using `poetry>=1.5.0` ([#141](https://github.com/MobileTeleSystems/RecTools/pull/141))
- Added restriction to `pytorch` version for MacOSX + x86_64 that allows to install it on such platforms ([#142](https://github.com/MobileTeleSystems/RecTools/pull/142))
+- `PopularInCategoryModel` fitting for multiple times and `cross_validate` compatibility
## [0.6.0] - 13.05.2024
diff --git a/rectools/models/popular_in_category.py b/rectools/models/popular_in_category.py
index 4918fd01..ab34e727 100644
--- a/rectools/models/popular_in_category.py
+++ b/rectools/models/popular_in_category.py
@@ -175,7 +175,7 @@ def _define_categories_for_analysis(self) -> None:
if self.n_categories:
if len(self.category_columns) >= self.n_categories:
self.n_effective_categories = self.n_categories
- relevant_categories = self.category_scores.head(self.n_categories).index
+ relevant_categories = self.category_scores.head(self.n_categories).index.to_list()
self.category_scores = self.category_scores.loc[relevant_categories]
self.category_columns = relevant_categories
else:
@@ -188,6 +188,14 @@ def _define_categories_for_analysis(self) -> None:
self.n_effective_categories = len(self.category_columns)
def _fit(self, dataset: Dataset) -> None: # type: ignore
+
+ if self.is_fitted:
+ self.category_columns = []
+ self.category_interactions = {}
+ self.models = {}
+ self.category_scores = pd.Series()
+ self.n_effective_categories = 0
+
self._check_category_feature(dataset)
interactions = self._filter_interactions(dataset.interactions.df)
self._calc_category_scores(dataset, interactions)
diff --git a/tests/models/test_popular_in_category.py b/tests/models/test_popular_in_category.py
index 21c6f53b..b8c510f1 100644
--- a/tests/models/test_popular_in_category.py
+++ b/tests/models/test_popular_in_category.py
@@ -422,11 +422,25 @@ def test_i2i(
actual,
)
- def test_second_fit_refits_model(self, dataset: Dataset) -> None:
+ @pytest.mark.parametrize("popularity", ("mean_weight", "n_users", "n_interactions"))
+ @pytest.mark.parametrize("category_feature", ("f1", "f2"))
+ @pytest.mark.parametrize("mixing_strategy", ("group", "rotate"))
+ @pytest.mark.parametrize("ratio_strategy", ("equal", "proportional"))
+ @pytest.mark.parametrize("n_categories", (2, None))
+ def test_second_fit_refits_model(
+ self,
+ dataset: Dataset,
+ popularity: str,
+ category_feature: str,
+ mixing_strategy: str,
+ ratio_strategy: str,
+ n_categories: tp.Optional[int]
+ ) -> None:
model = PopularInCategoryModel(
- category_feature="f2",
- popularity="mean_weight",
- mixing_strategy="group",
- ratio_strategy="proportional",
+ category_feature=category_feature,
+ popularity=popularity,
+ mixing_strategy=mixing_strategy,
+ ratio_strategy=ratio_strategy,
+ n_categories=n_categories
)
assert_second_fit_refits_model(model, dataset)
From a50cb3a103eb77945dd7227ff12588b8dd56aa39 Mon Sep 17 00:00:00 2001
From: Daria Tikhonovich
Date: Sun, 14 Jul 2024 14:58:45 +0300
Subject: [PATCH 2/6] moved to_list
---
rectools/models/popular_in_category.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/rectools/models/popular_in_category.py b/rectools/models/popular_in_category.py
index ab34e727..e4faa24f 100644
--- a/rectools/models/popular_in_category.py
+++ b/rectools/models/popular_in_category.py
@@ -175,9 +175,9 @@ def _define_categories_for_analysis(self) -> None:
if self.n_categories:
if len(self.category_columns) >= self.n_categories:
self.n_effective_categories = self.n_categories
- relevant_categories = self.category_scores.head(self.n_categories).index.to_list()
+ relevant_categories = self.category_scores.head(self.n_categories).index
self.category_scores = self.category_scores.loc[relevant_categories]
- self.category_columns = relevant_categories
+ self.category_columns = relevant_categories.to_list()
else:
self.n_effective_categories = len(self.category_columns)
warnings.warn(
From ac4021a6ca600812b80947b2f921d595afc1b448 Mon Sep 17 00:00:00 2001
From: Daria Tikhonovich
Date: Tue, 16 Jul 2024 12:45:51 +0300
Subject: [PATCH 3/6] fixed empty category interactions
---
CHANGELOG.md | 2 +-
rectools/models/popular_in_category.py | 14 +++++++++-----
tests/model_selection/test_cross_validate.py | 10 +++++++---
tests/models/test_popular_in_category.py | 8 ++++----
4 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d0a57018..910c6ed7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Used the latest version of `lightfm` that allows to install it using `poetry>=1.5.0` ([#141](https://github.com/MobileTeleSystems/RecTools/pull/141))
- Added restriction to `pytorch` version for MacOSX + x86_64 that allows to install it on such platforms ([#142](https://github.com/MobileTeleSystems/RecTools/pull/142))
-- `PopularInCategoryModel` fitting for multiple times and `cross_validate` compatibility
+- `PopularInCategoryModel` fitting for multiple times, `cross_validate` compatibility, behaviour with empty category interactions
## [0.6.0] - 13.05.2024
diff --git a/rectools/models/popular_in_category.py b/rectools/models/popular_in_category.py
index e4faa24f..16bb4e74 100644
--- a/rectools/models/popular_in_category.py
+++ b/rectools/models/popular_in_category.py
@@ -160,15 +160,19 @@ def _check_category_feature(self, dataset: Dataset) -> None:
def _calc_category_scores(self, dataset: Dataset, interactions: pd.DataFrame) -> None:
scores_dict = {}
+ empty_columns = []
for column_num in self.category_columns:
item_idx = dataset.item_features.values.getcol(column_num).nonzero()[0] # type: ignore
- self.category_interactions[column_num] = interactions[interactions[Columns.Item].isin(item_idx)].copy()
+ category_interactions = interactions[interactions[Columns.Item].isin(item_idx)]
# Category interactions might be empty
- if self.category_interactions[column_num].shape[0] == 0:
- self.category_columns.remove(column_num)
+ if category_interactions.shape[0] == 0:
+ empty_columns.append(column_num)
else:
+ self.category_interactions[column_num] = category_interactions.copy()
col, func = self._get_groupby_col_and_agg_func(self.popularity)
scores_dict[column_num] = self.category_interactions[column_num][col].apply(func)
+ if empty_columns:
+ self.category_columns = [col for col in self.category_columns if col not in empty_columns]
self.category_scores = pd.Series(scores_dict).sort_values(ascending=False)
def _define_categories_for_analysis(self) -> None:
@@ -188,14 +192,14 @@ def _define_categories_for_analysis(self) -> None:
self.n_effective_categories = len(self.category_columns)
def _fit(self, dataset: Dataset) -> None: # type: ignore
-
+
if self.is_fitted:
self.category_columns = []
self.category_interactions = {}
self.models = {}
self.category_scores = pd.Series()
self.n_effective_categories = 0
-
+
self._check_category_feature(dataset)
interactions = self._filter_interactions(dataset.interactions.df)
self._calc_category_scores(dataset, interactions)
diff --git a/tests/model_selection/test_cross_validate.py b/tests/model_selection/test_cross_validate.py
index 213e6ce0..33b02d9c 100644
--- a/tests/model_selection/test_cross_validate.py
+++ b/tests/model_selection/test_cross_validate.py
@@ -28,7 +28,7 @@
from rectools.metrics.base import MetricAtK
from rectools.model_selection import LastNSplitter, cross_validate
from rectools.model_selection.cross_validate import _gen_2x_internal_ids_dataset
-from rectools.models import ImplicitALSWrapperModel, PopularModel, RandomModel
+from rectools.models import ImplicitALSWrapperModel, PopularInCategoryModel, PopularModel, RandomModel
from rectools.models.base import ModelBase
from tests.testing_utils import assert_sparse_matrix_equal
@@ -146,6 +146,7 @@ def setup_method(self) -> None:
[14, "f2", 1],
[11, "f1", "y"],
[11, "f2", 2],
+ [12, "f1", "y"],
],
columns=["id", "feature", "value"],
)
@@ -243,10 +244,11 @@ def test_happy_path(
@pytest.mark.parametrize("prefer_warm_inference_over_cold", (True, False))
def test_happy_path_with_features(self, prefer_warm_inference_over_cold: bool) -> None:
- splitter = LastNSplitter(n=1, n_splits=2, filter_cold_items=False, filter_already_seen=False)
+ splitter = LastNSplitter(n=1, n_splits=2, filter_cold_items=False, filter_already_seen=False) # 2 splits
models: tp.Dict[str, ModelBase] = {
"als": ImplicitALSWrapperModel(AlternatingLeastSquares(factors=2, iterations=2, random_state=42)),
+ "pop_in_cat": PopularInCategoryModel(category_feature="f1", n_categories=2),
}
actual = cross_validate(
@@ -282,7 +284,9 @@ def test_happy_path_with_features(self, prefer_warm_inference_over_cold: bool) -
],
"metrics": [
{"model": "als", "i_split": 0, "precision@2": 0.5, "recall@1": 0.0},
- {"model": "als", "i_split": 1, "precision@2": 0.375, "recall@1": 0.25},
+ {"model": "pop_in_cat", "i_split": 0, "precision@2": 0.5, "recall@1": 0.5},
+ {"model": "als", "i_split": 1, "precision@2": 0.375, "recall@1": 0.0},
+ {"model": "pop_in_cat", "i_split": 1, "precision@2": 0.375, "recall@1": 0.25},
],
}
diff --git a/tests/models/test_popular_in_category.py b/tests/models/test_popular_in_category.py
index b8c510f1..10b274fd 100644
--- a/tests/models/test_popular_in_category.py
+++ b/tests/models/test_popular_in_category.py
@@ -428,19 +428,19 @@ def test_i2i(
@pytest.mark.parametrize("ratio_strategy", ("equal", "proportional"))
@pytest.mark.parametrize("n_categories", (2, None))
def test_second_fit_refits_model(
- self,
+ self,
dataset: Dataset,
popularity: str,
category_feature: str,
mixing_strategy: str,
ratio_strategy: str,
- n_categories: tp.Optional[int]
- ) -> None:
+ n_categories: tp.Optional[int],
+ ) -> None:
model = PopularInCategoryModel(
category_feature=category_feature,
popularity=popularity,
mixing_strategy=mixing_strategy,
ratio_strategy=ratio_strategy,
- n_categories=n_categories
+ n_categories=n_categories,
)
assert_second_fit_refits_model(model, dataset)
From 978e77bfb909b451be6e5c1d26c4894b9c472967 Mon Sep 17 00:00:00 2001
From: Daria Tikhonovich
Date: Tue, 16 Jul 2024 12:48:28 +0300
Subject: [PATCH 4/6] changelog
---
CHANGELOG.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 910c6ed7..12dfce5c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Used the latest version of `lightfm` that allows to install it using `poetry>=1.5.0` ([#141](https://github.com/MobileTeleSystems/RecTools/pull/141))
- Added restriction to `pytorch` version for MacOSX + x86_64 that allows to install it on such platforms ([#142](https://github.com/MobileTeleSystems/RecTools/pull/142))
-- `PopularInCategoryModel` fitting for multiple times, `cross_validate` compatibility, behaviour with empty category interactions
+- `PopularInCategoryModel` fitting multiple times, `cross_validate` compatibility, and behaviour with empty category interactions ([#163](https://github.com/MobileTeleSystems/RecTools/pull/163))
## [0.6.0] - 13.05.2024
From 8c7f6e7bbe4439542f49b8cdf171af8068796f27 Mon Sep 17 00:00:00 2001
From: Daria Tikhonovich
Date: Tue, 16 Jul 2024 12:52:07 +0300
Subject: [PATCH 5/6] added developers board to readme
---
README.md | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 5fa488f4..e116d3ae 100644
--- a/README.md
+++ b/README.md
@@ -16,8 +16,9 @@
Documentation |
Examples |
Tutorials |
- Contribution Guide |
- Release Notes
+ Contributing |
+ Releases |
+ Developers Board
RecTools is an easy-to-use Python library which makes the process of building recommendation systems easier,
From 2ecedcb1528d25bcac01822d09d4906f1108b0ce Mon Sep 17 00:00:00 2001
From: Daria Tikhonovich
Date: Tue, 16 Jul 2024 13:32:19 +0300
Subject: [PATCH 6/6] code style
---
rectools/models/popular_in_category.py | 15 +++++++--------
tests/model_selection/test_cross_validate.py | 2 +-
2 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/rectools/models/popular_in_category.py b/rectools/models/popular_in_category.py
index 16bb4e74..f24f5ee3 100644
--- a/rectools/models/popular_in_category.py
+++ b/rectools/models/popular_in_category.py
@@ -171,8 +171,8 @@ def _calc_category_scores(self, dataset: Dataset, interactions: pd.DataFrame) ->
self.category_interactions[column_num] = category_interactions.copy()
col, func = self._get_groupby_col_and_agg_func(self.popularity)
scores_dict[column_num] = self.category_interactions[column_num][col].apply(func)
- if empty_columns:
- self.category_columns = [col for col in self.category_columns if col not in empty_columns]
+
+ self.category_columns = [col for col in self.category_columns if col not in empty_columns]
self.category_scores = pd.Series(scores_dict).sort_values(ascending=False)
def _define_categories_for_analysis(self) -> None:
@@ -193,12 +193,11 @@ def _define_categories_for_analysis(self) -> None:
def _fit(self, dataset: Dataset) -> None: # type: ignore
- if self.is_fitted:
- self.category_columns = []
- self.category_interactions = {}
- self.models = {}
- self.category_scores = pd.Series()
- self.n_effective_categories = 0
+ self.category_columns = []
+ self.category_interactions = {}
+ self.models = {}
+ self.category_scores = pd.Series()
+ self.n_effective_categories = 0
self._check_category_feature(dataset)
interactions = self._filter_interactions(dataset.interactions.df)
diff --git a/tests/model_selection/test_cross_validate.py b/tests/model_selection/test_cross_validate.py
index 33b02d9c..d5d9dd87 100644
--- a/tests/model_selection/test_cross_validate.py
+++ b/tests/model_selection/test_cross_validate.py
@@ -244,7 +244,7 @@ def test_happy_path(
@pytest.mark.parametrize("prefer_warm_inference_over_cold", (True, False))
def test_happy_path_with_features(self, prefer_warm_inference_over_cold: bool) -> None:
- splitter = LastNSplitter(n=1, n_splits=2, filter_cold_items=False, filter_already_seen=False) # 2 splits
+ splitter = LastNSplitter(n=1, n_splits=2, filter_cold_items=False, filter_already_seen=False)
models: tp.Dict[str, ModelBase] = {
"als": ImplicitALSWrapperModel(AlternatingLeastSquares(factors=2, iterations=2, random_state=42)),