Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Optional `epochs` argument to `ImplicitALSWrapperModel.fit` method ([#203](https://github.com/MobileTeleSystems/RecTools/pull/203))
- `save` and `load` methods to all of the models ([#206](https://github.com/MobileTeleSystems/RecTools/pull/206))
- Model configs example ([#207](https://github.com/MobileTeleSystems/RecTools/pull/207))
- `use_gpu` argument to `ImplicitRanker.rank` method ([#201](https://github.com/MobileTeleSystems/RecTools/pull/201))


## [0.8.0] - 28.08.2024
Expand Down
87 changes: 74 additions & 13 deletions rectools/models/rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@
"""Implicit ranker model."""

import typing as tp
import warnings
from enum import Enum

import implicit.cpu
import implicit.gpu
import numpy as np
from implicit.cpu.matrix_factorization_base import _filter_items_from_sparse_matrix as filter_items_from_sparse_matrix
from implicit.gpu import HAS_CUDA
from scipy import sparse

from rectools import InternalIds
Expand Down Expand Up @@ -74,8 +77,14 @@ def __init__(
self.subjects_dots = self._calc_dots(self.subjects_factors)

def _get_neginf_score(self) -> float:
# Adding 1 to avoid float calculation errors (we're comparing `scores <= neginf_score`)
return float(-np.finfo(np.float32).max + 1)
# neginf_score computed according to implicit gpu FLT_FILTER_DISTANCE
# https://github.com/benfred/implicit/blob/main/implicit/gpu/knn.cu#L36
# we're comparing `scores <= neginf_score`
return float(
np.asarray(
np.asarray(-np.finfo(np.float32).max, dtype=np.float32).view(np.uint32) - 1, dtype=np.uint32
).view(np.float32)
)

@staticmethod
def _calc_dots(factors: np.ndarray) -> np.ndarray:
Expand Down Expand Up @@ -132,13 +141,50 @@ def _process_implicit_scores(

return all_target_ids, np.concatenate(all_reco_ids), np.concatenate(all_scores)

def rank(
def _rank_on_gpu(
    self,
    object_factors: np.ndarray,
    subject_factors: tp.Union[np.ndarray, sparse.csr_matrix],
    k: int,
    object_norms: tp.Optional[np.ndarray],
    filter_query_items: tp.Optional[tp.Union[sparse.csr_matrix, sparse.csr_array]],
) -> tp.Tuple[np.ndarray, np.ndarray]:  # pragma: no cover
    """Rank objects for each subject using implicit's GPU knn kernel.

    Parameters
    ----------
    object_factors : np.ndarray
        Latent factors of the ranked objects (items).
    subject_factors : np.ndarray | sparse.csr_matrix
        Latent factors of the query subjects (users).
        Sparse input is densified with a warning — the GPU kernel
        requires a dense matrix.
    k : int
        Number of top-scored objects to return per subject.
    object_norms : np.ndarray, optional
        Per-object norms passed to the kernel (used e.g. for COSINE distance).
    filter_query_items : csr_matrix | csr_array, optional
        Subjects x objects matrix; present pairs are excluded from results.

    Returns
    -------
    (ids, scores)
        Object ids (int) and float64 scores, one row of ``k`` entries
        per subject.
    """
    object_factors = implicit.gpu.Matrix(object_factors.astype(np.float32))

    if isinstance(subject_factors, sparse.spmatrix):
        warnings.warn("Sparse subject factors converted to Dense matrix")
        # `toarray` yields an ndarray; `todense` would return the legacy
        # np.matrix type, which is deprecated and behaves differently.
        subject_factors = subject_factors.toarray()

    subject_factors = implicit.gpu.Matrix(subject_factors.astype(np.float32))

    if object_norms is not None:
        if np.ndim(object_norms) == 1:
            # Kernel expects a 2-d (1 x n_objects) norms matrix.
            object_norms = np.expand_dims(object_norms, axis=0)
        # Cast to float32 for consistency with the factor matrices.
        object_norms = implicit.gpu.Matrix(np.asarray(object_norms, dtype=np.float32))

    if filter_query_items is not None:
        filter_query_items = implicit.gpu.COOMatrix(filter_query_items.tocoo())

    ids, scores = implicit.gpu.KnnQuery().topk(  # pylint: disable=c-extension-no-member
        items=object_factors,
        m=subject_factors,
        k=k,
        item_norms=object_norms,
        query_filter=filter_query_items,
        item_filter=None,  # rectools doesn't support object blacklist for now
    )

    return ids, scores.astype(np.float64)

def rank( # pylint: disable=too-many-branches
self,
subject_ids: InternalIds,
k: int,
filter_pairs_csr: tp.Optional[sparse.csr_matrix] = None,
sorted_object_whitelist: tp.Optional[InternalIdsArray] = None,
num_threads: int = 0,
use_gpu: bool = False,
) -> tp.Tuple[InternalIds, InternalIds, Scores]:
"""Rank objects to proceed inference using implicit library topk cpu method.

Expand All @@ -156,7 +202,9 @@ def rank(
If given, only these items will be used for recommendations.
Otherwise all items from dataset will be used.
num_threads : int, default 0
Will be used as `num_threads` parameter for `implicit.cpu.topk.topk`.
Will be used as `num_threads` parameter for `implicit.cpu.topk.topk`. Omitted if use_gpu is True
use_gpu : bool, default False
If True `implicit.gpu.KnnQuery().topk` will be used instead of classic cpu version.

Returns
-------
Expand Down Expand Up @@ -191,15 +239,28 @@ def rank(

real_k = min(k, object_factors.shape[0])

ids, scores = implicit.cpu.topk.topk( # pylint: disable=c-extension-no-member
items=object_factors,
query=subject_factors,
k=real_k,
item_norms=object_norms, # query norms for COSINE distance are applied afterwards
filter_query_items=filter_query_items, # queries x objects csr matrix for getting neginf scores
filter_items=None, # rectools doesn't support blacklist for now
num_threads=num_threads,
)
if use_gpu and not HAS_CUDA:
warnings.warn("Forced rank() on CPU")
use_gpu = False

if use_gpu: # pragma: no cover
ids, scores = self._rank_on_gpu(
object_factors=object_factors,
subject_factors=subject_factors,
k=real_k,
object_norms=object_norms,
filter_query_items=filter_query_items,
)
else:
ids, scores = implicit.cpu.topk.topk( # pylint: disable=c-extension-no-member
items=object_factors,
query=subject_factors,
k=real_k,
item_norms=object_norms, # query norms for COSINE distance are applied afterwards
filter_query_items=filter_query_items, # queries x objects csr matrix for getting neginf scores
filter_items=None, # rectools doesn't support blacklist for now
num_threads=num_threads,
)

if sorted_object_whitelist is not None:
ids = sorted_object_whitelist[ids]
Expand Down
63 changes: 39 additions & 24 deletions tests/models/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@
class TestImplicitRanker: # pylint: disable=protected-access
@pytest.fixture
def subject_factors(self) -> np.ndarray:
    # Two query subjects; second row is non-zero so COSINE / EUCLIDEAN
    # scores are well-defined for both subjects.
    return np.array([[-4, 0, 3], [0, 1, 2]])

@pytest.fixture
def object_factors(self) -> np.ndarray:
    # Three ranked objects; rows chosen so DOT / COSINE / EUCLIDEAN
    # produce distinct, easily distinguishable orderings in the tests.
    return np.array(
        [
            [-4, 0, 3],
            [0, 2, 4],
            [1, 10, 100],
        ]
    )

Expand All @@ -60,7 +60,7 @@ def test_neginf_score(self, subject_factors: np.ndarray, object_factors: np.ndar
k=1,
filter_items=np.array([0]),
)[1][0][0]
assert neginf == implicit_ranker._get_neginf_score()
assert neginf <= implicit_ranker._get_neginf_score() <= -1e38

@pytest.mark.parametrize(
"dense",
Expand Down Expand Up @@ -94,12 +94,18 @@ def test_mask_for_correct_scores(
@pytest.mark.parametrize(
"distance, expected_recs, expected_scores, dense",
(
(Distance.DOT, [0, 1, 2, 2, 1, 0], [25, 0, -1, 0, 0, 0], True),
(Distance.COSINE, [0, 1, 2, 2, 1, 0], [1, 0, -1 / (5 * 3**0.5), 0, 0, 0], True),
(Distance.EUCLIDEAN, [0, 1, 2, 1, 2, 0], [0, 5, 30**0.5, 0, 3**0.5, 5], True),
(Distance.DOT, [0, 1, 2, 2, 1, 0], [25, 0, -1, 0, 0, 0], False),
(Distance.DOT, [2, 0, 1, 2, 1, 0], [296, 25, 12, 210, 10, 6], True),
(Distance.COSINE, [0, 2, 1, 1, 2, 0], [1, 0.5890328, 0.5366563, 1, 0.9344414, 0.5366563], True),
(
Distance.EUCLIDEAN,
[0, 1, 2, 1, 0, 2],
[0, 4.58257569, 97.64220399, 2.23606798, 4.24264069, 98.41747812],
True,
),
(Distance.DOT, [2, 0, 1, 2, 1, 0], [296, 25, 12, 210, 10, 6], False),
),
)
@pytest.mark.parametrize("use_gpu", (False, True))
def test_rank(
self,
distance: Distance,
Expand All @@ -108,24 +114,26 @@ def test_rank(
subject_factors: np.ndarray,
object_factors: np.ndarray,
dense: bool,
use_gpu: bool,
) -> None:
if not dense:
subject_factors = sparse.csr_matrix(subject_factors)

ranker = ImplicitRanker(distance, subject_factors, object_factors)
_, actual_recs, actual_scores = ranker.rank(subject_ids=[0, 1], k=3)
_, actual_recs, actual_scores = ranker.rank(subject_ids=[0, 1], k=3, use_gpu=use_gpu)
np.testing.assert_equal(actual_recs, expected_recs)
np.testing.assert_almost_equal(actual_scores, expected_scores)

@pytest.mark.parametrize(
"distance, expected_recs, expected_scores, dense",
(
(Distance.DOT, [0, 2, 2, 1, 0], [25, -1, 0, 0, 0], True),
(Distance.COSINE, [0, 2, 2, 1, 0], [1, -1 / (5 * 3**0.5), 0, 0, 0], True),
(Distance.EUCLIDEAN, [0, 2, 1, 2, 0], [0, 30**0.5, 0, 3**0.5, 5], True),
(Distance.DOT, [0, 2, 2, 1, 0], [25, -1, 0, 0, 0], False),
(Distance.DOT, [2, 0, 2, 1, 0], [296, 25, 210, 10, 6], True),
(Distance.COSINE, [0, 2, 1, 2, 0], [1, 0.5890328, 1, 0.9344414, 0.5366563], True),
(Distance.EUCLIDEAN, [0, 2, 1, 0, 2], [0, 97.64220399, 2.23606798, 4.24264069, 98.41747812], True),
(Distance.DOT, [2, 0, 2, 1, 0], [296, 25, 210, 10, 6], False),
),
)
@pytest.mark.parametrize("use_gpu", (False, True))
def test_rank_with_filtering_viewed_items(
self,
distance: Distance,
Expand All @@ -134,6 +142,7 @@ def test_rank_with_filtering_viewed_items(
subject_factors: np.ndarray,
object_factors: np.ndarray,
dense: bool,
use_gpu: bool,
) -> None:
if not dense:
subject_factors = sparse.csr_matrix(subject_factors)
Expand All @@ -145,19 +154,20 @@ def test_rank_with_filtering_viewed_items(
]
)
ranker = ImplicitRanker(distance, subject_factors, object_factors)
_, actual_recs, actual_scores = ranker.rank(subject_ids=[0, 1], k=3, filter_pairs_csr=ui_csr)
_, actual_recs, actual_scores = ranker.rank(subject_ids=[0, 1], k=3, filter_pairs_csr=ui_csr, use_gpu=use_gpu)
np.testing.assert_equal(actual_recs, expected_recs)
np.testing.assert_almost_equal(actual_scores, expected_scores)

@pytest.mark.parametrize(
"distance, expected_recs, expected_scores, dense",
(
(Distance.DOT, [0, 2, 2, 0], [25, -1, 0, 0], True),
(Distance.COSINE, [0, 2, 2, 0], [1, -1 / (5 * 3**0.5), 0, 0], True),
(Distance.EUCLIDEAN, [0, 2, 2, 0], [0, 30**0.5, 3**0.5, 5], True),
(Distance.DOT, [0, 2, 2, 0], [25, -1, 0, 0], False),
(Distance.DOT, [2, 0, 2, 0], [296, 25, 210, 6], True),
(Distance.COSINE, [0, 2, 2, 0], [1, 0.5890328, 0.9344414, 0.5366563], True),
(Distance.EUCLIDEAN, [0, 2, 0, 2], [0, 97.64220399, 4.24264069, 98.41747812], True),
(Distance.DOT, [2, 0, 2, 0], [296, 25, 210, 6], False),
),
)
@pytest.mark.parametrize("use_gpu", (False, True))
def test_rank_with_objects_whitelist(
self,
distance: Distance,
Expand All @@ -166,25 +176,29 @@ def test_rank_with_objects_whitelist(
subject_factors: np.ndarray,
object_factors: np.ndarray,
dense: bool,
use_gpu: bool,
) -> None:
if not dense:
subject_factors = sparse.csr_matrix(subject_factors)

ranker = ImplicitRanker(distance, subject_factors, object_factors)

_, actual_recs, actual_scores = ranker.rank(subject_ids=[0, 1], k=3, sorted_object_whitelist=np.array([0, 2]))
_, actual_recs, actual_scores = ranker.rank(
subject_ids=[0, 1], k=3, sorted_object_whitelist=np.array([0, 2]), use_gpu=use_gpu
)
np.testing.assert_equal(actual_recs, expected_recs)
np.testing.assert_almost_equal(actual_scores, expected_scores)

@pytest.mark.parametrize(
"distance, expected_recs, expected_scores, dense",
(
(Distance.DOT, [2, 2, 0], [-1, 0, 0], True),
(Distance.COSINE, [2, 2, 0], [-1 / (5 * 3**0.5), 0, 0], True),
(Distance.EUCLIDEAN, [2, 2, 0], [30**0.5, 3**0.5, 5], True),
(Distance.DOT, [2, 2, 0], [-1, 0, 0], False),
(Distance.DOT, [2, 2, 0], [296, 210, 6], True),
(Distance.COSINE, [2, 2, 0], [0.5890328, 0.9344414, 0.5366563], True),
(Distance.EUCLIDEAN, [2, 0, 2], [97.64220399, 4.24264069, 98.41747812], True),
(Distance.DOT, [2, 2, 0], [296, 210, 6], False),
),
)
@pytest.mark.parametrize("use_gpu", (False, True))
def test_rank_with_objects_whitelist_and_filtering_viewed_items(
self,
distance: Distance,
Expand All @@ -193,6 +207,7 @@ def test_rank_with_objects_whitelist_and_filtering_viewed_items(
subject_factors: np.ndarray,
object_factors: np.ndarray,
dense: bool,
use_gpu: bool,
) -> None:
if not dense:
subject_factors = sparse.csr_matrix(subject_factors)
Expand All @@ -205,7 +220,7 @@ def test_rank_with_objects_whitelist_and_filtering_viewed_items(
)
ranker = ImplicitRanker(distance, subject_factors, object_factors)
_, actual_recs, actual_scores = ranker.rank(
subject_ids=[0, 1], k=3, sorted_object_whitelist=np.array([0, 2]), filter_pairs_csr=ui_csr
subject_ids=[0, 1], k=3, sorted_object_whitelist=np.array([0, 2]), filter_pairs_csr=ui_csr, use_gpu=use_gpu
)
np.testing.assert_equal(actual_recs, expected_recs)
np.testing.assert_almost_equal(actual_scores, expected_scores)
Expand Down