From fc7ea207efa2a78899053219a8ab4d24967435bb Mon Sep 17 00:00:00 2001
From: Vadim Vetrov <vetrovvd@gmail.com>
Date: Fri, 1 Nov 2024 21:04:39 +0300
Subject: [PATCH] GPU support for ImplicitRanker

Add a `use_gpu` argument to the `ImplicitRanker.rank` method.
Adapt the tests accordingly.
---
 CHANGELOG.md              |  1 +
 rectools/models/rank.py   | 87 +++++++++++++++++++++++++++++++++------
 tests/models/test_rank.py | 63 +++++++++++++++++-----------
 3 files changed, 114 insertions(+), 37 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 546a7706..40c42e29 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Optional `epochs` argument to `ImplicitALSWrapperModel.fit` method ([#203](https://github.com/MobileTeleSystems/RecTools/pull/203))
 - `save` and `load` methods to all of the models ([#206](https://github.com/MobileTeleSystems/RecTools/pull/206))
 - Model configs example ([#207](https://github.com/MobileTeleSystems/RecTools/pull/207))
+- `use_gpu` argument to `ImplicitRanker.rank` method ([#201](https://github.com/MobileTeleSystems/RecTools/pull/201))
 
 
 ## [0.8.0] - 28.08.2024
diff --git a/rectools/models/rank.py b/rectools/models/rank.py
index a8ce6549..fabc389d 100644
--- a/rectools/models/rank.py
+++ b/rectools/models/rank.py
@@ -15,11 +15,14 @@
 """Implicit ranker model."""
 
 import typing as tp
+import warnings
 from enum import Enum
 
 import implicit.cpu
+import implicit.gpu
 import numpy as np
 from implicit.cpu.matrix_factorization_base import _filter_items_from_sparse_matrix as filter_items_from_sparse_matrix
+from implicit.gpu import HAS_CUDA
 from scipy import sparse
 
 from rectools import InternalIds
@@ -74,8 +77,14 @@ def __init__(
             self.subjects_dots = self._calc_dots(self.subjects_factors)
 
     def _get_neginf_score(self) -> float:
-        # Adding 1 to avoid float calculation errors (we're comparing `scores <= neginf_score`)
-        return float(-np.finfo(np.float32).max + 1)
+        # neginf_score is the float32 next above -FLT_MAX, following implicit gpu FLT_FILTER_DISTANCE
+        # https://github.com/benfred/implicit/blob/main/implicit/gpu/knn.cu#L36
+        # we're comparing `scores <= neginf_score`
+        return float(
+            np.asarray(
+                np.asarray(-np.finfo(np.float32).max, dtype=np.float32).view(np.uint32) - 1, dtype=np.uint32
+            ).view(np.float32)
+        )
 
     @staticmethod
     def _calc_dots(factors: np.ndarray) -> np.ndarray:
@@ -132,13 +141,50 @@ def _process_implicit_scores(
 
         return all_target_ids, np.concatenate(all_reco_ids), np.concatenate(all_scores)
 
-    def rank(
+    def _rank_on_gpu(
+        self,
+        object_factors: np.ndarray,
+        subject_factors: tp.Union[np.ndarray, sparse.csr_matrix],
+        k: int,
+        object_norms: tp.Optional[np.ndarray],
+        filter_query_items: tp.Optional[tp.Union[sparse.csr_matrix, sparse.csr_array]],
+    ) -> tp.Tuple[np.ndarray, np.ndarray]:  # pragma: no cover
+        object_factors = implicit.gpu.Matrix(object_factors.astype(np.float32))
+
+        if isinstance(subject_factors, sparse.spmatrix):
+            warnings.warn("Sparse subject factors converted to Dense matrix")
+            subject_factors = subject_factors.todense()
+
+        subject_factors = implicit.gpu.Matrix(subject_factors.astype(np.float32))
+
+        if object_norms is not None:
+            if len(np.shape(object_norms)) == 1:
+                object_norms = np.expand_dims(object_norms, axis=0)
+            object_norms = implicit.gpu.Matrix(object_norms)
+
+        if filter_query_items is not None:
+            filter_query_items = implicit.gpu.COOMatrix(filter_query_items.tocoo())
+
+        ids, scores = implicit.gpu.KnnQuery().topk(  # pylint: disable=c-extension-no-member
+            items=object_factors,
+            m=subject_factors,
+            k=k,
+            item_norms=object_norms,
+            query_filter=filter_query_items,
+            item_filter=None,
+        )
+
+        scores = scores.astype(np.float64)
+        return ids, scores
+
+    def rank(  # pylint: disable=too-many-branches
         self,
         subject_ids: InternalIds,
         k: int,
         filter_pairs_csr: tp.Optional[sparse.csr_matrix] = None,
         sorted_object_whitelist: tp.Optional[InternalIdsArray] = None,
         num_threads: int = 0,
+        use_gpu: bool = False,
     ) -> tp.Tuple[InternalIds, InternalIds, Scores]:
         """Rank objects to proceed inference using implicit library topk cpu method.
 
@@ -156,7 +202,9 @@ def rank(
             If given, only these items will be used for recommendations.
             Otherwise all items from dataset will be used.
         num_threads : int, default 0
-            Will be used as `num_threads` parameter for `implicit.cpu.topk.topk`.
+            Will be used as `num_threads` parameter for `implicit.cpu.topk.topk`. Ignored if `use_gpu` is True.
+        use_gpu : bool, default False
+            If True, use `implicit.gpu.KnnQuery().topk` instead of the CPU version; falls back to CPU without CUDA.
 
         Returns
         -------
@@ -191,15 +239,28 @@ def rank(
 
         real_k = min(k, object_factors.shape[0])
 
-        ids, scores = implicit.cpu.topk.topk(  # pylint: disable=c-extension-no-member
-            items=object_factors,
-            query=subject_factors,
-            k=real_k,
-            item_norms=object_norms,  # query norms for COSINE distance are applied afterwards
-            filter_query_items=filter_query_items,  # queries x objects csr matrix for getting neginf scores
-            filter_items=None,  # rectools doesn't support blacklist for now
-            num_threads=num_threads,
-        )
+        if use_gpu and not HAS_CUDA:
+            warnings.warn("Forced rank() on CPU")
+            use_gpu = False
+
+        if use_gpu:  # pragma: no cover
+            ids, scores = self._rank_on_gpu(
+                object_factors=object_factors,
+                subject_factors=subject_factors,
+                k=real_k,
+                object_norms=object_norms,
+                filter_query_items=filter_query_items,
+            )
+        else:
+            ids, scores = implicit.cpu.topk.topk(  # pylint: disable=c-extension-no-member
+                items=object_factors,
+                query=subject_factors,
+                k=real_k,
+                item_norms=object_norms,  # query norms for COSINE distance are applied afterwards
+                filter_query_items=filter_query_items,  # queries x objects csr matrix for getting neginf scores
+                filter_items=None,  # rectools doesn't support blacklist for now
+                num_threads=num_threads,
+            )
 
         if sorted_object_whitelist is not None:
             ids = sorted_object_whitelist[ids]
diff --git a/tests/models/test_rank.py b/tests/models/test_rank.py
index 9bc4da37..56f86af7 100644
--- a/tests/models/test_rank.py
+++ b/tests/models/test_rank.py
@@ -29,15 +29,15 @@
 class TestImplicitRanker:  # pylint: disable=protected-access
     @pytest.fixture
     def subject_factors(self) -> np.ndarray:
-        return np.array([[-4, 0, 3], [0, 0, 0]])
+        return np.array([[-4, 0, 3], [0, 1, 2]])
 
     @pytest.fixture
     def object_factors(self) -> np.ndarray:
         return np.array(
             [
                 [-4, 0, 3],
-                [0, 0, 0],
-                [1, 1, 1],
+                [0, 2, 4],
+                [1, 10, 100],
             ]
         )
 
@@ -60,7 +60,7 @@ def test_neginf_score(self, subject_factors: np.ndarray, object_factors: np.ndar
             k=1,
             filter_items=np.array([0]),
         )[1][0][0]
-        assert neginf == implicit_ranker._get_neginf_score()
+        assert neginf <= implicit_ranker._get_neginf_score() <= -1e38
 
     @pytest.mark.parametrize(
         "dense",
@@ -94,12 +94,18 @@ def test_mask_for_correct_scores(
     @pytest.mark.parametrize(
         "distance, expected_recs, expected_scores, dense",
         (
-            (Distance.DOT, [0, 1, 2, 2, 1, 0], [25, 0, -1, 0, 0, 0], True),
-            (Distance.COSINE, [0, 1, 2, 2, 1, 0], [1, 0, -1 / (5 * 3**0.5), 0, 0, 0], True),
-            (Distance.EUCLIDEAN, [0, 1, 2, 1, 2, 0], [0, 5, 30**0.5, 0, 3**0.5, 5], True),
-            (Distance.DOT, [0, 1, 2, 2, 1, 0], [25, 0, -1, 0, 0, 0], False),
+            (Distance.DOT, [2, 0, 1, 2, 1, 0], [296, 25, 12, 210, 10, 6], True),
+            (Distance.COSINE, [0, 2, 1, 1, 2, 0], [1, 0.5890328, 0.5366563, 1, 0.9344414, 0.5366563], True),
+            (
+                Distance.EUCLIDEAN,
+                [0, 1, 2, 1, 0, 2],
+                [0, 4.58257569, 97.64220399, 2.23606798, 4.24264069, 98.41747812],
+                True,
+            ),
+            (Distance.DOT, [2, 0, 1, 2, 1, 0], [296, 25, 12, 210, 10, 6], False),
         ),
     )
+    @pytest.mark.parametrize("use_gpu", (False, True))
     def test_rank(
         self,
         distance: Distance,
@@ -108,24 +114,26 @@ def test_rank(
         subject_factors: np.ndarray,
         object_factors: np.ndarray,
         dense: bool,
+        use_gpu: bool,
     ) -> None:
         if not dense:
             subject_factors = sparse.csr_matrix(subject_factors)
 
         ranker = ImplicitRanker(distance, subject_factors, object_factors)
-        _, actual_recs, actual_scores = ranker.rank(subject_ids=[0, 1], k=3)
+        _, actual_recs, actual_scores = ranker.rank(subject_ids=[0, 1], k=3, use_gpu=use_gpu)
         np.testing.assert_equal(actual_recs, expected_recs)
         np.testing.assert_almost_equal(actual_scores, expected_scores)
 
     @pytest.mark.parametrize(
         "distance, expected_recs, expected_scores, dense",
         (
-            (Distance.DOT, [0, 2, 2, 1, 0], [25, -1, 0, 0, 0], True),
-            (Distance.COSINE, [0, 2, 2, 1, 0], [1, -1 / (5 * 3**0.5), 0, 0, 0], True),
-            (Distance.EUCLIDEAN, [0, 2, 1, 2, 0], [0, 30**0.5, 0, 3**0.5, 5], True),
-            (Distance.DOT, [0, 2, 2, 1, 0], [25, -1, 0, 0, 0], False),
+            (Distance.DOT, [2, 0, 2, 1, 0], [296, 25, 210, 10, 6], True),
+            (Distance.COSINE, [0, 2, 1, 2, 0], [1, 0.5890328, 1, 0.9344414, 0.5366563], True),
+            (Distance.EUCLIDEAN, [0, 2, 1, 0, 2], [0, 97.64220399, 2.23606798, 4.24264069, 98.41747812], True),
+            (Distance.DOT, [2, 0, 2, 1, 0], [296, 25, 210, 10, 6], False),
         ),
     )
+    @pytest.mark.parametrize("use_gpu", (False, True))
     def test_rank_with_filtering_viewed_items(
         self,
         distance: Distance,
@@ -134,6 +142,7 @@ def test_rank_with_filtering_viewed_items(
         subject_factors: np.ndarray,
         object_factors: np.ndarray,
         dense: bool,
+        use_gpu: bool,
     ) -> None:
         if not dense:
             subject_factors = sparse.csr_matrix(subject_factors)
@@ -145,19 +154,20 @@ def test_rank_with_filtering_viewed_items(
             ]
         )
         ranker = ImplicitRanker(distance, subject_factors, object_factors)
-        _, actual_recs, actual_scores = ranker.rank(subject_ids=[0, 1], k=3, filter_pairs_csr=ui_csr)
+        _, actual_recs, actual_scores = ranker.rank(subject_ids=[0, 1], k=3, filter_pairs_csr=ui_csr, use_gpu=use_gpu)
         np.testing.assert_equal(actual_recs, expected_recs)
         np.testing.assert_almost_equal(actual_scores, expected_scores)
 
     @pytest.mark.parametrize(
         "distance, expected_recs, expected_scores, dense",
         (
-            (Distance.DOT, [0, 2, 2, 0], [25, -1, 0, 0], True),
-            (Distance.COSINE, [0, 2, 2, 0], [1, -1 / (5 * 3**0.5), 0, 0], True),
-            (Distance.EUCLIDEAN, [0, 2, 2, 0], [0, 30**0.5, 3**0.5, 5], True),
-            (Distance.DOT, [0, 2, 2, 0], [25, -1, 0, 0], False),
+            (Distance.DOT, [2, 0, 2, 0], [296, 25, 210, 6], True),
+            (Distance.COSINE, [0, 2, 2, 0], [1, 0.5890328, 0.9344414, 0.5366563], True),
+            (Distance.EUCLIDEAN, [0, 2, 0, 2], [0, 97.64220399, 4.24264069, 98.41747812], True),
+            (Distance.DOT, [2, 0, 2, 0], [296, 25, 210, 6], False),
         ),
     )
+    @pytest.mark.parametrize("use_gpu", (False, True))
     def test_rank_with_objects_whitelist(
         self,
         distance: Distance,
@@ -166,25 +176,29 @@ def test_rank_with_objects_whitelist(
         subject_factors: np.ndarray,
         object_factors: np.ndarray,
         dense: bool,
+        use_gpu: bool,
     ) -> None:
         if not dense:
             subject_factors = sparse.csr_matrix(subject_factors)
 
         ranker = ImplicitRanker(distance, subject_factors, object_factors)
 
-        _, actual_recs, actual_scores = ranker.rank(subject_ids=[0, 1], k=3, sorted_object_whitelist=np.array([0, 2]))
+        _, actual_recs, actual_scores = ranker.rank(
+            subject_ids=[0, 1], k=3, sorted_object_whitelist=np.array([0, 2]), use_gpu=use_gpu
+        )
         np.testing.assert_equal(actual_recs, expected_recs)
         np.testing.assert_almost_equal(actual_scores, expected_scores)
 
     @pytest.mark.parametrize(
         "distance, expected_recs, expected_scores, dense",
         (
-            (Distance.DOT, [2, 2, 0], [-1, 0, 0], True),
-            (Distance.COSINE, [2, 2, 0], [-1 / (5 * 3**0.5), 0, 0], True),
-            (Distance.EUCLIDEAN, [2, 2, 0], [30**0.5, 3**0.5, 5], True),
-            (Distance.DOT, [2, 2, 0], [-1, 0, 0], False),
+            (Distance.DOT, [2, 2, 0], [296, 210, 6], True),
+            (Distance.COSINE, [2, 2, 0], [0.5890328, 0.9344414, 0.5366563], True),
+            (Distance.EUCLIDEAN, [2, 0, 2], [97.64220399, 4.24264069, 98.41747812], True),
+            (Distance.DOT, [2, 2, 0], [296, 210, 6], False),
         ),
     )
+    @pytest.mark.parametrize("use_gpu", (False, True))
     def test_rank_with_objects_whitelist_and_filtering_viewed_items(
         self,
         distance: Distance,
@@ -193,6 +207,7 @@ def test_rank_with_objects_whitelist_and_filtering_viewed_items(
         subject_factors: np.ndarray,
         object_factors: np.ndarray,
         dense: bool,
+        use_gpu: bool,
     ) -> None:
         if not dense:
             subject_factors = sparse.csr_matrix(subject_factors)
@@ -205,7 +220,7 @@ def test_rank_with_objects_whitelist_and_filtering_viewed_items(
         )
         ranker = ImplicitRanker(distance, subject_factors, object_factors)
         _, actual_recs, actual_scores = ranker.rank(
-            subject_ids=[0, 1], k=3, sorted_object_whitelist=np.array([0, 2]), filter_pairs_csr=ui_csr
+            subject_ids=[0, 1], k=3, sorted_object_whitelist=np.array([0, 2]), filter_pairs_csr=ui_csr, use_gpu=use_gpu
         )
         np.testing.assert_equal(actual_recs, expected_recs)
         np.testing.assert_almost_equal(actual_scores, expected_scores)