In [156]:
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix

In [139]:
# Toy prediction-time interactions (rows = users, columns = items).
# Only users 3 and 4 have interactions in this matrix.
X_dense = np.array([
    [0, 0, 0],
    [0, 0, 0],
    [0, 0, 0],
    [1, 1, 0],
    [0, 1, 1],
])
X = csr_matrix(X_dense)

# Toy training interactions using the same user/item layout as X.
training_dense = np.array([
    [0, 1, 1],
    [1, 0, 1],
    [1, 1, 1],
    [1, 0, 0],
    [0, 1, 0],
])
training_interactions = csr_matrix(training_dense)

In [140]:
def union_csr_matrices(a, b):
    """Return the element-wise union of two sparse interaction matrices.

    Both inputs are cast to boolean, so any non-zero value counts as an
    interaction. Adding the boolean matrices acts as a logical OR, and the
    result is converted back to floats.

    Parameters
    ----------
    a, b : scipy.sparse matrix
        Matrices of identical shape.

    Returns
    -------
    scipy.sparse.csr_matrix
        Float matrix holding 1.0 wherever `a` or `b` is non-zero.
    """
    # Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20
    # and removed in 1.24, where it raises AttributeError.
    return csr_matrix(a.astype(bool) + b.astype(bool)) * 1.0

In [141]:
# Combine the memoized training interactions with the predict interactions.
# The result is a float matrix with 1.0 wherever either input has an interaction.
combined_interactions = union_csr_matrices(training_interactions, X)
combined_interactions.toarray()

array([[0., 1., 1.],
       [1., 0., 1.],
       [1., 1., 1.],
       [1., 1., 0.],
       [0., 1., 1.]])

In [142]:
# Row indices of the users that have at least one interaction in X.
# np.unique returns them sorted and de-duplicated, and .tolist() converts them
# to plain Python ints, so order and repr do not depend on set iteration order
# or on the NumPy version.
pred_users = np.unique(X.nonzero()[0]).tolist()
pred_users

[3, 4]

In [143]:
# Column-vector indicator (n_users, 1): 1 for users being predicted, 0 otherwise.
# The column shape makes `.multiply(mask)` broadcast across the item axis.
mask = np.zeros((combined_interactions.shape[0], 1))
mask[pred_users, 0] = 1
mask

array([[0.],
       [0.],
       [0.],
       [1.],
       [1.]])

In [144]:
# Sanity check: mask must have one row per user and a single column so it
# broadcasts over the item columns of combined_interactions.
mask.shape, combined_interactions.shape

((5, 1), (5, 3))

In [145]:
# Keep only the rows of the users being predicted; every other row becomes zero.
masked_interactions = combined_interactions.multiply(mask)
combined_interactions_selected_users = csr_matrix(masked_interactions)
combined_interactions_selected_users.toarray()

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [1., 1., 0.],
       [0., 1., 1.]])

In [146]:
# Compute the interactions that are only in the prediction matrix:
# take the masked union and subtract the training interactions of the same users.
training_interactions_selected_users = training_interactions.multiply(mask)
combined_interactions_only_predict = (
    combined_interactions_selected_users - training_interactions_selected_users
)
# TODO: check that this keeps working with overlap
combined_interactions_only_predict.toarray()

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [147]:
# Per-user interaction count over the masked union (train + predict).
# Rows of users that are not being predicted were zeroed out, so they sum to 0.
pred_user_interaction_counts = combined_interactions_selected_users.sum(axis=1)
pred_user_interaction_counts

matrix([[0.],
        [0.],
        [0.],
        [2.],
        [2.]])

In [148]:
# Number of training interactions per user (row sums), as an (n_users, 1) matrix.
train_user_counts = training_interactions.sum(axis=1)
train_user_counts


matrix([[2],
        [2],
        [3],
        [1],
        [1]])

In [149]:
# Number of training interactions per item (column sums), as a (1, n_items) matrix.
train_item_counts = training_interactions.sum(axis=0)
train_item_counts

matrix([[3, 3, 3]])

In [150]:
# Repeat the single row of per-item training counts once for every user in X.
np.repeat(train_item_counts, X.shape[0], axis=0)

matrix([[3, 3, 3],
        [3, 3, 3],
        [3, 3, 3],
        [3, 3, 3],
        [3, 3, 3]])

In [151]:
# Per-user item counts: the training count of every item, plus one for each
# item the user interacted with only in the prediction matrix.
train_item_counts_per_user = np.vstack([train_item_counts] * X.shape[0])
item_counts_per_user = train_item_counts_per_user + combined_interactions_only_predict
item_counts_per_user

matrix([[3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.],
        [3., 4., 3.],
        [3., 3., 4.]])

In [152]:
def invert(x):
    """Element-wise reciprocal that leaves zeros at zero.

    Returns a new float ndarray with the shape of `x`, holding `1 / x` at
    every non-zero position of `x` and 0 elsewhere, so no division by zero
    is ever performed.
    """
    nonzero_idx = x.nonzero()
    reciprocal = np.zeros(x.shape)
    reciprocal[nonzero_idx] = 1 / x[nonzero_idx]
    return reciprocal

In [155]:
# Left factor: prediction users' combined interactions, scaled by
# 1/sqrt(user interaction count) and 1/sqrt(per-user item count).
# invert() keeps zero counts at zero instead of dividing by zero.
pred_side_scaled = combined_interactions_selected_users.multiply(
    invert(np.sqrt(pred_user_interaction_counts))
).multiply(
    invert(np.sqrt(item_counts_per_user))
)

# Right factor: training interactions scaled by 1/sqrt(training user count).
train_side_scaled = training_interactions.multiply(
    invert(np.sqrt(train_user_counts))
)

# (users x items) @ (items x users) -> user-by-user similarity matrix.
similarities = pred_side_scaled @ train_side_scaled.T
# Clear the diagonal so a user gets no similarity with their own training row.
similarities.setdiag(0)
similarities.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.25      , 0.28867513, 0.43982641, 0.        , 0.35355339],
       [0.53867513, 0.25      , 0.43982641, 0.        , 0.        ]])

In [116]:
# Hand check of similarities[3, 1]: users 3 and 1 share only item 0, so the
# value is 1 / sqrt(c(u)=2 * c(v)=2 * c(i)=3).
1/(2*2*3)**.5

0.2886751345948129

In [31]:
        # NOTE(review): this is a draft method body pasted outside its class; it
        # fails with NameError because `self` is undefined in the notebook. The
        # step-by-step prototype in the cells above is the working reference.
        # NOTE(review): `users_to_predict` is assigned but never used below.
        users_to_predict = set(X.nonzero()[0])

        # Combine the memoized training interactions with the predict interactions
        combined_interactions = self._union_csr_matrices(self.training_interactions_, X)
        # cut combined interactions to only nonzero users in prediction matrix.
        # NOTE(review): `get_users` is not defined in the visible cells; presumably
        # it returns the row indices of users with interactions in X — confirm.
        pred_users = get_users(X)
        # NOTE(review): the mask stays 1-D here, whereas the prototype uses a column
        # vector of shape (n_users, 1) — confirm it broadcasts over rows, not columns.
        mask = np.zeros(combined_interactions.shape[0])
        mask[pred_users] = 1
        combined_interactions_selected_users = csr_matrix(
            combined_interactions.multiply(mask)
        )

        # Compute the interactions that are only in the prediction matrix.
        # NOTE(review): this multiplies by the index list `pred_users`; the prototype
        # multiplies by `mask` at this step — likely a typo, confirm.
        combined_interactions_only_predict = (
            combined_interactions_selected_users
            - self.training_interactions_.multiply(pred_users)
        )

        pred_user_interaction_counts = combined_interactions_selected_users.sum(axis=1)

        train_user_counts = self.training_interactions_.sum(axis=1)
        train_item_counts = self.training_interactions_.sum(axis=0)

        # These are the c(i) values in the paper.
        # We have to account for items that occur both in train and predict,
        # but can only use interactions in the X matrix for the user we are
        # computing similarities for (to avoid leakage of data).
        # We do this by taking the count in the training matrix per item,
        # vertically stacking these values to get the counts for each user,
        # and then adding the interactions of prediction users that occur only in the X dataset.
        # This gives us, per user, the accurate count of items,
        # taking into account only their own history from the prediction dataset.
        # NOTE(review): only a single copy of `train_item_counts` is stacked here;
        # the prototype stacks one copy per row of X — confirm the shapes line up.
        item_counts_per_user = (
            np.vstack([train_item_counts]) + combined_interactions_only_predict
        )

        # sim(u, v) = sum over shared items i of 1 / sqrt(c(u) * c(v) * c(i))
        #   (form taken from the prototype's hand check — TODO confirm against the paper)
        # Similarities are computed by matrix multiplication of two scaled interaction matrices:
        # the training matrix is scaled by dividing each interaction by the square root of the number of user interactions.
        # The combined interactions for prediction users are scaled by dividing by the square root of user interactions
        # and by the square root of the interactions with the item.
        # NOTE(review): the expression below does not match that description or the prototype:
        #   - it multiplies by `item_counts_per_user` instead of its inverse square root;
        #   - it transposes the prediction side, whereas the prototype transposes the training side;
        #   - it additionally scales the training side by 1 / sqrt(train_item_counts);
        #   - plain `1 / np.sqrt(...)` divides by zero for rows whose count is 0
        #     (masked-out users); the prototype uses `invert()` to avoid that.
        # fmt:off
        similarities = (
            combined_interactions_selected_users.multiply(
                1 / np.sqrt(pred_user_interaction_counts)
            ).multiply(item_counts_per_user).T
            @
            self.training_interactions_.multiply(
                1 / np.sqrt(train_user_counts)
            ).multiply(
                1 / np.sqrt(train_item_counts)
            )
        )
        # fmt:on

        similarities.setdiag(0)


NameError: name 'self' is not defined

In [95]:
# Scratch check: np.invert is a bitwise NOT, not a reciprocal ([1, 2] -> [-2, -3]),
# so it cannot be used to invert the count matrices.
np.invert(x)

array([-2, -3])

In [97]:
# Show the scratch array used in the np.invert check.
# NOTE(review): `x` is not assigned in any visible cell.
x

array([1, 2])

In [157]:
# Scratch: empty List-of-Lists sparse matrix with the same shape as `similarities`.
a = lil_matrix(similarities.shape)

In [158]:
# Show the repr of the empty LIL matrix (shape, dtype, stored element count).
a

<5x5 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in List of Lists format>