Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cosine full vector implementation #141

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/prediction_algorithms.rst
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ argument is a dictionary with the following (all optional) keys:
``'False'``) for the similarity not to be zero. Simply put, if
:math:`|I_{uv}| < \text{min_support}` then :math:`\text{sim}(u, v) = 0`. The
same goes for items.
- ``'common_ratings_only'``: Determines whether only common user/item ratings are
taken into account or the full rating vectors are considered
(only relevant for cosine-based similarity). Default is ``True``.
- ``'shrinkage'``: Shrinkage parameter to apply (only relevant for
:func:`pearson_baseline <surprise.similarities.pearson_baseline>` similarity).
Default is 100.
Expand Down
4 changes: 4 additions & 0 deletions surprise/prediction_algorithms/algo_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,10 @@ def compute_similarities(self):
bx, by = bi, bu

args += [self.trainset.global_mean, bx, by, shrinkage]
elif name == 'cosine':
common_ratings_only = self.sim_options.get('common_ratings_only',
True)
args += [common_ratings_only]

try:
print('Computing the {0} similarity matrix...'.format(name))
Expand Down
101 changes: 98 additions & 3 deletions surprise/similarities.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,10 @@ from six.moves import range
from six import iteritems


def cosine(n_x, yr, min_support):
def cosine(n_x, yr, min_support, common_ratings_only=True):
"""Compute the cosine similarity between all pairs of users (or items).

Only **common** users (or items) are taken into account. The cosine
similarity is defined as:
The cosine similarity is defined as:

.. math::
\\text{cosine_sim}(u, v) = \\frac{
Expand All @@ -52,8 +51,20 @@ def cosine(n_x, yr, min_support):

For details on cosine similarity, see on `Wikipedia
<https://en.wikipedia.org/wiki/Cosine_similarity#Definition>`__.

Depending on ``common_ratings_only`` field of ``sim_options``
only common users (or items) are taken into account, or full rating
vectors (default: ``True``).
"""

if common_ratings_only:
return cosine_common_ratings_only(n_x, yr, min_support)
else:
return cosine_full_rating_vectors(n_x, yr, min_support)


def cosine_common_ratings_only(n_x, yr, min_support):

# sum (r_xy * r_x'y) for common ys
cdef np.ndarray[np.double_t, ndim=2] prods
# number of common ys
Expand All @@ -78,6 +89,90 @@ def cosine(n_x, yr, min_support):
for y, y_ratings in iteritems(yr):
for xi, ri in y_ratings:
for xj, rj in y_ratings:
freq[xi, xj] += 1
prods[xi, xj] += ri * rj
sqi[xi, xj] += ri ** 2
sqj[xi, xj] += rj ** 2

for xi in range(n_x):
sim[xi, xi] = 1
for xj in range(xi + 1, n_x):
if freq[xi, xj] < min_sprt:
sim[xi, xj] = 0
else:
denum = np.sqrt(sqi[xi, xj] * sqj[xi, xj])
sim[xi, xj] = prods[xi, xj] / denum

sim[xj, xi] = sim[xi, xj]

return sim


def cosine_full_rating_vectors(n_x, yr, min_support):

# sum (r_xy * r_x'y) for common ys
cdef np.ndarray[np.double_t, ndim=2] prods
# number of common ys
cdef np.ndarray[np.int_t, ndim=2] freq
# sum (r_xy ^ 2) for common ys
cdef np.ndarray[np.double_t, ndim=2] sqi
# sum (r_x'y ^ 2) for common ys
cdef np.ndarray[np.double_t, ndim=2] sqj
# the similarity matrix
cdef np.ndarray[np.double_t, ndim=2] sim

cdef int xi, xj
cdef double ri, rj
cdef int min_sprt = min_support

prods = np.zeros((n_x, n_x), np.double)
freq = np.zeros((n_x, n_x), np.int)
sqi = np.zeros((n_x, n_x), np.double)
sqj = np.zeros((n_x, n_x), np.double)
sim = np.zeros((n_x, n_x), np.double)

for y, y_ratings in iteritems(yr):

# The y_ratings list is sparse: it only holds the ratings that exist. For
# the full-vector cosine similarity every (xi, xj) pair is needed, with 0
# substituted for each missing rating.
# Implementation:
# Iterate through the range of x-indexes, taking a 0 rating for each
# index unless that index is actually present in the sorted ratings.
sorted_y_ratings = sorted(y_ratings, key=lambda x: x[0])
xi_iter = iter(sorted_y_ratings)
try:
xi_non_missing, ri_non_missing = next(xi_iter)
except StopIteration:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could all the StopIteration be avoided?

xi_non_missing = n_x
for xi_all in range(n_x):
if xi_all < xi_non_missing:
xi = xi_all
ri = 0
else:
xi = xi_non_missing
ri = ri_non_missing
try:
xi_non_missing, ri_non_missing = next(xi_iter)
except StopIteration:
xi_non_missing = n_x

xj_iter = iter(sorted_y_ratings)
try:
xj_non_missing, rj_non_missing = next(xj_iter)
except StopIteration:
xj_non_missing = n_x
for xj_all in range(n_x):
if xj_all < xj_non_missing:
xj = xj_all
rj = 0
else:
xj = xj_non_missing
rj = rj_non_missing
try:
xj_non_missing, rj_non_missing = next(xj_iter)
except StopIteration:
xj_non_missing = n_x

freq[xi, xj] += 1
prods[xi, xj] += ri * rj
sqi[xi, xj] += ri**2
Expand Down
52 changes: 48 additions & 4 deletions tests/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@

n_x = 8
yr_global = {
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
1: [(0, 4), (1, 4), (2, 4), ], # noqa
2: [ (2, 5), (3, 2), (4, 3) ], # noqa
3: [(1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
4: [(1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
3: [ (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
4: [ (1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same

}


Expand Down Expand Up @@ -48,7 +48,51 @@ def test_cosine_sim():
# cosine sim is necessarily 1
assert sim[3, 4] == 1

# pairs of users (0, 3) have no common items
# pairs of users (0, 3) have no common items
assert sim[0, 3] == 0
assert sim[0, 4] == 0

# check for float point support and computation correctness
dot_product56 = 1 * 1.5 + 3 * 3.5 + 2 * 2.5
assert sim[5, 6] == (dot_product56 /
((1 ** 2 + 3 ** 2 + 2 ** 2) *
(1.5 ** 2 + 3.5 ** 2 + 2.5 ** 2)) ** 0.5
)

# ensure min_support is taken into account. Only users 1 and 2 have more
# than 4 common ratings.
sim = sims.cosine(n_x, yr, min_support=4)
for i in range(n_x):
for j in range(i + 1, n_x):
if i != 1 and j != 2:
assert sim[i, j] == 0


def test_cosine_full_vectors_sim():
"""Tests for the cosine similarity."""

yr = yr_global.copy()

# # shuffle every rating list, to ensure the order in which ratings are
# # processed does not matter (it's important because it used to be error
# # prone when we were using itertools.combinations)
# for _, ratings in yr.items():
# random.shuffle(ratings)

sim = sims.cosine(n_x, yr, min_support=1, common_ratings_only=False)

# check symmetry and bounds (as ratings are > 0, cosine sim must be >= 0)
for xi in range(n_x):
assert sim[xi, xi] == 1
for xj in range(n_x):
assert sim[xi, xj] == sim[xj, xi]
assert 0 <= sim[xi, xj] <= 1

# users 0, 1 and 2 have different ratings when non-common items considered
assert sim[0, 1] < 1
assert sim[0, 2] < 1

# pairs of users (0, 3) and (0, 4) have no common items
assert sim[0, 3] == 0
assert sim[0, 4] == 0

Expand Down