Skip to content

Commit

Permalink
Cythonized baselines computation. Fixes #4
Browse files Browse the repository at this point in the history
  • Loading branch information
NicolasHug committed Dec 10, 2016
1 parent 084c285 commit 9058da6
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 57 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ build
dist/
surprise/similarities.c
surprise/prediction_algorithms/matrix_factorization.c
surprise/prediction_algorithms/optimize_baselines.c
*.so

Gemfile.lock
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ All experiments are run on a laptop with Intel Core i3 1.7 GHz, 4Go Ram.
| | RMSE | MAE | Time (s) |
|-----------------|:------:|:------:|:--------:|
| [NormalPredictor](http://surprise.readthedocs.io/en/latest/basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor) | 1.5228 | 1.2242 | 4 |
| [BaselineOnly](http://surprise.readthedocs.io/en/latest/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly) | .9445 | .7488 | 16 |
| [BaselineOnly](http://surprise.readthedocs.io/en/latest/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly) | .9445 | .7488 | 5 |
| [KNNBasic](http://surprise.readthedocs.io/en/latest/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic) | .9789 | .7732 | 27 |
| [KNNWithMeans](http://surprise.readthedocs.io/en/latest/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans) | .9514 | .7500 | 30 |
| [KNNBaseline](http://surprise.readthedocs.io/en/latest/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline) | .9306 | .7334 | 44 |
Expand Down
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@
Extension('surprise.prediction_algorithms.matrix_factorization',
['surprise/prediction_algorithms/matrix_factorization' + ext],
include_dirs=[np.get_include()]),
Extension('surprise.prediction_algorithms.optimize_baselines',
['surprise/prediction_algorithms/optimize_baselines' + ext],
include_dirs=[np.get_include()]),
]

if USE_CYTHON:
Expand Down
66 changes: 10 additions & 56 deletions surprise/prediction_algorithms/algo_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@
from __future__ import (absolute_import, division, print_function,
unicode_literals)

import numpy as np

from .. import similarities as sims
from .predictions import PredictionImpossible
from .predictions import Prediction
from six.moves import range
from .optimize_baselines import baseline_als
from .optimize_baselines import baseline_sgd


class AlgoBase:
Expand Down Expand Up @@ -145,64 +144,19 @@ def compute_baselines(self):
if self.bu is not None:
return self.bu, self.bi

def optimize_sgd():
"""Optimize biases using sgd"""

bu = np.zeros(self.trainset.n_users)
bi = np.zeros(self.trainset.n_items)

reg = self.bsl_options.get('reg', 0.02)
lr = self.bsl_options.get('learning_rate', 0.005)
n_epochs = self.bsl_options.get('n_epochs', 20)

for dummy in range(n_epochs):
for u, i, r in self.trainset.all_ratings():
err = (r - (self.trainset.global_mean + bu[u] + bi[i]))
bu[u] += lr * (err - reg * bu[u])
bi[i] += lr * (err - reg * bi[i])

return bu, bi

def optimize_als():
"""Alternatively optimize user biases and and item biases."""

# This piece of code is largely inspired by that of MyMediaLite:
# https://github.com/zenogantner/MyMediaLite/blob/master/src/MyMediaLite/RatingPrediction/UserItemBaseline.cs
# see also https://www.youtube.com/watch?v=gCaOa3W9kM0&t=32m55s
# (Alex Smola on RS, ML Class 10-701)

bu = np.zeros(self.trainset.n_users)
bi = np.zeros(self.trainset.n_items)

reg_u = self.bsl_options.get('reg_u', 15)
reg_i = self.bsl_options.get('reg_i', 10)
n_epochs = self.bsl_options.get('n_epochs', 10)

for dummy in range(n_epochs):
for i in self.trainset.all_items():
devI = sum(r - self.trainset.global_mean -
bu[u] for (u, r) in self.trainset.ir[i])
bi[i] = devI / (reg_i + len(self.trainset.ir[i]))

for u in self.trainset.all_users():
devU = sum(r - self.trainset.global_mean -
bi[i] for (i, r) in self.trainset.ur[u])
bu[u] = devU / (reg_u + len(self.trainset.ur[u]))

return bu, bi

optimize = dict(als=optimize_als,
sgd=optimize_sgd)
method = dict(als=baseline_als,
sgd=baseline_sgd)

method = self.bsl_options.get('method', 'als')
method_name = self.bsl_options.get('method', 'als')

try:
print('Estimating biases using', method + '...')
self.bu, self.bi = optimize[method]()
print('Estimating biases using', method_name + '...')
self.bu, self.bi = method[method_name](self)
return self.bu, self.bi
except KeyError:
raise ValueError('invalid method ' + method + ' for baseline ' +
'computation. Available methods are als, sgd.')
raise ValueError('Invalid method ' + method_name +
' for baseline computation.' +
' Available methods are als and sgd.')

def compute_similarities(self):
"""Build the simlarity matrix.
Expand Down
91 changes: 91 additions & 0 deletions surprise/prediction_algorithms/optimize_baselines.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""
This module includes the two methods for baseline computation: stochastic
gradient descent and alternating least squares.
"""

from __future__ import (absolute_import, division, print_function,
unicode_literals)

cimport numpy as np
import numpy as np
from six.moves import range

def baseline_als(self):
    """Optimize biases using ALS.

    Args:
        self: The algorithm that needs to compute baselines.

    Returns:
        A tuple ``(bu, bi)``, which are users and items baselines.
    """

    # This piece of code is largely inspired by that of MyMediaLite:
    # https://github.com/zenogantner/MyMediaLite/blob/master/src/MyMediaLite/RatingPrediction/UserItemBaseline.cs
    # see also https://www.youtube.com/watch?v=gCaOa3W9kM0&t=32m55s
    # (Alex Smola on RS, ML Class 10-701)

    cdef np.ndarray[np.double_t] bu = np.zeros(self.trainset.n_users)
    cdef np.ndarray[np.double_t] bi = np.zeros(self.trainset.n_items)

    cdef int u
    cdef int i
    cdef double r
    cdef double global_mean = self.trainset.global_mean
    cdef double dev_i
    cdef double dev_u

    # Hyper-parameters; defaults match the previous pure-Python
    # implementation of this method.
    cdef int n_epochs = self.bsl_options.get('n_epochs', 10)
    cdef double reg_u = self.bsl_options.get('reg_u', 15)
    cdef double reg_i = self.bsl_options.get('reg_i', 10)

    for dummy in range(n_epochs):
        # Alternate between the two bias vectors: update every item bias
        # with user biases held fixed, then every user bias with item
        # biases held fixed.
        for i in self.trainset.all_items():
            dev_i = 0
            # Hoist the ratings-list lookup out of the inner loop (it was
            # previously looked up again for len()).
            ratings_i = self.trainset.ir[i]
            for (u, r) in ratings_i:
                dev_i += r - global_mean - bu[u]

            bi[i] = dev_i / (reg_i + len(ratings_i))

        for u in self.trainset.all_users():
            dev_u = 0
            ratings_u = self.trainset.ur[u]
            for (i, r) in ratings_u:
                dev_u += r - global_mean - bi[i]
            bu[u] = dev_u / (reg_u + len(ratings_u))

    return bu, bi


def baseline_sgd(self):
    """Optimize biases using SGD.

    Args:
        self: The algorithm that needs to compute baselines.

    Returns:
        A tuple ``(bu, bi)``, which are users and items baselines.
    """

    cdef np.ndarray[np.double_t] bu = np.zeros(self.trainset.n_users)
    cdef np.ndarray[np.double_t] bi = np.zeros(self.trainset.n_items)

    cdef int u, i
    cdef double r, err
    cdef double global_mean = self.trainset.global_mean

    # Hyper-parameters, read from the algorithm's baseline options.
    cdef int n_epochs = self.bsl_options.get('n_epochs', 20)
    cdef double reg = self.bsl_options.get('reg', 0.02)
    cdef double lr = self.bsl_options.get('learning_rate', 0.005)

    for epoch in range(n_epochs):
        for u, i, r in self.trainset.all_ratings():
            # Prediction error under the current baseline estimate.
            err = r - (global_mean + bu[u] + bi[i])
            # Regularized gradient step on both biases.
            bu[u] += lr * (err - reg * bu[u])
            bi[i] += lr * (err - reg * bi[i])

    return bu, bi

0 comments on commit 9058da6

Please sign in to comment.