# NicolasHug/Surprise

Switch branches/tags
Nothing to show
b54b573 Jan 7, 2018
1 contributor

### Users who have contributed to this file

743 lines (601 sloc) 29.8 KB
 """ the :mod:matrix_factorization module includes some algorithms using matrix factorization. """ from __future__ import (absolute_import, division, print_function, unicode_literals) cimport numpy as np # noqa import numpy as np from six.moves import range from .algo_base import AlgoBase from .predictions import PredictionImpossible from ..utils import get_rng class SVD(AlgoBase): """The famous *SVD* algorithm, as popularized by Simon Funk _ during the Netflix Prize. When baselines are not used, this is equivalent to Probabilistic Matrix Factorization :cite:salakhutdinov2008a (see :ref:note  below). The prediction :math:\\hat{r}_{ui} is set as: .. math:: \hat{r}_{ui} = \mu + b_u + b_i + q_i^Tp_u If user :math:u is unknown, then the bias :math:b_u and the factors :math:p_u are assumed to be zero. The same applies for item :math:i with :math:b_i and :math:q_i. For details, see equation (5) from :cite:Koren:2009. See also :cite:Ricci:2010, section 5.3.1. To estimate all the unknown, we minimize the following regularized squared error: .. math:: \sum_{r_{ui} \in R_{train}} \left(r_{ui} - \hat{r}_{ui} \\right)^2 + \lambda\\left(b_i^2 + b_u^2 + ||q_i||^2 + ||p_u||^2\\right) The minimization is performed by a very straightforward stochastic gradient descent: .. math:: b_u &\\leftarrow b_u &+ \gamma (e_{ui} - \lambda b_u)\\\\ b_i &\\leftarrow b_i &+ \gamma (e_{ui} - \lambda b_i)\\\\ p_u &\\leftarrow p_u &+ \gamma (e_{ui} \\cdot q_i - \lambda p_u)\\\\ q_i &\\leftarrow q_i &+ \gamma (e_{ui} \\cdot p_u - \lambda q_i) where :math:e_{ui} = r_{ui} - \\hat{r}_{ui}. These steps are performed over all the ratings of the trainset and repeated n_epochs times. Baselines are initialized to 0. User and item factors are randomly initialized according to a normal distribution, which can be tuned using the init_mean and init_std_dev parameters. You also have control over the learning rate :math:\gamma and the regularization term :math:\lambda. Both can be different for each kind of parameter (see below). By default, learning rates are set to 0.005 and regularization terms are set to 0.02. .. _unbiased_note: .. note:: You can choose to use an unbiased version of this algorithm, simply predicting: .. math:: \hat{r}_{ui} = q_i^Tp_u This is equivalent to Probabilistic Matrix Factorization (:cite:salakhutdinov2008a, section 2) and can be achieved by setting the biased parameter to False. Args: n_factors: The number of factors. Default is 100. n_epochs: The number of iteration of the SGD procedure. Default is 20. biased(bool): Whether to use baselines (or biases). See :ref:note  above. Default is True. init_mean: The mean of the normal distribution for factor vectors initialization. Default is 0. init_std_dev: The standard deviation of the normal distribution for factor vectors initialization. Default is 0.1. lr_all: The learning rate for all parameters. Default is 0.005. reg_all: The regularization term for all parameters. Default is 0.02. lr_bu: The learning rate for :math:b_u. Takes precedence over lr_all if set. Default is None. lr_bi: The learning rate for :math:b_i. Takes precedence over lr_all if set. Default is None. lr_pu: The learning rate for :math:p_u. Takes precedence over lr_all if set. Default is None. lr_qi: The learning rate for :math:q_i. Takes precedence over lr_all if set. Default is None. reg_bu: The regularization term for :math:b_u. Takes precedence over reg_all if set. Default is None. reg_bi: The regularization term for :math:b_i. Takes precedence over reg_all if set. Default is None. reg_pu: The regularization term for :math:p_u. Takes precedence over reg_all if set. Default is None. reg_qi: The regularization term for :math:q_i. Takes precedence over reg_all if set. Default is None. random_state(int, RandomState instance from numpy, or None): Determines the RNG that will be used for initialization. If int, random_state will be used as a seed for a new RNG. This is useful to get the same initialization over multiple calls to fit(). If RandomState instance, this same instance is used as RNG. If None, the current RNG from numpy is used. Default is None. verbose: If True, prints the current epoch. Default is False. Attributes: pu(numpy array of size (n_users, n_factors)): The user factors (only exists if fit() has been called) qi(numpy array of size (n_items, n_factors)): The item factors (only exists if fit() has been called) bu(numpy array of size (n_users)): The user biases (only exists if fit() has been called) bi(numpy array of size (n_items)): The item biases (only exists if fit() has been called) """ def __init__(self, n_factors=100, n_epochs=20, biased=True, init_mean=0, init_std_dev=.1, lr_all=.005, reg_all=.02, lr_bu=None, lr_bi=None, lr_pu=None, lr_qi=None, reg_bu=None, reg_bi=None, reg_pu=None, reg_qi=None, random_state=None, verbose=False): self.n_factors = n_factors self.n_epochs = n_epochs self.biased = biased self.init_mean = init_mean self.init_std_dev = init_std_dev self.lr_bu = lr_bu if lr_bu is not None else lr_all self.lr_bi = lr_bi if lr_bi is not None else lr_all self.lr_pu = lr_pu if lr_pu is not None else lr_all self.lr_qi = lr_qi if lr_qi is not None else lr_all self.reg_bu = reg_bu if reg_bu is not None else reg_all self.reg_bi = reg_bi if reg_bi is not None else reg_all self.reg_pu = reg_pu if reg_pu is not None else reg_all self.reg_qi = reg_qi if reg_qi is not None else reg_all self.random_state = random_state self.verbose = verbose AlgoBase.__init__(self) def fit(self, trainset): AlgoBase.fit(self, trainset) self.sgd(trainset) return self def sgd(self, trainset): # OK, let's breath. I've seen so many different implementation of this # algorithm that I just not sure anymore of what it should do. I've # implemented the version as described in the BellKor papers (RS # Handbook, etc.). Mymedialite also does it this way. In his post # however, Funk seems to implicitly say that the algo looks like this # (see reg below): # for f in range(n_factors): # for _ in range(n_iter): # for u, i, r in all_ratings: # err = r_ui - # update p[u, f] # update q[i, f] # which is also the way https://github.com/aaw/IncrementalSVD.jl # implemented it. # # Funk: "Anyway, this will train one feature (aspect), and in # particular will find the most prominent feature remaining (the one # that will most reduce the error that's left over after previously # trained features have done their best). When it's as good as it's # going to get, shift it onto the pile of done features, and start a # new one. For efficiency's sake, cache the residuals (all 100 million # of them) so when you're training feature 72 you don't have to wait # for predictRating() to re-compute the contributions of the previous # 71 features. You will need 2 Gig of ram, a C compiler, and good # programming habits to do this." # A note on cythonization: I haven't dived into the details, but # accessing 2D arrays like pu using just one of the indices like pu[u] # is not efficient. That's why the old (cleaner) version can't be used # anymore, we need to compute the dot products by hand, and update # user and items factors by iterating over all factors... # user biases cdef np.ndarray[np.double_t] bu # item biases cdef np.ndarray[np.double_t] bi # user factors cdef np.ndarray[np.double_t, ndim=2] pu # item factors cdef np.ndarray[np.double_t, ndim=2] qi cdef int u, i, f cdef double r, err, dot, puf, qif cdef double global_mean = self.trainset.global_mean cdef double lr_bu = self.lr_bu cdef double lr_bi = self.lr_bi cdef double lr_pu = self.lr_pu cdef double lr_qi = self.lr_qi cdef double reg_bu = self.reg_bu cdef double reg_bi = self.reg_bi cdef double reg_pu = self.reg_pu cdef double reg_qi = self.reg_qi rng = get_rng(self.random_state) bu = np.zeros(trainset.n_users, np.double) bi = np.zeros(trainset.n_items, np.double) pu = rng.normal(self.init_mean, self.init_std_dev, (trainset.n_users, self.n_factors)) qi = rng.normal(self.init_mean, self.init_std_dev, (trainset.n_items, self.n_factors)) if not self.biased: global_mean = 0 for current_epoch in range(self.n_epochs): if self.verbose: print("Processing epoch {}".format(current_epoch)) for u, i, r in trainset.all_ratings(): # compute current error dot = 0 # for f in range(self.n_factors): dot += qi[i, f] * pu[u, f] err = r - (global_mean + bu[u] + bi[i] + dot) # update biases if self.biased: bu[u] += lr_bu * (err - reg_bu * bu[u]) bi[i] += lr_bi * (err - reg_bi * bi[i]) # update factors for f in range(self.n_factors): puf = pu[u, f] qif = qi[i, f] pu[u, f] += lr_pu * (err * qif - reg_pu * puf) qi[i, f] += lr_qi * (err * puf - reg_qi * qif) self.bu = bu self.bi = bi self.pu = pu self.qi = qi def estimate(self, u, i): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) known_item = self.trainset.knows_item(i) if self.biased: est = self.trainset.global_mean if known_user: est += self.bu[u] if known_item: est += self.bi[i] if known_user and known_item: est += np.dot(self.qi[i], self.pu[u]) else: if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: raise PredictionImpossible('User and item are unkown.') return est class SVDpp(AlgoBase): """The *SVD++* algorithm, an extension of :class:SVD taking into account implicit ratings. The prediction :math:\\hat{r}_{ui} is set as: .. math:: \hat{r}_{ui} = \mu + b_u + b_i + q_i^T\\left(p_u + |I_u|^{-\\frac{1}{2}} \sum_{j \\in I_u}y_j\\right) Where the :math:y_j terms are a new set of item factors that capture implicit ratings. Here, an implicit rating describes the fact that a user :math:u rated an item :math:j, regardless of the rating value. If user :math:u is unknown, then the bias :math:b_u and the factors :math:p_u are assumed to be zero. The same applies for item :math:i with :math:b_i, :math:q_i and :math:y_i. For details, see section 4 of :cite:Koren:2008:FMN. See also :cite:Ricci:2010, section 5.3.1. Just as for :class:SVD, the parameters are learned using a SGD on the regularized squared error objective. Baselines are initialized to 0. User and item factors are randomly initialized according to a normal distribution, which can be tuned using the init_mean and init_std_dev parameters. You have control over the learning rate :math:\gamma and the regularization term :math:\lambda. Both can be different for each kind of parameter (see below). By default, learning rates are set to 0.005 and regularization terms are set to 0.02. Args: n_factors: The number of factors. Default is 20. n_epochs: The number of iteration of the SGD procedure. Default is 20. init_mean: The mean of the normal distribution for factor vectors initialization. Default is 0. init_std_dev: The standard deviation of the normal distribution for factor vectors initialization. Default is 0.1. lr_all: The learning rate for all parameters. Default is 0.007. reg_all: The regularization term for all parameters. Default is 0.02. lr_bu: The learning rate for :math:b_u. Takes precedence over lr_all if set. Default is None. lr_bi: The learning rate for :math:b_i. Takes precedence over lr_all if set. Default is None. lr_pu: The learning rate for :math:p_u. Takes precedence over lr_all if set. Default is None. lr_qi: The learning rate for :math:q_i. Takes precedence over lr_all if set. Default is None. lr_yj: The learning rate for :math:y_j. Takes precedence over lr_all if set. Default is None. reg_bu: The regularization term for :math:b_u. Takes precedence over reg_all if set. Default is None. reg_bi: The regularization term for :math:b_i. Takes precedence over reg_all if set. Default is None. reg_pu: The regularization term for :math:p_u. Takes precedence over reg_all if set. Default is None. reg_qi: The regularization term for :math:q_i. Takes precedence over reg_all if set. Default is None. reg_yj: The regularization term for :math:y_j. Takes precedence over reg_all if set. Default is None. random_state(int, RandomState instance from numpy, or None): Determines the RNG that will be used for initialization. If int, random_state will be used as a seed for a new RNG. This is useful to get the same initialization over multiple calls to fit(). If RandomState instance, this same instance is used as RNG. If None, the current RNG from numpy is used. Default is None. verbose: If True, prints the current epoch. Default is False. Attributes: pu(numpy array of size (n_users, n_factors)): The user factors (only exists if fit() has been called) qi(numpy array of size (n_items, n_factors)): The item factors (only exists if fit() has been called) yj(numpy array of size (n_items, n_factors)): The (implicit) item factors (only exists if fit() has been called) bu(numpy array of size (n_users)): The user biases (only exists if fit() has been called) bi(numpy array of size (n_items)): The item biases (only exists if fit() has been called) """ def __init__(self, n_factors=20, n_epochs=20, init_mean=0, init_std_dev=.1, lr_all=.007, reg_all=.02, lr_bu=None, lr_bi=None, lr_pu=None, lr_qi=None, lr_yj=None, reg_bu=None, reg_bi=None, reg_pu=None, reg_qi=None, reg_yj=None, random_state=None, verbose=False): self.n_factors = n_factors self.n_epochs = n_epochs self.init_mean = init_mean self.init_std_dev = init_std_dev self.lr_bu = lr_bu if lr_bu is not None else lr_all self.lr_bi = lr_bi if lr_bi is not None else lr_all self.lr_pu = lr_pu if lr_pu is not None else lr_all self.lr_qi = lr_qi if lr_qi is not None else lr_all self.lr_yj = lr_yj if lr_yj is not None else lr_all self.reg_bu = reg_bu if reg_bu is not None else reg_all self.reg_bi = reg_bi if reg_bi is not None else reg_all self.reg_pu = reg_pu if reg_pu is not None else reg_all self.reg_qi = reg_qi if reg_qi is not None else reg_all self.reg_yj = reg_yj if reg_yj is not None else reg_all self.random_state = random_state self.verbose = verbose AlgoBase.__init__(self) def fit(self, trainset): AlgoBase.fit(self, trainset) self.sgd(trainset) return self def sgd(self, trainset): # user biases cdef np.ndarray[np.double_t] bu # item biases cdef np.ndarray[np.double_t] bi # user factors cdef np.ndarray[np.double_t, ndim=2] pu # item factors cdef np.ndarray[np.double_t, ndim=2] qi # item implicit factors cdef np.ndarray[np.double_t, ndim=2] yj cdef int u, i, j, f cdef double r, err, dot, puf, qif, sqrt_Iu, _ cdef double global_mean = self.trainset.global_mean cdef np.ndarray[np.double_t] u_impl_fdb cdef double lr_bu = self.lr_bu cdef double lr_bi = self.lr_bi cdef double lr_pu = self.lr_pu cdef double lr_qi = self.lr_qi cdef double lr_yj = self.lr_yj cdef double reg_bu = self.reg_bu cdef double reg_bi = self.reg_bi cdef double reg_pu = self.reg_pu cdef double reg_qi = self.reg_qi cdef double reg_yj = self.reg_yj bu = np.zeros(trainset.n_users, np.double) bi = np.zeros(trainset.n_items, np.double) rng = get_rng(self.random_state) pu = rng.normal(self.init_mean, self.init_std_dev, (trainset.n_users, self.n_factors)) qi = rng.normal(self.init_mean, self.init_std_dev, (trainset.n_items, self.n_factors)) yj = rng.normal(self.init_mean, self.init_std_dev, (trainset.n_items, self.n_factors)) u_impl_fdb = np.zeros(self.n_factors, np.double) for current_epoch in range(self.n_epochs): if self.verbose: print(" processing epoch {}".format(current_epoch)) for u, i, r in trainset.all_ratings(): # items rated by u. This is COSTLY Iu = [j for (j, _) in trainset.ur[u]] sqrt_Iu = np.sqrt(len(Iu)) # compute user implicit feedback u_impl_fdb = np.zeros(self.n_factors, np.double) for j in Iu: for f in range(self.n_factors): u_impl_fdb[f] += yj[j, f] / sqrt_Iu # compute current error dot = 0 # for f in range(self.n_factors): dot += qi[i, f] * (pu[u, f] + u_impl_fdb[f]) err = r - (global_mean + bu[u] + bi[i] + dot) # update biases bu[u] += lr_bu * (err - reg_bu * bu[u]) bi[i] += lr_bi * (err - reg_bi * bi[i]) # update factors for f in range(self.n_factors): puf = pu[u, f] qif = qi[i, f] pu[u, f] += lr_pu * (err * qif - reg_pu * puf) qi[i, f] += lr_qi * (err * (puf + u_impl_fdb[f]) - reg_qi * qif) for j in Iu: yj[j, f] += lr_yj * (err * qif / sqrt_Iu - reg_yj * yj[j, f]) self.bu = bu self.bi = bi self.pu = pu self.qi = qi self.yj = yj def estimate(self, u, i): est = self.trainset.global_mean if self.trainset.knows_user(u): est += self.bu[u] if self.trainset.knows_item(i): est += self.bi[i] if self.trainset.knows_user(u) and self.trainset.knows_item(i): Iu = len(self.trainset.ur[u]) # nb of items rated by u u_impl_feedback = (sum(self.yj[j] for (j, _) in self.trainset.ur[u]) / np.sqrt(Iu)) est += np.dot(self.qi[i], self.pu[u] + u_impl_feedback) return est class NMF(AlgoBase): """A collaborative filtering algorithm based on Non-negative Matrix Factorization. This algorithm is very similar to :class:SVD. The prediction :math:\\hat{r}_{ui} is set as: .. math:: \hat{r}_{ui} = q_i^Tp_u, where user and item factors are kept **positive**. Our implementation follows that suggested in :cite:NMF:2014, which is equivalent to :cite:Zhang96 in its non-regularized form. Both are direct applications of NMF for dense matrices :cite:NMF_algo. The optimization procedure is a (regularized) stochastic gradient descent with a specific choice of step size that ensures non-negativity of factors, provided that their initial values are also positive. At each step of the SGD procedure, the factors :math:f or user :math:u and item :math:i are updated as follows: .. math:: p_{uf} &\\leftarrow p_{uf} &\cdot \\frac{\\sum_{i \in I_u} q_{if} \\cdot r_{ui}}{\\sum_{i \in I_u} q_{if} \\cdot \\hat{r_{ui}} + \\lambda_u |I_u| p_{uf}}\\\\ q_{if} &\\leftarrow q_{if} &\cdot \\frac{\\sum_{u \in U_i} p_{uf} \\cdot r_{ui}}{\\sum_{u \in U_i} p_{uf} \\cdot \\hat{r_{ui}} + \lambda_i |U_i| q_{if}}\\\\ where :math:\lambda_u and :math:\lambda_i are regularization parameters. This algorithm is highly dependent on initial values. User and item factors are uniformly initialized between init_low and init_high. Change them at your own risks! A biased version is available by setting the biased parameter to True. In this case, the prediction is set as .. math:: \hat{r}_{ui} = \mu + b_u + b_i + q_i^Tp_u, still ensuring positive factors. Baselines are optimized in the same way as in the :class:SVD algorithm. While yielding better accuracy, the biased version seems highly prone to overfitting so you may want to reduce the number of factors (or increase regularization). Args: n_factors: The number of factors. Default is 15. n_epochs: The number of iteration of the SGD procedure. Default is 50. biased(bool): Whether to use baselines (or biases). Default is False. reg_pu: The regularization term for users :math:\lambda_u. Default is 0.06. reg_qi: The regularization term for items :math:\lambda_i. Default is 0.06. reg_bu: The regularization term for :math:b_u. Only relevant for biased version. Default is 0.02. reg_bi: The regularization term for :math:b_i. Only relevant for biased version. Default is 0.02. lr_bu: The learning rate for :math:b_u. Only relevant for biased version. Default is 0.005. lr_bi: The learning rate for :math:b_i. Only relevant for biased version. Default is 0.005. init_low: Lower bound for random initialization of factors. Must be greater than 0 to ensure non-negative factors. Default is 0. init_high: Higher bound for random initialization of factors. Default is 1. random_state(int, RandomState instance from numpy, or None): Determines the RNG that will be used for initialization. If int, random_state will be used as a seed for a new RNG. This is useful to get the same initialization over multiple calls to fit(). If RandomState instance, this same instance is used as RNG. If None, the current RNG from numpy is used. Default is None. verbose: If True, prints the current epoch. Default is False. Attributes: pu(numpy array of size (n_users, n_factors)): The user factors (only exists if fit() has been called) qi(numpy array of size (n_items, n_factors)): The item factors (only exists if fit() has been called) bu(numpy array of size (n_users)): The user biases (only exists if fit() has been called) bi(numpy array of size (n_items)): The item biases (only exists if fit() has been called) """ def __init__(self, n_factors=15, n_epochs=50, biased=False, reg_pu=.06, reg_qi=.06, reg_bu=.02, reg_bi=.02, lr_bu=.005, lr_bi=.005, init_low=0, init_high=1, random_state=None, verbose=False): self.n_factors = n_factors self.n_epochs = n_epochs self.biased = biased self.reg_pu = reg_pu self.reg_qi = reg_qi self.lr_bu = lr_bu self.lr_bi = lr_bi self.reg_bu = reg_bu self.reg_bi = reg_bi self.init_low = init_low self.init_high = init_high self.random_state = random_state self.verbose = verbose if self.init_low < 0: raise ValueError('init_low should be greater than zero') AlgoBase.__init__(self) def fit(self, trainset): AlgoBase.fit(self, trainset) self.sgd(trainset) return self def sgd(self, trainset): # user and item factors cdef np.ndarray[np.double_t, ndim=2] pu cdef np.ndarray[np.double_t, ndim=2] qi # user and item biases cdef np.ndarray[np.double_t] bu cdef np.ndarray[np.double_t] bi # auxiliary matrices used in optimization process cdef np.ndarray[np.double_t, ndim=2] user_num cdef np.ndarray[np.double_t, ndim=2] user_denom cdef np.ndarray[np.double_t, ndim=2] item_num cdef np.ndarray[np.double_t, ndim=2] item_denom cdef int u, i, f cdef double r, est, l, dot, err cdef double reg_pu = self.reg_pu cdef double reg_qi = self.reg_qi cdef double reg_bu = self.reg_bu cdef double reg_bi = self.reg_bi cdef double lr_bu = self.lr_bu cdef double lr_bi = self.lr_bi cdef double global_mean = self.trainset.global_mean # Randomly initialize user and item factors rng = get_rng(self.random_state) pu = rng.uniform(self.init_low, self.init_high, size=(trainset.n_users, self.n_factors)) qi = rng.uniform(self.init_low, self.init_high, size=(trainset.n_items, self.n_factors)) bu = np.zeros(trainset.n_users, np.double) bi = np.zeros(trainset.n_items, np.double) if not self.biased: global_mean = 0 for current_epoch in range(self.n_epochs): if self.verbose: print("Processing epoch {}".format(current_epoch)) # (re)initialize nums and denoms to zero user_num = np.zeros((trainset.n_users, self.n_factors)) user_denom = np.zeros((trainset.n_users, self.n_factors)) item_num = np.zeros((trainset.n_items, self.n_factors)) item_denom = np.zeros((trainset.n_items, self.n_factors)) # Compute numerators and denominators for users and items factors for u, i, r in trainset.all_ratings(): # compute current estimation and error dot = 0 # for f in range(self.n_factors): dot += qi[i, f] * pu[u, f] est = global_mean + bu[u] + bi[i] + dot err = r - est # update biases if self.biased: bu[u] += lr_bu * (err - reg_bu * bu[u]) bi[i] += lr_bi * (err - reg_bi * bi[i]) # compute numerators and denominators for f in range(self.n_factors): user_num[u, f] += qi[i, f] * r user_denom[u, f] += qi[i, f] * est item_num[i, f] += pu[u, f] * r item_denom[i, f] += pu[u, f] * est # Update user factors for u in trainset.all_users(): n_ratings = len(trainset.ur[u]) for f in range(self.n_factors): user_denom[u, f] += n_ratings * reg_pu * pu[u, f] pu[u, f] *= user_num[u, f] / user_denom[u, f] # Update item factors for i in trainset.all_items(): n_ratings = len(trainset.ir[i]) for f in range(self.n_factors): item_denom[i, f] += n_ratings * reg_qi * qi[i, f] qi[i, f] *= item_num[i, f] / item_denom[i, f] self.bu = bu self.bi = bi self.pu = pu self.qi = qi def estimate(self, u, i): # Should we cythonize this as well? known_user = self.trainset.knows_user(u) known_item = self.trainset.knows_item(i) if self.biased: est = self.trainset.global_mean if known_user: est += self.bu[u] if known_item: est += self.bi[i] if known_user and known_item: est += np.dot(self.qi[i], self.pu[u]) else: if known_user and known_item: est = np.dot(self.qi[i], self.pu[u]) else: raise PredictionImpossible('User and item are unkown.') return est