In [35]:
import numpy as np
import datetime
import time
import csv
import pickle

In [36]:
def load_reviews(path, **kwargs):
    """Load MovieLens reviews."""
    
    options = {
        'fieldnames': ('userid', 'movieid', 'rating', 
                       'timestamp'),
        'delimiter': '\t'
    }
    options.update(kwargs)
    
    parse_date = lambda r,k: datetime.date.fromtimestamp(float(r[k]))
    parse_int = lambda r,k: int(r[k])
    
    with open(path, 'rb') as reviews:
        reader = csv.DictReader(reviews, **options)
        for row in reader:
            row['userid'] = parse_int(row, 'userid')
            row['movieid'] = parse_int(row, 'movieid')
            row['rating'] = parse_int(row, 'rating')
            row['timestamp'] = parse_date(row, 'timestamp')
            yield row

In [65]:
class Recommender(object):
    
    @classmethod
    def load(klass, pickle_path):
        """
        Instantiates the class by deserializing the pickle. Note that the
        object returned may not be an exact match to the code in this
        class (if it was saved before updates).
        """
        with open(pickle_path, 'rb') as pkl:
            return pickle.load(pkl)
    
    def __init__(self, udata):
        self.udata = udata
        self.users = None
        self.movies = None
        self.reviews = None
        
        # Descriptive properties
        self.build_start = None
        self.build_finish = None
        self.description = None
        
        # Model properties
        self.model = None
        self.features = 2
        self.steps = 5000
        self.alpha = 0.0002
        self.beta = 0.02
        
        self.load_dataset()
        
    def dump(self, pickle_path):
        """
        Dump the object into a serialized file using the pickle module.
        This will allow us to quickly reload our model in the future.
        """
        with open(pickle_path, 'wb') as pkl:
            pickle.dump(self, pkl)
        
    def load_dataset(self):
        """
        Loads an index of users and movies as a heap and a reviews table
        as a N x M array where N is the number of users and M is the number
        of movies. Note that order matters so that we can look up values
        outside of the matrix!
        """
        self.users = set([])
        self.movies = set([])
        for review in load_reviews(self.udata):
            self.users.add(review['userid'])
            self.movies.add(review['movieid'])
            
        self.users = sorted(self.users)
        self.movies = sorted(self.movies)
        
        self.reviews = np.zeros(shape=(len(self.users), len(self.movies)))
        for review in load_reviews(self.udata):
            uid = self.users.index(review['userid'])
            mid = self.movies.index(review['movieid'])
            self.reviews[uid, mid] = review['rating']
            

    def build(self, output=None):
        """
        Trains the model by employing matrix factorization on our training
        data set, the sparse reviews matrix. The model is the dot product
        of the P and Q decomposed matrices from the factorization.
        """
        options = {
            'K': self.features,
            'steps': self.steps,
            'alpha': self.alpha,
            'beta': self.beta
        }
        
        self.build_start = time.time()
        self.P, self.Q = factor(self.reviews, **options)
        self.model = np.dot(self.P, self.Q.T)
        self.build_finish = time.time()
        
        if output:
            self.dump(output)
            

    def sparsity(self):
        """Returns the percent of elements that are zero in the array."""
        return 1 - self.density()
    
    def density(self):
        """Returns the percent of elements that are nonzero in the array."""
        nonzero = float(np.count_nonzero(self.reviews))
        return nonzero / self.reviews.size

In [70]:
data_path = 'data/ml-100k/u.data'
model = Recommender(data_path)
model.reviews = model.reviews[:100,:100]
model.build('record.pickle')

In [69]:
def initialize(R, K):
    """Returns initial matrices for an N x M matrix,
    R and K features.

    :returns: P, Q initial matrices of N x K and M x K sizes.
    """

    N, M = R.shape
    P = np.random.rand(N, K)
    Q = np.random.rand(M, K)
    return P, Q

def factor(R, P=None, Q=None, K=2, steps=5000, alpha=0.0002, 
           beta=0.02):
    """
    Performs matrix factorization on R with given parameters.

    :param R: A matrix to be factorized, dimension N x M
    :param P: an initial matrix of dimension N x K
    :param Q: an initial matrix of dimension M x K
    :param K: the number of latent features
    :param steps: the maximum number of iterations to optimize in
    :param alpha: the learning rate for gradient descent
    :param beta:  the regularization parameter

    :returns: final matrices P and Q
    """

    if not isinstance(P, np.ndarray) or not isinstance(Q, np.ndarray):
        P, Q = initialize(R, K)
    Q = Q.T

    rows, cols = R.shape
    for step in xrange(steps):
        for i in xrange(rows):
            for j in xrange(cols):
                if R[i,j] > 0:
                    eij = R[i,j] - np.dot(P[i,:], Q[:,j])
                    for k in xrange(K):
                        P[i,k] = P[i,k] + alpha * (2 * eij * Q[k,j] -
                                                  beta * P[i,k])
                        Q[k,j] = Q[k,j] + alpha * (2 * eij * P[i,k] -
                                                  beta * Q[k,j])
        e = 0
        for i in xrange(rows):
            for j in xrange(cols):
                if R[i,j] > 0:
                    e = e + pow(R[i,j] - np.dot(P[i,:], Q[:,j]), 2)
                    for k in xrange(K):
                        e = e + (beta/2) * (pow(P[i,k], 2) + pow(Q[k,j], 2))

        if e < 0.001:
            break

    return P, Q.T

In [72]:
delta = model.build_finish - model.build_start
print delta

386.531332016


In [55]:
isinstance(P, np.ndarray)

True