In [1]:
# imports
import numpy as np
import zipfile as zf
import time
from sklearn import linear_model
import scipy.sparse as sp

In [2]:
# Input data configuration (adjust to your local copy of the data set):
#   file_name    - name of the tab-separated triplets .txt file stored inside the archive
#   zip_location - path of the .zip archive that contains that file
file_name = 'train_triplets.txt'
zip_location = './train_triplets.txt.zip'

In [3]:
# Estimated average size, in bytes, of one line of the triplets file. Usable as a size hint
# (e.g. n_lines * AVERAGE_LINE_SIZE for readlines()) when only the start of the file is needed.
AVERAGE_LINE_SIZE = 63  # bytes per line (estimate)

class RecommenderSystem:
    """
    Latent-factor recommender built from (user, song, play count) triplets.

    The play counts are binned logarithmically and stored in a sparse matrix R (n songs x d users).
    R is approximated by Q * P.T, where Q (n x k) holds one factor row per song and P (d x k) one
    factor row per user. P and Q are initialized from a truncated SVD of R and can then be refined
    by alternating ridge regressions (see find_latent_factors).
    """

    # rows/columns of R need MORE than this many stored entries to be kept
    MIN_ENTRIES = 5

    def __init__(self, alpha=3, bins=10, codec='utf-8', file_name='train_triplets.txt', num_factors_k=30,
                 n_triplets=300000, optimization_steps=10, zip_location='./train_triplets.txt.zip',
                 n_validation=200, random_state=None):
        """
        Initialization of a recommender system
        :param alpha:               alpha penalty of the ridge regression model (default is 3)
        :param bins:                how many logarithmic bins to split the play counts into (default is 10)
        :param codec:               character encoding of the .txt file
        :param file_name:           name of the .txt file
        :param num_factors_k:       how many latent factors to compute (default is 30)
        :param n_triplets:          how many triplets from the .txt file to consider (default is 300000)
        :param optimization_steps:  how many times to alternatingly optimize P and Q
        :param zip_location:        the location of the .zip file
        :param n_validation:        how many known entries of R to set aside for validation (default is 200)
        :param random_state:        optional seed for picking the validation entries; None (default) draws
                                    from numpy's global random state
        """
        # initialization values
        self.alpha = alpha
        self.bins = bins
        self.codec = codec
        self.file_name = file_name
        self.num_factors_k = num_factors_k
        self.n_triplets = n_triplets
        self.optimization_steps = optimization_steps
        self.zip_location = zip_location
        self.n_validation = n_validation
        self.random_state = random_state

        # build the sparse song x user matrix R
        self.R = self.get_matrix()
        self.n, self.d = self.R.shape
        # set aside some known values from the matrix to validate
        self.validation = self.extract_validation_data()
        # initialize P and Q using SVD
        self.P, self.Q = self.initialize_P_Q()

    def get_matrix(self):
        """
        Read up to n_triplets lines from the zipped triplets file and build the matrix R.
        :return: sparse CSR matrix (songs x users) of binned play counts in which every remaining
                 row and column has more than MIN_ENTRIES stored entries
        :raises ValueError: if no triplet could be read, a play count is zero, or the filtering
                            removes every row or column
        """
        users, songs, counts = [], [], []
        # the context managers close the archive and the member again, even if parsing fails
        with zf.ZipFile(self.zip_location) as archive, archive.open(self.file_name) as triplet_file:
            # iterate line by line: reads exactly as far as needed, never fewer lines than available
            for line_number, raw_line in enumerate(triplet_file):
                if line_number >= self.n_triplets:
                    break
                line = raw_line.decode(self.codec).rstrip('\r\n')
                if not line:
                    continue    # tolerate blank lines, e.g. at the end of the file
                user, song, count = line.split('\t')
                users.append(user)
                songs.append(song)
                counts.append(int(count))
        if not counts:
            raise ValueError('no triplets could be read from {!r}'.format(self.file_name))

        # convert the IDs from long strings to consecutive integers
        _, user_ids = np.unique(np.asarray(users), return_inverse=True)
        _, song_ids = np.unique(np.asarray(songs), return_inverse=True)

        # bin the play counts: floor(log2(2 * count)) = 1 + floor(log2(count)), capped at self.bins
        play_counts = np.asarray(counts, dtype='uint64')
        if np.any(play_counts == 0):
            raise ValueError('play counts must be positive')     # log2(0) is -inf and cannot be binned
        binned_counts = np.minimum(np.log2(play_counts * 2).astype('uint64'), self.bins)

        # create the matrix (rows: songs, columns: users)
        R = sp.coo_matrix((binned_counts, (song_ids, user_ids))).tocsr()

        # Repeatedly drop rows/columns with too few entries. Dropping columns can push rows below the
        # threshold again (and vice versa), so repeat until a full pass removes nothing. This always
        # terminates because every further pass shrinks the matrix.
        while True:
            keep_rows = np.diff(R.indptr) > self.MIN_ENTRIES
            R = R[keep_rows]
            keep_columns = np.bincount(R.indices, minlength=R.shape[1]) > self.MIN_ENTRIES
            if keep_rows.all() and keep_columns.all():
                break
            R = R[:, keep_columns]
        if 0 in R.shape:
            raise ValueError('no songs/users with more than {} entries are left'.format(self.MIN_ENTRIES))
        return R

    def extract_validation_data(self):
        """
        Move up to n_validation randomly chosen known entries out of R into a validation set.
        The chosen entries are removed from self.R so that they are not used for fitting.
        :return: uint32 array with one row [song index, user index, value] per held-out entry
        """
        index = np.vstack(self.R.nonzero()).T
        if self.random_state is None:
            shuffled = np.random.permutation(index)
        else:
            shuffled = np.random.RandomState(self.random_state).permutation(index)
        # never report more validation rows than there are known entries (no zero padding)
        held_out = shuffled[:max(0, min(self.n_validation, len(shuffled)))]
        validation = np.zeros([len(held_out), 3], dtype='uint32')
        for i, (song, user) in enumerate(held_out):
            validation[i] = [song, user, self.R[song, user]]
            self.R[song, user] = 0
        self.R.eliminate_zeros()    # drop the explicit zeros that the assignments above left behind
        return validation

    def initialize_P_Q(self):
        """
        Initialize the factor matrices from a truncated SVD R ~ U * diag(S) * Vt.
        :return: tuple (P, Q) with P = Vt.T (d x k) and Q = U * diag(S) (n x k)
        :raises ValueError: if num_factors_k is not in the range 1 <= k < min(R.shape)
        """
        # imported here explicitly: 'import scipy.sparse as sp' alone does not guarantee sp.linalg is loaded
        from scipy.sparse.linalg import svds
        k = self.num_factors_k
        if not 1 <= k < min(self.R.shape):
            raise ValueError('num_factors_k must satisfy 1 <= k < {} for a matrix of shape {}, got {}'
                             .format(min(self.R.shape), self.R.shape, k))
        U, S, Vt = svds(self.R.astype(np.float64), k=k)     # svds needs a floating point matrix
        Q = U * S               # scales column j of U by S[j], same as U.dot(np.diag(S))
        P = Vt.T.copy()         # contiguous copy so that the rows of P can be read and written cheaply
        return P, Q

    def find_latent_factors(self, optimization_steps=None):
        """
        Refine P and Q by alternating optimization and print the training error after every step.

        n: how many items/songs we have
        d: how many users we have
        Given:                Matrix R (n x d)
        Find latent factors:  Matrix Q (n x k) and P (d x k) to minimize error for R = Q * P.T
        :param optimization_steps: number of alternating steps; None uses self.optimization_steps,
                                   0 only prints the initial error
        """
        if optimization_steps is None:
            optimization_steps = self.optimization_steps
        self.reg = linear_model.Ridge(alpha=self.alpha, fit_intercept=False)
        print('initial error =', self._training_error())
        for i in range(optimization_steps):     # one step = re-fit all of P, then all of Q
            self.compute_P()
            self.compute_Q()
            print('error after {} steps = {}'.format(i + 1, self._training_error()))

    def _training_error(self):
        """Sum of squared errors between R and Q * P.T over the known (non-zero) entries of R."""
        known = self.R.tocoo()
        mask = known.data != 0
        songs, users = known.row[mask], known.col[mask]
        # row-wise dot products: only the known entries are predicted, no dense n x d matrix is built
        predictions = np.einsum('ij,ij->i', self.Q[songs], self.P[users])
        return np.sum(np.square(known.data[mask] - predictions))

    def _fit_factor_rows(self, ratings, fixed_factors, target_factors):
        """Ridge-fit row i of target_factors to the known entries in row i of the CSR matrix ratings."""
        if getattr(self, 'reg', None) is None:
            self.reg = linear_model.Ridge(alpha=self.alpha, fit_intercept=False)
        indptr, indices, data = ratings.indptr, ratings.indices, ratings.data
        for row in range(ratings.shape[0]):
            start, stop = indptr[row], indptr[row + 1]
            y = data[start:stop]
            known = y != 0
            if not known.any():
                continue                                        # nothing to fit against: keep the old factors
            X = fixed_factors[indices[start:stop][known]]       # factor rows of the counterparts with known entries
            self.reg.fit(X, y[known].astype(np.float64))        # perform ridge regression for this selection
            target_factors[row] = self.reg.coef_

    def compute_P(self):
        """Update every row of P (one per user) by ridge regression on that user's known songs, Q fixed."""
        # the transpose in CSR form gives cheap access to one user (column of R) at a time
        self._fit_factor_rows(self.R.T.tocsr(), self.Q, self.P)

    def compute_Q(self):
        """Update every row of Q (one per song) by ridge regression on that song's known users, P fixed."""
        self._fit_factor_rows(self.R.tocsr(), self.P, self.Q)

    def compute_rmse(self):
        """
        Root mean squared error of the predictions for the held-out validation entries.
        :return: the RMSE, or nan if there are no validation entries
        """
        errors = [(val[2] - self.make_prediction(val[0], val[1]))**2 for val in self.validation]
        if not errors:
            return float('nan')
        return (sum(errors)/len(errors))**0.5

    def evaluate(self):
        """Print the RMSE on the validation entries for the current P and Q."""
        rmse = self.compute_rmse()
        print('mean squared error with {} latent factors: RMSE = {}'.format(self.num_factors_k, rmse))

    def make_prediction(self, song, user):
        """
        Predict the binned play count of one song for one user.
        :param song: row index of the song in Q
        :param user: row index of the user in P
        :return: dot product of the song's and the user's factor rows
        """
        return np.dot(self.Q[song], self.P[user])

In [4]:
# create recommender (feel free to add more parameters here)
# the constructor reads the triplets, builds the matrix R, sets aside the validation entries
# and initializes P and Q using SVD
recommender = RecommenderSystem(file_name=file_name, zip_location=zip_location)

# see how well it performs purely from the SVD initialization
# (prints the RMSE on the validation entries):
recommender.evaluate()

mean squared error with 30 latent factors: RMSE = 2.072805582136889


In [6]:
# alternatingly optimize latent factors
# (takes a few minutes for k=30; the recorded run below needed about 190 seconds)
t_start = time.perf_counter()   # monotonic clock: the measured interval is immune to system clock changes
recommender.find_latent_factors(optimization_steps=10)
t_finish = time.perf_counter()
print('Finding latent factors takes {}sec'.format(t_finish - t_start))

initial error = 410480.560154
error after 1 steps = 83041.55338935749
error after 2 steps = 34398.363920314994
error after 3 steps = 29331.584887217585
error after 4 steps = 27292.09403787067
error after 5 steps = 26185.971140385853
error after 6 steps = 25484.793342798846
error after 7 steps = 24997.135832830332
error after 8 steps = 24636.942010378832
error after 9 steps = 24359.78522714035
error after 10 steps = 24140.25791580917
Finding latent factors takes 190.22467613220215sec


In [7]:
# check how much the optimization improved things:
# prints the RMSE on the same validation entries again, now using the optimized P and Q
recommender.evaluate()

mean squared error with 30 latent factors: RMSE = 1.1619595817113526


The alternating optimization reduces our validation RMSE from 2.07 to 1.16.
Further optimization steps might improve the RMSE even more; however, the time cost of the optimization is quite high (about 190 seconds for 10 steps).