In [3]:
import json
import numpy as np
import pandas as pd

In [5]:
file = "./renttherunway_final_data.json"
df = pd.read_json(file,lines=True)
df = df[['user_id','item_id','rating']]
df = df.dropna()

In [6]:
df.head()

Unnamed: 0,user_id,item_id,rating
0,420272,2260466,10.0
1,273551,153475,10.0
2,360448,1063761,10.0
3,909926,126335,8.0
4,151944,616682,10.0


In [7]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
ratings = pd.DataFrame(np.zeros((n_users, n_items)), index = df.user_id.unique(), columns = list(df.item_id.unique()))
for row in df.itertuples():
    ratings.loc[row[1],row[2]] = row[3]
ratings = np.array(ratings)

In [8]:
print (str(n_users) + ' users')
print (str(n_items) + ' items')
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print ('Sparsity: {:4.2f}%'.format(sparsity))

105508 users
5850 items
Sparsity: 0.03%


In [9]:
max_count = 0
one_count = 0
two_count = 0
three_count = 0
for i in range(ratings.shape[0]):
    if len(ratings[i, :].nonzero()[0]) > 9:
        max_count+=1
        
    if len(ratings[i, :].nonzero()[0]) == 1:
        one_count += 1
        
    if len(ratings[i, :].nonzero()[0]) == 2:
        two_count += 1
        
    if len(ratings[i, :].nonzero()[0]) == 3:
        three_count += 1

In [10]:
print(one_count/n_users)
print(two_count/n_users)
print(three_count/n_users)
print(max_count)

0.6808014558137772
0.16936156499981045
0.06513250180081132
1388


In [11]:
test_users = []
for i in range(ratings.shape[0]):
    if len(ratings[i, :].nonzero()[0]) > 9:
        test_users.append(i)

In [12]:
def train_test_split(ratings, test_users, s):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in test_users:
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], size=s, replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]
        
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    return train, test

In [13]:
train, test = train_test_split(ratings, test_users, 5)

In [14]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [17]:
class ExplicitMF():
    def __init__(self, 
                 ratings,
                 n_factors=40,
                 learning='sgd',
                 item_fact_reg=0.0, 
                 user_fact_reg=0.0,
                 item_bias_reg=0.0,
                 user_bias_reg=0.0,
                 verbose=False):
        """
        Train a matrix factorization model to predict empty 
        entries in a matrix. The terminology assumes a 
        ratings matrix which is ~ user x item
        
        Params
        ======
        ratings : (ndarray)
            User x Item matrix with corresponding ratings
        
        n_factors : (int)
            Number of latent factors to use in matrix 
            factorization model
        learning : (str)
            Method of optimization. Options include 
            'sgd' or 'als'.
        
        item_fact_reg : (float)
            Regularization term for item latent factors
        
        user_fact_reg : (float)
            Regularization term for user latent factors
            
        item_bias_reg : (float)
            Regularization term for item biases
        
        user_bias_reg : (float)
            Regularization term for user biases
        
        verbose : (bool)
            Whether or not to printout training progress
        """
        
        self.ratings = ratings
        self.n_users, self.n_items = ratings.shape
        self.n_factors = n_factors
        self.item_fact_reg = item_fact_reg
        self.user_fact_reg = user_fact_reg
        self.item_bias_reg = item_bias_reg
        self.user_bias_reg = user_bias_reg
        self.learning = learning
        if self.learning == 'sgd':
            self.sample_row, self.sample_col = self.ratings.nonzero()
            self.n_samples = len(self.sample_row)
        self._v = verbose

    def als_step(self,
                 latent_vectors,
                 fixed_vecs,
                 ratings,
                 _lambda,
                 type='user'):
        """
        One of the two ALS steps. Solve for the latent vectors
        specified by type.
        """
        if type == 'user':
            # Precompute
            YTY = fixed_vecs.T.dot(fixed_vecs)
            lambdaI = np.eye(YTY.shape[0]) * _lambda

            for u in range(latent_vectors.shape[0]):
                latent_vectors[u, :] = solve((YTY + lambdaI), 
                                             ratings[u, :].dot(fixed_vecs))
        elif type == 'item':
            # Precompute
            XTX = fixed_vecs.T.dot(fixed_vecs)
            lambdaI = np.eye(XTX.shape[0]) * _lambda
            
            for i in range(latent_vectors.shape[0]):
                latent_vectors[i, :] = solve((XTX + lambdaI), 
                                             ratings[:, i].T.dot(fixed_vecs))
        return latent_vectors

    def train(self, n_iter=10, learning_rate=0.1):
        """ Train model for n_iter iterations from scratch."""
        # initialize latent vectors        
        self.user_vecs = np.random.normal(scale=1./self.n_factors,size=(self.n_users, self.n_factors))
        self.item_vecs = np.random.normal(scale=1./self.n_factors,size=(self.n_items, self.n_factors))
        
        if self.learning == 'als':
            self.partial_train(n_iter)
        elif self.learning == 'sgd':
            self.learning_rate = learning_rate
            self.user_bias = np.zeros(self.n_users)
            self.item_bias = np.zeros(self.n_items)
            self.global_bias = np.mean(self.ratings[np.where(self.ratings != 0)])
            self.partial_train(n_iter)
    
    
    def partial_train(self, n_iter):
        """ 
        Train model for n_iter iterations. Can be 
        called multiple times for further training.
        """
        ctr = 1
        while ctr <= n_iter:
            if ctr % 10 == 0 and self._v:
                print ('\tcurrent iteration: {}'.format(ctr))
            if self.learning == 'als':
                self.user_vecs = self.als_step(self.user_vecs, 
                                               self.item_vecs, 
                                               self.ratings, 
                                               self.user_fact_reg, 
                                               type='user')
                self.item_vecs = self.als_step(self.item_vecs, 
                                               self.user_vecs, 
                                               self.ratings, 
                                               self.item_fact_reg, 
                                               type='item')
            elif self.learning == 'sgd':
                self.training_indices = np.arange(self.n_samples)
                np.random.shuffle(self.training_indices)
                self.sgd()
            ctr += 1

    def sgd(self):
        for idx in self.training_indices:
#             print('idx',idx)
            u = self.sample_row[idx]
#             print('u',u)
            i = self.sample_col[idx]
#             print('i',i)
            prediction = self.predict(u, i)
            e = self.ratings[u,i] - prediction # error 
#             print('prediction',prediction)
            
            # Update biases
            self.user_bias[u] += self.learning_rate * (e - self.user_bias_reg * self.user_bias[u])
#             print('self.user_bias[u]',self.user_bias[u])
            self.item_bias[i] += self.learning_rate * (e - self.item_bias_reg * self.item_bias[i])
#             print('self.item_bias[i]',self.item_bias[i])
            
            #Update latent factors
            self.user_vecs[u, :] += self.learning_rate * (e * self.item_vecs[i, :] - self.user_fact_reg * self.user_vecs[u,:])
            self.item_vecs[i, :] += self.learning_rate * (e * self.user_vecs[u, :] - self.item_fact_reg * self.item_vecs[i,:])
    
    def predict(self, u, i):
        """ Single user and item prediction."""
        if self.learning == 'als':
            return self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
        elif self.learning == 'sgd':
            prediction = self.global_bias + self.user_bias[u] + self.item_bias[i]
#             print('self.user_bias[u]_pre',self.user_bias[u])
#             print('self.item_bias[i]_pre',self.item_bias[i])
            prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
            return prediction
    
    def predict_all(self):
        """ Predict ratings for every user and item."""
        predictions = np.zeros((self.user_vecs.shape[0], 
                                self.item_vecs.shape[0]))
        r = self.user_vecs.shape[0]
        c = self.item_vecs.shape[0]
        global_bias = np.repeat(self.global_bias, r*c).reshape(r,c)
        user_bias = np.repeat(self.user_bias, c, axis=0).reshape(r,c)
        print(user_bias)
        item_bias = np.repeat(self.item_bias, r, axis=0).reshape(c,r).T
        print(item_bias)
        predictions = np.matmul(self.user_vecs,self.item_vecs.T)+global_bias+user_bias+item_bias
#         for u in range(self.user_vecs.shape[0]):
#             for i in range(self.item_vecs.shape[0]):
#                 predictions[u, i] = self.predict(u, i)
        return predictions
    
    def calculate_learning_curve(self, iter_array, test, learning_rate=0.1):
        """
        Keep track of MSE as a function of training iterations.
        
        Params
        ======
        iter_array : (list)
            List of numbers of iterations to train for each step of 
            the learning curve. e.g. [1, 5, 10, 20]
        test : (2D ndarray)
            Testing dataset (assumed to be user x item).
        
        The function creates two new class attributes:
        
        train_mse : (list)
            Training data MSE values for each value of iter_array
        test_mse : (list)
            Test data MSE values for each value of iter_array
        """
        iter_array.sort()
        self.train_mse =[]
        self.test_mse = []
        iter_diff = 0
        for (i, n_iter) in enumerate(iter_array):
            if self._v:
                print ('Iteration: {}'.format(n_iter))
            if i == 0:
                self.train(n_iter - iter_diff, learning_rate)
            else:
                self.partial_train(n_iter - iter_diff)

            predictions = self.predict_all()
#             print(predictions)

            self.train_mse += [get_mse(predictions, self.ratings)]
            self.test_mse += [get_mse(predictions, test)]
            if self._v:
                print ('Train mse: ' + str(self.train_mse[-1]))
                print ('Test mse: ' + str(self.test_mse[-1]))
            iter_diff = n_iter

In [18]:
MF_SGD = ExplicitMF(train, 4, learning='sgd', verbose=True)
iter_array = [1, 2]
MF_SGD.calculate_learning_curve(iter_array, test, learning_rate=0.01)

Iteration: 1
[[-0.08262239 -0.08262239 -0.08262239 ... -0.08262239 -0.08262239
  -0.08262239]
 [ 0.00843556  0.00843556  0.00843556 ...  0.00843556  0.00843556
   0.00843556]
 [ 0.00739227  0.00739227  0.00739227 ...  0.00739227  0.00739227
   0.00739227]
 ...
 [-0.00736123 -0.00736123 -0.00736123 ... -0.00736123 -0.00736123
  -0.00736123]
 [ 0.01424442  0.01424442  0.01424442 ...  0.01424442  0.01424442
   0.01424442]
 [ 0.0051673   0.0051673   0.0051673  ...  0.0051673   0.0051673
   0.0051673 ]]
[[-0.19296303  0.03284214  0.14365255 ...  0.         -0.04958717
  -0.01021779]
 [-0.19296303  0.03284214  0.14365255 ...  0.         -0.04958717
  -0.01021779]
 [-0.19296303  0.03284214  0.14365255 ...  0.         -0.04958717
  -0.01021779]
 ...
 [-0.19296303  0.03284214  0.14365255 ...  0.         -0.04958717
  -0.01021779]
 [-0.19296303  0.03284214  0.14365255 ...  0.         -0.04958717
  -0.01021779]
 [-0.19296303  0.03284214  0.14365255 ...  0.         -0.04958717
  -0.01021779]]
Trai

In [1]:
a = [1,2,3,4,5]

In [8]:
np.repeat(a, 3, axis = 0).reshape(5,3).T

array([[1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5]])