In [1]:
import numpy as np
import data

In [2]:
train = data.train
test = data.test

In [11]:
class MatrixFactorization():
    
    def __init__(self, train, test, f, epsilon):
        """
        param train : Rating Matrix for train
        param test : Rating Matrix for test
        param f : latent feature parameter
        """
        
        self._R = train # Implication Matrix for training size (m, n)
        self._R_test = test # Implication Matrix for test size (m, n)
        self._P = np.array(np.vectorize(lambda x: 0 if x==0 else 1)(train), dtype = np.float64) # Preference Matrix for training
        self._P_test = np.array(np.vectorize(lambda x: 0 if x==0 else 1)(test), dtype = np.float64) # Preference Matrix for training
        self._n_user_rated = np.sum(self._P, axis = 1)
        self._n_item_rated = np.sum(self._P, axis = 0)
        self._num_users, self._num_items = train.shape
        self._alpha = 1
        self._lambda = 0.01
        self._C = 1 + self._alpha * self._R # Confidence Matrix size (m, n)
        self._f = f
        self._epsilon = epsilon
        
        
    def fit(self):
        """
        training Matrix Factorization : update matrix latent weight and bias
        """
        # init latent features
        self._W = np.random.normal(0, 0.1, size=(self._num_users, self._f))
        self._H = np.random.normal(0, 0.1, size=(self._num_items, self._f))
        
        cost_diff = 100000
        count = 0
        self._training_process = [100000000]
        # repeat ALS until convergence
        while cost_diff > self._epsilon :
        
            count += 1
            
            self._yTy = self._Y.T.dot(self._Y)
            for u in range(self._num_users):
                self.optimize_x(u)
            
            self._xTx = self._X.T.dot(self._X)
            for i in range(self._num_items):
                self.optimize_y(i)
    
            cost = self.cost()
            self._training_process.append(cost)
            cost_diff = self._training_process[count - 1] - self._training_process[count]
            rank = self.compute_rank()
            print("count: %d, cost_difference : %.4f, rank : %.4f"% (count, cost_diff, rank))
        
        self.print_results()
                
                
    def optimize_x(self, u):
        """
        Optimize X given user u
        """
        C_u = np.diag(self._C[u, :]) # create diagonal matrix size (n, n)
        
        # (f,f) matrix
        temp1 = self._yTy + self._Y.T.dot(C_u - np.identity(self._num_items)).dot(self._Y) + self._lambda * np.identity(self._f)
        # (f,1) matrix
        temp2 = self._Y.T.dot(C_u).dot(self._P[u])
        
        self._X[u, :] = np.linalg.inv(temp1).dot(temp2)
        
    
    def optimize_y(self, i):
        """
        Optimize X given user u
        """
        C_i = np.diag(self._C[:, i]) # create diagonal matrix size (m, m)
        
        # (f,f) matrix
        temp1 = self._xTx + self._X.T.dot(C_i - np.identity(self._num_users)).dot(self._X) + self._lambda * np.identity(self._f)
        # (f,1) matrix
        temp2 = self._X.T.dot(C_i).dot(self._P[:, i])
        
        self._Y[i, :] = np.linalg.inv(temp1).dot(temp2)
        
                
    def cost(self):
        """
        compute Loss function
        """
        loss = np.sum(self._C * np.square(self._P - self._X.dot(self._Y.T))) + self._lambda * (np.linalg.norm(self._X) + np.linalg.norm(self._Y))
        
        return loss
    
    
    def compute_rank(self):
        
        prediction = self._X.dot(self._Y.T)
        test_x = np.unique(self._R_test.nonzero()[0])
        temp_1 = 0
        temp_2 = 0
        
        for x in test_x :
            temp_y = self._R_test[x].nonzero()
            inv_pre = -1 * prediction[x, temp_y]
            sort_x = inv_pre.argsort() # index starts with 0
            sort_x = sort_x.argsort()
            rank_x = sort_x / len(sort_x[0])
            
            temp_1 += (self._R_test[x, temp_y] * rank_x).sum()
            temp_2 += self._R_test[x, temp_y].sum()
        
        rank = temp_1 / temp_2
            
        return rank
    
    
    def print_results(self):
        """
        print fit results
        """

        print("Final P hat matrix:")
        print(self._X.dot(self._Y.T))

In [12]:
np.random.seed(7)
    
np.seterr(all="warn")
    
factorizer = MatrixFactorization(train, test, f=40, epsilon = 1.0)
factorizer.fit()

count: 1, cost_difference : 99870355.4053, rank : 0.4743
count: 2, cost_difference : 33070.4776, rank : 0.4699
count: 3, cost_difference : 4349.5330, rank : 0.4694
count: 4, cost_difference : 1643.7486, rank : 0.4692
count: 5, cost_difference : 843.9207, rank : 0.4692
count: 6, cost_difference : 503.3075, rank : 0.4691
count: 7, cost_difference : 327.9517, rank : 0.4690
count: 8, cost_difference : 226.1830, rank : 0.4690
count: 9, cost_difference : 162.2168, rank : 0.4689
count: 10, cost_difference : 120.1176, rank : 0.4689
count: 11, cost_difference : 91.5488, rank : 0.4689
count: 12, cost_difference : 71.5798, rank : 0.4689
count: 13, cost_difference : 57.1788, rank : 0.4689
count: 14, cost_difference : 46.4761, rank : 0.4689
count: 15, cost_difference : 38.3108, rank : 0.4689
count: 16, cost_difference : 31.9477, rank : 0.4689
count: 17, cost_difference : 26.9068, rank : 0.4689
count: 18, cost_difference : 22.8643, rank : 0.4688
count: 19, cost_difference : 19.5929, rank : 0.4689
co

KeyboardInterrupt: 

데이터 셋의 한계점이 존재하는 듯  
cost_difference가 지속적으로 감소하는 것으로 봐서는, cost가 감소하면서 수렴하는 것으로 보임 (convex 함수이므로)  
rank measure가 0.47인 것은 매우 나쁜 수치  
저번주 implementation 중 좋은 성능을 보인 Constrained_pmf에 적용해보니, 0.45 수준에 수렴하는 것을 확인할 수 있었음