# Basic matrix factorization (MF) algorithm trained with SGD

In [3]:
import os
import pandas as pd
import numpy as np 

# Build the path of the tab-separated rating file inside the download folder.
base_src = '../data/drive-download-20240102T142504Z-001'
u_data_src = os.path.join(base_src, 'u.data')

# Column names are supplied explicitly through `names=`.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src, sep='\t', names=r_cols, encoding='latin-1')

# Only (user, movie, rating) is used below: drop the timestamp and cast the
# remaining columns to int.
ratings = ratings.drop(columns=['timestamp']).astype(int)

In [4]:
class MF:
    """Biased matrix factorization trained with stochastic gradient descent.

    A rating for user ``i`` and item ``j`` is predicted as
    ``b + b_u[i] + b_d[j] + P[i] . Q[j]``.  Entries of the rating matrix that
    are equal to 0 are treated as missing: they are excluded from the global
    mean, from the SGD samples and from the RMSE.
    """

    def __init__(self, ratings, hyper_params):
        """Store the rating matrix and the hyper-parameters.

        Args:
            ratings: 2-D array-like whose rows are users and whose columns are
                items; it is converted with ``np.array``.
            hyper_params: Mapping with the keys ``'K'``, ``'alpha'``,
                ``'beta'``, ``'iterations'`` and ``'verbose'``.
        """
        self.R = np.array(ratings)
        self.num_users, self.num_items = np.shape(self.R)
        self.K = hyper_params['K']  # number of latent factors
        self.alpha = hyper_params['alpha']  # learning rate
        self.beta = hyper_params['beta']  # regularization coefficient
        self.iterations = hyper_params['iterations']  # number of SGD passes
        self.verbose = hyper_params['verbose']  # print progress while training

    def rmse(self):
        """Return the RMSE over all non-zero entries of the rating matrix.

        Side effect: ``self.predictions`` and ``self.errors`` are replaced by
        arrays holding the prediction and ``actual - predicted`` for every
        non-zero entry, in the order returned by ``self.R.nonzero()``.
        """
        xs, ys = self.R.nonzero()
        # One vectorized pass instead of one get_prediction() call per rating;
        # the einsum computes the row-wise dot product P[x] . Q[y].
        self.predictions = (
            self.b
            + self.b_u[xs]
            + self.b_d[ys]
            + np.einsum('ik,ik->i', self.P[xs], self.Q[ys])
        )
        self.errors = self.R[xs, ys] - self.predictions  # actual - predicted
        return np.sqrt(np.mean(self.errors ** 2))

    def train(self):
        """Initialise the parameters and run ``self.iterations`` SGD passes.

        Returns:
            A list of ``(iteration, rmse)`` tuples, one per pass, with the
            iteration number starting at 1.
        """
        self.P = np.random.normal(scale=1. / self.K,
                                  size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1. / self.K,
                                  size=(self.num_items, self.K))
        self.b_u = np.zeros(self.num_users)  # per-user bias
        self.b_d = np.zeros(self.num_items)  # per-item bias
        self.b = np.mean(self.R[self.R.nonzero()])  # mean of observed ratings

        # Every observed (non-zero) rating becomes one training sample.
        rows, columns = self.R.nonzero()
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

        training_process = []
        for epoch in range(1, self.iterations + 1):
            # Visit the samples in a fresh random order on every pass.
            np.random.shuffle(self.samples)
            self.sgd()
            rmse = self.rmse()
            training_process.append((epoch, rmse))
            if self.verbose and epoch % 10 == 0:
                print('Iteration: %d ; Train RMSE = %.4f' % (epoch, rmse))
        return training_process

    def get_prediction(self, i, j):
        """Return the predicted rating for user index ``i`` and item index ``j``."""
        return self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])

    def sgd(self):
        """Run one pass of SGD over ``self.samples``, updating biases, P and Q."""
        for i, j, r in self.samples:
            e = r - self.get_prediction(i, j)  # actual - predicted

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

            # Snapshot the user factors first: the step for Q[j] must use the
            # value of P[i] from before this sample's update, otherwise it is
            # not the gradient at the current parameters.
            p_i = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * p_i)
            self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

# Hyper-parameters handed to MF: number of latent factors, learning rate,
# regularization coefficient, number of SGD passes and progress printing.
hyper_params = dict(K=30, alpha=0.001, beta=0.02, iterations=100, verbose=True)

# Reshape the long rating table into a user x movie matrix; pairs without a
# rating come out as NaN from the pivot and are replaced by 0.
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating')
R_temp = R_temp.fillna(0)

In [5]:
# Build the model from the user x movie matrix and train it; train() returns
# the list of (iteration, train RMSE) pairs.
mf = MF(ratings=R_temp, hyper_params=hyper_params)
training_process = mf.train()

Iteration: 10 ; Train RMSE = 0.9585
Iteration: 20 ; Train RMSE = 0.9374
Iteration: 30 ; Train RMSE = 0.9281
Iteration: 40 ; Train RMSE = 0.9226
Iteration: 50 ; Train RMSE = 0.9185
Iteration: 60 ; Train RMSE = 0.9148
Iteration: 70 ; Train RMSE = 0.9104
Iteration: 80 ; Train RMSE = 0.9046
Iteration: 90 ; Train RMSE = 0.8962
Iteration: 100 ; Train RMSE = 0.8849
