# x 1.초반대

In [5]:
import pandas as pd
import numpy as np
path='/home/recordk/reco/BX-Book-Ratings.csv'
ratings = pd.read_csv(path)
ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)
ratings.columns=['user_id','isbn','rating']
ratings=ratings[ratings['rating']!=0]
ratings=ratings.reset_index(drop=True)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# LabelEncoder
user_encoder = LabelEncoder()
isbn_encoder = LabelEncoder()

# user_id와 isbn label인코딩
ratings['user_id'] = user_encoder.fit_transform(ratings['user_id'])
ratings['isbn'] = isbn_encoder.fit_transform(ratings['isbn'])

ratings = ratings.pivot(index = 'user_id', columns ='isbn', values = 'rating').fillna(0)

# New MF class for training & testing
class MF():
    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        self.R = np.array(ratings)
        # user_id, movie_id를 R의 index와 매칭하기 위한 dictionary 생성
        isbn_index = []
        index_isbn = []
        for i, one_id in enumerate(ratings):
            isbn_index.append([one_id, i])
            index_isbn.append([i, one_id])
        self.isbn_index = dict(isbn_index)
        self.index_isbn = dict(index_isbn)        
        user_id_index = []
        index_user_id = []
        for i, one_id in enumerate(ratings.T):
            user_id_index.append([one_id, i])
            index_user_id.append([i, one_id])
        self.user_id_index = dict(user_id_index)
        self.index_user_id = dict(index_user_id)
        # 다른 변수 초기화
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose

    def train(self):                             # Training 하면서 test set의 정확도를 계산하는 메소드 
        # user isbn 행렬 선언
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # bias 선언
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # List of training samples
        rows, columns = self.R.nonzero()
        self.samples = [(i,j, self.R[i,j]) for i, j in zip(rows, columns)]

        # sgd iter
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            # rmse = self.rmse()
            rmse = self.rmse()

            training_process.append((i, rmse))
            if self.verbose:
                if (i+1) % 2 == 0:
                    print("Iteration: %d ; Train RMSE = %.6f " % (i+1, rmse))
        return training_process

    # sgd
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            error = (r - prediction)

            self.b_u[i] += self.alpha * (error - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (error - self.beta * self.b_d[j])

            self.Q[j, :] += self.alpha * (error * self.P[i, :] - self.beta * self.Q[j,:])
            self.P[i, :] += self.alpha * (error * self.Q[j, :] - self.beta * self.P[i,:])

    # rmse
    def rmse(self):
        rows, columns = self.R.nonzero()
        self.predictions = []
        self.errors = []
        for x, y in zip(rows, columns):
            prediction = self.get_prediction(x, y)
            self.predictions.append(prediction)
            self.errors.append(self.R[x, y] - prediction)
        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)
        return np.sqrt(np.mean(self.errors**2))

    # 평점 유저, isbn
    def get_prediction(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

     # 평점 유저, isbn
    def get_one_prediction(self, user_id, isbn):
        return self.get_prediction(self.user_id_index[user_id], self.isbn_index[isbn])

# kfold이용해서 트레인 테스트 3개로 분리
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# rmse리스트
average_rmse_values = []

for train_index, test_index in kf.split(ratings):
    # Training set과 test set을 나눔
    train_data = ratings.iloc[train_index]
    test_data = ratings.iloc[test_index]

    # 모델 생성 및 훈련
    # mf = MF(train_data, K=30, alpha=0.001, beta=0.02, iterations=20, verbose=True)
    mf = MF(train_data, K=30, alpha=0.01, beta=0.02, iterations=20, verbose=True)
    mf.train()

    # 모델 평가
    rmse = mf.rmse()
    # average_rmse_values.append(rmse)
    # rmse = mf.rmse(test_data)  # test_data를 이용하여 모델을 평가
    average_rmse_values.append(rmse)

# rmse 평균
average_rmse = np.mean(average_rmse_values)
print("평균 RMSE:", average_rmse)

Iteration: 2 ; Train RMSE = 1.661214 
Iteration: 4 ; Train RMSE = 1.571802 
Iteration: 6 ; Train RMSE = 1.505876 
Iteration: 8 ; Train RMSE = 1.451751 
Iteration: 10 ; Train RMSE = 1.404364 
Iteration: 12 ; Train RMSE = 1.360649 
Iteration: 14 ; Train RMSE = 1.318300 
Iteration: 16 ; Train RMSE = 1.275512 
Iteration: 18 ; Train RMSE = 1.231201 
Iteration: 20 ; Train RMSE = 1.185339 
Iteration: 2 ; Train RMSE = 1.645081 
Iteration: 4 ; Train RMSE = 1.556427 
Iteration: 6 ; Train RMSE = 1.491481 
Iteration: 8 ; Train RMSE = 1.438327 
Iteration: 10 ; Train RMSE = 1.391683 
Iteration: 12 ; Train RMSE = 1.348364 
Iteration: 14 ; Train RMSE = 1.305987 
Iteration: 16 ; Train RMSE = 1.263377 
Iteration: 18 ; Train RMSE = 1.220144 
Iteration: 20 ; Train RMSE = 1.176381 
Iteration: 2 ; Train RMSE = 1.649609 
Iteration: 4 ; Train RMSE = 1.558075 
Iteration: 6 ; Train RMSE = 1.491386 
Iteration: 8 ; Train RMSE = 1.436955 
Iteration: 10 ; Train RMSE = 1.389607 
Iteration: 12 ; Train RMSE = 1.346122

# 파라미터 1 1.654331301145468

In [8]:
import pandas as pd
import numpy as np
path='/home/recordk/reco/BX-Book-Ratings.csv'
ratings = pd.read_csv(path)
ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)
ratings.columns=['user_id','isbn','rating']
ratings=ratings[ratings['rating']!=0]
ratings=ratings.reset_index(drop=True)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# LabelEncoder
user_encoder = LabelEncoder()
isbn_encoder = LabelEncoder()

# user_id와 isbn label인코딩
ratings['user_id'] = user_encoder.fit_transform(ratings['user_id'])
ratings['isbn'] = isbn_encoder.fit_transform(ratings['isbn'])




class NEW_MF():
    def __init__(self, ratings, K, alpha, beta, iterations, tolerance=0.005, verbose=True):
        self.R = np.array(ratings)

        item_id_index = []
        index_item_id = []
        for i, one_id in enumerate(ratings):
            item_id_index.append([one_id, i])
            index_item_id.append([i, one_id])
        self.item_id_index = dict(item_id_index)
        self.index_item_id = dict(index_item_id)        
        user_id_index = []
        index_user_id = []
        for i, one_id in enumerate(ratings.T):
            user_id_index.append([one_id, i])
            index_user_id.append([i, one_id])
        self.user_id_index = dict(user_id_index)
        self.index_user_id = dict(index_user_id)
        # 다른 변수 초기화
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.tolerance = tolerance
        self.verbose = verbose
        # print(len(self.user_id_index))
    # 테스트 셋을 선정하는 메소드 
    def set_test(self, ratings_test):                           # Setting test set
        test_set = []
        for i in range(len(ratings_test)):                      # Selected ratings
            # print(self.user_id_index)
            # print(self.user_id_index[4738])
            x = self.user_id_index[ratings_test.iloc[i,0]]      # Getting R indice for the given user_id and isbn
            y = self.item_id_index[ratings_test.iloc[i,1]]
            z = ratings_test.iloc[i,2]
            test_set.append([x, y, z])
            self.R[x, y] = 0                    # Setting test set ratings to 0
        self.test_set = test_set
        return test_set                         # Return test set

    def test(self):                             # Training 하면서 test set의 정확도를 계산하는 메소드 
        # user isbn 행렬 선언
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # bias 선언
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # 학습리스트
        rows, columns = self.R.nonzero()
        self.samples = [(i,j, self.R[i,j]) for i, j in zip(rows, columns)]

        # sgd iter
        best_RMSE = 10000
        best_iteration = 0
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((i, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
            if best_RMSE > rmse2:                      # New best record
                best_RMSE = rmse2
                best_iteration = i
            elif (rmse2 - best_RMSE) > self.tolerance: # RMSE is increasing over tolerance
                break
        print(best_iteration, best_RMSE)
        return training_process,best_RMSE

    # sgd
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            error = (r - prediction)
            self.b_u[i] += self.alpha * (error - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (error - self.beta * self.b_d[j])

            self.Q[j, :] += self.alpha * (error * self.P[i, :] - self.beta * self.Q[j,:])
            self.P[i, :] += self.alpha * (error * self.Q[j, :] - self.beta * self.P[i,:])

    # train rmse
    def rmse(self):
        rows, columns = self.R.nonzero()
        self.predictions = []
        self.errors = []
        for x, y in zip(rows, columns):
            prediction = self.get_prediction(x, y)
            self.predictions.append(prediction)
            self.errors.append(self.R[x, y] - prediction)
        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)
        return np.sqrt(np.mean(self.errors**2))

    # Test RMSE 계산하는 method 
    def test_rmse(self):
        error = 0
        for one_set in self.test_set:
            predicted = self.get_prediction(one_set[0], one_set[1])
            error += pow(one_set[2] - predicted, 2)
        return np.sqrt(error/len(self.test_set))

    # 평점 유저, isbn
    def get_prediction(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

     # 평점 유저, isbn
    def get_one_prediction(self, user_id, isbn):
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[isbn])


# kfold이용해서 트레인 테스트 3개로 분리
kf = KFold(n_splits=3, shuffle=True)

# rmse리스트
average_rmse_values = []

for train_index, test_index in kf.split(ratings):
    # Training set과 test set을 나눔
    # print(train_index)
    # print(test_index)
    train_data = ratings.iloc[train_index]
    test_data = ratings.iloc[test_index]
    # print(test_data)
    # 모델 생성 및 훈련
    temp = ratings.pivot(index = 'user_id', columns ='isbn', values = 'rating').fillna(0)
    # mf = NEW_MF(temp, K=220, alpha=0.0014, beta=0.075, iterations=350, tolerance=0.0001, verbose=True)
    mf = NEW_MF(temp, K=30, alpha=0.01, beta=0.02, iterations=20, tolerance=0.0001, verbose=True)
    # Test set에 대한 예측 및 평가
    test_set = mf.set_test(test_data)
    result,rmse = mf.test()
    average_rmse_values.append(rmse)

# 전체 폴드에 대한 RMSE 평균 계산 및 출력
average_rmse = np.mean(average_rmse_values)
print("평균 RMSE:", average_rmse)


Iteration: 10 ; Train RMSE = 1.402241 ; Test RMSE = 1.664166
Iteration: 20 ; Train RMSE = 1.190615 ; Test RMSE = 1.655629
18 1.6555306486327295
Iteration: 10 ; Train RMSE = 1.410767 ; Test RMSE = 1.654760
Iteration: 20 ; Train RMSE = 1.200527 ; Test RMSE = 1.641287
19 1.6412865142346602
Iteration: 10 ; Train RMSE = 1.395122 ; Test RMSE = 1.676858
Iteration: 20 ; Train RMSE = 1.185222 ; Test RMSE = 1.666177
19 1.6661767405690138
평균 RMSE: 1.654331301145468


# 파라미터2 1.652371795990227

In [10]:
import pandas as pd
import numpy as np
path='/home/recordk/reco/BX-Book-Ratings.csv'
ratings = pd.read_csv(path)
ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)
ratings.columns=['user_id','isbn','rating']
ratings=ratings[ratings['rating']!=0]
ratings=ratings.reset_index(drop=True)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# LabelEncoder
user_encoder = LabelEncoder()
isbn_encoder = LabelEncoder()

# user_id와 isbn label인코딩
ratings['user_id'] = user_encoder.fit_transform(ratings['user_id'])
ratings['isbn'] = isbn_encoder.fit_transform(ratings['isbn'])




class NEW_MF():
    def __init__(self, ratings, K, alpha, beta, iterations, tolerance=0.005, verbose=True):
        self.R = np.array(ratings)

        item_id_index = []
        index_item_id = []
        for i, one_id in enumerate(ratings):
            item_id_index.append([one_id, i])
            index_item_id.append([i, one_id])
        self.item_id_index = dict(item_id_index)
        self.index_item_id = dict(index_item_id)        
        user_id_index = []
        index_user_id = []
        for i, one_id in enumerate(ratings.T):
            user_id_index.append([one_id, i])
            index_user_id.append([i, one_id])
        self.user_id_index = dict(user_id_index)
        self.index_user_id = dict(index_user_id)
        # 다른 변수 초기화
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.tolerance = tolerance
        self.verbose = verbose
        # print(len(self.user_id_index))
    # 테스트 셋을 선정하는 메소드 
    def set_test(self, ratings_test):                           # Setting test set
        test_set = []
        for i in range(len(ratings_test)):                      # Selected ratings
            # print(self.user_id_index)
            # print(self.user_id_index[4738])
            x = self.user_id_index[ratings_test.iloc[i,0]]      # Getting R indice for the given user_id and isbn
            y = self.item_id_index[ratings_test.iloc[i,1]]
            z = ratings_test.iloc[i,2]
            test_set.append([x, y, z])
            self.R[x, y] = 0                    # Setting test set ratings to 0
        self.test_set = test_set
        return test_set                         # Return test set

    def test(self):                             # Training 하면서 test set의 정확도를 계산하는 메소드 
        # user isbn 행렬 선언
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # bias 선언
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # 학습리스트
        rows, columns = self.R.nonzero()
        self.samples = [(i,j, self.R[i,j]) for i, j in zip(rows, columns)]

        # sgd iter
        best_RMSE = 10000
        best_iteration = 0
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((i, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
            if best_RMSE > rmse2:                      # New best record
                best_RMSE = rmse2
                best_iteration = i
            elif (rmse2 - best_RMSE) > self.tolerance: # RMSE is increasing over tolerance
                break
        print(best_iteration, best_RMSE)
        return training_process,best_RMSE

    # sgd
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            error = (r - prediction)
            self.b_u[i] += self.alpha * (error - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (error - self.beta * self.b_d[j])

            self.Q[j, :] += self.alpha * (error * self.P[i, :] - self.beta * self.Q[j,:])
            self.P[i, :] += self.alpha * (error * self.Q[j, :] - self.beta * self.P[i,:])

    # train rmse
    def rmse(self):
        rows, columns = self.R.nonzero()
        self.predictions = []
        self.errors = []
        for x, y in zip(rows, columns):
            prediction = self.get_prediction(x, y)
            self.predictions.append(prediction)
            self.errors.append(self.R[x, y] - prediction)
        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)
        return np.sqrt(np.mean(self.errors**2))

    # Test RMSE 계산하는 method 
    def test_rmse(self):
        error = 0
        for one_set in self.test_set:
            predicted = self.get_prediction(one_set[0], one_set[1])
            error += pow(one_set[2] - predicted, 2)
        return np.sqrt(error/len(self.test_set))

    # 평점 유저, isbn
    def get_prediction(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

     # 평점 유저, isbn
    def get_one_prediction(self, user_id, isbn):
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[isbn])


# kfold이용해서 트레인 테스트 3개로 분리
kf = KFold(n_splits=3, shuffle=True)

# rmse리스트
average_rmse_values = []

for train_index, test_index in kf.split(ratings):
    # Training set과 test set을 나눔
    # print(train_index)
    # print(test_index)
    train_data = ratings.iloc[train_index]
    test_data = ratings.iloc[test_index]
    # print(test_data)
    # 모델 생성 및 훈련
    temp = ratings.pivot(index = 'user_id', columns ='isbn', values = 'rating').fillna(0)
    # mf = NEW_MF(temp, K=220, alpha=0.0014, beta=0.075, iterations=350, tolerance=0.0001, verbose=True)
    mf = NEW_MF(temp, K=30, alpha=0.01, beta=0.02, iterations=100, tolerance=0.0001, verbose=True)
    # Test set에 대한 예측 및 평가
    test_set = mf.set_test(test_data)
    result,rmse = mf.test()
    average_rmse_values.append(rmse)

# 전체 폴드에 대한 RMSE 평균 계산 및 출력
average_rmse = np.mean(average_rmse_values)
print("평균 RMSE:", average_rmse)


Iteration: 10 ; Train RMSE = 1.408299 ; Test RMSE = 1.652284
Iteration: 20 ; Train RMSE = 1.198636 ; Test RMSE = 1.640711
19 1.6407113289976978
Iteration: 10 ; Train RMSE = 1.398577 ; Test RMSE = 1.675701
Iteration: 20 ; Train RMSE = 1.184847 ; Test RMSE = 1.663455
19 1.6634548982417063
Iteration: 10 ; Train RMSE = 1.402582 ; Test RMSE = 1.665350
Iteration: 20 ; Train RMSE = 1.195277 ; Test RMSE = 1.653157
21 1.6529491607312772
평균 RMSE: 1.652371795990227


# best 1.6516754814088204

In [11]:
import pandas as pd
import numpy as np
path='/home/recordk/reco/BX-Book-Ratings.csv'
ratings = pd.read_csv(path)
ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)
ratings.columns=['user_id','isbn','rating']
ratings=ratings[ratings['rating']!=0]
ratings=ratings.reset_index(drop=True)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# LabelEncoder
user_encoder = LabelEncoder()
isbn_encoder = LabelEncoder()

# user_id와 isbn label인코딩
ratings['user_id'] = user_encoder.fit_transform(ratings['user_id'])
ratings['isbn'] = isbn_encoder.fit_transform(ratings['isbn'])




class NEW_MF():
    def __init__(self, ratings, K, alpha, beta, iterations, tolerance=0.005, verbose=True):
        self.R = np.array(ratings)

        item_id_index = []
        index_item_id = []
        for i, one_id in enumerate(ratings):
            item_id_index.append([one_id, i])
            index_item_id.append([i, one_id])
        self.item_id_index = dict(item_id_index)
        self.index_item_id = dict(index_item_id)        
        user_id_index = []
        index_user_id = []
        for i, one_id in enumerate(ratings.T):
            user_id_index.append([one_id, i])
            index_user_id.append([i, one_id])
        self.user_id_index = dict(user_id_index)
        self.index_user_id = dict(index_user_id)
        # 다른 변수 초기화
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.tolerance = tolerance
        self.verbose = verbose
        # print(len(self.user_id_index))
    # 테스트 셋을 선정하는 메소드 
    def set_test(self, ratings_test):                           # Setting test set
        test_set = []
        for i in range(len(ratings_test)):                      # Selected ratings
            # print(self.user_id_index)
            # print(self.user_id_index[4738])
            x = self.user_id_index[ratings_test.iloc[i,0]]      # Getting R indice for the given user_id and isbn
            y = self.item_id_index[ratings_test.iloc[i,1]]
            z = ratings_test.iloc[i,2]
            test_set.append([x, y, z])
            self.R[x, y] = 0                    # Setting test set ratings to 0
        self.test_set = test_set
        return test_set                         # Return test set

    def test(self):                             # Training 하면서 test set의 정확도를 계산하는 메소드 
        # user isbn 행렬 선언
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # bias 선언
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # 학습리스트
        rows, columns = self.R.nonzero()
        self.samples = [(i,j, self.R[i,j]) for i, j in zip(rows, columns)]

        # sgd iter
        best_RMSE = 10000
        best_iteration = 0
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((i, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
            if best_RMSE > rmse2:                      # New best record
                best_RMSE = rmse2
                best_iteration = i
            elif (rmse2 - best_RMSE) > self.tolerance: # RMSE is increasing over tolerance
                break
        print(best_iteration, best_RMSE)
        return training_process,best_RMSE

    # sgd
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            error = (r - prediction)
            self.b_u[i] += self.alpha * (error - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (error - self.beta * self.b_d[j])

            self.Q[j, :] += self.alpha * (error * self.P[i, :] - self.beta * self.Q[j,:])
            self.P[i, :] += self.alpha * (error * self.Q[j, :] - self.beta * self.P[i,:])

    # train rmse
    def rmse(self):
        rows, columns = self.R.nonzero()
        self.predictions = []
        self.errors = []
        for x, y in zip(rows, columns):
            prediction = self.get_prediction(x, y)
            self.predictions.append(prediction)
            self.errors.append(self.R[x, y] - prediction)
        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)
        return np.sqrt(np.mean(self.errors**2))

    # Test RMSE 계산하는 method 
    def test_rmse(self):
        error = 0
        for one_set in self.test_set:
            predicted = self.get_prediction(one_set[0], one_set[1])
            error += pow(one_set[2] - predicted, 2)
        return np.sqrt(error/len(self.test_set))

    # 평점 유저, isbn
    def get_prediction(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

     # 평점 유저, isbn
    def get_one_prediction(self, user_id, isbn):
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[isbn])


# kfold이용해서 트레인 테스트 3개로 분리
kf = KFold(n_splits=3, shuffle=True)

# rmse리스트
average_rmse_values = []

for train_index, test_index in kf.split(ratings):
    # Training set과 test set을 나눔
    # print(train_index)
    # print(test_index)
    train_data = ratings.iloc[train_index]
    test_data = ratings.iloc[test_index]
    # print(test_data)
    # 모델 생성 및 훈련
    temp = ratings.pivot(index = 'user_id', columns ='isbn', values = 'rating').fillna(0)
    # mf = NEW_MF(temp, K=220, alpha=0.0014, beta=0.075, iterations=350, tolerance=0.0001, verbose=True)
    mf = NEW_MF(temp, K=30, alpha=0.01, beta=0.02, iterations=100, tolerance=0.01, verbose=True)
    # Test set에 대한 예측 및 평가
    test_set = mf.set_test(test_data)
    result,rmse = mf.test()
    average_rmse_values.append(rmse)

# 전체 폴드에 대한 RMSE 평균 계산 및 출력
average_rmse = np.mean(average_rmse_values)
print("평균 RMSE:", average_rmse)


Iteration: 10 ; Train RMSE = 1.397400 ; Test RMSE = 1.683767
Iteration: 20 ; Train RMSE = 1.188668 ; Test RMSE = 1.669946
Iteration: 30 ; Train RMSE = 0.979994 ; Test RMSE = 1.671452
Iteration: 40 ; Train RMSE = 0.785293 ; Test RMSE = 1.676944
21 1.6695460502831667
Iteration: 10 ; Train RMSE = 1.403494 ; Test RMSE = 1.660651
Iteration: 20 ; Train RMSE = 1.193318 ; Test RMSE = 1.648821
Iteration: 30 ; Train RMSE = 0.982521 ; Test RMSE = 1.651776
Iteration: 40 ; Train RMSE = 0.782062 ; Test RMSE = 1.657684
20 1.648703958212813
Iteration: 10 ; Train RMSE = 1.409145 ; Test RMSE = 1.650209
Iteration: 20 ; Train RMSE = 1.198282 ; Test RMSE = 1.636960
Iteration: 30 ; Train RMSE = 0.987234 ; Test RMSE = 1.639423
Iteration: 40 ; Train RMSE = 0.786633 ; Test RMSE = 1.645226
21 1.6367764357304813
평균 RMSE: 1.6516754814088204


# 파라미터3 1.6548192178781562

In [12]:
import pandas as pd
import numpy as np
path='/home/recordk/reco/BX-Book-Ratings.csv'
ratings = pd.read_csv(path)
ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)
ratings.columns=['user_id','isbn','rating']
ratings=ratings[ratings['rating']!=0]
ratings=ratings.reset_index(drop=True)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# LabelEncoder
user_encoder = LabelEncoder()
isbn_encoder = LabelEncoder()

# user_id와 isbn label인코딩
ratings['user_id'] = user_encoder.fit_transform(ratings['user_id'])
ratings['isbn'] = isbn_encoder.fit_transform(ratings['isbn'])




class NEW_MF():
    def __init__(self, ratings, K, alpha, beta, iterations, tolerance=0.005, verbose=True):
        self.R = np.array(ratings)

        item_id_index = []
        index_item_id = []
        for i, one_id in enumerate(ratings):
            item_id_index.append([one_id, i])
            index_item_id.append([i, one_id])
        self.item_id_index = dict(item_id_index)
        self.index_item_id = dict(index_item_id)        
        user_id_index = []
        index_user_id = []
        for i, one_id in enumerate(ratings.T):
            user_id_index.append([one_id, i])
            index_user_id.append([i, one_id])
        self.user_id_index = dict(user_id_index)
        self.index_user_id = dict(index_user_id)
        # 다른 변수 초기화
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.tolerance = tolerance
        self.verbose = verbose
        # print(len(self.user_id_index))
    # 테스트 셋을 선정하는 메소드 
    def set_test(self, ratings_test):                           # Setting test set
        test_set = []
        for i in range(len(ratings_test)):                      # Selected ratings
            # print(self.user_id_index)
            # print(self.user_id_index[4738])
            x = self.user_id_index[ratings_test.iloc[i,0]]      # Getting R indice for the given user_id and isbn
            y = self.item_id_index[ratings_test.iloc[i,1]]
            z = ratings_test.iloc[i,2]
            test_set.append([x, y, z])
            self.R[x, y] = 0                    # Setting test set ratings to 0
        self.test_set = test_set
        return test_set                         # Return test set

    def test(self):                             # Training 하면서 test set의 정확도를 계산하는 메소드 
        # user isbn 행렬 선언
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # bias 선언
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # 학습리스트
        rows, columns = self.R.nonzero()
        self.samples = [(i,j, self.R[i,j]) for i, j in zip(rows, columns)]

        # sgd iter
        best_RMSE = 10000
        best_iteration = 0
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((i, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
            if best_RMSE > rmse2:                      # New best record
                best_RMSE = rmse2
                best_iteration = i
            elif (rmse2 - best_RMSE) > self.tolerance: # RMSE is increasing over tolerance
                break
        print(best_iteration, best_RMSE)
        return training_process,best_RMSE

    # sgd
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            error = (r - prediction)
            self.b_u[i] += self.alpha * (error - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (error - self.beta * self.b_d[j])

            self.Q[j, :] += self.alpha * (error * self.P[i, :] - self.beta * self.Q[j,:])
            self.P[i, :] += self.alpha * (error * self.Q[j, :] - self.beta * self.P[i,:])

    # train rmse
    def rmse(self):
        rows, columns = self.R.nonzero()
        self.predictions = []
        self.errors = []
        for x, y in zip(rows, columns):
            prediction = self.get_prediction(x, y)
            self.predictions.append(prediction)
            self.errors.append(self.R[x, y] - prediction)
        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)
        return np.sqrt(np.mean(self.errors**2))

    # Test RMSE 계산하는 method 
    def test_rmse(self):
        error = 0
        for one_set in self.test_set:
            predicted = self.get_prediction(one_set[0], one_set[1])
            error += pow(one_set[2] - predicted, 2)
        return np.sqrt(error/len(self.test_set))

    # 평점 유저, isbn
    def get_prediction(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

     # 평점 유저, isbn
    def get_one_prediction(self, user_id, isbn):
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[isbn])


# kfold이용해서 트레인 테스트 3개로 분리
kf = KFold(n_splits=3, shuffle=True)

# rmse리스트
average_rmse_values = []

for train_index, test_index in kf.split(ratings):
    # Training set과 test set을 나눔
    # print(train_index)
    # print(test_index)
    train_data = ratings.iloc[train_index]
    test_data = ratings.iloc[test_index]
    # print(test_data)
    # 모델 생성 및 훈련
    temp = ratings.pivot(index = 'user_id', columns ='isbn', values = 'rating').fillna(0)
    # mf = NEW_MF(temp, K=220, alpha=0.0014, beta=0.075, iterations=350, tolerance=0.0001, verbose=True)
    mf = NEW_MF(temp, K=50, alpha=0.01, beta=0.02, iterations=100, tolerance=0.01, verbose=True)
    # Test set에 대한 예측 및 평가
    test_set = mf.set_test(test_data)
    result,rmse = mf.test()
    average_rmse_values.append(rmse)

# 전체 폴드에 대한 RMSE 평균 계산 및 출력
average_rmse = np.mean(average_rmse_values)
print("평균 RMSE:", average_rmse)


Iteration: 10 ; Train RMSE = 1.405153 ; Test RMSE = 1.676314
Iteration: 20 ; Train RMSE = 1.211938 ; Test RMSE = 1.662822
Iteration: 30 ; Train RMSE = 1.020498 ; Test RMSE = 1.663891
Iteration: 40 ; Train RMSE = 0.834693 ; Test RMSE = 1.668667
23 1.6624413056856375
Iteration: 10 ; Train RMSE = 1.406713 ; Test RMSE = 1.668564
Iteration: 20 ; Train RMSE = 1.212034 ; Test RMSE = 1.658893
Iteration: 30 ; Train RMSE = 1.019769 ; Test RMSE = 1.662399
Iteration: 40 ; Train RMSE = 0.832768 ; Test RMSE = 1.669039
19 1.6588930772198467
Iteration: 10 ; Train RMSE = 1.412662 ; Test RMSE = 1.655416
Iteration: 20 ; Train RMSE = 1.218239 ; Test RMSE = 1.643192
Iteration: 30 ; Train RMSE = 1.027407 ; Test RMSE = 1.645175
Iteration: 40 ; Train RMSE = 0.837878 ; Test RMSE = 1.650029
21 1.643123270728984
평균 RMSE: 1.6548192178781562
