# hy1 5점대

In [3]:
import pandas as pd
import numpy as np
import random
path='/home/recordk/reco/BX-Book-Ratings.csv'
ratings = pd.read_csv(path)
ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)
ratings.columns=['user_id','isbn','rating']
ratings=ratings[ratings['rating']!=0]
ratings=ratings.reset_index(drop=True)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# LabelEncoder
user_encoder = LabelEncoder()
isbn_encoder = LabelEncoder()

# user_id와 isbn label인코딩
ratings['user_id'] = user_encoder.fit_transform(ratings['user_id'])
ratings['isbn'] = isbn_encoder.fit_transform(ratings['isbn'])

# Dummy recommender 0
def recommender0(recomm_list):
    recommendations = []
    for pair in recomm_list:
        recommendations.append(random.random() * 4 + 1)
    return recommendations


# Dummy recommender 1
def recommender1(recomm_list):
    recommendations = []
    for pair in recomm_list:
        recommendations.append(random.random() * 4 + 1)
    return recommendations


# Hybrid 함수
def hybrid1(recomm_list, weight=[0.5, 0.5]):
    result0 = recommender0(recomm_list)
    result1 = recommender1(recomm_list)
    result = []
    for i, number in enumerate(result0):
        result.append(result0[i] * weight[0] + result1[i] * weight[1])
    return result

# RMSE 계산을 위한 함수
def RMSE2(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred)) ** 2))

# kfold이용해서 트레인 테스트 3개로 분리
kf = KFold(n_splits=3, shuffle=True)

# rmse리스트
average_rmse_values = []

for train_index, test_index in kf.split(ratings):
    # Training set과 test set을 나눔
    train_data = ratings.iloc[train_index]
    test_data = ratings.iloc[test_index]
    
    # np.array로 바꾸기
    ratings_train = np.array(train_data.pivot(index='user_id', columns='isbn', values='rating').fillna(0))
    ratings_test = np.array(test_data)
    
    # print(test_data)
    
    # Hybrid 결과 얻기
    predictions = hybrid1(ratings_test[:, [0, 1]], [0.8, 0.2])
    rmse=RMSE2(ratings_test[:, 2], predictions)
    print(rmse)
    average_rmse_values.append(rmse)

# 전체 폴드에 대한 RMSE 평균 계산 및 출력
average_rmse = np.mean(average_rmse_values)
print("평균 RMSE:", average_rmse)

5.167802917329347
5.175871905029951
5.155988028267658
평균 RMSE: 5.166554283542318


# hy2 1.1대 base

In [10]:
# Predictions using MF ###########################################################################
import pandas as pd
import numpy as np
path='/home/recordk/reco/BX-Book-Ratings.csv'
ratings = pd.read_csv(path)
ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)
ratings.columns=['user_id','isbn','rating']
ratings=ratings[ratings['rating']!=0]
ratings=ratings.reset_index(drop=True)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.utils import shuffle

# LabelEncoder
user_encoder = LabelEncoder()
isbn_encoder = LabelEncoder()

# user_id와 isbn label인코딩
ratings['user_id'] = user_encoder.fit_transform(ratings['user_id'])
ratings['isbn'] = isbn_encoder.fit_transform(ratings['isbn'])




class NEW_MF():
    def __init__(self, ratings, K, alpha, beta, iterations, tolerance=0.005, verbose=True):
        self.R = np.array(ratings)

        item_id_index = []
        index_item_id = []
        for i, one_id in enumerate(ratings):
            item_id_index.append([one_id, i])
            index_item_id.append([i, one_id])
        self.item_id_index = dict(item_id_index)
        self.index_item_id = dict(index_item_id)        
        user_id_index = []
        index_user_id = []
        for i, one_id in enumerate(ratings.T):
            user_id_index.append([one_id, i])
            index_user_id.append([i, one_id])
        self.user_id_index = dict(user_id_index)
        self.index_user_id = dict(index_user_id)
        # 다른 변수 초기화
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.tolerance = tolerance
        self.verbose = verbose
        # print(len(self.user_id_index))
    # 테스트 셋을 선정하는 메소드 
    def set_test(self, ratings_test):                           # Setting test set
        test_set = []
        for i in range(len(ratings_test)):                      # Selected ratings
            # print(self.user_id_index)
            # print(self.user_id_index[4738])
            x = self.user_id_index[ratings_test.iloc[i,0]]      # Getting R indice for the given user_id and isbn
            y = self.item_id_index[ratings_test.iloc[i,1]]
            z = ratings_test.iloc[i,2]
            test_set.append([x, y, z])
            self.R[x, y] = 0                    # Setting test set ratings to 0
        self.test_set = test_set
        return test_set                         # Return test set

    def test(self):                             # Training 하면서 test set의 정확도를 계산하는 메소드 
        # user isbn 행렬 선언
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # bias 선언
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # 학습리스트
        rows, columns = self.R.nonzero()
        self.samples = [(i,j, self.R[i,j]) for i, j in zip(rows, columns)]

        # sgd iter
        best_RMSE = 10000
        best_iteration = 0
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((i, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
            if best_RMSE > rmse2:                      # New best record
                best_RMSE = rmse2
                best_iteration = i
            elif (rmse2 - best_RMSE) > self.tolerance: # RMSE is increasing over tolerance
                break
        print(best_iteration, best_RMSE)
        return training_process,best_RMSE

    # sgd
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            error = (r - prediction)
            self.b_u[i] += self.alpha * (error - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (error - self.beta * self.b_d[j])

            self.Q[j, :] += self.alpha * (error * self.P[i, :] - self.beta * self.Q[j,:])
            self.P[i, :] += self.alpha * (error * self.Q[j, :] - self.beta * self.P[i,:])

    # train rmse
    def rmse(self):
        rows, columns = self.R.nonzero()
        self.predictions = []
        self.errors = []
        for x, y in zip(rows, columns):
            prediction = self.get_prediction(x, y)
            self.predictions.append(prediction)
            self.errors.append(self.R[x, y] - prediction)
        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)
        return np.sqrt(np.mean(self.errors**2))

    # Test RMSE 계산하는 method 
    def test_rmse(self):
        error = 0
        for one_set in self.test_set:
            predicted = self.get_prediction(one_set[0], one_set[1])
            error += pow(one_set[2] - predicted, 2)
        return np.sqrt(error/len(self.test_set))

    # 평점 유저, isbn
    def get_prediction(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

     # 평점 유저, isbn
    def get_one_prediction(self, user_id, isbn):
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[isbn])


# kfold이용해서 트레인 테스트 3개로 분리
kf = KFold(n_splits=3, shuffle=True)

# rmse리스트
average_rmse_values = []

for train_index, test_index in kf.split(ratings):
    # Training set과 test set을 나눔
    # print(train_index)
    # print(test_index)
    train_data = ratings.iloc[train_index]
    test_data = ratings.iloc[test_index]
    # print(test_data)
    # 모델 생성 및 훈련
    temp = ratings.pivot(index = 'user_id', columns ='isbn', values = 'rating').fillna(0)
    # mf = NEW_MF(temp, K=220, alpha=0.0014, beta=0.075, iterations=350, tolerance=0.0001, verbose=True)
    mf = NEW_MF(temp, K=30, alpha=0.01, beta=0.02, iterations=100, tolerance=0.01, verbose=True)
    # Test set에 대한 예측 및 평가
    test_set = mf.set_test(test_data)
    result,rmse = mf.test()
    average_rmse_values.append(rmse)

# 전체 폴드에 대한 RMSE 평균 계산 및 출력
average_rmse = np.mean(average_rmse_values)
print("평균 RMSE:", average_rmse)


# Predictions using DL ###########################################################################
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.layers import Dropout, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adam, Adamax
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
# Variable 초기화 
K = 350                         # Latent factor 수 
lr = 0.01                           # 학습률
reg = 0.005                     # Regularization penalty

def RMSE(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# 모델 구조
def architecture(train_data, test_data,lr):
    # Keras model
    user = Input(shape=(1,))  # User input
    item = Input(shape=(1,))  # Item input
    P_embedding = Embedding(N, K, embeddings_regularizer=l2(reg))(user)  # (N, 1, K)
    Q_embedding = Embedding(M, K, embeddings_regularizer=l2(reg))(item)  # (M, 1, K)
    user_bias = Embedding(N, 1, embeddings_regularizer=l2(reg))(user)  # User bias term (N, 1, )
    item_bias = Embedding(M, 1, embeddings_regularizer=l2(reg))(item)  # Item bias term (M, 1, )

    # Concatenate layers
    P_embedding = Flatten()(P_embedding)  # (K, )
    Q_embedding = Flatten()(Q_embedding)  # (K, )
    user_bias = Flatten()(user_bias)  # (1, )
    item_bias = Flatten()(item_bias)  # (1, )
    R = Concatenate()([P_embedding, Q_embedding, user_bias, item_bias])  # (2K + 2, )

    # Neural network
    R = Dense(2048)(R)
    R = Activation('swish')(R)
    R = Dense(1)(R)

    model = Model(inputs=[user, item], outputs=R)
    model.compile(
        loss=RMSE,
        #optimizer=SGD(lr=0.1, momentum=0.9),
        optimizer=Adamax(lr=lr),
        metrics=[RMSE]
    )

    checkpoint_path = 'CheckPoint'
    checkpoint = ModelCheckpoint(checkpoint_path,
                                save_best_only=True,
                                save_weights_only=True,
                                monitor='val_RMSE',
                                verbose=1)

    result = model.fit(
        x=[train_data.user_id.values, train_data.isbn.values],
        y=train_data.rating.values - mu,
        callbacks=[checkpoint],
        epochs=25,
        batch_size=128,
        validation_data=(
            [test_data.user_id.values, test_data.isbn.values],
            test_data.rating.values - mu
        )
    )

    model.load_weights(checkpoint_path)
    return model, result

# 사용자 수 및 도서 수
N = len(set(ratings.user_id)) + 1
M = len(set(ratings.isbn)) + 1


# kfold이용해서 트레인 테스트 3개로 분리
kf = KFold(n_splits=3, shuffle=True)

# rmse리스트
rmse_values = []

# for문
for train_indices, test_indices in kf.split(ratings):
    train_data = ratings.iloc[train_indices].reset_index(drop=True)
    test_data = ratings.iloc[test_indices].reset_index(drop=True)
    
    mu = train_data.rating.mean()    # 전체 평균 

    # 셔플
    train_data = shuffle(train_data)

    # 모델 및 결과
    model, result = architecture(train_data, test_data,lr)

    # 모델 평가
    predictions = model.predict([test_data.user_id.values, test_data.isbn.values]) + mu
    rmse = RMSE(test_data.rating.values, predictions)
    rmse_values.append(rmse)

# rmse 평균
average_rmse = np.mean(rmse_values)
print("평균 RMSE:", average_rmse)





# Hybrid recommendation ###########################################################################
def recommender0(recomm_list, mf):
    id_pairs = zip(recomm_list[:, 0], recomm_list[:, 1])
    recommendations = np.array([mf.get_one_prediction(user, isbn) for (user, isbn) in id_pairs])
    return recommendations


# Recommender 1
def recommender1(recomm_list, model):
    user_ids = recomm_list[:, 0]
    isbn = recomm_list[:, 1]
    recommendations = model.predict([user_ids, isbn]) + mu
    return recommendations


# RMSE 계산을 위한 함수
def RMSE2(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred)) ** 2))

# kfold이용해서 트레인 테스트 3개로 분리
kf = KFold(n_splits=3, shuffle=True)

# rmse리스트
rmse_values = []

# for문
for train_indices, test_indices in kf.split(ratings):
    train_data = ratings.iloc[train_indices].reset_index(drop=True)
    test_data = ratings.iloc[test_indices].reset_index(drop=True)
    recomm_list = np.array(test_data.iloc[:, [0, 1]])
    result0 = recommender0(recomm_list, mf)
    result1 = np.ravel(recommender1(recomm_list, model))

    weight = [0.5, 0.5]
    predictions = []
    for i, number in enumerate(result0):
        predictions.append(result0[i] * weight[0] + result1[i] * weight[1])
    print(RMSE2(test_data['rating'], predictions))
    print(RMSE2(test_data['rating'], result0))
    print(RMSE2(test_data['rating'], result1))

    for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        weight = [i, 1 - i]
        predictions = []
        for i, number in enumerate(result0):
            predictions.append(result0[i] * weight[0] + result1[i] * weight[1])
        print("Weights - %.2f : %.2f ; RMSE = %.7f" % (weight[0], weight[1], RMSE2(test_data['rating'], predictions)))

    for i in [0.88, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]:
        weight = [i, 1 - i]
        predictions = []
        for i, number in enumerate(result0):
            predictions.append(result0[i] * weight[0] + result1[i] * weight[1])
        print("Weights - %.2f : %.2f ; RMSE = %.7f" % (weight[0], weight[1], RMSE2(test_data['rating'], predictions)))


Iteration: 10 ; Train RMSE = 1.399208 ; Test RMSE = 1.677623
Iteration: 20 ; Train RMSE = 1.191494 ; Test RMSE = 1.665007
Iteration: 30 ; Train RMSE = 0.978475 ; Test RMSE = 1.667352
Iteration: 40 ; Train RMSE = 0.778754 ; Test RMSE = 1.673550
20 1.664856511749714
Iteration: 10 ; Train RMSE = 1.401221 ; Test RMSE = 1.671852
Iteration: 20 ; Train RMSE = 1.192272 ; Test RMSE = 1.657453
Iteration: 30 ; Train RMSE = 0.982934 ; Test RMSE = 1.659129
Iteration: 40 ; Train RMSE = 0.786008 ; Test RMSE = 1.664173
21 1.657168909333758
Iteration: 10 ; Train RMSE = 1.409634 ; Test RMSE = 1.643798
Iteration: 20 ; Train RMSE = 1.197614 ; Test RMSE = 1.633979
Iteration: 30 ; Train RMSE = 0.987280 ; Test RMSE = 1.637629




Iteration: 40 ; Train RMSE = 0.785657 ; Test RMSE = 1.644205
20 1.6339147258262274
평균 RMSE: 1.6519800489699
Epoch 1/25
Epoch 1: val_RMSE improved from inf to 1.68318, saving model to CheckPoint
Epoch 2/25
Epoch 2: val_RMSE improved from 1.68318 to 1.67454, saving model to CheckPoint
Epoch 3/25
Epoch 3: val_RMSE did not improve from 1.67454
Epoch 4/25
Epoch 4: val_RMSE did not improve from 1.67454
Epoch 5/25
Epoch 5: val_RMSE did not improve from 1.67454
Epoch 6/25
Epoch 6: val_RMSE did not improve from 1.67454
Epoch 7/25
Epoch 7: val_RMSE did not improve from 1.67454
Epoch 8/25
Epoch 8: val_RMSE did not improve from 1.67454
Epoch 9/25
Epoch 9: val_RMSE did not improve from 1.67454
Epoch 10/25
Epoch 10: val_RMSE did not improve from 1.67454
Epoch 11/25
Epoch 11: val_RMSE did not improve from 1.67454
Epoch 12/25
Epoch 12: val_RMSE did not improve from 1.67454
Epoch 13/25
Epoch 13: val_RMSE did not improve from 1.67454
Epoch 14/25
Epoch 14: val_RMSE did not improve from 1.67454
Epoch 15/2



Epoch 1/25
Epoch 1: val_RMSE improved from inf to 1.66843, saving model to CheckPoint
Epoch 2/25
Epoch 2: val_RMSE improved from 1.66843 to 1.66036, saving model to CheckPoint
Epoch 3/25
Epoch 3: val_RMSE did not improve from 1.66036
Epoch 4/25
Epoch 4: val_RMSE did not improve from 1.66036
Epoch 5/25
Epoch 5: val_RMSE did not improve from 1.66036
Epoch 6/25
Epoch 6: val_RMSE did not improve from 1.66036
Epoch 7/25
Epoch 7: val_RMSE did not improve from 1.66036
Epoch 8/25
Epoch 8: val_RMSE did not improve from 1.66036
Epoch 9/25
Epoch 9: val_RMSE did not improve from 1.66036
Epoch 10/25
Epoch 10: val_RMSE did not improve from 1.66036
Epoch 11/25
Epoch 11: val_RMSE did not improve from 1.66036
Epoch 12/25
Epoch 12: val_RMSE did not improve from 1.66036
Epoch 13/25
Epoch 13: val_RMSE did not improve from 1.66036
Epoch 14/25
Epoch 14: val_RMSE did not improve from 1.66036
Epoch 15/25
Epoch 15: val_RMSE did not improve from 1.66036
Epoch 16/25
Epoch 16: val_RMSE did not improve from 1.6603



Epoch 1/25
Epoch 1: val_RMSE improved from inf to 1.68671, saving model to CheckPoint
Epoch 2/25
Epoch 2: val_RMSE improved from 1.68671 to 1.65957, saving model to CheckPoint
Epoch 3/25
Epoch 3: val_RMSE did not improve from 1.65957
Epoch 4/25
Epoch 4: val_RMSE did not improve from 1.65957
Epoch 5/25
Epoch 5: val_RMSE did not improve from 1.65957
Epoch 6/25
Epoch 6: val_RMSE did not improve from 1.65957
Epoch 7/25
Epoch 7: val_RMSE did not improve from 1.65957
Epoch 8/25
Epoch 8: val_RMSE did not improve from 1.65957
Epoch 9/25
Epoch 9: val_RMSE did not improve from 1.65957
Epoch 10/25
Epoch 10: val_RMSE did not improve from 1.65957
Epoch 11/25
Epoch 11: val_RMSE did not improve from 1.65957
Epoch 12/25
Epoch 12: val_RMSE did not improve from 1.65957
Epoch 13/25
Epoch 13: val_RMSE did not improve from 1.65957
Epoch 14/25
Epoch 14: val_RMSE did not improve from 1.65957
Epoch 15/25
Epoch 15: val_RMSE did not improve from 1.65957
Epoch 16/25
Epoch 16: val_RMSE did not improve from 1.6595

# hy2 1.1대 swish 층 3개 각주O
 - RMSE 1.1475490666666666

In [9]:
# Predictions using MF ###########################################################################
import pandas as pd
import numpy as np
path='/home/recordk/reco/BX-Book-Ratings.csv'
ratings = pd.read_csv(path)
ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)
ratings.columns=['user_id','isbn','rating']

# 0점은 다 제거
ratings=ratings[ratings['rating']!=0]
ratings=ratings.reset_index(drop=True)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.utils import shuffle

#  user과 isbn의 인덱스를 정해주기 위해 LabelEncoder 사용! 
user_encoder = LabelEncoder()
isbn_encoder = LabelEncoder()

# user_id와 isbn label인코딩
ratings['user_id'] = user_encoder.fit_transform(ratings['user_id'])
ratings['isbn'] = isbn_encoder.fit_transform(ratings['isbn'])



# mf class
class NEW_MF():
    # user - book matrix 행렬을 받음
    def __init__(self, ratings, K, alpha, beta, iterations, tolerance=0.005, verbose=True):
        self.R = np.array(ratings)
        # book id와 index 리스트 선언
        item_id_index = []
        # index와 book id 리스트 선언
        index_item_id = []
        
        for i, one_id in enumerate(ratings):
            item_id_index.append([one_id, i])
            index_item_id.append([i, one_id])
            
        # 딕셔너리화
        self.item_id_index = dict(item_id_index)
        self.index_item_id = dict(index_item_id)
        
        # user와 index 리스트 선언        
        user_id_index = []
        # index와 user와 리스트 선언
        index_user_id = []
        
        # book user matrix 행렬을 받음
        for i, one_id in enumerate(ratings.T):
            user_id_index.append([one_id, i])
            index_user_id.append([i, one_id])
            
        # 딕셔너리화
        self.user_id_index = dict(user_id_index)
        self.index_user_id = dict(index_user_id)
        
        # 파라미터 선언
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.tolerance = tolerance
        self.verbose = verbose
        # print(len(self.user_id_index))
        
    # 테스트 데이터 세팅
    def set_test(self, ratings_test):                           # Setting test set
        test_set = []
        for i in range(len(ratings_test)):                      # Selected ratings
            # print(self.user_id_index)
            # print(self.user_id_index[4738])
            
            # 인덱스 확인해서 유저의 인덱스에 해당하는 것
            x = self.user_id_index[ratings_test.iloc[i,0]]
            
            # 인덱스 확인해서 book의 인덱스에 해당하는 것
            y = self.item_id_index[ratings_test.iloc[i,1]]
            
            # 점수
            z = ratings_test.iloc[i,2]
            
            # 테스트 데이터 
            test_set.append([x, y, z])
            
            # 테스트 데이터에 해당되는 원본데이터 점수 0점으로 만듬
            self.R[x, y] = 0                    
        self.test_set = test_set
        return test_set                         

    def test(self):                             # Training 하면서 test set의 정확도를 계산하는 메소드 
        # P,Q 선언
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # bias 선언
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # 비어있는값 삭제 -> bookratings는 이미 비어있는 값이 없으므로 사실상 필요 x
        rows, columns = self.R.nonzero()
        
        # 샘플값
        self.samples = [(i,j, self.R[i,j]) for i, j in zip(rows, columns)]

        
        # for문을 통해서 비교 후 rmse가 작은 값을 찾기 위해 큰값을 best RMSE에 선언
        best_RMSE = 10000
        
        # iteration확인하기 위한 선언
        best_iteration = 0
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)#샘플 셔플
            self.sgd()# sgd
            rmse1 = self.rmse()
            rmse2 = self.test_rmse() # test rmse
            training_process.append((i, rmse1, rmse2)) # 진행과정 확인위한 리스트
            
            # 출력되는부분
            if self.verbose:# default값이 True설정 False하면 안보임
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
            
            # rmse값이 더 좋으면 bestrmse에 반영
            if best_RMSE > rmse2:                      # New best record
                best_RMSE = rmse2
                best_iteration = i
            elif (rmse2 - best_RMSE) > self.tolerance: # 점수가 더 나아지지 않으면 break
                break
        print(best_iteration, best_RMSE) # 출력하는부분
        return training_process,best_RMSE # 평균값을 받기 위해 리턴값 하나 더 추가

    # sgd 함수
    def sgd(self):
        for i, j, r in self.samples: #샘플을 받아서 
            prediction = self.get_prediction(i, j) # 예측
            error = (r - prediction) #loss 평가
            self.b_u[i] += self.alpha * (error - self.beta * self.b_u[i]) # 파라미터값과 계산하여 가중치 변화
            self.b_d[j] += self.alpha * (error - self.beta * self.b_d[j]) # ==

            self.Q[j, :] += self.alpha * (error * self.P[i, :] - self.beta * self.Q[j,:]) # ==
            self.P[i, :] += self.alpha * (error * self.Q[j, :] - self.beta * self.P[i,:]) # ==

    # rmse 측정 학습 rmse를 평가하는 함수
    def rmse(self):
        rows, columns = self.R.nonzero()
        self.predictions = []
        self.errors = []
        for x, y in zip(rows, columns):
            prediction = self.get_prediction(x, y)
            self.predictions.append(prediction)
            self.errors.append(self.R[x, y] - prediction)
        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)
        return np.sqrt(np.mean(self.errors**2))

    # Test RMSE 계산하는 method 
    def test_rmse(self):
        error = 0
        for one_set in self.test_set:
            predicted = self.get_prediction(one_set[0], one_set[1])
            error += pow(one_set[2] - predicted, 2)
        return np.sqrt(error/len(self.test_set))

    # predict하는 함수
    def get_prediction(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    # predict하는 함수
    def get_one_prediction(self, user_id, isbn):
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[isbn])


# kfold이용해서 트레인 테스트 3개로 분리
kf = KFold(n_splits=3, shuffle=True)

# rmse리스트
average_rmse_values = []

# train index test index 무작위로 1/3 나눔
for train_index, test_index in kf.split(ratings):
    
    # print(train_index)
    # print(test_index)
    
    # Train + test index의 리스트를 각각받아 원본데이터를 나눔
    train_data = ratings.iloc[train_index]
    test_data = ratings.iloc[test_index]
    # print(test_data)
    
    # 피봇으로 만듬 fillna는 사실상 안쓰임 + user_id isbn 행렬을 만듬
    temp = ratings.pivot(index = 'user_id', columns ='isbn', values = 'rating').fillna(0)
    # mf = NEW_MF(temp, K=220, alpha=0.0014, beta=0.075, iterations=350, tolerance=0.0001, verbose=True)
    
    # 인스턴스생성 + 파라미터를 대입
    mf = NEW_MF(temp, K=30, alpha=0.01, beta=0.02, iterations=100, tolerance=0.01, verbose=True)
    
    # Test 데이터 세팅
    test_set = mf.set_test(test_data)
    
    # 추가 리턴을 받아서 rmse또한 반환
    result,rmse = mf.test()
    
    # 평균을 계산하기 위한 리스트
    average_rmse_values.append(rmse)

# 전체 폴드에 대한 RMSE 평균 계산 및 출력
average_rmse = np.mean(average_rmse_values)

#평균 RMSE
print("평균 RMSE:", average_rmse)


# Predictions using DL ###########################################################################
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.layers import Dropout, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adam, Adamax
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
# Variable 초기화 
K = 350                         # Latent factor 수 
lr = 0.01                           # 학습률
reg = 0.005                     # Regularization penalty

#tf RMSE계산
def RMSE(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# 모델 구조
def architecture(train_data, test_data,lr):
    # Keras model
    user = Input(shape=(1,))  # User input
    item = Input(shape=(1,))  # Item input
    P_embedding = Embedding(N, K, embeddings_regularizer=l2(reg))(user)  # (N, 1, K)
    Q_embedding = Embedding(M, K, embeddings_regularizer=l2(reg))(item)  # (M, 1, K)
    user_bias = Embedding(N, 1, embeddings_regularizer=l2(reg))(user)  # User bias term (N, 1, )
    item_bias = Embedding(M, 1, embeddings_regularizer=l2(reg))(item)  # Item bias term (M, 1, )

    # P,Q임베딩 및 bias 연결
    P_embedding = Flatten()(P_embedding)  # (K, )
    Q_embedding = Flatten()(Q_embedding)  # (K, )
    user_bias = Flatten()(user_bias)  # (1, )
    item_bias = Flatten()(item_bias)  # (1, )
    R = Concatenate()([P_embedding, Q_embedding, user_bias, item_bias])  # (2K + 2, )

    # 신경망 층 구성! swish 활성화함수 활용하여 3개층구성
    R = Dense(256)(R)
    R = Activation('swish')(R)
    R = Dense(128)(R)
    R = Activation('swish')(R)
    R = Dense(64)(R)
    R = Activation('swish')(R)
    R = Dense(1)(R)

    # 모델 선언
    model = Model(inputs=[user, item], outputs=R)
    
    # 모델 컴파일
    model.compile(
        loss=RMSE,
        #optimizer=SGD(lr=0.1, momentum=0.9),
        # adamax optimizer
        optimizer=Adamax(lr=lr),
        metrics=[RMSE]
    )

    #checkpoint경로 설정
    checkpoint_path = 'CheckPoint'
    
    #모델 체크포인트
    checkpoint = ModelCheckpoint(checkpoint_path,
                                save_best_only=True,
                                save_weights_only=True,
                                monitor='val_RMSE',
                                verbose=1)
    # 모델 학습 및 결과 도출
    result = model.fit(
        x=[train_data.user_id.values, train_data.isbn.values],
        y=train_data.rating.values - mu, # ratings에 평균을 빼서 정규화
        callbacks=[checkpoint], # 콜백함수 선언 tensorboard나 earlystopping등 많은 콜백함수 사용가능
        epochs=25,
        batch_size=128,
        validation_data=(
            [test_data.user_id.values, test_data.isbn.values],
            test_data.rating.values - mu
        )
    )
    # 모델 가중치 불러오기
    model.load_weights(checkpoint_path)
    
    # 모델과 결과 반환
    return model, result

# 사용자 수 및 도서 수
N = len(set(ratings.user_id)) + 1
M = len(set(ratings.isbn)) + 1


# kfold이용해서 트레인 테스트 3개로 분리
kf = KFold(n_splits=3, shuffle=True)

# rmse리스트
rmse_values = []

# for문
for train_indices, test_indices in kf.split(ratings):
    train_data = ratings.iloc[train_indices].reset_index(drop=True)
    test_data = ratings.iloc[test_indices].reset_index(drop=True)
    
    mu = train_data.rating.mean()    # 전체 평균 

    # 셔플
    train_data = shuffle(train_data)

    # 모델 및 결과
    model, result = architecture(train_data, test_data,lr)

    # 모델 평가
    predictions = model.predict([test_data.user_id.values, test_data.isbn.values]) + mu
    rmse = RMSE(test_data.rating.values, predictions)
    rmse_values.append(rmse)

# rmse 평균
average_rmse = np.mean(rmse_values)
print("평균 RMSE:", average_rmse)





# Hybrid recommendation ###########################################################################

# MF추천
def recommender0(recomm_list, mf):
    id_pairs = zip(recomm_list[:, 0], recomm_list[:, 1])
    recommendations = np.array([mf.get_one_prediction(user, isbn) for (user, isbn) in id_pairs])
    return recommendations


# 딥러닝 모델 추천
def recommender1(recomm_list, model):
    user_ids = recomm_list[:, 0]
    isbn = recomm_list[:, 1]
    recommendations = model.predict([user_ids, isbn]) + mu
    return recommendations


# kfold이용해서 train test 3개로 분리
kf = KFold(n_splits=3, shuffle=True)

# rmse리스트
rmse_values = []

# for문
for train_indices, test_indices in kf.split(ratings):
    train_data = ratings.iloc[train_indices].reset_index(drop=True)
    test_data = ratings.iloc[test_indices].reset_index(drop=True)
    
    # test 데이터 사용하여 평가
    recomm_list = np.array(test_data.iloc[:, [0, 1]])
    
    # mf결과
    result0 = recommender0(recomm_list, mf)
    
    # 딥러닝 결과
    result1 = np.ravel(recommender1(recomm_list, model))

    # 각각의 모델에 대한 가중치 선언
    weight = [0.5, 0.5]
    predictions = []
    for i, number in enumerate(result0):
        predictions.append(result0[i] * weight[0] + result1[i] * weight[1])
        
    # 각각의 모델에 대한 rmse 평가
    print(RMSE2(test_data['rating'], predictions))
    print(RMSE2(test_data['rating'], result0))
    print(RMSE2(test_data['rating'], result1))

    # 가중치 조절
    for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        weight = [i, 1 - i]
        predictions = []
        for i, number in enumerate(result0):
            predictions.append(result0[i] * weight[0] + result1[i] * weight[1])
        print("Weights - %.2f : %.2f ; RMSE = %.7f" % (weight[0], weight[1], RMSE2(test_data['rating'], predictions)))

    # 가중치 조절 2
    for i in [0.88, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]:
        weight = [i, 1 - i]
        predictions = []
        for i, number in enumerate(result0):
            predictions.append(result0[i] * weight[0] + result1[i] * weight[1])
        print("Weights - %.2f : %.2f ; RMSE = %.7f" % (weight[0], weight[1], RMSE2(test_data['rating'], predictions)))


Iteration: 10 ; Train RMSE = 1.397127 ; Test RMSE = 1.682567
Iteration: 20 ; Train RMSE = 1.189229 ; Test RMSE = 1.670560
Iteration: 30 ; Train RMSE = 0.978470 ; Test RMSE = 1.673958
Iteration: 40 ; Train RMSE = 0.779834 ; Test RMSE = 1.680920
20 1.6705488449249313
Iteration: 10 ; Train RMSE = 1.410151 ; Test RMSE = 1.648082
Iteration: 20 ; Train RMSE = 1.196928 ; Test RMSE = 1.637061
Iteration: 30 ; Train RMSE = 0.983315 ; Test RMSE = 1.640656
Iteration: 40 ; Train RMSE = 0.784239 ; Test RMSE = 1.647733
19 1.637060536177332
Iteration: 10 ; Train RMSE = 1.400140 ; Test RMSE = 1.670089
Iteration: 20 ; Train RMSE = 1.190195 ; Test RMSE = 1.658814
Iteration: 30 ; Train RMSE = 0.981148 ; Test RMSE = 1.661657
Iteration: 40 ; Train RMSE = 0.782620 ; Test RMSE = 1.668159




20 1.6587715442839177
평균 RMSE: 1.6554603084620603
Epoch 1/25
Epoch 1: val_RMSE improved from inf to 1.67890, saving model to CheckPoint
Epoch 2/25
Epoch 2: val_RMSE improved from 1.67890 to 1.65617, saving model to CheckPoint
Epoch 3/25
Epoch 3: val_RMSE did not improve from 1.65617
Epoch 4/25
Epoch 4: val_RMSE did not improve from 1.65617
Epoch 5/25
Epoch 5: val_RMSE did not improve from 1.65617
Epoch 6/25
Epoch 6: val_RMSE did not improve from 1.65617
Epoch 7/25
Epoch 7: val_RMSE did not improve from 1.65617
Epoch 8/25
Epoch 8: val_RMSE did not improve from 1.65617
Epoch 9/25
Epoch 9: val_RMSE did not improve from 1.65617
Epoch 10/25
Epoch 10: val_RMSE did not improve from 1.65617
Epoch 11/25
Epoch 11: val_RMSE did not improve from 1.65617
Epoch 12/25
Epoch 12: val_RMSE did not improve from 1.65617
Epoch 13/25
Epoch 13: val_RMSE did not improve from 1.65617
Epoch 14/25
Epoch 14: val_RMSE did not improve from 1.65617
Epoch 15/25
Epoch 15: val_RMSE did not improve from 1.65617
Epoch 16

2023-12-17 08:03:48.569164: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1121674248 exceeds 10% of free system memory.


Epoch 1/25
Epoch 1: val_RMSE improved from inf to 1.70096, saving model to CheckPoint
Epoch 2/25
Epoch 2: val_RMSE improved from 1.70096 to 1.67846, saving model to CheckPoint
Epoch 3/25
Epoch 3: val_RMSE did not improve from 1.67846
Epoch 4/25
Epoch 4: val_RMSE did not improve from 1.67846
Epoch 5/25
Epoch 5: val_RMSE did not improve from 1.67846
Epoch 6/25
Epoch 6: val_RMSE did not improve from 1.67846
Epoch 7/25
Epoch 7: val_RMSE did not improve from 1.67846
Epoch 8/25
Epoch 8: val_RMSE did not improve from 1.67846
Epoch 9/25
Epoch 9: val_RMSE did not improve from 1.67846
Epoch 10/25
Epoch 10: val_RMSE did not improve from 1.67846
Epoch 11/25
Epoch 11: val_RMSE did not improve from 1.67846
Epoch 12/25
Epoch 12: val_RMSE did not improve from 1.67846
Epoch 13/25
Epoch 13: val_RMSE did not improve from 1.67846
Epoch 14/25
Epoch 14: val_RMSE did not improve from 1.67846
Epoch 15/25
Epoch 15: val_RMSE did not improve from 1.67846
Epoch 16/25
Epoch 16: val_RMSE did not improve from 1.6784

2023-12-17 08:04:33.750147: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1121674248 exceeds 10% of free system memory.


Epoch 1/25
Epoch 1: val_RMSE improved from inf to 1.67368, saving model to CheckPoint
Epoch 2/25
Epoch 2: val_RMSE improved from 1.67368 to 1.64594, saving model to CheckPoint
Epoch 3/25
Epoch 3: val_RMSE did not improve from 1.64594
Epoch 4/25
Epoch 4: val_RMSE did not improve from 1.64594
Epoch 5/25
Epoch 5: val_RMSE did not improve from 1.64594
Epoch 6/25
Epoch 6: val_RMSE did not improve from 1.64594
Epoch 7/25
Epoch 7: val_RMSE did not improve from 1.64594
Epoch 8/25
Epoch 8: val_RMSE did not improve from 1.64594
Epoch 9/25
Epoch 9: val_RMSE did not improve from 1.64594
Epoch 10/25
Epoch 10: val_RMSE did not improve from 1.64594
Epoch 11/25
Epoch 11: val_RMSE did not improve from 1.64594
Epoch 12/25
Epoch 12: val_RMSE did not improve from 1.64594
Epoch 13/25
Epoch 13: val_RMSE did not improve from 1.64594
Epoch 14/25
Epoch 14: val_RMSE did not improve from 1.64594
Epoch 15/25
Epoch 15: val_RMSE did not improve from 1.64594
Epoch 16/25
Epoch 16: val_RMSE did not improve from 1.6459

In [None]:
### 최종
Weights - 0.96 : 0.04 ; RMSE = 1.1489783
Weights - 0.97 : 0.03 ; RMSE = 1.1518401
Weights - 0.96 : 0.04 ; RMSE = 1.1418288

In [12]:
RMSE1 = 1.1489783
RMSE2= 1.1518401
RMSE3= 1.1418288
np.mean([RMSE1,RMSE2,RMSE3])

1.1475490666666666