In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

ratings = pd.read_csv('/content/drive/MyDrive/개인화추천시스템/추천시스템 Termproject/BX-Book-Ratings.csv')

ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)
ratings.columns=['user_id','isbn','rating']
ratings=ratings[ratings['rating']!=0]

In [None]:
from sklearn.utils import shuffle

# User encoding
user_dict = {}
for i in set(ratings['user_id']):
    user_dict[i] = len(user_dict)
n_user = len(user_dict)
# Item encoding
item_dict = {}
start_point = n_user
for i in set(ratings['isbn']):
    item_dict[i] = start_point + len(item_dict)
n_item = len(item_dict)
start_point += n_item
num_x = start_point               # Total number of x
x = shuffle(ratings, random_state=12)

# Generate X data
data = []
y = []
w0 = np.mean(x['rating'])
for i in range(len(ratings)):
    case = x.iloc[i]
    x_index = []
    x_value = []
    x_index.append(user_dict[case['user_id']])     # User id encoding
    x_value.append(1)
    x_index.append(item_dict[case['isbn']])    # Movie id encoding
    x_value.append(1)
    data.append([x_index, x_value])
    y.append(case['rating'] - w0)
    if (i % 10000) == 0:
        print('Encoding ', i, ' cases...')

def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

Encoding  0  cases...
Encoding  10000  cases...
Encoding  20000  cases...
Encoding  30000  cases...


In [None]:
class FM():
    def __init__(self, N, K, data, y, alpha, beta, train_ratio=0.7, iterations=100, tolerance=0.005, l2_reg=True, verbose=True):
        self.K = K          # Number of latent factors
        self.N = N          # Number of x (variables)
        self.n_cases = len(data)            # N of observations
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.l2_reg = l2_reg
        self.tolerance = tolerance
        self.verbose = verbose
        # w 초기화
        self.w = np.random.normal(scale=1./self.N, size=(self.N))
        # v 초기화
        self.v = np.random.normal(scale=1./self.K, size=(self.N, self.K))
        # Train/Test 분리
        cutoff = int(train_ratio * len(data))
        self.train_x = data[:cutoff]
        self.test_x = data[cutoff:]
        self.train_y = y[:cutoff]
        self.test_y = y[cutoff:]

    def test(self):                                     # Training 하면서 RMSE 계산
        # SGD를 iterations 숫자만큼 수행
        best_RMSE = 10000
        best_iteration = 0
        training_process = []
        for i in range(self.iterations):
            rmse1 = self.sgd(self.train_x, self.train_y)        # SGD & Train RMSE 계산
            rmse2 = self.test_rmse(self.test_x, self.test_y)    # Test RMSE 계산
            training_process.append((i, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
            if best_RMSE > rmse2:                       # New best record
                best_RMSE = rmse2
                best_iteration = i
            elif (rmse2 - best_RMSE) > self.tolerance:  # RMSE is increasing over tolerance
                break
        print(best_iteration, best_RMSE)
        return training_process

    # w, v 업데이트를 위한 Stochastic gradient descent
    def sgd(self, x_data, y_data):
        y_pred = []
        for data, y in zip(x_data, y_data):
            x_idx = data[0]
            x_0 = np.array(data[1])     # xi axis=0 [1, 2, 3]
            x_1 = x_0.reshape(-1, 1)    # xi axis=1 [[1], [2], [3]]

            # biases
            bias_score = np.sum(self.w[x_idx] * x_0)

            # score 계산
            vx = self.v[x_idx] * (x_1)          # v matrix * x
            sum_vx = np.sum(vx, axis=0)         # sigma(vx)
            sum_vx_2 = np.sum(vx * vx, axis=0)  # ( v matrix * x )의 제곱
            latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)

            # 예측값 계산
            y_hat = bias_score + latent_score
            y_pred.append(y_hat)
            error = y - y_hat
            # w, v 업데이트
            if self.l2_reg:     # regularization이 있는 경우
                self.w[x_idx] += error * self.alpha * (x_0 - self.beta * self.w[x_idx])
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx * x_1) - self.beta * self.v[x_idx])
            else:               # regularization이 없는 경우
                self.w[x_idx] += error * self.alpha * x_0
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx * x_1))
        return RMSE(y_data, y_pred)

    def test_rmse(self, x_data, y_data):
        y_pred = []
        for data , y in zip(x_data, y_data):
            y_hat = self.predict(data[0], data[1])
            y_pred.append(y_hat)
        return RMSE(y_data, y_pred)

    def predict(self, idx, x):
        x_0 = np.array(x)
        x_1 = x_0.reshape(-1, 1)

        # biases
        bias_score = np.sum(self.w[idx] * x_0)

        # score 계산
        vx = self.v[idx] * (x_1)
        sum_vx = np.sum(vx, axis=0)
        sum_vx_2 = np.sum(vx * vx, axis=0)
        latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)

        # 예측값 계산
        y_hat = bias_score + latent_score
        return y_hat

    def predict_one(self, user_id, movie_id):
        x_idx = np.array([user_dict[user_id], item_dict[movie_id]])
        x_data = np.array([1, 1])
        return self.predict(x_idx, x_data) + w0

In [9]:
K = 30
# Define the number of folds
num_folds = 3
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize a list to store the RMSE for each fold
cv_rmse_scores = []

# Perform cross-validation
for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
    print(f"\nFold {fold + 1}/{num_folds}")
    # Convert indices to numpy arrays
    train_idx, val_idx = np.array(train_idx), np.array(val_idx)

    # Split the data into train and validation sets
    train_data, val_data = np.array(data)[train_idx], np.array(data)[val_idx]
    train_y, val_y = np.array(y)[train_idx], np.array(y)[val_idx]

    # Reset the model for each fold
    fm_model = FM(num_x, K, train_data, train_y, alpha=0.002, beta=0.01, train_ratio=0.7, iterations=600, tolerance=0.0005, l2_reg=True, verbose=True)

    # Train the model on the training set
    training_process = fm_model.test()

    # Test the model on the validation set
    val_rmse = fm_model.test_rmse(val_data, val_y)
    print(f"Validation RMSE for Fold {fold + 1}: {val_rmse}")

    # Store the RMSE for this fold
    cv_rmse_scores.append(val_rmse)

# Calculate and print the average RMSE across all folds
average_rmse = np.mean(cv_rmse_scores)
print(f"\nAverage RMSE across K={K}: {average_rmse}")


Fold 1/3
Iteration: 10 ; Train RMSE = 1.694849 ; Test RMSE = 1.758323
Iteration: 20 ; Train RMSE = 1.600693 ; Test RMSE = 1.726475
Iteration: 30 ; Train RMSE = 1.527200 ; Test RMSE = 1.707665
Iteration: 40 ; Train RMSE = 1.465229 ; Test RMSE = 1.695346
Iteration: 50 ; Train RMSE = 1.410686 ; Test RMSE = 1.686932
Iteration: 60 ; Train RMSE = 1.361139 ; Test RMSE = 1.681084
Iteration: 70 ; Train RMSE = 1.314866 ; Test RMSE = 1.677009
Iteration: 80 ; Train RMSE = 1.270519 ; Test RMSE = 1.674198
Iteration: 90 ; Train RMSE = 1.227027 ; Test RMSE = 1.672310
Iteration: 100 ; Train RMSE = 1.183649 ; Test RMSE = 1.671111
Iteration: 110 ; Train RMSE = 1.140044 ; Test RMSE = 1.670436
Iteration: 120 ; Train RMSE = 1.096215 ; Test RMSE = 1.670174
Iteration: 130 ; Train RMSE = 1.052369 ; Test RMSE = 1.670243
Iteration: 140 ; Train RMSE = 1.008809 ; Test RMSE = 1.670589
122 1.6701626078426481
Validation RMSE for Fold 1: 1.673616981838154

Fold 2/3
Iteration: 10 ; Train RMSE = 1.701432 ; Test RMSE = 

In [None]:
from sklearn.model_selection import KFold

K_list=[50,100,150,200,250,300,350,400]
    # Assume data and y are your input data and target variable
    # Create an instance of your FM model (assuming the class is named FM)
for k in K_list:
    K = k
    # Define the number of folds
    num_folds = 3
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Initialize a list to store the RMSE for each fold
    cv_rmse_scores = []

    # Perform cross-validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
        print(f"\nFold {fold + 1}/{num_folds}")
        # Convert indices to numpy arrays
        train_idx, val_idx = np.array(train_idx), np.array(val_idx)

        # Split the data into train and validation sets
        train_data, val_data = np.array(data)[train_idx], np.array(data)[val_idx]
        train_y, val_y = np.array(y)[train_idx], np.array(y)[val_idx]

        # Reset the model for each fold
        fm_model = FM(num_x, K, train_data, train_y, alpha=0.002, beta=0.01, train_ratio=0.7, iterations=600, tolerance=0.0005, l2_reg=True, verbose=True)

        # Train the model on the training set
        training_process = fm_model.test()

        # Test the model on the validation set
        val_rmse = fm_model.test_rmse(val_data, val_y)
        print(f"Validation RMSE for Fold {fold + 1}: {val_rmse}")

        # Store the RMSE for this fold
        cv_rmse_scores.append(val_rmse)

    # Calculate and print the average RMSE across all folds
    average_rmse = np.mean(cv_rmse_scores)
    print(f"\nAverage RMSE across K={K}: {average_rmse}")


Fold 1/3
Iteration: 10 ; Train RMSE = 1.696594 ; Test RMSE = 1.758256
Iteration: 20 ; Train RMSE = 1.604366 ; Test RMSE = 1.726372
Iteration: 30 ; Train RMSE = 1.532858 ; Test RMSE = 1.707537
Iteration: 40 ; Train RMSE = 1.473071 ; Test RMSE = 1.695205
Iteration: 50 ; Train RMSE = 1.421057 ; Test RMSE = 1.686793
Iteration: 60 ; Train RMSE = 1.374543 ; Test RMSE = 1.680966
Iteration: 70 ; Train RMSE = 1.331984 ; Test RMSE = 1.676929
Iteration: 80 ; Train RMSE = 1.292198 ; Test RMSE = 1.674176
Iteration: 90 ; Train RMSE = 1.254219 ; Test RMSE = 1.672365
Iteration: 100 ; Train RMSE = 1.217270 ; Test RMSE = 1.671260
Iteration: 110 ; Train RMSE = 1.180791 ; Test RMSE = 1.670693
Iteration: 120 ; Train RMSE = 1.144414 ; Test RMSE = 1.670543
Iteration: 130 ; Train RMSE = 1.107897 ; Test RMSE = 1.670719
118 1.6705418578342441
Validation RMSE for Fold 1: 1.6732766107881087

Fold 2/3
Iteration: 10 ; Train RMSE = 1.703128 ; Test RMSE = 1.757817
Iteration: 20 ; Train RMSE = 1.610869 ; Test RMSE = 