### Matrix Factorization Used for Recommendation

In [1]:
import pandas as pd
import numpy as np

# Show up to 120 characters per cell so long text columns are readable
pd.options.display.max_colwidth = 120

In [2]:
# Read the jokes file, keeping only column 1; header=None means the first row is data, not column names
jokes = pd.read_csv(
    "http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/jokes.csv",
    header=None,
    usecols=[1],
)
jokes.head()

Unnamed: 0,1
0,"A man visits the doctor. The doctor says ""I have bad news for you.You have cancer and Alzheimer's disease"". The man ..."
1,This couple had an excellent relationship going until one day he came home from work to find his girlfriend packing....
2,Q. What's 200 feet long and has 4 teeth? A. The front row at a Willie Nelson Concert.
3,Q. What's the difference between a man and a toilet? A. A toilet doesn't follow you around after you use it.
4,Q. What's O. J. Simpson's Internet address? A.\tSlash slash backslash slash slash escape.


In [3]:
# Read the ratings file; header=None means the first row is data, not column names
jokes_ratings = pd.read_csv(
    "http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/modified_jester_data.csv",
    header=None,
)
jokes_ratings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,3.18,19.79,1.34,2.84,3.48,2.5,1.15,15.17,2.02,6.24,...,13.82,0.0,0.0,0.0,0.0,0.0,5.37,0.0,0.0,0.0
1,15.08,10.71,17.36,15.37,8.62,1.34,10.27,5.66,19.88,20.22,...,13.82,6.05,10.71,18.86,10.81,8.86,14.06,11.34,6.68,12.07
2,0.0,0.0,0.0,0.0,20.03,20.27,20.03,20.27,0.0,0.0,...,0.0,0.0,0.0,20.08,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,19.35,0.0,0.0,12.8,19.16,8.18,17.21,0.0,12.84,...,0.0,0.0,0.0,11.53,0.0,0.0,0.0,0.0,0.0,0.0
4,19.5,15.61,6.83,5.61,12.36,12.6,18.04,15.61,10.56,16.73,...,16.19,16.58,15.27,16.19,16.73,12.55,14.11,17.55,12.8,12.6


---
Factorize the Ratings Matrix

In [4]:
def matrix_factorization(R, K, U, I, steps=100, alpha=0.0002, beta=0.02):

    """
    Factorize a rating matrix into a user-factor matrix and an item-factor matrix.

    The factors are fitted with per-rating gradient descent updates. Only entries
    of R that are strictly positive are treated as observed ratings; every other
    entry is ignored during both training and error computation.

    The input arrays are never modified: the function works on float copies.

    Parameters:
    R : array-like
        User x Item rating matrix (m x n)
    K : int
        The number of latent factors (features)
    U : array-like
        Initial User-factor matrix (m x K)
    I : array-like
        Initial Item-factor matrix (n x K)
    steps : int, optional
        The number of epochs in gradient descent (default 100)
    alpha : float, optional
        The learning rate for gradient descent (default 0.0002)
    beta : float, optional
        The regularization coefficient (default 0.02)

    Returns:
    U : np.array
        The fitted user-factor matrix (m x K)
    I : np.array
        The fitted item-factor matrix (n x K)

    Raises:
    ValueError
        If R is not 2-dimensional, or if U / I do not have shapes (m x K) / (n x K)
    """

    # Work on float copies so the caller's arrays are left untouched and
    # integer-typed initial factors are not truncated by the in-place updates
    R = np.asarray(R, dtype=float)
    U = np.array(U, dtype=float)
    I = np.array(I, dtype=float).T # Transposed item factors (K x n)

    if R.ndim != 2:
        raise ValueError("R must be a 2-dimensional (users x items) matrix")

    n_users, n_items = R.shape
    if U.shape != (n_users, K) or I.shape != (K, n_items):
        raise ValueError(
            "Shape mismatch: expected U to be (%d, %d) and I to be (%d, %d)"
            % (n_users, K, n_items, K)
        )

    # The set of observed ratings never changes, so compute it once.
    # np.argwhere returns the (user, item) pairs in row-major order,
    # i.e. the same order in which nested user/item loops would visit them.
    rated_mask = R > 0
    rated_pairs = np.argwhere(rated_mask)
    ratings_per_user = rated_mask.sum(axis=1)
    ratings_per_item = rated_mask.sum(axis=0)

    # Gradient descent process
    for step in range(steps):
        for i, j in rated_pairs:
            eij = R[i, j] - np.dot(U[i, :], I[:, j]) # Error for this observed rating

            # Update all K features at once based on the partial derivatives.
            # The item update deliberately uses the user row that was just updated
            # (sequential update), matching the element-by-element formulation.
            U[i, :] += alpha * (2 * eij * I[:, j] - beta * U[i, :])
            I[:, j] += alpha * (2 * eij * U[i, :] - beta * I[:, j])

        # Calculate the estimated rating matrix
        eR = np.dot(U, I)

        # Squared error over the observed ratings only
        e = np.sum((R - eR)[rated_mask] ** 2)

        # Regularization: each observed rating (i, j) contributes
        # (beta / 2) * (||U[i]||^2 + ||I[:, j]||^2), so weight the squared
        # norms by how many observed ratings each user / item has
        e += (beta / 2) * (
            np.dot(ratings_per_user, np.sum(U ** 2, axis=1))
            + np.dot(ratings_per_item, np.sum(I ** 2, axis=0))
        )

        # If the error is sufficiently low, stop the process
        if e < 0.001:
            break

        print("Step %d of %d; Error: %0.5f" %(step + 1, steps, e))

    return U, I.T

In [5]:
# Setup parameters for matrix factorization
SEED = 42 # Fixed seed so the random initial factors (and every result below) are reproducible
rng = np.random.default_rng(SEED)

jokes_ratings = np.asarray(jokes_ratings) # Convert to a NumPy array (no-op if this cell is re-run)
M, N = jokes_ratings.shape # Number of users, number of items
K = 5 # Number of latent factors
U = rng.random((M, K)) # Initial user-factor matrix, uniform in [0, 1)
I = rng.random((N, K)) # Initial item-factor matrix, uniform in [0, 1)
steps = 100 # Number of epochs
alpha = 0.0002 # Learning rate
beta = 0.02 # Regularization coefficient

# Perform matrix factorization
user_feature_matrix, item_feature_matrix = matrix_factorization(jokes_ratings, K, U, I, steps, alpha, beta)

Step 1 of 100; Error: 2812099.55887
Step 2 of 100; Error: 1502100.35866
Step 3 of 100; Error: 1385831.48311
Step 4 of 100; Error: 1348421.59563
Step 5 of 100; Error: 1335395.31451
Step 6 of 100; Error: 1330332.96623
Step 7 of 100; Error: 1328049.54156
Step 8 of 100; Error: 1326769.22334
Step 9 of 100; Error: 1325848.60171
Step 10 of 100; Error: 1325043.38310
Step 11 of 100; Error: 1324254.35102
Step 12 of 100; Error: 1323435.85348
Step 13 of 100; Error: 1322561.94392
Step 14 of 100; Error: 1321613.52891
Step 15 of 100; Error: 1320573.32261
Step 16 of 100; Error: 1319423.75608
Step 17 of 100; Error: 1318146.03332
Step 18 of 100; Error: 1316719.65747
Step 19 of 100; Error: 1315122.18004
Step 20 of 100; Error: 1313329.09426
Step 21 of 100; Error: 1311313.86299
Step 22 of 100; Error: 1309048.10443
Step 23 of 100; Error: 1306501.97517
Step 24 of 100; Error: 1303644.79919
Step 25 of 100; Error: 1300445.99279
Step 26 of 100; Error: 1296876.32792
Step 27 of 100; Error: 1292909.55675
Step 28 of

In [6]:
# Print both factor matrices, each under its own heading, separated by a blank line
print(
    "User-Feature Matrix:",
    user_feature_matrix,
    "",
    "Item-Feature Matrix:",
    item_feature_matrix,
    sep="\n",
)

User-Feature Matrix:
[[-1.22803194  0.97686363  1.08876602  0.28589445  1.96188288]
 [ 2.55256006  1.12984977  0.11500366  1.99362346 -0.30369749]
 [ 2.04544408  1.33757782  1.4038686   1.60131708  0.99202955]
 ...
 [ 0.50522787  0.23020106  1.33798395  1.34845861  1.08105756]
 [ 0.40894675  0.1313441   0.41518231  0.56936115  0.47871442]
 [ 1.34134832  1.0239407   1.39811196  0.90265406  0.84860658]]

Item-Feature Matrix:
[[ 1.56590498e+00  9.37795705e-01  3.10816689e+00  5.47824962e+00
   1.23609607e+00]
 [ 1.01304892e+00  1.17108665e+00  1.63913654e+00  4.85647463e+00
   3.21418749e+00]
 [ 1.89701334e+00  3.05782922e+00  1.25760566e+00  4.46618074e+00
   1.09149011e+00]
 [ 3.83842120e+00  2.14725103e+00 -2.67718313e-01  2.90151624e+00
   1.42299148e+00]
 [ 2.57085568e+00  1.44783319e+00  2.03662835e+00  2.77703229e+00
   2.51218634e+00]
 [ 3.27152060e+00  3.11197667e+00  1.36343663e+00  1.47501112e+00
   3.77380028e+00]
 [ 3.24180177e+00 -1.02559572e+00  4.40586957e+00  3.27394548e+

In [7]:
# Define the output directory
# (raw string: backslashes are written once; doubling them inside r"..." would put literal "\\" in the path)
output_directory = r"C:\Users\wodnj\OneDrive\바탕 화면\Programming Machine Learning Applications\DSC 478 - Week 8\Data File"

# Define the file paths
output_path_user_feature_matrix = f"{output_directory}\\user_feature_matrix.csv"
output_path_item_feature_matrix = f"{output_directory}\\item_feature_matrix.csv"

# Open the output files and save the matrices as comma-separated values with 4 decimal places
with open(output_path_user_feature_matrix, "w") as output_user_feature_matrix:
    np.savetxt(output_user_feature_matrix, user_feature_matrix, delimiter = ',', fmt = '%1.4f')

with open(output_path_item_feature_matrix, "w") as output_item_feature_matrix:
    np.savetxt(output_item_feature_matrix, item_feature_matrix, delimiter = ',', fmt = '%1.4f')

In [8]:
# Predict one rating: dot product of a single user's factors with a single item's factors
user_factors = user_feature_matrix[979]
item_factors = item_feature_matrix[9] # Indexing a single row gives a 1-D vector, so no transpose is needed
predicted_rating = np.dot(user_factors, item_factors)
print(predicted_rating)

11.663511545775927


In [9]:
# Predict the ratings for every user-item pair at once (result is users x items)
predicted_ratings = user_feature_matrix.dot(item_feature_matrix.T)
print(predicted_ratings[0:5])

[[ 6.36846059  9.37887027  5.44495956  0.71365502  6.19720795  8.33236018
   2.67570466  8.02045062  3.95219286  7.13979809  8.15004386  9.04866521
   4.22348867 11.5673204   2.33720923  1.92094432  4.91969403  8.94874235
   3.89016067  4.65962869 10.0253366   6.64514496  5.56475487 -0.18981323
   6.49073211 11.76886417 13.39184747 10.91900499 14.6584166   2.43909717
  11.7978888  12.88257408  2.37946594  8.49136562 14.57855636 13.12299212
   1.98656768  9.94684319  6.05255733  3.4389653   2.31292615 10.51135483
   1.42853963  0.55845448  7.02936815 13.31182283 12.82541124  9.82889776
  13.63939874 13.8862184   3.71200707  3.55330784 14.06373971 16.88822302
   7.79750409 13.47892343 -0.19388187 -1.61705448  4.30819052  3.11371375
  12.68350081 14.23888978  8.55484171  3.2756878  15.08038418 14.26818294
   1.52103728 14.1145792  14.0949411   7.34338385  6.42648009 10.11263854
   1.9358573   1.11824095  2.06992844  7.06794583  2.40676468  3.81042129
   2.69591233  3.68659346  4.18487564 

In [10]:
# Define the output directory
# (raw string: backslashes are written once; doubling them inside r"..." would put literal "\\" in the path)
output_directory = r"C:\Users\wodnj\OneDrive\바탕 화면\Programming Machine Learning Applications\DSC 478 - Week 8\Data File"

# Define the file path
output_path_predicted_ratings = f"{output_directory}\\predicted_ratings.csv"

# Open the output file and save the predicted ratings as comma-separated values with 4 decimal places
with open(output_path_predicted_ratings, "w") as output_predicted_ratings:
    np.savetxt(output_predicted_ratings, predicted_ratings, delimiter = ',', fmt = '%1.4f')

---
Evaluate the performance of the algorithm

In [11]:
# Overall Mean Absolute Error over the observed ratings only.
# As in the factorization step, a rating counts as observed when it is strictly positive.
rated_mask = jokes_ratings > 0

# Predicted rating for every user-item pair (users x items)
all_predictions = np.dot(user_feature_matrix, item_feature_matrix.T)

# Absolute prediction error, restricted to the items each user has actually rated
absolute_errors = np.abs(all_predictions - jokes_ratings)[rated_mask]

# Total number of observed ratings and total absolute error across all users
total_count = int(rated_mask.sum())
total_error = float(absolute_errors.sum())

if total_count == 0:
    # Without any observed rating the mean is undefined; report it instead of dividing by zero
    print("Overall Mean Absolute Error is undefined: no rated items were found")
else:
    print("Overall Mean Absolute Error = %0.3f" %(total_error / total_count))

Overall Mean Absolute Error = 2.966
