In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
from sklearn.model_selection import train_test_split

In [2]:
# Load the ratings dataset and drop the timestamp column right away
ratings = (
    pd.read_csv("dataset/ratings.csv")
    .drop(columns=["timestamp"])
)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,16,4.0
1,1,24,1.5
2,1,32,4.0
3,1,47,4.0
4,1,50,4.0


In [3]:
# Load the movies dataset, keeping just the id and title columns
movies = pd.read_csv("dataset/movies.csv", encoding="latin-1")
movies = movies[["movieId", "title"]]
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
# Attach movie titles to the ratings through their shared movieId column
mrgd_ratings = pd.merge(ratings, movies, on="movieId")
mrgd_ratings.shape

(105339, 4)

In [5]:
# Keep only the first rating for each (userId, movieId) pair; DataFrame.pivot
# needs unique index/column combinations. Reassign instead of using
# inplace=True so the frame produced by the merge cell is not mutated behind
# the reader's back.
mrgd_ratings = mrgd_ratings.drop_duplicates(subset=["userId", "movieId"], keep="first")
mrgd_ratings.shape

(105339, 4)

In [34]:
# Reshape the long ratings table into a userId x movieId matrix;
# pairs without a rating are filled with 0
user_movie_matrix = mrgd_ratings.pivot(
    index="userId",
    columns="movieId",
    values="rating",
)
user_movie_matrix = user_movie_matrix.fillna(0)
user_movie_matrix.shape

(668, 10325)

In [14]:
def get_rated_indices(R:np.array) -> list:
    """Return the (row, column) positions of every strictly positive entry of ``R``.

    ``R`` is expected to be a 2-D rating matrix in which 0 stands for a
    missing rating. Entries that are not greater than 0 are skipped.

    Returns:
        A list of ``(user_idx, item_idx)`` tuples in row-major order.
    """
    rows, cols = np.nonzero(R > 0)
    return [(row, col) for row, col in zip(rows, cols)]

def _copy_entries(R: np.ndarray, index_pairs) -> np.ndarray:
    """Return a zero matrix shaped like ``R`` holding only ``R``'s values at ``index_pairs``."""
    out = np.zeros(R.shape)
    # reshape(-1, 2) keeps an empty collection of pairs addressable as two columns
    pairs = np.asarray(index_pairs, dtype=np.intp).reshape(-1, 2)
    rows, cols = pairs[:, 0], pairs[:, 1]
    out[rows, cols] = R[rows, cols]
    return out


def get_train_test_datasets(ratings: pd.DataFrame, test_size: float = 0.2, random_state=42):
    """Split the known ratings of a user-item matrix into train and test matrices.

    Only entries selected by ``get_rated_indices`` (values greater than 0) are
    split; every other cell stays 0 in both returned matrices.

    Args:
        ratings: User-item rating matrix as a DataFrame (0 marks a missing rating).
        test_size: Passed to ``train_test_split``; share of known ratings held out.
            Defaults to 0.2, the value that used to be hard-coded.
        random_state: Passed to ``train_test_split`` to make the split repeatable.
            Defaults to 42, the value that used to be hard-coded.

    Returns:
        ``(R_train, R_test, train_indices, test_indices)`` where the two matrices
        have the same shape as ``ratings`` and contain only the ratings at the
        train / test positions, and the two index collections are the
        ``(row, column)`` pairs produced by ``train_test_split``.
    """
    R = ratings.to_numpy()
    indices = get_rated_indices(R)
    train_indices, test_indices = train_test_split(
        indices, test_size=test_size, random_state=random_state
    )

    # Build train and test matrices with a single vectorised assignment each
    # instead of a Python-level loop over every rating.
    R_train = _copy_entries(R, train_indices)
    R_test = _copy_entries(R, test_indices)
    return R_train, R_test, train_indices, test_indices

In [None]:

def train(R_train: np.ndarray, k: int, alpha: float, lambda_reg: float, epochs: int,
          seed=None, verbose: bool = True) -> np.ndarray:
    """Factorize a rating matrix as ``U @ V.T`` with SGD and return the dense reconstruction.

    Only entries of ``R_train`` that are greater than 0 are treated as observed
    ratings and contribute to the updates.

    Args:
        R_train: 2-D matrix (rows = users, columns = items), 0 for missing ratings.
        k: Number of latent factors.
        alpha: SGD learning rate.
        lambda_reg: L2 regularization strength applied to both factor matrices.
        epochs: Number of full passes over the observed ratings.
        seed: Optional seed for the factor initialization. ``None`` (default)
            draws fresh entropy, so results differ between runs; pass an int
            for reproducible output. Initialization uses a local
            ``np.random.default_rng`` generator, not NumPy's global random state.
        verbose: When True (default) print the reconstructed matrix, as the
            function has always done.

    Returns:
        Array of shape ``R_train.shape`` with the predicted rating for every
        (user, item) pair.
    """
    num_users, num_items = R_train.shape  # Rows (Users) & Columns (Movies)

    # Initialize U and V with uniform values in [0, 1) from a local generator
    rng = np.random.default_rng(seed)
    U = rng.random((num_users, k))
    V = rng.random((num_items, k))

    # Observed (user, item) positions in row-major order; tolist() yields plain
    # Python ints so the hot loop does not index with NumPy scalars.
    rated_indices = np.argwhere(R_train > 0).tolist()

    # SGD Optimization
    for _ in range(epochs):
        for u, i in rated_indices:
            # Snapshot the user factors: both updates must be computed from the
            # values used for the prediction, otherwise the item update would
            # see the already-modified user row.
            user_old = U[u].copy()
            error = R_train[u, i] - np.dot(user_old, V[i])

            U[u] += alpha * (error * V[i] - lambda_reg * user_old)
            V[i] += alpha * (error * user_old - lambda_reg * V[i])

    # Reconstructed rating matrix
    predicted_R = np.dot(U, V.T)
    if verbose:
        print("Predicted Ratings:\n", predicted_R)
    return predicted_R


In [37]:
# Split the known ratings into train/test matrices, then fit a 2-factor model with SGD
R_train, R_test, train_indices, test_indices = get_train_test_datasets(user_movie_matrix)
predicted_R = train(
    R_train,
    k=2,
    alpha=0.03,
    lambda_reg=0.1,
    epochs=500,
)


Predicted Ratings:
 [[3.90979169 3.05038923 3.14633893 ... 3.37489547 4.50817003 4.52722276]
 [4.0328891  3.25436472 3.29930002 ... 3.10881287 4.39023067 4.22310749]
 [3.90919969 3.04087699 3.14134299 ... 3.40560493 4.52927794 4.56398885]
 ...
 [3.66872293 2.7212047  2.88187873 ... 3.65356623 4.56994306 4.83199308]
 [4.08780754 3.20129907 3.2955983  ... 3.48708129 4.68448223 4.68359702]
 [3.24931967 2.61992977 2.65720009 ... 2.51213016 3.54235707 3.41138875]]


In [33]:
# Peek at the predicted ratings for the first four users (rows)
predicted_R[:4]

array([[3.88067462, 3.11466442, 3.13375659, ..., 3.31934394, 4.60035423,
        4.43590391],
       [4.04817475, 3.48887752, 3.53084946, ..., 3.60287385, 4.6991131 ,
        4.79687815],
       [3.89009086, 3.15578851, 3.17801461, ..., 3.34703307, 4.59754497,
        4.47039714],
       [4.04020012, 3.3519924 , 3.38192253, ..., 3.51972504, 4.74397257,
        4.69551682]], shape=(4, 10325))

In [38]:
# Score the model on the held-out (user, movie) positions only
true_values = [R_test[i, j] for i, j in test_indices]
pred_values = [predicted_R[i, j] for i, j in test_indices]

mse = mean_squared_error(true_values, pred_values)
rmse = sqrt(mse)
mae = mean_absolute_error(true_values, pred_values)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

RMSE: 0.9981
MAE: 0.7551
