In [2]:
import pickle
import lzma

# Restore the cached rating lists. Each archive is an xz-compressed pickle
# that unpacks into exactly two objects, bound here as (train, test).
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# open archives that were produced locally / come from a trusted source.
with lzma.open('movie_data_movielens.xz', 'rb') as movie_archive:
    movie_splits = pickle.load(movie_archive)
train_movie_ratings_list, test_movie_ratings_list = movie_splits

with lzma.open('user_data_movielens.xz', 'rb') as user_archive:
    user_splits = pickle.load(user_archive)
train_user_ratings_list, test_user_ratings_list = user_splits

In [5]:
!wget 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
!unzip 'ml-latest-small.zip'

--2024-05-03 16:37:46--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.1’


2024-05-03 16:37:48 (780 KB/s) - ‘ml-latest-small.zip.1’ saved [978202/978202]

Archive:  ml-latest-small.zip
replace ml-latest-small/links.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [6]:
import pandas as pd
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')

# Pull the three columns out as flat NumPy arrays with explicit dtypes.
users, movies, ratings = (
    ratings_df[column].to_numpy(dtype=dtype)
    for column, dtype in (('userId', int), ('movieId', int), ('rating', float))
)

In [7]:
import numpy as np
rng = np.random.default_rng(2024)

# Two-way lookup tables between raw ids and dense 0-based indices.
# Indices are handed out in order of first appearance in the ratings arrays.
userid_to_index = {}
index_to_userid = []
movieid_to_index = {}
index_to_movieid = []

# One bucket per user / per movie. A user's bucket collects
# (movie_index, rating) tuples; a movie's bucket collects (user_index, rating).
train_user_ratings_list = []
train_movie_ratings_list = []

# The random 80/20 train/test split that used `rng` is disabled here, so
# every rating is placed in the training buckets and no test buckets are built.
for (user, movie, rating) in zip(users, movies, ratings):
  # Register a movie the first time it is seen.
  if movie not in movieid_to_index:
    movieid_to_index[movie] = len(movieid_to_index)
    index_to_movieid.append(movie)
    train_movie_ratings_list.append([])
  movie_index = movieid_to_index[movie]

  # Register a user the first time it is seen.
  if user not in userid_to_index:
    userid_to_index[user] = len(userid_to_index)
    index_to_userid.append(user)
    train_user_ratings_list.append([])
  user_index = userid_to_index[user]

  train_user_ratings_list[user_index].append((movie_index, rating))
  train_movie_ratings_list[movie_index].append((user_index, rating))

In [41]:
# Problem size: one row per user bucket (M) and per movie bucket (N).
M = len(train_user_ratings_list)
N = len(train_movie_ratings_list)

# Biases start at zero.
user_biases = np.zeros(M)
item_biases = np.zeros(N)

# Hyperparameters
n_epochs = 100
lmb = 1       # weight on the squared rating residuals
gamma = 1e-3  # L2 penalty on the user/item biases
tau = 5e-2    # L2 penalty on the latent factors
k = 5         # latent dimension

# Draw the initial factors from a dedicated, seeded generator rather than the
# unseeded global np.random state, so re-running this cell always yields the
# same starting point.
init_rng = np.random.default_rng(2024)
U = init_rng.normal(loc=0.0, scale=1/np.sqrt(k), size=(M, k))
V = init_rng.normal(loc=0.0, scale=1/np.sqrt(k), size=(N, k))

# Per-epoch history, appended to by the training loop.
train_losses = []
train_rmses = []
test_losses = []
test_rmses = []

In [43]:
def _as_pairs(entries):
    """Return one rating bucket as a float array with rows (index, rating).

    Accepts either a Python list of (index, rating) tuples or an array that is
    already two-dimensional. An empty bucket becomes a (0, 2) array so that
    column slicing still works.
    """
    pairs = np.asarray(entries, dtype=float)
    if pairs.size == 0:
        pairs = pairs.reshape(0, 2)
    return pairs


def _split_bucket(entries):
    """Split one bucket into (integer indices, ratings)."""
    pairs = _as_pairs(entries)
    return pairs[:, 0].astype(int), pairs[:, 1]


def _als_step(observed, idx, own_factor, other_factors, other_biases, lmb, gamma, tau):
    """Closed-form update of one bias and one latent vector, other side fixed.

    observed      -- 1-D array of ratings attached to this user (or item)
    idx           -- integer indices of the rated items (or rating users)
    own_factor    -- current latent vector of this user (or item)
    other_factors -- full factor matrix of the opposite side
    other_biases  -- full bias vector of the opposite side
    lmb, gamma, tau -- residual weight, bias penalty, factor penalty

    Returns (bias, factor).
    """
    others = other_factors[idx]
    fixed_bias = other_biases[idx]

    # Bias first: regularised mean of what the current dot products leave over.
    bias = lmb * np.sum(observed - others @ own_factor - fixed_bias) / (lmb * len(idx) + gamma)

    # Then the factor, using the fresh bias. others.T @ others is the sum of
    # outer products of the rows, without the (n, k, k) intermediate.
    gram = others.T @ others
    rhs = others.T @ (observed - bias - fixed_bias)
    factor = np.linalg.solve(lmb * gram + tau * np.eye(others.shape[1]), lmb * rhs)
    return bias, factor


# Convert the buckets once, up front. This works whether they are still lists
# of tuples or have already been turned into arrays, and it avoids rebinding
# the notebook-level `ratings` array inside the loop.
user_obs = [_split_bucket(bucket) for bucket in train_user_ratings_list]
item_obs = [_split_bucket(bucket) for bucket in train_movie_ratings_list]

# NOTE(review): tqdm is not imported in any visible cell; confirm an earlier
# cell provides it.
for epoch in tqdm(range(n_epochs)):
    # Update every user with the item side held fixed.
    for m in range(M):
        item_idx, user_ratings = user_obs[m]
        user_biases[m], U[m] = _als_step(user_ratings, item_idx, U[m], V, item_biases, lmb, gamma, tau)

    # Update every item with the freshly updated user side held fixed.
    for n in range(N):
        user_idx, item_ratings = item_obs[n]
        item_biases[n], V[n] = _als_step(item_ratings, user_idx, V[n], U, user_biases, lmb, gamma, tau)

    # Calculate training loss and RMSE
    train_error_squared = 0.0
    train_size = 0
    for m in range(M):
        item_idx, user_ratings = user_obs[m]
        predicted = V[item_idx] @ U[m] + user_biases[m] + item_biases[item_idx]
        train_error_squared += np.sum((user_ratings - predicted) ** 2)
        train_size += len(item_idx)

    # As written this is the negated penalised squared-error objective, so the
    # value is <= 0 and rises towards 0 as the fit improves.
    train_loss = (
        -0.5 * lmb * train_error_squared
        - 0.5 * gamma * np.sum(user_biases**2)
        - 0.5 * gamma * np.sum(item_biases**2)
        - 0.5 * tau * (np.sum(V * V) + np.sum(U * U))
    )
    train_rmse = np.sqrt(1 / train_size * train_error_squared)

    train_losses.append(train_loss)
    train_rmses.append(train_rmse)

    print(f'train loss {round(train_loss, 2)} train rmse {round(train_rmse, 4)}')

  0%|          | 0/100 [00:00<?, ?it/s]

train loss -17571.08 train rmse 0.5816
train loss -17538.5 train rmse 0.5811
train loss -17508.14 train rmse 0.5807
train loss -17479.66 train rmse 0.5802
train loss -17452.92 train rmse 0.5799
train loss -17427.89 train rmse 0.5795
train loss -17404.44 train rmse 0.5792
train loss -17382.37 train rmse 0.5788
train loss -17361.52 train rmse 0.5785
train loss -17341.83 train rmse 0.5782
train loss -17323.27 train rmse 0.578
train loss -17305.76 train rmse 0.5777
train loss -17289.2 train rmse 0.5775
train loss -17273.47 train rmse 0.5772
train loss -17258.47 train rmse 0.577
train loss -17244.13 train rmse 0.5768
train loss -17230.38 train rmse 0.5766
train loss -17217.17 train rmse 0.5764
train loss -17204.45 train rmse 0.5762
train loss -17192.18 train rmse 0.576
train loss -17180.3 train rmse 0.5758
train loss -17168.76 train rmse 0.5756
train loss -17157.48 train rmse 0.5755
train loss -17146.3 train rmse 0.5753
train loss -17135.37 train rmse 0.5751
train loss -17124.84 train rmse 