In [3]:
import numpy as np
import pandas as pd

# Load ratings
ratings = pd.read_csv('movielens_100k/ratings.csv', usecols=['userId', 'movieId', 'rating', 'timestamp'])
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Load movies
movies = pd.read_csv('movielens_100k/movies.csv', usecols=['movieId', 'title', 'genres'])
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Count number of users and movies
num_users  = len(ratings.userId.unique())
num_movies = len(movies.movieId.unique())
print(f"Number of Users: {num_users} | Number of Movies {num_movies}")

Number of Users: 610 | Number of Movies 9742


In [6]:
rating_frame = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
rating_frame.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Convert to numpy array
rating_matrix    = rating_frame.values
user_rating_mean = np.mean(rating_matrix, axis=1)
ratings_demeaned = rating_matrix - user_rating_mean.reshape(-1, 1)

# Check sparsity of dataset
sparsity = round(1.0 - len(ratings) / float (num_users * num_movies), 3)
print("Sparsity: " + str(sparsity))

Sparsity: 0.983


In [11]:
# Perform SVD
from scipy.sparse.linalg import svds
U, sigma, VT = svds(ratings_demeaned, k = 250) # Increasing k decreases RMSE

# Convert sigma to diagonal matrix
sigma = np.diag(sigma)

In [12]:
# Make predictions
predicted_ratings_matrix = np.dot(np.dot(U, sigma), VT) + user_rating_mean.reshape(-1, 1)
predicted_ratings = pd.DataFrame(predicted_ratings_matrix, columns=rating_frame.columns)
predicted_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,4.173251,-0.135702,4.243084,-0.010896,-0.009015,3.89054,0.180917,-0.003308,0.023687,-0.063293,...,-0.012446,-0.010755,-0.014137,-0.014137,-0.012446,-0.014137,-0.012446,-0.012446,-0.012446,-0.028234
1,-0.547001,0.36751,0.141238,0.00932,0.184993,-0.117637,-0.140065,-0.006148,-0.133646,0.249896,...,0.013881,0.012269,0.015493,0.015493,0.013881,0.015493,0.013881,0.013881,0.013881,0.0781
2,0.056668,-0.070198,0.039634,0.012483,0.040768,0.188939,0.015368,-0.043578,0.074793,0.045872,...,-0.007756,-0.005367,-0.010144,-0.010144,-0.007756,-0.010144,-0.007756,-0.007756,-0.007756,0.00874
3,-0.17048,-0.207635,0.117181,-0.067777,-0.022151,0.015227,-0.008205,-0.05591,0.115035,0.093912,...,0.014028,0.012144,0.015912,0.015912,0.014028,0.015912,0.014028,0.014028,0.014028,-0.037578
4,2.597659,0.383874,0.198052,0.040858,0.172658,-0.218932,0.300605,0.154209,-0.266918,-0.150857,...,0.048138,0.041375,0.054901,0.054901,0.048138,0.054901,0.048138,0.048138,0.048138,-0.000146


In [13]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    print(pred)
    actual = actual[actual.nonzero()].flatten()
    print(actual)
    return sqrt(mean_squared_error(pred, actual))

# RMSE on the train data
print('RMSE: ' + str(rmse(predicted_ratings_matrix, rating_matrix)))

[4.17325113 4.24308449 3.89054006 ... 4.99475905 5.02081081 2.99056041]
[4. 4. 4. ... 5. 5. 3.]
RMSE: 0.8221672185680953
