In [75]:
import numpy as np
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
jtplot.style()

In [79]:
data_path = "movie_ids.txt"
movies = []
with open(data_path, encoding = "ISO-8859-1") as file:
    for line in file:
        movies.append(' '.join(line.split()[1:]))
len(movies)

1682

In [80]:
my_ratings = np.zeros([1682, 1])
my_ratings[0] = 4
my_ratings[97] = 2;
my_ratings[6] = 3
my_ratings[11] = 5
my_ratings[53] = 4
my_ratings[5] = 5
my_ratings[65]= 3
my_ratings[68] = 5
my_ratings[182] = 4
my_ratings[227] = 5
my_ratings[354] = 5
print('New user ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print('{} rated {}'.format(movies[i], my_ratings[i][0]))

New user ratings:

Toy Story (1995) rated 4.0
Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) rated 5.0
Twelve Monkeys (1995) rated 3.0
Usual Suspects, The (1995) rated 5.0
Outbreak (1995) rated 4.0
While You Were Sleeping (1995) rated 3.0
Forrest Gump (1994) rated 5.0
Silence of the Lambs, The (1991) rated 2.0
Alien (1979) rated 4.0
Star Trek: The Wrath of Khan (1982) rated 5.0
Sphere (1998) rated 5.0


In [122]:
#  Y is a 1682x943 matrix, containing ratings (1-5) of 1682 movies by 943 users
#  R is a 1682x943 matrix, where R(i,j) = 1 if and only if user j gave a rating to movie i
import scipy.io as sio
data_path = 'ex8_movies.mat'
data = sio.loadmat(data_path)
Y = data['Y']
R = data['R']
print(Y.shape)
print(my_ratings.shape)
Y = np.hstack([my_ratings, Y])
R = np.hstack([(my_ratings != 0), R])
print(Y.shape, R.shape)

(1682, 943)
(1682, 1)
(1682, 944) (1682, 944)


In [123]:
def normalize_ratings(Y, R):
    Y_mean = np.zeros([Y.shape[0], 1])
    Y_norm = np.zeros(Y.shape)
    for i in range(Y.shape[0]):
        idx = R[i, :] == 1
        Y_mean[i] = np.mean(Y[i, idx])
        Y_norm[i, idx] = Y[i, idx] - Y_mean[i]
    return Y_norm, Y_mean

In [124]:
Y_norm, Y_mean = normalize_ratings(Y, R)

In [125]:
num_movies, num_users = Y.shape
num_features = 10
X = np.random.randn(num_movies, num_features)
Theta = np.random.randn(num_users, num_features)
print(X.shape, Theta.shape)
initial_parameters = np.r_[X, Theta].ravel()

(1682, 10) (944, 10)


In [126]:
def cofi_cost_func(params, Y, R, num_users, num_movies, num_features, lamb):
    X = params[:num_movies*num_features].reshape([num_movies, num_features])
    Theta = params[num_movies*num_features:].reshape([num_users, num_features])
    X_grad = np.zeros(X.shape)
    Theta_grad = np.zeros(Theta.shape)
    grad = 0
    J_unreg = (1/2) * np.sum(((X@Theta.T - Y)*R)**2)
    J = J_unreg + (lamb/2)*(np.sum(X**2) + np.sum(Theta**2))
        
    for i in range(num_movies):
        X_grad[i, :] = ((X[i, :] @ Theta.T - Y[i, :]) * R[i, :]) @ Theta

    for i in range(num_users):
        Theta_grad[i, :] = ((X @ Theta[i, :].T - Y[:, i]) * R[:, i]).T @ X

    X_grad =  X_grad + lamb*X
    Theta_grad =  Theta_grad + lamb*Theta
        
    grad = np.r_[X_grad, Theta_grad].ravel()
    return J, grad

In [129]:
from scipy import optimize
options = {'maxiter': 100, 'disp':True}
lamb = 10
res = optimize.minimize(lambda params: cofi_cost_func(params, Y_norm, R, num_users,
                                                 num_movies, num_features, lamb),
                        initial_parameters,
                        method='TNC',
                        jac=True,
                        options=options)
res

     fun: 43210.55379470745
     jac: array([-6.82972865, -6.70936265, -8.68839135, ...,  0.33485331,
       -0.02131491,  1.24334053])
 message: 'Max. number of function evaluations reached'
    nfev: 100
     nit: 9
  status: 3
 success: False
       x: array([-5.11836008e-01, -3.63519298e-01,  4.09681778e-01, ...,
       -5.01148802e-04, -2.34268703e-01,  8.73883508e-02])

In [130]:
params = res.x
X_res = params[:num_movies*num_features].reshape([num_movies, num_features])
Theta_res = params[num_movies*num_features:].reshape([num_users, num_features])

In [138]:
temp.max()

5.030561614565158

In [89]:
p = X_res @ Theta_res.T
my_predictions = p[:, 0] + Y_mean.ravel()

In [91]:
np.sort(my_predictions)

array([0.99928727, 0.99928747, 0.99939309, ..., 5.00045594, 5.00053649,
       5.00054966])

In [154]:
pred_idxs_sorted = np.argsort(my_predictions)
pred_idxs_sorted = pred_idxs_sorted[-10:]

print("Top recommendations for you:")
for i in pred_idxs_sorted:
    print("Predicting rating {:.3} for movie {}".format(my_predictions[i], movies[i]))

print("\nOriginal ratings provided:")
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print('{} rated {:.3}'.format(movies[i], my_ratings[i][0]))

Top recommendations for you:
Predicting rating 5.0 for movie They Made Me a Criminal (1939)
Predicting rating 5.0 for movie Star Kid (1997)
Predicting rating 5.0 for movie Entertaining Angels: The Dorothy Day Story (1996)
Predicting rating 5.0 for movie Prefontaine (1997)
Predicting rating 5.0 for movie Santa with Muscles (1996)
Predicting rating 5.0 for movie Great Day in Harlem, A (1994)
Predicting rating 5.0 for movie Aiqing wansui (1994)
Predicting rating 5.0 for movie Someone Else's America (1995)
Predicting rating 5.0 for movie Marlene Dietrich: Shadow and Light (1996)
Predicting rating 5.0 for movie Saint of Fort Washington, The (1993)

Original ratings provided:
Toy Story (1995) rated 4.0
Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) rated 5.0
Twelve Monkeys (1995) rated 3.0
Usual Suspects, The (1995) rated 5.0
Outbreak (1995) rated 4.0
While You Were Sleeping (1995) rated 3.0
Forrest Gump (1994) rated 5.0
Silence of the Lambs, The (1991) rated 2.0
Alien (1979) rated 4.0