In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from recsys_utils import *

In [5]:
# Load data
# Pre-computed parameters (X, W, b) plus the dataset dimensions come from one
# helper; the ratings matrix Y and its companion matrix R come from another.
X, W, b, num_movies, num_features, num_users = load_precalc_params_small()
Y, R = load_ratings_small()

# Print array shapes and the reported dimensions as a quick sanity check
# that the parameter arrays line up with the ratings matrix.
print("Y", Y.shape, "R", R.shape)
print("X", X.shape)
print("W", W.shape)
print("b", b.shape)
print("num_features", num_features)
print("num_movies",   num_movies)
print("num_users",    num_users)

Y (4778, 443) R (4778, 443)
X (4778, 10)
W (443, 10)
b (1, 443)
num_features 10
num_movies 4778
num_users 443


In [8]:
# Average the first row of Y over only those columns where the first row of
# R is non-zero, so unrated entries do not drag the mean down.
tsmean = Y[0, R[0, :].astype(bool)].mean()
print(f"Average rating for movie 1 : {tsmean:0.3f} / 5" )

Average rating for movie 1 : 3.400 / 5


In [13]:
# Collaborative filtering cost function 
def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Collaborative-filtering cost, computed with explicit loops.

    Args:
      X: 2-D array indexed [movie, feature].
      W: 2-D array indexed [user, feature].
      b: 2-D array indexed [0, user] holding one bias per user.
      Y: 2-D array indexed [movie, user] of ratings; its shape sets the
         number of movies and users that are iterated.
      R: 2-D array indexed [movie, user]; each prediction error is multiplied
         by R[movie, user] before squaring, so a zero entry removes that
         (movie, user) pair from the cost.
      lambda_: regularization strength.

    Returns:
      Half the sum of squared masked errors plus (lambda_ / 2) times the sum
      of squares of every entry of W and X.
    """
    num_movies, num_users = Y.shape
    squared_error_total = 0
    # Users in the outer loop, movies in the inner loop.
    for user in range(num_users):
        user_weights = W[user, :]
        user_bias = b[0, user]
        for movie in range(num_movies):
            prediction = np.dot(user_weights, X[movie, :]) + user_bias
            masked_error = R[movie, user] * (prediction - Y[movie, user])
            squared_error_total += np.square(masked_error)
    cost = squared_error_total / 2
    # The regularizer covers all of W and X, not just the entries visited above.
    cost += (lambda_/2) * (np.sum(np.square(W)) + np.sum(np.square(X)))
    return cost

In [16]:
# Work on a small corner of the data so the looped cost function returns quickly.
num_users_r, num_movies_r, num_features_r = 4, 5, 3

# Ratings and indicator entries for the first few movies and users.
Y_r = Y[:num_movies_r, :num_users_r]
R_r = R[:num_movies_r, :num_users_r]

# Matching slices of the parameters; b_r is reshaped back to a single row.
X_r = X[:num_movies_r, :num_features_r]
W_r = W[:num_users_r, :num_features_r]
b_r = np.reshape(b[0, :num_users_r], (1, -1))

# Cost of the reduced problem with regularization switched off.
J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, lambda_=0)
print(f"Cost: {J:0.2f}")

Cost: 13.67


In [19]:
# Vectorized cost function
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Collaborative-filtering cost, computed with TensorFlow matrix operations.

    Args:
      X: matrix with one row per movie.
      W: matrix with one row per user; it is transposed before the product.
      b: bias term added to the X @ W^T product.
      Y: ratings, subtracted from the prediction.
      R: multiplied elementwise into the error, so zero entries drop out.
      lambda_: regularization strength.

    Returns:
      0.5 * sum(masked_error ** 2)
      + (lambda_ / 2) * (sum(X ** 2) + sum(W ** 2)).
    """
    predictions = tf.linalg.matmul(X, tf.transpose(W)) + b
    masked_error = (predictions - Y) * R
    data_term = 0.5 * tf.reduce_sum(masked_error**2)
    regularization_term = (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return data_term + regularization_term

In [22]:
# Vectorized cost on the reduced data, regularization switched off.
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, lambda_=0)
print(f"Cost: {J:0.2f}")

# Same inputs with the regularization term enabled.
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, lambda_=1.5)
print(f"Cost (with regularization): {J:0.2f}")

Cost: 13.67
Cost (with regularization): 28.09


In [64]:
# Build a ratings vector for a new user
movieList, movieList_df = load_Movie_List_pd()
my_ratings = np.zeros(num_movies)   # one slot per movie; 0 is treated as "not rated" below

# Indices are the movie ids listed in small_movie_list.csv.
# Example: Toy Story 3 (2010) has id 2700, so `my_ratings[2700] = 5` rates it 5.
# The selections below are listed in ascending id order.
my_ratings[246]  = 4   # Shrek (2001)
my_ratings[366]  = 5   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
my_ratings[382]  = 2   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
my_ratings[622]  = 5   # Harry Potter and the Chamber of Secrets (2002)
my_ratings[687]  = 5   # How to Lose a Guy in 10 days (2003)
my_ratings[793]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
my_ratings[929]  = 4   # Lord of the Rings: The Return of the King, The
my_ratings[988]  = 3   # Eternal Sunshine of the Spotless Mind (2004)
my_ratings[1150] = 4.5 # Incredibles, The (2004)
my_ratings[2609] = 2   # Persuasion (2007) -- an example of a movie we did not enjoy
my_ratings[2700] = 5   # Toy Story 3 (2010)
my_ratings[2716] = 4   # Inception
my_ratings[2925] = 1   # Louis Theroux: Law & Disorder (2008)
my_ratings[2937] = 1   # Nothing to Declare (Rien à déclarer)
my_ratings[3618] = 5   # Interstellar (2014)

# Ids of every movie that received a positive rating.
my_rated = [i for i, rating in enumerate(my_ratings) if rating > 0]

print('\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if not my_ratings[i] > 0:
        continue
    print(f'Rated {my_ratings[i]} for  {movieList_df.loc[i,"title"]}')


New user ratings:

Rated 4.0 for  Shrek (2001)
Rated 5.0 for  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 2.0 for  Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Rated 5.0 for  Harry Potter and the Chamber of Secrets (2002)
Rated 5.0 for  How to Lose a Guy in 10 Days (2003)
Rated 5.0 for  Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 4.0 for  Lord of the Rings: The Return of the King, The (2003)
Rated 3.0 for  Eternal Sunshine of the Spotless Mind (2004)
Rated 4.5 for  Incredibles, The (2004)
Rated 2.0 for  Persuasion (2007)
Rated 5.0 for  Toy Story 3 (2010)
Rated 4.0 for  Inception (2010)
Rated 1.0 for  Louis Theroux: Law & Disorder (2008)
Rated 1.0 for  Nothing to Declare (Rien à déclarer) (2010)
Rated 5.0 for  Interstellar (2014)


In [66]:
# Reload Y and R from the loader before modifying them
# (presumably so that re-running this cell does not prepend a second column).
Y, R = load_ratings_small()

# Prepend the new user's ratings as column 0 of Y
Y = np.column_stack((my_ratings, Y))

# Prepend the matching indicator column: 1 where a rating was given, else 0
R = np.column_stack(((my_ratings != 0).astype(int), R))

# Normalize the ratings; Ymean is added back to the predictions later on
Ynorm, Ymean = normalizeRatings(Y, R)

In [68]:
#  Useful Values
# Dimensions are re-read from Y because a column was prepended for the new user;
# num_features is overridden with the number of features to learn.
num_movies, num_users = Y.shape
num_features = 100

# Set Initial Parameters (W, X, b), use tf.Variable to track these variables
# so gradients can be taken with respect to them during training.
# NOTE(review): only the global seed is set, so the drawn values presumably
# depend on the order of the tf.random.normal calls below -- keep W, X, b in
# this order to reproduce the logged training losses; confirm before reordering.
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

# Instantiate an optimizer (Adam with a learning rate of 0.1).
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [70]:
iterations = 280   # number of optimizer steps
lambda_ = 1        # regularization strength passed to the cost function

# The loop variable is named `step` rather than `iter`: at notebook top level
# a variable called `iter` would replace Python's built-in iter() for every
# later cell in the kernel session.
for step in range(iterations):
    # Use TensorFlow's GradientTape to record the operations used to compute the cost
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve the gradients of the
    # cost with respect to X, W and b (returned in the same order as the list)
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one optimizer step, updating X, W and b in place from their gradients
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically (every 20th step, starting with step 0).
    if step % 20 == 0:
        print(f"Training loss at iteration {step}: {cost_value:0.1f}")

Training loss at iteration 0: 2321292.5
Training loss at iteration 20: 136166.2
Training loss at iteration 40: 51864.9
Training loss at iteration 60: 24600.7
Training loss at iteration 80: 13632.0
Training loss at iteration 100: 8488.7
Training loss at iteration 120: 5808.3
Training loss at iteration 140: 4312.1
Training loss at iteration 160: 3435.6
Training loss at iteration 180: 2902.5
Training loss at iteration 200: 2567.0
Training loss at iteration 220: 2349.1
Training loss at iteration 240: 2203.2
Training loss at iteration 260: 2102.7


In [72]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

# Restore the mean: training used Ynorm, so add Ymean back to the predictions
pm = p + Ymean
# Column 0 belongs to the new user, whose ratings were prepended to Y
my_predictions = pm[:,0]

# Sort predictions, highest first; ix holds movie indices in that order
ix = tf.argsort(my_predictions, direction='DESCENDING')

# Walk the 17 highest predictions and show those the user has not rated yet
for i in range(17):
    # Convert the scalar tensor to a plain int so the list-membership test and
    # the list/array indexing below do not rely on implicit tensor coercion.
    j = int(ix[i])
    if j not in my_rated:
        print(f'Predicting rating {my_predictions[j]:0.2f} for movie {movieList[j]}')

# Compare the model's output with the ratings the user actually supplied
print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {movieList[i]}')

Predicting rating 4.73 for movie My Sassy Girl (Yeopgijeogin geunyeo) (2001)
Predicting rating 4.70 for movie Colourful (Karafuru) (2010)
Predicting rating 4.69 for movie Battle Royale 2: Requiem (Batoru rowaiaru II: Chinkonka) (2003)
Predicting rating 4.69 for movie Into the Abyss (2011)
Predicting rating 4.69 for movie Eichmann (2007)
Predicting rating 4.68 for movie Martin Lawrence Live: Runteldat (2002)
Predicting rating 4.67 for movie Particle Fever (2013)
Predicting rating 4.66 for movie Son of the Bride (Hijo de la novia, El) (2001)
Predicting rating 4.66 for movie Strictly Sexual (2008)
Predicting rating 4.66 for movie Radio Day (2008)
Predicting rating 4.66 for movie My Love (2006)


Original vs Predicted ratings:

Original 4.0, Predicted 4.00 for Shrek (2001)
Original 5.0, Predicted 4.91 for Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Original 2.0, Predicted 2.14 for Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Origin

In [74]:
# Attach the new user's predictions to the movie table and fix the column order
movieList_df["pred"] = my_predictions
movieList_df = movieList_df.reindex(columns=["pred", "mean rating", "number of ratings", "title"])

# Take the 300 highest-predicted movie indices as plain ints (ix may hold
# tensors or ordinary integers) and drop the movies the new user already rated
filtered_ix = [
    i
    for i in (int(t.numpy()) if hasattr(t, 'numpy') else int(t) for t in ix[:300])
    if i not in my_rated
]

# Keep only movies with more than 20 ratings, then show the 20 best predictions
filtered_df = movieList_df.loc[filtered_ix].loc[lambda df: df["number of ratings"] > 20]
filtered_df.sort_values("pred", ascending=False).head(20)

Unnamed: 0,pred,mean rating,number of ratings,title
1771,4.46995,3.944444,81,Casino Royale (2006)
361,4.35895,3.871212,132,"Monsters, Inc. (2001)"
2649,4.341813,3.943396,53,How to Train Your Dragon (2010)
3083,4.325614,3.993421,76,"Dark Knight Rises, The (2012)"
2804,4.319064,3.989362,47,Harry Potter and the Deathly Hallows: Part 1 (...
2523,4.296493,3.877358,53,Zombieland (2009)
1431,4.257096,3.94,50,Serenity (2005)
2455,4.241115,3.887931,58,Harry Potter and the Half-Blood Prince (2009)
653,4.226867,4.021277,188,"Lord of the Rings: The Two Towers, The (2002)"
1930,4.214066,3.862069,58,Harry Potter and the Order of the Phoenix (2007)
