In [1]:
import pandas as pd
import numpy as np
from numpy import loadtxt
import tensorflow as tf
from tensorflow import keras


In [2]:
# Load the ratings matrix Y and the indicator matrix R.
# Both are comma-separated numeric files; later cells read Y.shape as
# (num_movies, num_users), and R marks which entries of Y are real ratings.
# The `with` blocks make sure each file handle is closed as soon as it is read
# (previously both handles were left open and the first one was orphaned when
# `file` was rebound).

with open('./data/small_movies_Y.csv', 'rb') as file:
    Y = loadtxt(file, delimiter=",")

with open('./data/small_movies_R.csv', 'rb') as file:
    R = loadtxt(file, delimiter=",")

In [3]:
def normalizeRatings(Y, R):
    """
    Mean-normalize the ratings of every movie (one movie per row of Y).

    Only entries flagged in R (R[i, j] == 1) contribute to a movie's mean, and
    the mean is subtracted only from those flagged entries; all other entries
    of Y are passed through unchanged. A tiny constant in the denominator keeps
    the division finite for rows without any rating, whose mean then comes out
    as 0.

    Returns:
        Ynorm: array shaped like Y, with each row's mean removed from its
               rated entries.
        Ymean: column vector (one row per movie) holding the per-movie means.
    """
    rating_totals = np.sum(Y * R, axis=1)        # sum of the real ratings per movie
    rating_counts = np.sum(R, axis=1) + 1e-12    # number of ratings (+ epsilon against 0/0)
    Ymean = (rating_totals / rating_counts)[:, np.newaxis]

    # Multiplying by R limits the subtraction to entries that were actually rated
    Ynorm = Y - Ymean * R
    return Ynorm, Ymean


In [4]:
def cost_function(X, W, b, Y, R, lambda_):
    """
    Regularized squared-error cost for the collaborative filtering model.

    The model predicts ratings as X @ W^T + b. The prediction error is
    multiplied by R, so only entries where R is non-zero contribute.
    As called in this notebook, X is (num_movies, num_features), W is
    (num_users, num_features) and b is (1, num_users).

    Args:
        X: movie feature matrix.
        W: user feature matrix.
        b: user bias row, broadcast over movies.
        Y: (normalized) ratings matrix.
        R: indicator matrix masking the entries of Y that are real ratings.
        lambda_: regularization strength applied to X and W (b is not regularized).

    Returns:
        Scalar tensor: 0.5 * sum(masked_error^2) + (lambda_/2) * (sum(X^2) + sum(W^2)).
    """
    # Prediction error, zeroed wherever R is 0
    predicted = tf.linalg.matmul(X, tf.transpose(W)) + b
    masked_error = (predicted - Y) * R

    data_term = 0.5 * tf.reduce_sum(masked_error ** 2)
    regularization_term = (lambda_ / 2) * (tf.reduce_sum(X ** 2) + tf.reduce_sum(W ** 2))

    return data_term + regularization_term



In [5]:
# Load the movie list with titles.
# The first CSV column becomes the DataFrame index; later cells look titles up
# through that index with df.loc[i, 'title'], where i is a row index into the
# ratings matrix (presumably the CSV is stored in the same movie order as Y).
df = pd.read_csv('data/small_movie_list.csv', header=0, index_col=0, delimiter=',', quotechar='"')


In [6]:
num_movies, num_users = Y.shape

# Personal ratings keyed by movie row index; every other movie stays at 0,
# which the rest of the notebook treats as "not rated".
personal_ratings = {
    929: 5,   # Lord of the Rings: The Return of the King, The
    246: 5,   # Shrek (2001)
    2716: 3,  # Inception
    1150: 5,  # Incredibles, The (2004)
    382: 2,   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
    366: 5,   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
    622: 5,   # Harry Potter and the Chamber of Secrets (2002)
    988: 3,   # Eternal Sunshine of the Spotless Mind (2004)
    2925: 1,  # Louis Theroux: Law & Disorder (2008)
    2937: 1,  # Nothing to Declare (Rien à déclarer)
    793: 5,   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
}

# One slot per movie, filled in from the table above
my_ratings = np.zeros(num_movies)
for movie_index, rating in personal_ratings.items():
    my_ratings[movie_index] = rating


In [7]:
# Show every movie that received a personal rating, together with its title
for movie_idx, rating in enumerate(my_ratings):
    if rating > 0:
        print(f"my rating {rating} for movie {df.loc[movie_idx, 'title']}")

my rating 5.0 for movie Shrek (2001)
my rating 5.0 for movie Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
my rating 2.0 for movie Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
my rating 5.0 for movie Harry Potter and the Chamber of Secrets (2002)
my rating 5.0 for movie Pirates of the Caribbean: The Curse of the Black Pearl (2003)
my rating 5.0 for movie Lord of the Rings: The Return of the King, The (2003)
my rating 3.0 for movie Eternal Sunshine of the Spotless Mind (2004)
my rating 5.0 for movie Incredibles, The (2004)
my rating 3.0 for movie Inception (2010)
my rating 1.0 for movie Louis Theroux: Law & Disorder (2008)
my rating 1.0 for movie Nothing to Declare (Rien à déclarer) (2010)


In [8]:
# Prepend the personal ratings as a new first column (user 0) of Y, and the
# matching 0/1 "was rated" flags as a new first column of R.
# NOTE: this rebinds Y and R, so running the cell twice adds the column twice.
my_rated_flags = (my_ratings != 0).astype(int)
Y = np.column_stack((my_ratings, Y))
R = np.column_stack((my_rated_flags, R))


In [9]:
# Normalize the ratings matrix Y: Ynorm has each movie's mean rating subtracted
# from its rated entries, and Ymean keeps the per-movie means (one per row) so
# they can be added back onto the model's predictions later.
Ynorm, Ymean = normalizeRatings(Y, R)


In [10]:
# Set parameters for the model
alpha = 0.3  # Learning rate handed to the Adam optimizer
iterations = 500  # Number of optimizer steps in the training loop
num_features = 100  # Number of latent features learned per movie and per user
lambda_ = 0.1  # Regularization strength applied to X and W in cost_function
num_movies, num_users = Y.shape  # Refresh dimensions: Y now includes the column of personal ratings


In [11]:
# Seed TensorFlow's global random generator so the random initialization below
# is the same on every kernel restart and every re-run of this cell.
tf.random.set_seed(1234)

# Initialize the optimizer and the model parameters
optimizer = keras.optimizers.Adam(learning_rate=alpha)
W = tf.Variable(tf.random.normal((num_users, num_features), dtype=tf.float64))  # User feature matrix
b = tf.Variable(tf.random.normal((1, num_users), dtype=tf.float64))  # User bias term
X = tf.Variable(tf.random.normal((num_movies, num_features), dtype=tf.float64))  # Movie feature matrix

# Single source of truth for the trainable variables: the gradients returned by
# the tape come back in this order and must be paired with the same order below.
trainable_params = [W, X, b]

# Training loop: minimize the cost function with Adam
for itr in range(iterations):

    # Record the forward pass so the cost can be differentiated
    with tf.GradientTape() as tape:
        cost_value = cost_function(X, W, b, Ynorm, R, lambda_)

    # Gradients of the cost with respect to each trainable parameter
    grads = tape.gradient(cost_value, trainable_params)

    # Let the optimizer update the parameters in place
    optimizer.apply_gradients(zip(grads, trainable_params))

    # Print the cost value every 20 iterations
    if itr % 20 == 0:
        print(f'iteration : {itr}  : cost value :{cost_value:0.2f}')


iteration : 0  : cost value :2021812.84
iteration : 20  : cost value :54359.61
iteration : 40  : cost value :25734.21
iteration : 60  : cost value :14523.21
iteration : 80  : cost value :8925.23
iteration : 100  : cost value :5843.25
iteration : 120  : cost value :3990.06
iteration : 140  : cost value :2811.50
iteration : 160  : cost value :2034.80
iteration : 180  : cost value :1509.64
iteration : 200  : cost value :1147.44
iteration : 220  : cost value :893.52
iteration : 240  : cost value :712.99
iteration : 260  : cost value :583.05
iteration : 280  : cost value :488.52
iteration : 300  : cost value :419.08
iteration : 320  : cost value :367.63
iteration : 340  : cost value :329.19
iteration : 360  : cost value :300.24
iteration : 380  : cost value :278.26
iteration : 400  : cost value :261.44
iteration : 420  : cost value :249.01
iteration : 440  : cost value :238.43
iteration : 460  : cost value :230.44
iteration : 480  : cost value :224.15


In [12]:
# Predicted rating for every (movie, user) pair: X @ W^T plus the per-user bias,
# then the per-movie mean is added back to undo the normalization.
movie_matrix = X.numpy()
user_matrix = W.numpy()
predictions = np.dot(movie_matrix, user_matrix.T) + b.numpy() + Ymean

# Column 0 is the user that was prepended with the personal ratings
my_predictions = predictions[:, 0]

# Compare the prediction against the actual rating for every movie that was rated
for movie_idx, (rating, predicted) in enumerate(zip(my_ratings, my_predictions)):
    if rating > 0:
        print(f"my rating {rating} , prediction : {predicted:0.2f} for movie :{df.loc[movie_idx, 'title']}")


my rating 5.0 , prediction : 4.98 for movie :Shrek (2001)
my rating 5.0 , prediction : 4.98 for movie :Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
my rating 2.0 , prediction : 2.02 for movie :Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
my rating 5.0 , prediction : 4.99 for movie :Harry Potter and the Chamber of Secrets (2002)
my rating 5.0 , prediction : 4.98 for movie :Pirates of the Caribbean: The Curse of the Black Pearl (2003)
my rating 5.0 , prediction : 4.98 for movie :Lord of the Rings: The Return of the King, The (2003)
my rating 3.0 , prediction : 3.00 for movie :Eternal Sunshine of the Spotless Mind (2004)
my rating 5.0 , prediction : 4.99 for movie :Incredibles, The (2004)
my rating 3.0 , prediction : 3.00 for movie :Inception (2010)
my rating 1.0 , prediction : 1.04 for movie :Louis Theroux: Law & Disorder (2008)
my rating 1.0 , prediction : 1.03 for movie :Nothing to Declare (Rien à déclarer) (2010)


In [13]:
# Most similar movies: the 5 movies whose learned feature vectors lie closest
# (Euclidean distance) to any movie that was personally rated above 4.
indices_high_ratings = np.where(Y[:, 0] > 4)[0]

# Take the features as a NumPy array under a NEW name. Rebinding X itself to an
# ndarray would make this cell (and the prediction cell, which calls X.numpy())
# fail when re-run. The hasattr guard also accepts an X that is already an ndarray.
movie_features = X.numpy() if hasattr(X, "numpy") else np.asarray(X)
num_movies = movie_features.shape[0]

# For every movie, keep its smallest distance to any highly rated movie, so each
# candidate appears at most once in the ranking.
nearest_distance = np.full(num_movies, np.inf)
for i in indices_high_ratings:
    # Distances from movie i to all movies in one vectorized call
    distances = np.linalg.norm(movie_features - movie_features[i], axis=1)
    distances[i] = np.inf  # a movie does not count as its own neighbour
    np.minimum(nearest_distance, distances, out=nearest_distance)

# Stable sort keeps the ranking deterministic when distances tie; entries still
# at infinity (no highly rated movie to compare against) are dropped.
closest = np.argsort(nearest_distance, kind="stable")[:5]
sorted_distances = [
    (nearest_distance[j], j) for j in closest if np.isfinite(nearest_distance[j])
]

for distance, movie_index in sorted_distances:
    print(f"Similar movie: {df.loc[movie_index, 'title']}")


Similar movie: The Hunger Games: Catching Fire (2013)
Similar movie: Windtalkers (2002)
Similar movie: The Alamo (2004)
Similar movie: Star Trek: Nemesis (2002)
Similar movie: Adventures of Sharkboy and Lavagirl 3-D, The (2005)
