In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from numpy import loadtxt

In [3]:
def load_precalc_params_small():
    """Load the pre-computed collaborative-filtering parameters from ./data.

    Reads the comma-delimited files ``small_movies_X.csv``,
    ``small_movies_W.csv`` and ``small_movies_b.csv``.

    Returns:
        tuple: ``(X, W, b, num_movies, num_features, num_users)`` where
        ``num_movies, num_features = X.shape``, ``num_users = W.shape[0]``
        and ``b`` has been reshaped to a single row, ``(1, -1)``.
    """
    # `with` guarantees every handle is closed; the previous bare open()
    # calls leaked three file descriptors per call.
    # ndmin=2 keeps X and W two-dimensional even when a file holds a single
    # row, so the shape unpacking below cannot fail on tiny data sets.
    with open('./data/small_movies_X.csv', 'rb') as file:
        X = loadtxt(file, delimiter=",", ndmin=2)

    with open('./data/small_movies_W.csv', 'rb') as file:
        W = loadtxt(file, delimiter=",", ndmin=2)

    with open('./data/small_movies_b.csv', 'rb') as file:
        b = loadtxt(file, delimiter=",")
    b = b.reshape(1, -1)

    num_movies, num_features = X.shape
    num_users, _ = W.shape
    return (X, W, b, num_movies, num_features, num_users)

In [5]:
def load_ratings_small():
    """Load the ratings matrix Y and the rated-indicator matrix R from ./data.

    Reads the comma-delimited files ``small_movies_Y.csv`` and
    ``small_movies_R.csv``.

    Returns:
        tuple: ``(Y, R)`` as two-dimensional NumPy arrays.
    """
    # `with` closes each handle deterministically (the bare open() calls
    # used before were never closed). ndmin=2 keeps the result a matrix even
    # when a file contains only one row.
    with open('./data/small_movies_Y.csv', 'rb') as file:
        Y = loadtxt(file, delimiter=",", ndmin=2)

    with open('./data/small_movies_R.csv', 'rb') as file:
        R = loadtxt(file, delimiter=",", ndmin=2)
    return (Y, R)

In [6]:
# Load the pre-computed parameters (X, W, b) with their dimensions, and the
# ratings matrix Y together with its rated-indicator matrix R.
X, W, b, num_movies, num_features, num_users = load_precalc_params_small()
Y, R = load_ratings_small()

In [10]:
# Sanity-check the dimensions of everything that was just loaded.
print("Y", Y.shape, "R", R.shape)
for label, value in (
    ("X", X.shape),
    ("W", W.shape),
    ("b", b.shape),
    ("num_features", num_features),
    ("num_movies", num_movies),
    ("num_users", num_users),
):
    print(label, value)

Y (4778, 443) R (4778, 443)
X (4778, 10)
W (443, 10)
b (1, 443)
num_features 10
num_movies 4778
num_users 443


In [18]:
# Only the last expression of a cell is displayed, so R.shape and R[0,:]
# are evaluated but not shown; the visible output is the first five rows of Y.
R.shape
R[0,:]
Y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 4., 3., 3.]])

In [19]:
# Y holds the ratings themselves; R is the 0/1 indicator of which entries were
# rated. Casting row 0 of R to bool masks Y so only actual ratings are averaged.
tsmean=np.mean(Y[0,R[0,:].astype(bool)])
print(f"the average rating for the movie 1 is : {tsmean} / 5")

the average rating for the movie 1 is : 3.4 / 5


### Collaborative filtering learning algorithm

#### Collaborative filtering cost function

$$J({\mathbf{x}^{(0)},...,\mathbf{x}^{(n_m-1)},\mathbf{w}^{(0)},b^{(0)},...,\mathbf{w}^{(n_u-1)},b^{(n_u-1)}})= \frac{1}{2}\sum_{(i,j):r(i,j)=1}(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2
+\underbrace{
\frac{\lambda}{2}
\sum_{j=0}^{n_u-1}\sum_{k=0}^{n-1}(\mathbf{w}^{(j)}_k)^2
+ \frac{\lambda}{2}\sum_{i=0}^{n_m-1}\sum_{k=0}^{n-1}(\mathbf{x}_k^{(i)})^2
}_{regularization}
\tag{1}$$

$$
= \frac{1}{2}\sum_{j=0}^{n_u-1} \sum_{i=0}^{n_m-1}r(i,j)*(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2
+\text{regularization}
$$

In [20]:
def cofi_cost_func(X,W,b,Y,R,lambda_):
    """Collaborative-filtering cost, computed with explicit loops.

    Args:
        X: (num_movies, num_features) array of movie feature vectors.
        W: (num_users, num_features) array of user parameter vectors.
        b: (1, num_users) array of user biases.
        Y: (num_movies, num_users) array of ratings.
        R: (num_movies, num_users) array; R[i, j] is 1 if user j rated
           movie i and 0 otherwise.
        lambda_: regularization strength.

    Returns:
        The scalar cost: half the sum of squared errors over rated entries
        plus (lambda_/2) * (sum(W**2) + sum(X**2)). The biases are not
        regularized.
    """
    nm, nu = Y.shape

    # The regularization term does not depend on i or j. It used to be
    # recomputed inside the innermost loop (nm * nu full-matrix reductions)
    # and was left undefined when Y had no rows or no columns.
    reg = np.sum(np.square(W)) + np.sum(np.square(X))

    J = 0
    for j in range(nu):
        w = W[j,:]
        b_j = b[0,j]

        for i in range(nm):
            x = X[i,:]
            y = Y[i,j]
            r = R[i,j]
            # r is 0 for unrated entries, which removes them from the sum.
            J += np.square(r * (np.dot(w, x) + b_j - y))

    J = J/2
    J += (lambda_/2) * reg
    return J

In [21]:
# Optional: reduce the data set size so that the loop implementation runs
# faster. The reduction is currently disabled (commented out), so the cost
# below is evaluated on the full data set.
# num_users_r = 4
# num_movies_r = 5
# num_features_r = 3

# X_r = X[:num_movies_r, :num_features_r]
# W_r = W[:num_users_r,  :num_features_r]
# b_r = b[0, :num_users_r].reshape(1,-1)
# Y_r = Y[:num_movies_r, :num_users_r]
# R_r = R[:num_movies_r, :num_users_r]

# Evaluate the cost with no regularization (lambda_ = 0)
J = cofi_cost_func(X, W, b, Y, R, 0)
print(f"Cost: {J:0.2f}")

Cost: 270821.25


In [22]:
# Evaluate the cost function with regularization (lambda_ = 1.5)
J = cofi_cost_func(X, W, b, Y, R, 1.5)
print(f"Cost (with regularization): {J:0.2f}")

Cost (with regularization): 306504.87


## Vectorized implementation

In [23]:
def cofi_cost_func_v(X,W,b,Y,R,lambda_):
    """Vectorized collaborative-filtering cost built from TensorFlow ops.

    Computes the same quantity as the loop version: half the sum of squared
    prediction errors, masked by R, plus (lambda_/2) times the sum of the
    squared entries of X and W.

    Returns:
        The scalar cost as returned by the TensorFlow reductions.
    """
    # Predicted rating for every (movie, user) pair, then the error against Y
    # with entries where R is 0 zeroed out.
    predictions = tf.linalg.matmul(X, tf.transpose(W)) + b
    masked_error = (predictions - Y) * R

    squared_error_term = 0.5 * tf.reduce_sum(masked_error**2)
    regularization_term = (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return squared_error_term + regularization_term

In [26]:
# Vectorized cost without regularization; it should agree with the loop
# implementation's result (270821.25).
J = cofi_cost_func_v(X, W, b, Y, R, 0)
print(f"Cost: {J}")

Cost: 270821.2540026271


In [27]:
# Vectorized cost with lambda_ = 1.5; it should agree with the loop
# implementation's result (306504.87).
J = cofi_cost_func_v(X, W, b, Y, R, 1.5)
print(f"Cost (with regularization): {J}")

Cost (with regularization): 306504.87494669505


### Learning movie recommendations

In [36]:
import pandas as pd
def load_Movie_List_pd():
    """Read ./data/small_movie_list.csv into a DataFrame.

    The first CSV column becomes the index and the first row supplies the
    column names.

    Returns:
        tuple: ``(titles, frame)`` where ``titles`` is the "title" column as
        a plain Python list and ``frame`` is the full DataFrame.
    """
    movies = pd.read_csv(
        './data/small_movie_list.csv',
        sep=',',
        quotechar='"',
        header=0,
        index_col=0,
    )
    titles = movies["title"].tolist()
    return (titles, movies)

In [37]:
# mlist: list of movie titles; mlistdf: the full movie-list DataFrame.
mlist,mlistdf=load_Movie_List_pd()

In [38]:
# Reload the pre-computed parameters and the ratings before adding a new user.
X, W, b, num_movies, num_features, num_users = load_precalc_params_small()
Y, R = load_ratings_small()

# One entry per movie for the new user; 0 means "not rated" (the cells below
# test `> 0` / `!= 0` to find the rated movies).
my_ratings=np.zeros(num_movies)

In [40]:
# Ratings for the new user, indexed by the movie's row in the movie list.
# Titles are taken from the output this cell prints.
my_ratings[2700] = 5   # Toy Story 3 (2010)
my_ratings[2609] = 2   # Persuasion (2007)
my_ratings[929]  = 5   # Lord of the Rings: The Return of the King, The (2003)
my_ratings[246]  = 5   # Shrek (2001)
my_ratings[2716] = 3   # Inception (2010)
my_ratings[1150] = 5   # Incredibles, The (2004)
my_ratings[382]  = 2   # Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
my_ratings[366]  = 5   # Harry Potter and the Sorcerer's Stone (2001)
my_ratings[622]  = 5   # Harry Potter and the Chamber of Secrets (2002)
my_ratings[988]  = 3   # Eternal Sunshine of the Spotless Mind (2004)
my_ratings[2925] = 1   # Louis Theroux: Law & Disorder (2008)
my_ratings[2937] = 1   # Nothing to Declare (Rien à déclarer) (2010)
my_ratings[793]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)

# List every movie the new user rated, in index order.
print('\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0 :
        print(f'Rated {my_ratings[i]} for  {mlistdf.loc[i,"title"]}');


New user ratings:

Rated 5.0 for  Shrek (2001)
Rated 5.0 for  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 2.0 for  Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Rated 5.0 for  Harry Potter and the Chamber of Secrets (2002)
Rated 5.0 for  Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 5.0 for  Lord of the Rings: The Return of the King, The (2003)
Rated 3.0 for  Eternal Sunshine of the Spotless Mind (2004)
Rated 5.0 for  Incredibles, The (2004)
Rated 2.0 for  Persuasion (2007)
Rated 5.0 for  Toy Story 3 (2010)
Rated 3.0 for  Inception (2010)
Rated 1.0 for  Louis Theroux: Law & Disorder (2008)
Rated 1.0 for  Nothing to Declare (Rien à déclarer) (2010)


In [41]:
# Mean-normalize the ratings so that a user who has not rated a movie is predicted that movie's mean rating instead of 0.

def normalizeRatings(Y, R):
    """Subtract each movie's mean rating from its rated entries.

    The mean of a row is taken over entries where R is 1 only. Entries where
    R is 0 are left unchanged in the normalized matrix.

    Returns:
        tuple: ``(Ynorm, Ymean)``; ``Ymean`` is a column vector with one row
        per row of Y.
    """
    rated_total = (Y * R).sum(axis=1)
    # The tiny offset avoids a division by zero for rows with no ratings.
    rated_count = R.sum(axis=1) + 1e-12
    Ymean = (rated_total / rated_count).reshape(-1, 1)
    Ynorm = Y - Ymean * R
    return (Ynorm, Ymean)

In [42]:
# Prepend the new user's ratings as column 0 of Y, and the matching 0/1
# "has rated" indicator as column 0 of R.
# NOTE(review): this cell is not idempotent - running it a second time
# prepends another copy of the column. Re-run the cell that reloads Y and R
# before re-running this one.
Y = np.c_[my_ratings, Y]
R = np.c_[(my_ratings != 0).astype(int), R]
# Mean-normalize using rated entries only; Ymean is added back to the
# predictions after training.
Ynorm, Ymean = normalizeRatings(Y, R)

In [44]:
# Only the last expression is displayed, so the output below is R.shape;
# Y.shape is evaluated but not shown.
Y.shape
R.shape

(4778, 444)

In [45]:
# Dimensions now include the new user that was prepended to Y.
num_movies, num_users = Y.shape
num_features = 100

# Seed TensorFlow's global random generator so the random initialisation
# below - and therefore the training run and the recommendations - is the
# same after every Restart & Run All.
tf.random.set_seed(1234)

# Trainable parameters, tracked by TensorFlow so gradients can be taken.
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [46]:
iterations = 200
lambda_ = 1
# The loop variable is `step`, not `iter`: a cell-level variable called `iter`
# stays in the notebook namespace and shadows Python's built-in iter() for
# every later cell.
for step in range(iterations):
    # Record the forward pass so TensorFlow can differentiate the cost.
    with tf.GradientTape() as tape:

        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Gradients of the cost with respect to the trainable variables.
    grads = tape.gradient( cost_value, [X,W,b] )

    # One optimizer update of X, W and b.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log the loss every 20 iterations.
    if step % 20 == 0:
        print(f"Training loss at iteration {step}: {cost_value:0.1f}")

Training loss at iteration 0: 2285400.5
Training loss at iteration 20: 133837.3
Training loss at iteration 40: 50616.9
Training loss at iteration 60: 23891.2
Training loss at iteration 80: 13196.2
Training loss at iteration 100: 8203.8
Training loss at iteration 120: 5609.8
Training loss at iteration 140: 4167.3
Training loss at iteration 160: 3326.9
Training loss at iteration 180: 2819.3


### Recommendation

In [47]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

#restore the mean
pm = p + Ymean

my_predictions = pm[:,0]

# sort predictions
ix = tf.argsort(my_predictions, direction='DESCENDING')
my_rated = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]
for i in range(17):
    j = ix[i]
    if j not in my_rated:
        print(f'Predicting rating {my_predictions[j]:0.2f} for movie {mlist[j]}')

print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {mlist[i]}')

Predicting rating 4.40 for movie Lord of the Rings: The Two Towers, The (2002)
Predicting rating 4.39 for movie Into the Forest of Fireflies' Light (2011)
Predicting rating 4.37 for movie Delirium (2014)
Predicting rating 4.37 for movie One I Love, The (2014)
Predicting rating 4.37 for movie Laggies (2014)
Predicting rating 4.35 for movie Colourful (Karafuru) (2010)
Predicting rating 4.33 for movie 'Salem's Lot (2004)
Predicting rating 4.33 for movie Eichmann (2007)
Predicting rating 4.33 for movie Into the Abyss (2011)
Predicting rating 4.33 for movie Battle Royale 2: Requiem (Batoru rowaiaru II: Chinkonka) (2003)


Original vs Predicted ratings:

Original 5.0, Predicted 4.90 for Shrek (2001)
Original 5.0, Predicted 4.86 for Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Original 2.0, Predicted 2.12 for Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Original 5.0, Predicted 4.83 for Harry Potter and the Chamber of Secrets (2002)
Ori

In [51]:
# Boolean mask of movies with more than 20 ratings. It is deliberately not
# called `filter`: a cell-level variable with that name shadows Python's
# built-in filter() for the rest of the notebook.
popular_mask = (mlistdf["number of ratings"] > 20)
mlistdf["pred"] = my_predictions
movieList_df = mlistdf.reindex(columns=["pred", "mean rating", "number of ratings", "title"])
# Take the 300 highest predictions, keep the frequently rated ones, and order
# them by their mean rating.
movieList_df.loc[ix[:300]].loc[popular_mask].sort_values("mean rating", ascending=False)

Unnamed: 0,pred,mean rating,number of ratings,title
2079,3.899578,4.158537,41,In Bruges (2008)
2395,3.905183,4.136364,88,Inglourious Basterds (2009)
211,3.825626,4.122642,159,Memento (2000)
929,4.894568,4.118919,185,"Lord of the Rings: The Return of the King, The..."
2700,4.845647,4.109091,55,Toy Story 3 (2010)
393,4.024078,4.106061,198,"Lord of the Rings: The Fellowship of the Ring,..."
653,4.399335,4.021277,188,"Lord of the Rings: The Two Towers, The (2002)"
2851,3.800723,3.95,30,Limitless (2011)
1771,3.951407,3.944444,81,Casino Royale (2006)
2649,4.003961,3.943396,53,How to Train Your Dragon (2010)
