## Movies Recommendation using Collaborative Filtering
---
In this exercise, we will use Collaborative filtering to recommend movies

In [1]:
#Importing dependencies
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt

In [2]:
def getCost(X, theta, R, Y):
	return 0.5 * np.sum(((np.dot(X, theta.T) * R) - (Y*R))**2)

In [3]:
def getRegularizedCost(X, theta, R, Y, lambd):
	cost = getCost(X, theta, R, Y)
	cost += lambd/2 * np.sum(theta**2)
	cost += lambd/2 * np.sum(X**2)
	return cost

In [4]:
#Initializing data
data = loadmat('ex8_movies.mat')

Y = np.array(data['Y'])
R = np.array(data['R'])

In [5]:
#Temporary code to check cost function
num_users = 4
num_movies = 5
num_features = 3
params = loadmat('ex8_movieParams.mat')
#X - (1682 x 10)
X = np.array(params['X'])
#theta - (943 x 10)
theta = np.array(params['Theta'])
#temp_X = (5 x 3)
temp_X = X[:num_movies, :num_features]
#temp_Y = (5 x 4)
temp_Y = Y[:num_movies, :num_users]
#temp_R = (5 x 4)
temp_R = R[:num_movies, :num_users]
#temp_theta - (4 x 3)
temp_theta = theta[:num_users, :num_features]

unreg_cost = getCost(temp_X, temp_theta, temp_R, temp_Y)
print("Unregularized cost:" + str(unreg_cost))

lambd = 1.5
reg_cost = getRegularizedCost(temp_X, temp_theta, temp_R, temp_Y, lambd)
print("Regularized cost:" + str(reg_cost))

Unregularized cost:22.224603725685675
Regularized cost:31.34405624427422


In [6]:
movies = []
with open('movie_ids.txt') as f:
    for line in f:
        movies.append(' '.join(line.strip('\n').split(' ')[1:]))

In [7]:
#Entering my ratings (My preferences)
my_ratings = np.zeros((1682,1))
#Here, my_ratings[index] - index is used to refer to a movie, index according to movie_ids.txt
my_ratings[0]   = 4
my_ratings[97]  = 2
my_ratings[6]   = 3
my_ratings[11]  = 5
my_ratings[53]  = 4
my_ratings[63]  = 5
my_ratings[65]  = 3
my_ratings[68]  = 5
my_ratings[182] = 4
my_ratings[225] = 5
my_ratings[354] = 5

In [8]:
#nf - number of features we want per movie
#Instead of manually deciding features, we let the algorithm learn the features
nf = 10
myRatings_row = my_ratings > 0
Y = np.hstack((Y,my_ratings))
R = np.hstack((R,myRatings_row))

#nm - number of movies
#nu - number of users
nm, nu = Y.shape

In [9]:
#Normalize Ratings
Ymean = np.sum(Y,axis=1,keepdims=True)/np.sum(R,axis=1, keepdims=True)
Y = Y - Ymean

In [10]:
np.random.seed(7)
#X is a matrix of movies x features
#We initialize it randomly and they are learned by algorithm according to the preferences of users
X = np.random.rand(nm, nf)
theta = np.random.rand(nu, nf)

#Hyperparameters
lambd = 10
iters = 50
alpha = 1e-03

In [11]:
#Gradient Descent
for i in range(iters):
	print("Cost:" + str(getRegularizedCost(X, theta, R, Y, lambd)))
	X = X - (alpha * (np.dot(((np.dot(X, theta.T)*R) - (Y*R)), theta) + lambd*X))
	theta = theta - (alpha * (np.dot(((np.dot(X, theta.T)*R) - (Y*R)).T, X) + lambd*theta))

print("Final Cost:" + str(getRegularizedCost(X, theta, R, Y, lambd)))

Cost:423843.128765416
Cost:158789.69531425607
Cost:118802.1012599242
Cost:102581.539213448
Cost:93520.6415763488
Cost:87648.19241601456
Cost:83470.59236117403
Cost:80302.16740871203
Cost:77784.72352363136
Cost:75713.09822968477
Cost:73961.32002755617
Cost:72447.70300209937
Cost:71116.89239409703
Cost:69929.9889919369
Cost:68858.81180427162
Cost:67882.4094481689
Cost:66984.85531024708
Cost:66153.8070142311
Cost:65379.53768097998
Cost:64654.267739134695
Cost:63971.69357766099
Cost:63326.6483227361
Cost:62714.853263876765
Cost:62132.73270863321
Cost:61577.27401326715
Cost:61045.92031058407
Cost:60536.48725189481
Cost:60047.097623656184
Cost:59576.129433911665
Cost:59122.17426574308
Cost:58684.00354049751
Cost:58260.54093654787
Cost:57850.83964484171
Cost:57454.06346071833
Cost:57069.4709464943
Cost:56696.40207457437
Cost:56334.26689266272
Cost:55982.53585254826
Cost:55640.73152013065
Cost:55308.421442779305
Cost:54985.211995111305
Cost:54670.74305903778
Cost:54364.68342085914
Cost:54066.7

In [12]:
prediction = np.dot(X, theta.T)
my_preds = prediction[:,-1] + Ymean.flatten()

pred_idxs_sorted = np.argsort(my_preds)
pred_idxs_sorted[:] = pred_idxs_sorted[::-1]

In [13]:
print("Top recommendations for you:")
for i in range(10):
    print('Predicting rating %0.1f for movie %s.' % (my_preds[pred_idxs_sorted[i]],movies[pred_idxs_sorted[i]]))
    
print("\nOriginal ratings provided:")
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print('Rated %d for movie %s.' % (my_ratings[i],movies[i]))

Top recommendations for you:
Predicting rating 6.1 for movie Prefontaine (1997).
Predicting rating 6.0 for movie Saint of Fort Washington, The (1993).
Predicting rating 6.0 for movie Marlene Dietrich: Shadow and Light (1996) .
Predicting rating 6.0 for movie Someone Else's America (1995).
Predicting rating 6.0 for movie Great Day in Harlem, A (1994).
Predicting rating 5.9 for movie They Made Me a Criminal (1939).
Predicting rating 5.8 for movie Aiqing wansui (1994).
Predicting rating 5.8 for movie Star Kid (1997).
Predicting rating 5.8 for movie Entertaining Angels: The Dorothy Day Story (1996).
Predicting rating 5.7 for movie Santa with Muscles (1996).

Original ratings provided:
Rated 4 for movie Toy Story (1995).
Rated 3 for movie Twelve Monkeys (1995).
Rated 5 for movie Usual Suspects, The (1995).
Rated 4 for movie Outbreak (1995).
Rated 5 for movie Shawshank Redemption, The (1994).
Rated 3 for movie While You Were Sleeping (1995).
Rated 5 for movie Forrest Gump (1994).
Rated 2 for