In [None]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize

df_Y = pd.read_csv('Y.csv', sep=";", header=None)
df_R = pd.read_csv('R.csv', sep=";", header=None)
df_Theta = pd.read_csv('Theta.csv', sep=";", header=None)
df_X = pd.read_csv('X.csv', sep=";", header=None)

In [88]:
# Tansform DataFrames into numpy arrays

X = np.array(df_X.values)
Theta = np.array(df_Theta.values)
Y = np.array(df_Y.values)
R = np.array(df_R.values)

In [89]:
# Re-adjust data for development

num_users = 4
num_movies = 5
num_features = 3

X_dev = X[:num_movies, :num_features]
Theta_dev = Theta[:num_users, :num_features]
Y_dev = Y[:num_movies, :num_users]
R_dev = R[:num_movies, :num_users]
value_lambda_dev = 1.5

print("X_dev = {}" .format(X_dev.shape))
print("Theta_dev = {}".format(Theta_dev.shape))
print("Y_dev = {}".format(Y_dev.shape))
print("R_dev = {}".format(R_dev.shape))

X_dev = (5, 3)
Theta_dev = (4, 3)
Y_dev = (5, 4)
R_dev = (5, 4)


In [90]:
# Collaborative filtering Cost function

def CoFi_Cost(params, Y, R, num_users, num_movies, num_features, val_lambda):
    
    #Unfold X and Theta from matrices
    X = np.array(params[:num_movies*num_features]).reshape(num_movies,num_features).copy()
    Theta = np.array(params[num_movies*num_features:]).reshape(num_users,num_features,).copy()
    
    #Initialization
    J = 0 
    X_Grad = np.zeros(X.shape)
    Theta_Grad = np.zeros(Theta.shape)
    
    #Penalties
    p_Theta = np.power(Theta,2).sum() * (val_lambda / 2)
    p_X = np.power(X,2).sum() * (val_lambda / 2)
    
    #Cost Function
    J = np.multiply(np.power((X.dot(Theta.T) - Y),2), R).sum() / 2 + p_Theta + p_X

    #Gradients
    X_Grad = np.multiply(R,(X.dot(Theta.T) - Y)).dot(Theta)
    Theta_Grad = np.multiply(R,X.dot(Theta.T) - Y).T.dot(X)

    Grad = np.r_[X_Grad.flatten(),Theta_Grad.flatten()]
    
    return J, Grad

In [91]:
J, Grad = CoFi_Cost(np.r_[X_dev.flatten(), Theta_dev.flatten()],Y_dev, R_dev, num_users, num_movies, num_features, value_lambda_dev)

In [92]:
J

31.34405624427422

In [93]:
# Product list

df_Product = pd.read_fwf('movie_ids.txt', header=None)
Product = np.array(df_Product.values)

In [94]:
# Add personal Rating

Rating = np.zeros(Product.shape)

Rating[0] = 4
Rating[97] = 2
Rating[6] = 3
Rating[11] = 5
Rating[53] = 4
Rating[63] = 5
Rating[65] = 3
Rating[68] = 5
Rating[182] = 4
Rating[225] = 5
Rating[354] = 5

for idx, customer_R in enumerate(Rating):
    if customer_R > 0:
        print(Product[idx])

['1 Toy Story (1995)']
['7 Twelve Monkeys (1995)']
['12 Usual Suspects, The (1995)']
['54 Outbreak (1995)']
['64 Shawshank Redemption, The (1994)']
['66 While You Were Sleeping (1995)']
['69 Forrest Gump (1994)']
['98 Silence of the Lambs, The (1991)']
['183 Alien (1979)']
['226 Die Hard 2 (1990)']
['355 Sphere (1998)']


In [95]:
# Add Customer Rating to data matrix

Y = np.c_[Rating, Y]
R = np.c_[(Rating > 0).astype(int), R]

In [96]:
def normalizeRatings(Y, R):
    """normalized Y so that each movie has a rating of 0 on average,
    and returns the mean rating in Ymean.
    """

    m, n = Y.shape
    Ymean = np.zeros(m)
    Ynorm = np.zeros(Y.shape)

    for i in range(n):
        idx = (R[i,:]==1).nonzero()[0]
        if len(idx):
            Ymean[i] = np.mean(Y[i, idx])
            Ynorm[i, idx] = Y[i, idx] - Ymean[i]
        else:
            Ymean[i] = 0.0
            Ynorm[i,idx] = 0.0

    return Ynorm, Ymean

In [97]:
Ynorm, Ymean = normalizeRatings(Y, R)

In [98]:
Ynorm = Y / Y.sum(axis = 1, keepdims = True)
#Ymean = Y.mean()

In [99]:
# Useful values

num_users = Y.shape[1]
num_movies = Y.shape[0]
num_features = 10

In [100]:
X = np.random.rand(num_movies, num_features)
Theta = np.random.rand(num_users, num_features)

initial_parameters = np.r_[X.flatten(), Theta.flatten()]
Lambda = 10

In [108]:
costFunc = lambda p: CoFi_Cost(p, Ynorm, R, num_users, num_movies, num_features, Lambda)[0]
gradFunc = lambda p: CoFi_Cost(p, Ynorm, R, num_users, num_movies, num_features, Lambda)[1]

result = minimize(costFunc, initial_parameters, method='CG', jac=gradFunc, options={'disp': True, 'maxiter': 100.0})
theta = result.x
cost = result.fun

X = theta[:num_movies*num_features].reshape(num_movies, num_features)
Theta = theta[num_movies*num_features:].reshape(num_users, num_features)

         Current function value: 12473.140214
         Iterations: 53
         Function evaluations: 172
         Gradient evaluations: 160


In [107]:
p = X.dot(Theta.T)
my_predictions = p[:, 0] + Ymean

movieList = Product

# sort predictions descending
pre=np.array([[idx, p] for idx, p in enumerate(my_predictions)])
post = pre[pre[:,1].argsort()[::-1]]
r = post[:,1]
ix = post[:,0]

print('\nTop recommendations for you:')
for i in range(10):
    j = int(ix[i])
    print('Predicting rating %.1f for movie %s\n' % (my_predictions[j], Product[j]))

print('\nOriginal ratings provided:')
for i in range(len(Rating)):
    if Rating[i] > 0:
        print('Rated %d for %s\n' % (Rating[i], Product[i]))


Top recommendations for you:
Predicting rating 7.6 for movie ['814 Great Day in Harlem, A (1994)']

Predicting rating 7.5 for movie ['119 Maya Lin: A Strong Clear Vision (1994)']

Predicting rating 6.2 for movie ['711 Substance of Fire, The (1996)']

Predicting rating 6.0 for movie ['677 Fire on the Mountain (1996)']

Predicting rating 5.9 for movie ['867 Whole Wide World, The (1996)']

Predicting rating 5.7 for movie ['113 Horseman on the Roof, The (Hussard sur le toit, Le) (1']

Predicting rating 5.7 for movie ['857 Paris Was a Woman (1995)']

Predicting rating 5.7 for movie ["600 Daniel Defoe's Robinson Crusoe (1996)"]

Predicting rating 5.6 for movie ['701 Wonderful, Horrible Life of Leni Riefenstahl, The (199']

Predicting rating 5.5 for movie ['850 Perfect Candidate, A (1996)']


Original ratings provided:
Rated 4 for ['1 Toy Story (1995)']

Rated 3 for ['7 Twelve Monkeys (1995)']

Rated 5 for ['12 Usual Suspects, The (1995)']

Rated 4 for ['54 Outbreak (1995)']

Rated 5 for ['6