In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [17]:
# prepare data
ratings_df = pd.read_csv('ratings.csv')
movies_df = pd.read_csv('movies.csv')

movies_df['movieRow'] = movies_df.index
movies_df = movies_df[['movieRow', 'movieId', 'title']]
movies_df.to_csv('moviesProcessed.csv')

ratings_df = pd.merge(ratings_df, movies_df, on='movieId')
ratings_df = ratings_df[['userId', 'movieRow', 'rating']]

In [29]:
# movie-user matrix
userNo = ratings_df['userId'].max() + 1
movieNo = ratings_df['movieRow'].max() + 1

rating = np.zeros((movieNo, userNo))
flag = 0
ratings_df_length = np.shape(ratings_df)[0]

for index, row in ratings_df.iterrows():
    rating[int(row['movieRow']), int(row['userId'])] = row['rating']
    flag += 1

record = rating > 0
record = np.array(record, dtype = int)

In [30]:
record

array([[0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [36]:
# build model
def normalizeRatings(rating, record):
    m, n = rating.shape # m=number of movies, n=number of users
    ratings_mean = np.zeros((m,1)) # for each movie, we calculate its mean(rating) value
    ratings_norm = np.zeros((m,n))
    
    for i in range(m):
        idx = record[i,:] != 0
        ratings_mean[i] = np.mean(rating[i,idx])
        ratings_norm[i,idx] -=ratings_mean[i]
    return ratings_norm, ratings_mean

rating_norm, rating_mean = normalizeRatings(rating, record)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [41]:
rating_norm = np.nan_to_num(rating_norm)
rating_norm

rating_mean= np.nan_to_num(rating_mean)
rating_mean

array([[3.87246964],
       [3.40186916],
       [3.16101695],
       ...,
       [3.        ],
       [0.        ],
       [5.        ]])

In [44]:
num_featurs = 10 
X_parameters = tf.Variable(tf.random_normal([movieNo, num_featurs], stddev = 0.35))
Theta_parameters = tf.Variable(tf.random_normal([userNo, num_featurs], stddev = 0.35))

loss = 1/2 * tf.reduce_sum(((tf.matmul(X_parameters, Theta_parameters, transpose_b = True) - rating_norm) * record) **2) + 1/2 * (tf.reduce_sum(X_parameters ** 2) + tf.reduce_sum(Theta_parameters ** 2)) 

In [45]:
optimizer = tf.train.AdamOptimizer(1e-4)
train = optimizer.minimize(loss)

In [49]:
# train model
tf.summary.scalar('loss', loss)
summaryMerged = tf.summary.merge_all()
filename = '/Users/mwang/Desktop/System/movie_tensorboard'
writer = tf.summary.FileWriter(filename)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [50]:
for i in range(5000):
    _, movie_summary = sess.run([train, summaryMerged])
    writer.add_summary(movie_summary, i)

In [53]:
# evaluate
Current_X_paramters, Current_Theta_paramters = sess.run([X_parameters, Theta_parameters])
predicts = np.dot(Current_X_paramters, Current_Theta_paramters.T) + rating_mean

error = np.sqrt(np.sum((predicts - rating)**2))
error

4153.318254600069

In [54]:
# build recommendation system 
user_id = input('user_id: ')
sortedResult = predicts[:, int(user_id)].argsort()[::-1]
idx = 0
print('top 20 recommended movies: ')
for i in sortedResult:
    print('score: %.2f, movie: %s' %(predicts[i,int(user_id)], movies_df.iloc[i]['title']))
    idx += 1
    if idx == 20:break

user_id: 123
top 20 recommended movies: 
score: 5.35, movie: Boy Crazy (2009)
score: 5.25, movie: To the Left of the Father (Lavoura Arcaica) (2001)
score: 5.19, movie: Defying Gravity (1997)
score: 5.10, movie: 29th and Gay (2005)
score: 5.08, movie: FAQs (2005)
score: 5.04, movie: Curiosity of Chance, The (2006)
score: 5.03, movie: Faces (1968)
score: 5.00, movie: Edge of Heaven, The (Auf der anderen Seite) (2007)
score: 5.00, movie: Shelter (2007)
score: 5.00, movie: Shining Through (1992)
score: 4.97, movie: Romeos (2011)
score: 4.93, movie: Me Before You (2016)
score: 4.92, movie: Germany Year Zero (Germania anno zero) (Deutschland im Jahre Null) (1948)
score: 4.92, movie: The Man I Love (1997)
score: 4.91, movie: Get Your Stuff (2000)
score: 4.90, movie: Dorian Blues (2004)
score: 4.88, movie: The Biggest Fan (2002)
score: 4.85, movie: The Big Gay Musical (2009)
score: 4.85, movie: I Think I Do (1997)
score: 4.84, movie: Au Hasard Balthazar (1966)
