## Loading Datasets

In [1]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Reading ratings file (tab-separated; only the columns used in this notebook are loaded)
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])
# Largest user and movie IDs present in the ratings; .max() already ignores
# duplicates, so no de-duplication step is needed first
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()

# Reading users file
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1',
                    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Reading movies file
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])

## Creating sets

In [2]:
# Create training set: shuffle every rating row once, with a fixed seed so the
# order is reproducible
shuffled_ratings = ratings.sample(frac=1., random_state=42)

# Extract the two model inputs (user / movie embedding ids) and the target
# (rating) from the shuffled frame, all in the same row order
Users, Movies, Ratings = (shuffled_ratings[column].values
                          for column in ('user_emb_id', 'movie_emb_id', 'rating'))

# Show the contents and shape of each array
for label, values in (('Users:', Users), ('Movies:', Movies), ('Ratings:', Ratings)):
    print(label, values, ', shape =', values.shape)

Users: [5411 5439  367 ...  853 4032  785] , shape = (1000209,)
Movies: [2682  903 3716 ... 3101 3478 1390] , shape = (1000209,)
Ratings: [2 5 4 ... 3 5 4] , shape = (1000209,)


## Deep Learning Model


In [3]:
# Import Keras libraries
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
# Import CF Model Architecture
from CFModel import CFModel

Using TensorFlow backend.


In [4]:
# Define constants
K_FACTORS = 100 # Embedding size for users and movies; passed as the third argument to CFModel
TEST_USER = 2000 # user_id of the fixed test user used for the prediction and recommendation demos

Compile the model using Mean Squared Error (MSE) as the loss function and the AdaMax learning algorithm.

In [5]:
# Define model: CFModel is built from the largest user_id, the largest movie_id
# and the embedding size K_FACTORS
model = CFModel(max_userid, max_movieid, K_FACTORS)
# Compile the model using MSE as the loss function and the AdaMax learning algorithm
model.compile(loss='mse', optimizer='adamax')

Instructions for updating:
keep_dims is deprecated, use keepdims instead


  self.add(Merge([P, Q], mode='dot', dot_axes=1))


### Train the Model

In [7]:
# Callbacks monitor the validation loss:
# - EarlyStopping stops training once val_loss has failed to improve for 2 epochs
# - ModelCheckpoint saves the model to weights.h5 each time the validation loss has improved
callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint('weights.h5', save_best_only=True)]

# Use at most 10 epochs, 90% training data, 10% validation data
history = model.fit([Users, Movies], Ratings, nb_epoch=10, validation_split=.1,
                    verbose=2, callbacks=callbacks)

Train on 900188 samples, validate on 100021 samples
Epoch 1/10
701s - loss: 1.0795 - val_loss: 0.9774
Epoch 2/10
686s - loss: 0.9178 - val_loss: 0.8960
Epoch 3/10
687s - loss: 0.8627 - val_loss: 0.8581
Epoch 4/10
691s - loss: 0.8300 - val_loss: 0.8342
Epoch 5/10
698s - loss: 0.8042 - val_loss: 0.8176
Epoch 6/10
695s - loss: 0.7819 - val_loss: 0.8005
Epoch 7/10
691s - loss: 0.7608 - val_loss: 0.7880
Epoch 8/10
696s - loss: 0.7399 - val_loss: 0.7775
Epoch 9/10
702s - loss: 0.7189 - val_loss: 0.7678
Epoch 10/10
700s - loss: 0.6981 - val_loss: 0.7593


### Root Mean Square Error
During the training process above, I saved the model weights each time the validation loss improved. The lowest validation loss recorded in the training history is the best validation Mean Squared Error, so its square root gives the best validation Root Mean Square Error.

In [9]:
# Show the best validation RMSE: locate the epoch with the lowest validation
# loss (the earliest one on ties) and take the square root of that MSE
val_losses = history.history['val_loss']
idx = min(range(len(val_losses)), key=val_losses.__getitem__)
min_val_loss = val_losses[idx]
best_epoch_text = '{:d}'.format(idx+1)
best_rmse_text = '{:.4f}'.format(math.sqrt(min_val_loss))
print('Minimum RMSE at epoch', best_epoch_text, '=', best_rmse_text)

Minimum RMSE at epoch 10 = 0.8714


### Predict the Ratings
The next step is to actually predict the ratings a random user will give to a random movie. Below I apply the freshly trained deep learning model for all the users and all the movies, using 100-dimensional embeddings for each of them. I also load pre-trained weights from *[weights.h5](https://github.com/khanhnamle1994/movielens/blob/master/weights.h5)* for the model.

In [14]:
# Use the pre-trained model: build a second CFModel with the same constructor
# arguments as `model`
# NOTE(review): predict_rating below calls `model`, not `trained_model` — confirm which is intended
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)
# Load weights from weights.h5 (the same path ModelCheckpoint writes to during training)
trained_model.load_weights('weights.h5')

In [10]:
# Display the users-table row belonging to the test user
users.loc[users['user_id'] == TEST_USER]

Unnamed: 0,user_id,gender,zipcode,age_desc,occ_desc
1999,2000,M,44685,18-24,college/grad student


In [11]:
# Function to predict the ratings given User ID and Movie ID
def predict_rating(user_id, movie_id, rating_model=None):
    """Predict the rating a user would give to a movie.

    Parameters
    ----------
    user_id : int
        User ID as found in the ``user_id`` column.
    movie_id : int
        Movie ID as found in the ``movie_id`` column.
    rating_model : optional
        Any object exposing ``rate(user_index, movie_index)``. When omitted,
        the notebook-level ``model`` is used, so existing two-argument calls
        behave exactly as before. Pass ``trained_model`` to score with the
        weights loaded from disk instead.

    Returns
    -------
    Whatever ``rating_model.rate`` returns for the shifted indices.
    """
    if rating_model is None:
        rating_model = model
    # Both IDs are shifted down by one before being handed to the model
    # (presumably the 1-based IDs map to 0-based embedding rows — confirm in CFModel)
    return rating_model.rate(user_id - 1, movie_id - 1)

Show the top 20 movies that user 2000 has already rated, including the *prediction* column showing the ratings that user 2000 would have given according to the *predict_rating* function defined above.

In [12]:
# Ratings the test user has already given. `.loc` selects rows and columns in a
# single step and `.copy()` yields an independent frame, so adding a column below
# does not write into a slice of `ratings` (avoids pandas' SettingWithCopyWarning).
user_ratings = ratings.loc[ratings['user_id'] == TEST_USER,
                           ['user_id', 'movie_id', 'rating']].copy()
# Predicted rating for each movie the test user has rated
user_ratings['prediction'] = user_ratings['movie_id'].apply(
    lambda movie_id: predict_rating(TEST_USER, movie_id))
# Highest actual ratings first, joined with each movie's title and genres
user_ratings.sort_values(by='rating',
                         ascending=False).merge(movies,
                                                on='movie_id',
                                                how='inner',
                                                suffixes=['_u', '_m']).head(20)

Unnamed: 0,user_id,movie_id,rating,prediction,title,genres
0,2000,1639,5,3.624804,Chasing Amy (1997),Drama|Romance
1,2000,2529,5,3.63248,Planet of the Apes (1968),Action|Sci-Fi
2,2000,1136,5,4.160076,Monty Python and the Holy Grail (1974),Comedy
3,2000,2321,5,3.832311,Pleasantville (1998),Comedy
4,2000,2858,5,4.255318,American Beauty (1999),Comedy|Drama
5,2000,2501,5,4.207597,October Sky (1999),Drama
6,2000,2804,5,4.236019,"Christmas Story, A (1983)",Comedy|Drama
7,2000,1688,5,3.525112,Anastasia (1997),Animation|Children's|Musical
8,2000,1653,5,3.707461,Gattaca (1997),Drama|Sci-Fi|Thriller
9,2000,527,5,4.472422,Schindler's List (1993),Drama|War


### Recommend Movies
List of the top 20 movies our test user has not rated yet, sorted by predicted rating.

In [13]:
# Movies that appear in the ratings data but that the test user has not rated,
# one row per movie. `~` negates the boolean mask (instead of comparing to False)
# and `.copy()` gives an independent frame to add the prediction column to.
not_rated_by_user = ~ratings['movie_id'].isin(user_ratings['movie_id'])
recommendations = ratings.loc[not_rated_by_user, ['movie_id']].drop_duplicates().copy()
# Predicted rating of the test user for each of those movies
recommendations['prediction'] = recommendations['movie_id'].apply(
    lambda movie_id: predict_rating(TEST_USER, movie_id))
# Highest predictions first, joined with each movie's title and genres
recommendations.sort_values(by='prediction',
                          ascending=False).merge(movies,
                                                 on='movie_id',
                                                 how='inner',
                                                 suffixes=['_u', '_m']).head(20)

Unnamed: 0,movie_id,prediction,title,genres
0,2905,4.887506,Sanjuro (1962),Action|Adventure
1,668,4.682042,Pather Panchali (1955),Drama
2,3030,4.636191,Yojimbo (1961),Comedy|Drama|Western
3,3022,4.615102,"General, The (1927)",Comedy
4,2360,4.58141,"Celebration, The (Festen) (1998)",Drama
5,910,4.552507,Some Like It Hot (1959),Comedy|Crime
6,1147,4.551263,When We Were Kings (1996),Documentary
7,670,4.548893,"World of Apu, The (Apur Sansar) (1959)",Drama
8,669,4.534374,Aparajito (1956),Drama
9,1204,4.529888,Lawrence of Arabia (1962),Adventure|War
