In [1]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
# Load the ratings table (presumably the MovieLens 20M dump, judging by the
# 'ml-20m' directory name). The path is relative to the working directory.
ratings = pd.read_csv('ml-20m/ratings.csv')

# Preview the first rows; columns are userId, movieId, rating, timestamp.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [3]:
# Shift the raw ids down by one to obtain the indices fed to the Embedding layers.
# (Assumes the raw ids start at 1, as the preview of `ratings` suggests -- TODO confirm.)
ratings['userEmbId'] = ratings['userId'] - 1
ratings['movieEmbId'] = ratings['movieId'] - 1

# Largest raw ids. Since each embedding index is `id - 1`, these values equal
# "largest embedding index + 1" and are what the Embedding layers take as `input_dim`.
# `.max()` alone is sufficient: de-duplicating the column first cannot change
# its maximum and only adds work on a very large column.
max_userid = ratings['userId'].max()
max_movieid = ratings['movieId'].max()

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,userEmbId,movieEmbId
0,1,2,3.5,1112486027,0,1
1,1,29,3.5,1112484676,0,28
2,1,32,3.5,1112484819,0,31
3,1,47,3.5,1112484727,0,46
4,1,50,3.5,1112484580,0,49


In [4]:
# One row per distinct userId, in order of first appearance, as a single-column
# DataFrame with a fresh 0..n-1 index.
# Built from the `ratings` frame that is already in memory instead of parsing the
# ratings CSV a second time; `reset_index(drop=True)` replaces the former
# reset_index() + drop(columns='index') round-trip and yields the same frame.
users = ratings[['userId']].drop_duplicates().reset_index(drop=True)

In [5]:
# Preview the de-duplicated user table (a single `userId` column).
users.head()

Unnamed: 0,userId
0,1
1,2
2,3
3,4
4,5


In [6]:
# Load the movie metadata; `genres` is a single '|'-separated string per movie.
movies = pd.read_csv('ml-20m/movies.csv')

# Preview the first rows; columns are movieId, title, genres.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# One-hot encode the '|'-separated `genres` string:
#   1. split into one genre per cell,
#   2. stack into a long Series indexed by (movie row, position),
#   3. dummy-encode each genre label,
#   4. add the dummies back up per movie row (index level 0).
# `.sum(level=0)` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `.groupby(level=0).sum()` is the documented replacement and works on both.
genres = (
    pd.get_dummies(movies.genres.str.split('|', expand=True).stack())
    .groupby(level=0)
    .sum()
)

In [8]:
# Place the genre indicator columns next to each movie, then discard the raw
# `genres` string and the '(no genres listed)' indicator column.
# NOTE: this reassigns `movies`, so re-running the cell without re-loading
# `movies` first will fail on the already-dropped columns.
movies = (
    pd.concat([movies, genres], axis=1)
    .drop(columns=['genres', '(no genres listed)'])
)

In [9]:
# Preview the movie table with one indicator column per genre.
movies.head()

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Shuffle every rating row once, with a fixed seed, so the three arrays taken
# from the shuffled frame below stay aligned row-for-row.
shuffled_ratings = ratings.sample(frac=1.0, random_state=42)

# Extract the two model inputs (embedding indices) and the regression target
# as NumPy arrays.
Users, Movies, Ratings = (
    shuffled_ratings[column].values
    for column in ('userEmbId', 'movieEmbId', 'rating')
)

print('Users:', Users, ', shape =', Users.shape)
print('Movies:', Movies, ', shape =', Movies.shape)
print('Ratings:', Ratings, ', shape =', Ratings.shape)

Users: [122269  49017  89526 ...  15190  92010 111372] , shape = (20000263,)
Movies: [  8359     31 109373 ...   2716   1275   2639] , shape = (20000263,)
Ratings: [3.5 2.  3.5 ... 3.5 5.  3.5] , shape = (20000263,)


In [11]:
import keras
from keras.layers import Input, Embedding, dot, Dense, Reshape
from keras.models import Model, Sequential

Using TensorFlow backend.


In [12]:
# User branch: a single integer index per sample is looked up in an embedding
# table with `max_userid` rows and 100 columns.
user_input = Input(name='user_input', shape=(1,))
x = Embedding(
    input_dim=max_userid,
    output_dim=100,
    input_length=1,
)(user_input)
# Drop the length-1 sequence axis: (batch, 1, 100) -> (batch, 100).
user_output = Reshape(target_shape=(100,))(x)

In [13]:
# Movie branch: a single integer index per sample is looked up in an embedding
# table with `max_movieid` rows and 100 columns.
movie_input = Input(name='movie_input', shape=(1,))
y = Embedding(
    input_dim=max_movieid,
    output_dim=100,
    input_length=1,
)(movie_input)
# Drop the length-1 sequence axis: (batch, 1, 100) -> (batch, 100).
movie_output = Reshape(target_shape=(100,))(y)

In [14]:
# Project each 100-d embedding to 10 dimensions with its own Dense layer, then
# take the dot product of the two projections as the model's prediction.
z = dot(
    [Dense(10)(user_output), Dense(10)(movie_output)],
    axes=1,
)

In [15]:
# Two-input model: (user index, movie index) -> dot-product output `z`.
# The input order here must match the order of the arrays passed to `fit`.
model = Model(inputs=[user_input, movie_input], outputs=z)

In [16]:
# Print the layer table; the two Embedding layers hold nearly all the parameters.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 100)       13849300    user_input[0][0]                 
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 100)       13126200    movie_input[0][0]                
__________________________________________________________________________________________________
reshape_1 

In [17]:
# Train as a regression on the rating value: mean squared error loss, Adamax optimizer.
# The final cell takes the square root of the validation loss to report RMSE.
model.compile(loss='mse', optimizer='adamax')

In [18]:
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

In [19]:
# Early stopping watches validation loss with a patience of 2 epochs; the
# checkpoint rewrites 'weights.h5' only when its monitored quantity improves.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2),
    ModelCheckpoint(filepath='weights.h5', save_best_only=True),
]

# Train for at most 30 epochs, holding out 10% of the samples for validation.
# `history` is only assigned if `fit` returns, i.e. training is not interrupted.
history = model.fit(
    x=[Users, Movies],
    y=Ratings,
    epochs=30,
    validation_split=0.1,
    verbose=2,
    callbacks=callbacks,
)

Train on 18000236 samples, validate on 2000027 samples
Epoch 1/30


KeyboardInterrupt: 

In [None]:
# Show the best validation RMSE
min_val_loss, idx = min((val, idx) for (idx, val) in enumerate(history.history['val_loss']))
print('Minimum RMSE at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(math.sqrt(min_val_loss)))