In [2]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [3]:
# Load the ratings table; the CSV carries a stray "Unnamed: 0" column that is discarded.
ratings = pd.read_csv(
    '../matrix_factorization/data/ml-100k/includes_team_ratings.csv'
).drop(["Unnamed: 0"], axis='columns')

# Shift the raw ids down by one to obtain the indices fed to the embedding layers.
ratings['user_emb_id'] = ratings['user_id'].sub(1)
ratings['movie_emb_id'] = ratings['movie_id'].sub(1)

# Largest raw ids present in the data; used further down as the embedding table sizes.
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()

ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,user_emb_id,movie_emb_id
0,196,242,3,881250949.0,195,241
1,186,302,3,891717742.0,185,301
2,22,377,1,878887116.0,21,376
3,244,51,2,880606923.0,243,50
4,166,346,1,886397596.0,165,345


In [4]:
# Load the user table, discarding the stray "Unnamed: 0" column stored in the CSV.
users = pd.read_csv(
    '../matrix_factorization/data/ml-100k/includes_team_users.csv'
).drop(["Unnamed: 0"], axis='columns')
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
# Load the movie catalogue. u.item has no header row, so column names are supplied
# explicitly: five descriptive fields followed by one column per genre.
i_cols = [
    'movie_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
] + [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(
    '../matrix_factorization/data/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    names=i_cols,
)

movies.head()

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# Build the training arrays from a single, reproducibly shuffled copy of the ratings.
# All three arrays are taken from the same shuffled frame, so row i of each array
# refers to the same (user, movie, rating) triple.
shuffled_ratings = ratings.sample(frac=1., random_state=42)

Users = shuffled_ratings['user_emb_id'].values      # user indices for the user embedding
Movies = shuffled_ratings['movie_emb_id'].values    # movie indices for the movie embedding
Ratings = shuffled_ratings['rating'].values         # regression targets

# Quick sanity check of contents and lengths.
print('Users:', Users, ', shape =', Users.shape)
print('Movies:', Movies, ', shape =', Movies.shape)
print('Ratings:', Ratings, ', shape =', Ratings.shape)

Users: [268 576  42 ... 436 283 221] , shape = (100105,)
Movies: [ 527 1043   13 ...  474  321  199] , shape = (100105,)
Ratings: [4 4 2 ... 3 3 3] , shape = (100105,)


In [72]:
import keras
from keras.layers import Dense, Embedding, Input, Reshape, dot
from keras.models import Model, Sequential

In [73]:
# User branch: one integer index per sample is looked up in a 100-dimensional
# embedding table with max_userid rows.
user_input = Input(name='user_input', shape=(1,))
x = Embedding(
    input_dim=max_userid,
    output_dim=100,
    input_length=1,
)(user_input)
# The embedding output is (batch, 1, 100); reshape it to a flat (batch, 100) vector.
user_output = Reshape(target_shape=(100,))(x)

In [74]:
# Movie branch: one integer index per sample is looked up in a 100-dimensional
# embedding table with max_movieid rows.
movie_input = Input(name='movie_input', shape=(1,))
y = Embedding(
    input_dim=max_movieid,
    output_dim=100,
    input_length=1,
)(movie_input)
# The embedding output is (batch, 1, 100); reshape it to a flat (batch, 100) vector.
movie_output = Reshape(target_shape=(100,))(y)

In [75]:
# Project each embedding through its own 10-unit Dense layer, then score the
# (user, movie) pair as the dot product of the two projections.
user_vec = Dense(10)(user_output)
movie_vec = Dense(10)(movie_output)
z = dot([user_vec, movie_vec], axes=1)

In [79]:
# Assemble the two-input model: (user index, movie index) -> dot-product score.
model = Model(
    inputs=[user_input, movie_input],
    outputs=z,
)

In [83]:
# Print the layer-by-layer structure of the model with output shapes and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_29 (Embedding)        (None, 1, 100)       94800       user_input[0][0]                 
__________________________________________________________________________________________________
embedding_30 (Embedding)        (None, 1, 100)       168200      movie_input[0][0]                
__________________________________________________________________________________________________
reshape_20

In [80]:
# Train as a regression on the rating: mean squared error loss, Adamax optimizer.
model.compile(
    optimizer='adamax',
    loss='mse',
)

In [81]:
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

In [82]:
# Stop training once val_loss has gone two epochs without improving, and write
# weights.h5 only when the checkpoint's monitored metric improves.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2),
    ModelCheckpoint(filepath='weights.h5', save_best_only=True),
]

# Hold out 10% of the samples for validation; verbose=2 prints one line per epoch.
history = model.fit(
    x=[Users, Movies],
    y=Ratings,
    epochs=30,
    validation_split=.1,
    verbose=2,
    callbacks=callbacks,
)

Train on 90094 samples, validate on 10011 samples
Epoch 1/30
 - 10s - loss: 1.7251 - val_loss: 0.9310
Epoch 2/30
 - 14s - loss: 0.9020 - val_loss: 0.9057
Epoch 3/30
 - 15s - loss: 0.8840 - val_loss: 0.8959
Epoch 4/30
 - 14s - loss: 0.8588 - val_loss: 0.8724
Epoch 5/30
 - 11s - loss: 0.8359 - val_loss: 0.8673
Epoch 6/30
 - 10s - loss: 0.8197 - val_loss: 0.8573
Epoch 7/30
 - 10s - loss: 0.8035 - val_loss: 0.8454
Epoch 8/30
 - 10s - loss: 0.7827 - val_loss: 0.8356
Epoch 9/30
 - 10s - loss: 0.7617 - val_loss: 0.8387
Epoch 10/30
 - 10s - loss: 0.7422 - val_loss: 0.8393


In [84]:
# Report the epoch with the lowest validation loss. The loss is MSE, so its
# square root is the validation RMSE. Ties resolve to the earliest epoch.
val_losses = history.history['val_loss']
min_val_loss = min(val_losses)
idx = val_losses.index(min_val_loss)

best_epoch = '{:d}'.format(idx + 1)  # epochs are reported 1-based
best_rmse = '{:.4f}'.format(math.sqrt(min_val_loss))
print('Minimum RMSE at epoch', best_epoch, '=', best_rmse)

Minimum RMSE at epoch 8 = 0.9141
