In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
from keras import layers
from keras import regularizers
from keras import backend as K
from keras.models import Model, Input
from keras.layers import Embedding, Flatten, dot
from keras.optimizers import Adam
%matplotlib inline

Dataset source: MovieLens 100K on Kaggle — https://www.kaggle.com/prajitdatta/movielens-100k-dataset

In [2]:
# Directory that holds the rating files. Keep the trailing slash: file names
# are appended to this value by plain string concatenation.
path = "../../data/kaggle_movielens_100k/"

In [3]:
def read_rating_data(filename, data_dir=None):
    """Load one tab-separated ratings file into a DataFrame.

    Parameters
    ----------
    filename : str
        File name appended to the data directory (e.g. ``'ua.base'``).
    data_dir : str, optional
        Directory prefix, including its trailing separator, because the file
        location is built by string concatenation. When omitted, the
        notebook-level ``path`` is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns ``userId``, ``movieId``, ``rating`` and ``timestamp``. The
        file is read without a header row.
    """
    prefix = path if data_dir is None else data_dir
    return pd.read_csv(
        prefix + filename,
        sep='\t',
        header=None,
        names=['userId', 'movieId', 'rating', 'timestamp'],
    )

In [4]:
# 'ua.base' is used as the training split and 'ua.test' as the held-out split.
train_df = read_rating_data('ua.base')
test_df = read_rating_data('ua.test')

# Union of both splits. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; without it the row labels of the two splits overlap (each file
# is read with its own 0-based index), which makes label-based selection and
# index alignment on full_df ambiguous.
full_df = pd.concat([train_df, test_df], ignore_index=True)

In [97]:
# Re-index the user / movie ids to contiguous 0-based integers so they can be
# fed to the Embedding layers. The vocabulary is built from full_df so that an
# id occurring in only one of the splits still receives an index.
userid_to_idx = {raw_id: idx for idx, raw_id in enumerate(full_df['userId'].unique())}
movieid_to_idx = {raw_id: idx for idx, raw_id in enumerate(full_df['movieId'].unique())}

# Encode every column first and validate before overwriting anything.
# Series.map() yields NaN for ids that are missing from the mapping, which
# would silently turn the column into floats and make nunique() under-count.
# That can happen when this cell is re-run while the three frames are no longer
# in the same state (some already re-indexed, some not).
_id_maps = {'userId': userid_to_idx, 'movieId': movieid_to_idx}
_frames = {'train_df': train_df, 'test_df': test_df, 'full_df': full_df}
_encoded = {
    (frame_name, column): frame[column].map(mapping)
    for frame_name, frame in _frames.items()
    for column, mapping in _id_maps.items()
}

_unmapped = [key for key, series in _encoded.items() if series.isna().any()]
if _unmapped:
    raise ValueError(
        'ids missing from the id -> index mapping in {}; reload the rating '
        'files and run this cell exactly once'.format(sorted(_unmapped))
    )

# Only now overwrite the id columns in place, as the rest of the notebook
# reads the re-indexed values from 'userId' / 'movieId'.
for (frame_name, column), series in _encoded.items():
    _frames[frame_name][column] = series

In [98]:
# User x movie rating matrix; (user, movie) pairs without a rating are NaN.
pd.crosstab(
    index=full_df['userId'],
    columns=full_df['movieId'],
    values=full_df['rating'],
    aggfunc=np.sum,
)

movieId,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
1.0,4.0,,,,,,,,,2.0,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,4.0,3.0,,,,,,,,,...,,,,,,,,,,
5.0,4.0,,,,,,2.0,4.0,4.0,,...,,,,,,,,,,
6.0,,,,5.0,,,5.0,5.0,5.0,4.0,...,,,,,,,,,,
7.0,,,,,,,3.0,,,,...,,,,,,,,,,
8.0,,,,,,5.0,4.0,,,,...,,,,,,,,,,
9.0,4.0,,,4.0,,,4.0,,4.0,,...,,,,,,,,,,


In [99]:
# Number of distinct users / movies (used as the Embedding table sizes) and
# the dimensionality of the latent factor vectors.
n_users, n_movies = (full_df[column].nunique() for column in ('userId', 'movieId'))
n_factors = 128

## Dot Product

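Matrix factorization (MF) model: every user and every movie gets an `n_factors`-dimensional embedding, and the predicted rating is the dot product of the two vectors.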

In [117]:
# Plain matrix-factorisation model: rating ~ dot(user vector, movie vector).
# Each input carries a single integer id per sample.
u_in = Input(name='user_in', shape=(1,), dtype=np.int64)
m_in = Input(name='movie_in', shape=(1,), dtype=np.int64)

# One L2-regularised, n_factors-wide embedding row per user and per movie.
u_emb = Embedding(
    input_dim=n_users,
    output_dim=n_factors,
    embeddings_regularizer=regularizers.l2(1e-5),
)(u_in)
m_emb = Embedding(
    input_dim=n_movies,
    output_dim=n_factors,
    embeddings_regularizer=regularizers.l2(1e-5),
)(m_in)

# Contract the factor axis (axis 2) of the two embedding tensors, then flatten
# the leftover singleton dimensions to one value per sample.
x = layers.dot([u_emb, m_emb], axes=2)
x = Flatten()(x)

model = Model(inputs=[u_in, m_in], outputs=x)
model.summary()

ValueError: Dimension incompatibility 1 != 128. Layer shapes: (None, 1, 128), (None, 1, 128)

In [115]:
# Regress the rating directly: Adam with step size 1e-4, mean squared error.
model.compile(optimizer=Adam(1e-4), loss='mse')

In [116]:
# Fit on the training split for 5 epochs and report the loss on the test split
# after each one. Inputs are passed in the model's input order: user, movie.
model.fit(
    x=[train_df['userId'], train_df['movieId']],
    y=train_df['rating'],
    validation_data=(
        [test_df['userId'], test_df['movieId']],
        test_df['rating'],
    ),
    epochs=5,
)

Train on 90570 samples, validate on 9430 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x121009c50>

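## Add bias

Same dot-product model, plus a learned scalar bias per user and per movie that is added to the prediction.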

In [119]:
# Matrix factorisation with bias terms:
#   rating ~ dot(user vector, movie vector) + user bias + movie bias
u_in = Input(shape=(1, ), dtype=np.int64, name='user_in')
# Layer name spelled 'movie_in' (was misspelled), matching the input name used
# by the plain dot-product model.
m_in = Input(shape=(1, ), dtype=np.int64, name='movie_in')

# Latent factor vectors, L2-regularised.
u_emb = Embedding(n_users, n_factors, embeddings_regularizer=regularizers.l2(1e-5))(u_in)
m_emb = Embedding(n_movies, n_factors, embeddings_regularizer=regularizers.l2(1e-5))(m_in)

# Dot product over the factor axis. The result is left un-flattened here so it
# has the same rank as the bias embeddings it is added to below.
x = layers.dot([u_emb, m_emb], 2)

# One learned scalar per user and per movie (embedding of width 1).
u_bias = Embedding(n_users, 1)(u_in)
m_bias = Embedding(n_movies, 1)(m_in)

# Sum interaction and bias terms, then flatten to one value per sample.
x = layers.add([x, u_bias, m_bias])
x = Flatten()(x)

model = Model([u_in, m_in], x)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
moive_in (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_37 (Embedding)         (None, 1, 128)        120576      user_in[0][0]                    
____________________________________________________________________________________________________
embedding_38 (Embedding)         (None, 1, 128)        215296      moive_in[0][0]                   
___________________________________________________________________________________________

In [120]:
# Same training setup as the bias-free model: Adam (1e-4) on mean squared error.
model.compile(optimizer=Adam(1e-4), loss='mse')

In [126]:
# Update the learning rate in place. Assigning a Python float to
# `model.optimizer.lr` would replace the optimizer's backend variable with a
# constant rather than change its value, so the new rate may not reach an
# already-built training function and callbacks that read the variable can
# break. K.set_value writes into the existing variable instead.
K.set_value(model.optimizer.lr, 1e-4)

# Fit on the training split for 5 epochs, validating on the test split.
model.fit([train_df['userId'], train_df['movieId']], train_df['rating'],
          validation_data=([test_df['userId'], test_df['movieId']], test_df['rating']),
          epochs=5)

Train on 90570 samples, validate on 9430 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1210f18d0>