[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/James-Leslie/deep-collaborative-filtering/blob/master/movielens_skorch.ipynb)

In [None]:
# !pip install tensorflow-gpu==2.0.0-beta1

In [1]:
# Register the TensorBoard IPython extension so the `%tensorboard` magic used
# further down is available in this kernel.
%load_ext tensorboard

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, datetime
import warnings

from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

from sklearn.model_selection import train_test_split

%matplotlib inline

In [3]:
# Location of the MovieLens 10M split files. Uncomment the URL variant to read
# the same CSVs remotely instead of from a local checkout.
# path = 'https://raw.githubusercontent.com/James-Leslie/deep-collaborative-filtering/master/data/ml-10M100K/'  # access from anywhere
path = 'data/ml-10M100K/'  # if the files are local

# The training split is stored as two CSV parts; stitch them back together.
train = pd.concat((pd.read_csv(path+'train1.csv'), pd.read_csv(path+'train2.csv')), ignore_index=True)
test = pd.read_csv(path+'test.csv')

# Full ratings table: training rows first, test rows last, with a fresh index.
df = pd.concat((train, test), ignore_index=True)

In [4]:
# Peek at the first rows of the combined ratings table.
df.head()

Unnamed: 0,userId,movieId,rating
0,17186,12,3.0
1,4960,5764,4.0
2,52144,1616,4.0
3,68149,1466,3.5
4,16898,1124,2.0


In [5]:
# (rows, columns) of the combined ratings table.
df.shape

(10000054, 3)

In [7]:
# Count the distinct users and movies that appear in the ratings table.
n_users = df['userId'].nunique()
n_items = df['movieId'].nunique()
print('Number of users: {}'.format(n_users))
print('Number of items: {}'.format(n_items))

Number of users: 69878
Number of items: 10677


In [8]:
# Largest ID in each column. When IDs are contiguous and start at zero this
# equals the distinct count minus one.
max_users = df['userId'].max()
max_items = df['movieId'].max()
print("Max user ID: {}".format(max_users))
print("Max movie ID: {}".format(max_items))

Max user ID: 69877
Max movie ID: 10676


## Predict ratings

### Trunk of model
We will re-use the item embedding layer later on

In [9]:
# Give every run its own timestamped sub-directory under logs/ so TensorBoard
# can show runs side by side.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", run_stamp)
os.makedirs(logdir, exist_ok=True)

# histogram_freq / embeddings_freq of 1 ask for histograms and embedding
# snapshots at every epoch.
tensorboard_callback = TensorBoard(log_dir=logdir, histogram_freq=1, embeddings_freq=1)

In [10]:
# Size each embedding table from the largest ID rather than from the number of
# distinct IDs: Keras needs input_dim to be (max index + 1), and the distinct
# count only equals that when IDs are contiguous and zero-based. For this
# dataset both give the same value, so the model is unchanged.
item_vocab_size = int(df.movieId.max()) + 1
user_vocab_size = int(df.userId.max()) + 1
embedding_dim = 25  # length of each learned user / item vector

# item embedding path
item_input = Input(shape=[1], name="Item-Input")
item_embedding = Embedding(item_vocab_size, embedding_dim, name="Item-Embedding")(item_input)
item_vec = Flatten(name="Flatten-Items")(item_embedding)

# user embedding path
user_input = Input(shape=[1], name="User-Input")
user_embedding = Embedding(user_vocab_size, embedding_dim, name="User-Embedding")(user_input)
user_vec = Flatten(name="Flatten-Users")(user_embedding)

### Head of model 
#### Option 1: dot product

In [12]:
# # dot product user and item vectors
# prod = Dot(name="Dot-Product", axes=1)([item_vec, user_vec])

# # create model and compile it
# model = Model([user_input, item_input], prod)
# model.compile('adam', 'mean_squared_error')
# model.summary()

#### Option 2: neural network

In [13]:
# Observed rating bounds; `stretch` below uses them to rescale the network output.
min_rating, max_rating = df['rating'].min(), df['rating'].max()

In [15]:
# Display the rating bounds as a (min, max) tuple.
min_rating, max_rating

(0.5, 5.0)

In [16]:
from tensorflow.keras import backend as K

def stretch(x):
    """Squash `x` with a sigmoid, then rescale it into the rating range.

    The sigmoid output lies in (0, 1); multiplying by the rating span and
    adding the minimum maps it onto (min_rating, max_rating). Both bounds are
    notebook-level globals read at call time.
    """
    rating_span = max_rating - min_rating
    squashed = K.sigmoid(x)
    return squashed * rating_span + min_rating

In [17]:
# Join the item and user vectors into one feature vector per (user, item) pair.
conc = Concatenate()([item_vec, user_vec])

# Two ReLU hidden layers, each followed by dropout.
fc1 = Dense(units=64, activation='relu')(conc)
d1 = Dropout(rate=.5)(fc1)
fc2 = Dense(units=32, activation='relu')(d1)
d2 = Dropout(rate=.25)(fc2)

# Single output unit; `stretch` maps it into the rating range.
out = Dense(units=1, activation=stretch)(d2)

# Assemble the two-input model and train it with Adam on mean squared error.
model = Model(inputs=[user_input, item_input], outputs=out)
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item-Input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
User-Input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
Item-Embedding (Embedding)      (None, 1, 25)        266925      Item-Input[0][0]                 
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 25)        1746950     User-Input[0][0]                 
______________________________________________________________________________________________

### (Optional) load weights from checkpoint

In [18]:
# ignore this if running in colaboratory
checkpoint_path = "models/movielens10M.ckpt"

# Callback that writes only the model weights (not the full model) to
# checkpoint_path and logs a message each time it saves.
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

In [15]:
# model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x27a54ea5be0>

In [20]:
# Keras takes the validation rows from the *end* of the input arrays, before
# any shuffling, so they come from the tail of df, where the test rows sit.
# A split of .2 matches the recorded run in the output of this cell
# ("Train on 8000043 samples, validate on 2000011 samples" out of 10000054 rows).
history = model.fit([df.userId, df.movieId], df.rating,
                    batch_size=256,
                    epochs=10,
                    verbose=1,
                    validation_split=.2,
                    callbacks=[cp_callback, tensorboard_callback])

Train on 8000043 samples, validate on 2000011 samples
Epoch 1/10
Epoch 00001: saving model to models/movielens10M.ckpt
Epoch 2/10
Epoch 00002: saving model to models/movielens10M.ckpt
Epoch 3/10
Epoch 00004: saving model to models/movielens10M.ckpt
Epoch 5/10
Epoch 00005: saving model to models/movielens10M.ckpt
Epoch 6/10
Epoch 00006: saving model to models/movielens10M.ckpt
Epoch 7/10
Epoch 00007: saving model to models/movielens10M.ckpt
Epoch 8/10
Epoch 00009: saving model to models/movielens10M.ckpt
Epoch 10/10
Epoch 00010: saving model to models/movielens10M.ckpt


In [21]:
# Open the TensorBoard UI inline, pointed at every run stored under logs/.
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 1572), started 1 day, 19:51:29 ago. (Use '!kill 1572' to kill it.)

In [23]:
# Loss (mean squared error) over the whole ratings table.
# NOTE(review): df also contains the rows the model was fitted on, so this is
# not a purely held-out score — confirm that is the intended metric.
model.evaluate(x=[df.userId, df.movieId], y=df.rating, batch_size=1024)



0.6411908849552477

In [24]:
# RMSE: square root of the MSE printed by the evaluate cell (rounded to 4 d.p.).
np.sqrt(0.6412)

0.8007496487667041

In [26]:
# Predict the first ten (user, movie) pairs of the ratings table.
predictions = model.predict([df.userId.head(10), df.movieId.head(10)])

# Print each prediction next to the true rating. A plain loop is used because
# a list comprehension built only for print()'s side effect would make the
# cell echo a list of Nones as its result.
for predicted, actual in zip(predictions, df.rating.head(10)):
    print(predicted, actual)

[3.4416304] 3.0
[3.6104715] 4.0
[3.5779662] 4.0
[3.340524] 3.5
[2.1260123] 2.0
[2.9319258] 3.0
[3.3849223] 3.5
[2.7936099] 2.5
[2.7770824] 3.0
[4.0854893] 4.0


[None, None, None, None, None, None, None, None, None, None]

## Making recommendations

In [27]:
# Creating dataset for making recommendations for the first user
# Every distinct movie ID in ascending order. unique() is vectorised, and the
# explicit sort makes the order deterministic; iterating a Python set gives no
# ordering guarantee, yet later cells rely on a stable position-to-ID mapping.
item_data = np.sort(df.movieId.unique())
item_data[:5]

array([0, 1, 2, 3, 4])

In [28]:
# Repeat the target user's ID once per candidate movie so the two model inputs
# have the same length.
user_id = 1
user = np.full(len(item_data), user_id)
user[:5]

array([1, 1, 1, 1, 1])

In [30]:
# Score every candidate movie for the chosen user.
predictions = model.predict([user, item_data])

# predict() returns one single-element row per input pair; keep just that
# element to get a flat 1-D array of scores.
predictions = predictions[:, 0]

# get top 5 predictions
# argsort yields *positions* within `predictions`; translate them through
# item_data so the result holds real movie IDs even when item_data[i] != i.
top_positions = (-predictions).argsort()[:5]
recommended_movie_ids = item_data[top_positions]

recommended_movie_ids

array([1293,   60,  798,   64,  177], dtype=int64)

In [31]:
# print predicted scores
# The five highest predicted ratings, best first. They are read straight from
# the score array so this cell does not depend on movie IDs doubling as array
# positions.
np.sort(predictions)[::-1][:5]

array([4.3233833, 4.3116584, 4.3048005, 4.3000402, 4.2402205],
      dtype=float32)

In [33]:
# Movie metadata table (tab-separated), stored alongside the ratings files.
movies_file = path + 'movies.tsv'
movies = pd.read_csv(movies_file, sep='\t')

In [34]:
# Metadata rows of the recommended movies. Rows keep the table's own order,
# not the ranking order of the recommendations.
is_recommended = movies['movieId'].isin(recommended_movie_ids)
movies[is_recommended]

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
60,60,"Lord of the Rings: The Two Towers, The (2002)",1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
64,64,"Lord of the Rings: The Return of the King, The...",1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
177,177,Star Wars: Episode V - The Empire Strikes Back...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
798,798,"Lord of the Rings: The Fellowship of the Ring,...",1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1293,1293,"Shawshank Redemption, The (1994)",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
