[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/James-Leslie/deep-collaborative-filtering/blob/master/movielens_skorch.ipynb)

In [1]:
# !pip install tensorflow-gpu==2.0.0-beta1

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, datetime
import warnings

from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Add, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.math import add

from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# path = 'https://raw.githubusercontent.com/James-Leslie/deep-collaborative-filtering/master/data/ml-100k/'  # access from anywhere
path = 'data/ml-100k/'  # if the files are local
train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')
df = pd.concat((train, test), ignore_index=True)

In [3]:
df.head()

Unnamed: 0,userId,movieId,rating
0,435,647,5
1,377,91,3
2,419,216,5
3,264,768,1
4,632,294,1


In [4]:
df.shape

(100000, 3)

In [5]:
n_users = df.userId.nunique()
n_items = df.movieId.nunique()
print('Number of users:', n_users)
print('Number of items:', n_items)

Number of users: 943
Number of items: 1682


In [6]:
min_rating = df.rating.min()
max_rating = df.rating.max()
mean_rating = train.rating.mean()
print("Min item rating:", min_rating)
print("Max item rating:", max_rating)
print("Mean item rating:", mean_rating)

Min item rating: 1
Max item rating: 5
Mean item rating: 3.5278125


In [7]:
user_averages = train.groupby('userId') \
    .agg({'rating':'mean'}) \
    .rename({'rating': 'user_avg'}, axis=1) \
    .reset_index()

user_averages.head()

Unnamed: 0,userId,user_avg
0,0,3.612903
1,1,3.513514
2,2,3.45098
3,3,3.647368
4,4,3.625


In [8]:
item_averages = train.groupby('movieId') \
    .agg({'rating':'mean'}) \
    .rename({'rating': 'item_avg'}, axis=1) \
    .reset_index()

item_averages.head()

Unnamed: 0,movieId,item_avg
0,0,4.068182
1,1,4.182203
2,2,2.181818
3,3,3.477612
4,4,3.645161


In [9]:
train = pd.merge(train, user_averages, how='left', on='userId')
train = pd.merge(train, item_averages, how='left', on='movieId')
test = pd.merge(test, user_averages, how='left', on='userId')
test = pd.merge(test, item_averages, how='left', on='movieId')

In [10]:
test.user_avg = test.user_avg.fillna(mean_rating)
test.item_avg = test.item_avg.fillna(mean_rating)

In [11]:
train['bias'] = (train['user_avg'] + train['item_avg'])/2 - mean_rating
test['bias'] = (test['user_avg'] + test['item_avg'])/2 - mean_rating

In [12]:
test.head()

Unnamed: 0,userId,movieId,rating,user_avg,item_avg,bias
0,692,103,3,3.791209,3.653386,0.194485
1,121,364,4,3.195652,3.656109,-0.101932
2,318,769,4,4.10274,2.631579,-0.160653
3,254,750,5,4.353933,4.072727,0.685517
4,312,166,4,3.071429,3.909871,-0.037163


## Predict ratings

### Trunk of model
We will re-use the item embedding layer later on

In [13]:
# item embedding path
item_input = Input(shape=[1], name="Item-Input")
item_embedding = Embedding(n_items, 25, name="Item-Embedding")(item_input)
item_vec = Flatten(name="Flatten-Items")(item_embedding)

# user embedding path
user_input = Input(shape=[1], name="User-Input")
user_embedding = Embedding(n_users, 25, name="User-Embedding")(user_input)
user_vec = Flatten(name="Flatten-Users")(user_embedding)

### Head of model 
#### Option 1: dot product

In [14]:
# # dot product user and item vectors
# prod = Dot(name="Dot-Product", axes=1)([item_vec, user_vec])

# # create model and compile it
# model = Model([user_input, item_input], prod)
# model.compile('adam', 'mean_squared_error')
# model.summary()

#### Option 2: neural network

In [15]:
from tensorflow.keras import backend as K

# use this function to force all outputs of model to be between min and max rating
def stretch(x):
    return K.sigmoid(x) * (max_rating - min_rating) + min_rating

def add_mean(x):
#     upper = max_rating - mean_rating
#     lower = min_rating - mean_rating
#     stretch = K.sigmoid(x) * (upper - lower) + lower
    
    return add(x, mean_rating)

In [16]:
# concatenate features
conc = Concatenate()([item_vec, user_vec])

# add fully-connected-layers
fc1 = Dense(32, activation='relu')(conc)
d1 = Dropout(.5)(fc1)
fc2 = Dense(16, activation='relu')(d1)
d2 = Dropout(.25)(fc2)
rating = Dense(1, add_mean)(d2)

# add in baseline prediction
base = Input(shape=[1])
out = Add()([rating, base])

# create model and compile it
model = Model([user_input, item_input, base], out)
model.compile('adam', 'mean_squared_error')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item-Input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
User-Input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
Item-Embedding (Embedding)      (None, 1, 25)        42050       Item-Input[0][0]                 
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 25)        23575       User-Input[0][0]                 
______________________________________________________________________________________________

In [17]:
history = model.fit([train.userId, train.movieId, train.bias], train.rating, 
                    batch_size=256,
                    epochs=10,
                    verbose=2,
                    validation_split=.1)

Train on 72000 samples, validate on 8000 samples
Epoch 1/10
72000/72000 - 5s - loss: 0.9151 - val_loss: 0.9120
Epoch 2/10
72000/72000 - 2s - loss: 0.8717 - val_loss: 0.9002
Epoch 3/10
72000/72000 - 2s - loss: 0.8461 - val_loss: 0.8987
Epoch 4/10
72000/72000 - 2s - loss: 0.8300 - val_loss: 0.8922
Epoch 5/10
72000/72000 - 2s - loss: 0.8178 - val_loss: 0.8914
Epoch 6/10
72000/72000 - 2s - loss: 0.8055 - val_loss: 0.8887
Epoch 7/10
72000/72000 - 2s - loss: 0.7949 - val_loss: 0.8886
Epoch 8/10
72000/72000 - 2s - loss: 0.7854 - val_loss: 0.8847
Epoch 9/10
72000/72000 - 2s - loss: 0.7792 - val_loss: 0.8838
Epoch 10/10
72000/72000 - 2s - loss: 0.7666 - val_loss: 0.8844


In [18]:
print('RMSE:', np.sqrt(model.evaluate([test.userId, test.movieId, test.bias], test.rating, batch_size=1024)))

RMSE: 0.923233695065196


In [19]:
predictions = model.predict([test.userId.head(10), test.movieId.head(10), test.bias.head(10)])

[print(predictions[i], df.rating.iloc[i]) for i in range(0,10)]

[3.37188] 5
[3.3151326] 3
[3.8582907] 5
[4.648497] 1
[3.5095294] 1
[4.3504896] 3
[2.4771874] 3
[2.3774352] 5
[2.793356] 2
[2.961024] 3


[None, None, None, None, None, None, None, None, None, None]

In [20]:
movies = pd.read_csv(path+'movies.csv')

In [21]:
movies.head()

Unnamed: 0,movieId,title,releaseDate,link,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,24,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,147,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,233,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,47,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,75,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [22]:
# make new genre model
g_hidden = Dense(16, activation='relu')(item_vec)
g_drop = Dropout(.25)(g_hidden)
g_out = Dense(18, activation='sigmoid')(g_drop)  # there are 18 genres

# Create model and compile it
model2 = Model(item_input, g_out)
# freeze the embedding layer
model2.layers[1].trainable = False
model2.compile(optimizer='adam', loss='binary_crossentropy' , metrics=['accuracy'])
model2.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Item-Input (InputLayer)      [(None, 1)]               0         
_________________________________________________________________
Item-Embedding (Embedding)   (None, 1, 25)             42050     
_________________________________________________________________
Flatten-Items (Flatten)      (None, 25)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                416       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 18)                306       
Total params: 42,772
Trainable params: 722
Non-trainable params: 42,050
_____________________________________________________

In [23]:
history = model2.fit(movies.movieId, movies.iloc[:,-18:], 
                     batch_size=64, 
                     epochs=10,
                     validation_split=.2)

W0819 21:35:47.475854 13788 deprecation.py:323] From C:\Users\jlesl\Anaconda3\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 1345 samples, validate on 337 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [25]:
y_true = movies.iloc[:,-18:]
y_score = pd.DataFrame(model2.predict(movies.movieId), columns=y_true.columns)
y_pred = y_score.round().astype('int')

In [26]:
drama_true = y_true.Drama
drama_pred = y_pred.Drama

In [27]:
print(classification_report(drama_true, drama_pred))

              precision    recall  f1-score   support

           0       0.59      0.95      0.73       957
           1       0.65      0.13      0.21       725

   micro avg       0.59      0.59      0.59      1682
   macro avg       0.62      0.54      0.47      1682
weighted avg       0.61      0.59      0.50      1682



In [28]:
pd.DataFrame(confusion_matrix(drama_true, drama_pred))

Unnamed: 0,0,1
0,907,50
1,634,91
