[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/James-Leslie/deep-collaborative-filtering/blob/master/movielens_skorch.ipynb)

# To do:
  - grid search hyperparameters of rating model
  - grid search hyperparameters of genre model

In [1]:
# !pip install tensorflow-gpu==2.0.0-beta1

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, datetime
import warnings

from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Add, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.math import add

from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# path = 'https://raw.githubusercontent.com/James-Leslie/deep-collaborative-filtering/master/data/ml-100k/'  # access from anywhere
path = 'data/ml-100k/'  # if the files are local
df = pd.read_csv(path+'ratings.csv')

In [3]:
df.head()

Unnamed: 0,userId,movieId,rating
0,0,0,3
1,1,1,3
2,2,2,1
3,3,3,2
4,4,4,1


In [4]:
df.shape

(100000, 3)

In [5]:
n_users = df.userId.nunique()
n_items = df.movieId.nunique()
print('Number of users:', n_users)
print('Number of items:', n_items)

Number of users: 943
Number of items: 1682


## Add baseline predictors

In [6]:
train, test, _, _ = train_test_split(df, df.rating, test_size=.1, random_state=42)

In [7]:
min_rating = df.rating.min()
max_rating = df.rating.max()
mean_rating = train.rating.mean()
print("Min item rating:", min_rating)
print("Max item rating:", max_rating)
print("Mean item rating:", mean_rating)

Min item rating: 1
Max item rating: 5
Mean item rating: 3.5308444444444445


In [8]:
user_averages = train.groupby('userId') \
    .agg({'rating':'mean'}) \
    .rename({'rating': 'user_avg'}, axis=1) \
    .reset_index()

user_averages.head()

Unnamed: 0,userId,user_avg
0,0,3.542857
1,1,3.433735
2,2,3.285714
3,3,3.671233
4,4,3.388889


In [9]:
item_averages = train.groupby('movieId') \
    .agg({'rating':'mean'}) \
    .rename({'rating': 'item_avg'}, axis=1) \
    .reset_index()

item_averages.head()

Unnamed: 0,movieId,item_avg
0,0,3.980952
1,1,4.164122
2,2,2.111111
3,3,3.449275
4,4,3.652542


In [10]:
train = pd.merge(train, user_averages, how='left', on='userId')
train = pd.merge(train, item_averages, how='left', on='movieId')
test = pd.merge(test, user_averages, how='left', on='userId')
test = pd.merge(test, item_averages, how='left', on='movieId')

In [11]:
# users and items in test that aren't in train will get the average rating of all users
test.user_avg = test.user_avg.fillna(mean_rating)
test.item_avg = test.item_avg.fillna(mean_rating)

In [12]:
train['bias'] = (train['user_avg'] + train['item_avg'])/2 - mean_rating
test['bias'] = (test['user_avg'] + test['item_avg'])/2 - mean_rating

In [13]:
test.head()

Unnamed: 0,userId,movieId,rating,user_avg,item_avg,bias
0,873,377,4,3.777778,3.588235,0.152162
1,808,601,3,3.738372,3.697674,0.187179
2,90,354,4,3.637119,3.562963,0.069197
3,409,570,2,3.844639,2.755102,-0.230974
4,496,356,2,3.351485,3.920398,0.105097


## Predict ratings

In [21]:
from tensorflow.keras import backend as K

# use this function to force all outputs of model to be between min and max rating
def stretch(x):
    return K.sigmoid(x) * (max_rating - min_rating) + min_rating

# adds the global mean rating to the unscaled logit
def add_mean(x):
#     upper = max_rating - mean_rating
#     lower = min_rating - mean_rating
#     stretch = K.sigmoid(x) * (upper - lower) + lower
    
    return add(x, mean_rating)

In [22]:
# hyper parameters
HP_LATENT_SIZE = [5, 10, 25, 50]
HP_NUM_HIDDEN_1 = [8, 16, 32, 64]
HP_NUM_HIDDEN_2 = [False, 16, 32]
HP_DROPOUT = [.1, .15, .2, .25]
HP_OPTIMISER = ['adam', 'sgd']

In [23]:
# grid search loop

for latent_size in HP_LATENT_SIZE:
    for num_hidden_1 in HP_NUM_HIDDEN_1:
        for num_hidden_2 in HP_NUM_HIDDEN_2:
            
            # item embedding path
            item_input = Input(shape=[1], name="Item-Input")
            item_embedding = Embedding(n_items, latent_size, name="Item-Embedding")(item_input)
            item_vec = Flatten(name="Flatten-Items")(item_embedding)

            # user embedding path
            user_input = Input(shape=[1], name="User-Input")
            user_embedding = Embedding(n_users, latent_size, name="User-Embedding")(user_input)
            user_vec = Flatten(name="Flatten-Users")(user_embedding)
            
            # concatenate features
            conc = Concatenate()([item_vec, user_vec])
            
            # first hidden layer
            fc1 = Dense(num_hidden_1, activation='relu')(conc)
            d1 = Dropout(.15)(fc1)
            
            if num_hidden_2:
                # second hidden layer
                fc2 = Dense(num_hidden_2, activation='relu')(d1)
                d2 = Dropout(.25)(fc2)
                baseline = Dense(1, add_mean)(d2)
            else:
                baseline = Dense(1, add_mean)(d1)

            # add in user x item bias
            bias = Input(shape=[1])
            rating = Add()([baseline, bias])

            # create model and compile it
            model = Model([user_input, item_input, bias], rating)
            model.compile('adam', 'mean_squared_error')
            
            model.fit([train.userId, train.movieId, train.bias], train.rating, 
                      batch_size=256,
                      epochs=5,
                      verbose=2,
                      validation_split=.1)
            break
        break
    break

Train on 81000 samples, validate on 9000 samples
Epoch 1/5
81000/81000 - 1s - loss: 0.9144 - val_loss: 0.9094
Epoch 2/5
81000/81000 - 0s - loss: 0.8646 - val_loss: 0.8915
Epoch 3/5
81000/81000 - 0s - loss: 0.8438 - val_loss: 0.8831
Epoch 4/5
81000/81000 - 0s - loss: 0.8327 - val_loss: 0.8796
Epoch 5/5
81000/81000 - 0s - loss: 0.8244 - val_loss: 0.8765


In [23]:
history = model.fit([train.userId, train.movieId, train.bias], train.rating, 
                    batch_size=256,
                    epochs=10,
                    verbose=2,
                    validation_split=.1)

Train on 81000 samples, validate on 9000 samples
Epoch 1/10
81000/81000 - 5s - loss: 0.9131 - val_loss: 0.9081
Epoch 2/10
81000/81000 - 2s - loss: 0.8726 - val_loss: 0.8906
Epoch 3/10
81000/81000 - 2s - loss: 0.8485 - val_loss: 0.8819
Epoch 4/10
81000/81000 - 2s - loss: 0.8321 - val_loss: 0.8710
Epoch 5/10
81000/81000 - 2s - loss: 0.8181 - val_loss: 0.8677
Epoch 6/10
81000/81000 - 2s - loss: 0.8096 - val_loss: 0.8614
Epoch 7/10
81000/81000 - 2s - loss: 0.7952 - val_loss: 0.8565
Epoch 8/10
81000/81000 - 2s - loss: 0.7860 - val_loss: 0.8550
Epoch 9/10
81000/81000 - 2s - loss: 0.7746 - val_loss: 0.8532
Epoch 10/10
81000/81000 - 2s - loss: 0.7681 - val_loss: 0.8500


In [24]:
print('RMSE:', np.sqrt(model.evaluate([test.userId, test.movieId, test.bias], test.rating, batch_size=1024)))

RMSE: 0.9263132393441963


In [25]:
predictions = model.predict([test.userId.head(10), test.movieId.head(10), test.bias.head(10)])

[print(predictions[i], df.rating.iloc[i]) for i in range(0,10)]

[3.5937083] 3
[4.073673] 3
[3.502314] 1
[3.3557773] 2
[3.8541598] 1
[3.5785315] 4
[3.907664] 2
[4.06156] 5
[3.7167099] 3
[3.1557455] 3


[None, None, None, None, None, None, None, None, None, None]

In [26]:
movies = pd.read_csv(path+'movies.csv')

In [27]:
movies.head()

Unnamed: 0,movieId,title,releaseDate,link,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,24,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,147,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,233,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,47,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,75,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [28]:
# make new genre model
g_hidden = Dense(16, activation='relu')(item_vec)
g_drop = Dropout(.25)(g_hidden)
g_out = Dense(18, activation='sigmoid')(g_drop)  # there are 18 genres

# Create model and compile it
model2 = Model(item_input, g_out)
# freeze the embedding layer
model2.layers[1].trainable = False
model2.compile(optimizer='adam', loss='binary_crossentropy' , metrics=['accuracy'])
model2.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Item-Input (InputLayer)      [(None, 1)]               0         
_________________________________________________________________
Item-Embedding (Embedding)   (None, 1, 25)             42050     
_________________________________________________________________
Flatten-Items (Flatten)      (None, 25)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 16)                416       
_________________________________________________________________
dropout_6 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 18)                306       
Total params: 42,772
Trainable params: 722
Non-trainable params: 42,050
_____________________________________________________

In [30]:
history = model2.fit(movies.movieId, movies.iloc[:,-18:].values, 
                     batch_size=64, 
                     epochs=10,
                     validation_split=.2)

Train on 1345 samples, validate on 337 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [32]:
y_true = movies.iloc[:,-18:]
y_score = pd.DataFrame(model2.predict(movies.movieId), columns=y_true.columns)
y_pred = y_score.round().astype('int')

In [33]:
drama_true = y_true.Drama
drama_pred = y_pred.Drama

In [34]:
print(classification_report(drama_true, drama_pred))

              precision    recall  f1-score   support

           0       0.58      0.90      0.71       957
           1       0.55      0.15      0.24       725

   micro avg       0.58      0.58      0.58      1682
   macro avg       0.57      0.53      0.47      1682
weighted avg       0.57      0.58      0.51      1682



In [35]:
pd.DataFrame(confusion_matrix(drama_true, drama_pred))

Unnamed: 0,0,1
0,866,91
1,615,110
