[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/James-Leslie/deep-collaborative-filtering/blob/master/movielens_skorch.ipynb)

# To do:
  - use cross validation in grid search (ML100K README explains how to use 5-fold CV for evaluation)
  - regularize embedding layer
  - try [this link](https://github.com/keras-team/keras/issues/9001) for making an SkLearn base estimator (look for comment by hughfdjackson)
  - create object classes for models
  - implement TF 2.0 data classes
  - [paperswithcode link](https://paperswithcode.com/sota/collaborative-filtering-on-movielens-100k)
  - [ML 100k state of the art paper](https://arxiv.org/pdf/1706.02263v2.pdf) (RMSE=0.905): details their evaluation method
  - include genre model in grid search

In [1]:
# !pip install tensorflow-gpu==2.0.0-beta1

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, datetime
import warnings

from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Add, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.math import add

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [3]:
# path = 'https://raw.githubusercontent.com/James-Leslie/deep-collaborative-filtering/master/data/ml-100k/'  # access from anywhere
path = 'data/ml-100k/'  # if the files are local
df = pd.read_csv(path+'ratings.csv')

In [4]:
df.head()

Unnamed: 0,userId,movieId,rating
0,0,0,3
1,1,1,3
2,2,2,1
3,3,3,2
4,4,4,1


In [5]:
df.shape

(100000, 3)

In [6]:
n_users = df.userId.nunique()
n_items = df.movieId.nunique()
print('Number of users:', n_users)
print('Number of items:', n_items)

Number of users: 943
Number of items: 1682


## Add baseline predictors

In [7]:
train, test, _, _ = train_test_split(df, df.rating, test_size=.1, random_state=42)

In [8]:
min_rating = df.rating.min()
max_rating = df.rating.max()
mean_rating = train.rating.mean()
print("Min item rating:", min_rating)
print("Max item rating:", max_rating)
print("Mean item rating:", mean_rating)

Min item rating: 1
Max item rating: 5
Mean item rating: 3.5308444444444445


In [9]:
user_averages = train.groupby('userId') \
    .agg({'rating':'mean'}) \
    .rename({'rating': 'user_avg'}, axis=1) \
    .reset_index()

user_averages.head()

Unnamed: 0,userId,user_avg
0,0,3.542857
1,1,3.433735
2,2,3.285714
3,3,3.671233
4,4,3.388889


In [10]:
item_averages = train.groupby('movieId') \
    .agg({'rating':'mean'}) \
    .rename({'rating': 'item_avg'}, axis=1) \
    .reset_index()

item_averages.head()

Unnamed: 0,movieId,item_avg
0,0,3.980952
1,1,4.164122
2,2,2.111111
3,3,3.449275
4,4,3.652542


In [11]:
train = pd.merge(train, user_averages, how='left', on='userId')
train = pd.merge(train, item_averages, how='left', on='movieId')
test = pd.merge(test, user_averages, how='left', on='userId')
test = pd.merge(test, item_averages, how='left', on='movieId')

In [12]:
# users and items in test that aren't in train will get the average rating of all users
test.user_avg = test.user_avg.fillna(mean_rating)
test.item_avg = test.item_avg.fillna(mean_rating)

In [13]:
train['bias'] = (train['user_avg'] + train['item_avg'])/2 - mean_rating
test['bias'] = (test['user_avg'] + test['item_avg'])/2 - mean_rating

In [14]:
test.head()

Unnamed: 0,userId,movieId,rating,user_avg,item_avg,bias
0,873,377,4,3.777778,3.588235,0.152162
1,808,601,3,3.738372,3.697674,0.187179
2,90,354,4,3.637119,3.562963,0.069197
3,409,570,2,3.844639,2.755102,-0.230974
4,496,356,2,3.351485,3.920398,0.105097


## Ratings model

### Grid search
**To do**:
  - tune dropout rates and optimiser
  - measure impact on accuracy of genre model

In [15]:
from tensorflow.keras.regularizers import l2

In [16]:
def compile_model(n_factors=25, n_hidden_1=0, n_hidden_2=0, dropout_1=.1, dropout_2=.1):
    
    # TODO: n_items, n_users and mean_rating are all hard coded at the moment
    
    # item latent factors
    item_in = Input(shape=[1])  # name='item'
    item_em = Embedding(n_items, n_factors)(item_in)
    item_vec = Flatten()(item_em)
    
    # user latent factors
    user_in = Input(shape=[1])
    user_em = Embedding(n_users, n_factors)(user_in)
    user_vec = Flatten()(user_em)
    
    # user x item bias
    bias = Input(shape=[1])
    
    # if there is a hidden layer
    if n_hidden_1:
        # concatenate user and item vectors
        conc = Concatenate()([item_vec, user_vec])
        # hidden layer 1
        hidden_1 = Dense(n_hidden_1, activation='relu')(conc)
        drop_1 = Dropout(dropout_1)(hidden_1)
        
        # if there is a second hidden layer
        if n_hidden_2:
            # hidden layer 2
            hidden_2 = Dense(n_hidden_2, activation='relu')(drop_1)
            drop_2 = Dropout(dropout_2)(hidden_2)
            
            # unscaled output
            out = Dense(1)(drop_2)
        else:
            out = Dense(1)(drop_1)
        
    # if there are no hidden layers
    else:
        out = Dot(name="Dot-Product", axes=1)([item_vec, user_vec])
    
    rating = add(Add()([out, bias]), mean_rating)
    
    # create model and compile it
    model = Model([user_in, item_in, bias], rating)
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    return model

In [17]:
# hyper parameters
HP_N_FACTORS = [5, 10, 25, 50]
HP_N_HIDDEN_1 = [0, 8, 16, 32, 64]
HP_N_HIDDEN_2 = [0, 8, 16, 32]
HP_DROPOUT_1 = [.1, .15, .2, .25]
HP_DROPOUT_2 = [.1, .15, .2, .25]

In [18]:
# dataframe to store results of grid search
grid_results = []
best_loss = 1
searches = 1

n_models = len(HP_N_FACTORS) * len(HP_N_HIDDEN_1) * len(HP_N_HIDDEN_2)

print(f'Fitting total of {n_models} models')

for N_FACTORS in HP_N_FACTORS:
    for N_HIDDEN_1 in HP_N_HIDDEN_1:
        for N_HIDDEN_2 in HP_N_HIDDEN_2:
            
            # avoid duplicate models
            if N_HIDDEN_1 < N_HIDDEN_2:
                continue
            
            print(f'Fitting model #{searches} with {N_FACTORS}: {N_HIDDEN_1}: {N_HIDDEN_2} architecture')
            searches += 1
            
            model = compile_model(
                n_factors = N_FACTORS,
                n_hidden_1 = N_HIDDEN_1,
                n_hidden_2 = N_HIDDEN_2,
                dropout_1 = .2,
                dropout_2 = .25
            )
            
            result = model.fit(x=[train.userId.values, train.movieId.values, train.bias.values],
                               y=train.rating.values, 
                               batch_size=256,
                               epochs=10,
                               verbose=0,
                               validation_data=([test.userId.values, test.movieId.values, test.bias.values], test.rating.values))
            
            loss = np.sqrt(np.min(result.history['loss']))
            
            val_loss = np.sqrt(np.min(result.history['val_loss']))
            val_epochs = np.argmin(result.history['val_loss']) + 1
            
            if val_loss < best_loss:
                best_loss = val_loss
            
#             plt.plot(result.history['loss'], label='train')
#             plt.plot(result.history['val_loss'], label='val')
#             plt.axhline(y=best_loss, color='r', lw=1, ls='-')
#             plt.legend()
#             plt.show()
            print(f'Validation loss: {val_loss:.4f} after {val_epochs} epochs')
            print(f'Best loss: {best_loss:.4f}\n')
            
            grid_results.append({'n_factors':N_FACTORS,
                                 'hidden_1':N_HIDDEN_1,
                                 'hidden_2':N_HIDDEN_2,
                                 'train_rmse':loss,
                                 'val_rmse':val_loss,
                                 'val_epochs':val_epochs,
                                 'train_hist':result.history['loss'],
                                 'val_hist':result.history['val_loss']})
#             break
#         break
#     break
grid_results = pd.DataFrame(data=grid_results, columns=grid_results[0].keys())

Fitting total of 80 models
Fitting model #1 with 5: 0: 0 architecture
Validation loss: 0.9368 after 9 epochs
Best loss: 0.9368

Fitting model #2 with 5: 8: 0 architecture
Validation loss: 0.9301 after 9 epochs
Best loss: 0.9301

Fitting model #3 with 5: 8: 8 architecture
Validation loss: 0.9344 after 6 epochs
Best loss: 0.9301

Fitting model #4 with 5: 16: 0 architecture
Validation loss: 0.9214 after 9 epochs
Best loss: 0.9214

Fitting model #5 with 5: 16: 8 architecture
Validation loss: 0.9270 after 9 epochs
Best loss: 0.9214

Fitting model #6 with 5: 16: 16 architecture
Validation loss: 0.9244 after 9 epochs
Best loss: 0.9214

Fitting model #7 with 5: 32: 0 architecture
Validation loss: 0.9211 after 9 epochs
Best loss: 0.9211

Fitting model #8 with 5: 32: 8 architecture
Validation loss: 0.9239 after 8 epochs
Best loss: 0.9211

Fitting model #9 with 5: 32: 16 architecture
Validation loss: 0.9245 after 8 epochs
Best loss: 0.9211

Fitting model #10 with 5: 32: 32 architecture
Validation

In [20]:
grid_results.sort_values('val_rmse').head(20)

Unnamed: 0,n_factors,hidden_1,hidden_2,train_rmse,val_rmse,val_epochs,train_hist,val_hist
39,25,64,8,0.835775,0.915291,5,"[0.9053687337133619, 0.8581402198261685, 0.831...","[0.899822188949585, 0.8722046106338501, 0.8610..."
9,5,32,32,0.868045,0.915903,8,"[0.9064022123124864, 0.859295240773095, 0.8359...","[0.9023335346221923, 0.8781603580474854, 0.864..."
48,50,32,0,0.830764,0.916575,4,"[0.8949839315096537, 0.8436127673890855, 0.814...","[0.8874345230102539, 0.8650611972808838, 0.855..."
40,25,64,16,0.819343,0.917029,5,"[0.9041556889533997, 0.8532259832170275, 0.826...","[0.8935522471427918, 0.8680219422340393, 0.860..."
24,10,64,0,0.85902,0.917685,8,"[0.900384271759457, 0.8509309415711297, 0.8237...","[0.8896165184020997, 0.8705772150993347, 0.858..."
26,10,64,16,0.857411,0.917707,8,"[0.9101618134816488, 0.8633727712419298, 0.837...","[0.8998611957550049, 0.8763123644828796, 0.864..."
21,10,32,8,0.8686,0.917954,7,"[0.9143970405048795, 0.8691738733397589, 0.847...","[0.9007119054794311, 0.8852790510177613, 0.870..."
25,10,64,8,0.860939,0.917964,7,"[0.910198387061225, 0.8635449924151103, 0.8380...","[0.9025518995285035, 0.8766087692260742, 0.865..."
45,50,16,0,0.860678,0.918108,9,"[0.8964159065034655, 0.846752588388655, 0.8231...","[0.8889385526657104, 0.8678888578414917, 0.861..."
35,25,32,8,0.847964,0.918221,6,"[0.9085188418918185, 0.8612834253523085, 0.838...","[0.8941777168273926, 0.8745749899864197, 0.866..."
