In [2]:
import csv

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import Activation, Lambda
from tensorflow.keras.layers import Concatenate, Dense, Dropout
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD


Data loading and imports similar to other workbooks.

In [14]:
with open ("../data/ml-100k/u.data", "r") as f:
    data = list(csv.reader(f, delimiter="\t"))
data = np.array(data)
film_dim = np.amax(np.array(data[:,1]).astype(np.float))
user_dim = np.amax(np.array(data[:,0]).astype(np.float))
print("film size", film_dim)
print("user size", user_dim)
data = data.astype(np.int)
data = pd.DataFrame(data)
data.sort_values([0,3],inplace=True) ## Sort data
data = data.rename(columns= {0: 'user_id',
                      1: 'movie_id',
                      2: 'rating',
                      3: 'time'},
            inplace=True)

film size 1682.0
user size 943.0


We use the LabelEncoder to transform IDs using a relation (x,y) -> (0,n) where n = # of IDs, so that we do not have IDs just as a random number, but going from 0 to n. 

In [13]:
user_encoder = LabelEncoder()
data['user_id_encoded'] = user_encoder.fit_transform(data['user_id'].values)
user_count = data['user_id_encoded'].nunique()

item_encoder = LabelEncoder()
data['movie_id_encoded'] = item_encoder.fit_transform(data['movie_id'].values)
movie_count = data['movie_id_encoded'].nunique()

TypeError: 'NoneType' object is not subscriptable

We load the data into test and train sets, but for better performance, we will rescale the ratings to a more Gaussian distribution as regression models tend to work better with normally distributed data.

In [231]:
X_data = data[['user', 'movie']].values
Y_data = data['rating'].values

y_scaler = StandardScaler().fit(Y_data.reshape(len(Y_data), 1))
y_scaled = y_scaler.transform(Y_data.reshape(len(Y_data), 1))[:, 0]

train_x, test_x, train_y, test_y = train_test_split(X_data, y_scaled, test_size=0.1, random_state=315)


((90000, 2), (10000, 2), (90000,), (10000,))

We select a number of factors for user / movie which we input into the Embedding layer. We also store the minimum and maximum ratings we need to use in the final Lambda layer and we prepare train and test arrays (each containing two arrays for users and for movies IDs).

We define a Recommender model starting with an input layer followed by Embedding layers, one for both movies and users.  We then Concatenate the two Embeddings and follow them by a Dense layer with relu activation function. The final layers contains a Dense layer with a single numeric output, which we need to multiply by a formula to get the desired rating. For this we use the Lambda layer which in fact is not a layer just wraps up a lambda expression to make the model look pretty and consistent - layerlike. 

In [361]:
def ModelDenseEmbed(x, y, f):
    user = Input(shape=(1,))
    movie = Input(shape=(1,))
    u = Embedding(x, f)(user)
    m = Embedding(y, f)(movie)
    x = Concatenate()([u, m])
    x = Dropout(0.1)(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    model = Model(inputs=[user, movie], outputs=x)
    return model


We load the model and fit it to the training data. 

In [359]:
learning_rate = 0.001
decay_rate = 0.001 / 25
momentum = 0.5

sgd = SGD(lr=learning_rate, momentum=momentum, decay=decay_rate)

model = ModelDenseEmbed(user_count, movie_count, 64)
model.compile(loss='MSE', optimizer=sgd)
model.summary()

Model: "model_93"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_195 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_196 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_283 (Embedding)       (None, 1, 100)       94300       input_195[0][0]                  
__________________________________________________________________________________________________
embedding_284 (Embedding)       (None, 1, 100)       168200      input_196[0][0]                  
___________________________________________________________________________________________

In [360]:
history = model.fit(x=[train_x[:, 0], train_x[:, 1]], y=train_y, batch_size=64, epochs=25, verbose=1, validation_data=([test_x[:, 0], test_x[:, 1]], test_y))

ValueError: A target array with shape (90000, 1) was passed for an output of shape (None, 100, 100) while using as loss `mean_squared_error`. This loss expects targets to have the same shape as the output.

The desired models converges to a loss and val_loss value of abt. 0.66. The loss function used is mean squared error. Since the difference between the max and min of the y_scaled is 3.5, 0.66 can be considered a bit underwhelming result. We can print out some sample data to see the actual numbers (we need to inversely transform the data using the y_scaler instantiated sooner in the code).

In [331]:
predictions = model.predict([test_x[:, 0], test_x[:, 1]])
predictions_values = predictions * (max(y_scaled) - min(y_scaled)) + min(y_scaled)

In [332]:
max(data['rating'])-min(data['rating'])

4.0

In [333]:
max(y_scaled)-min(y_scaled)

3.5534458

In [337]:
y_scaler.inverse_transform(predictions[10:20])

array([[[2.7255344]],

       [[3.5234368]],

       [[1.8686908]],

       [[3.32947  ]],

       [[3.5301437]],

       [[2.9680274]],

       [[4.244357 ]],

       [[1.6430584]],

       [[3.483638 ]],

       [[3.753055 ]]], dtype=float32)

In [336]:
y_scaler.inverse_transform(test_y[10:20])

array([3., 3., 1., 1., 3., 4., 5., 5., 3., 4.], dtype=float32)