Imports

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Embedding, Flatten, Dot, Dense
from keras.optimizers import Adam, RMSprop, SGD
from keras.callbacks import ModelCheckpoint
from keras.losses import MeanAbsoluteError

# Load data
We read the ratings CSV file into a pandas DataFrame, and use the books CSV file to build a lookup from book id to book title.  

In [22]:
# Ratings table (columns: book_id, user_id, rating).
data = pd.read_csv('../Data/book/ratings.csv')

# Book metadata; only used here to build a book_id -> title lookup.
books_df = pd.read_csv('../Data/book/books.csv')
book_id_to_name = books_df.set_index('book_id')['title'].to_dict()

AttributeError: 'dict' object has no attribute 'to_csv'

We print the first few records and a summary of the data for a quick examination.

In [3]:
# Show the first rows followed by the per-column summary statistics.
print(data.head(), data.describe(), sep='\n')

   book_id  user_id  rating
0        1      314       5
1        1      439       3
2        1      588       5
3        1     1169       4
4        1     1185       4
             book_id        user_id         rating
count  981756.000000  981756.000000  981756.000000
mean     4943.275636   25616.759933       3.856534
std      2873.207415   15228.338826       0.983941
min         1.000000       1.000000       1.000000
25%      2457.000000   12372.000000       3.000000
50%      4921.000000   25077.000000       4.000000
75%      7414.000000   38572.000000       5.000000
max     10000.000000   53424.000000       5.000000


Check for missing values

In [4]:
# Count of missing entries in each column.
print(data.isna().sum())

book_id    0
user_id    0
rating     0
dtype: int64


# Create user-id and book-id mapping
We're creating two mapping dictionaries for users and books - from id to index and from index to id.  
This helps with the embedding layers, which work with integer indices rather than raw ids.

In [5]:
# Distinct raw ids found in the ratings table.
user_ids = data['user_id'].unique().tolist()
book_ids = data['book_id'].unique().tolist()

# Users: raw id -> index, and index -> raw id.
user2user_encoded = {raw_id: idx for idx, raw_id in enumerate(user_ids)}
userencoded2user = dict(enumerate(user_ids))

# Books: raw id -> index, and index -> raw id.
book2book_encoded = {raw_id: idx for idx, raw_id in enumerate(book_ids)}
book_encoded2book = dict(enumerate(book_ids))


# Map user-ids and book-ids to user and book indices
We're creating two new columns in our DataFrame to hold the indices of users and books.

In [6]:
# Translate each raw id into its index and store it in a new column.
data['user'] = data.user_id.map(user2user_encoded)
data['book'] = data.book_id.map(book2book_encoded)

# Split data into training and testing set
We split our data into a training set (80%) and a test set (20%).

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Get the number of users and books
We calculate the total number of unique users and books in our data.

In [8]:
# One entry per distinct id, so these are the vocabulary sizes that the
# embedding layers are built with.
num_users = len(user_ids)
num_books = len(book_ids)

# Set embedding dimension
This is a hyperparameter for our model representing the size of the embedding vectors.

In [9]:
# Length of each user / book embedding vector.
embedding_size = 10

# Build model
We're using Keras Functional API to build a model with Embedding layers for users and books.  
These embeddings will learn to represent user preferences and book properties during training.

In [10]:
# Each input carries a single integer index per sample.
user_input = Input(shape=(1,))
book_input = Input(shape=(1,))

# Look up one embedding_size-long vector per user index and per book index.
user_embedding = Embedding(num_users, embedding_size)(user_input)
book_embedding = Embedding(num_books, embedding_size)(book_input)

# Flatten each embedding output so the two vectors can be combined afterwards.
user_vec = Flatten()(user_embedding)
book_vec = Flatten()(book_embedding)

We then calculate the dot product of these vectors to predict the user's rating of the book.

In [11]:
# Dot product of the book and user vectors; this is the model's output tensor.
product = Dot(axes=1)(
    [book_vec, user_vec]
)

Our model takes as input the user and book indices, and outputs the predicted rating.

In [12]:
# Inputs are given as [user, book]; every later fit/evaluate call passes its
# arrays in the same order.
model = Model(
    inputs=[user_input, book_input],
    outputs=product,
)

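The model is compiled in the training cells below, with either a mean absolute error or a mean squared error loss (both suited to a regression problem) and an Adam, RMSprop or SGD optimizer. Here we set up the checkpoint callbacks that save the model with the lowest validation loss for each loss function, and we store the initial weights so that every training run can start from the same point.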

In [13]:
# Files that receive the best model found under each loss function.
mae_checkpoint_path = '../data/book/mae_best_model.h5'
mse_checkpoint_path = '../data/book/mse_best_model.h5'

# One checkpoint callback per loss function. Each watches the validation
# loss and writes the model only when that value improves.
mae_checkpoint = ModelCheckpoint(
    mae_checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)
mse_checkpoint = ModelCheckpoint(
    mse_checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

# Snapshots of the untrained weights, restored before every training run.
mae_initial_weights = model.get_weights()
mse_initial_weights = model.get_weights()

# Train model
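We train our model with three optimizers (Adam for 20 epochs, RMSprop and SGD for 100 epochs each) and five batch sizes (8, 16, 32, 64 and 128), restoring the same initial weights before every run. We also pass the test set as validation data so that the validation loss is reported after each epoch.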

In [19]:
# Sweep optimizer x batch size with the mean-absolute-error loss.
#
# Each grid entry is (label printed in the log, optimizer class, epochs).
mae_optimizer_grid = [
    ('Adam()', Adam, 20),
    ('RMSprop()', RMSprop, 100),
    ('SGD()', SGD, 100),
]
mae_batch_sizes = [8, 16, 32, 64, 128]

# The same training inputs and validation data are used by every run; the
# validation data is the held-out test split.
mae_train_inputs = [train.user.values, train.book.values]
mae_validation_data = ([test.user.values, test.book.values], test.rating.values)

# Per-epoch metrics of every run, keyed by (optimizer label, batch size).
# `history` is still assigned inside the loop, so after the cell finishes it
# refers to the last run, exactly as before.
mae_histories = {}

print('loss function=MeanAbsoluteError()')
for optimizer_label, optimizer_cls, num_epochs in mae_optimizer_grid:
    print(f'optimizer={optimizer_label}')
    for batch_size in mae_batch_sizes:
        print(f'batch_size={batch_size}')
        # Compile with a brand-new optimizer for every run. Restoring the
        # weights alone does not reset the optimizer's internal state, so
        # reusing one optimizer instance would let earlier runs influence
        # later ones.
        model.compile(loss=MeanAbsoluteError(), optimizer=optimizer_cls())
        model.set_weights(mae_initial_weights)
        # NOTE(review): the same checkpoint callback is shared by all runs,
        # so the saved file is presumably the best model of the whole sweep
        # rather than of a single run - confirm this is intended.
        history = model.fit(x=mae_train_inputs, y=train.rating.values,
                            batch_size=batch_size, epochs=num_epochs, verbose=1,
                            validation_data=mae_validation_data,
                            callbacks=[mae_checkpoint])
        mae_histories[(optimizer_label, batch_size)] = history.history
        print('')
    # Two extra blank lines separate the log of one optimizer from the next.
    print('')
    print('')

loss function=MeanAbsoluteError()
optimizer=Adam()
batch_size=8
Epoch 1/20
Epoch 1: val_loss improved from inf to 1.52491, saving model to ../data/book\mae_best_model.h5
Epoch 2/20
Epoch 2: val_loss improved from 1.52491 to 0.96768, saving model to ../data/book\mae_best_model.h5
Epoch 3/20
Epoch 3: val_loss improved from 0.96768 to 0.85716, saving model to ../data/book\mae_best_model.h5
Epoch 4/20
Epoch 4: val_loss improved from 0.85716 to 0.81646, saving model to ../data/book\mae_best_model.h5
Epoch 5/20
Epoch 5: val_loss improved from 0.81646 to 0.79604, saving model to ../data/book\mae_best_model.h5
Epoch 6/20
Epoch 6: val_loss improved from 0.79604 to 0.78428, saving model to ../data/book\mae_best_model.h5
Epoch 7/20
Epoch 7: val_loss improved from 0.78428 to 0.77697, saving model to ../data/book\mae_best_model.h5
Epoch 8/20
Epoch 8: val_loss improved from 0.77697 to 0.77613, saving model to ../data/book\mae_best_model.h5
Epoch 9/20
Epoch 9: val_loss improved from 0.77613 to 0.7736

In [None]:
# Sweep optimizer x batch size with the mean-squared-error loss.
#
# Each grid entry is (label printed in the log, optimizer class, epochs).
mse_optimizer_grid = [
    ('Adam()', Adam, 20),
    ('RMSprop()', RMSprop, 100),
    ('SGD()', SGD, 100),
]
mse_batch_sizes = [8, 16, 32, 64, 128]

# The same training inputs and validation data are used by every run; the
# validation data is the held-out test split.
mse_train_inputs = [train.user.values, train.book.values]
mse_validation_data = ([test.user.values, test.book.values], test.rating.values)

# Per-epoch metrics of every run, keyed by (optimizer label, batch size).
# `history` is still assigned inside the loop, so after the cell finishes it
# refers to the last run, exactly as before.
mse_histories = {}

print('loss function=mean_squared_error')
for optimizer_label, optimizer_cls, num_epochs in mse_optimizer_grid:
    print(f'optimizer={optimizer_label}')
    for batch_size in mse_batch_sizes:
        print(f'batch_size={batch_size}')
        # Compile with a brand-new optimizer for every run. Restoring the
        # weights alone does not reset the optimizer's internal state, so
        # reusing one optimizer instance would let earlier runs influence
        # later ones.
        model.compile(loss='mean_squared_error', optimizer=optimizer_cls())
        model.set_weights(mse_initial_weights)
        # NOTE(review): the same checkpoint callback is shared by all runs,
        # so the saved file is presumably the best model of the whole sweep
        # rather than of a single run - confirm this is intended.
        history = model.fit(x=mse_train_inputs, y=train.rating.values,
                            batch_size=batch_size, epochs=num_epochs, verbose=1,
                            validation_data=mse_validation_data,
                            callbacks=[mse_checkpoint])
        mse_histories[(optimizer_label, batch_size)] = history.history
        print('')
    # Two extra blank lines separate the log of one optimizer from the next.
    print('')
    print('')

# Evaluate model
We evaluate our trained model on the test data to see how well it generalizes.

In [21]:
# Reload the checkpoint written by the mean-squared-error runs and score it
# on the held-out test split.
model = load_model('../data/book/mse_best_model.h5')
mse = model.evaluate(
    x=[test['user'].values, test['book'].values],
    y=test['rating'].values,
)
print(f'Test MSE: {mse}')

Test MSE: 0.9375975728034973


In [23]:
# Reload the checkpoint written by the mean-absolute-error runs and score it
# on the held-out test split. The model was saved while compiled with the
# MeanAbsoluteError loss, so the value reported here is an MAE (the label
# previously said "MSE").
model = load_model('../data/book/mae_best_model.h5')
mae = model.evaluate([test.user.values, test.book.values], test.rating.values)
print(f'Test MAE: {mae}')

Test MSE: 0.7050154209136963
