In [2]:
# To store\load the data
import pandas as pd

# To do linear algebra
import numpy as np

# To compute similarities between vectors
from sklearn.metrics import mean_squared_error

# To create deep learning models
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.layers import Input, Embedding, Reshape, Dot
from tensorflow.keras.models import Model

# --------------------------------------------------------------------- #

# Load the 'Books and Book-Titles' 
df_books_and_titles = pd.read_csv('books.csv')
df_books_and_titles = df_books_and_titles[['book_id','title']]
print("\n\nSample of the 'Books and Book-Titles' dataframe\n")
print(df_books_and_titles.sample(5))

# Create a dictionary mapping 'book_id' to 'title'
di_book_title = {}
for i in range(len(df_books_and_titles)):
    bk_id = df_books_and_titles.loc[i,'book_id']
    title = df_books_and_titles.loc[i,'title']
    di_book_title[bk_id] = title
    
# Load the 'User-ID ratings of Book-ID'
df_user_book_ratings = pd.read_csv('ratings-books.csv')
print("\n\nSample of the 'User-ID ratings of Book-ID' dataframe\n")
print(df_user_book_ratings.sample(5))

# --------------------------------------------------------------------- #

# Filter sparse movies
min_book_ratings = 100
filter_books = (df_user_book_ratings['book_id'].value_counts()>min_book_ratings)
filter_books = filter_books[filter_books].index.tolist()

# Filter sparse users
min_user_ratings = 100
filter_users = (df_user_book_ratings['user_id'].value_counts()>min_user_ratings)
filter_users = filter_users[filter_users].index.tolist()

# Actual filtering
df_filtered = df_user_book_ratings[(df_user_book_ratings['book_id'].isin(filter_books)) & \
                                   (df_user_book_ratings['user_id'].isin(filter_users))]

del filter_books, filter_users, min_book_ratings, min_user_ratings
print('Shape User-Ratings unfiltered:\t{}'.format(df_user_book_ratings.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filtered.shape))

print("\n unique user_id counts:", len(df_filtered.groupby(['user_id']).count()))
print("\n unique book_id counts:", len(df_filtered.groupby(['book_id']).count()))

# --------------------------------------------------------------------- #

# Testingsize
n = 1000000

# Split train- & testset
df_train = df_filtered[:-n]
df_test = df_filtered[-n:]
print("Shape of training set:",df_train.shape, "Shape of testing set:",df_test.shape)

# --------------------------------------------------------------------- #

# Create user and movie-id mapping to convert to numbers
user_id_mapping = {id:i for i, id in enumerate(df_filtered['user_id'].unique())}
#print(user_id_mapping) # user_id_mapping is a dictionary that simply re-enumerates userIDs to sequential numbers 0,1,2,3
book_id_mapping = {id:i for i, id in enumerate(df_filtered['book_id'].unique())}

# --------------------------------------------------------------------- #

# use dataframe map function to map users & movies to mapped ids based on above mapping
train_user_data = df_train['user_id'].map(user_id_mapping)
train_book_data = df_train['book_id'].map(book_id_mapping)

# --------------------------------------------------------------------- #

# do the same for test data
test_user_data = df_test['user_id'].map(user_id_mapping)
test_book_data = df_test['book_id'].map(book_id_mapping)

# --------------------------------------------------------------------- #

# Get input variable-sizes
users = len(user_id_mapping)
books = len(book_id_mapping)
embedding_size = 100

# --------------------------------------------------------------------- #

# use Input() to create tensors for - 'user' and 'movie'
user_id_input = Input(shape=(1,), name='user')
book_id_input = Input(shape=(1,), name = 'book')

# --------------------------------------------------------------------- #

# Create embedding layer for users 
user_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)

# create embedding layer for movies just like users
book_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=books,
                           input_length=1, 
                           name='book_embedding')(book_id_input)

# --------------------------------------------------------------------- #

# Reshape the embedding layers
user_vector = Reshape([embedding_size])(user_embedding)

book_vector = Reshape([embedding_size])(book_embedding)

# --------------------------------------------------------------------- #

# Compute dot-product of reshaped embedding layers as prediction
y = Dot(1, normalize=False)([user_vector, book_vector])

# --------------------------------------------------------------------- #

# Setup model
model = Model(inputs=[user_id_input, book_id_input], outputs=y)
model.compile(loss='mse', optimizer='adam')
model.summary()

# --------------------------------------------------------------------- #

# Fit model
X = [train_user_data, train_book_data]
y = df_train['rating']

batch_size = 1024
epochs = 5
validation_split = 0.1

model.fit(X, y,
          batch_size=batch_size, 
          epochs=epochs,
          validation_split=validation_split,
          shuffle=True,
          verbose=1)

# --------------------------------------------------------------------- #

# Test model by making predictions on test data
y_pred = model.predict([test_user_data, test_book_data]).ravel()
# clip upper and lower ratings
y_pred = list(map(lambda x: 1.0 if x < 1 else 5.0 if x > 5.0 else x, y_pred))
# get true labels
y_true = df_test['rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With DL Matrix-Factorization: {:.4f} RMSE'.format(rmse))

# --------------------------------------------------------------------- #

di_book_title = {}
for i in range(len(df_books_and_titles)):
    bk_id = df_books_and_titles.loc[i,'book_id']
    title = df_books_and_titles.loc[i,'title']
    di_book_title[bk_id] = title

lst_titles = []
for i in test_book_data.values:
    if i > 0:
        tit = di_book_title[i]
        lst_titles.append(tit)
    else:
        lst_titles.append("N/A")
    
results_df = pd.DataFrame({
    'User ID': test_user_data.values,
    'Book ID': test_book_data.values,
    'Book Name' : lst_titles,
    'Predicted Rating': np.round(y_pred, 3),
    'Actual Rating': y_true
})

print("First 20 rows:\n\n")
print(results_df.head(20))
print("\n\n 20 random rows:\n\n")
print(results_df.sample(20))

# --------------------------------------------------------------------- #



def get_user_id():
    x = int(input("Enter a user-id from 1 to 36198 (0 to 'quit'): "))
    if x < 1 or x > 36198:
        return int(0)
    else:
        return int(x)

# now sort them
def Sort_Tuple(tup): 
  
    # reverse = None (Sorts in Ascending order) 
    # key is set to sort using second element of 
    # sublist lambda has been used 
    tup.sort(key = lambda x: x[1],reverse = True) 
    return tup 

def print_books_with_this_rating(id,top_rating):

    the_user_vector = model.get_layer('user_embedding').get_weights()[0][id]
    all_book_vectors = model.get_layer('book_embedding').get_weights()[0]

    lst = []
    for i in range(len(all_book_vectors)):
        book_vec = all_book_vectors[i]
        x = np.dot(the_user_vector,book_vec)
        lst.append((i,x))

    sorted_lst = Sort_Tuple(lst)
    
    top_index = 0

    for i in range(len(sorted_lst)):
        bk_id,rating = sorted_lst[i]
        if rating <= top_rating:  # first index that is <= top_index
            top_index = i
            break;    
            
    # print out the top 10 books for this user
    print("\n-----------------------------------------------------------------------------\n")
    if top_rating >= 4.00:  #4.00 to 5.00
        print("Books User ",id," will LOVE\n")
    elif top_rating >= 3.00:  #3.00 to 3.99
        print("Books User ",id," will LIKE\n")
    elif top_rating > 2.00:  #2.00 to 2.99
        print("Books User ",id," will consider 'SO-SO'\n")
    elif top_rating > 1.00:  #1.00 to 1.99
        print("Books User ",id," will DISLIKE\n")
    else:
        print("Books User ",id," will HATE\n")

    num_books_to_display = 10
    for i in range(num_books_to_display):
        bk_id,rating = sorted_lst[top_index + i]
        bk_title = lst_titles[bk_id]
        if (rating <= top_rating) and ((top_rating - 1.0) <= rating):
            print(f'{bk_title[:65].ljust(70)}  {rating:3.2f}')

    
the_user_id = 5000
while the_user_id != 0:
    the_user_id = get_user_id()
    if the_user_id == 0:
        break
    print_books_with_this_rating(the_user_id,4.99)
    print_books_with_this_rating(the_user_id,3.99)
    print_books_with_this_rating(the_user_id,2.99)
    print_books_with_this_rating(the_user_id,1.99)
    print_books_with_this_rating(the_user_id,0.99)
    
    
    

"""
# first show the books they've already read (and rated)
df_the_user = df_filtered[df_filtered['user_id'] == the_user_id]
print(df_the_user)

# this user has read 111 of the 9466 books.
"""

# --------------------------------------------------------------------- #






Sample of the 'Books and Book-Titles' dataframe

      book_id                                              title
8419     8420                               Batman: Hush, Vol. 1
435       436                            The God of Small Things
5520     5521           The Six Sacred Stones (Jack West Jr, #2)
1861     1862  Heir to the Empire (Star Wars: The Thrawn Tril...
3286     3287                      Food Rules: An Eater's Manual


Sample of the 'User-ID ratings of Book-ID' dataframe

         user_id  book_id  rating
1558732    12445      576       3
5837973    29797       24       4
2103226    28350     7301       2
5218814    29069     1728       5
4688029    49783     5065       3
Shape User-Ratings unfiltered:	(5976479, 3)
Shape User-Ratings filtered:	(4508993, 3)

 unique user_id counts: 36199

 unique book_id counts: 9466
Shape of training set: (3508993, 3) Shape of testing set: (1000000, 3)
Model: "functional_3"
___________________________________________________________

Enter a user-id from 1 to 36198 (0 to 'quit'): 5000

-----------------------------------------------------------------------------

Books User  5000  will LOVE

The Gunslinger (The Dark Tower, #1)                                     4.66
Marlfox (Redwall, #11)                                                  4.60
The Day the Crayons Quit                                                4.49
In the Garden of Beasts: Love, Terror, and an American Family in        4.47
Then He Ate My Boy Entrancers (Confessions of Georgia Nicolson, #       4.44
An Anthropologist on Mars: Seven Paradoxical Tales                      4.44
A Streetcar Named Desire                                                4.43
Home                                                                    4.39
The Hidden Child (Patrik Hedström, #5)                                  4.36
Green Mars (Mars Trilogy, #2)                                           4.34

--------------------------------------------------------------------

"\n# first show the books they've already read (and rated)\ndf_the_user = df_filtered[df_filtered['user_id'] == the_user_id]\nprint(df_the_user)\n\n# this user has read 111 of the 9466 books.\n"