In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow import keras

from sklearn.metrics import mean_squared_error

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Load the user/book ratings table.
# The path uses a forward slash so it resolves on Windows, Linux and macOS;
# a backslash-separated path is only understood on Windows.
df = pd.read_csv('dataset/ratings-books.csv')

# N and M are one more than the largest id seen, so that an id can be used
# directly as a row index into a table of that size.
N = df.user_id.max() + 1 # largest user id + 1
M = df.book_id.max() + 1 # largest book id + 1
print(f"Number of Users: {N-1}, Number of Books: {M-1}")

print("\nSample of the 'Books-Ratings DataFrame'\n")
print(df.sample(10))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Load the book metadata and keep only the id -> title columns.
# The path uses a forward slash so it resolves on Windows, Linux and macOS;
# a backslash-separated path is only understood on Windows.
df_books_and_titles = pd.read_csv('dataset/books.csv')
df_books_and_titles = df_books_and_titles[['book_id','title']]
print("\n\nSample of the 'Books and Book-Titles' dataframe\n")
print(df_books_and_titles.sample(5))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Split into train and test: shuffle the rows, then take the first 80% for
# training and the remaining 20% for testing.
df = shuffle(df)
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]
# Report the split using 1-based row positions of the shuffled frame:
# the train slice holds exactly `cutoff` rows, the test slice holds the rest.
print(f"\nTrain set uses rows 1 thru {cutoff}, test set uses rows {cutoff+1} to {len(df)}")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# initialize variables
K = 10 # latent dimensionality
mu = df_train.rating.mean() # mean training rating (not referenced by the model built below)
epochs = 1
reg = 0. # regularization penalty

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# keras model: one integer id per sample for the user and for the book,
# each looked up in its own K-dimensional embedding table.
u = Input(shape=(1,),name='user')
m = Input(shape=(1,),name='book')
u_embedding = Embedding(N, K, embeddings_regularizer=l2(reg),name='user_embedding')(u) # (batch, 1, K)
m_embedding = Embedding(M, K, embeddings_regularizer=l2(reg),name='book_embedding')(m) # (batch, 1, K)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Per-user and per-book scalar biases. They are created here but are NOT
# part of the output graph while the Add() line below stays commented out.
u_bias = Embedding(N, 1, embeddings_regularizer=l2(reg))(u) # (batch, 1, 1)
m_bias = Embedding(M, 1, embeddings_regularizer=l2(reg))(m) # (batch, 1, 1)

# Predicted rating = dot product of the user and book latent vectors.
x = Dot(axes=2)([u_embedding, m_embedding]) # (batch, 1, 1)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Bias terms are currently disabled; uncomment to add them to the prediction.
#x = Add()([x, u_bias, m_bias])
x = Flatten()(x) # (batch, 1)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

model = Model(inputs=[u, m], outputs=x)
model.compile(
  loss='mse',
  # optimizer='adam',
  # optimizer=Adam(learning_rate=0.01),
  # `learning_rate` is the supported keyword; the old `lr` alias is
  # deprecated and rejected by newer Keras releases.
  optimizer=SGD(learning_rate=0.08, momentum=0.9),
  metrics=['mse'],
)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Fit on the training split; the held-out split is passed as validation
# data so Keras reports its loss/metric after each epoch.
train_inputs = [df_train.user_id.values, df_train.book_id.values]
train_targets = df_train.rating.values
holdout = (
  [df_test.user_id.values, df_test.book_id.values],
  df_test.rating.values,
)

r = model.fit(
  x=train_inputs,
  y=train_targets,
  batch_size=128,
  epochs=epochs,
  validation_data=holdout,
)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #


import pickle
# Write the trained model to disk and immediately read it back, so that
# everything below runs against the reloaded copy.
model.save("MatrixFactorModel")
model = keras.models.load_model("MatrixFactorModel")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Score every held-out (user, book) pair and flatten to a 1-D vector.
test_inputs = [df_test.user_id.values, df_test.book_id.values]
y_true = df_test.rating.values
y_pred = model.predict(test_inputs).ravel()

# Root-mean-squared error between predicted and actual ratings.
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
print('\n\nTesting Result With DL Matrix-Factorization: {:.4f} RMSE'.format(rmse))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Map book_id -> title. Built in one pass from the two columns, which avoids
# a per-row .loc lookup and does not depend on the frame's index labels.
di_book_title = dict(
    zip(df_books_and_titles['book_id'], df_books_and_titles['title'])
)

# One title per test-set row, in the same order as df_test, so it lines up
# with y_pred / y_true below. NOTE: this list is positional by test row; it
# is NOT indexed by book id. Non-positive ids and ids missing from the title
# table are shown as "N/A" instead of raising KeyError.
lst_titles = [
    di_book_title.get(i, "N/A") if i > 0 else "N/A"
    for i in df_test.book_id.values
]

# Persist the titles and the id-space sizes. Context managers make sure each
# file handle is flushed and closed as soon as its dump finishes.
with open("lst_titles_mf.p", 'wb') as fh:
    pickle.dump(lst_titles, fh)
with open("M_mf.p", 'wb') as fh:
    pickle.dump(M, fh)
with open("N_mf.p", 'wb') as fh:
    pickle.dump(N, fh)

# Side-by-side view of predicted vs. actual rating for every test row.
results_df = pd.DataFrame({
    'User ID': df_test.user_id.values,
    'Book ID': df_test.book_id.values,
    'Book Name' : lst_titles,
    'Predicted Rating': np.round(y_pred, 1),
    'Actual Rating': y_true
})

print("\n\n Sample Predictions for Book-Ratings:\n\n")
print(results_df.sample(5))


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

def get_user_id():
    """Prompt for a user id and return it, or 0 to signal 'quit'.

    Valid ids are 1 through N-1, where N is the module-level value derived
    from the ratings data. Any other entry returns 0, which the caller
    treats as the quit signal. That covers out-of-range numbers and text
    that cannot be parsed as an integer.

    Returns:
        int: the validated user id, or 0.
    """
    raw = input(f"Enter a user-id from 1 to {N-1}  (0 to 'quit'): ")
    try:
        x = int(raw)
    except ValueError:
        # Non-numeric input (including an empty line) used to crash the
        # session; handle it the same way as any other invalid id.
        return 0
    if x < 1 or x > (N-1):
        return 0
    return x

# now sort them
def Sort_Tuple(tup):
    """Order a list of pairs by their second element, largest first.

    The list passed in is reordered in place and the same list object is
    returned. Pairs with equal second elements keep their relative order.
    """
    def _second(pair):
        return pair[1]

    # Slice assignment keeps the caller's list object and replaces its contents.
    tup[:] = sorted(tup, key=_second, reverse=True)
    return tup

def print_books_with_this_rating(id,top_rating):
    """Print up to 10 books predicted to fall in one rating band for a user.

    Every book id from 1 to M-1 is scored for user `id` with the
    module-level `model`. The books are ranked by predicted rating, highest
    first. Starting at the first book whose prediction is <= `top_rating`,
    the next 10 ranked books are examined, and those whose prediction lies
    in [top_rating - 1.0, top_rating] are printed with their title.

    Args:
        id: user id to predict for.
        top_rating: upper bound of the rating band (e.g. 4.99 for 4.00-4.99).

    Reads the module-level names `M`, `model` and `di_book_title`.
    """
    # Score all books for this one user in a single predict call.
    book_ids = np.arange(1, M)
    user_ids = np.full(len(book_ids), id)
    pred = model.predict([user_ids, book_ids]).ravel()

    # (book_id, predicted_rating) pairs, highest prediction first.
    ranked = sorted(zip(book_ids, pred), key=lambda pair: pair[1], reverse=True)

    # Position of the first book at or below the band's ceiling. If every
    # prediction is above the ceiling there is nothing to show for this band.
    top_index = next(
        (pos for pos, (_, rating) in enumerate(ranked) if rating <= top_rating),
        len(ranked),
    )

    # Band heading; every boundary is inclusive at its lower edge.
    print("\n-----------------------------------------------------------------------------\n")
    if top_rating >= 4.00:  #4.00 to 5.00
        print("Books User ",id," will LOVE\n")
    elif top_rating >= 3.00:  #3.00 to 3.99
        print("Books User ",id," will LIKE\n")
    elif top_rating >= 2.00:  #2.00 to 2.99
        print("Books User ",id," will consider 'SO-SO'\n")
    elif top_rating >= 1.00:  #1.00 to 1.99
        print("Books User ",id," will DISLIKE\n")
    else:
        print("Books User ",id," will HATE\n")

    num_books_to_display = 10
    lower_bound = top_rating - 1.0
    for bk_id, rating in ranked[top_index:top_index + num_books_to_display]:
        if lower_bound <= rating <= top_rating:
            # Look the title up by book id. The per-test-row title list is
            # ordered by test-set row, so indexing it with a book id would
            # print an unrelated title.
            bk_title = di_book_title.get(bk_id, "N/A")
            print(f'{bk_title[:65].ljust(70)}  {rating:3.2f}')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Interactive session: keep asking for a user id and, for each one, print
# the predicted books band by band (highest band first) until 0 is returned.
rating_ceilings = (4.99, 3.99, 2.99, 1.99, 0.99)
while True:
    the_user_id = get_user_id()
    if the_user_id == 0:
        break
    for ceiling in rating_ceilings:
        print_books_with_this_rating(the_user_id, ceiling)
    print("\n-----------------------------------------------------------------------------\n")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #





Number of Users: 53424, Number of Books: 10000

Sample of the 'Books-Ratings DataFrame'

         user_id  book_id  rating
2200030    29538     3828       4
5565119    36710      195       4
1390006    20106     1881       5
143928      3403        4       5
645735     11072     1399       5
3889652    44607        7       4
3375224    15056       77       5
5034418    39893      309       3
295809      6656     8054       5
2548068    21018     1455       2


Sample of the 'Books and Book-Titles' dataframe

      book_id                                      title
6198     6199                              When She Woke
9406     9407                                    The Haj
1643     1644                         Peace Like a River
7710     7711  Fahrenheit 451: The Authorized Adaptation
608       609                                       Cell

Train set uses rows 1 thru 4781182, test set uses rows 4781183 to 5976479




Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: MatrixFactorModel\assets


Testing Result With DL Matrix-Factorization: 0.9433 RMSE


 Sample Predictions for Book-Ratings:


         User ID  Book ID                                          Book Name  \
733671      7627     2614                  River Marked (Mercy Thompson, #6)   
932681      2139     4335                                     The Last Child   
1186253    28427      206                  The Selection (The Selection, #1)   
856616     30336     1502  Days of Blood & Starlight (Daughter of Smoke &...   
1186551    28726        4                              To Kill a Mockingbird   

         Predicted Rating  Actual Rating  
733671                4.1              4  
932681                3.6             