In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import warnings
from surprise import Reader, Dataset, SVD, model_selection, accuracy
from surprise.model_selection import GridSearchCV

In [2]:
reduced_books_users_ratings = pd.read_csv("data/clean/reduced_books_users_ratings.csv")

In [3]:
books_users_ratings = pd.read_csv("data/clean/books_users_ratings.csv")

In [4]:
user_item_rating = reduced_books_users_ratings[['user_id', 'unique_isbn', 'book_rating']]
user_item_rating.head()

Unnamed: 0,user_id,unique_isbn,book_rating
0,11676,038550120X,10
1,11676,0671537458,8
2,11676,0679776818,8
3,11676,0684867621,3
4,11676,8437606322,8


In [5]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(user_item_rating, reader)

In [6]:
#splitting into train and test
train_data, test_data = model_selection.train_test_split(data, test_size=0.2)

In [7]:
param_grid = {'n_factors': [80, 100, 120], 'lr_all': [0.001, 0.005, 0.01], 'reg_all': [0.01, 0.02, 0.04]}

# Optimize SVD algorithm for both root mean squared error ('rmse') and mean average error ('mae')
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=-1)

In [8]:
# Fit the gridsearch result on the entire dataset
%time gs.fit(data)

CPU times: user 4min 13s, sys: 2.91 s, total: 4min 16s
Wall time: 4min 23s


In [9]:
# Return the best version of the SVD algorithm
model = gs.best_estimator['rmse']

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

1.5718280100211657
{'n_factors': 80, 'lr_all': 0.005, 'reg_all': 0.04}


In [10]:
%time model_selection.cross_validate(model, data, measures=['rmse', 'mae'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5693  1.5717  1.5764  1.5748  1.5700  1.5724  0.0028  
MAE (testset)     1.2155  1.2088  1.2127  1.2093  1.2119  1.2116  0.0025  
Fit time          4.51    4.49    4.54    4.48    4.46    4.50    0.03    
Test time         0.15    0.15    0.15    0.16    0.23    0.17    0.03    
CPU times: user 24.7 s, sys: 52.1 ms, total: 24.7 s
Wall time: 24.7 s


{'test_rmse': array([1.56927535, 1.57170023, 1.57643769, 1.57480061, 1.56998285]),
 'test_mae': array([1.21552353, 1.20875547, 1.21271826, 1.20928167, 1.21190043]),
 'fit_time': (4.513628959655762,
  4.486056089401245,
  4.5405848026275635,
  4.477702856063843,
  4.4607648849487305),
 'test_time': (0.15320396423339844,
  0.1505589485168457,
  0.14806485176086426,
  0.15732812881469727,
  0.2307729721069336)}

In [11]:
### Use the new parameters with the training set
model = SVD(n_factors=80, lr_all=0.005, reg_all=0.04)
model.fit(train_data) # re-fit on only the training data using the best hyperparameters
test_pred = model.test(test_data)
print("SVD : Test Set")
accuracy.rmse(test_pred, verbose=True)

SVD : Test Set
RMSE: 1.5875


1.5875037441944968

In [12]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
        
    return top_n

In [13]:
def get_reading_list(userid, predictions):
    """
    Retrieve full book titles from full 'books_users_ratings' dataframe
    """
    reading_list = defaultdict(list)
    top_n = get_top_n(predictions, n=10)
    for n in top_n[userid]:
        book, rating = n
        title = books_users_ratings.loc[books_users_ratings.unique_isbn == book].book_title.unique()[0]
        reading_list[title] = rating
    return reading_list

In [14]:
get_reading_list(111637, test_pred)

defaultdict(list,
            {'The Amazing Adventures of Kavalier &amp; Clay': 7.400026743538274,
             'The Book of Illusions: A Novel': 7.007931043368879,
             "CORELLI'S MANDOLIN : A Novel": 6.331719735947438,
             'The Last Time They Met : A Novel': 6.320274567617097})