# Book Recommender System

In [1]:
# Importing necessary libraries
import io
import os
import random
import warnings
import zipfile

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from surprise import BaselineOnly, Dataset, KNNBasic, NMF, NormalPredictor, Reader, SVD
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split

# Suppressing warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including pandas deprecation and chained-assignment warnings
warnings.filterwarnings('ignore')

In [2]:
# Load every CSV member of the archive into memory, keyed by the part of the
# member name before the first '.' (e.g. 'Books.csv' -> 'Books').
# The bytes are copied into BytesIO buffers so the archive can be closed here
# instead of leaking open handles; the buffers stay readable by pd.read_csv
# after the `with` block exits.
data = {}
filename = 'books_users_data.zip'
with zipfile.ZipFile(filename, 'r') as zfile:
    for name in zfile.namelist():
        if name.endswith('.csv'):
            data[name.split('.')[0]] = io.BytesIO(zfile.read(name))

In [3]:
# Parse the three CSV members in turn; every file is decoded as latin-1
books_df, ratings_df, users_df = (
    pd.read_csv(data[member], encoding='latin-1')
    for member in ('Books', 'Ratings', 'Users')
)

In [4]:
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [6]:
users_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [7]:
# Report how many rows each raw table contains
datasets = [("Ratings", ratings_df), ("Books", books_df), ("Users", users_df)]
for label, frame in datasets:
    print("No. of rows for {} = {}".format(label, frame.shape[0]))

No. of rows for Ratings = 1149780
No. of rows for Books = 271360
No. of rows for Users = 278858


In [8]:
ratings_df

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


## Data preprocessing
---

### Data cleaning

In [9]:
books_df.isna().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [10]:
users_df.isna().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [11]:
# Impute missing ages with the mean age.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `users_df['Age']` selection: in-place methods on
# a column selection are deprecated in recent pandas and do not update the
# parent frame under copy-on-write.
age_mean = users_df['Age'].mean()
users_df['Age'] = users_df['Age'].fillna(age_mean)

In [12]:
users_df.isna().sum()

User-ID     0
Location    0
Age         0
dtype: int64

### Remove duplicates

In [13]:
# Row count before de-duplication; the baseline for the duplicate count
r_row = ratings_df.shape[0]

In [14]:
# Keep the first rating for every (ISBN, User-ID) pair
ratings_df = ratings_df.drop_duplicates(subset=['ISBN', 'User-ID'], keep='first')
# Duplicates removed = rows before - rows after (the reverse order would report
# a negative number whenever duplicates exist)
print("No. of duplicates detected: {}".format(r_row - len(ratings_df.index)))

No. of duplicates detected: 0


In [15]:
# Keep one row per book (ISBN) and one row per user (User-ID)
books_df = books_df.drop_duplicates(subset='ISBN')
users_df = users_df.drop_duplicates(subset='User-ID')

### Remove wrong data entry
It was found that `books_df` contains some *data-entry errors*:

In [16]:
df0 = books_df.copy()

In [17]:
# Coerce the year column to numeric; values that cannot be parsed become NaN,
# which makes the malformed rows easy to select with isna()
df0['Year-Of-Publication'] = pd.to_numeric(df0['Year-Of-Publication'], errors='coerce')

In [18]:
df0[df0['Year-Of-Publication'].isna()]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",2003,,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,


The *year of publication* for these 3 rows was written under the column of `Book-Author` instead of `Year-Of-Publication`.

The *correct table* should be:
    
| | ISBN | Book-Title | Book-Author | Year-Of-Publication | ... |
| -: | -: | -: | -: | -: | -: |
| **209538** | 078946697X | DK Readers:<br> Creating the X<br>-Men, How It All<br> Beg... | NaN | 2000 | ... |
| **220731** | 2070426769 | Peuple du ciel,<br> suivi de 'Les<br> Bergers\";Jean-<br>M... | NaN | 2003 | ... |
| **221678** | 0789466953 | DK Readers:<br> Creating the X-<br>Men, How<br> Comic Book... | NaN | 2000 | ... |

**However**, to avoid confusion or wrong correction of data, these rows with wrong data entry are *dropped*.

In [19]:
# Drop the incorrect data entry: keep only rows whose year parses as a number
valid_rows = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce').notna()
# .copy() makes the filtered frame independent of the original, so the column
# assignment below writes to a real frame rather than to a slice
# (avoids SettingWithCopyWarning and a possibly lost update)
books_df = books_df.loc[valid_rows].copy()

books_df['Year-Of-Publication'] = books_df['Year-Of-Publication'].astype(int)

In [20]:
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [21]:
del df0

### Remove books with very few ratings and users that rated very few books

In [22]:
# Number of ratings each book (ISBN) received, as an (ISBN, Ratings_Count) frame
ratings_count = (
    ratings_df.groupby('ISBN')['Book-Rating']
    .count()
    .rename('Ratings_Count')
    .reset_index()
)

In [23]:
ratings_count.head()

Unnamed: 0,ISBN,Ratings_Count
0,330299891,2
1,375404120,2
2,586045007,1
3,9022906116,2
4,9032803328,1


In [24]:
# Attach each book's rating count, then discard books without any rating
# (the left merge leaves Ratings_Count as NaN for those)
books_df = (
    books_df
    .merge(ratings_count, how='left', on='ISBN')
    .dropna(subset=['Ratings_Count'])
)

In [25]:
# Keep only the books that received at least 50 ratings
enough_ratings = books_df['Ratings_Count'].ge(50)
books_df = books_df.loc[enough_ratings]

In [26]:
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Ratings_Count
18,440234743,The Testament,John Grisham,1999,Dell,http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...,http://images.amazon.com/images/P/0440234743.0...,422.0
19,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,180.0
26,971880107,Wild Animus,Rich Shapero,2004,Too Far,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...,2502.0
27,345402871,Airframe,Michael Crichton,1997,Ballantine Books,http://images.amazon.com/images/P/0345402871.0...,http://images.amazon.com/images/P/0345402871.0...,http://images.amazon.com/images/P/0345402871.0...,207.0
28,345417623,Timeline,MICHAEL CRICHTON,2000,Ballantine Books,http://images.amazon.com/images/P/0345417623.0...,http://images.amazon.com/images/P/0345417623.0...,http://images.amazon.com/images/P/0345417623.0...,407.0


In [27]:
# Restrict ratings to books that survived the filtering and to known users
known_book = ratings_df['ISBN'].isin(books_df['ISBN'])
known_user = ratings_df['User-ID'].isin(users_df['User-ID'])
ratings_df = ratings_df.loc[known_book & known_user]

In [28]:
# Drop users that rated very few books: keep those with at least 10 ratings
user_counts = ratings_df['User-ID'].value_counts()
frequent_raters = user_counts.loc[user_counts.ge(10)]
ratings_df = ratings_df.loc[ratings_df['User-ID'].isin(frequent_raters.index)]

In [29]:
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
795,277042,60505885,0
796,277042,61097101,0
798,277042,312283709,8
799,277042,312983271,0
800,277042,380731851,0


### Normalization

In [30]:
ratings_df.head(10)

Unnamed: 0,User-ID,ISBN,Book-Rating
795,277042,60505885,0
796,277042,61097101,0
798,277042,312283709,8
799,277042,312983271,0
800,277042,380731851,0
801,277042,446605484,7
802,277042,446611212,8
803,277042,451188454,0
805,277042,609804138,9
806,277042,671003755,0


In [31]:
sorted(ratings_df['Book-Rating'].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

Ratings *range from 0 to 10*.

In [32]:
# Average rating given by each user, as a (User-ID, Mean_Rating) frame
mean_rating = (
    ratings_df.groupby('User-ID')['Book-Rating']
    .mean()
    .rename('Mean_Rating')
    .reset_index()
)

In [33]:
mean_rating.head()

Unnamed: 0,User-ID,Mean_Rating
0,243,1.946429
1,254,1.909091
2,487,1.428571
3,507,1.742857
4,638,7.235294


#### Adjusted Ratings
The adjusted ratings remove each user's personal rating bias (some users rate consistently higher or lower than others), which makes ratings comparable across users:

$$
\text{adjusted rating}(r) = r - \text{mean rating for user}(u),
$$

where $r$ is the original rating value for a particular book, and $u$ is the user who provided the rating.

In [34]:
# Attach each user's mean rating, then centre every raw rating on that mean
ratings_df = (
    ratings_df
    .merge(mean_rating, how='left', on='User-ID')
    .assign(Adj_Rating=lambda frame: frame['Book-Rating'] - frame['Mean_Rating'])
)

In [35]:
ratings_df

Unnamed: 0,User-ID,ISBN,Book-Rating,Mean_Rating,Adj_Rating
0,277042,0060505885,0,3.230769,-3.230769
1,277042,0061097101,0,3.230769,-3.230769
2,277042,0312283709,8,3.230769,4.769231
3,277042,0312983271,0,3.230769,-3.230769
4,277042,0380731851,0,3.230769,-3.230769
...,...,...,...,...,...
158049,276688,0553575090,7,2.200000,4.800000
158050,276688,0553575104,6,2.200000,3.800000
158051,276688,0679459618,0,2.200000,-2.200000
158052,276688,0679751521,0,2.200000,-2.200000


In [36]:
len(ratings_df['ISBN'].unique())

2161

In [37]:
ratings_df

Unnamed: 0,User-ID,ISBN,Book-Rating,Mean_Rating,Adj_Rating
0,277042,0060505885,0,3.230769,-3.230769
1,277042,0061097101,0,3.230769,-3.230769
2,277042,0312283709,8,3.230769,4.769231
3,277042,0312983271,0,3.230769,-3.230769
4,277042,0380731851,0,3.230769,-3.230769
...,...,...,...,...,...
158049,276688,0553575090,7,2.200000,4.800000
158050,276688,0553575104,6,2.200000,3.800000
158051,276688,0679459618,0,2.200000,-2.200000
158052,276688,0679751521,0,2.200000,-2.200000


In [38]:
print(f"Range of Adj_Rating = [{ratings_df['Adj_Rating'].min()}, {ratings_df['Adj_Rating'].max()}]")

Range of Adj_Rating = [-9.238095238095237, 9.952830188679245]


In [39]:
# Min-max scale the adjusted ratings onto [0, 10]
adj = ratings_df['Adj_Rating']
min_rating = adj.min()
max_rating = adj.max()

normalized_rating = adj.sub(min_rating).div(max_rating - min_rating).mul(10)

In [40]:
print(f"New range = [{normalized_rating.min()}, {normalized_rating.max()}]")

New range = [0.0, 10.0]


In [41]:
ratings_df['Adj_Rating'] = normalized_rating

## Model Evaluation
---

In [42]:
# Wrap the (user, item, rating) columns in a Surprise dataset on a 0-10 scale
rating_columns = ['User-ID', 'ISBN', "Adj_Rating"]
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(ratings_df[rating_columns], reader)

In [43]:
# Candidate algorithms to compare; SVD gets an explicit epoch count, while
# NMF and NormalPredictor are constructed with their default settings
algos = [SVD(n_epochs=10), NMF(), NormalPredictor()]

In [44]:
# 5-fold cross-validated RMSE for every candidate algorithm, with a separator
# line printed between (but not after) the per-algorithm reports
for position, algo in enumerate(algos):
    if position:
        print('------------------')
    cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.7576  1.7415  1.7473  1.7600  1.7572  1.7527  0.0071  
Fit time          0.59    0.62    0.60    0.56    0.56    0.59    0.02    
Test time         0.16    0.22    0.10    0.14    0.15    0.15    0.04    
------------------
Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.9356  1.9605  1.9488  1.9468  1.9490  1.9481  0.0079  
Fit time          1.50    1.59    1.51    1.53    1.51    1.52    0.03    
Test time         0.08    0.08    0.14    0.14    0.08    0.10    0.03    
------------------
Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.4227  2.4441  2.4184  2.4415  2.4448  2.4343  0.0114  
Fit time          0.07    0.09    0.09    0.08    0.0

### Parameter Tuning

In [45]:
# Set up the Surprise Reader and Dataset objects.
# The scale must match the [0, 10] range that Adj_Rating was normalised to; the
# dataset is rebuilt here so the grid search visibly runs on data read with
# this reader.
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(ratings_df[['User-ID', 'ISBN', 'Adj_Rating']], reader)

# Define the parameter grid to search over
# NOTE(review): in the recorded run every selected value sits on an edge of its
# grid, so the search ranges may be worth widening.
param_grid = {'n_factors': [50, 100, 200],
              'n_epochs': [5, 10, 20],
              'lr_all': [0.002, 0.005, 0.01],
              'reg_all': [0.4, 0.6, 0.8]}

# Set up the GridSearchCV object (5-fold CV, RMSE as the selection metric)
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=-1)

# Run the grid search to find the best model
gs.fit(data)

# Print the best RMSE score and the combination of parameters that gave the best RMSE score
print('Best RMSE score:', gs.best_score['rmse'])
print('Best parameters:', gs.best_params['rmse'])

Best RMSE score: 1.7220711798576147
Best parameters: {'n_factors': 50, 'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.8}


## Recommendation

In [46]:
# Set up the Surprise Dataset object and Reader object
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(ratings_df[['User-ID', 'ISBN', 'Adj_Rating']], reader)

# Use the hyperparameters selected by the grid search directly; a hand-copied
# dictionary goes stale as soon as the search is re-run.
best_params = gs.best_params['rmse']

# Train the SVD model on the entire dataset using the best hyperparameters.
# No hold-out split is made here: the final model is fit on all ratings, and a
# split whose training part is immediately replaced by the full trainset would
# be dead code.
algo = SVD(**best_params)
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x286194590>

In [47]:
# user list
user_list = ratings_df['User-ID'].unique()

In [48]:
def recommendation(user_input=None, top_n=10):
    """Display the books with the highest predicted rating for one user.

    Books the user has already rated are excluded. The result is shown with
    ``display`` in rank order (best prediction first); nothing is returned.

    Parameters
    ----------
    user_input : int or str, optional
        User-ID to recommend for. Text, such as the value returned by
        ``input()``, is converted to an integer. When empty or omitted, a
        random user is drawn from ``user_list``.
    top_n : int, default 10
        Number of books to recommend.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1, ``user_input`` cannot be converted to
        an integer, or the user is not present in ``user_list``.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1, got {}".format(top_n))

    if not user_input or not str(user_input).strip():
        user_id = random.choice(user_list)
    else:
        # input() yields text; comparing that text with the User-ID column
        # would match no rows, so convert it to an integer first.
        try:
            user_id = int(user_input)
        except (TypeError, ValueError):
            raise ValueError(
                "User-ID must be an integer, got {!r}".format(user_input)
            ) from None
        if user_id not in user_list:
            raise ValueError(
                "User-ID {} has no ratings in the dataset".format(user_id)
            )

    # Candidate books = every rated book minus the ones this user already rated.
    # Sorted so that ties in the predicted rating are broken reproducibly.
    rated_isbns = set(ratings_df.loc[ratings_df['User-ID'] == user_id, 'ISBN'])
    candidate_isbns = sorted(set(ratings_df['ISBN']) - rated_isbns)

    # The third element is a placeholder "true" rating; only the estimate is used
    predictions = algo.test([(user_id, isbn, 0) for isbn in candidate_isbns])

    # Sort the predicted ratings in descending order and keep the top-rated books
    predictions.sort(key=lambda pred: pred.est, reverse=True)
    recommended_books = [pred.iid for pred in predictions[:top_n]]

    # Look the books up and restore the ranking order, which isin() alone loses
    rank = {isbn: position for position, isbn in enumerate(recommended_books)}
    picks = books_df[books_df['ISBN'].isin(recommended_books)]
    picks = picks.iloc[picks['ISBN'].map(rank).to_numpy().argsort()]

    print("Recommended books for User #{}:".format(user_id))
    display(picks)

In [49]:
# Input User-ID OR input nothing for random user
recommendation(input("Input a User-ID or input nothing for a random user: ")) # e.g. 277042

Recommended books for User #14422:


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Ratings_Count
37,0446310786,To Kill a Mockingbird,Harper Lee,1988,Little Brown &amp; Company,http://images.amazon.com/images/P/0446310786.0...,http://images.amazon.com/images/P/0446310786.0...,http://images.amazon.com/images/P/0446310786.0...,389.0
2143,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,Arthur A. Levine Books,http://images.amazon.com/images/P/059035342X.0...,http://images.amazon.com/images/P/059035342X.0...,http://images.amazon.com/images/P/059035342X.0...,571.0
2232,0812550706,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1994,Tor Books,http://images.amazon.com/images/P/0812550706.0...,http://images.amazon.com/images/P/0812550706.0...,http://images.amazon.com/images/P/0812550706.0...,195.0
2809,0590353403,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,168.0
3839,0439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,http://images.amazon.com/images/P/0439136350.0...,http://images.amazon.com/images/P/0439136350.0...,197.0
5431,0439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,http://images.amazon.com/images/P/0439139597.0...,http://images.amazon.com/images/P/0439139597.0...,194.0
5432,0439064864,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,170.0
5506,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,http://images.amazon.com/images/P/043935806X.0...,http://images.amazon.com/images/P/043935806X.0...,http://images.amazon.com/images/P/043935806X.0...,334.0
6932,0439139600,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2002,Scholastic Paperbacks,http://images.amazon.com/images/P/0439139600.0...,http://images.amazon.com/images/P/0439139600.0...,http://images.amazon.com/images/P/0439139600.0...,193.0
19640,0877017883,Griffin &amp; Sabine: An Extraordinary Corresp...,Nick Bantock,1991,Chronicle Books,http://images.amazon.com/images/P/0877017883.0...,http://images.amazon.com/images/P/0877017883.0...,http://images.amazon.com/images/P/0877017883.0...,72.0
