# Book Recommender (Collaborative based) using clustering

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the book catalogue. Rows with the wrong number of fields are skipped and
# reported as warnings. `on_bad_lines` (pandas >= 1.3) replaces
# `error_bad_lines`, which was deprecated in 1.3 and removed in pandas 2.0.
books = pd.read_csv('data/BX-Books.csv', sep=';', on_bad_lines='warn', encoding='latin-1')

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
books.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [4]:
books.shape

(271360, 8)

In [5]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

## Data preprocessing

In [6]:
# Keep only the columns used later; of the three cover-image URLs only the
# large one is retained.
books = books.loc[
    :,
    [
        'ISBN',
        'Book-Title',
        'Book-Author',
        'Year-Of-Publication',
        'Publisher',
        'Image-URL-L',
    ],
]

In [7]:
books.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [8]:
# Give the columns short snake_case names. The result is re-bound instead of
# renamed in place: `books` comes from a column selection, and in-place edits
# on such a frame may raise SettingWithCopyWarning on older pandas versions.
books = books.rename(columns={
    "Book-Title": "title",
    "Book-Author": "author",
    "Year-Of-Publication": "year",
    "Publisher": "publisher",
    "Image-URL-L": "img_url",
})

In [9]:
books.head(2)

Unnamed: 0,ISBN,title,author,year,publisher,img_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [3]:
# Load the user table. Malformed rows are skipped with a warning;
# `on_bad_lines` (pandas >= 1.3) replaces `error_bad_lines`, which was
# deprecated in 1.3 and removed in pandas 2.0.
users = pd.read_csv('data/BX-Users.csv', sep=';', on_bad_lines='warn', encoding='latin-1')

In [11]:
users.head(2)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [12]:
users.shape

(278858, 3)

In [4]:
# Load the (user, ISBN, rating) triples. Malformed rows are skipped with a
# warning; `on_bad_lines` (pandas >= 1.3) replaces `error_bad_lines`, which was
# deprecated in 1.3 and removed in pandas 2.0.
ratings = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', on_bad_lines='warn', encoding='latin-1')

In [14]:
ratings.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5


In [15]:
ratings.shape

(1149780, 3)

In [16]:
# One (rows, columns) tuple per line: books, users, ratings.
print(books.shape, users.shape, ratings.shape, sep='\n')

(271360, 6)
(278858, 3)
(1149780, 3)


In [17]:
# Short snake_case names for the rating columns. Re-binding (instead of
# `inplace=True`) keeps the cell free of hidden in-place mutation.
ratings = ratings.rename(columns={
    "User-ID": "user_id",
    "Book-Rating": "rating",
})

In [18]:
ratings.head(2)

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5


In [19]:
ratings['user_id'].value_counts()

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
158698        1
17920         1
277135        1
275086        1
187812        1
Name: user_id, Length: 105283, dtype: int64

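As we can see, some users have rated more than 13k books, which is implausible for a single reader, while many users have rated only one book. Note that the cells below do not remove these heavy raters: they keep only the users with more than 200 ratings and drop everyone else.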

In [20]:
ratings['user_id'].unique().shape

(105283,)

In [21]:
# Boolean Series indexed by user_id: True when that user has more than 200 ratings.
ratings_per_user = ratings['user_id'].value_counts()
x = ratings_per_user.gt(200)
x

11676      True
198711     True
153662     True
98391      True
35859      True
          ...  
158698    False
17920     False
277135    False
275086    False
187812    False
Name: user_id, Length: 105283, dtype: bool

True => the user has more than 200 ratings; False => the user has 200 ratings or fewer.

In [22]:
x[x].shape

(899,)

So there are 899 users who have given more than 200 ratings; these are the users we will keep.
##### First we collect their index values (the `user_id`s)

In [23]:
# user_ids of the users flagged True in `x` (more than 200 ratings).
y = x.loc[x].index
y

Int64Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352,
            110973, 235105,
            ...
            260183, 155916,  44296,  73681,  59727,  28634, 188951,   9856,
            268622, 274808],
           dtype='int64', length=899)

In [24]:
# Keep only the rating rows written by the users listed in `y`.
ratingsMore200 = ratings.loc[ratings['user_id'].isin(y)]

In [25]:
ratingsMore200.head()

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [26]:
ratingsMore200.shape

(526356, 3)

In [27]:
books.head(2)

Unnamed: 0,ISBN,title,author,year,publisher,img_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [28]:
# Attach the book metadata to every rating row by joining on ISBN.
rating_with_books = pd.merge(ratingsMore200, books, on="ISBN")
rating_with_books.head(2)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,img_url
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...


In [29]:
rating_with_books.shape

(487671, 8)

In [30]:
# Number of rating rows per title (all ISBNs sharing a title are counted together).
ratings_by_title = rating_with_books.groupby('title')['rating']
num_rating = ratings_by_title.agg('count').reset_index()
num_rating.head()

Unnamed: 0,title,rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


We can see that some books have only one rating. We can't rely on a single rating (it might just come from a friend of the author), and such books would distort the analysis, so we will keep only the books that have at least 50 ratings.

In [31]:
# The count column is still called "rating" after the groupby; rename it so it
# does not clash with the per-row rating when merged back. Re-bind instead of
# using `inplace=True` to avoid hidden in-place mutation.
num_rating = num_rating.rename(columns={"rating": "num_of_rating"})

In [32]:
num_rating.head(2)

Unnamed: 0,title,num_of_rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1


In [33]:
# Add the per-title rating count to every rating row.
final_rating = pd.merge(rating_with_books, num_rating, on="title")
final_rating.head(2)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,img_url,num_of_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82


Now that we have this dataset, we can drop the books whose num_of_rating is below 50 and work only with books that have at least 50 ratings.

In [34]:
# Keep only the rows of titles with at least 50 ratings, then list the
# remaining counts as a sanity check.
final_rating = final_rating.loc[final_rating['num_of_rating'].ge(50)]
final_rating['num_of_rating'].unique()

array([ 82, 133, 108,  79,  77,  91, 103, 209, 105, 115,  55, 114, 116,
        68, 270, 146,  53,  83, 137,  66,  86,  94,  78,  84, 180, 142,
        71,  58, 224, 120,  61, 136,  67,  52,  56, 121,  64, 123, 183,
       138,  62,  81, 159, 186, 171, 149,  54, 150, 124,  95, 148,  80,
       132,  93,  90,  50, 126,  87,  51,  92,  70, 111,  73,  57, 363,
       135,  96, 140, 182, 175, 130, 118, 127, 122,  69,  63,  97, 210,
       236, 213, 119,  89,  88,  60, 112, 101,  65, 110, 228,  85, 104,
       151,  74, 227,  76, 179,  59,  72, 188, 169, 134, 161, 241, 212,
        98, 100, 277, 174, 131, 125,  99, 107, 128, 206, 181, 168, 117,
       109, 113, 145, 200, 230, 177, 143,  75, 185, 163, 129, 162, 193],
      dtype=int64)

So we have successfully removed the books with fewer than 50 ratings.

In [35]:
final_rating.shape

(61853, 9)

In [36]:
# Every row whose (user_id, title) pair occurs more than once; keep=False marks
# all members of each duplicate group, not just the repeats.
duplicate_rows_subset = final_rating.loc[
    final_rating.duplicated(subset=['user_id', 'title'], keep=False)
]
print("Duplicate Rows:", duplicate_rows_subset, sep="\n")

Duplicate Rows:
        user_id        ISBN  rating              title  \
680       11676  006440188X      10  The Secret Garden   
692      174304  006440188X       9  The Secret Garden   
700      252695  006440188X       0  The Secret Garden   
702       11676  0440977096      10  The Secret Garden   
706      174791  0440977096       0  The Secret Garden   
...         ...         ...     ...                ...   
228839   162639  0553292722       0       Still Waters   
228845   185233  0553292722       0       Still Waters   
228875   113270  0743439651       0       Still Waters   
228876   162639  0743439651       0       Still Waters   
228884   185233  074350996X       0       Still Waters   

                         author  year                   publisher  \
680     Frances Hodgson Burnett  1998                HarperTrophy   
692     Frances Hodgson Burnett  1998                HarperTrophy   
700     Frances Hodgson Burnett  1998                HarperTrophy   
702     Fra

In [37]:
# Inspect one duplicate group: user 11676's ratings of 'The Secret Garden'.
filtered_rows = duplicate_rows_subset.loc[
    duplicate_rows_subset['user_id'].eq(11676)
    & duplicate_rows_subset['title'].eq('The Secret Garden')
]
print(filtered_rows)

     user_id        ISBN  rating              title                   author  \
680    11676  006440188X      10  The Secret Garden  Frances Hodgson Burnett   
702    11676  0440977096      10  The Secret Garden  Frances Hodgson Burnett   
710    11676  0879236493       9  The Secret Garden  Frances Hodgson Burnett   

     year                  publisher  \
680  1998               HarperTrophy   
702  1989                 Laure Leaf   
710  1987  David R. Godine Publisher   

                                               img_url  num_of_rating  
680  http://images.amazon.com/images/P/006440188X.0...             79  
702  http://images.amazon.com/images/P/0440977096.0...             79  
710  http://images.amazon.com/images/P/0879236493.0...             79  


As we can see, the same user has sometimes rated the same title more than once (under different ISBNs/editions), so we keep only one rating per user and title.

In [38]:
# Keep one row per (user_id, title) pair (pandas keeps the first occurrence by
# default). `final_rating` was produced by a boolean filter, so dropping in
# place may raise SettingWithCopyWarning on older pandas; re-bind the result.
final_rating = final_rating.drop_duplicates(subset=['user_id', 'title'])

In [39]:
final_rating.shape

(59850, 9)

Now that the data is reasonably clean, we pivot it into a matrix whose columns are the users, whose index is the book title, and whose values are the rating each user gave to each book.
For example:

In [40]:
# Title x user matrix of ratings; cells are NaN where the user has no rating
# row for that title.
book_pivot = pd.pivot_table(
    final_rating,
    values="rating",
    index="title",
    columns="user_id",
)
book_pivot.head(2)

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,


In [41]:
# Replace "no rating" (NaN) with 0 so the matrix is fully numeric. Re-bind
# instead of filling in place: the previous cell already displayed this frame.
# NOTE: 0 also occurs as a value in the rating column, so after this step
# "not rated" and "rated 0" can no longer be told apart.
book_pivot = book_pivot.fillna(0)

In [42]:
book_pivot.head()

user_id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0


There are a lot of zeros in this matrix. If we compute distances on it as is, most of the work is spent comparing 0 with 0 between books, which wastes time and memory. To avoid that we can use a CSR (Compressed Sparse Row) matrix from SciPy: it stores only the non-zero values, so the zeros do not have to be stored or processed.

In [43]:
from scipy.sparse import csr_matrix

In [44]:
# Compressed Sparse Row copy of the title x user matrix (stores only non-zero cells).
dense_ratings = book_pivot.to_numpy()
book_sparse = csr_matrix(dense_ratings)

In [45]:
book_sparse

<742x888 sparse matrix of type '<class 'numpy.float64'>'
	with 14942 stored elements in Compressed Sparse Row format>

We will pass this matrix to the model we are going to use: `NearestNeighbors` from scikit-learn.

In [46]:
from sklearn.neighbors import NearestNeighbors

In [47]:
# Exhaustive (brute-force) neighbour search; no other argument is passed, so
# every remaining setting keeps the library default.
model = NearestNeighbors(algorithm='brute') # brute-force search is used for this model

In [48]:
model.fit(book_sparse) #training the model

NearestNeighbors(algorithm='brute')

## model evaluation

In [49]:
final_rating

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,img_url,num_of_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
...,...,...,...,...,...,...,...,...,...
236701,255489,0553579983,7,And Then You Die,Iris Johansen,1998,Bantam,http://images.amazon.com/images/P/0553579983.0...,50
236702,256407,0553579983,0,And Then You Die,Iris Johansen,1998,Bantam,http://images.amazon.com/images/P/0553579983.0...,50
236703,257204,0553579983,0,And Then You Die,Iris Johansen,1998,Bantam,http://images.amazon.com/images/P/0553579983.0...,50
236704,261829,0553579983,0,And Then You Die,Iris Johansen,1998,Bantam,http://images.amazon.com/images/P/0553579983.0...,50


Now that the model is trained, let's look at its results. For example, let's ask for the 6 nearest neighbours of a Harry Potter book, which sits at row index 237 of `book_pivot` (the first neighbour returned is the book itself, at distance 0).

In [50]:
# Query with row 237 as a 1 x n_users array; returns distances and row indices
# of the 6 closest titles.
query_row = book_pivot.iloc[[237]].values
distance, suggestion = model.kneighbors(query_row, n_neighbors=6)

In [51]:
distance

array([[ 0.        , 68.78953409, 69.5413546 , 72.64296249, 76.83098333,
        77.28518616]])

In [52]:
suggestion

array([[237, 240, 238, 241, 184, 536]], dtype=int64)

Now that we have the indexes of the books suggested by the model, we can print their titles like this:

In [53]:
book_pivot.index

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       '84 Charing Cross Road', 'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Cry In The Night',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='title', length=742)

In [54]:
# `suggestion` has one row of neighbour indices per query; map each row to titles.
for neighbor_ids in suggestion:
    print(book_pivot.index[neighbor_ids])

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='title')


In [55]:
import numpy as np
# Row position of the title in `book_pivot` (first match).
title_matches = book_pivot.index == 'Harry Potter and the Chamber of Secrets (Book 2)'
book_id = np.flatnonzero(title_matches)[0]

In [56]:
book_id

237

We store all the book titles in a variable so we can use them later in our recommendation system.

In [58]:
books_name = book_pivot.index

After creating and training our model, we save it to disk together with the data the recommender needs.

In [59]:
import pickle

# Persist everything the recommender needs. Each file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes; the
# bare `open(...)` calls never closed their handles explicitly.

# save the model
with open('model.pkl', 'wb') as fh:
    pickle.dump(model, fh)

# index of book titles (row labels of book_pivot)
with open('books_name.pkl', 'wb') as fh:
    pickle.dump(books_name, fh)

# cleaned rating rows with book metadata
with open('final_rating.pkl', 'wb') as fh:
    pickle.dump(final_rating, fh)

# title x user rating matrix the model is queried with
with open('book_pivot.pkl', 'wb') as fh:
    pickle.dump(book_pivot, fh)


Let's see how the recommendation system works.

In [60]:
def recommend_book(book_name, n_neighbors=6):
    """Print the titles of the books closest to ``book_name``.

    The row of ``book_pivot`` belonging to ``book_name`` is passed to the
    fitted ``model`` and the titles of the returned neighbours are printed one
    per line, in the order the model returns them. In the notebook's own run
    the queried title comes back first (distance 0), so it is printed too.

    Parameters
    ----------
    book_name : str
        Title exactly as it appears in ``book_pivot.index``.
    n_neighbors : int, default 6
        Number of neighbours requested from the model.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``book_pivot.index``.
    """
    matches = np.flatnonzero(book_pivot.index == book_name)
    if matches.size == 0:
        # Same exception type the bare ``[0][0]`` lookup raised, but with a
        # message that names the offending title.
        raise IndexError("Unknown book title: %r" % (book_name,))

    # kneighbors expects a 2-D array: one query row, one column per user.
    query_row = book_pivot.iloc[[matches[0]]].values
    _, neighbor_ids = model.kneighbors(query_row, n_neighbors=n_neighbors)

    # A single query row was passed, so only the first result row is relevant.
    for title in book_pivot.index[neighbor_ids[0]]:
        print(title)

In [61]:
#np.where(book_pivot.index == "4 Blondes")[0][0]
book_name = 'Harry Potter and the Chamber of Secrets (Book 2)'
recommend_book(book_name)

Harry Potter and the Chamber of Secrets (Book 2)
Harry Potter and the Prisoner of Azkaban (Book 3)
Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Sorcerer's Stone (Book 1)
Exclusive
The Cradle Will Fall


In [62]:
# load model
# The fitted estimator was saved to 'model.pkl'; 'book_pivot.pkl' holds the
# rating matrix, not the model. The `with` block closes the file after loading.
with open('model.pkl', 'rb') as fh:
    loaded_model = pickle.load(fh)

In [5]:
import pickle

# NOTE(review): this file name matches the pickled title index, not an
# estimator, so `loaded_model` is a misleading name; it is kept in case later
# code refers to it. Confirm and rename. Only unpickle files you produced yourself.
with open('webapp/books_name.pkl', 'rb') as fh:
    loaded_model = pickle.load(fh)