In [132]:
# Importing all the required packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [133]:
# Import the dataset and convert it to Pandas DataFrame
books = pd.read_csv('Dataset/BX-Books.csv',delimiter=';',on_bad_lines='skip',encoding='latin')

  books = pd.read_csv('Dataset/BX-Books.csv',delimiter=';',on_bad_lines='skip',encoding='latin')


In [134]:
# There are various sizes of the book covers
# I'll be using the large size image for the website
# Feature Selection: I will remove the columns Image-URL-S, Image-URL-M and

books = books[['ISBN','Book-Title','Book-Author','Year-Of-Publication','Publisher','Image-URL-L']]
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [135]:
# The column names in the csv file are long
# So, I am shortening them to something simple

books.rename(columns={
    "Book-Title": "title",
    "Book-Author": "author",
    "Year-Of-Publication": "year",
    "Publisher": "publisher",
    "Image-UNL-L":"img_url"},inplace = True)

In [136]:
books.head()

Unnamed: 0,ISBN,title,author,year,publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [137]:
# Importing the BX-Users.csv

users = pd.read_csv('Dataset/BX-Users.csv',delimiter = ";",on_bad_lines = 'skip', encoding='latin-1')
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [138]:
ratings = pd.read_csv('Dataset/BX-Book-Ratings.csv',delimiter = ";",on_bad_lines = 'skip', encoding='latin-1')
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [139]:
# Editing the names of the columns in the rating csv

ratings.rename(columns = {
    "User-ID":"user_id",
    "Book-Rating":"rating"
},inplace = True)

ratings.head()

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [140]:
# Creating a boolean series x (condition is: any user that has rated more than 20 books)

x = ratings['user_id'].value_counts() > 20

# Boolean indexing uses this boolean Series to filter the original Series or DataFrame
# It selects rows or elements where the corresponding boolean value is True

y = x[x].index

# Filtered out all the people that have more than 20 book ratings

ratings = ratings[ratings['user_id'].isin(y)]

ratings.head()

Unnamed: 0,user_id,ISBN,rating
31,276762,034544003X,0
32,276762,0380000059,0
33,276762,0380711524,5
34,276762,0451167317,0
35,276762,0451454952,0


In [141]:
# Merging the ratings with the books
# This would store the ratings and book names in the same dataframe

ratings_with_books = ratings.merge(books,on="ISBN")
ratings_with_books.head()

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,Image-URL-L
0,276762,034544003X,0,Southampton Row (Charlotte &amp; Thomas Pitt N...,Anne Perry,2002,Ballantine Books,http://images.amazon.com/images/P/034544003X.0...
1,29259,034544003X,0,Southampton Row (Charlotte &amp; Thomas Pitt N...,Anne Perry,2002,Ballantine Books,http://images.amazon.com/images/P/034544003X.0...
2,35050,034544003X,0,Southampton Row (Charlotte &amp; Thomas Pitt N...,Anne Perry,2002,Ballantine Books,http://images.amazon.com/images/P/034544003X.0...
3,98391,034544003X,9,Southampton Row (Charlotte &amp; Thomas Pitt N...,Anne Perry,2002,Ballantine Books,http://images.amazon.com/images/P/034544003X.0...
4,134797,034544003X,0,Southampton Row (Charlotte &amp; Thomas Pitt N...,Anne Perry,2002,Ballantine Books,http://images.amazon.com/images/P/034544003X.0...


In [142]:
# Grouping the books with the number of ratings
# When groupby is performed (or any other aggregation), 
# the resulting dataframe has no index at all
# reset_index moves the current index
# into columns and resets the index to the default integer index (0, 1, 2, ...).

num_rating = ratings_with_books.groupby('title')['rating'].count().reset_index()
num_rating.head()

Unnamed: 0,title,rating
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [143]:
# Renamed rating to number_of_rating

num_rating.rename(columns={'rating':'number_of_rating'},inplace=True)

# Merging the ratings with number of ratings
# This is to filter out all the books that have more than 20 ratings
# The lesser rating books might not give accurate recommendations

final_rating = ratings_with_books.merge(num_rating,on="title")
final_rating.head()

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,Image-URL-L,number_of_rating
0,276762,034544003X,0,Southampton Row (Charlotte &amp; Thomas Pitt N...,Anne Perry,2002,Ballantine Books,http://images.amazon.com/images/P/034544003X.0...,13
1,29259,034544003X,0,Southampton Row (Charlotte &amp; Thomas Pitt N...,Anne Perry,2002,Ballantine Books,http://images.amazon.com/images/P/034544003X.0...,13
2,35050,034544003X,0,Southampton Row (Charlotte &amp; Thomas Pitt N...,Anne Perry,2002,Ballantine Books,http://images.amazon.com/images/P/034544003X.0...,13
3,98391,034544003X,9,Southampton Row (Charlotte &amp; Thomas Pitt N...,Anne Perry,2002,Ballantine Books,http://images.amazon.com/images/P/034544003X.0...,13
4,134797,034544003X,0,Southampton Row (Charlotte &amp; Thomas Pitt N...,Anne Perry,2002,Ballantine Books,http://images.amazon.com/images/P/034544003X.0...,13


In [144]:
# Now, filtering out all the books that have more than 30 ratings

final_rating = final_rating[final_rating['number_of_rating'] >= 50]
final_rating.head()

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,Image-URL-L,number_of_rating
55,276762,451167317,0,The Dark Half,Stephen King,1994,Signet Book,http://images.amazon.com/images/P/0451167317.0...,169
56,254,451167317,8,The Dark Half,Stephen King,1994,Signet Book,http://images.amazon.com/images/P/0451167317.0...,169
57,2276,451167317,0,The Dark Half,Stephen King,1994,Signet Book,http://images.amazon.com/images/P/0451167317.0...,169
58,5543,451167317,0,The Dark Half,Stephen King,1994,Signet Book,http://images.amazon.com/images/P/0451167317.0...,169
59,6900,451167317,9,The Dark Half,Stephen King,1994,Signet Book,http://images.amazon.com/images/P/0451167317.0...,169


In [145]:
# Now, we need to drop all the duplicate values of title and user_id

final_rating.drop_duplicates(['user_id','title'],inplace = True)
final_rating.shape

(175082, 9)

In [146]:
# Making the pivot table for clustering
# Filling all the null values to 0
# This is logical, cuz the user might not have read the book
# The non reading people would be in the same cluster

book_pivot = final_rating.pivot_table(columns='user_id',index='title',values='rating')
book_pivot.fillna(0,inplace=True)
book_pivot.head()

user_id,183,242,243,254,383,388,408,446,487,503,...,278194,278202,278221,278356,278418,278535,278582,278633,278843,278851
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [147]:
# Handling the zeroes
# CSR Matrices only store the non-zero values, hence more efficient

book_sparse = csr_matrix(book_pivot)
book_sparse

<1713x6515 sparse matrix of type '<class 'numpy.float64'>'
	with 57245 stored elements in Compressed Sparse Row format>

In [148]:
# Declaring an instance of the model
# I am using the Nearest Neighbour algorithm for clutering

model = NearestNeighbors(algorithm = "brute")

In [149]:
# Fiting the pivot table (dataset) to the model

model.fit(book_sparse)

In [150]:
books_name = book_pivot.index

In [151]:
def recommend_book(book_name):
    book_id = np.where(book_pivot.index == book_name)[0][0]
    distance, suggestion = model.kneighbors(book_pivot.iloc[book_id,:].values.reshape(1,-1),num_books)
    for i in range(len(suggestion)):
        books = book_pivot.index[suggestion[i]]
        for j in books:
            print(j)

In [152]:
book_name = input("Name the book you like")
num_books = 8
recommend_book(book_name)

Jane Eyre (Penguin Classics)
Honor Among Thieves
Where You Belong
Monster Blood (Goosebumps, No 3)
Whirlwind
From the Heart: Tonight and Always/A Matter of Choice/Endings and Beginnings
A Sudden Change of Heart
Ground Zero and Beyond
