In [18]:
#Importing required packages
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [19]:
# Load the two CSV files, keeping only the columns used below and
# pinning compact dtypes. Each mapping drives both `usecols` and `dtype`
# so the column list cannot drift from the dtype declaration.
book_dtypes = {'book_id': 'int32', 'title': 'str'}
rating_dtypes = {'user_id': 'int32', 'book_id': 'int32', 'rating': 'float32'}

books_df = pd.read_csv('books.csv', usecols=list(book_dtypes), dtype=book_dtypes)
user_ratings_df = pd.read_csv('ratings.csv', usecols=list(rating_dtypes), dtype=rating_dtypes)

In [20]:
# Preview the first rows of the books DataFrame (bare last expression -> rich display)
books_df.head()

Unnamed: 0,book_id,title
0,2767052,"The Hunger Games (The Hunger Games, #1)"
1,3,Harry Potter and the Sorcerer's Stone (Harry P...
2,41865,"Twilight (Twilight, #1)"
3,2657,To Kill a Mockingbird
4,4671,The Great Gatsby


In [21]:
# Preview the first rows of the user ratings DataFrame (bare last expression -> rich display)
user_ratings_df.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5.0
1,1,439,3.0
2,1,588,5.0
3,1,1169,4.0
4,1,1185,4.0


In [22]:
# Attach each book's title to its ratings with an inner join on book_id
# (rows whose book_id has no match on the other side are dropped).
# NOTE(review): the books preview shows large book_id values (e.g. 2767052)
# while the ratings preview starts at book_id 1 -- confirm both files use the
# same id space, otherwise this join silently pairs ratings with wrong titles.
merged_dataset = user_ratings_df.merge(books_df, on='book_id')
merged_dataset.head()

Unnamed: 0,book_id,user_id,rating,title
0,1,314,5.0,Harry Potter and the Half-Blood Prince (Harry ...
1,1,439,3.0,Harry Potter and the Half-Blood Prince (Harry ...
2,1,588,5.0,Harry Potter and the Half-Blood Prince (Harry ...
3,1,1169,4.0,Harry Potter and the Half-Blood Prince (Harry ...
4,1,1185,4.0,Harry Potter and the Half-Blood Prince (Harry ...


In [23]:
# Count how many ratings each title received.
# Rows without a title are discarded first so they cannot form a group.
combined_books_rating = merged_dataset.dropna(subset=['title'])

# groupby -> count gives a Series indexed by title; naming the Series before
# reset_index yields the two columns ['title', 'totalRatingCount'] directly.
books_rating_count = (
    combined_books_rating
    .groupby('title')['rating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)
books_rating_count.head()

Unnamed: 0,title,totalRatingCount
0,'Salem's Lot,74
1,"'Tis (Frank McCourt, #2)",100
2,1421: The Year China Discovered America,100
3,1776,100
4,1984,100


In [24]:
# Bring the per-title rating count back onto every rating row.
# A left join keeps every row of combined_books_rating.
dataset_with_totalRatingCount = pd.merge(
    combined_books_rating,
    books_rating_count,
    on='title',
    how='left',
)
dataset_with_totalRatingCount.head()

Unnamed: 0,book_id,user_id,rating,title,totalRatingCount
0,1,314,5.0,Harry Potter and the Half-Blood Prince (Harry ...,100
1,1,439,3.0,Harry Potter and the Half-Blood Prince (Harry ...,100
2,1,588,5.0,Harry Potter and the Half-Blood Prince (Harry ...,100
3,1,1169,4.0,Harry Potter and the Half-Blood Prince (Harry ...,100
4,1,1185,4.0,Harry Potter and the Half-Blood Prince (Harry ...,100


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-title
# rating counts, used to choose a popularity threshold. No plot is drawn here.
# Floats are displayed with three decimals from this point on.
pd.set_option('display.float_format', '{:.3f}'.format)
rating_count_summary = books_rating_count['totalRatingCount'].describe()
print(rating_count_summary)

count   812.000
mean     98.154
std       5.546
min      57.000
25%      99.000
50%     100.000
75%     100.000
max     100.000
Name: totalRatingCount, dtype: float64


In [29]:
# Minimum number of ratings a title needs to be kept
ratings_threshold = 100

# Keep only the rating rows whose title meets the threshold
is_popular = dataset_with_totalRatingCount['totalRatingCount'] >= ratings_threshold
popular_books = dataset_with_totalRatingCount[is_popular]
popular_books.head()

Unnamed: 0,book_id,user_id,rating,title,totalRatingCount
0,1,314,5.0,Harry Potter and the Half-Blood Prince (Harry ...,100
1,1,439,3.0,Harry Potter and the Half-Blood Prince (Harry ...,100
2,1,588,5.0,Harry Potter and the Half-Blood Prince (Harry ...,100
3,1,1169,4.0,Harry Potter and the Half-Blood Prince (Harry ...,100
4,1,1185,4.0,Harry Potter and the Half-Blood Prince (Harry ...,100


In [30]:
# (rows, columns) of the filtered `popular_books` DataFrame
popular_books.shape

(60700, 5)

In [31]:
# Reshape the ratings into a title x user_id matrix.
# pivot_table's default aggregation (mean) applies if a (title, user_id) pair
# occurs more than once; pairs with no rating come out as NaN and are set to 0.
books_rating_pivot = pd.pivot_table(
    popular_books,
    index='title',
    columns='user_id',
    values='rating',
).fillna(0)
books_rating_pivot

user_id,4,7,10,19,22,23,24,27,31,35,...,53390,53393,53403,53406,53409,53416,53419,53420,53422,53424
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'Tis (Frank McCourt, #2)",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
1421: The Year China Discovered America,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
1776,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
1984,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
A Bend in the River,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wuthering Heights,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
Year of Wonders,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
You Shall Know Our Velocity!,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
Zen and the Art of Motorcycle Maintenance: An Inquiry Into Values,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,2.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [36]:
# (rows, columns) of `books_rating_pivot`: one row per title, one column per user_id
books_rating_pivot.shape

(607, 19858)

In [37]:
# Build a brute-force nearest-neighbour index over the title rows, ranked by
# cosine distance. The pivot is stored as a CSR sparse matrix before fitting.
books_rating_pivot_matrix = csr_matrix(books_rating_pivot.to_numpy())
model = NearestNeighbors(algorithm='brute', metric='cosine')
model.fit(books_rating_pivot_matrix)  # last expression: displays the fitted estimator

NearestNeighbors(algorithm='brute', metric='cosine')

In [47]:
# Pick one book (a row position in `books_rating_pivot`) and query its nearest neighbours.
RANDOM_SEED = 42        # fixed so Restart & Run All reproduces the same pick; change it to sample another book
N_RECOMMENDATIONS = 4   # number of books to recommend

rng = np.random.default_rng(RANDOM_SEED)
random_record = int(rng.integers(books_rating_pivot.shape[0]))
print(random_record)

# Ask for one extra neighbour because the query row is part of the fitted data
# and is normally returned as its own closest match. Cap the request at the
# number of fitted rows so a small pivot does not make kneighbors raise.
n_neighbors = min(N_RECOMMENDATIONS + 1, books_rating_pivot.shape[0])
query_vector = books_rating_pivot.iloc[random_record, :].values.reshape(1, -1)
distances, indices = model.kneighbors(query_vector, n_neighbors=n_neighbors)

368


In [48]:
# Print the query book, then its nearest neighbours with their cosine distances.
neighbour_positions = indices.flatten()
neighbour_distances = distances.flatten()

print('Book Name:\n{0}\n'.format(books_rating_pivot.index[random_record]))
print("Recommended books:")

# Drop the query book by row position instead of assuming it is the first
# neighbour: when another title has an identical rating vector the tie can put
# the query book elsewhere in the result, and skipping slot 0 blindly would then
# list the book as its own recommendation and hide a real neighbour.
# The slice keeps the original count (returned neighbours minus one).
recommendations = [
    (position, distance)
    for position, distance in zip(neighbour_positions, neighbour_distances)
    if position != random_record
][:max(len(neighbour_positions) - 1, 0)]

for rank, (position, distance) in enumerate(recommendations, start=1):
    print('{0}) {1}, with a distance of {2}'.format(rank, books_rating_pivot.index[position], distance))

Book Name:
The Autograph Man

Recommended books:
1) Lord of the Flies, with a distance of 0.5610199570655823
2) Jackdaws, with a distance of 0.9022625684738159
3) Drowning Ruth, with a distance of 0.9292013049125671
4) Redwall (Redwall, #1), with a distance of 0.9385129809379578
