In [19]:
# Import the datasets
import pandas as pd
url = 'https://raw.githubusercontent.com/GoldbergData/Machine-Learning-Book-Ratings/master/data/clean/books_clean.csv'
books = pd.read_csv(url,parse_dates=[0])
books = pd.DataFrame(books)
books.head(2)

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada


In [47]:
url = 'https://raw.githubusercontent.com/GoldbergData/Machine-Learning-Book-Ratings/master/data/clean/books_users_ratings.csv'
books_ratings = pd.read_csv(url,parse_dates=[0])
books_ratings.head(2)

Unnamed: 0,user_id,isbn,book_rating,book_title,book_author,year_of_publication,publisher,unique_isbn,Unnamed: 0.1,age,city,state,country
0,276726,0155061224,5,Rites of Passage,Judith Rae,2001.0,Heinle,0155061224,276725,34.786876,seattle,washington,usa
1,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999.0,Cambridge University Press,052165615X,276728,16.0,rijeka,,croatia


In [6]:
url = 'https://raw.githubusercontent.com/GoldbergData/Machine-Learning-Book-Ratings/master/data/clean/ratings_clean.csv'
ratings = pd.read_csv(url,parse_dates=[0])
ratings.head(2)

Unnamed: 0,user_id,isbn,book_rating
1,276726,0155061224,5
3,276729,052165615X,3


In [7]:
url = 'https://raw.githubusercontent.com/GoldbergData/Machine-Learning-Book-Ratings/master/data/clean/users_clean.csv'
users = pd.read_csv(url,parse_dates=[0])
users.head(2)

Unnamed: 0,user_id,age,city,state,country
0,1,34.786876,nyc,new york,usa
1,2,18.0,stockton,california,usa


In [9]:
print(books.shape)
print(users.shape)
print(books_ratings.shape)
print(ratings.shape)

(266732, 5)
(277332, 5)
(375580, 13)
(433671, 3)


In [14]:
ratings_count = pd.DataFrame(ratings.groupby('isbn')['book_rating'].count())
ratings_count.sort_values('book_rating', ascending=False).head(5)
# highest rating count of 707 for ISBN 0316666343

Unnamed: 0_level_0,book_rating
isbn,Unnamed: 1_level_1
316666343,707
971880107,581
385504209,487
312195516,383
679781587,333


In [49]:
# use books_ratings dataset
books_ratings_new = books_ratings.drop(['book_author','year_of_publication','publisher','Unnamed: 0.1','age','city','state','country'], axis = 1)
books_ratings_new.head(5)
#books_ratings_new = books_ratings['user_id','isbn','book_rating','book_title'] 

Unnamed: 0,user_id,isbn,book_rating,book_title,unique_isbn
0,276726,0155061224,5,Rites of Passage,0155061224
1,276729,052165615X,3,Help!: Level 1,052165615X
2,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,0521795028
3,276744,038550120X,7,A Painted House,038550120X
4,11676,038550120X,10,A Painted House,038550120X


In [51]:
# Group by book titles and create a new column for total ratings count
book_rating_count = (books_ratings_new.groupby(by = ['book_title'])['book_rating'].count().reset_index().rename(columns = {'book_rating':'total_rating_count'})[['book_title','total_rating_count']])

In [52]:
book_rating_count.head(5)

Unnamed: 0,book_title,total_rating_count
0,A Light in the Storm: The Civil War Diary of ...,1
1,"Ask Lily (Young Women of Faith: Lily Series, ...",1
2,Dark Justice,1
3,Earth Prayers From around the World: 365 Pray...,7
4,Final Fantasy Anthology: Official Strategy Gu...,2


In [54]:
rating_with_totalRatingCount = books_ratings_new.merge(book_rating_count, left_on = 'book_title', right_on = 'book_title', how = 'left')
rating_with_totalRatingCount.head(5)

Unnamed: 0,user_id,isbn,book_rating,book_title,unique_isbn,total_rating_count
0,276726,0155061224,5,Rites of Passage,0155061224,1
1,276729,052165615X,3,Help!: Level 1,052165615X,1
2,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,0521795028,1
3,276744,038550120X,7,A Painted House,038550120X,365
4,11676,038550120X,10,A Painted House,038550120X,365


In [58]:
# statistics of total rating count
# pd.set_option('display.float_format', lambda x:'%3f' % x)
print(book_rating_count['total_rating_count'].describe())
# describe() calculates the stat metrics for the column 

count   132567.000000
mean         2.833133
std          9.160989
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max        702.000000
Name: total_rating_count, dtype: float64


In [60]:
import numpy as np
# since median = 1.0000, look at the top 10% distribution values
print(book_rating_count['total_rating_count'].quantile(np.arange(0.9,1,0.01)))

0.900000    5.000000
0.910000    5.000000
0.920000    5.720000
0.930000    6.000000
0.940000    7.000000
0.950000    8.000000
0.960000   10.000000
0.970000   13.000000
0.980000   17.000000
0.990000   29.000000
Name: total_rating_count, dtype: float64


In [65]:
# unique books with top 1% of the ratings 
popularity_threshold = 29
rating_pop_book = rating_with_totalRatingCount.query('total_rating_count >= @popularity_threshold')
rating_pop_book.head(5)

Unnamed: 0,user_id,isbn,book_rating,book_title,unique_isbn,total_rating_count
3,276744,038550120X,7,A Painted House,038550120X,365
4,11676,038550120X,10,A Painted House,038550120X,365
5,11676,0671537458,8,Waiting to Exhale,0671537458,30
7,11676,0684867621,3,The Girl Who Loved Tom Gordon : A Novel,0684867621,60
13,11676,0440498058,8,A Wrinkle In Time,0440498058,81


In [68]:
# merge dataset with users
combined = rating_pop_book.merge(users, left_on = 'user_id', right_on= 'user_id', how = 'left')
combined = combined.drop('age', axis =1)
combined.head()

Unnamed: 0,user_id,isbn,book_rating,book_title,unique_isbn,total_rating_count,city,state,country
0,276744,038550120X,7,A Painted House,038550120X,365,torrance,california,usa
1,11676,038550120X,10,A Painted House,038550120X,365,,,
2,11676,0671537458,8,Waiting to Exhale,0671537458,30,,,
3,11676,0684867621,3,The Girl Who Loved Tom Gordon : A Novel,0684867621,60,,,
4,11676,0440498058,8,A Wrinkle In Time,0440498058,81,,,


In [73]:
# kNN implementation using supervised sklearn.neighbors(brute, metric = cosine)
# the algorithm will calculate the cosine similarity between rating vectors.

from scipy.sparse import csr_matrix

combined = combined.drop_duplicates(['user_id','book_title'])
combined_pivot = combined.pivot(index = 'book_title', columns = 'user_id', values = 'book_rating').fillna(0)
combined_matrix = csr_matrix(combined_pivot.values)

from sklearn.neighbors import NearestNeighbors

knn_base = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knn_base.fit(combined_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [75]:
query_index =  np.random.choice(combined_pivot.shape[0])
distances,indices = knn_base.kneighbors(combined_pivot.iloc[query_index,:].values.reshape(1,-1),n_neighbors = 6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(combined_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, combined_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for The Cat Who Talked to Ghosts:

1: The Cat Who Lived High, with distance of 0.7712712189710385:
2: The Cat Who Knew Shakespeare, with distance of 0.7749485045670069:
3: The Cat Who Said Cheese, with distance of 0.8294669685725472:
4: The Cat Who Smelled a Rat (Cat Who... (Paperback)), with distance of 0.8296169465002219:
5: The Cat Who Could Read Backwards, with distance of 0.8517829445031538:
