# Books Recommender

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

In [None]:
# Load the three CSV inputs (book listing, descriptions, ratings); each file is decoded as Latin-1
books, descriptions, ratings = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in (
        'data/books/listing.csv',
        'data/books/description.csv',
        'data/books/ratings.csv',
    )
)

In [None]:
# Preview the first rows of the book listing table
books.head()

In [None]:
# Preview the first rows of the book description table
descriptions.head()

In [None]:
# Preview the first rows of the ratings table
ratings.head()

In [None]:
# Count the distinct user ids that appear in the ratings table
unique_user_count = ratings['user_id'].nunique()
print('The ratings dataset has', unique_user_count, 'unique users')

In [None]:
# Count the distinct book ids that appear in the ratings table
# (the message says "books": this dataset contains books, not movies)
print('The ratings dataset has', ratings['book_id'].nunique(), 'unique books')

## 1. Popularity-Based Recommender

Pick the most popular books (those rated highly by users) and recommend them directly.

In [None]:
# Popularity-based recommendation: show the ten highest rating counts.
# Only the book id and its rating count are kept; rows are ranked by count in
# descending order and repeated (book_id, no_of_ratings) rows are removed
# before the top ten are displayed.
rating_count = pd.DataFrame(ratings, columns=['book_id', 'no_of_ratings'])
(
    rating_count
    .sort_values(by='no_of_ratings', ascending=False)
    .drop_duplicates()
    .head(10)
)

In [None]:
# Average the no_of_ratings column within each book_id group (one row per book)
rating_mean = ratings.groupby('book_id')['no_of_ratings'].mean().to_frame()
rating_mean.head()

In [None]:
# Look up the listing details for the hard-coded ids of the most rated books
most_rated_books = pd.DataFrame(
    {'book_id': [4755, 2409, 2194, 4696, 1616]},
    index=np.arange(5),
)

# merge defaults to an inner join, so only ids present in the listing are kept
detail = most_rated_books.merge(books, on='book_id')
detail

In [None]:
# Keep the identifying and rating-related columns of the ratings table
most_rated_book = pd.DataFrame(ratings, columns=['book_id', 'user_id', 'avg_rating', 'no_of_ratings'])

# Show the row with the highest rating count, i.e. the most rated book.
# A plain .max() would report each column's maximum independently and so mix
# values that belong to different rows.
most_rated_book.nlargest(1, 'no_of_ratings')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of most_rated_book
most_rated_book.describe()

In [None]:
# describe() also works on a single column; here it summarises the author column of the listing
books['author'].describe()

## 2. Content-Based Recommender

We will match books based on their content (description). TF-IDF will be used to vectorize the description column, and cosine similarity will be used to find other similar books. Other vectorization techniques (HashingTF, Word2Vec, BERT, etc.) can be used as well.

In [None]:
# Replace missing descriptions (NaN) with empty strings so the column contains no missing values
descriptions['description'] = descriptions['description'].fillna('')

In [None]:
# TF-IDF vectorizer configured to ignore scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the descriptions and compute the TF-IDF matrix
# (one row per description) required for calculating cosine similarity
books_tfidf = tfidf.fit_transform(descriptions['description'])

In [None]:
# Shape of the TF-IDF matrix: (number of descriptions, number of vocabulary terms)
books_tfidf.shape

In [None]:
# Compute the pairwise similarity matrix with sklearn's linear_kernel (plain dot products).
# TfidfVectorizer L2-normalises each row by default, so the dot product equals the cosine similarity.
# NOTE(review): this variable shadows the cosine_similarity function imported from
# sklearn.metrics.pairwise at the top of the file; after this line the function is no
# longer reachable under that name.
cosine_similarity = linear_kernel(books_tfidf, books_tfidf)

In [None]:
# Rank all books by similarity to one query book and print the five closest matches.
# query_index is the positional row of the query book in `descriptions`
# (and therefore in the similarity matrix).
query_index = 2

# Pair every book's position with its similarity score against the query book
similarity_scores = list(enumerate(cosine_similarity[query_index]))

# Exclude the query book explicitly rather than assuming it sorts first: a book
# with an identical description ties with it, and an empty description scores 0
# against every book (itself included), so slicing off position 0 could remove
# the wrong entry and leave the query book among its own recommendations.
similarity_scores = [pair for pair in similarity_scores if pair[0] != query_index]

# Keep the top 5 most similar books
similarity_scores = sorted(similarity_scores, key=lambda pair: pair[1], reverse=True)[:5]

# Positional indices of the similar books
books_index = [pair[0] for pair in similarity_scores]

# Print the names of the top 5 most similar books
print(descriptions['name'].iloc[books_index])

## 3. Collaborative Filtering

In [None]:
# Build the user-item matrix: one row per user_id, one column per book_id,
# cells hold user_rating (pivot_table averages duplicate user/book pairs and
# leaves pairs without a rating as NaN)
user_item = ratings.pivot_table(values='user_rating', index='user_id', columns='book_id')
# Treat any empty-string cells as missing values too
user_item.replace('', np.nan, inplace=True)
user_item.head()

In [None]:
# Normalize the user-item matrix: centre each user's ratings on that user's own mean rating
user_item_norm = user_item.sub(user_item.mean(axis='columns'), axis='index')
user_item_norm.head()

In [None]:
# User-user similarity: Pearson correlation between users' rating rows.
# DataFrame.corr compares columns, so the matrix is transposed to put users in columns.
user_similarity = user_item_norm.transpose().corr(method='pearson')
user_similarity.head()

In [None]:
# Item-item similarity: Pearson correlation between the book columns of the normalized matrix
# (DataFrame.corr works column-wise and, for each pair of books, ignores users with a missing value)
item_similarity = user_item_norm.corr(method='pearson')
item_similarity.head()

In [None]:
# Pick a user ID
target_userid = 3472

# Pick a book
target_bookid = 4755

# Books that the target user has rated, as a single-column frame indexed by
# book_id whose only column is labelled 0. Values come from the normalized
# matrix, so they are mean-centred ratings.
target_userid_rated = (
    user_item_norm.loc[[target_userid]]  # one-row frame for the target user
    .dropna(axis=1, how='all')           # drop the books this user has not rated
    .reset_index(drop=True)              # discard user_id directly instead of adding it as a column and dropping it again
    .T                                   # books become rows
)
target_userid_rated.head()

In [None]:
# Similarity score of the target_bookid with all the other books, as a
# single-column frame indexed by book_id whose only column is labelled 0
target_book_similarity_score = (
    item_similarity.loc[[target_bookid]]  # one-row frame for the target book
    .dropna(axis=1, how='all')            # drop books with no computable similarity
    .reset_index(drop=True)               # discard the book_id index directly instead of adding it as a column and dropping it again
    .T                                    # books become rows
)
target_book_similarity_score.head()

In [None]:
# Rank the similarities between the books: most similar to the target book first.
# The single score column is selected by position and then named, so this does
# not depend on how that column happens to be labelled.
target_book_similarity = (
    target_book_similarity_score.iloc[:, 0]
    .rename('similarity_score')
    .drop(labels=target_bookid, errors='ignore')  # a book's similarity to itself is not informative
    .sort_values(ascending=False)
    .to_frame()
)
target_book_similarity.head()

In [None]:
# Rank the similarities between target user and target book: pair every book the
# target user rated with its similarity to the target book and keep the 5 most
# similar ones. 'rating' holds the user's mean-centred rating because
# target_userid_rated is derived from the normalized user-item matrix.
target_userid_rated_similarity = (
    pd.concat(
        [
            target_userid_rated.iloc[:, 0].rename('rating'),
            target_book_similarity_score.iloc[:, 0].rename('similarity_score'),
        ],
        axis=1,
        join='inner',  # only books that have both a rating and a similarity score
    )
    .drop(index=target_bookid, errors='ignore')  # never use the target book to predict itself
    .dropna()
    .sort_values('similarity_score', ascending=False)
    .head(5)  # select top 5
)
target_userid_rated_similarity

In [None]:
# Predicted rating: average of the 'rating' column weighted by the matching
# 'similarity_score' values, rounded to 6 decimal places
predicted_rating = round(
    np.average(
        target_userid_rated_similarity['rating'],
        weights=target_userid_rated_similarity['similarity_score'],
    ),
    6,
)
print(f'The predicted rating for {target_bookid} by user {target_userid} is {predicted_rating}' )