# Building Book Recommendation

1. There are two datasets: 1) reviews_Books_5 and 2) metadata
2. The datasets contain product reviews and metadata from Amazon, including 142.8 million reviews spanning May 1996 - July 2014
3. It takes about an hour to retrieve each dataset because of the large volume (8,898,041 reviews)

"""
1) reviews_Books_5 data contains following information: 
reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
asin - ID of the product, e.g. 0000013714
reviewerName - name of the reviewer
helpful - helpfulness rating of the review, e.g. 2/3
reviewText - text of the review
overall - rating of the product
summary - summary of the review
unixReviewTime - time of the review (unix time)
reviewTime - time of the review (raw)
"""

"""
2) meta data contains following information:
asin - ID of the product, e.g. 0000031852
title - name of the product
price - price in US dollars (at time of crawl)
imUrl - url of the product image
related - related products (also bought, also viewed, bought together, buy after viewing)
salesRank - sales rank information
brand - brand name
categories - list of categories the product belongs to
"""

In [None]:
import numpy as np
import pandas as pd
import gzip
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time 
import sklearn.metrics as metrics
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
import sys, os
from contextlib import contextmanager

In [None]:
# read the first file: reviews_Books_5

import pandas as pd
import gzip
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: Path of the gzip file to read.

    Yields:
        The object produced by evaluating each line as a Python expression.
    """
    # The context manager closes the handle once the generator is exhausted,
    # closed, or garbage-collected; previously the file was never closed.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY NOTE(review): eval() executes whatever expression the
            # line contains. Only run this on trusted files; consider
            # ast.literal_eval / json.loads if the data format allows it.
            yield eval(l)

def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, and that
    position becomes the row index of the returned frame.
    """
    records_by_row = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_row, orient='index')

df = getDF('reviews_Books_5.json.gz')

In [None]:
# Check for NULL values: print the number of nulls in each column,
# then plot the fraction of null entries per column as a bar chart.
# (Author's observation from the original run: only 'reviewerName' and 'reviewText' contain null values.)

print(df.isnull().sum())
(df.isnull().sum() / len(df)).plot(kind='bar', figsize=(10, 3))

In [None]:
# Sample data check

print(df.head())
print(df.shape)

In [None]:
# Per-book (asin) summary: mean 'overall' score plus the number of ratings received

average_rating = df.groupby('asin')['overall'].mean().to_frame()
average_rating['ratingCount'] = df.groupby('asin')['overall'].count()
# Most-rated books first; the notebook only renders the final expression.
average_rating.sort_values(by='ratingCount', ascending=False).head()
average_rating.sort_values(by='ratingCount', ascending=False).iloc[:1000]

In [None]:
# To ensure statistical significance, the stated intent is to exclude users with fewer than 400 ratings
# and books with fewer than 400 ratings.
# NOTE(review): the second filter below counts values of 'overall' (the rating score), not 'asin'
# (the book ID). It therefore keeps rating scores that occur at least 400 times rather than books
# with at least 400 ratings -- confirm which behaviour was intended before changing it.

user_counts=df['reviewerID'].value_counts()
df=df[df['reviewerID'].isin(user_counts[user_counts>=400].index)]
overall_counts=df['overall'].value_counts()
df=df[df['overall'].isin(overall_counts[overall_counts>=400].index)]

In [None]:
# Save the modified (filtered) data as df.xlsx
# NOTE(review): absolute, machine-specific Windows path -- adjust before running on another machine.

df.to_excel(r'C:\Users\hahas\df.xlsx')

In [None]:
# read the second file: metadata

import pandas as pd
import gzip
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: Path of the gzip file to read.

    Yields:
        The object produced by evaluating each line as a Python expression.
    """
    # Use a context manager so the gzip handle is closed when iteration ends;
    # previously the file object was left open.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY NOTE(review): eval() runs arbitrary expressions found in
            # the file. Only use on trusted data; consider ast.literal_eval /
            # json.loads if the data format allows it.
            yield eval(l)

def getDF(path):
    """Build a DataFrame from the records yielded by ``parse(path)``.

    Each record is stored under its 0-based position in the file; those
    positions form the row index of the resulting frame.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

df = getDF('metadata.json.gz')

In [None]:
# sample data from 'metadata' file

df.head()

In [None]:
# Retrieve the modified data 'df.xlsx' as 'ratings'
# NOTE(review): absolute, machine-specific Windows path -- adjust before running on another machine.

ratings = pd.read_excel (r'C:\Users\hahas\df.xlsx')

In [None]:
ratings.head()

In [None]:
# Join the reviews ('ratings', 1st dataset) with the product metadata ('df', 2nd dataset) on 'asin'
# so that each review gains information the first dataset lacks (e.g. the book title)

book_rating_combined = ratings.merge(df, on='asin')

In [None]:
book_rating_combined.head()

In [None]:
print(book_rating_combined.shape)

In [None]:
# Check NULL values for the combined data: number of nulls in each column

book_rating_combined.isnull().sum()

In [None]:
# Shrink the merged data by discarding the columns that are not needed

columns = [
    'Unnamed: 0',
    'reviewerName',
    'helpful',
    'reviewText',
    'summary',
    'unixReviewTime',
    'reviewTime',
    'imUrl',
    'salesRank',
    'brand',
]
book_rating_modified = book_rating_combined.drop(columns=columns)
book_rating_modified.head()

In [None]:
# Get rid of rows with 'no titles'
# This is because if a row does not contain the book title, ratings score would not be meaningful

book_rating_modified=book_rating_modified.dropna(axis=0,subset=['title'])  
book_rating_modified.head()

In [None]:
book_rating_modified.shape

In [None]:
book_rating_modified.isnull().sum()

In [None]:
# Build a table holding the number of ratings each book (title) received
# It is used a few steps later to keep only books with at least 'n' ratings

book_ratingCount = (
    book_rating_modified
    .groupby(by=['title'])['overall']
    .count()
    .reset_index(name="totalRatingcount")
)

In [None]:
book_ratingCount.head()

In [None]:
len(book_rating_modified.title.unique())

In [None]:
# Attach each title's rating count (totalRatingcount) to the existing dataset

book_rating_modified_total = pd.merge(
    book_rating_modified,
    book_ratingCount,
    on='title',
    how='left',
)
book_rating_modified_total.head()

In [None]:
# Check the shape of the data
book_rating_modified_total.shape

In [None]:
# Take a look at the summary statistics of totalRatingcount
# On average each book received about 3.2 ratings (this is the number of ratings received, not the score)
# The median book has been rated twice

pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(book_ratingCount['totalRatingcount'].describe())

In [None]:
# The below shows the top of the distribution 
# About 1% of the books received 20 or more ratings

print(book_ratingCount['totalRatingcount'].quantile(np.arange(.9,1,.01)))

In [None]:
# Limit to books that have received 5 or more ratings.
# `>=` keeps books with exactly `popularity_threshold` ratings, matching the
# "5 or more" rule; the previous strict `>` silently required at least 6.

popularity_threshold = 5
book_rating_filtered=book_rating_modified_total.query('totalRatingcount >= @popularity_threshold')
book_rating_filtered.head()

In [None]:
book_rating_filtered.shape

In [None]:
# With selected data, look into the ratings distribution
# The ratings are unevenly distributed and the vast majority of ratings are 4-5
# according to the following plot

plt.rc("font", size =15)
book_rating_filtered.overall.value_counts(sort=True).plot(kind='bar')
plt.title('Rating Distribution\n')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.savefig('system1.png',bbox_inches = 'tight')
plt.show()

# (method1) Top 10 Recommendations based on rating counts & scores 

- a basic recommendation system based on books' popularity by simply counting ratings. 
 - Books with more ratings are considered to be more popular in this model.

In [None]:
# Top 10 books by total score, i.e. the sum of the 'overall' ratings each title received
new_ratings_count = book_rating_filtered.groupby(['title'])['overall'].sum().to_frame()
top10 = new_ratings_count.sort_values(by='overall', ascending=False).iloc[:10]
top10

In [None]:
# Top 10 books by number of ratings received
rating_counts = book_rating_filtered.groupby('title')['overall'].count().to_frame()
top10_c = rating_counts.sort_values(by='overall', ascending=False).iloc[:10]
top10_c

# (method2) Collaborative-filtering based recommendation 

## Part 1 - Build and test the model

In [None]:
# To build the collaborative-filtering model, create a pivot table with one row per title
# and one column per reviewerID holding the 'overall' rating, and fill the missing
# (unrated) cells with zeros.
# The pivot's values are then wrapped in a sparse matrix used to compute distances between the book vectors.
# NOTE(review): when a reviewer has several ratings under the same title, pivot_table combines
# them with its default aggregation (presumably the mean) -- confirm this is acceptable.

from scipy.sparse import csr_matrix
book_rating_filtered_pivot = book_rating_filtered.pivot_table(index = 'title',columns='reviewerID', values='overall').fillna(0)
book_rating_filtered_matrix = csr_matrix(book_rating_filtered_pivot.values)

In [None]:
book_rating_filtered_pivot

In [None]:
# To find the k-Nearest Neighbors, we will use the metric 'cosine' and algorithm 'brute'
# This will compute the cosine similarity between vectors.

model_knn=NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(book_rating_filtered_matrix)

In [None]:
# Try the model on one randomly chosen book.
# Six neighbours are requested; position 0 is treated as the query book itself,
# so the five remaining neighbours are printed together with their distances.

query_index = np.random.choice(len(book_rating_filtered_pivot))
distances, indices = model_knn.kneighbors(
    book_rating_filtered_pivot.iloc[[query_index]].values,
    n_neighbors=6,
)

print('Book Recommendations for {0}:\n'.format(book_rating_filtered_pivot.index[query_index]))
for i in range(1, distances.size):
    print('{0}: {1}, with distance of {2}'.format(
        i,
        book_rating_filtered_pivot.index[indices.ravel()[i]],
        distances.ravel()[i],
    ))

# Part 2 - Define the functions based on item-based approach

## 1) find k Nearest items 2) predict the reviewer’s rating

In [None]:
# This is the generalized version of the previous test model:
# instead of testing the model on a random book, it takes input values
# (a specific book title and an integer k).
# Likewise, this function returns the k books most similar to the given item, along with their similarity values

def findkitems(book_input,k):
    """Print and return the items most similar to ``book_input``.

    A brute-force cosine ``NearestNeighbors`` model is fitted on
    ``book_rating_filtered_pivot`` on every call and queried for ``k + 1``
    neighbours. The row of ``book_input`` itself is skipped when printing.

    Args:
        book_input: Row label (book title) in ``book_rating_filtered_pivot``.
        k: Number of similar items to look up.

    Returns:
        A ``(similarities, indices)`` tuple: ``similarities`` is the flattened
        array of ``1 - distance`` for all ``k + 1`` neighbours (query row
        included when it is returned), and ``indices`` is the array of
        neighbour row positions exactly as returned by ``kneighbors``.
    """
    query_index = book_rating_filtered_pivot.index.get_loc(book_input)
    query_row = book_rating_filtered_pivot.iloc[query_index, :].values.reshape(1, -1)

    model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
    model_knn.fit(book_rating_filtered_pivot)
    distances, indices = model_knn.kneighbors(query_row, n_neighbors = k+1)
    similarities = 1-distances.flatten()

    print ('{0} most similar items for item {1}:\n'.format(k,book_rating_filtered_pivot.index[query_index]))
    for position, neighbour in enumerate(indices.flatten()):
        # The query book is its own neighbour; do not list it.
        if neighbour != query_index:
            print( '{0}: {1} :, with similarity of {2}'.format(position,book_rating_filtered_pivot.index[neighbour], similarities[position]))
    return similarities,indices

In [None]:
similarities,indices=findkitems("The Hunger Games (The Hunger Games, Book 1)",5)

In [None]:
# The function 'predict_itembased' predicts ratings based on item-based CF approach
# The rating for target-item 'i' for active reviewer 'a' can be predicted by using a simple weighted average
# Therefore, it will return a predicted rating that reviewer 'a' will give item 'i'

def predict_itembased(book_input, reviewer, k=5):
    """Predict the rating ``reviewer`` would give ``book_input`` (item-based CF).

    The prediction is the similarity-weighted average of the reviewer's
    ratings for the neighbours of ``book_input`` returned by ``findkitems``.
    Cells equal to 0 in ``book_rating_filtered_pivot`` are included in the
    average as ratings of 0.

    Args:
        book_input: Row label (book title) in ``book_rating_filtered_pivot``.
        reviewer: Column label (reviewer ID) in ``book_rating_filtered_pivot``.
        k: Number of neighbouring items to use (default 5, the previously
            hard-coded value).

    Returns:
        The predicted rating rounded to an int, or 0 when the neighbours'
        similarities sum to zero and no weighted average can be formed.
    """
    query_index = book_rating_filtered_pivot.index.get_loc(book_input)
    query_index_reviewer = book_rating_filtered_pivot.columns.get_loc(reviewer)
    similarities, indices = findkitems(book_input, k)  # similar items, by cosine similarity

    wtd_sum = 0.0
    sum_wt = 0.0
    for neighbour, similarity in zip(indices.flatten(), similarities):
        if neighbour == query_index:
            continue
        rating = book_rating_filtered_pivot.iloc[neighbour, query_index_reviewer]
        wtd_sum += rating * similarity
        # Accumulate the weights of the neighbours actually used instead of
        # assuming the query row is present with a similarity of exactly 1.
        sum_wt += similarity

    if sum_wt > 0:
        prediction = int(round(wtd_sum / sum_wt))
    else:
        # Avoid dividing by zero when every neighbour has zero similarity.
        print('\nNo neighbour with non-zero similarity; no prediction possible.')
        prediction = 0
    print('\nPredicted rating for reviwer {0} -> {1}: {2}'.format(reviewer,book_rating_filtered_pivot.index[query_index],prediction))

    return prediction

In [None]:
prediction = predict_itembased("The Hunger Games (The Hunger Games, Book 1)","A33C08C20U6DJ0")

In [None]:
# The function 'predict_itembased' predicts ratings based on item-based CF approach
# The rating for target-item 'i' for active reviewer 'a' can be predicted by using a simple weighted average
# Therefore, it will return a predicted rating that reviewer 'a' will give item 'i'
# This function excludes ratings with 0 value when computing wtd_sum & sum_wt

def predict_itembased_excluding_zero(book_input, reviewer, k=5):
    """Predict ``reviewer``'s rating for ``book_input``, ignoring 0 ratings.

    The prediction is the similarity-weighted average of the reviewer's
    ratings for the neighbours of ``book_input`` returned by ``findkitems``.
    Neighbours whose cell in ``book_rating_filtered_pivot`` is 0 are left out
    of both the weighted sum and the sum of weights.

    Args:
        book_input: Row label (book title) in ``book_rating_filtered_pivot``.
        reviewer: Column label (reviewer ID) in ``book_rating_filtered_pivot``.
        k: Number of neighbouring items to use (default 5, the previously
            hard-coded value).

    Returns:
        The predicted rating rounded to an int, or 0 when the reviewer has a
        non-zero rating for none of the neighbours (or their similarities sum
        to zero), so no weighted average can be formed.
    """
    query_index = book_rating_filtered_pivot.index.get_loc(book_input)
    query_index_reviewer = book_rating_filtered_pivot.columns.get_loc(reviewer)
    similarities, indices = findkitems(book_input, k)  # similar items, by cosine similarity

    wtd_sum = 0.0
    sum_wt = 0.0
    for neighbour, similarity in zip(indices.flatten(), similarities):
        if neighbour == query_index:
            continue
        rating = book_rating_filtered_pivot.iloc[neighbour, query_index_reviewer]
        if rating == 0:
            # Zero-valued cell: excluded from the average.
            continue
        wtd_sum += rating * similarity
        sum_wt += similarity

    if sum_wt > 0:
        prediction = int(round(wtd_sum / sum_wt))
    else:
        # Previously this case raised ZeroDivisionError (0 / 0).
        print('\nReviewer has no usable rating among the neighbours; no prediction possible.')
        prediction = 0
    print('\nPredicted rating for reviwer {0} -> {1}: {2}'.format(reviewer,book_rating_filtered_pivot.index[query_index],prediction))

    return prediction

In [None]:
prediction = predict_itembased_excluding_zero("The Hunger Games (The Hunger Games, Book 1)","A33C08C20U6DJ0")