# **Book Recommendation System (Collaborative-Based) - Personal Project**

### Import all the required libraries

In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Retrieve and store the datasets into Pandas Data Frames

In [2]:
books_org_df = pd.read_csv('datasets/books.csv', delimiter='\;', encoding='cp1252', error_bad_lines=False, warn_bad_lines=False)
users_ratings_org_df = pd.read_csv('datasets/users_ratings.csv', delimiter='\;', encoding='cp1252', error_bad_lines=False, warn_bad_lines=False)

  books_org_df = pd.read_csv('datasets/books.csv', delimiter='\;', encoding='cp1252', error_bad_lines=False, warn_bad_lines=False)
  users_ratings_org_df = pd.read_csv('datasets/users_ratings.csv', delimiter='\;', encoding='cp1252', error_bad_lines=False, warn_bad_lines=False)


### Data Cleaning

In [3]:
# Get copies of the original datasets
books_df = books_org_df.copy()
users_ratings_df = users_ratings_org_df.copy()

# Renaming all the columns
books_df.columns = ['ISBN', 'Title', 'Author', 'Publication Year', 'Publisher', 'Image URL-S', 'Image URL-M', 'Image URL-L']
users_ratings_df.columns = ['User ID', 'ISBN', 'Rating']

# Remove all double quotes from the values in all of the columns
for column in books_df:
    books_df[column] = books_df[column].str.replace('"', '')
    
for column in users_ratings_df:
    users_ratings_df[column] = users_ratings_df[column].str.replace('"', '')
    

In [4]:
# Drop unneeded features
books_df = books_df.drop(['Image URL-S', 'Image URL-M', 'Image URL-L'], axis=1)


### Building the Recommendation System

In [5]:
# Get the target user's dataset
input_org_df = pd.read_csv('datasets/target_user.csv')


In [6]:
# Get a copy of the target user's dataset
input_df = input_org_df.copy()

# Get the details about the target books from books dataset
input_Id = books_df[books_df['Title'].isin(input_df['Title'].tolist())]
input_df = pd.merge(input_Id, input_df)

# Get the users' ratings for the target books
user_subset = users_ratings_df[users_ratings_df['ISBN'].isin(input_df['ISBN'].tolist())]

# Group the users' ratings into subsets based on user's ID
user_subset_group = user_subset.groupby(['User ID'])

# Sort the user subsets to get the users that read the most amount of books in target books
user_subset_group = sorted(user_subset_group, key=lambda x: len(x[1]), reverse=True)

In [7]:
# Use Pearson Correlation to compute the Similarity Index between the target user and each of the other users
pearsonCorrelationDict = {}

for user_id, group in user_subset_group:
    group = group.sort_values(by='ISBN')
    nRatings = len(group)
    temp_df = input_df[input_df['ISBN'].isin(group['ISBN'].tolist())]
    temp_ratings_list = temp_df['Rating'].tolist()
    temp_group_list = group['Rating'].tolist()
    
    temp_ratings_list = list(map(int, temp_ratings_list))
    temp_group_list = list(map(int, temp_group_list))
    
    Sxx = sum([i**2 for i in temp_ratings_list]) - pow(sum(temp_ratings_list), 2) / float(nRatings)
    Syy = sum([i**2 for i in temp_group_list]) - pow(sum(temp_group_list), 2) / float(nRatings)
    Sxy = sum(i*j for i, j in zip(temp_ratings_list, temp_group_list)) - sum(temp_ratings_list)*sum(temp_group_list)/float(nRatings)
    
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[user_id] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[user_id] = 0
        

In [8]:
# Create a dataframe to store the similarity index results
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['Similarity Index']
pearsonDF['User ID'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))


In [9]:
# Sort the users by their similarity indices
top_users = pearsonDF.sort_values(by='Similarity Index', ascending=False)[0:50]


In [10]:
# Merge each users with their respective similarity index
top_users_ratings = top_users.merge(users_ratings_df, left_on='User ID', right_on='User ID', how='inner')
top_users_ratings.head()


Unnamed: 0,Similarity Index,User ID,ISBN,Rating
0,1.0,168639,60391634,0
1,1.0,168639,345318080,7
2,1.0,168639,345339703,0
3,1.0,168639,345339711,10
4,1.0,168639,345339738,0


In [11]:
# Convert values in Rating column to type 'float'
top_users_ratings['Rating'] = pd.to_numeric(top_users_ratings['Rating'])

# Calculate the weighted ratings for each of the users' ratings
top_users_ratings['Weighted Rating'] = top_users_ratings['Similarity Index']*top_users_ratings['Rating']
top_users_ratings.head()


Unnamed: 0,Similarity Index,User ID,ISBN,Rating,Weighted Rating
0,1.0,168639,60391634,0,0.0
1,1.0,168639,345318080,7,7.0
2,1.0,168639,345339703,0,0.0
3,1.0,168639,345339711,10,10.0
4,1.0,168639,345339738,0,0.0


In [12]:
# Create a temporary dataframe to store the Sum of Similiarity Index and the Sum of Weighted Ratings
temp_top_users_ratings = top_users_ratings.groupby('ISBN').sum()[['Similarity Index', 'Weighted Rating']]
temp_top_users_ratings.columns = ['Sum of Similarity Index', 'Sum of Weighted Rating']
temp_top_users_ratings.head()

Unnamed: 0_level_0,Sum of Similarity Index,Sum of Weighted Rating
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
913154,0.612372,4.898979
1047973,1.0,9.0
2157853,0.612372,0.0
2167425,0.612372,0.0
2253097,0.906217,0.0


In [13]:
# Create a dataframe for the recommendation results
recommendation_df = pd.DataFrame()

# Get the Weighted Average Recommendation Score for each book
recommendation_df['Weighted Average Recommendation Score'] = temp_top_users_ratings['Sum of Weighted Rating'] / temp_top_users_ratings['Sum of Similarity Index']
recommendation_df['ISBN'] = temp_top_users_ratings.index
recommendation_df.head()

Unnamed: 0_level_0,Weighted Average Recommendation Score,ISBN
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
913154,8.0,913154
1047973,9.0,1047973
2157853,0.0,2157853
2167425,0.0,2167425
2253097,0.0,2253097


In [14]:
# Sort the Recommendation Scores to get the highest scores on top
recommendation_df = recommendation_df.sort_values(by='Weighted Average Recommendation Score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,Weighted Average Recommendation Score,ISBN
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
0393037525,10.0,0393037525
0156623447,10.0,0156623447
067088040X,10.0,067088040X
0192834193,10.0,0192834193
0385496095,10.0,0385496095
0192833553,10.0,0192833553
019282760X,10.0,019282760X
140003065X,10.0,140003065X
0192803735,10.0,0192803735
0670839531,10.0,0670839531


### Recommendation Results

In [21]:
# Retrieve the books with the top 20 highest recommendation scores
books_df.loc[books_df['ISBN'].isin(recommendation_df.head(20)['ISBN'].tolist())]

Unnamed: 0,ISBN,Title,Author,Publication Year,Publisher
5500,0385496095,Traveling Mercies: Some Thoughts on Faith,Anne Lamott,2000,Anchor Books/Doubleday
9925,140003065X,A Fine Balance,Rohinton Mistry,2001,Vintage Books USA
12686,0670839531,Needful Things: The Last Castle Rock Story,Stephen King,1991,Viking Books
32278,019282760X,Pride and Prejudice (World's Classics),Jane Austen,1990,Oxford University Press
37319,0156013983,Le Petit Prince (French Language Edition),Antoine de Saint-ExupÃ©ry,2001,Harvest Books
38898,0446674362,The Black Dahlia,James Ellroy,1998,Warner Books
43636,0192833553,Pride and Prejudice (Oxford World's Classics),Jane Austen,1998,Oxford University Press
48324,015601226X,As Meat Loves Salt (Harvest Original),Maria McCann,2003,Harvest Books
78032,078690108X,"Knights of the Crown (Dragonlance Warriors, Vo...",Roland Green,1995,Wizards of the Coast
103738,0786867663,Creating a Life: Professional Women and the Qu...,Sylvia Ann Hewlett,2002,Miramax
