# **Book Recommendation System (Content-Based) - Personal Project**

### Import all the required libraries

In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Cleaning and Preprocessing the data

In [2]:
# Store the data into a dataframe
books_org_df = pd.read_csv('datasets/books.csv', error_bad_lines=False, warn_bad_lines=False)

In [3]:
# Copy the original dataframe into a new one
books_df = books_org_df.copy()

# Drop unnecessary features
books_df = books_df.drop(['language', 'averageRating', 'maturityRating', 'publisher', 'pageCount'], axis=1)

# Replace null values in categories and authors
books_df['categories'] = books_df['categories'].fillna('Others')
books_df['authors'] = books_df['authors'].fillna('Unknown')

# Leave only the published year in the publishedDate column
books_df['publishedDate'] = books_df['publishedDate'].astype(str)
books_df['publishedDate'] = books_df['publishedDate'].apply(lambda x: x[:4])

# Clean the categories feature
books_df['categories'] = books_df['categories'].str.replace("[", "").str.replace("]", "").str.replace("'", "")

# Clean the authors feature
books_df['authors'] = books_df['authors'].str.replace("[", "").str.replace("]", "").str.replace("'", "")

# Rename the books' id column
books_df.rename(columns={'Unnamed: 0': 'bookID'}, inplace=True)

books_df.head()

Unnamed: 0,bookID,title,authors,categories,publishedDate
0,0,Sammy Keyes and the Art of Deception,Wendelin Van Draanen,Juvenile Fiction,2009
1,1,Inward Journey,Unknown,Medical,1983
2,2,The Boston Directory ...,Unknown,Boston,1865
3,3,Bring Your Own Devices (BYOD) Survival Guide,Jessica Keyes,Business & Economics,2016
4,4,Enterprise 2.0,Jessica Keyes,Business & Economics,2016


### Split the unique categories, each with their own seperate column

In [4]:
# Get a copy of the books dataframe
books_genres_df = books_df.copy()

# Create a column for each unique category
for index, row in books_df.iterrows():
    books_genres_df.at[index, row['categories']] = 1
    
# Fill in the new columns with 0s
books_genres_df = books_genres_df.fillna(0)
books_genres_df.head()
    

Unnamed: 0,bookID,title,authors,categories,publishedDate,Juvenile Fiction,Medical,Boston,Business & Economics,Computers,...,Bibliography,China,Hairdressing,Securities,Booksellers and bookselling,Catholic literature,English poetry,"Scholars, American",Latin America,Irish
0,0,Sammy Keyes and the Art of Deception,Wendelin Van Draanen,Juvenile Fiction,2009,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Inward Journey,Unknown,Medical,1983,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,The Boston Directory ...,Unknown,Boston,1865,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Bring Your Own Devices (BYOD) Survival Guide,Jessica Keyes,Business & Economics,2016,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Enterprise 2.0,Jessica Keyes,Business & Economics,2016,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Retrieving the target user's dataset

In [5]:
# Store the target user's dataset
input_books_org = pd.read_csv('datasets/target_user.csv')

In [6]:
# Get a copy of the input dataset
input_books = input_books_org.copy()

# Get the book ID of the books in the input books dataset
input_id = books_df[books_df['title'].isin(input_books['title'].tolist())]
input_books = pd.merge(input_id, input_books)
input_books = input_books.drop(['authors', 'publishedDate', 'categories'], axis=1)

# Drop all duplicate rows
input_books = input_books.drop_duplicates(subset='title', keep='last')
input_books

Unnamed: 0,bookID,title,rating
0,59,Lord of Souls,8
9,986,Sammy Keyes and the Skeleton Man,10
10,449,Advances in Information Security and Assurance,10
11,462,A Question of Identity,8
12,736,Compendium of Problems in Genetics,7
13,854,The Shining Wall,9


In [7]:
# Get the categories columns for the input books
target_user_books = books_genres_df[books_genres_df['bookID'].isin(input_books['bookID'].tolist())]
target_user_books

Unnamed: 0,bookID,title,authors,categories,publishedDate,Juvenile Fiction,Medical,Boston,Business & Economics,Computers,...,Bibliography,China,Hairdressing,Securities,Booksellers and bookselling,Catholic literature,English poetry,"Scholars, American",Latin America,Irish
59,59,Lord of Souls,J. Gregory Keyes,Fiction,2011,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
449,449,Advances in Information Security and Assurance,"James (Jong Hyuk) Park, Hsiao-Hwa Chen, Mohamm...",Computers,2009,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
462,462,A Question of Identity,Susan Hill,Fiction,2012,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
736,736,Compendium of Problems in Genetics,"John Kuspira, Ramesh Bhambhani",Science,1994,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
854,854,The Shining Wall,Melissa Ferguson,Fiction,2019,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
986,986,Sammy Keyes and the Skeleton Man,Unknown,Juvenile Fiction,2003,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Compute the Recommendation Score for each book

In [8]:
# Reset the index for the dataframe
target_user_books = target_user_books.reset_index(drop=True)

# Drop all features except for the categories columns
target_user_genres = target_user_books.drop(['bookID', 'title', 'authors', 'categories', 'publishedDate'], axis=1)
target_user_genres

Unnamed: 0,Juvenile Fiction,Medical,Boston,Business & Economics,Computers,Biography & Autobiography,Others,History,Psychology,Law,...,Bibliography,China,Hairdressing,Securities,Booksellers and bookselling,Catholic literature,English poetry,"Scholars, American",Latin America,Irish
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Calculate the weighted genre score by multiplying the categories with the ratings for the respective books
target_user_profile = target_user_genres.transpose().dot(input_books['rating'].to_numpy())
target_user_profile

Juvenile Fiction         9.0
Medical                  0.0
Boston                   0.0
Business & Economics     0.0
Computers               10.0
                        ... 
Catholic literature      0.0
English poetry           0.0
Scholars, American       0.0
Latin America            0.0
Irish                    0.0
Length: 281, dtype: float64

In [10]:
# Get just the categories columns of the main books dataset
genre_table = books_genres_df.set_index(books_genres_df['bookID'])
genre_table = genre_table.drop(['bookID', 'title', 'authors', 'categories', 'publishedDate'], axis=1)
genre_table.head()

Unnamed: 0_level_0,Juvenile Fiction,Medical,Boston,Business & Economics,Computers,Biography & Autobiography,Others,History,Psychology,Law,...,Bibliography,China,Hairdressing,Securities,Booksellers and bookselling,Catholic literature,English poetry,"Scholars, American",Latin America,Irish
bookID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Compute the recommendation score for each book in the books dataset
recommendation_df = ((genre_table * target_user_profile).sum(axis=1))/(target_user_profile.sum())
recommendation_df.sort_values(ascending=False, inplace=True)
recommendation_df

bookID
533    0.480769
703    0.480769
427    0.480769
506    0.480769
423    0.480769
         ...   
630    0.000000
629    0.000000
628    0.000000
627    0.000000
512    0.000000
Length: 1025, dtype: float64

### Retrieve the top 20 books with the highest Recommendation Score

In [12]:
result_df = books_df[books_df['bookID'].isin(recommendation_df.head(20).keys())]
result_df

Unnamed: 0,bookID,title,authors,categories,publishedDate
38,38,No Oath Sworn,Phil Geusz,Fiction,2012
51,51,The Infernal City: An Elder Scrolls Novel,Greg Keyes,Fiction,2009
52,52,Sushi for Beginners,Marian Keyes,Fiction,2009
173,173,Legacy Discovered,Kerry Reis,Fiction,2013
218,218,Imperfect Chemistry,Mary Frame,Fiction,2014
240,240,The Woman Who Stole My Life,Marian Keyes,Fiction,2014
423,423,The Bloody Man,Bevan Amberhill,Fiction,1993
427,427,Devonshire Scream,Laura Childs,Fiction,2016
428,428,Gaywyck,Vincent Virga,Fiction,1980
506,506,اللؤلؤة,جون شتاينبك,Fiction,2018
