In [23]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import re
from PIL import Image
import io
import requests
import random
import urllib.request
import urllib
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords

In [24]:
# Reading Books dataset (details about books)
books=pd.read_csv("Data/Books.csv")
books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [25]:
# Reading ratings dataset (details about users, ratings, and book-id)
rating=pd.read_csv("Data/Ratings.csv")
rating.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [26]:
# Details about users
users=pd.read_csv("Data/Users.csv")
users.head(3)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [27]:
# Shape of the dataset

print("Books Shape: " ,books.shape )
print("Ratings Shape: " ,rating.shape )
print("Users Shape: " ,users.shape )


Books Shape:  (271360, 8)
Ratings Shape:  (1149780, 3)
Users Shape:  (278858, 3)


In [28]:
#Checking for null values

print("Any null values in Books:\n" ,books.isnull().sum())
print()
print("Any null values in Ratings:\n ",rating.isnull().sum())
print()
print("Any null values in Users:\n",users.isnull().sum())

Any null values in Books:
 ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

Any null values in Ratings:
  User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

Any null values in Users:
 User-ID          0
Location         0
Age         110762
dtype: int64


In [29]:
# Merge books and ratings dataset

books_data=books.merge(rating,on="ISBN")
books_data.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0


In [30]:
# make a copy of dataset
df=books_data.copy()

# drop all null values
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)

#drop unnecessary columns
df.drop(columns=["ISBN","Year-Of-Publication","Image-URL-S","Image-URL-M"],axis=1,inplace=True)

#drop ratings equal to 0 i.e. books with no ratings
df.drop(index=df[df["Book-Rating"]==0].index,inplace=True)

# clean book title names
df["Book-Title"]=df["Book-Title"].apply(lambda x: re.sub("[\W_]+"," ",x).strip())
df.head()

Unnamed: 0,Book-Title,Book-Author,Publisher,Image-URL-L,User-ID,Book-Rating
1,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,8,5
3,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,11676,8
5,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,67544,8
8,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,116866,9
9,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,123629,9


# Popularity Based Recommendating System


In [31]:
def popular_books(df,n=100):
    
    #calculate the number of times each book is rated 
    rating_count=df.groupby("Book-Title").count()["Book-Rating"].reset_index()
    rating_count.rename(columns={"Book-Rating":"NumberOfVotes"},inplace=True)
    
    #calculate the average rating of each book
    rating_average=df.groupby("Book-Title")["Book-Rating"].mean().reset_index()
    rating_average.rename(columns={"Book-Rating":"AverageRatings"},inplace=True)
    
    # merge both the dataframes
    popularBooks=rating_count.merge(rating_average,on="Book-Title")
    
    #function to calculate the bayesian average for ranking books
    def weighted_rate(x):
        
        #number of votes
        v=x["NumberOfVotes"]
        
        #average ratings
        R=x["AverageRatings"]
        
        #formula for bayesian average
        bayesian_average = ((v*R) + (m*C)) / (v+C)
        
        return bayesian_average 
    
    
    # caluclate the average of all ratings
    m=popularBooks["AverageRatings"].mean()
    
    # calculating a confidence number which is the 0.90 percentile rating
    C=popularBooks["NumberOfVotes"].quantile(0.90)
    
    # filter books with count of ratings more than 250 for better average
    popularBooks=popularBooks[popularBooks["NumberOfVotes"] >=250]
    popularBooks["Popularity"]=popularBooks.apply(weighted_rate,axis=1)
    popularBooks=popularBooks.sort_values(by="Popularity",ascending=False)
    return popularBooks[["Book-Title","NumberOfVotes","AverageRatings","Popularity"]].reset_index(drop=True).head(n)

In [32]:
n=10
top_ten=pd.DataFrame(popular_books(df,10))
print("10 Most Popular Books")
print('********************')
for i in range(len(top_ten["Book-Title"].tolist())):
    book_name=df.loc[df["Book-Title"]==top_ten["Book-Title"].tolist()[i],"Book-Title"][:1].values[0]
    rating = round(df[df["Book-Title"]==top_ten["Book-Title"].tolist()[i]]["Book-Rating"].mean(),1)
    print(str(i+1)+". "+book_name+"|| Rating: ", rating)

10 Most Popular Books
********************
1. Harry Potter and the Prisoner of Azkaban Book 3|| Rating:  9.0
2. To Kill a Mockingbird|| Rating:  9.0
3. Harry Potter and the Sorcerer s Stone Harry Potter Paperback|| Rating:  8.9
4. Harry Potter and the Chamber of Secrets Book 2|| Rating:  8.8
5. Tuesdays with Morrie An Old Man a Young Man and Life s Greatest Lesson|| Rating:  8.6
6. The Secret Life of Bees|| Rating:  8.5
7. The Da Vinci Code|| Rating:  8.4
8. The Lovely Bones A Novel|| Rating:  8.2
9. The Red Tent Bestselling Backlist|| Rating:  8.2
10. Where the Heart Is Oprah s Book Club Paperback|| Rating:  8.1


# ITEM BASED COLLABORATIVE FILTERING

In [33]:
#clean data same as above 

df=books_data.copy()
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.drop(columns=["ISBN","Year-Of-Publication","Image-URL-S","Image-URL-M"],axis=1,inplace=True)
df.drop(index=df[df["Book-Rating"]==0].index,inplace=True)
df["Book-Title"]=df["Book-Title"].apply(lambda x: re.sub("[\W_]+"," ",x).strip())
df.head()

Unnamed: 0,Book-Title,Book-Author,Publisher,Image-URL-L,User-ID,Book-Rating
1,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,8,5
3,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,11676,8
5,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,67544,8
8,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,116866,9
9,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,123629,9


In [78]:
# function for item based collaborative filtering
def item_based(bookTitle):
    bookTitle=str(bookTitle)
    
    # check if bookTitle mentioned is in our dataframe
    if bookTitle in df["Book-Title"].values:
        
        # calculate the counts of each book in the dataframe
        rating_count=pd.DataFrame(df["Book-Title"].value_counts())
        
        # if the count is less than 200, then it is a rare book
        rare_books=rating_count[rating_count["Book-Title"]<=200].index
        
        # if count is more than 200 for each book, then it is a common book
        common_books=df[~df["Book-Title"].isin(rare_books)]
        
        # if book title is in rare books, then suggest 3 books with weighted samples
        if bookTitle in rare_books:
            #calcualte weighted sample common books

            most_common=pd.Series(common_books["Book-Title"].unique()).sample(n = 3, weights = df.groupby('Book-Title')['Book-Rating'].transform('count')).values
            print("No Recommendations for this Book ☹️ \n ")
            print("YOU MAY TRY: \n ")
            print("{}".format(most_common[0]), "\n")
            print("{}".format(most_common[1]), "\n")
            print("{}".format(most_common[2]), "\n")

        else:
            # Calculate the user-item ratings matrix from the data frame
            common_books_pivot=common_books.pivot_table(index=["User-ID"],columns=["Book-Title"],values="Book-Rating")
            
            title=common_books_pivot[bookTitle]
            
            # Get the recommendation_df by calculating the correlation of each book with every other book and sorting the values
            recommendation_df=pd.DataFrame(common_books_pivot.corrwith(title).sort_values(ascending=False)).reset_index(drop=False)

            # remove the given input book title from the recommendation dataframe so as to not recommend the same book 
            if bookTitle in [title for title in recommendation_df["Book-Title"]]:
                recommendation_df=recommendation_df.drop(recommendation_df[recommendation_df["Book-Title"]==bookTitle].index[0])

            # remove books with low rating (5), to maintain the quality of the book recommended
            less_rating=[]
            for i in recommendation_df["Book-Title"]:
                if df[df["Book-Title"]==i]["Book-Rating"].mean() < 5:
                    less_rating.append(i)
            
            # remove low recommended books only if the difference below is greater than 5, so as to give atleast 5 books as recommendations
            if recommendation_df.shape[0] - len(less_rating) > 5:
                recommendation_df=recommendation_df[~recommendation_df["Book-Title"].isin(less_rating)]

            # select top 5 recommendated books based on the correlation values
            recommendation_df=recommendation_df[0:5]
            recommendation_df.columns=["Book-Title","Correlation"]
            
            # Print the recommended books
            print('Would You Like To Try These Books')
            print('*********************************')
            for i in range(len(recommendation_df["Book-Title"].tolist())):
                book_name=df.loc[df["Book-Title"]==recommendation_df["Book-Title"].tolist()[i],"Book-Title"][:1].values[0]
                ratings = round(df[df["Book-Title"]==recommendation_df["Book-Title"].tolist()[i]]["Book-Rating"].mean(),1)
                print(str(i+1)+". "+book_name+"|| Rating: ", ratings)
    else:
        print("❌ COULD NOT FIND ❌")

In [79]:
item_based("To Kill a Mockingbird")


Would You Like To Try These Books
*********************************
1. The Nanny Diaries A Novel|| Rating:  7.4
2. Summer Sisters|| Rating:  7.5
3. The Fellowship of the Ring The Lord of the Rings Part 1|| Rating:  8.9
4. Jurassic Park|| Rating:  7.9
5. The Lovely Bones A Novel|| Rating:  8.2


In [36]:
item_based("Harry Potter and the Chamber of Secrets Book 2")

Would You Like To Try These Books
*********************************
1. The Summons|| Rating:  8.1
2. The Pilot s Wife A Novel|| Rating:  8.1
3. Harry Potter and the Sorcerer s Stone Harry Potter Paperback|| Rating:  8.1
4. 1st to Die A Novel|| Rating:  8.1
5. The Notebook|| Rating:  8.1


In [37]:
item_based("The Da Vinci Code")

Would You Like To Try These Books
*********************************
1. Where the Heart Is Oprah s Book Club Paperback|| Rating:  8.1
2. Fahrenheit 451|| Rating:  8.1
3. 1st to Die A Novel|| Rating:  8.1
4. Harry Potter and the Order of the Phoenix Book 5|| Rating:  8.1
5. Summer Sisters|| Rating:  8.1


In [38]:
item_based("Barbie")


❌ COULD NOT FIND ❌


In [39]:
item_based("El Misterio De Sittaford")

No Recommendations for this Book ☹️ 
 
YOU MAY TRY: 
 
The Pilot s Wife A Novel 

Harry Potter and the Chamber of Secrets Book 2 

The Brethren 



# User-based collaborative filtering


In [50]:
# Drop users who vote less than 200 times.
new_df=df[df['User-ID'].map(df['User-ID'].value_counts()) > 200]  

# Calculate the user-item ratings matrix from the data frame
users_pivot=new_df.pivot_table(index=["User-ID"],columns=["Book-Title"],values="Book-Rating")

# repalce null values with 0
users_pivot.fillna(0,inplace=True)

In [51]:
# Function to get favorite books for a certain user
def users_choice(id):
    # getting favorite book by sorting on rating values given by the user to books
    users_fav=new_df[new_df["User-ID"]==id].sort_values(["Book-Rating"],ascending=False)[0:5]
    return users_fav

In [52]:

# function to find the similar users for a given users
def user_based(new_df,id):

    # check if user is present in the dataframe
    if id not in new_df["User-ID"].values:
        print("User NOT FOUND")
        
        
    else:
        # find the data from the user-item matrix with the users id as index
        index=np.where(users_pivot.index==id)[0][0]

        # calculate similarity matrix for the user with other users using cosine similarity
        similarity=cosine_similarity(users_pivot)

        # find similar users and sort them based on the similarity values
        similar_users=list(enumerate(similarity[index]))
        similar_users = sorted(similar_users,key = lambda x:x[1],reverse=True)[0:5]
    
        user_rec=[]
        # get data for each user id from similar users
        for i in similar_users:
            data=df[df["User-ID"]==users_pivot.index[i[0]]]
            user_rec.extend(list(data.drop_duplicates("User-ID")["User-ID"].values))
        
    return user_rec

In [86]:
# function to find the recommended books
def common(new_df,user,user_id):
    '''
    new_df : Orginal dataframe with book and ratings data
    user : list of all similar users from function user_based()
    user_id : recommend books to this user_id
    '''
    # get all the data for the user_id
    x=new_df[new_df["User-ID"]==user_id]

    recommend_books=[]
    user=list(user)
    # loop through all similar users
    for i in user:
        # get data from similar users
        y=new_df[(new_df["User-ID"]==i)]
        # remove books similar to the ones our user has already rated
        books=y.loc[~y["Book-Title"].isin(x["Book-Title"]),:]
        # take top 5 books from each user
        books=books.sort_values(["Book-Rating"],ascending=False)[0:5]
        # add the top 5 books to recommended list
        recommend_books.extend(books["Book-Title"].values)

    #find average rating for each recommended book
    recommended_rating = {}
    for i in recommend_books:
        recommended_rating[i] = new_df[new_df['Book-Title'] == i]['Book-Rating'].mean()
    
    # sort the recommended books based on average ratings
    top_rated_recommendation = sorted(recommended_rating, key = recommended_rating.get, reverse=True)  

    # return top 5 from the recommended books
    return top_rated_recommendation[0:5]

In [87]:
# choose random user
user_id=random.choice(new_df["User-ID"].values)
# find top 5 favorite books for the user
user_choice_df=pd.DataFrame(users_choice(user_id))
user_favorite=users_choice(user_id)
n=len(user_choice_df["Book-Title"].values)
print("USER: {} ".format(user_id))
    
print('Your Favorite Books')
print('*******************')

# print the selected users top 5 books
for i in range(n):
    book_name=new_df.loc[new_df["Book-Title"]==user_choice_df["Book-Title"].tolist()[i],"Book-Title"][:1].values[0]
    ratings = round(df[df["Book-Title"]==top_ten["Book-Title"].tolist()[i]]["Book-Rating"].mean(),1)
    print(str(i+1)+". "+book_name+"|| Rating: ", rating)

# find simalre users
user_based_rec=user_based(new_df,user_id)
#get recommended books from similar users
books_for_user=common(new_df,user_based_rec,user_id)

books_for_userDF=pd.DataFrame(books_for_user,columns=["Book-Title"])
print()
print()
print('You May Also Like These Books')
print('*****************************')

# print recommended books
for i in range(5):
    book_name=new_df.loc[new_df["Book-Title"]==books_for_userDF["Book-Title"].tolist()[i],"Book-Title"][:1].values[0]
    ratings = round(df[df["Book-Title"]==books_for_userDF["Book-Title"].tolist()[i]]["Book-Rating"].mean(),1)
    print(str(i+1)+". "+book_name+"|| Rating: ", ratings)

USER: 112001 
Your Favorite Books
*******************
1. Santa Fe Rules|| Rating:  8.1
2. The Thrill of Victory|| Rating:  8.1
3. Heaven s Price|| Rating:  8.1
4. Texas Lucky|| Rating:  8.1
5. The Steel Caress|| Rating:  8.1


You May Also Like These Books
*****************************
1. Treasury of Illustrated Classics Adventures of Huckleberry Finn|| Rating:  9.7
2. Chicken Soup for the Kid s Soul 101 Stories of Courage Hope and Laughter Chicken Soup for the Soul Paperback Health Communications|| Rating:  7.6
3. Our National Parks America s Spectacular Wilderness Heritage|| Rating:  10.0
4. Fallen Hearts Casteel|| Rating:  9.0
5. Anne of Windy Poplars|| Rating:  9.0


# Content Based Recommended System



In [93]:
def content_based(bookTitle):
    bookTitle=str(bookTitle)
    
    if bookTitle in df["Book-Title"].values:
        # calculate rare and common books based on number of ratings
        rating_count=pd.DataFrame(df["Book-Title"].value_counts())
        rare_books=rating_count[rating_count["Book-Title"]<=200].index
        common_books=df[~df["Book-Title"].isin(rare_books)]
        
        # if book title is in rare books, then suggest 3 books with weighted samples
        if bookTitle in rare_books:
            # calculate the weighted sample
            most_common=pd.Series(common_books["Book-Title"].unique()).sample(n = 3, weights = df.groupby('Book-Title')['Book-Rating'].transform('count')).values
            print("No Recommendations for this Book ☹️ \n ")
            print("YOU MAY TRY: \n ")
            print("{}".format(most_common[0]), "\n")
            print("{}".format(most_common[1]), "\n")
            print("{}".format(most_common[2]), "\n")
        else:
            # remove duplicates
            common_books=common_books.drop_duplicates(subset=["Book-Title"])
            common_books.reset_index(inplace=True)
            # add indexes to the book
            common_books["index"]=[i for i in range(common_books.shape[0])]
            # select targets for creating features
            targets=["Book-Title","Book-Author","Publisher"]
            # create a feature columns from all the target columns above
            common_books["all_features"] = [" ".join(common_books[targets].iloc[i,].values) for i in range(common_books[targets].shape[0])]
            
            # create count embeddings using count vectorizer for the features columns
            vectorizer=CountVectorizer()
            common_booksVector=vectorizer.fit_transform(common_books["all_features"])

            # find similarity basefd on the vectors
            similarity=cosine_similarity(common_booksVector)

            # get index for our given book title
            index=common_books[common_books["Book-Title"]==bookTitle]["index"].values[0]
            
            #sort the simlar books from similarity matrix and select first 5 books
            similar_books=list(enumerate(similarity[index]))
            similar_booksSorted=sorted(similar_books,key=lambda x:x[1],reverse=True)[1:6]
            books=[]

            for i in range(len(similar_booksSorted)):
                books.append(common_books[common_books["index"]==similar_booksSorted[i][0]]["Book-Title"].item())
                
            # print the recommended books
            print('You May Like These Books')
            print('***********************')
            for i in range(len(books)):
                
                book_name=common_books.loc[common_books["Book-Title"]==books[i],"Book-Title"][:1].values[0]
                ratings = round(df[df["Book-Title"]==common_books["Book-Title"].tolist()[i]]["Book-Rating"].mean(),1)
                print(str(i+1)+". "+book_name+"|| Rating: ", ratings)
    else:
        print("COULD NOT FIND ") 

In [94]:
content_based("The Da Vinci Code")

You May Like These Books
***********************
1. The Catcher in the Rye|| Rating:  7.5
2. The Brethren|| Rating:  4.4
3. The Firm|| Rating:  7.9
4. The Chamber|| Rating:  9.0
5. The Partner|| Rating:  7.5


In [95]:
content_based("Tuesdays with Morrie An Old Man a Young Man and Life s Greatest Lesson")


You May Like These Books
***********************
1. The Five People You Meet in Heaven|| Rating:  7.5
2. The Brethren|| Rating:  4.4
3. Life of Pi|| Rating:  7.9
4. The Firm|| Rating:  9.0
5. The Chamber|| Rating:  7.5


In [96]:
content_based("A Soldier of the Great War")


No Recommendations for this Book ☹️ 
 
YOU MAY TRY: 
 
The Poisonwood Bible A Novel 

Harry Potter and the Goblet of Fire Book 4 

The Pilot s Wife A Novel 



In [97]:
content_based("Life of Pi")


You May Like These Books
***********************
1. The Secret Life of Bees|| Rating:  7.5
2. Snow Falling on Cedars|| Rating:  4.4
3. The Fellowship of the Ring The Lord of the Rings Part 1|| Rating:  7.9
4. Harry Potter and the Chamber of Secrets Book 2|| Rating:  9.0
5. Harry Potter and the Prisoner of Azkaban Book 3|| Rating:  7.5
