In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import re
from PIL import Image
import io
import requests
import random
import urllib.request
import urllib
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords

In [3]:
# Reading Books dataset (details about books)
books=pd.read_csv("Data/Books.csv")
books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [81]:
# Reading ratings dataset (details about users, ratings, and book-id)
rating=pd.read_csv("Data/Ratings.csv")
rating.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [82]:
# Details about users
users=pd.read_csv("Data/Users.csv")
users.head(3)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [83]:
# Shape of the dataset

print("Books Shape: " ,books.shape )
print("Ratings Shape: " ,rating.shape )
print("Users Shape: " ,users.shape )


Books Shape:  (271360, 8)
Ratings Shape:  (1149780, 3)
Users Shape:  (278858, 3)


In [84]:
#Checking for null values

print("Any null values in Books:\n" ,books.isnull().sum())
print()
print("Any null values in Ratings:\n ",rating.isnull().sum())
print()
print("Any null values in Users:\n",users.isnull().sum())

Any null values in Books:
 ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

Any null values in Ratings:
  User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

Any null values in Users:
 User-ID          0
Location         0
Age         110762
dtype: int64


In [85]:
# Merge books and ratings dataset

books_data=books.merge(rating,on="ISBN")
books_data.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0


In [86]:
# make a copy of dataset
df=books_data.copy()

# drop all null values
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)

#drop unnecessary columns
df.drop(columns=["ISBN","Year-Of-Publication","Image-URL-S","Image-URL-M"],axis=1,inplace=True)

#drop ratings equal to 0 i.e. books with no ratings
df.drop(index=df[df["Book-Rating"]==0].index,inplace=True)

# clean book title names
df["Book-Title"]=df["Book-Title"].apply(lambda x: re.sub("[\W_]+"," ",x).strip())
df.head()

Unnamed: 0,Book-Title,Book-Author,Publisher,Image-URL-L,User-ID,Book-Rating
1,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,8,5
3,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,11676,8
5,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,67544,8
8,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,116866,9
9,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,123629,9


# Popularity Based Recommendating System


In [87]:
def popular_books(df,n=100):
    
    #calculate the number of times each book is rated 
    rating_count=df.groupby("Book-Title").count()["Book-Rating"].reset_index()
    rating_count.rename(columns={"Book-Rating":"NumberOfVotes"},inplace=True)
    
    #calculate the average rating of each book
    rating_average=df.groupby("Book-Title")["Book-Rating"].mean().reset_index()
    rating_average.rename(columns={"Book-Rating":"AverageRatings"},inplace=True)
    
    # merge both the dataframes
    popularBooks=rating_count.merge(rating_average,on="Book-Title")
    
    #function to calculate the bayesian average for ranking books
    def weighted_rate(x):
        
        #number of votes
        v=x["NumberOfVotes"]
        
        #average ratings
        R=x["AverageRatings"]
        
        #formula for bayesian average
        bayesian_average = ((v*R) + (m*C)) / (v+C)
        
        return bayesian_average 
    
    
    # caluclate the average of all ratings
    m=popularBooks["AverageRatings"].mean()
    
    # calculating a confidence number which is the 0.90 percentile rating
    C=popularBooks["NumberOfVotes"].quantile(0.90)
    
    # filter books with count of ratings more than 250 for better average
    popularBooks=popularBooks[popularBooks["NumberOfVotes"] >=250]
    popularBooks["Popularity"]=popularBooks.apply(weighted_rate,axis=1)
    popularBooks=popularBooks.sort_values(by="Popularity",ascending=False)
    return popularBooks[["Book-Title","NumberOfVotes","AverageRatings","Popularity"]].reset_index(drop=True).head(n)

In [88]:
n=10
top_ten=pd.DataFrame(popular_books(df,10))
print("10 Most Popular Books")
print('********************')
for i in range(len(top_ten["Book-Title"].tolist())):
    book_name=df.loc[df["Book-Title"]==top_ten["Book-Title"].tolist()[i],"Book-Title"][:1].values[0]
    rating = round(df[df["Book-Title"]==top_ten["Book-Title"].tolist()[i]]["Book-Rating"].mean(),1)
    print(str(i+1)+". "+book_name+"|| Rating: ", rating)

10 Most Popular Books
********************
1. Harry Potter and the Prisoner of Azkaban Book 3|| Rating:  9.0
2. To Kill a Mockingbird|| Rating:  9.0
3. Harry Potter and the Sorcerer s Stone Harry Potter Paperback|| Rating:  8.9
4. Harry Potter and the Chamber of Secrets Book 2|| Rating:  8.8
5. Tuesdays with Morrie An Old Man a Young Man and Life s Greatest Lesson|| Rating:  8.6
6. The Secret Life of Bees|| Rating:  8.5
7. The Da Vinci Code|| Rating:  8.4
8. The Lovely Bones A Novel|| Rating:  8.2
9. The Red Tent Bestselling Backlist|| Rating:  8.2
10. Where the Heart Is Oprah s Book Club Paperback|| Rating:  8.1


# ITEM BASED COLLABORATIVE FILTERING

In [23]:
#clean data same as above 

df=books_data.copy()
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.drop(columns=["ISBN","Year-Of-Publication","Image-URL-S","Image-URL-M"],axis=1,inplace=True)
df.drop(index=df[df["Book-Rating"]==0].index,inplace=True)
df["Book-Title"]=df["Book-Title"].apply(lambda x: re.sub("[\W_]+"," ",x).strip())
df.head()

Unnamed: 0,Book-Title,Book-Author,Publisher,Image-URL-L,User-ID,Book-Rating
1,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,8,5
3,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,11676,8
5,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,67544,8
8,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,116866,9
9,Clara Callan,Richard Bruce Wright,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,123629,9


In [92]:
df["Book-Title"].value_counts()

The Lovely Bones A Novel                                                                   707
Wild Animus                                                                                581
The Da Vinci Code                                                                          494
The Secret Life of Bees                                                                    406
The Nanny Diaries A Novel                                                                  393
                                                                                          ... 
The Road Less Traveled A New Psychology of Love Traditional Values and Spiritual Growth      1
Clippings from My Notebook                                                                   1
Facing the Music Stories The Bright Leaf Short Fiction Series 6                              1
No Man Is an Island A Harvest Hbj Book                                                       1
From One to One Hundred                           

In [56]:
# function for item based collaborative filtering
def item_based(bookTitle):
    bookTitle=str(bookTitle)
    
    # check if bookTitle mentioned is in our dataframe
    if bookTitle in df["Book-Title"].values:
        
        # calculate the counts of each book in the dataframe
        rating_count=pd.DataFrame(df["Book-Title"].value_counts())
        
        # if the count is less than 200, then it is a rare book
        rare_books=rating_count[rating_count["Book-Title"]<=200].index
        
        # if count is more than 200 for each book, then it is a common book
        common_books=df[~df["Book-Title"].isin(rare_books)]
        
        if bookTitle in rare_books:
            most_common=pd.Series(common_books["Book-Title"].unique()).sample(3).values
            print("No Recommendations for this Book ☹️ \n ")
            print("YOU MAY TRY: \n ")
            print("{}".format(most_common[0]), "\n")
            print("{}".format(most_common[1]), "\n")
            print("{}".format(most_common[2]), "\n")
        else:
            common_books_pivot=common_books.pivot_table(index=["User-ID"],columns=["Book-Title"],values="Book-Rating")
            title=common_books_pivot[bookTitle]
            recommendation_df=pd.DataFrame(common_books_pivot.corrwith(title).sort_values(ascending=False)).reset_index(drop=False)

            if bookTitle in [title for title in recommendation_df["Book-Title"]]:
                recommendation_df=recommendation_df.drop(recommendation_df[recommendation_df["Book-Title"]==bookTitle].index[0])

            less_rating=[]
            for i in recommendation_df["Book-Title"]:
                if df[df["Book-Title"]==i]["Book-Rating"].mean() < 5:
                    less_rating.append(i)
            if recommendation_df.shape[0] - len(less_rating) > 5:
                recommendation_df=recommendation_df[~recommendation_df["Book-Title"].isin(less_rating)]

            recommendation_df=recommendation_df[0:5]
            recommendation_df.columns=["Book-Title","Correlation"]
            
            print('Would You Like To Try These Books')
            print('*********************************')
            for i in range(len(recommendation_df["Book-Title"].tolist())):
                book_name=df.loc[df["Book-Title"]==recommendation_df["Book-Title"].tolist()[i],"Book-Title"][:1].values[0]
                ratings = round(df[df["Book-Title"]==top_ten["Book-Title"].tolist()[i]]["Book-Rating"].mean(),1)
                print(str(i+1)+". "+book_name+"|| Rating: ", rating)
    else:
        print("❌ COULD NOT FIND ❌")

In [89]:
item_based("To Kill a Mockingbird")


Would You Like To Try These Books
*********************************
1. The Nanny Diaries A Novel|| Rating:  8.1
2. Summer Sisters|| Rating:  8.1
3. The Fellowship of the Ring The Lord of the Rings Part 1|| Rating:  8.1
4. Jurassic Park|| Rating:  8.1
5. The Lovely Bones A Novel|| Rating:  8.1


In [90]:
item_based("Harry Potter and the Chamber of Secrets Book 2")

Would You Like To Try These Books
*********************************
1. The Summons|| Rating:  8.1
2. The Pilot s Wife A Novel|| Rating:  8.1
3. Harry Potter and the Sorcerer s Stone Harry Potter Paperback|| Rating:  8.1
4. 1st to Die A Novel|| Rating:  8.1
5. The Notebook|| Rating:  8.1


In [91]:
item_based("The Da Vinci Code")

Would You Like To Try These Books
*********************************
1. Where the Heart Is Oprah s Book Club Paperback|| Rating:  8.1
2. Fahrenheit 451|| Rating:  8.1
3. 1st to Die A Novel|| Rating:  8.1
4. Harry Potter and the Order of the Phoenix Book 5|| Rating:  8.1
5. Summer Sisters|| Rating:  8.1


In [51]:
item_based("Barbie")


❌ COULD NOT FIND ❌


In [52]:
item_based("El Misterio De Sittaford")

No Recommendations for this Book ☹️ 
 
YOU MAY TRY: 
 
The Testament 

Harry Potter and the Sorcerer s Stone Harry Potter Paperback 

The Pilot s Wife A Novel 



# User-based collaborative filtering


In [31]:
new_df=df[df['User-ID'].map(df['User-ID'].value_counts()) > 200]  # Drop users who vote less than 200 times.
users_pivot=new_df.pivot_table(index=["User-ID"],columns=["Book-Title"],values="Book-Rating")
users_pivot.fillna(0,inplace=True)

In [32]:
def users_choice(id):
    
    users_fav=new_df[new_df["User-ID"]==id].sort_values(["Book-Rating"],ascending=False)[0:5]
    return users_fav

In [33]:
def user_based(new_df,id):
    if id not in new_df["User-ID"].values:
        print("User NOT FOUND")
        
        
    else:
        index=np.where(users_pivot.index==id)[0][0]
        similarity=cosine_similarity(users_pivot)
        similar_users=list(enumerate(similarity[index]))
        similar_users = sorted(similar_users,key = lambda x:x[1],reverse=True)[0:5]
    
        user_rec=[]
    
        for i in similar_users:
                data=df[df["User-ID"]==users_pivot.index[i[0]]]
                user_rec.extend(list(data.drop_duplicates("User-ID")["User-ID"].values))
        
    return user_rec

In [34]:
def common(new_df,user,user_id):
    x=new_df[new_df["User-ID"]==user_id]
    recommend_books=[]
    user=list(user)
    for i in user:
        y=new_df[(new_df["User-ID"]==i)]
        books=y.loc[~y["Book-Title"].isin(x["Book-Title"]),:]
        books=books.sort_values(["Book-Rating"],ascending=False)[0:5]
        recommend_books.extend(books["Book-Title"].values)
        
    return recommend_books[0:5]

In [58]:
user_id=random.choice(new_df["User-ID"].values)
user_choice_df=pd.DataFrame(users_choice(user_id))
user_favorite=users_choice(user_id)
n=len(user_choice_df["Book-Title"].values)
print("USER: {} ".format(user_id))
    
print('Your Favorite Books')
print('*******************')
    
for i in range(n):
    book_name=new_df.loc[new_df["Book-Title"]==user_choice_df["Book-Title"].tolist()[i],"Book-Title"][:1].values[0]
    ratings = round(df[df["Book-Title"]==top_ten["Book-Title"].tolist()[i]]["Book-Rating"].mean(),1)
    print(str(i+1)+". "+book_name+"|| Rating: ", rating)

user_based_rec=user_based(new_df,user_id)
books_for_user=common(new_df,user_based_rec,user_id)
books_for_userDF=pd.DataFrame(books_for_user,columns=["Book-Title"])
print()
print()
print('You May Also Like These Books')
print('*****************************')

for i in range(5):
    book_name=new_df.loc[new_df["Book-Title"]==books_for_userDF["Book-Title"].tolist()[i],"Book-Title"][:1].values[0]
    ratings = round(df[df["Book-Title"]==top_ten["Book-Title"].tolist()[i]]["Book-Rating"].mean(),1)
    print(str(i+1)+". "+book_name+"|| Rating: ", rating)

USER: 11676 
Your Favorite Books
*******************
1. The Heirloom|| Rating:  8.1
2. Puppy Love True Stories of Animal Friends|| Rating:  8.1
3. Veronika dÃ Â cide de mourir|| Rating:  8.1
4. Esperanza Rising|| Rating:  8.1
5. Chasing the dragon Hodder Christian paperbacks|| Rating:  8.1


You May Also Like These Books
*****************************
1. The Ruby in the Smoke Sally Lockhart Trilogy Book 1|| Rating:  8.1
2. Bridge to Terabithia|| Rating:  8.1
3. Life Support|| Rating:  8.1
4. The Face of Deception|| Rating:  8.1
5. Number the Stars Yearling Newbery|| Rating:  8.1


# Content Based Collaborative Filtering


In [59]:
def content_based(bookTitle):
    bookTitle=str(bookTitle)
    
    if bookTitle in df["Book-Title"].values:
        rating_count=pd.DataFrame(df["Book-Title"].value_counts())
        rare_books=rating_count[rating_count["Book-Title"]<=200].index
        common_books=df[~df["Book-Title"].isin(rare_books)]
        
        if bookTitle in rare_books:
            most_common=pd.Series(common_books["Book-Title"].unique()).sample(3).values
            print("No Recommendations for this Book ☹️ \n ")
            print("YOU MAY TRY: \n ")
            print("{}".format(most_common[0]), "\n")
            print("{}".format(most_common[1]), "\n")
            print("{}".format(most_common[2]), "\n")
        else:
            common_books=common_books.drop_duplicates(subset=["Book-Title"])
            common_books.reset_index(inplace=True)
            common_books["index"]=[i for i in range(common_books.shape[0])]
            targets=["Book-Title","Book-Author","Publisher"]
            common_books["all_features"] = [" ".join(common_books[targets].iloc[i,].values) for i in range(common_books[targets].shape[0])]
            vectorizer=CountVectorizer()
            common_booksVector=vectorizer.fit_transform(common_books["all_features"])
            similarity=cosine_similarity(common_booksVector)
            index=common_books[common_books["Book-Title"]==bookTitle]["index"].values[0]
            similar_books=list(enumerate(similarity[index]))
            similar_booksSorted=sorted(similar_books,key=lambda x:x[1],reverse=True)[1:6]
            books=[]
            for i in range(len(similar_booksSorted)):
                
                books.append(common_books[common_books["index"]==similar_booksSorted[i][0]]["Book-Title"].item())
                
            print('You May Like These Books')
            print('***********************')
            for i in range(len(books)):
                
                book_name=common_books.loc[common_books["Book-Title"]==books[i],"Book-Title"][:1].values[0]
                ratings = round(df[df["Book-Title"]==top_ten["Book-Title"].tolist()[i]]["Book-Rating"].mean(),1)
                print(str(i+1)+". "+book_name+"|| Rating: ", rating)
    else:
        print("COULD NOT FIND ") 

In [60]:
content_based("The Da Vinci Code")

You May Like These Books
***********************
1. The Catcher in the Rye|| Rating:  8.1
2. The Brethren|| Rating:  8.1
3. The Firm|| Rating:  8.1
4. The Chamber|| Rating:  8.1
5. The Partner|| Rating:  8.1


In [61]:
content_based("Tuesdays with Morrie An Old Man a Young Man and Life s Greatest Lesson")


You May Like These Books
***********************
1. The Five People You Meet in Heaven|| Rating:  8.1
2. The Brethren|| Rating:  8.1
3. Life of Pi|| Rating:  8.1
4. The Firm|| Rating:  8.1
5. The Chamber|| Rating:  8.1


In [62]:
content_based("A Soldier of the Great War")


No Recommendations for this Book ☹️ 
 
YOU MAY TRY: 
 
The Street Lawyer 

Life of Pi 

Angels amp Demons 



In [63]:
content_based("Life of Pi")


You May Like These Books
***********************
1. The Secret Life of Bees|| Rating:  8.1
2. Snow Falling on Cedars|| Rating:  8.1
3. The Fellowship of the Ring The Lord of the Rings Part 1|| Rating:  8.1
4. Harry Potter and the Chamber of Secrets Book 2|| Rating:  8.1
5. Harry Potter and the Prisoner of Azkaban Book 3|| Rating:  8.1
