# Import Module

In [1]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport

# Load & Check Data

In [2]:
#Read the data
org_df_books=pd.read_csv("/kaggle/input/book-recommendation-dataset/Books.csv")
org_df_ratings=pd.read_csv("/kaggle/input/book-recommendation-dataset/Ratings.csv")
org_df_users=pd.read_csv("/kaggle/input/book-recommendation-dataset/Users.csv")
#Making a copy of data
df_books = org_df_books.copy()
df_ratings = org_df_ratings.copy()
df_users = org_df_users.copy()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
#first 5 values of a dataset
df_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
#first 5 values of a dataset
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
#first 5 values of a dataset
df_users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [6]:
#Shape of the data
print('df_books shape:',df_books.shape)
print('df_ratings shape:',df_ratings.shape)
print('df_users shape:',df_users.shape)

df_books shape: (271360, 8)
df_ratings shape: (1149780, 3)
df_users shape: (278858, 3)


In [7]:
#info of the df_books
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


### Conclusion:
<ul>
    <li>There are missing values</li>
</ul>

In [8]:
#info of the df_ratings
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [9]:
#info of the df_users
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


### Conclusion:
<ul>
    <li>Age has missing values.</li>
    <li>Age datatype is float.</li>
</ul>

In [10]:
#number of missing values in df_books
df_books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [11]:
#number of missing values in df_ratings
df_ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

### Conclusion:
<ul>
    <li>No missing values.</li>
</ul>

In [12]:
#number of missing values in df_users
df_users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

### Conclusion:
<ul>
    <li>Age has missing values</li>
</ul>

In [13]:
#Duplicate values
print('df_books:',df_books.duplicated().sum())
print('df_ratings:',df_ratings.duplicated().sum())
print('df_users:',df_users.duplicated().sum())

df_books: 0
df_ratings: 0
df_users: 0


# Preprocessing

In [14]:
# Convert http to https
df_books['Image-URL-S']=df_books['Image-URL-S'].str.replace(r'^http\b', 'https', regex=True)
df_books['Image-URL-M']=df_books['Image-URL-M'].str.replace(r'^http\b', 'https', regex=True)
df_books['Image-URL-L']=df_books['Image-URL-L'].str.replace(r'^http\b', 'https', regex=True)

In [15]:
#joining df_ratings and df_books
df_ratings_books = df_ratings.merge(df_books, on='ISBN')

In [16]:
#first 5 values of a dataset
df_ratings_books.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....


In [17]:
#joining df_ratings_books and df_users
df_complete = df_ratings_books.merge(df_users, on='User-ID')

In [18]:
#first 5 values of a dataset
df_complete.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Location,Age
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....,"tyler, texas, usa",
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....,https://images.amazon.com/images/P/034545104X....,"cincinnati, ohio, usa",23.0
2,2313,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,https://images.amazon.com/images/P/0812533550....,https://images.amazon.com/images/P/0812533550....,https://images.amazon.com/images/P/0812533550....,"cincinnati, ohio, usa",23.0
3,2313,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,https://images.amazon.com/images/P/0679745580....,https://images.amazon.com/images/P/0679745580....,https://images.amazon.com/images/P/0679745580....,"cincinnati, ohio, usa",23.0
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,https://images.amazon.com/images/P/0060173289....,https://images.amazon.com/images/P/0060173289....,https://images.amazon.com/images/P/0060173289....,"cincinnati, ohio, usa",23.0


In [19]:
#Shape of dataset
df_complete.shape

(1031136, 12)

In [20]:
# number of unique values in each feature of df_complete
for feature in df_complete:
    uni=df_complete[feature].nunique()
    print(f"Unique in {feature}:",uni)

Unique in User-ID: 92106
Unique in ISBN: 270151
Unique in Book-Rating: 11
Unique in Book-Title: 241071
Unique in Book-Author: 101588
Unique in Year-Of-Publication: 202
Unique in Publisher: 16729
Unique in Image-URL-S: 269842
Unique in Image-URL-M: 269842
Unique in Image-URL-L: 269839
Unique in Location: 22480
Unique in Age: 141


In [21]:
# EDA
report = ProfileReport(df_complete)
report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



# Popularity Based Recommender System

It is a type of recommendation system which works on the principle of popularity and or anything which is in trend. These systems check about the product or movie which are in trend or are most popular among the users and directly recommend those.

In [22]:
# number of ratings in each book
num_rating_df = df_complete.groupby('Book-Title').count()['Book-Rating'].reset_index()
num_rating_df.rename(columns={'Book-Rating':'Num_ratings'},inplace=True)
num_rating_df.head()

Unnamed: 0,Book-Title,Num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [23]:
# average  ratings in each book
avg_rating_df = df_complete.groupby('Book-Title').mean()['Book-Rating'].reset_index()
avg_rating_df.rename(columns={'Book-Rating':'Avg_ratings'},inplace=True)
avg_rating_df.head()

Unnamed: 0,Book-Title,Avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,2.25
1,Always Have Popsicles,0.0
2,Apple Magic (The Collector's series),0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.0
4,Beyond IBM: Leadership Marketing and Finance ...,0.0


In [24]:
# joining num_rating_df and avg_rating_df
popular_df = num_rating_df.merge(avg_rating_df, on='Book-Title')
popular_df

Unnamed: 0,Book-Title,Num_ratings,Avg_ratings
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


In [25]:
# only books which contain Num_rating more than 250
popular_df = popular_df[popular_df['Num_ratings']>=250].sort_values('Avg_ratings', ascending=False).head(50)

In [26]:
# joining popular_df and df_books
popular_df = popular_df.merge(df_books, on='Book-Title').drop_duplicates('Book-Title')[['Book-Title','Book-Author','Image-URL-M','Num_ratings','Avg_ratings']]

# Collaborative Filtering Based Recommender System

In Collaborative Filtering, we tend to find similar users and recommend what similar users like. In this type of recommendation system, we don’t use the features of the item to recommend it, rather we classify the users into the clusters of similar types, and recommend each user according to the preference of its cluster.

In [27]:
# Users who are giving more than 200 Book-Rating
temp_x = df_complete.groupby('User-ID').count()['Book-Rating'] > 200
filtered_user_id = temp_x[temp_x].index

In [28]:
# All data of users who are giving more than 200 Book-Rating
filtered_ratings = df_complete[df_complete['User-ID'].isin(filtered_user_id)]

In [29]:
# All data of books who have more than 50 Book-Rating
temp_y = filtered_ratings.groupby('Book-Title').count()['Book-Rating']>=50
famous_books = temp_y[temp_y].index

In [30]:
# All data of famous books
final_ratings = filtered_ratings[filtered_ratings['Book-Title'].isin(famous_books)]

In [31]:
# All the users and their rating on each famous books
pt = final_ratings.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')

In [32]:
# replacing the values with 0 , who does not have given any rating for a book
pt.fillna(0, inplace=True)

In [33]:
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Checking the similarity of a book with each book in pt
from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(pt)
similarity_scores

array([[1.        , 0.10255025, 0.01220856, ..., 0.12110367, 0.07347567,
        0.04316046],
       [0.10255025, 1.        , 0.2364573 , ..., 0.07446129, 0.16773875,
        0.14263397],
       [0.01220856, 0.2364573 , 1.        , ..., 0.04558758, 0.04938579,
        0.10796119],
       ...,
       [0.12110367, 0.07446129, 0.04558758, ..., 1.        , 0.07085128,
        0.0196177 ],
       [0.07347567, 0.16773875, 0.04938579, ..., 0.07085128, 1.        ,
        0.10602962],
       [0.04316046, 0.14263397, 0.10796119, ..., 0.0196177 , 0.10602962,
        1.        ]])

In [35]:
def recommend(book_name):
    # index fetch
    index = np.where(pt.index == book_name)[0][0]
    similar_items = sorted(list(enumerate(similarity_scores[index])), key=lambda x: x[1], reverse = True)[1:6]
    
    data = []
    for i in similar_items:
        item = []
        #print(pt.index[i[0]])
        temp_df = df_books[df_books['Book-Title'] == pt.index[i[0]]]
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Title']))
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Author']))
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Image-URL-M']))
        data.extend(item)
    return data

In [36]:
recommend('Message in a Bottle')

['Nights in Rodanthe',
 'Nicholas Sparks',
 'https://images.amazon.com/images/P/0446531332.01.MZZZZZZZ.jpg',
 'The Mulberry Tree',
 'Jude Deveraux',
 'https://images.amazon.com/images/P/0743437640.01.MZZZZZZZ.jpg',
 'A Walk to Remember',
 'Nicholas Sparks',
 'https://images.amazon.com/images/P/0446608955.01.MZZZZZZZ.jpg',
 "River's End",
 'Nora Roberts',
 'https://images.amazon.com/images/P/0515127833.01.MZZZZZZZ.jpg',
 'Nightmares &amp; Dreamscapes',
 'Stephen King',
 'https://images.amazon.com/images/P/0451180232.01.MZZZZZZZ.jpg']

# Export

In [37]:
import pickle
pickle.dump(popular_df, open('popular.pkl','wb'))

In [38]:
pickle.dump(pt, open('pt.pkl','wb'))
pickle.dump(df_books, open('df_books.pkl','wb'))
pickle.dump(similarity_scores, open('similarity_scores.pkl','wb'))

In [39]:
df_complete['Image-URL-M']

0          https://images.amazon.com/images/P/034545104X....
1          https://images.amazon.com/images/P/034545104X....
2          https://images.amazon.com/images/P/0812533550....
3          https://images.amazon.com/images/P/0679745580....
4          https://images.amazon.com/images/P/0060173289....
                                 ...                        
1031131    https://images.amazon.com/images/P/2862749796....
1031132    https://images.amazon.com/images/P/3788097000....
1031133    https://images.amazon.com/images/P/0553571001....
1031134    https://images.amazon.com/images/P/0689822294....
1031135    https://images.amazon.com/images/P/0583307841....
Name: Image-URL-M, Length: 1031136, dtype: object