# Book Recommendation

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle5 as pickle
import warnings
warnings.filterwarnings('ignore')

## Dataset Overview


1. **Books data**: Gives a detailed overview of each book, including its title, author, and year of publication.
2. **Users data**: Gives a detailed overview of each user, such as the user's location and age.
3. **Ratings data**: Gives a detailed overview of the rating that each user gave to a book.

## Preprocessing

### Books Data

In [4]:
# Load the first five columns of the book catalogue.
# on_bad_lines='warn' reports and skips malformed rows, which is what the
# deprecated error_bad_lines=False did (that option was removed in pandas 2.0;
# on_bad_lines requires pandas >= 1.3).
df_books1 = pd.read_csv('data/Books.csv', sep=',', on_bad_lines='warn', usecols=[0, 1, 2, 3, 4])

In [5]:
df_books1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
dtypes: object(5)
memory usage: 10.4+ MB


In [6]:
# Probe whether the publication year can be cast to int as-is.
# Any failure is printed instead of raised so the offending value is visible
# and the notebook keeps running.
try:
    df_books1['Year-Of-Publication']  = df_books1['Year-Of-Publication'].astype(int)
except Exception as e:
    print(e)


invalid literal for int() with base 10: 'DK Publishing Inc'


In [7]:
df_books1[df_books1['Year-Of-Publication'] == 'DK Publishing Inc']

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...


In [8]:
# Parse the year column as numbers; values that cannot be parsed become NaN
# (errors='coerce') instead of raising.
df_books1 = df_books1.assign(**{
    'Year-Of-Publication': pd.to_numeric(df_books1['Year-Of-Publication'], errors='coerce'),
})

In [9]:
# Discard every row that has a missing value in any column, then store the
# publication year as an integer.
df_books1 = df_books1.dropna().astype({'Year-Of-Publication': int})

In [10]:
df_books1.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [11]:
df_books2 = pd.read_csv('data/BooksWithCategory.csv')

In [12]:
# Rename all seven columns positionally.
# Direct assignment to .columns replaces set_axis(..., inplace=True), whose
# `inplace` option was removed in pandas 2.0; the frame is still modified in place.
df_books2.columns = ['ASIN', 'Filename', 'Image Url', 'Book-Title', 'Author', 'Category ID', 'Category']

In [13]:
df_books2.head()

Unnamed: 0,ASIN,Filename,Image Url,Book-Title,Author,Category ID,Category
0,1623439671,1623439671.jpg,http://ecx.images-amazon.com/images/I/61t-hrSw...,Doug the Pug 2016 Wall Calendar,Doug the Pug,3,Calendars
1,B00O80WC6I,B00O80WC6I.jpg,http://ecx.images-amazon.com/images/I/41X-KQqs...,"Moleskine 2016 Weekly Notebook, 12M, Large, Bl...",Moleskine,3,Calendars
2,761182187,0761182187.jpg,http://ecx.images-amazon.com/images/I/61j-4gxJ...,365 Cats Color Page-A-Day Calendar 2016,Workman Publishing,3,Calendars
3,1578052084,1578052084.jpg,http://ecx.images-amazon.com/images/I/51Ry4Tsq...,Sierra Club Engagement Calendar 2016,Sierra Club,3,Calendars
4,1578052076,1578052076.jpg,http://ecx.images-amazon.com/images/I/619KxYEq...,Sierra Club Wilderness Calendar 2016,Sierra Club,3,Calendars


In [14]:
# Keep only the title and category columns of the categorised book list.
df_books2 = df_books2.drop(columns=['ASIN', 'Filename', 'Image Url', 'Author'])

In [15]:
df_books2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207571 entries, 0 to 207570
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Book-Title   207571 non-null  object
 1   Category ID  207571 non-null  int64 
 2   Category     207571 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.8+ MB


In [16]:
# Attach a category to every catalogue entry by matching on the exact title;
# titles present in only one of the two tables are dropped (inner join).
df_books = df_books1.merge(df_books2, how='inner', on='Book-Title')

### Users Data

In [17]:
# Load the first three columns of the user table.
# on_bad_lines='warn' reports and skips malformed rows, which is what the
# deprecated error_bad_lines=False did (that option was removed in pandas 2.0;
# on_bad_lines requires pandas >= 1.3).
df_users = pd.read_csv('data/Users.csv', sep=',', on_bad_lines='warn', usecols=[0, 1, 2])

In [18]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [19]:
df_users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


### Ratings Data

In [20]:
# Load the first three columns of the ratings table.
# on_bad_lines='warn' reports and skips malformed rows, which is what the
# deprecated error_bad_lines=False did (that option was removed in pandas 2.0;
# on_bad_lines requires pandas >= 1.3).
df_ratings = pd.read_csv('data/Ratings.csv', sep=',', on_bad_lines='warn', usecols=[0, 1, 2])

In [21]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [22]:
df_ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


### Data Cleaning

In [23]:
df_books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Category ID,Category
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,15,Literature & Fiction
1,0801319536,Classical Mythology,Mark P. O. Morford,1998,John Wiley &amp; Sons,15,Literature & Fiction
2,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,15,Literature & Fiction
3,080410753X,The Kitchen God's Wife,Amy Tan,1992,Ivy Books,15,Literature & Fiction
4,080410753x,The Kitchen God's Wife,Amy Tan,1992,Ivy Books,15,Literature & Fiction
...,...,...,...,...,...,...,...
14620,0911647155,Cutting (Western Horseman Books),Leon Harrel,2002,Western Horseman,26,Sports & Outdoors
14621,0691027641,Makers of Modern Strategy from Machiavelli to ...,Peter Paret,1986,Princeton University Press,12,History
14622,0395957699,One Man's Garden,Henry Mitchell,1999,Mariner Books,8,"Crafts, Hobbies & Home"
14623,0881924989,The Cactus Family,Edward F. Anderson,2001,Timber Press (OR),8,"Crafts, Hobbies & Home"


In [24]:
# Keep only ratings whose ISBN appears in the merged book table, reporting the
# row count before and after the filter.
print('Number of data before cleaning : {}'.format(df_ratings.shape[0]))
df_ratings = df_ratings.loc[df_ratings['ISBN'].isin(df_books['ISBN'])]
print('Number of data after cleaning : {}'.format(df_ratings.shape[0]))

Number of data before cleaning : 1149780
Number of data after cleaning : 96495


In [25]:
df_ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
2,276727,0446520802,0
8,276744,038550120X,7
10,276746,0425115801,0
19,276747,0671537458,9
24,276748,0747558167,6
...,...,...,...
1149751,276690,0440439884,0
1149763,276704,0395404258,0
1149768,276704,0446605409,0
1149771,276704,0743211383,7


In [26]:
# Combine each rating with the metadata of the book it refers to (inner join on ISBN).
df_official = df_ratings.merge(df_books, how='inner', on='ISBN')

In [27]:
# A (user, ISBN) pair can occur more than once after the merge; keep only the
# last occurrence of each pair.
df_official = df_official.drop_duplicates(subset=['User-ID', 'ISBN'], keep='last')

In [28]:
df_official

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Category ID,Category
0,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance
1,278418,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance
2,638,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance
3,3363,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance
4,7158,0446520802,10,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance
...,...,...,...,...,...,...,...,...,...
115568,276556,055337849x,10,The Brothers K,David James Duncan,1996,Bantam,15,Literature & Fiction
115569,276581,0312311362,0,Radiance: A Novel,Carter Scholz,2003,Picador USA,24,Science Fiction & Fantasy
115570,276641,0151006555,0,Bay of Tigers: An Odyssey through War-torn Angola,Pedro Rosa Mendes,2003,Harcourt,29,Travel
115571,276688,0060168307,0,Sacred Clowns,Tony Hillerman,1994,Harpercollins,15,Literature & Fiction


### Content Based Filtering

In [29]:
# Work on an independent copy: a plain assignment would only alias df_official,
# so any in-place edit of df1 would silently change df_official as well.
df1 = df_official.copy()

In [30]:
df1

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Category ID,Category
0,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance
1,278418,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance
2,638,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance
3,3363,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance
4,7158,0446520802,10,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance
...,...,...,...,...,...,...,...,...,...
115568,276556,055337849x,10,The Brothers K,David James Duncan,1996,Bantam,15,Literature & Fiction
115569,276581,0312311362,0,Radiance: A Novel,Carter Scholz,2003,Picador USA,24,Science Fiction & Fantasy
115570,276641,0151006555,0,Bay of Tigers: An Odyssey through War-torn Angola,Pedro Rosa Mendes,2003,Harcourt,29,Travel
115571,276688,0060168307,0,Sacred Clowns,Tony Hillerman,1994,Harpercollins,15,Literature & Fiction


In [31]:
# Mean of all ratings recorded for each title (Series indexed by 'Book-Title').
avg = df1['Book-Rating'].groupby(df1['Book-Title']).mean()

In [32]:
# Attach the per-title mean rating to every row. The new column clashes with
# the existing 'Book-Rating', so it receives the '_Avg' suffix.
df1 = pd.merge(
    df1,
    avg,
    how='left',
    left_on='Book-Title',
    right_index=True,
    suffixes=('', '_Avg'),
)

In [33]:
df1

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Category ID,Category,Book-Rating_Avg
0,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance,3.560000
1,278418,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance,3.560000
2,638,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance,3.560000
3,3363,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance,3.560000
4,7158,0446520802,10,The Notebook,Nicholas Sparks,1996,Warner Books,22,Romance,3.560000
...,...,...,...,...,...,...,...,...,...,...
115568,276556,055337849x,10,The Brothers K,David James Duncan,1996,Bantam,15,Literature & Fiction,5.000000
115569,276581,0312311362,0,Radiance: A Novel,Carter Scholz,2003,Picador USA,24,Science Fiction & Fantasy,0.000000
115570,276641,0151006555,0,Bay of Tigers: An Odyssey through War-torn Angola,Pedro Rosa Mendes,2003,Harcourt,29,Travel,0.000000
115571,276688,0060168307,0,Sacred Clowns,Tony Hillerman,1994,Harpercollins,15,Literature & Fiction,1.611111


In [34]:
# Remove the columns that are not used to describe a book's content.
df1 = df1.drop(columns=['Publisher', 'Category ID', 'ISBN', 'Year-Of-Publication', 'User-ID', 'Book-Rating'])

In [35]:
df1

Unnamed: 0,Book-Title,Book-Author,Category,Book-Rating_Avg
0,The Notebook,Nicholas Sparks,Romance,3.560000
1,The Notebook,Nicholas Sparks,Romance,3.560000
2,The Notebook,Nicholas Sparks,Romance,3.560000
3,The Notebook,Nicholas Sparks,Romance,3.560000
4,The Notebook,Nicholas Sparks,Romance,3.560000
...,...,...,...,...
115568,The Brothers K,David James Duncan,Literature & Fiction,5.000000
115569,Radiance: A Novel,Carter Scholz,Science Fiction & Fantasy,0.000000
115570,Bay of Tigers: An Odyssey through War-torn Angola,Pedro Rosa Mendes,Travel,0.000000
115571,Sacred Clowns,Tony Hillerman,Literature & Fiction,1.611111


In [36]:
# Reduce the table to one row per title, keeping the first row seen for each.
df1 = df1.drop_duplicates(subset="Book-Title", keep="first")

In [37]:
df1.duplicated().sum()

0

In [38]:
result = df1[df1["Book-Title"] == "The Guardian"]
result

Unnamed: 0,Book-Title,Book-Author,Category,Book-Rating_Avg
28510,The Guardian,Nicholas Sparks,Romance,3.076503


In [39]:
# Replace any remaining missing value with an empty string.
fillnabooks = df1.fillna(value='')

In [40]:
def clean_data(x):
    """Collapse a text value into one lowercase token.

    Every space character in ``x`` is removed and the remaining text is
    lower-cased. Other characters (punctuation, digits) are kept unchanged.
    """
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()

In [41]:
# Columns that are combined into the text description of each book.
features = ['Book-Title', 'Book-Author', 'Category', 'Book-Rating_Avg']
fillednabooks = fillnabooks.loc[:, features]

In [42]:
# Convert every column to str, including the numeric average rating, so that
# clean_data's string methods and the string concatenation in create_soup can
# be applied to all features. The dtypes are displayed to confirm the cast.
fillednabooks = fillednabooks.astype(str)
fillednabooks.dtypes

Book-Title         object
Book-Author        object
Category           object
Book-Rating_Avg    object
dtype: object

In [43]:
# Normalise every feature column element by element with clean_data.
for column_name in features:
    fillednabooks[column_name] = fillednabooks[column_name].map(clean_data)

fillednabooks.head(2)

Unnamed: 0,Book-Title,Book-Author,Category,Book-Rating_Avg
0,thenotebook,nicholassparks,romance,3.56
116,apaintedhouse,johngrisham,"mystery,thriller&suspense",3.231503579952267


In [44]:
def create_soup(x):
    """Build the text "soup" for one book.

    ``x`` is a mapping or row holding string values under 'Book-Title',
    'Book-Author', 'Category' and 'Book-Rating_Avg'. The four values are
    returned as a single string, in that order, separated by one space.
    """
    soup_fields = ('Book-Title', 'Book-Author', 'Category', 'Book-Rating_Avg')
    return ' '.join(x[field] for field in soup_fields)

In [45]:
# Build one soup string per row and store it in a new 'soup' column.
fillednabooks = fillednabooks.assign(soup=fillednabooks.apply(create_soup, axis=1))

In [46]:
# Learn a bag-of-words vocabulary from the soup strings (English stop words
# excluded) and encode every book as a row of token counts.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(fillednabooks['soup'])

In [47]:
# Pairwise cosine similarity between all books. With no second argument the
# rows of count_matrix are compared against themselves, giving a square matrix
# with one row and one column per book.
cosine_sim2 = cosine_similarity(count_matrix)

In [48]:
# Renumber the rows 0..n-1 (the previous index labels are kept in a new
# 'index' column) and build a lookup from cleaned title to row position.
# get_recommendations_new uses that position to select a row of the
# similarity matrix.
# NOTE(review): re-running this cell resets the index again and adds another
# column; run it once per kernel session.
fillednabooks=fillednabooks.reset_index()
indices = pd.Series(fillednabooks.index, index=fillednabooks['Book-Title'])

In [49]:
def get_recommendations_new(title, cosine_sim=cosine_sim2):
    """Return the titles of up to ten books most similar to ``title``.

    Parameters
    ----------
    title : str
        Book title as typed by a reader. Spaces are removed and the text is
        lower-cased before it is looked up in ``indices``.
    cosine_sim : 2-D array-like, optional
        Similarity matrix; row ``i`` is read as the similarity of book ``i``
        to every book. Defaults to ``cosine_sim2`` as it was bound when this
        function was defined.

    Returns
    -------
    list
        Values of ``df1['Book-Title']`` for the most similar books, best match
        first. The queried book itself is never included.

    Raises
    ------
    KeyError
        If the normalised title is not present in ``indices``.
    """
    key = title.replace(' ', '').lower()
    if key not in indices.index:
        raise KeyError('Book title not found: {!r}'.format(title))

    idx = indices[key]
    # Different raw titles can normalise to the same key, in which case the
    # lookup returns several positions; use the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every book by its similarity to the queried one, highest first.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried book by position rather than assuming it sorts
    # first: other books can tie with it on similarity.
    book_indices = [position for position, _ in ranked if position != idx][:10]

    # NOTE(review): assumes the rows of df1 are in the same order as the rows
    # used to build cosine_sim — confirm if df1 is modified after vectorising.
    return list(df1['Book-Title'].iloc[book_indices])

In [50]:
l=get_recommendations_new('The Notebook', cosine_sim2)
l

['The Guardian',
 'She',
 'Viking Passion',
 'Consequences',
 'Slow Burn',
 'Sins of Omission',
 'Always',
 'In the Cut',
 'The Next',
 'Secret']

In [51]:
# Persist the fitted CountVectorizer. The with-block guarantees the file is
# flushed and closed even if pickling fails; the bare open() call left the
# handle open.
with open('model/book_c-bd.pkl', 'wb') as model_file:
    pickle.dump(count, model_file)