<a href="https://colab.research.google.com/github/Kev-Daran/Book-Review-and-Recommendation/blob/main/Book_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)



import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



In [None]:
# Load the ratings (df1), users (df2) and books (df3) tables.
# low_memory=False makes pandas read each file in one pass instead of in chunks.
csv_paths = ('/content/Ratings.csv', '/content/Users.csv', '/content/Books.csv')
df1, df2, df3 = (pd.read_csv(csv_path, low_memory=False) for csv_path in csv_paths)

In [None]:
# Ratings table: column dtypes, non-null counts and memory footprint.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [None]:
# Users table: column dtypes, non-null counts and memory footprint.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [None]:
# Books table: column dtypes, non-null counts and memory footprint.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [None]:
# 'Year-Of-Publication' is loaded as text (object dtype): a few rows hold the strings
# listed below instead of a year (presumably shifted CSV columns -- not verified here).
# Drop those rows and convert the remaining values to numbers.
# `.assign` builds a new frame, so no column is written onto a filtered slice
# (which would trigger pandas' SettingWithCopyWarning).
non_year_values = ['DK Publishing Inc', 'Gallimard']
df3 = df3.loc[~df3['Year-Of-Publication'].isin(non_year_values)].assign(
    **{'Year-Of-Publication': lambda books: pd.to_numeric(books['Year-Of-Publication'])}
)

In [None]:
# keep a single row per (author, title) pair in df3
df3 = df3.drop_duplicates(['Book-Author', 'Book-Title'])

# attach author and title to every rating whose ISBN is still present in df3
data = pd.merge(df3, df1, on='ISBN')[['Book-Author', 'Book-Rating', 'Book-Title', 'ISBN']]

# per-author mean rating and number of ratings.
# Select 'Book-Rating' BEFORE aggregating: taking the mean of the string columns
# (title, ISBN) raises a TypeError on pandas >= 2.0.
data = data.groupby('Book-Author')['Book-Rating'].agg(['mean', 'count']).reset_index()

# generate score based on mean rating and total number of times the author is rated
m = data['count'].quantile(0.99)  # vote threshold: 99th percentile of ratings-per-author
# .copy() so the new column below is written to an independent frame, not a slice
data = data[data['count'] > m].copy()
print('m =', m)
print(data.shape)
R = data['mean']         # mean rating of the author
v = data['count']        # number of ratings the author received
C = data['mean'].mean()  # mean of the author means, over the authors that passed the threshold
# blend each author's own mean (weight v) with the overall mean C (weight m)
data['weighted rating'] = (v/(v+m))*R + (m/(v+m))*C
data = data.sort_values('weighted rating', ascending=False).reset_index(drop=True)

data.iloc[:20]

m = 130.0
(996, 3)


Unnamed: 0,Book-Author,mean,count,weighted rating
0,J. K. Rowling,5.411434,2134,5.263202
1,Bill Watterson,5.498134,536,4.977312
2,J. R. R. Tolkien,5.265861,662,4.866023
3,Shel Silverstein,6.273333,150,4.674607
4,Dr. Seuss,5.168044,363,4.551501
5,Nick Bantock,5.27881,269,4.480927
6,Harper Lee,4.932039,412,4.427841
7,J.R.R. TOLKIEN,4.511224,980,4.314315
8,Neil Gaiman,4.453074,1236,4.298602
9,Daniel Quinn,5.012295,244,4.253716


In [None]:
# Wrap the (user, item, rating) triples for the Surprise library; ratings range from 0 to 10.
# NOTE(review): `data` must be a rating-level frame with 'User-ID', 'ISBN' and 'Book-Rating'
# columns. The per-author frame built in the weighted-rating cell has none of these, so
# `data` is presumably rebuilt in a cell not shown here -- confirm before Run All.
reader = Reader(rating_scale=(0, 10))
surprise_data = Dataset.load_from_df(data[['User-ID', 'ISBN', 'Book-Rating']], reader)
# fixed seed so the 75/25 split (and every number derived from it) is reproducible
trainset, testset = train_test_split(surprise_data, test_size=0.25, random_state=42)

In [None]:
# Cross-validate each candidate algorithm and collect one summary row per algorithm.
benchmark = []
for algorithm in [SVD()]:
    # 3-fold cross validation, scored with RMSE
    results = cross_validate(algorithm, surprise_data, measures=['RMSE'], cv=3, verbose=False)

    # average every reported metric (test_rmse, fit_time, test_time) over the folds
    summary = pd.DataFrame.from_dict(results).mean(axis=0)
    # tag the row with the algorithm's class name.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
    benchmark.append(pd.concat([summary, label]))

pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,3.917326,1.929028,0.14902


In [None]:
# Fit one SVD model on the training split.
# The seed fixes the random factor initialisation so predictions are repeatable across runs.
svd = SVD(random_state=42)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f1d6e7a3f90>

In [None]:
# Predict ratings with the fitted SVD model for every book one user has not rated,
# and show the ten best predictions next to the user's own top-rated books.
index_val = 912
user_row = df.iloc[index_val]
# get user id
userId = df.index[index_val]

# ISBNs the user has not rated (NaN in the user's row of the pivot table)
books = list(user_row[user_row.isna()].index)

# Build the ISBN -> title lookup once instead of scanning `data` for every ISBN.
# drop_duplicates keeps the first occurrence, i.e. the same title `.values[0]` would return.
title_by_isbn = data.drop_duplicates('ISBN').set_index('ISBN')['Book-Title']
titles = [title_by_isbn.loc[isbn] for isbn in books]
ratings = [svd.predict(userId, isbn).est for isbn in books]

prediction = pd.DataFrame({'ISBN': books, 'title': titles, 'rating': ratings, 'userId': userId})
prediction = prediction.sort_values('rating', ascending=False).iloc[:10].reset_index(drop=True)

# get other high rated books by user
temp = data[data['User-ID'] == userId].sort_values(
    'Book-Rating', ascending=False)[['Book-Rating', 'Book-Title', 'User-ID']].iloc[:10].reset_index(drop=True)
# aligned on the 0..9 positional index; rows are NaN if the user rated fewer than ten books
prediction['Book Read'] = temp['Book-Title']
prediction['Rated'] = temp['Book-Rating']
prediction

Unnamed: 0,ISBN,title,rating,userId,Book Read,Rated
0,0553279912,A Is for Alibi (Kinsey Millhone Mysteries (Pap...,7.966554,16795,"A Child Called \It\"": One Child's Courage to S...",10
1,0805063897,Nickel and Dimed: On (Not) Getting By in America,7.659301,16795,The Da Vinci Code,10
2,0060938455,Fast Food Nation: The Dark Side of the All-Ame...,7.226936,16795,The Lovely Bones: A Novel,10
3,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...,7.095352,16795,The Secret Life of Bees,10
4,0439064872,Harry Potter and the Chamber of Secrets (Book 2),6.695379,16795,She's Come Undone (Oprah's Book Club),10
5,0380731851,Mystic River,6.582392,16795,The Notebook,10
6,0446611212,Violets Are Blue,6.105393,16795,House of Sand and Fog,9
7,0060392452,Stupid White Men ...and Other Sorry Excuses fo...,5.819775,16795,The Red Tent (Bestselling Backlist),9
8,0345313860,"The Vampire Lestat (Vampire Chronicles, Book II)",5.754851,16795,The Horse Whisperer,9
9,0380789035,American Gods,5.47949,16795,The Bridges of Madison County,9


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fill every missing rating with that book's mean rating: `df.mean(axis=0)` averages
# down each column, and the columns of the pivot are ISBNs (rows are users).
book_means = df.mean(axis=0)
df_imputed = df.fillna(value=book_means)

# pairwise cosine similarity between the rows (users) of the imputed matrix
similarity_matrix = cosine_similarity(df_imputed.to_numpy())

In [None]:
def get_recommendation(user_index, n_neighbors=100, top_n=10):
    """Recommend unrated books to one user from the ratings of similar users.

    Reads three notebook-level objects: `df` (pivot table, one row per user and one
    column per ISBN, NaN where unrated), `similarity_matrix` (row i holds the
    similarity of user i to every user, in `df` row order) and `data` (frame with
    at least 'ISBN' and 'Book-Title' columns).

    Parameters
    ----------
    user_index : int
        Positional row index of the user in `df` / `similarity_matrix`.
    n_neighbors : int, default 100
        Number of most similar users whose ratings are averaged.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'ISBN', 'Recommended Book', 'Assumed Rating', sorted by
        'Assumed Rating' descending, with a fresh 0..k-1 index.

    Notes
    -----
    'Assumed Rating' is the plain mean of similarity-scaled ratings; it is not
    divided by the sum of similarities.
    NOTE(review): confirm this un-normalised score is the intended scale.
    """
    sims = np.asarray(similarity_matrix[user_index])

    # books the user has not rated yet
    user_row = df.iloc[user_index]
    unrated_books = user_row[user_row.isna()].index

    # Most similar users first (stable, so ties keep row order), with the user
    # removed explicitly: with tied similarities the user is not guaranteed to
    # sort first, so slicing off position 0 could drop a real neighbour instead.
    by_similarity = np.argsort(-sims, kind='stable')
    neighbours = by_similarity[by_similarity != user_index % sims.size][:n_neighbors]

    # scale each neighbour's ratings of the unrated books by that neighbour's similarity
    weighted = df.iloc[neighbours][unrated_books].mul(sims[neighbours], axis=0)

    # mean per book (NaN ratings are skipped); books nobody rated drop out
    book_ratings = weighted.mean().dropna().sort_values(ascending=False).iloc[:top_n]
    top_isbns = book_ratings.index

    # Look titles up BY ISBN so each title stays attached to its own rating.
    # Filtering `data` and zipping by position would pair titles in `data` row
    # order with ratings in sorted order.
    title_by_isbn = (
        data.loc[data['ISBN'].isin(top_isbns), ['ISBN', 'Book-Title']]
        .drop_duplicates('ISBN')
        .set_index('ISBN')['Book-Title']
    )

    return pd.DataFrame({'ISBN': top_isbns.to_numpy(),
                         'Recommended Book': title_by_isbn.reindex(top_isbns).to_numpy(),
                         'Assumed Rating': book_ratings.to_numpy()})

In [None]:
# Recommendations for one user, shown next to the books that user rated highest.
user_index = 2131
recommended_books = get_recommendation(user_index)
user_id = df.index[user_index]
# get other high rated books by user
temp = (
    data[data['User-ID'] == user_id]
    .sort_values('Book-Rating', ascending=False)[['Book-Rating', 'Book-Title', 'User-ID']]
    .iloc[:10]
    .reset_index(drop=True)
)
# scalar id: every row is labelled even if the user rated fewer books than were recommended
# (copying temp['User-ID'] by index would leave NaN in the extra rows)
recommended_books['userId'] = user_id
# aligned on the positional index; NaN where the user has no further rated books
recommended_books['Book Read'] = temp['Book-Title']
recommended_books['Rated'] = temp['Book-Rating']
recommended_books

Unnamed: 0,ISBN,Recommended Book,Assumed Rating,userId,Book Read,Rated
0,0684872153,Angela's Ashes (MMP) : A Memoir,7.089074,35859,Fahrenheit 451,10
1,0316284955,White Oleander : A Novel (Oprah's Book Club),6.819865,35859,Harry Potter and the Sorcerer's Stone (Harry P...,10
2,0345339681,The Hobbit : The Enchanting Prelude to The Lor...,6.214522,35859,One for the Money (Stephanie Plum Novels (Pape...,10
3,0345361792,A Prayer for Owen Meany,6.166465,35859,The Red Tent (Bestselling Backlist),10
4,0786868716,The Five People You Meet in Heaven,6.130962,35859,Bel Canto: A Novel,9
5,080410526X,All I Really Need to Know,5.704645,35859,The Secret Life of Bees,9
6,0440222656,The Horse Whisperer,5.626918,35859,Left Behind: A Novel of the Earth's Last Days ...,9
7,0316769487,The Catcher in the Rye,4.815939,35859,The Joy Luck Club,8
8,0345339703,The Fellowship of the Ring (The Lord of the Ri...,4.115793,35859,Two for the Dough,8
9,0060959037,Prodigal Summer: A Novel,4.087308,35859,Balzac and the Little Chinese Seamstress : A N...,8


In [None]:
# Recommendations for one user, shown next to the books that user rated highest.
user_index = 6349
recommended_books = get_recommendation(user_index)
user_id = df.index[user_index]
# get other high rated books by user
temp = (
    data[data['User-ID'] == user_id]
    .sort_values('Book-Rating', ascending=False)[['Book-Rating', 'Book-Title', 'User-ID']]
    .iloc[:10]
    .reset_index(drop=True)
)
# scalar id: every row is labelled even if the user rated fewer books than were recommended
# (copying temp['User-ID'] by index would leave NaN in the extra rows)
recommended_books['userId'] = user_id
# aligned on the positional index; NaN where the user has no further rated books
recommended_books['Book Read'] = temp['Book-Title']
recommended_books['Rated'] = temp['Book-Rating']
recommended_books

Unnamed: 0,ISBN,Recommended Book,Assumed Rating,userId,Book Read,Rated
0,671510053,SHIPPING NEWS,7.467531,106225,Girl with a Pearl Earring,10
1,1558743669,"A Child Called \It\"": One Child's Courage to S...",7.000075,106225,Angela's Ashes (MMP) : A Memoir,9
2,60930535,The Poisonwood Bible: A Novel,6.980253,106225,"Tuesdays with Morrie: An Old Man, a Young Man,...",9
3,345339681,The Hobbit : The Enchanting Prelude to The Lor...,6.980253,106225,A Painted House,9
4,446672211,Where the Heart Is (Oprah's Book Club (Paperba...,6.980202,106225,The Lovely Bones: A Novel,8
5,345378490,Congo,6.967019,106225,Good in Bed,8
6,375727345,House of Sand and Fog,6.927949,106225,White Oleander : A Novel (Oprah's Book Club),8
7,345339703,The Fellowship of the Ring (The Lord of the Ri...,6.609094,106225,The Secret Life of Bees,8
8,60959037,Prodigal Summer: A Novel,6.382191,106225,The Brethren,6
9,375700757,Cold Mountain : A Novel,6.223824,106225,The Handmaid's Tale,0
