## Popularity-based and Content-based Recommender
#### Data source: https://www.kaggle.com/jealousleopard/goodreadsbooks/download

In [59]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [60]:
df = pd.read_csv('books.csv', on_bad_lines='warn')

In [61]:
df.head(3)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic


In [62]:
df.describe()

Unnamed: 0,bookID,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,11123.0,11123.0,11123.0,11123.0,11123.0,11123.0
mean,21310.856963,3.934075,9759880000000.0,336.405556,17942.85,542.048099
std,13094.727252,0.350485,442975800000.0,241.152626,112499.2,2576.619589
min,1.0,0.0,8987060000.0,0.0,0.0,0.0
25%,10277.5,3.77,9780345000000.0,192.0,104.0,9.0
50%,20287.0,3.96,9780582000000.0,299.0,745.0,47.0
75%,32104.5,4.14,9780872000000.0,416.0,5000.5,238.0
max,45641.0,5.0,9790008000000.0,6576.0,4597666.0,94265.0


In [63]:
len_metadata = len(df['bookID'].unique())

In [64]:
df.head(2).transpose()

Unnamed: 0,0,1
bookID,1,2
title,Harry Potter and the Half-Blood Prince (Harry ...,Harry Potter and the Order of the Phoenix (Har...
authors,J.K. Rowling/Mary GrandPré,J.K. Rowling/Mary GrandPré
average_rating,4.57,4.49
isbn,0439785960,0439358078
isbn13,9780439785969,9780439358071
language_code,eng,eng
num_pages,652,870
ratings_count,2095690,2153167
text_reviews_count,27591,29221


In [65]:
top5AverageRating = df.sort_values(by = 'average_rating',
                                     ascending = False).head(5)
top5AverageRating

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
5476,19788,The Goon Show Volume 11: He's Fallen in the W...,NOT A BOOK,5.0,0563388323,9780563388326,eng,2,2,0,10/2/1995,BBC Physical Audio
5474,19786,The Goon Show Volume 4: My Knees Have Fallen ...,NOT A BOOK,5.0,0563388692,9780563388692,eng,2,3,0,4/1/1996,BBC Physical Audio
624,2034,Comoediae 1: Acharenses/Equites/Nubes/Vespae/P...,Aristophanes/F.W. Hall/W.M. Geldart,5.0,0198145047,9780198145042,grc,364,0,0,2/22/1922,Oxford University Press USA
9847,39580,The American Campaign: U.S. Presidential Campa...,James E. Campbell,5.0,089096940X,9780890969403,eng,314,0,0,5/1/2000,Texas A&M University Press
4788,17224,The Diamond Color Meditation: Color Pathway to...,John Diamond,5.0,1890995525,9781890995522,eng,74,5,3,2/1/2006,Square One Publishers


In [66]:
top5ratings = df.sort_values(by = 'ratings_count',ascending = False).head(5)
top5ratings

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
10336,41865,Twilight (Twilight #1),Stephenie Meyer,3.59,0316015849,9780316015844,eng,501,4597666,94265,9/6/2006,Little Brown and Company
1697,5907,The Hobbit or There and Back Again,J.R.R. Tolkien,4.27,0618260307,9780618260300,eng,366,2530894,32871,8/15/2002,Houghton Mifflin
1462,5107,The Catcher in the Rye,J.D. Salinger,3.8,0316769177,9780316769174,eng,277,2457092,43499,1/30/2001,Back Bay Books
307,960,Angels & Demons (Robert Langdon #1),Dan Brown,3.89,1416524797,9781416524793,eng,736,2418736,21303,4/1/2006,Pocket Books
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.


## Use a Weighted Rating
<font color = "white"> Create a function named popularity Recommender and use it to recommend books based on popularity. <br>
Using a weighted rank similar to that used in the IMDB rating 


In [9]:
df.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

In [10]:
def popularityRecommender(df):
    
    #Define the minimum vote count
    minimum_rating_count = 0.75* df['ratings_count'].max()
    
    #Define C – the mean rating
    mean_rating = df['average_rating'].mean()

    df['weighted_rating'] = (((df['ratings_count']/(df['ratings_count']+minimum_rating_count)) * df['average_rating']) +
                             ((minimum_rating_count/(df['ratings_count']+minimum_rating_count))*mean_rating))

    recommendations = df.sort_values(by = 'weighted_rating',ascending = False).head(5)
    
    return(recommendations)   

In [11]:
top5 = popularityRecommender(df)
top5[["title",'ratings_count','average_rating','weighted_rating',]].head(5)

Unnamed: 0,title,ratings_count,average_rating,weighted_rating
3,Harry Potter and the Prisoner of Azkaban (Harr...,2339585,4.56,4.187089
0,Harry Potter and the Half-Blood Prince (Harry ...,2095690,4.57,4.174464
1,Harry Potter and the Order of the Phoenix (Har...,2153167,4.49,4.147771
4415,Harry Potter and the Chamber of Secrets (Harry...,2293963,4.42,4.128198
23,The Fellowship of the Ring (The Lord of the Ri...,2128944,4.36,4.096661


In [12]:
#The top 5 recommended books based on popularity are:
top5Names = top5["title"].head(5).values
print(f'The top 5 recommended books based on popularity are:\n {top5Names}')

The top 5 recommended books based on popularity are:
 ['Harry Potter and the Prisoner of Azkaban (Harry Potter  #3)'
 'Harry Potter and the Half-Blood Prince (Harry Potter  #6)'
 'Harry Potter and the Order of the Phoenix (Harry Potter  #5)'
 'Harry Potter and the Chamber of Secrets (Harry Potter  #2)'
 'The Fellowship of the Ring (The Lord of the Rings  #1)']


### <font color = "white"> Content Based Recommender
<font color = "white"> Create a function named Content Based Recommender with TF-IDF and use it to recommend books based on content.

In [45]:
# Assign stop words based on language - stop words are words that you don't want to add up in our count, like the, a, of, etc.
from sklearn.feature_extraction.text import TfidfVectorizer
cbr = TfidfVectorizer(stop_words = 'english')

# Replace empty descriptions with a blank "" value and transform the titles of books in our dataset into the matrix
df['title'] = df['title'].fillna('')
tfidf_matrix = cbr.fit_transform(df['title'])

tfidf_matrix.shape

(11123, 11090)

In [46]:
stop_words_list = cbr.get_stop_words()
print(len(stop_words_list), "words in the stop list:\n")
print(stop_words_list)

318 words in the stop list:

frozenset({'eight', 'until', 'others', 'hers', 'somehow', 'go', 'after', 'front', 'eleven', 'then', 'me', 'thereupon', 'further', 'please', 'system', 'a', 'serious', 'our', 'the', 'afterwards', 'himself', 'upon', 'fifty', 'namely', 'otherwise', 'across', 'seeming', 'almost', 'though', 'and', 'more', 'were', 'becomes', 'four', 'beforehand', 'how', 'both', 'sometimes', 'ten', 'for', 'thru', 'latter', 'from', 'over', 'have', 'hereupon', 'their', 'now', 'to', 'would', 'on', 'un', 'fifteen', 'him', 'formerly', 'its', 'several', 'towards', 'becoming', 'these', 'above', 'empty', 'herein', 'inc', 'therefore', 'thence', 'here', 'cry', 'his', 'too', 'top', 'an', 'etc', 'before', 'move', 'bill', 'wherever', 'nowhere', 'been', 'her', 'detail', 'less', 'due', 'once', 're', 'at', 'many', 'some', 'during', 'another', 'either', 'elsewhere', 'most', 'everything', 'hasnt', 'thereafter', 'all', 'or', 'done', 'in', 'via', 'name', 'only', 'one', 'third', 'can', 'something', 'wh

In [47]:
# Take a look at a sample movie description
df['title'][0]

'Harry Potter and the Half-Blood Prince (Harry Potter  #6)'

In [48]:
# Use the lambda function to split the description into words and count the length of the paragraph
df['new_column'] = df.title.apply(lambda x: len(str(x).split(' ')))
df['new_column'][0]

np.int64(10)

In [49]:
# There are 11090 words (not including stop words) used to describe the different books
tfidf_matrix[0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5 stored elements and shape (1, 11090)>

In [50]:
# Look at the vector representing the importance of the words in the document. Cumulatively, they represent the document.
print(tfidf_matrix[0])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5 stored elements and shape (1, 11090)>
  Coords	Values
  (0, 4606)	0.578259465690308
  (0, 7752)	0.6037245307706429
  (0, 4539)	0.3466347512612644
  (0, 1292)	0.2922551986233868
  (0, 7835)	0.3091341608120538


In [51]:
# Assign the instance of our recommender function.
# This is a matrix with a similarity value for every book with every other books in the dataset

from sklearn.metrics.pairwise import cosine_similarity
distance_matrix = cosine_similarity(tfidf_matrix)

# Re-create the indices of our list of books by removing any duplicates if required
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [52]:
distance_matrix.size

123721129

In [53]:
def ContentBasedRecommender(title, indices, distance_matrix):
    # Fix the check to avoid ValueError
    if title not in indices.index:
        print(f"'{title}' not found in the dataset.")
        return []

    id_ = indices[title]  # Get the index of the book
    distances = list(enumerate(distance_matrix[id_])) # Get distances from the similarity/distance matrix
    distances = sorted(distances, key=lambda x: x[1], reverse=True) # Sort by similarity (descending)
    
    top_matches = distances[1:6] # Skip the first match (the book itself)
    recommended_indices = [i[0] for i in top_matches] # Extract the indices of the recommended books

    # Return the corresponding titles
    return df['title'].iloc[recommended_indices].reset_index(drop=True)


In [54]:
print(type(distance_matrix))               # Should be <class 'numpy.ndarray'>
print(distance_matrix.shape)              # Should be (N, N), where N = number of books
print(distance_matrix[0][0])              # Should be a float like 1.0, not an array
print(type(distance_matrix[0][0]))  

<class 'numpy.ndarray'>
(11123, 11123)
1.0
<class 'numpy.float64'>


In [55]:

# Example usage
book_title = "Changeling (Changeling  #1)"
recommendations = ContentBasedRecommender(book_title, indices, distance_matrix)
print("Top Recommendations:")
print(recommendations)

Top Recommendations:
0                                       The Changeling
1                                       The Changeling
2                                           Changeling
3                                   The Changeling Sea
4    A Changeling for All Seasons (Changeling Seaso...
Name: title, dtype: object


In [56]:
# Define a function that takes the re-indexed dataset, finds the 6 most similar titles 
#to a chosen title based on the
# similarity of the words in the titles,
# and returns the top 5, (not) including itself, which will be the best match. 

def ContentBasedRecommender(title, indices, distance_matrix):
    if title not in indices:
        print(f"'{title}' not found in the dataset.")
        return []
        
    id_ = indices[title] #Fetch the index of the book we will enter
    
    #List of tuples with distance for each book to the entered movie (2 cols = id and distance)
    distances = list(enumerate(distance_matrix[id_])) 
    
    #sort the book by the distance function, which is in column[1]
    distances = sorted(distances, key=lambda x: x[1], reverse = True) 
    
    distances = distances[1:6] # Get the 5 best scores, not including itself
    print(distances)
    
    # get the indices of the top 5
    recommendations = [distance[0] for distance in distances] 
    
    # return those recommendation names by pulling title from the given 5 indices
    return df['title'].iloc[recommendations] 

In [57]:
#Pick a title and see the resulting recommendations:
book_title = "Harry Potter and the Order of the Phoenix (Harry Potter  #5)"
recommendations = ContentBasedRecommender(book_title, indices, distance_matrix)
print(recommendations)

[(6, np.float64(0.802723259014784)), (10675, np.float64(0.7596836050891125)), (2, np.float64(0.729409121003878)), (4415, np.float64(0.729409121003878)), (10674, np.float64(0.7259450162060239))]
6             Harry Potter Collection (Harry Potter  #1-6)
10675    Harry Potter and the Goblet of Fire (Harry Pot...
2        Harry Potter and the Chamber of Secrets (Harry...
4415     Harry Potter and the Chamber of Secrets (Harry...
10674    Harry Potter and the Philosopher's Stone (Harr...
Name: title, dtype: object
