In [1]:
from datetime import date
import pickle

import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
MIN_BOOK_RATINGS_NO_PB = 100   # The minimum number of ratings for a single book to be used to build a Popularity Based Recommender System
DEFAULT_RATING = 5   # The neutral rating of the book
MIN_USER_RATINGS_NO_CFB = 200   # The minimum number of ratings per user required to use user data in the recommender system
MIN_BOOK_RATINGS_NO_CFB = 50   # The minimum number of ratings for a single book to be used to build a Collaborative Filtering Based Recommender System

## Functions

In [3]:
def calc_weighted_rating(row, avg_rating, num_of_ratings, min_thres, default_rating):
    """
    Calculates the weighted rating for a book based on its average rating, number of ratings, and a minimum threshold.

    Args:
        row (pd.Series): A row from the DataFrame containing the book's data.
        avg_rating (str): The column name for the average rating of the book.
        num_of_ratings (str): The column name for the number of ratings the book has received.
        min_thres (int): The minimum threshold for the number of ratings to be considered for the weighted rating.
        default_rating (float): The neutral rating of the book.

    Returns:
        float: The calculated weighted rating for the book.
    """
    
    weighted_rating = ((row[avg_rating] * row[num_of_ratings]) + 
      (min_thres * default_rating))/(row[num_of_ratings] + min_thres)
    return weighted_rating

In [4]:
def recommend_book(books_df, book_name, pivot_table, similarity_scores, recommend_books_no=5,
                   books_df_cols={
                      'book_isbn': 'isbn',
                      'book_title': 'title',
                      'book_author': 'author',
                      'book_image': 'image_url'
                   }):
    """
    Recommends books similar to the given book based on similarity scores.

    Args:
        books_df (pd.DataFrame): The DataFrame containing the books data.
        book_name (str): The name of the book for which recommendations are to be made.
        pivot_table (pd.DataFrame or scipy.sparse.csr_matrix): The pivot table containing book-user interactions.
        similarity_scores (np.ndarray): The matrix of cosine similarity scores.
        recommend_books_no (int, optional): The number of similar books to recommend. Default is 5.
        books_df_cols (dict, optional): A dictionary mapping the standard column names to the DataFrame column names.
            - 'book_isbn' (str): Column name for the book's ISBN.
            - 'book_title' (str): Column name for the book's title.
            - 'book_author' (str): Column name for the book's author.
            - 'book_image' (str): Column name for the book's image URL.
        
    Raises:
        ValueError: If the specified book is not found in the pivot table.
    
    Returns:
        list: A list of lists, each containing the details of a recommended book (ISBN, title, author, image URL).
    """
    
    if book_name not in pivot_table.index:
        raise ValueError(f"Book '{book_name}' not found in the dataset.")
        
    index = np.where(pivot_table.index == book_name)[0][0]
    similar_items = sorted(list(enumerate(similarity_scores[index])), key=lambda x: x[1], reverse=True)[1:(recommend_books_no + 1)]
    
    data = []
    for i in similar_items:
        item = []
        temp_df = books_df[books_df['title'] == pivot_table.index[i[0]]]
        temp_df = temp_df.drop_duplicates(books_df_cols['book_title'])
        
        item.extend(list(temp_df[books_df_cols['book_isbn']].values))
        item.extend(list(temp_df[books_df_cols['book_title']].values))
        item.extend(list(temp_df[books_df_cols['book_author']].values))
        item.extend(list(temp_df[books_df_cols['book_image']].values))
        
        data.append(item)
    
    return data

## Read and check datasets

In [5]:
books_df = pd.read_csv('../data/db_data/Books.csv')
users_df = pd.read_csv('../data/db_data/Users.csv')
ratings_df = pd.read_csv('../data/db_data/Ratings.csv')

print(f"books_df.shape = {books_df.shape}")
print(f"users_df.shape = {users_df.shape}")
print(f"ratings_df.shape = {ratings_df.shape}")

books_df.shape = (271358, 6)
users_df.shape = (278858, 3)
ratings_df.shape = (1031136, 3)


In [6]:
missing_info_df = pd.DataFrame({
    'missing_count': books_df.isnull().sum(),
    'missing_percentage': (books_df.isnull().sum() / len(books_df)) * 100
}).reset_index().rename(columns={'index': 'column'})

# author - 0, image-url-l - 0
missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,isbn,0,0.0
1,title,0,0.0
2,author,0,0.0
3,publication_year,0,0.0
4,publisher,2,0.000737
5,image_url,0,0.0


In [7]:
# Should be zero records
books_df[books_df['publication_year'] > date.today().year]

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url


In [8]:
books_df.head(2)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [9]:
users_df.head(2)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [10]:
ratings_df.head(2)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5


## Popularity Based Recommendation System

The task is to find the top-n books with the highest avg_rating value.  
However, when examining the dataset in the file '1_books_dataset_EDA.ipynb', it was possible to observe a situation where certain books had a low number of ratings (1-5) and a high value of avg_rating. As a result, the system will be biased and proportionally incorrect.  
Therefore, to prevent such a problem, it is advisable to consider only books that have, for example, more than 100 ratings.

In [11]:
rating_book_df = ratings_df.merge(books_df, on='isbn')

print(f"books_df.shape = {books_df.shape}")
print(f"ratings_df.shape = {ratings_df.shape}")
print(f"rating_book_df.shape = {rating_book_df.shape}\n")

rating_book_df.head(3)

books_df.shape = (271358, 6)
ratings_df.shape = (1031136, 3)
rating_book_df.shape = (1031136, 8)



Unnamed: 0,user_id,isbn,rating,title,author,publication_year,publisher,image_url
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...


In [12]:
book_rated_df = (
    rating_book_df
    .groupby('title')
    ['rating']
    .agg(['count', 'mean'])
    .reset_index()
    .rename(
        columns={
            'count': 'ratings_no',
            'mean': 'avg_rating'
        }
    )
)

book_rated_df.head(5)

Unnamed: 0,title,ratings_no,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.25
1,Always Have Popsicles,1,0.0
2,Apple Magic (The Collector's series),1,0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.0
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.0


In [13]:
book_rated_df = books_df.merge(book_rated_df, on='title', how='inner').drop_duplicates('title')
print(f"books_df.shape = {books_df.shape}; book_rated_df.shape = {book_rated_df.shape}")

books_df.shape = (271358, 6); book_rated_df.shape = (241071, 8)


In [14]:
# Leave only books with more than MIN_BOOK_RATINGS_NO ratings
print(
    f"len(book_rated_df[book_rated_df['ratings_no'] >= MIN_BOOK_RATINGS_NO_PB]) = "
    + f"{len(book_rated_df[book_rated_df['ratings_no'] >= MIN_BOOK_RATINGS_NO_PB])}"
)

book_rated_df[book_rated_df['ratings_no'] >= MIN_BOOK_RATINGS_NO_PB].head(5)

len(book_rated_df[book_rated_df['ratings_no'] >= MIN_BOOK_RATINGS_NO_PB]) = 914


Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,ratings_no,avg_rating
5,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,311,2.996785
18,440234743,The Testament,John Grisham,1999,Dell,http://images.amazon.com/images/P/0440234743.0...,617,3.179903
19,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,http://images.amazon.com/images/P/0452264464.0...,180,3.411111
26,971880107,Wild Animus,Rich Shapero,2004,Too Far,http://images.amazon.com/images/P/0971880107.0...,2502,1.019584
27,345402871,Airframe,Michael Crichton,1997,Ballantine Books,http://images.amazon.com/images/P/0345402871.0...,332,2.900602


In [18]:
book_rated_df = (
    book_rated_df
    [book_rated_df['ratings_no'] >= MIN_BOOK_RATINGS_NO_PB]
    .sort_values(by='avg_rating', ascending=False)
)
book_rated_df.head(5)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,ratings_no,avg_rating,weighted_rating
3835,0439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,428,5.852804,5.691288
5427,0439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,387,5.824289,5.655031
388,0156528207,The Little Prince,Antoine de Saint-ExupÃ©ry,1968,Harcourt,http://images.amazon.com/images/P/0156528207.0...,141,5.815603,5.477178
2806,0590353403,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,278,5.73741,5.542328
5502,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,http://images.amazon.com/images/P/043935806X.0...,347,5.501441,5.389262


In [19]:
book_rated_df['weighted_rating'] = (
    book_rated_df
    .apply(
            lambda x: calc_weighted_rating(x, 'avg_rating', 'ratings_no', MIN_BOOK_RATINGS_NO_PB, DEFAULT_RATING), 
        axis=1)
)
book_rated_df.head(3)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,ratings_no,avg_rating,weighted_rating
3835,439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,428,5.852804,5.691288
5427,439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,387,5.824289,5.655031
388,156528207,The Little Prince,Antoine de Saint-ExupÃ©ry,1968,Harcourt,http://images.amazon.com/images/P/0156528207.0...,141,5.815603,5.477178


In [20]:
book_rated_df.sort_values(by='weighted_rating', ascending=False).head(10)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,ratings_no,avg_rating,weighted_rating
3835,0439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,428,5.852804,5.691288
5427,0439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,387,5.824289,5.655031
2806,0590353403,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,278,5.73741,5.542328
388,0156528207,The Little Prince,Antoine de Saint-ExupÃ©ry,1968,Harcourt,http://images.amazon.com/images/P/0156528207.0...,141,5.815603,5.477178
5502,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,http://images.amazon.com/images/P/043935806X.0...,347,5.501441,5.389262
946,0312853238,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1992,Tor Books,http://images.amazon.com/images/P/0312853238.0...,249,5.409639,5.292264
3455,0439064872,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...,556,5.183453,5.155488
1150,0671027344,The Perks of Being a Wallflower,Stephen Chbosky,1999,MTV,http://images.amazon.com/images/P/0671027344.0...,104,5.144231,5.073529
3350,0345339681,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,1986,Del Rey,http://images.amazon.com/images/P/0345339681.0...,281,5.007117,5.005249
1568,0345339703,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,1986,Del Rey,http://images.amazon.com/images/P/0345339703.0...,368,4.94837,4.959402


In [21]:
book_rated_df.sort_values(by='weighted_rating', ascending=False).head(10)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,ratings_no,avg_rating,weighted_rating
3835,0439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,428,5.852804,5.691288
5427,0439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,387,5.824289,5.655031
2806,0590353403,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,278,5.73741,5.542328
388,0156528207,The Little Prince,Antoine de Saint-ExupÃ©ry,1968,Harcourt,http://images.amazon.com/images/P/0156528207.0...,141,5.815603,5.477178
5502,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,http://images.amazon.com/images/P/043935806X.0...,347,5.501441,5.389262
946,0312853238,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1992,Tor Books,http://images.amazon.com/images/P/0312853238.0...,249,5.409639,5.292264
3455,0439064872,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...,556,5.183453,5.155488
1150,0671027344,The Perks of Being a Wallflower,Stephen Chbosky,1999,MTV,http://images.amazon.com/images/P/0671027344.0...,104,5.144231,5.073529
3350,0345339681,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,1986,Del Rey,http://images.amazon.com/images/P/0345339681.0...,281,5.007117,5.005249
1568,0345339703,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,1986,Del Rey,http://images.amazon.com/images/P/0345339703.0...,368,4.94837,4.959402


In [27]:
check_book_rated_df = pd.read_csv('../artifacts/data/popularity_based_df.csv')

print(f"book_rated_df.shape = {book_rated_df.shape}; check_book_rated_df.shape = {check_book_rated_df.shape}")
check_book_rated_df.head(10)

book_rated_df.shape = (914, 9); check_book_rated_df.shape = (914, 9)


Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,ratings_no,avg_rating,weighted_rating
0,0439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,428,5.852804,5.691288
1,0439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,387,5.824289,5.655031
2,0156528207,The Little Prince,Antoine de Saint-ExupÃ©ry,1968,Harcourt,http://images.amazon.com/images/P/0156528207.0...,141,5.815603,5.477178
3,0590353403,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,278,5.73741,5.542328
4,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,http://images.amazon.com/images/P/043935806X.0...,347,5.501441,5.389262
5,0312853238,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1992,Tor Books,http://images.amazon.com/images/P/0312853238.0...,249,5.409639,5.292264
6,0439064872,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...,556,5.183453,5.155488
7,0671027344,The Perks of Being a Wallflower,Stephen Chbosky,1999,MTV,http://images.amazon.com/images/P/0671027344.0...,104,5.144231,5.073529
8,0345339681,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,1986,Del Rey,http://images.amazon.com/images/P/0345339681.0...,281,5.007117,5.005249
9,0345339703,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,1986,Del Rey,http://images.amazon.com/images/P/0345339703.0...,368,4.94837,4.959402


## Collaborative Filtering Based Recommendation System

### Preliminary data preparation

Before implementing the Collaborative Filtering Based Recommender System, it should be noted that:
1) The system will consider only those users who have left more than MIN_USER_RATINGS_NO ratings for different books.
2) Only those books with more than MIN_BOOK_RATINGS_NO_CFB ratings will be recommended.

In [34]:
rating_book_df = ratings_df.merge(books_df, on='isbn')

print(f"books_df.shape = {books_df.shape}")
print(f"ratings_df.shape = {ratings_df.shape}")
print(f"rating_book_df.shape = {rating_book_df.shape}\n")

rating_book_df.head(3)

books_df.shape = (271358, 6)
ratings_df.shape = (1031136, 3)
rating_book_df.shape = (1031136, 8)



Unnamed: 0,user_id,isbn,rating,title,author,publication_year,publisher,image_url
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...


In [35]:
x = rating_book_df.groupby('user_id').count()
x = x[x['rating'] > MIN_USER_RATINGS_NO_CFB]
x

Unnamed: 0_level_0,isbn,rating,title,author,publication_year,publisher,image_url
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
254,300,300,300,300,300,300,300
2276,456,456,456,456,456,456,456
2766,269,269,269,269,269,269,269
2977,227,227,227,227,227,227,227
3363,890,890,890,890,890,890,890
...,...,...,...,...,...,...,...
274308,1293,1293,1293,1293,1293,1293,1293
275970,1325,1325,1325,1325,1325,1325,1325
277427,490,490,490,490,490,490,490
277639,265,265,265,265,265,265,265


In [36]:
print(f"x.index = {x.index}")

cfb_rs_user_ids = x.index
print(f"cfb_rs_user_ids = {cfb_rs_user_ids}")

x.index = Index([   254,   2276,   2766,   2977,   3363,   4017,   4385,   6251,   6323,
         6543,
       ...
       271705, 273979, 274004, 274061, 274301, 274308, 275970, 277427, 277639,
       278418],
      dtype='int64', name='user_id', length=811)
cfb_rs_user_ids = Index([   254,   2276,   2766,   2977,   3363,   4017,   4385,   6251,   6323,
         6543,
       ...
       271705, 273979, 274004, 274061, 274301, 274308, 275970, 277427, 277639,
       278418],
      dtype='int64', name='user_id', length=811)


In [37]:
user_filtered_ratings_df = rating_book_df[rating_book_df['user_id'].isin(cfb_rs_user_ids)]

print(f"rating_book_df.shape = {rating_book_df.shape}")
print(f"user_filtered_ratings_df.shape = {user_filtered_ratings_df.shape}\n")

user_filtered_ratings_df.head(5)

rating_book_df.shape = (1031136, 8)
user_filtered_ratings_df.shape = (474007, 8)



Unnamed: 0,user_id,isbn,rating,title,author,publication_year,publisher,image_url
1150,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1151,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...
1152,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,http://images.amazon.com/images/P/003008685X.0...
1153,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,http://images.amazon.com/images/P/0030615321.0...
1154,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,http://images.amazon.com/images/P/0060002050.0...


In [38]:
y = user_filtered_ratings_df.groupby('title').count()['rating'] >= MIN_BOOK_RATINGS_NO_CFB
famous_books = y[y].index

print(f"len(famous_books) = {len(famous_books)}")

len(famous_books) = 706


In [39]:
final_ratings_df = user_filtered_ratings_df[user_filtered_ratings_df['title'].isin(famous_books)]

print(f"rating_book_df.shape = {rating_book_df.shape}")
print(f"user_filtered_ratings_df.shape = {user_filtered_ratings_df.shape}")
print(f"final_ratings_df.shape = {final_ratings_df.shape}\n")

final_ratings_df.head(5)

rating_book_df.shape = (1031136, 8)
user_filtered_ratings_df.shape = (474007, 8)
final_ratings_df.shape = (58586, 8)



Unnamed: 0,user_id,isbn,rating,title,author,publication_year,publisher,image_url
1150,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1163,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...
1165,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...
1168,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...
1174,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett,1998,HarperTrophy,http://images.amazon.com/images/P/006440188X.0...


### Create a pivot table

In [40]:
pivot_table = final_ratings_df.pivot_table(index='title', columns='user_id', values='rating')
pivot_table.fillna(0, inplace=True)
pivot_table

user_id,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create a Collaborative Filtering Based Recommendation System 

In [41]:
similarity_scores = cosine_similarity(pivot_table)

print(f"similarity_scores.shape = {similarity_scores.shape}")
print(f"similarity_scores[0][:10] = {similarity_scores[0][:10]}")

similarity_scores.shape = (706, 706)
similarity_scores[0][:10] = [1.         0.10255025 0.01220856 0.         0.05367224 0.02774901
 0.08216491 0.13732869 0.03261686 0.03667591]


In [44]:
res = recommend_book(books_df=books_df, book_name='1984', pivot_table=pivot_table, similarity_scores=similarity_scores)
res

[['0451526341',
  'Animal Farm',
  'George Orwell',
  'http://images.amazon.com/images/P/0451526341.01.MZZZZZZZ.jpg'],
 ['0449212602',
  "The Handmaid's Tale",
  'Margaret Atwood',
  'http://images.amazon.com/images/P/0449212602.01.MZZZZZZZ.jpg'],
 ['0060809833',
  'Brave New World',
  'Aldous Huxley',
  'http://images.amazon.com/images/P/0060809833.01.MZZZZZZZ.jpg'],
 ['0345313860',
  'The Vampire Lestat (Vampire Chronicles, Book II)',
  'ANNE RICE',
  'http://images.amazon.com/images/P/0345313860.01.MZZZZZZZ.jpg'],
 ['0312243022',
  'The Hours : A Novel',
  'Michael Cunningham',
  'http://images.amazon.com/images/P/0312243022.01.MZZZZZZZ.jpg']]

In [45]:
res = (
    recommend_book(books_df=books_df, 
                   book_name="Harry Potter and the Sorcerer's Stone (Book 1)", 
                   pivot_table=pivot_table, 
                   similarity_scores=similarity_scores,
                   recommend_books_no=10
                  )
)
res

[['0439064872',
  'Harry Potter and the Chamber of Secrets (Book 2)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439064872.01.MZZZZZZZ.jpg'],
 ['0439136350',
  'Harry Potter and the Prisoner of Azkaban (Book 3)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439136350.01.MZZZZZZZ.jpg'],
 ['0439139597',
  'Harry Potter and the Goblet of Fire (Book 4)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439139597.01.MZZZZZZZ.jpg'],
 ['043935806X',
  'Harry Potter and the Order of the Phoenix (Book 5)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/043935806X.01.MZZZZZZZ.jpg'],
 ['0345350499',
  'The Mists of Avalon',
  'MARION ZIMMER BRADLEY',
  'http://images.amazon.com/images/P/0345350499.01.MZZZZZZZ.jpg'],
 ['0399146431',
  "The Bonesetter's Daughter",
  'Amy Tan',
  'http://images.amazon.com/images/P/0399146431.01.MZZZZZZZ.jpg'],
 ['0380564998',
  'Jacob Have I Loved',
  'Katherine Paterson',
  'http://images.amazon.com/images/P/0380564998.01.

In [46]:
try:
    res = (
        recommend_book(books_df=books_df, 
                       book_name="stardust", 
                       pivot_table=pivot_table, 
                       similarity_scores=similarity_scores,
                       recommend_books_no=5
                      )
    )
    res
except ValueError as e:
    print(e)

Book 'stardust' not found in the dataset.


In [47]:
try:
    res = (
        recommend_book(books_df=books_df, 
                       book_name="The Mists of Avalon", 
                       pivot_table=pivot_table, 
                       similarity_scores=similarity_scores,
                       recommend_books_no=5
                      )
    )
    print(f"res = {res}")
except ValueError as e:
    print(e)

res = [['0590353403', "Harry Potter and the Sorcerer's Stone (Book 1)", 'J. K. Rowling', 'http://images.amazon.com/images/P/0590353403.01.MZZZZZZZ.jpg'], ['0142001430', 'Year of Wonders', 'Geraldine Brooks', 'http://images.amazon.com/images/P/0142001430.01.MZZZZZZZ.jpg'], ['0439064872', 'Harry Potter and the Chamber of Secrets (Book 2)', 'J. K. Rowling', 'http://images.amazon.com/images/P/0439064872.01.MZZZZZZZ.jpg'], ['0440215625', 'Dragonfly in Amber', 'DIANA GABALDON', 'http://images.amazon.com/images/P/0440215625.01.MZZZZZZZ.jpg'], ['043935806X', 'Harry Potter and the Order of the Phoenix (Book 5)', 'J. K. Rowling', 'http://images.amazon.com/images/P/043935806X.01.MZZZZZZZ.jpg']]


### Create a Collaborative Filtering Based Recommendation System with sparce matrix

In [48]:
sparse_table = csr_matrix(pivot_table)

similarity_scores = cosine_similarity(sparse_table)

print(f"similarity_scores.shape = {similarity_scores.shape}")
print(f"similarity_scores[0][:10] = {similarity_scores[0][:10]}")

similarity_scores.shape = (706, 706)
similarity_scores[0][:10] = [1.         0.10255025 0.01220856 0.         0.05367224 0.02774901
 0.08216491 0.13732869 0.03261686 0.03667591]


In [49]:
res = recommend_book(books_df=books_df, book_name='1984', pivot_table=pivot_table, similarity_scores=similarity_scores)
res

[['0451526341',
  'Animal Farm',
  'George Orwell',
  'http://images.amazon.com/images/P/0451526341.01.MZZZZZZZ.jpg'],
 ['0449212602',
  "The Handmaid's Tale",
  'Margaret Atwood',
  'http://images.amazon.com/images/P/0449212602.01.MZZZZZZZ.jpg'],
 ['0060809833',
  'Brave New World',
  'Aldous Huxley',
  'http://images.amazon.com/images/P/0060809833.01.MZZZZZZZ.jpg'],
 ['0345313860',
  'The Vampire Lestat (Vampire Chronicles, Book II)',
  'ANNE RICE',
  'http://images.amazon.com/images/P/0345313860.01.MZZZZZZZ.jpg'],
 ['0312243022',
  'The Hours : A Novel',
  'Michael Cunningham',
  'http://images.amazon.com/images/P/0312243022.01.MZZZZZZZ.jpg']]

In [50]:
res = (
    recommend_book(books_df=books_df, 
                   book_name="Harry Potter and the Sorcerer's Stone (Book 1)", 
                   pivot_table=pivot_table, 
                   similarity_scores=similarity_scores,
                   recommend_books_no=8
                  )
)
res

[['0439064872',
  'Harry Potter and the Chamber of Secrets (Book 2)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439064872.01.MZZZZZZZ.jpg'],
 ['0439136350',
  'Harry Potter and the Prisoner of Azkaban (Book 3)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439136350.01.MZZZZZZZ.jpg'],
 ['0439139597',
  'Harry Potter and the Goblet of Fire (Book 4)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439139597.01.MZZZZZZZ.jpg'],
 ['043935806X',
  'Harry Potter and the Order of the Phoenix (Book 5)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/043935806X.01.MZZZZZZZ.jpg'],
 ['0345350499',
  'The Mists of Avalon',
  'MARION ZIMMER BRADLEY',
  'http://images.amazon.com/images/P/0345350499.01.MZZZZZZZ.jpg'],
 ['0399146431',
  "The Bonesetter's Daughter",
  'Amy Tan',
  'http://images.amazon.com/images/P/0399146431.01.MZZZZZZZ.jpg'],
 ['0380564998',
  'Jacob Have I Loved',
  'Katherine Paterson',
  'http://images.amazon.com/images/P/0380564998.01.

In [51]:
try:
    res = (
        recommend_book(books_df=books_df, 
                       book_name="stardust", 
                       pivot_table=pivot_table, 
                       similarity_scores=similarity_scores,
                       recommend_books_no=5
                      )
    )
    res
except ValueError as e:
    print(e)

Book 'stardust' not found in the dataset.


In [52]:
try:
    res = (
        recommend_book(books_df=books_df, 
                       book_name="The Mists of Avalon", 
                       pivot_table=pivot_table, 
                       similarity_scores=similarity_scores,
                       recommend_books_no=5
                      )
    )
    print(f"res = {res}")
except ValueError as e:
    print(e)

res = [['0590353403', "Harry Potter and the Sorcerer's Stone (Book 1)", 'J. K. Rowling', 'http://images.amazon.com/images/P/0590353403.01.MZZZZZZZ.jpg'], ['0142001430', 'Year of Wonders', 'Geraldine Brooks', 'http://images.amazon.com/images/P/0142001430.01.MZZZZZZZ.jpg'], ['0439064872', 'Harry Potter and the Chamber of Secrets (Book 2)', 'J. K. Rowling', 'http://images.amazon.com/images/P/0439064872.01.MZZZZZZZ.jpg'], ['0440215625', 'Dragonfly in Amber', 'DIANA GABALDON', 'http://images.amazon.com/images/P/0440215625.01.MZZZZZZZ.jpg'], ['043935806X', 'Harry Potter and the Order of the Phoenix (Book 5)', 'J. K. Rowling', 'http://images.amazon.com/images/P/043935806X.01.MZZZZZZZ.jpg']]


## Save artifacts to separate files

### Popularity Based Recommendation System

In [23]:
book_rated_df.head(3)

Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,ratings_no,avg_rating,weighted_rating
3835,439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,428,5.852804,5.691288
5427,439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,387,5.824289,5.655031
388,156528207,The Little Prince,Antoine de Saint-ExupÃ©ry,1968,Harcourt,http://images.amazon.com/images/P/0156528207.0...,141,5.815603,5.477178


In [26]:
book_rated_df.to_csv('../artifacts/data/popularity_based_df.csv', index=False)

check_book_rated_df = pd.read_csv('../artifacts/data/popularity_based_df.csv')

print(f"book_rated_df.shape = {book_rated_df.shape}; check_book_rated_df.shape = {check_book_rated_df.shape}")
check_book_rated_df.head(3)

book_rated_df.shape = (914, 9); check_book_rated_df.shape = (914, 9)


Unnamed: 0,isbn,title,author,publication_year,publisher,image_url,ratings_no,avg_rating,weighted_rating
0,439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,428,5.852804,5.691288
1,439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,387,5.824289,5.655031
2,156528207,The Little Prince,Antoine de Saint-ExupÃ©ry,1968,Harcourt,http://images.amazon.com/images/P/0156528207.0...,141,5.815603,5.477178


### Collaborative Filtering Based Recommendation System

In [55]:
final_ratings_df.to_csv('../artifacts/data/final_ratings_df.csv', index=False)

check_final_ratings_df = pd.read_csv('../artifacts/data/final_ratings_df.csv')

print(f"final_ratings_df.shape = {final_ratings_df.shape}; check_final_ratings_df.shape = {check_final_ratings_df.shape}")
check_final_ratings_df.head(3)

final_ratings_df.shape = (58586, 8); check_final_ratings_df.shape = (58586, 8)


Unnamed: 0,user_id,isbn,rating,title,author,publication_year,publisher,image_url
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...
2,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...


In [56]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271358 entries, 0 to 271357
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   isbn              271358 non-null  object
 1   title             271358 non-null  object
 2   author            271358 non-null  object
 3   publication_year  271358 non-null  int64 
 4   publisher         271356 non-null  object
 5   image_url         271358 non-null  object
dtypes: int64(1), object(5)
memory usage: 12.4+ MB


Since the books_df is unchanged compared to the books_df from the '1_books_dataset_EDA' file, there is no point in keeping a copy of it in a separate file.

In [57]:
pickle.dump(pivot_table, open('../artifacts/pivot_table.pkl', 'wb'))
pickle.dump(similarity_scores, open('../artifacts/similarity_scores.pkl', 'wb'))