In [5]:
from datetime import date

import numpy as np
import pandas as pd

In [27]:
# Minimum number of ratings a single book needs in order to be used by the
# recommender. Later cells use it both as the filter threshold and as the
# weight given to DEFAULT_RATING in the weighted rating.
MIN_BOOK_RATINGS_NO = 100

# Neutral rating of a book; the weighted rating pulls averages towards it.
DEFAULT_RATING = 5

## Functions

In [28]:
def calc_weighted_rating(row, avg_rating, num_of_ratings, min_thres, default_rating):
    """Blend an observed average rating with a default rating.

    The result is
    ``(avg * n + min_thres * default_rating) / (n + min_thres)``
    with ``avg = row[avg_rating]`` and ``n = row[num_of_ratings]``, i.e. the
    default rating is counted as if it had been given ``min_thres`` times.

    Parameters
    ----------
    row : object indexable by ``avg_rating`` and ``num_of_ratings``
        Holds the average rating and the number of ratings.
    avg_rating : key of the average rating inside ``row``.
    num_of_ratings : key of the number of ratings inside ``row``.
    min_thres : weight given to ``default_rating``.
    default_rating : value the result is pulled towards.

    Returns
    -------
    The weighted rating, in whatever numeric type the arithmetic on the
    values of ``row`` produces.
    """
    observed_avg = row[avg_rating]
    votes = row[num_of_ratings]

    observed_total = observed_avg * votes
    prior_total = min_thres * default_rating
    return (observed_total + prior_total) / (votes + min_thres)

## Read and check datasets

In [2]:
# Load the three processed tables: books, users and ratings.
books_df = pd.read_csv('../data/processed_data/Books.csv')
users_df = pd.read_csv('../data/processed_data/Users.csv')
ratings_df = pd.read_csv('../data/processed_data/Ratings.csv')

# Report (rows, columns) of every table, one per line.
print(
    f"books_df.shape = {books_df.shape}",
    f"users_df.shape = {users_df.shape}",
    f"ratings_df.shape = {ratings_df.shape}",
    sep="\n",
)

books_df.shape = (271360, 8)
users_df.shape = (278858, 3)
ratings_df.shape = (1149780, 3)


In [3]:
# Per-column count and percentage of missing values in books_df.
missing_info_df = (
    books_df
    .isnull()
    .sum()
    .rename('missing_count')
    .to_frame()
    .assign(
        missing_percentage=lambda counts: (counts['missing_count'] / len(books_df)) * 100
    )
    .rename_axis('column')
    .reset_index()
)

# Expectation noted for this check: 'author' -> 0 and 'image-url-l' -> 0 missing values
missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,isbn,0,0.0
1,title,0,0.0
2,author,0,0.0
3,publication_year,0,0.0
4,publisher,2,0.000737
5,image-url-s,0,0.0
6,image-url-m,0,0.0
7,image-url-l,0,0.0


In [6]:
# Sanity check: no book may be published after the current year,
# so the frame displayed here should have zero rows.
published_in_future = books_df['publication_year'] > date.today().year
books_df.loc[published_in_future]

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l


In [7]:
# Peek at the first two books
books_df.head(n=2)

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [8]:
# Peek at the first two users
users_df.head(n=2)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [9]:
# Peek at the first two ratings
ratings_df.head(n=2)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5


## Popularity Based Model

The task is to find the top-n books with the highest `avg_rating` value.  
However, the exploration in '1_books_dataset_EDA.ipynb' showed that some books have very few ratings (1-5) but a high `avg_rating`. Ranking on the raw average would therefore be biased towards books whose average rests on only a handful of ratings.  
To prevent this, only books with at least `MIN_BOOK_RATINGS_NO` (100) ratings are considered, and the final ranking uses a weighted rating that pulls each book's average towards `DEFAULT_RATING`; the fewer ratings a book has, the stronger the pull.

In [15]:
# Attach the book metadata to every rating. The join is an inner join on
# 'isbn', so ratings whose ISBN is absent from books_df are dropped.
rating_book_df = pd.merge(ratings_df, books_df, how='inner', on='isbn')

# Compare the input shapes with the shape of the merged frame.
print(
    f"books_df.shape = {books_df.shape}",
    f"ratings_df.shape = {ratings_df.shape}",
    f"rating_book_df.shape = {rating_book_df.shape}\n",
    sep="\n",
)

rating_book_df.head(3)

books_df.shape = (271360, 8)
ratings_df.shape = (1149780, 3)
rating_book_df.shape = (1031136, 10)



Unnamed: 0,user_id,isbn,rating,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...


In [17]:
# Per-title rating statistics: number of ratings ('ratings_no') and their
# mean ('avg_rating').
# Only 'title' and 'rating' take part in the aggregation, so the frame is
# grouped directly instead of first copying it to drop the image-url columns,
# and named aggregation produces the final column names without a rename step.
# NOTE(review): ratings equal to 0 are included in both the count and the mean.
# If 0 encodes "no explicit rating" rather than a real score, those rows should
# be filtered out before aggregating - TODO confirm against the dataset description.
# NOTE(review): grouping by 'title' pools every ISBN that shares a title -
# TODO confirm this is intended.
book_rated_df = (
    rating_book_df
    .groupby('title')
    ['rating']
    .agg(ratings_no='count', avg_rating='mean')
    .reset_index()
)

book_rated_df.head(5)

Unnamed: 0,title,ratings_no,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.25
1,Always Have Popsicles,1,0.0
2,Apple Magic (The Collector's series),1,0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.0
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.0


In [21]:
# Preview the books that have at least MIN_BOOK_RATINGS_NO ratings
# (the filter is computed once and reused for the count and the preview).
frequently_rated_df = book_rated_df[book_rated_df['ratings_no'] >= MIN_BOOK_RATINGS_NO]

print(
    "len(book_rated_df[book_rated_df['ratings_no'] >= MIN_BOOK_RATINGS_NO]) = "
    f"{len(frequently_rated_df)}"
)

frequently_rated_df.head(5)

len(book_rated_df[book_rated_df['ratings_no'] >= MIN_BOOK_RATINGS_NO]) = 914


Unnamed: 0,title,ratings_no,avg_rating
764,1984,284,4.454225
818,1st to Die: A Novel,509,3.575639
1002,24 Hours,106,2.424528
1048,2nd Chance,356,3.269663
1266,4 Blondes,151,1.94702


In [22]:
# Keep only the books with at least MIN_BOOK_RATINGS_NO ratings and order
# them from the highest to the lowest average rating.
book_rated_df = (
    book_rated_df
    .loc[lambda books: books['ratings_no'] >= MIN_BOOK_RATINGS_NO]
    .sort_values('avg_rating', ascending=False)
)
book_rated_df.head(5)

Unnamed: 0,title,ratings_no,avg_rating
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
195672,The Little Prince,141,5.815603
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441


In [29]:
# Weighted rating: each book's average is pulled towards DEFAULT_RATING, with
# DEFAULT_RATING weighted as MIN_BOOK_RATINGS_NO ratings.
# calc_weighted_rating only indexes its first argument by column name and then
# applies element-wise arithmetic, so passing the whole frame computes all rows
# in one vectorised step instead of one Python call per row via apply(axis=1).
book_rated_df['weighted_rating'] = calc_weighted_rating(
    book_rated_df,
    'avg_rating',
    'ratings_no',
    MIN_BOOK_RATINGS_NO,
    DEFAULT_RATING,
)
book_rated_df.head(3)

Unnamed: 0,title,ratings_no,avg_rating,weighted_rating
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804,5.691288
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289,5.655031
195672,The Little Prince,141,5.815603,5.477178


In [30]:
# Ten best books according to the weighted rating
(
    book_rated_df
    .sort_values('weighted_rating', ascending=False)
    .head(n=10)
)

Unnamed: 0,title,ratings_no,avg_rating,weighted_rating
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804,5.691288
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289,5.655031
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741,5.542328
195672,The Little Prince,141,5.815603,5.477178
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441,5.389262
60582,Ender's Game (Ender Wiggins Saga (Paperback)),249,5.409639,5.292264
80414,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453,5.155488
201970,The Perks of Being a Wallflower,104,5.144231,5.073529
191612,The Hobbit : The Enchanting Prelude to The Lor...,281,5.007117,5.005249
187377,The Fellowship of the Ring (The Lord of the Ri...,368,4.94837,4.959402
