In [1]:
from datetime import date
import time

import numpy as np
import pandas as pd

from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
MIN_USERS_OVERLAP_COEFF = 0.5   # The minimum percentage of co-read books between current_user and overlapped_user to be used to create a recommendation
MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF = 2   # The minimum number of books read and rated by current_user before the MIN_USERS_OVERLAP_COEFF factor will be used to filter overlapped_users
SIMILAR_USERS_NO = 15   # The number of similar users whose book preferences will be used to recommend books to the current_user
BOOKS_NO_TO_RECOMMEND = 10   # The number of books that will be recommended to the user based on his/her book list
MIN_USER_RATINGS_NO_CFB = 100   # The minimum number of ratings per user required to use user data in the recommender system
MIN_BOOK_RATINGS_NO_CFB = 3   # The minimum number of ratings for a single book to be used to build a Collaborative Filtering Based Recommender System
DEFAULT_RATING = 5   # The neutral rating of the book

## Functions

In [3]:
def calc_weighted_rating(row, avg_rating, num_of_ratings, min_thres, default_rating):
    """
    Calculates the weighted rating for a book based on its average rating, number of ratings, and a minimum threshold.

    Args:
        row (pd.Series): A row from the DataFrame containing the book's data.
        avg_rating (str): The column name for the average rating of the book.
        num_of_ratings (str): The column name for the number of ratings the book has received.
        min_thres (int): The minimum threshold for the number of ratings to be considered for the weighted rating.
        default_rating (float): The neutral rating of the book.

    Returns:
        float: The calculated weighted rating for the book.
    """
    
    weighted_rating = ((row[avg_rating] * row[num_of_ratings]) + 
      (min_thres * default_rating))/(row[num_of_ratings] + min_thres)
    return weighted_rating

In [4]:
def make_clickable(val, link_title="Amazon Image"):
    """
    Converts a URL into a clickable HTML link.

    Args:
        val (str): The URL to be converted into a clickable link.
        link_title (str, optional): The text to be displayed for the clickable link. Default is "Amazon Image".
        
    Returns:
        str: An HTML string that represents a clickable link.
    """
    
    return '<a target="_blank" href="{}">{}</a>'.format(val, val, link_title)


def show_image(val):
    """
    Converts a URL into an HTML string that displays an image with a clickable link.

    Args:
        val (str): The URL of the image to be displayed and linked.

    Returns:
        str: An HTML string that represents an image which is clickable, linking to the image URL.
    """
    
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

## Read datasets

In [5]:
books_df = pd.read_csv('../data/processed_data/Books.csv')
users_df = pd.read_csv('../data/processed_data/Users.csv')
ratings_df = pd.read_csv('../data/processed_data/Ratings.csv')

print(f"books_df.shape = {books_df.shape}")
print(f"users_df.shape = {users_df.shape}")
print(f"ratings_df.shape = {ratings_df.shape}")

books_df.shape = (271360, 8)
users_df.shape = (278858, 3)
ratings_df.shape = (1149780, 3)


In [8]:
books_df.head(3)

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [9]:
users_df.head(3)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [10]:
ratings_df.head(3)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


## Find users with similar book preferences as the current user

In [11]:
books_user_like = [
    '0590353403',   # Harry Potter and the Sorcerer's Stone (Book 1)
    '0439064872',   # Harry Potter and the Chamber of Secrets (Book 2)
    '0345350499'    # The Mists of Avalon
]

books_user_like_titles = [
    "Harry Potter and the Sorcerer's Stone (Book 1)", 
    "Harry Potter and the Chamber of Secrets (Book 2)",
    "The Mists of Avalon"
]

In [12]:
current_user_id = -1

books_user_like_dict = {
    'user_id': [current_user_id for _ in range(len(books_user_like))],
    'isbn': books_user_like,
    'title': books_user_like_titles,
    'rating': [10, 9, 10]
}

books_user_like_df = pd.DataFrame.from_dict(books_user_like_dict)
books_user_like_df

Unnamed: 0,user_id,isbn,title,rating
0,-1,590353403,Harry Potter and the Sorcerer's Stone (Book 1),10
1,-1,439064872,Harry Potter and the Chamber of Secrets (Book 2),9
2,-1,345350499,The Mists of Avalon,10


In [13]:
books_user_like_set = set(books_user_like_df['isbn'])   # Because theoretically, a user could give several ratings to a book with the same title
print(f"books_user_like_set = {books_user_like_set}")

books_user_like_set = {'0345350499', '0590353403', '0439064872'}


### Filter users

In [14]:
# Filter users in order to reduce overlap users search time
# Leave only users with more than MIN_USER_RATINGS_NO_CFB ratings
user_rating_count_df = (
    ratings_df
    .groupby('user_id')
    ['rating']
    .count()
    .reset_index()
    .rename(columns={'rating': 'user_ratings_no'})
)

print(f"ratings_df['user_id'].nunique() = {ratings_df['user_id'].nunique()}")
print(f"user_rating_count_df.shape = {user_rating_count_df.shape}\n")

print(
    f"len(user_rating_count_df[user_rating_count_df['user_ratings_no'] >= MIN_USER_RATINGS_NO_CFB]) = "
    + f"{len(user_rating_count_df[user_rating_count_df['user_ratings_no'] >= MIN_USER_RATINGS_NO_CFB])}"
)

user_rating_count_df = user_rating_count_df[user_rating_count_df['user_ratings_no'] >= MIN_USER_RATINGS_NO_CFB]
print(f"user_rating_count_df.shape = {user_rating_count_df.shape}\n")
user_rating_count_df.head(5)

ratings_df['user_id'].nunique() = 105283
user_rating_count_df.shape = (105283, 2)

len(user_rating_count_df[user_rating_count_df['user_ratings_no'] >= MIN_USER_RATINGS_NO_CFB]) = 1847
user_rating_count_df.shape = (1847, 2)



Unnamed: 0,user_id,user_ratings_no
68,183,136
95,254,314
177,507,131
294,882,126
490,1424,137


In [15]:
print(f"ratings_df.shape before filtering by user: {ratings_df.shape}")
ratings_df = ratings_df.merge(user_rating_count_df, on='user_id', how='inner')
ratings_df.drop('user_ratings_no', inplace=True, axis=1)
print(f"ratings_df.shape after filtering by user: {ratings_df.shape}")

ratings_df.shape before filtering by user: (1149780, 3)
ratings_df.shape after filtering by user: (658805, 3)


### Filter books

**Filtering books without necessarily saving user-liked books may not be the best idea, because if a user likes a book with a low number of ratings, it is quite possible that it will be filtered out.**

In [16]:
# Filter users in order to reduce overlap users search time
# Leave books current_user liked and books with more than MIN_BOOK_RATINGS_NO_CFB ratings
start_time = time.time()

book_rating_count_df = (
    ratings_df
    .groupby('isbn')
    ['rating']
    .count()
    .reset_index()
    .rename(columns={'rating': 'book_ratings_no'})
)

print(f"book_rating_count_df.shape before book filtering = {book_rating_count_df.shape}")
book_filter_condition_result = (
    book_rating_count_df
    .apply(
        lambda row: (row['isbn'] in books_user_like_set) or (row['book_ratings_no'] >= MIN_BOOK_RATINGS_NO_CFB), 
        axis=1
    )
)
book_rating_count_df = book_rating_count_df[book_filter_condition_result]
duration = time.time() - start_time

print(f"book_rating_count_df.shape after book filtering = {book_rating_count_df.shape}")
print(f"\nduration of filtering by books = {duration // 60: .0f}m {duration % 60: .0f}s")

# Check the results
print(f"\nbooks_user_like_set = {books_user_like_set}")
book_rating_count_df[book_rating_count_df.apply(lambda row: row['isbn'] in books_user_like_set, axis=1)]

book_rating_count_df.shape before book filtering = (240478, 2)
book_rating_count_df.shape after book filtering = (52474, 2)

duration of filtering by books =  0m  2s

books_user_like_set = {'0345350499', '0590353403', '0439064872'}


Unnamed: 0,isbn,book_ratings_no
40705,345350499,84
76707,439064872,144
115246,590353403,81


In [17]:
print(f"ratings_df.shape before filtering by book: {ratings_df.shape}")
ratings_df = ratings_df.merge(book_rating_count_df, on='isbn', how='inner')
ratings_df.drop('book_ratings_no', inplace=True, axis=1)
print(f"ratings_df.shape after filtering by book: {ratings_df.shape}")

ratings_df.shape before filtering by book: (658805, 3)
ratings_df.shape after filtering by book: (431174, 3)


### Find users with similar book preferences as the current user

In [18]:
# Users who read the same books as the current_user
overlap_users = {}

start_time = time.time()

for index, row in ratings_df.iterrows():
    user_id = row['user_id']
    book_isbn = row['isbn']
    if book_isbn in books_user_like_set:
        if user_id not in overlap_users:
            overlap_users[user_id] = 1
        else:
            overlap_users[user_id] += 1 # Track how many books read and rated by current_user have been read and rated by the current user in a loop

duration = time.time() - start_time
print(f"duration of creating the overlap_users = {duration // 60: .0f}m {duration % 60: .0f}s\n")

print(f"len(overlap_users) = {len(overlap_users)}")

duration of creating the overlap_users =  0m  17s

len(overlap_users) = 278


In [23]:
counter = 0
for index, user_id in enumerate(overlap_users):
    print(f"{index + 1}) user_id = {user_id}; similar_books_count = {overlap_users[user_id]}")
    counter += 1
    if counter >= 10:
        break

1) user_id = 277427; similar_books_count = 1
2) user_id = 254; similar_books_count = 3
3) user_id = 2033; similar_books_count = 1
4) user_id = 6251; similar_books_count = 1
5) user_id = 6563; similar_books_count = 1
6) user_id = 7346; similar_books_count = 1
7) user_id = 8245; similar_books_count = 1
8) user_id = 8253; similar_books_count = 1
9) user_id = 8681; similar_books_count = 2
10) user_id = 9747; similar_books_count = 1


In [214]:
""" 
    Filter users, leaving only those who have read all the books of current_user 
        (if len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF) 
    or MIN_USERS_OVERLAP_COEFF a part of the books read by current_user 
        (if len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF)
"""

start_time = time.time()

filtered_overlap_user_ids_set = None
if len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: 
    print(f"len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: {len(books_user_like)} <= {MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF}")
    books_thresh = len(books_user_like)
    filtered_overlap_user_ids_set = set([k for k in overlap_users if overlap_users[k] >= books_thresh])
else:
    print(f"len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: {len(books_user_like)} > {MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF}")
    books_thresh = int(len(books_user_like) * MIN_USERS_OVERLAP_COEFF)
    print(f"books_thresh = {books_thresh}")
    filtered_overlap_user_ids_set = set([k for k in overlap_users if overlap_users[k] > books_thresh])

duration = time.time() - start_time

print(f"duration of filtering users = {duration // 60: .0f}m {duration % 60: .0f}s\n")
print(f"len(filtered_overlap_user_ids_set) = {len(filtered_overlap_user_ids_set)}")

len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: 3 > 2
books_thresh = 1
duration of filtering users =  0m  0s

len(filtered_overlap_user_ids_set) = 27


#### Check the results

In [190]:
# To check the results
print(f"len(filtered_overlap_user_ids_set) = {len(filtered_overlap_user_ids_set)}")
print(f"list(filtered_overlap_user_ids_set)[:10] = {list(filtered_overlap_user_ids_set)[:10]}")

len(filtered_overlap_user_ids_set) = 27
list(filtered_overlap_user_ids_set)[:10] = [148744, 131594, 30735, 240144, 226965, 21014, 11676, 36256, 238120, 100906]


In [191]:
# Check the results
result = (
    ratings_df[['user_id', 'isbn']]
    .apply(lambda row: row['user_id'] in filtered_overlap_user_ids_set and row['isbn'] in books_user_like_set, axis=1)
)

filtered_ratings_df = ratings_df[result]
filtered_ratings_df = (
    filtered_ratings_df
    .groupby('user_id')
    .count()
    .reset_index()
    .rename(columns={'rating': 'ratings_no'})
    .sort_values(by='ratings_no')
)

print(f"len(filtered_ratings_df) = {len(filtered_ratings_df)}")

filtered_ratings_df

len(filtered_ratings_df) = 27


Unnamed: 0,user_id,isbn,ratings_no
13,98741,2,2
24,235842,2,2
23,229741,2,2
22,226965,2,2
21,225996,2,2
20,211426,2,2
18,150124,2,2
17,148744,2,2
16,131594,2,2
15,110973,2,2


### Find users with similar book preferences as the current user without iterrows()

In [None]:
filtered_overlap_user_ids_set = None
if len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: 
    print(f"len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: {len(books_user_like)} <= {MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF}")
    books_thresh = len(books_user_like)
    filtered_overlap_user_ids_set = set([k for k in overlap_users if overlap_users[k] >= books_thresh])
else:
    print(f"len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: {len(books_user_like)} > {MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF}")
    books_thresh = int(len(books_user_like) * MIN_USERS_OVERLAP_COEFF)
    print(f"books_thresh = {books_thresh}")
    filtered_overlap_user_ids_set = set([k for k in overlap_users if overlap_users[k] > books_thresh])

In [27]:
start_time = time.time()

# Filter ratings_df to only include rows where 'isbn' is in books_user_like_set
filtered_ratings_df = ratings_df[ratings_df['isbn'].isin(books_user_like_set)]

# Group by 'user_id' and count the number of occurrences of each user_id
overlap_users_series = (
    filtered_ratings_df
    .groupby('user_id')
    .count()
    .reset_index()
    .rename(columns={'rating': 'ratings_no'})
)

duration = time.time() - start_time
print(f"duration of creating the overlap_users = {duration // 60: .0f}m {duration % 60: .0f}s\n")

print(f"len(overlap_users_series) = {len(overlap_users_series)}")

overlap_users_series.head()

duration of creating the overlap_users =  0m  0s

len(overlap_users_series) = 278


Unnamed: 0,user_id,isbn,ratings_no
0,254,3,3
1,2033,1,1
2,6251,1,1
3,6563,1,1
4,7346,1,1


In [31]:
""" 
    Filter users, leaving only those who have read all the books of current_user 
        (if len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF) 
    or MIN_USERS_OVERLAP_COEFF a part of the books read by current_user 
        (if len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF)
"""

start_time = time.time()

filtered_overlap_user_ids_set = None
if len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: 
    print(f"len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: {len(books_user_like)} <= {MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF}")
    books_thresh = len(books_user_like)
    filtered_overlap_user_ids_set = set(overlap_users_series[overlap_users_series['ratings_no'] >= books_thresh]['user_id'].values)
else:
    print(f"len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: {len(books_user_like)} > {MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF}")
    books_thresh = int(len(books_user_like) * MIN_USERS_OVERLAP_COEFF)
    print(f"books_thresh = {books_thresh}")
    filtered_overlap_user_ids_set = set(overlap_users_series[overlap_users_series['ratings_no'] > books_thresh]['user_id'].values)

duration = time.time() - start_time

print(f"duration of filtering users = {duration // 60: .0f}m {duration % 60: .0f}s\n")
print(f"len(filtered_overlap_user_ids_set) = {len(filtered_overlap_user_ids_set)}")

len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: 3 > 2
books_thresh = 1
duration of filtering users =  0m  0s

len(filtered_overlap_user_ids_set) = 27


#### Check the results

In [32]:
# To check the results
print(f"len(filtered_overlap_user_ids_set) = {len(filtered_overlap_user_ids_set)}")
print(f"list(filtered_overlap_user_ids_set)[:10] = {list(filtered_overlap_user_ids_set)[:10]}")

len(filtered_overlap_user_ids_set) = 27
list(filtered_overlap_user_ids_set)[:10] = [148744, 131594, 30735, 240144, 226965, 21014, 11676, 36256, 238120, 100906]


In [33]:
# Check the results
result = (
    ratings_df[['user_id', 'isbn']]
    .apply(lambda row: row['user_id'] in filtered_overlap_user_ids_set and row['isbn'] in books_user_like_set, axis=1)
)

filtered_ratings_df = ratings_df[result]
filtered_ratings_df = (
    filtered_ratings_df
    .groupby('user_id')
    .count()
    .reset_index()
    .rename(columns={'rating': 'ratings_no'})
    .sort_values(by='ratings_no')
)

print(f"len(filtered_ratings_df) = {len(filtered_ratings_df)}")

filtered_ratings_df

len(filtered_ratings_df) = 27


Unnamed: 0,user_id,isbn,ratings_no
13,98741,2,2
24,235842,2,2
23,229741,2,2
22,226965,2,2
21,225996,2,2
20,211426,2,2
18,150124,2,2
17,148744,2,2
16,131594,2,2
15,110973,2,2


## Find similar user book ratings

In [34]:
# For filtered users, user_id, book_isbn and rating will be stored in interactions_arr
start_time = time.time()

interactions_arr = ratings_df[(
    ratings_df['user_id']
    .apply(lambda user_id: user_id in filtered_overlap_user_ids_set) # Maybe also filter: isbn not in books_user_like_set - no need, because it will be used later to find the users with the same book tastes as current_user 
)].values

duration = time.time() - start_time
print(f"duration of creating the interactions_arr = {duration // 60: .0f}m {duration % 60: .0f}s\n")

print(f"len(interactions_arr) = {len(interactions_arr)}")
print(f"interactions_arr[:5] = {interactions_arr[:5]}")

duration of creating the interactions_arr =  0m  0s

len(interactions_arr) = 22296
interactions_arr[:5] = [[254 '006000438X' 0]
 [254 '0060013117' 0]
 [254 '0060199563' 0]
 [254 '0060391448' 0]
 [254 '0060502320' 7]]


In [35]:
interactions_df = pd.DataFrame(data=interactions_arr, columns=['user_id', 'isbn', 'rating'])

# Check the results
print(f"len(interactions_df) = {len(interactions_df)}\n")
print(f"len(filtered_overlap_user_ids_set) = {len(filtered_overlap_user_ids_set)}")
print(f"interactions_df['user_id'].nunique() = {interactions_df['user_id'].nunique()}")

len(interactions_df) = 22296

len(filtered_overlap_user_ids_set) = 27
interactions_df['user_id'].nunique() = 27


In [36]:
interactions_df.head(3)

Unnamed: 0,user_id,isbn,rating
0,254,006000438X,0
1,254,0060013117,0
2,254,0060199563,0


## Create a user/book matrix

In [37]:
interactions_df = pd.concat([books_user_like_df[['user_id', 'isbn', 'rating']], interactions_df])
interactions_df

Unnamed: 0,user_id,isbn,rating
0,-1,0590353403,10
1,-1,0439064872,9
2,-1,0345350499,10
0,254,006000438X,0
1,254,0060013117,0
...,...,...,...
22291,240144,1853260770,0
22292,240144,1853262021,0
22293,240144,1880284324,0
22294,240144,1899749357,0


In [38]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22299 entries, 0 to 22295
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  22299 non-null  object
 1   isbn     22299 non-null  object
 2   rating   22299 non-null  object
dtypes: object(3)
memory usage: 696.8+ KB


In [39]:
interactions_df['user_id'] = pd.to_numeric(interactions_df['user_id'])
interactions_df['rating'] = pd.to_numeric(interactions_df['rating'])
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22299 entries, 0 to 22295
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  22299 non-null  int64 
 1   isbn     22299 non-null  object
 2   rating   22299 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 696.8+ KB


In [40]:
interactions_df['user_index'] = interactions_df['user_id'].astype('category').cat.codes
interactions_df['book_index'] = interactions_df['isbn'].astype('category').cat.codes

# To check the correctness of creating cols 'user_index' and 'book_index'
print(f"interactions_df['user_id'].nunique() = {interactions_df['user_id'].nunique()}")
print(f"interactions_df['user_index'].nunique() = {interactions_df['user_index'].nunique()}\n")

print(f"interactions_df['isbn'].nunique() = {interactions_df['isbn'].nunique()}")
print(f"interactions_df['book_index'].nunique() = {interactions_df['book_index'].nunique()}\n")

interactions_df.head(3)

interactions_df['user_id'].nunique() = 28
interactions_df['user_index'].nunique() = 28

interactions_df['isbn'].nunique() = 16068
interactions_df['book_index'].nunique() = 16068



Unnamed: 0,user_id,isbn,rating,user_index,book_index
0,-1,590353403,10,0,10418
1,-1,439064872,9,0,6385
2,-1,345350499,10,0,2806


In [41]:
ratings_matrix_coo = coo_matrix((interactions_df['rating'], (interactions_df['user_index'], interactions_df['book_index'])))
ratings_matrix = ratings_matrix_coo.tocsr()

## Find users similar to current_user

In [42]:
# Get user_index for the current_user
current_user_index = interactions_df[interactions_df['user_id'] == -1]['user_index'].unique()[0]
print(f"current_user_index = {current_user_index}")

similarity = cosine_similarity(ratings_matrix[current_user_index, :], ratings_matrix).flatten()
print(f"len(similarity) = {len(similarity)}")

similar_user_indices = np.argpartition(similarity, -(SIMILAR_USERS_NO + 1))[-(SIMILAR_USERS_NO + 1):]
similar_user_indices = np.delete(similar_user_indices, np.where(similar_user_indices == 0))
print(f"SIMILAR_USERS_NO = {SIMILAR_USERS_NO}; len(similar_user_indices) = {len(similar_user_indices)}")

similar_users = interactions_df[interactions_df['user_index'].isin(similar_user_indices)].copy()
print(f"similar_users['user_id'].nunique() = {similar_users['user_id'].nunique()}\n")
similar_users

current_user_index = 0
len(similarity) = 28
SIMILAR_USERS_NO = 15; len(similar_user_indices) = 15
similar_users['user_id'].nunique() = 15



Unnamed: 0,user_id,isbn,rating,user_index,book_index
0,254,006000438X,0,1,92
1,254,0060013117,0,1,115
2,254,0060199563,0,1,240
3,254,0060391448,0,1,270
4,254,0060502320,7,1,288
...,...,...,...,...,...
22291,240144,1853260770,0,27,15519
22292,240144,1853262021,0,27,15533
22293,240144,1880284324,0,27,15585
22294,240144,1899749357,0,27,15624


## Create book recommendations using the 'book_score' variable

#### Create book_recs_df dataframe

In [43]:
book_recs_df = (
    similar_users
    .groupby('isbn')
    ['rating']
    .agg(['count', 'mean'])
    .rename(columns = {
        'count': 'ratings_no',
        'mean': 'avg_rating'
    })
    .sort_values(by='ratings_no', ascending=False)
)

print(f"book_recs_df.shape = {book_recs_df.shape}")
book_recs_df

book_recs_df.shape = (5352, 2)


Unnamed: 0_level_0,ratings_no,avg_rating
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
0590353403,14,7.285714
0345350499,11,6.727273
0439064864,10,5.800000
0439136350,10,7.200000
043935806X,9,6.000000
...,...,...
038097584X,1,0.000000
0380975300,1,0.000000
0380975068,1,0.000000
0380973820,1,0.000000


In [44]:
# merge book_recs_df and books_df to get all the information about recommended books
book_recs_df = book_recs_df.merge(books_df, how='inner', on='isbn')

print(f"book_recs_df.shape before filtering = {book_recs_df.shape}")
book_recs_df = book_recs_df[book_recs_df['ratings_no'] >= MIN_BOOK_RATINGS_NO_CFB]
print(f"book_recs_df.shape after filtering = {book_recs_df.shape}")

book_recs_df.shape before filtering = (5247, 10)
book_recs_df.shape after filtering = (127, 10)


In [45]:
book_total_ratings_df = (
    ratings_df
    .groupby('isbn')
    ['rating']
    .count()
    .reset_index()
    .rename(columns = {'rating': 'total_ratings_no'})
)
print(f"book_total_ratings_df.shape = {book_total_ratings_df.shape}\n")
book_total_ratings_df.sort_values(by='total_ratings_no', ascending=False).head()

book_total_ratings_df.shape = (52474, 2)



Unnamed: 0,isbn,total_ratings_no
48073,0971880107,617
7853,0316666343,430
18404,0385504209,325
1436,0060928336,306
21996,044021145X,276


In [46]:
book_recs_df = book_recs_df.merge(book_total_ratings_df, how='inner', on='isbn')
print(f"book_recs_df.shape = {book_recs_df.shape}\n")
book_recs_df.head(3)

book_recs_df.shape = (127, 11)



Unnamed: 0,isbn,ratings_no,avg_rating,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l,total_ratings_no
0,590353403,14,7.285714,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,81
1,345350499,11,6.727273,The Mists of Avalon,MARION ZIMMER BRADLEY,1987,Del Rey,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...,84
2,439064864,10,5.8,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,92


In [47]:
# Get the current columns
cols = book_recs_df.columns.tolist()

# Rearrange columns to move the last column to the 4th position
new_cols = cols[:3] + [cols[-1]] + cols[3:-1]
print(f"new_cols = {new_cols}")

# Reindex the DataFrame with the new column order
book_recs_df = book_recs_df.reindex(columns=new_cols)
book_recs_df.head(3)

new_cols = ['isbn', 'ratings_no', 'avg_rating', 'total_ratings_no', 'title', 'author', 'publication_year', 'publisher', 'image-url-s', 'image-url-m', 'image-url-l']


Unnamed: 0,isbn,ratings_no,avg_rating,total_ratings_no,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,590353403,14,7.285714,81,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...
1,345350499,11,6.727273,84,The Mists of Avalon,MARION ZIMMER BRADLEY,1987,Del Rey,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...
2,439064864,10,5.8,92,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...


#### Create the 'book_score' variable to recommend books

In [48]:
book_recs_df['adjusted_ratings_no'] = book_recs_df['ratings_no'] * (book_recs_df['ratings_no'] / book_recs_df['total_ratings_no'])
book_recs_df['book_score'] = book_recs_df['avg_rating'] * book_recs_df['adjusted_ratings_no']

# Get the current columns
cols = book_recs_df.columns.tolist()

# Rearrange columns to move the last column to the 4th position
new_cols = cols[:4] + cols[-2:] + cols[4:-2]

# Reindex the DataFrame with the new column order
book_recs_df = book_recs_df.reindex(columns=new_cols)

book_recs_df.head(3)

Unnamed: 0,isbn,ratings_no,avg_rating,total_ratings_no,adjusted_ratings_no,book_score,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,590353403,14,7.285714,81,2.419753,17.62963,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...
1,345350499,11,6.727273,84,1.440476,9.690476,The Mists of Avalon,MARION ZIMMER BRADLEY,1987,Del Rey,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...
2,439064864,10,5.8,92,1.086957,6.304348,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...


#### Filter the results and recommend books

In [49]:
# Filter book_recs_df
print(f"book_recs_df.shape before filtering = {book_recs_df.shape}")

book_recs_df = book_recs_df[~book_recs_df['isbn'].isin(books_user_like_df['isbn'])]

# Create a new 'mod_title' column in both book_recs_df and books_user_like_df 
book_recs_df['mod_title'] = book_recs_df['title'].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()
book_recs_df['mod_title'] = book_recs_df['mod_title'].str.replace("\s+", " ", regex=True)

books_user_like_df['mod_title'] = books_user_like_df['title'].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()
books_user_like_df['mod_title'] = books_user_like_df['mod_title'].str.replace("\s+", " ", regex=True)

book_recs_df = book_recs_df[~book_recs_df['mod_title'].isin(books_user_like_df['mod_title'])]

print(f"book_recs_df.shape after filtering = {book_recs_df.shape}")

book_recs_df.shape before filtering = (127, 13)
book_recs_df.shape after filtering = (123, 14)


In [50]:
top_books_recs_df = book_recs_df.drop_duplicates('title').sort_values(by='book_score', ascending=False)
top_books_recs_df.head(10)

Unnamed: 0,isbn,ratings_no,avg_rating,total_ratings_no,adjusted_ratings_no,book_score,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l,mod_title
3,0439136350,10,7.2,95,1.052632,7.578947,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,http://images.amazon.com/images/P/0439136350.0...,http://images.amazon.com/images/P/0439136350.0...,harry potter and the prisoner of azkaban book 3
119,0345391373,3,5.0,7,1.285714,6.428571,An Incomplete Education,Judy Jones,1995,Ballantine Books,http://images.amazon.com/images/P/0345391373.0...,http://images.amazon.com/images/P/0345391373.0...,http://images.amazon.com/images/P/0345391373.0...,an incomplete education
22,0618002219,4,9.0,39,0.410256,3.692308,The Hobbit: or There and Back Again,J.R.R. Tolkien,1999,Houghton Mifflin Company,http://images.amazon.com/images/P/0618002219.0...,http://images.amazon.com/images/P/0618002219.0...,http://images.amazon.com/images/P/0618002219.0...,the hobbit or there and back again
4,043935806X,9,6.0,152,0.532895,3.197368,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,http://images.amazon.com/images/P/043935806X.0...,http://images.amazon.com/images/P/043935806X.0...,http://images.amazon.com/images/P/043935806X.0...,harry potter and the order of the phoenix book 5
7,0439139597,7,5.571429,93,0.526882,2.935484,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,http://images.amazon.com/images/P/0439139597.0...,http://images.amazon.com/images/P/0439139597.0...,harry potter and the goblet of fire book 4
27,0618002227,4,6.75,37,0.432432,2.918919,The Fellowship of the Ring (The Lord of the Ri...,J. R. R. Tolkien,1999,Houghton Mifflin Company,http://images.amazon.com/images/P/0618002227.0...,http://images.amazon.com/images/P/0618002227.0...,http://images.amazon.com/images/P/0618002227.0...,the fellowship of the ring the lord of the rin...
98,0618002235,3,6.666667,21,0.428571,2.857143,"The Two Towers (The Lord of the Rings, Part 2)",J. R. R. Tolkien,1999,Houghton Mifflin Company,http://images.amazon.com/images/P/0618002235.0...,http://images.amazon.com/images/P/0618002235.0...,http://images.amazon.com/images/P/0618002235.0...,the two towers the lord of the rings part 2
68,0670857831,3,2.666667,10,0.9,2.4,Lady of Avalon,Marion Zimmer Bradley,1997,Viking Books,http://images.amazon.com/images/P/0670857831.0...,http://images.amazon.com/images/P/0670857831.0...,http://images.amazon.com/images/P/0670857831.0...,lady of avalon
63,0142000809,3,2.666667,10,0.9,2.4,Ex Libris,Ross King,2002,Penguin Books,http://images.amazon.com/images/P/0142000809.0...,http://images.amazon.com/images/P/0142000809.0...,http://images.amazon.com/images/P/0142000809.0...,ex libris
23,0345325818,4,3.5,26,0.615385,2.153846,The Silmarillion,J.R.R. TOLKIEN,1985,Del Rey,http://images.amazon.com/images/P/0345325818.0...,http://images.amazon.com/images/P/0345325818.0...,http://images.amazon.com/images/P/0345325818.0...,the silmarillion
