In [41]:
from datetime import date
import time

import numpy as np
import pandas as pd

from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [75]:
MIN_USERS_OVERLAP_COEFF = 0.5   # The minimum percentage of co-read books between current_user and overlapped_user to be used to create a recommendation
MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF = 2   # The minimum number of books read and rated by current_user before the MIN_USERS_OVERLAP_COEFF factor will be used to filter overlapped_users
SIMILAR_USERS_NO = 15   # The number of similar users whose book preferences will be used to recommend books to the current_user
BOOKS_NO_TO_RECOMMEND = 10   # The number of books that will be recommended to the user based on his/her book list
MIN_BOOK_RATINGS_NO_CFB = 3   # The minimum number of ratings for a single book to be used to build a Collaborative Filtering Based Recommender System

## Functions

In [57]:
def calc_weighted_rating(row, avg_rating, num_of_ratings, min_thres, default_rating):
    """
    Calculates the weighted rating for a book based on its average rating, number of ratings, and a minimum threshold.

    Args:
        row (pd.Series): A row from the DataFrame containing the book's data.
        avg_rating (str): The column name for the average rating of the book.
        num_of_ratings (str): The column name for the number of ratings the book has received.
        min_thres (int): The minimum threshold for the number of ratings to be considered for the weighted rating.
        default_rating (float): The neutral rating of the book.

    Returns:
        float: The calculated weighted rating for the book.
    """
    
    weighted_rating = ((row[avg_rating] * row[num_of_ratings]) + 
      (min_thres * default_rating))/(row[num_of_ratings] + min_thres)
    return weighted_rating

In [132]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Amazon Image</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

## Read and check datasets

In [3]:
books_df = pd.read_csv('../data/processed_data/Books.csv')
users_df = pd.read_csv('../data/processed_data/Users.csv')
ratings_df = pd.read_csv('../data/processed_data/Ratings.csv')

print(f"books_df.shape = {books_df.shape}")
print(f"users_df.shape = {users_df.shape}")
print(f"ratings_df.shape = {ratings_df.shape}")

books_df.shape = (271360, 8)
users_df.shape = (278858, 3)
ratings_df.shape = (1149780, 3)


In [4]:
missing_info_df = pd.DataFrame({
    'missing_count': books_df.isnull().sum(),
    'missing_percentage': (books_df.isnull().sum() / len(books_df)) * 100
}).reset_index().rename(columns={'index': 'column'})

# author - 0, image-url-l - 0
missing_info_df

Unnamed: 0,column,missing_count,missing_percentage
0,isbn,0,0.0
1,title,0,0.0
2,author,0,0.0
3,publication_year,0,0.0
4,publisher,2,0.000737
5,image-url-s,0,0.0
6,image-url-m,0,0.0
7,image-url-l,0,0.0


In [5]:
# Should be zero records
books_df[books_df['publication_year'] > date.today().year]

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l


In [6]:
books_df.head(3)

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [7]:
users_df.head(3)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [8]:
ratings_df.head(3)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


## Find users with similar book preferences as the current user

In [9]:
books_user_like = [
    '0590353403',   # Harry Potter and the Sorcerer's Stone (Book 1)
    '0439064872',   # Harry Potter and the Chamber of Secrets (Book 2)
    '0345350499'    # The Mists of Avalon
]

books_user_like_titles = [
    "Harry Potter and the Sorcerer's Stone (Book 1)", 
    "Harry Potter and the Chamber of Secrets (Book 2)",
    "The Mists of Avalon"
]

In [10]:
current_user_id = -1

books_user_like_dict = {
    'user_id': [current_user_id for _ in range(len(books_user_like))],
    'isbn': books_user_like,
    'title': books_user_like_titles,
    'rating': [10, 9, 10]
}

books_user_like_df = pd.DataFrame.from_dict(books_user_like_dict)
books_user_like_df

Unnamed: 0,user_id,isbn,title,rating
0,-1,590353403,Harry Potter and the Sorcerer's Stone (Book 1),10
1,-1,439064872,Harry Potter and the Chamber of Secrets (Book 2),9
2,-1,345350499,The Mists of Avalon,10


In [11]:
books_user_like_set = set(books_user_like_df['isbn'])   # Because theoretically, a user could give several ratings to a book with the same title
print(f"books_user_like_set = {books_user_like_set}")

books_user_like_set = {'0345350499', '0590353403', '0439064872'}


In [12]:
# Users who read the same books as the current_user
overlap_users = {}

for index, row in ratings_df.iterrows():
    user_id = row['user_id']
    book_isbn = row['isbn']
    if book_isbn in books_user_like_set:
        if user_id not in overlap_users:
            overlap_users[user_id] = 1
        else:
            overlap_users[user_id] += 1 # Track how many books read and rated by current_user have been read and rated by the current user in a loop

print(f"len(overlap_users) = {len(overlap_users)}")

len(overlap_users) = 664


In [13]:
# To check the results
overlap_user_ids_list = list(overlap_users.keys())
print(f"len(overlap_user_ids_list) = {len(overlap_user_ids_list)}")
print(f"overlap_user_ids_list[:10] = {overlap_user_ids_list[:10]}")

len(overlap_user_ids_list) = 664
overlap_user_ids_list[:10] = [277427, 278356, 278582, 254, 1725, 2033, 2179, 2337, 2793, 2855]


In [14]:
# Check the results for one user
result = (
    ratings_df[['user_id', 'isbn']]
    .apply(lambda row: row['user_id'] == 277427 and row['isbn'] in books_user_like_set, axis=1)
)
ratings_df[result]

Unnamed: 0,user_id,isbn,rating
1629,277427,439064872,0


In [15]:
# Check the results for 5 users
result = (
    ratings_df[['user_id', 'isbn']]
    .apply(lambda row: row['user_id'] in overlap_user_ids_list[:5] and row['isbn'] in books_user_like_set, axis=1)
)
ratings_df[result]

Unnamed: 0,user_id,isbn,rating
1629,277427,439064872,0
4257,278356,439064872,8
9138,278582,345350499,10
10106,254,345350499,0
10209,254,439064872,9
10279,254,590353403,9
13327,1725,345350499,0


In [17]:
""" 
    Filter users, leaving only those who have read all the books of current_user 
        (if len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF) 
    or MIN_USERS_OVERLAP_COEFF a part of the books read by current_user 
        (if len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF)
"""

filtered_overlap_user_ids_set = None
if len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: 
    print(f"len(books_user_like) <= MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: {len(books_user_like)} <= {MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF}")
    books_thresh = len(books_user_like)
    filtered_overlap_user_ids_set = set([k for k in overlap_users if overlap_users[k] >= books_thresh])
else:
    print(f"len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: {len(books_user_like)} > {MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF}")
    books_thresh = int(len(books_user_like) * MIN_USERS_OVERLAP_COEFF)
    print(f"books_thresh = {books_thresh}")
    filtered_overlap_user_ids_set = set([k for k in overlap_users if overlap_users[k] > books_thresh])

print(f"len(filtered_overlap_user_ids_set) = {len(filtered_overlap_user_ids_set)}")

len(books_user_like) > MIN_BOOKS_NO_TO_USE_OVERLAP_COEFF: 3 > 2
books_thresh = 1
len(filtered_overlap_user_ids_set) = 32


### Check the results

In [18]:
# To check the results
print(f"len(filtered_overlap_user_ids_set) = {len(filtered_overlap_user_ids_set)}")
print(f"list(filtered_overlap_user_ids_set)[:10] = {list(filtered_overlap_user_ids_set)[:10]}")

len(filtered_overlap_user_ids_set) = 32
list(filtered_overlap_user_ids_set)[:10] = [148744, 131594, 30735, 240144, 272786, 226965, 21014, 11676, 252829, 36256]


In [20]:
# Check the results
result = (
    ratings_df[['user_id', 'isbn']]
    .apply(lambda row: row['user_id'] in filtered_overlap_user_ids_set and row['isbn'] in books_user_like_set, axis=1)
)

filtered_ratings_df = ratings_df[result]
filtered_ratings_df = (
    filtered_ratings_df
    .groupby('user_id')
    .count()
    .reset_index()
    .rename(columns={'rating': 'ratings_no'})
    .sort_values(by='ratings_no')
)

print(f"len(filtered_ratings_df) = {len(filtered_ratings_df)}")

filtered_ratings_df

len(filtered_ratings_df) = 32


Unnamed: 0,user_id,isbn,ratings_no
15,100906,2,2
29,240144,2,2
28,238120,2,2
27,235842,2,2
26,229741,2,2
25,226965,2,2
24,225996,2,2
23,211426,2,2
22,190448,2,2
20,150124,2,2


## Find similar user book ratings

In [21]:
# For filtered users, user_id, book_isbn and rating will be stored in interactions_list
interactions_arr = ratings_df[(
    ratings_df[['user_id', 'isbn']]
    .apply(lambda row: row['user_id'] in filtered_overlap_user_ids_set and row['isbn'] in books_user_like_set, axis=1)
)].values

print(f"len(interactions_arr) = {len(interactions_arr)}")

len(interactions_arr) = 68


In [22]:
interactions_arr[:3]

array([[254, '0345350499', 0],
       [254, '0439064872', 9],
       [254, '0590353403', 9]], dtype=object)

In [23]:
# Check the correctness
print(f"sum(filtered_ratings_df['ratings_no'].values) = {sum(filtered_ratings_df['ratings_no'].values)})")
print(f"len(interactions_arr) = {len(interactions_arr)}")

sum(filtered_ratings_df['ratings_no'].values) = 68)
len(interactions_arr) = 68


The results show that the previous operation successfully selected all records from filtered_overlap_user_ids_set and their ratings for books from books_user_like_set.  
However, in order to make recommendations, users in filtered_overlap_user_ids_set need to select **all the books they have read**, **not just books in books_user_like_set** (since the task is to recommend new books to the user (books they haven't read yet)).

In [26]:
# For filtered users, user_id, book_isbn and rating will be stored in interactions_arr
start_time = time.time()

interactions_arr = ratings_df[(
    ratings_df['user_id']
    .apply(lambda user_id: user_id in filtered_overlap_user_ids_set) # Maybe also filter: isbn not in books_user_like_set - no need, because it will be used later to find the users with the same book tastes as current_user 
)].values

duration = time.time() - start_time
print(f"duration of creating the interactions_arr = {duration // 60: .0f}m {duration % 60: .0f}s\n")

print(f"len(interactions_arr) = {len(interactions_arr)}")
print(f"interactions_arr[:5] = {interactions_arr[:5]}")

duration of creating the interactions_arr =  0m  0s

len(interactions_arr) = 35418
interactions_arr[:5] = [[254 '006000438X' 0]
 [254 '0060013117' 0]
 [254 '0060199563' 0]
 [254 '0060391448' 0]
 [254 '0060502320' 7]]


In [27]:
interactions_df = pd.DataFrame(data=interactions_arr, columns=['user_id', 'isbn', 'rating'])

# Check the results
print(f"len(interactions_df) = {len(interactions_df)}")
print(f"interactions_df['user_id'].nunique() = {interactions_df['user_id'].nunique()}")

len(interactions_df) = 35418
interactions_df['user_id'].nunique() = 32


In [28]:
interactions_df.head(3)

Unnamed: 0,user_id,isbn,rating
0,254,006000438X,0
1,254,0060013117,0
2,254,0060199563,0


## Create a user/book matrix

In [29]:
interactions_df = pd.concat([books_user_like_df[['user_id', 'isbn', 'rating']], interactions_df])
interactions_df

Unnamed: 0,user_id,isbn,rating
0,-1,0590353403,10
1,-1,0439064872,9
2,-1,0345350499,10
0,254,006000438X,0
1,254,0060013117,0
...,...,...,...
35413,272786,0446672211,8
35414,272786,0465017606,0
35415,272786,059035342X,9
35416,272786,0679723161,0


In [30]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35421 entries, 0 to 35417
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  35421 non-null  object
 1   isbn     35421 non-null  object
 2   rating   35421 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


In [31]:
interactions_df['user_id'] = pd.to_numeric(interactions_df['user_id'])
interactions_df['rating'] = pd.to_numeric(interactions_df['rating'])
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35421 entries, 0 to 35417
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  35421 non-null  int64 
 1   isbn     35421 non-null  object
 2   rating   35421 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [32]:
print(f"interactions_df['user_id'].nunique() = {interactions_df['user_id'].nunique()}")
print(f"interactions_df['isbn'].nunique() = {interactions_df['isbn'].nunique()}")

interactions_df['user_id'].nunique() = 33
interactions_df['isbn'].nunique() = 28928


The obtained results show that there are far fewer users than books (this is logical, because different users could read different books). This indicates the need to use a **sparse matrix to save memory**.

In [33]:
interactions_df['user_index'] = interactions_df['user_id'].astype('category').cat.codes
interactions_df['book_index'] = interactions_df['isbn'].astype('category').cat.codes

# To check the correctness of creating cols 'user_index' and 'book_index'
print(f"interactions_df['user_id'].nunique() = {interactions_df['user_id'].nunique()}")
print(f"interactions_df['user_index'].nunique() = {interactions_df['user_index'].nunique()}\n")

print(f"interactions_df['isbn'].nunique() = {interactions_df['isbn'].nunique()}")
print(f"interactions_df['book_index'].nunique() = {interactions_df['book_index'].nunique()}\n")

interactions_df.head(3)

interactions_df['user_id'].nunique() = 33
interactions_df['user_index'].nunique() = 33

interactions_df['isbn'].nunique() = 28928
interactions_df['book_index'].nunique() = 28928



Unnamed: 0,user_id,isbn,rating,user_index,book_index
0,-1,590353403,10,0,15720
1,-1,439064872,9,0,10025
2,-1,345350499,10,0,4603


In [34]:
ratings_matrix_coo = coo_matrix((interactions_df['rating'], (interactions_df['user_index'], interactions_df['book_index'])))
ratings_matrix_coo

<33x28928 sparse matrix of type '<class 'numpy.int64'>'
	with 35421 stored elements in COOrdinate format>

In [35]:
ratings_matrix = ratings_matrix_coo.tocsr()

## Find users similar to current_user

In [36]:
# Get user_index for the current_user
interactions_df[interactions_df['user_id'] == -1]

Unnamed: 0,user_id,isbn,rating,user_index,book_index
0,-1,590353403,10,0,15720
1,-1,439064872,9,0,10025
2,-1,345350499,10,0,4603


In [38]:
current_user_index = interactions_df[interactions_df['user_id'] == -1]['user_index'].unique()[0]
print(f"current_user_index = {current_user_index}")

current_user_index = 0


In [40]:
similarity = cosine_similarity(ratings_matrix[current_user_index, :], ratings_matrix).flatten()

print(f"len(similarity) = {len(similarity)}")
print(f"similarity[:5] = {similarity[:5]}") # the first element = similarity(current_user, current_user) = 1

len(similarity) = 33
similarity[:5] = [1.         0.16919625 0.08900757 0.01298811 0.        ]


In [44]:
similar_user_indices = np.argpartition(similarity, -(SIMILAR_USERS_NO + 1))[-(SIMILAR_USERS_NO + 1):]
similar_user_indices = np.delete(similar_user_indices, np.where(similar_user_indices == 0))
print(f"SIMILAR_USERS_NO = {SIMILAR_USERS_NO}; len(similar_user_indices) = {len(similar_user_indices)}")
print(f"similar_user_indices = {similar_user_indices}")

SIMILAR_USERS_NO = 15; len(similar_user_indices) = 15
similar_user_indices = [22 25  2  8 19  7 31  1 13 26 12  6 18 32 23]


In [46]:
similar_users = interactions_df[interactions_df['user_index'].isin(similar_user_indices)].copy()

print(f"similar_users['user_id'].nunique() = {similar_users['user_id'].nunique()}\n")

similar_users

similar_users['user_id'].nunique() = 15



Unnamed: 0,user_id,isbn,rating,user_index,book_index
0,254,006000438X,0,1,251
1,254,0060013117,0,1,278
2,254,0060199563,0,1,468
3,254,0060391448,0,1,535
4,254,0060502320,7,1,559
...,...,...,...,...,...
35413,272786,0446672211,8,32,11652
35414,272786,0465017606,0,32,13219
35415,272786,059035342X,9,32,15721
35416,272786,0679723161,0,32,18037


## Create book recommendations

In [109]:
book_recs_df = (
    similar_users
    .groupby('isbn')
    ['rating']
    .agg(['count', 'mean'])
    .rename(columns = {
        'count': 'ratings_no',
        'mean': 'avg_rating'
    })
    .sort_values(by='ratings_no', ascending=False)
)

print(f"book_recs_df.shape = {book_recs_df.shape}")
book_recs_df

book_recs_df.shape = (3272, 2)


Unnamed: 0_level_0,ratings_no,avg_rating
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
0590353403,14,6.714286
0439064864,10,6.300000
0345350499,10,8.400000
0439136350,10,7.100000
0439064872,9,8.111111
...,...,...
0425141977,1,0.000000
042514321X,1,7.000000
0425143503,1,0.000000
0425145638,1,0.000000


In [110]:
books_df.head(3)

Unnamed: 0,isbn,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [111]:
# merge book_recs_df and books_df to get all the information about recommended books
book_recs_df = book_recs_df.merge(books_df, how='inner', on='isbn')
book_recs_df.head(3)

Unnamed: 0,isbn,ratings_no,avg_rating,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,590353403,14,6.714286,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...
1,439064864,10,6.3,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...
2,345350499,10,8.4,The Mists of Avalon,MARION ZIMMER BRADLEY,1987,Del Rey,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...


In [112]:
print(f"book_recs_df.shape before filtering = {book_recs_df.shape}")
book_recs_df = book_recs_df[book_recs_df['ratings_no'] >= MIN_BOOK_RATINGS_NO_CFB]
print(f"book_recs_df.shape after filtering = {book_recs_df.shape}")

book_recs_df.shape before filtering = (3147, 10)
book_recs_df.shape after filtering = (46, 10)


In [113]:
book_total_ratings_df = (
    ratings_df
    .groupby('isbn')
    ['rating']
    .count()
    .reset_index()
    .rename(columns = {'rating': 'total_ratings_no'})
)
print(f"book_total_ratings_df.shape = {book_total_ratings_df.shape}\n")
book_total_ratings_df.sort_values(by='total_ratings_no', ascending=False).head()

book_total_ratings_df.shape = (340556, 2)



Unnamed: 0,isbn,total_ratings_no
247408,971880107,2502
47371,316666343,1295
83359,385504209,883
9637,60928336,732
41007,312195516,723


In [114]:
# Check the results
print(f"book_total_ratings_df.shape = {book_total_ratings_df.shape}")
print(f"ratings_df['isbn'].nunique() = {ratings_df['isbn'].nunique()}")
print(f"books_df['isbn'].nunique() = {books_df['isbn'].nunique()}\n")

print(f"sum(book_total_ratings_df['total_ratings_no'].values) = {sum(book_total_ratings_df['total_ratings_no'].values)}")
print(f"ratings_df.shape = {ratings_df.shape}")

book_total_ratings_df.shape = (340556, 2)
ratings_df['isbn'].nunique() = 340556
books_df['isbn'].nunique() = 271360

sum(book_total_ratings_df['total_ratings_no'].values) = 1149780
ratings_df.shape = (1149780, 3)


In [115]:
book_recs_df = book_recs_df.merge(book_total_ratings_df, how='inner', on='isbn')
print(f"book_recs_df.shape = {book_recs_df.shape}\n")
book_recs_df.head(3)

book_recs_df.shape = (46, 11)



Unnamed: 0,isbn,ratings_no,avg_rating,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l,total_ratings_no
0,590353403,14,6.714286,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,168
1,439064864,10,6.3,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,170
2,345350499,10,8.4,The Mists of Avalon,MARION ZIMMER BRADLEY,1987,Del Rey,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...,181


In [116]:
# Get the current columns
cols = book_recs_df.columns.tolist()

# Rearrange columns to move the last column to the 4th position
new_cols = cols[:3] + [cols[-1]] + cols[3:-1]
print(f"new_cols = {new_cols}")

# Reindex the DataFrame with the new column order
book_recs_df = book_recs_df.reindex(columns=new_cols)
book_recs_df.head(3)

new_cols = ['isbn', 'ratings_no', 'avg_rating', 'total_ratings_no', 'title', 'author', 'publication_year', 'publisher', 'image-url-s', 'image-url-m', 'image-url-l']


Unnamed: 0,isbn,ratings_no,avg_rating,total_ratings_no,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,590353403,14,6.714286,168,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...
1,439064864,10,6.3,170,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...
2,345350499,10,8.4,181,The Mists of Avalon,MARION ZIMMER BRADLEY,1987,Del Rey,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...


#### Create a 'book_score' variable to recommend books

In [117]:
book_recs_df['adjusted_ratings_no'] = book_recs_df['ratings_no'] * (book_recs_df['ratings_no'] / book_recs_df['total_ratings_no'])
book_recs_df['book_score'] = book_recs_df['avg_rating'] * book_recs_df['adjusted_ratings_no']

# Get the current columns
cols = book_recs_df.columns.tolist()

# Rearrange columns to move the last column to the 4th position
new_cols = cols[:4] + cols[-2:] + cols[4:-2]

# Reindex the DataFrame with the new column order
book_recs_df = book_recs_df.reindex(columns=new_cols)

book_recs_df.head(3)

Unnamed: 0,isbn,ratings_no,avg_rating,total_ratings_no,adjusted_ratings_no,book_score,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l
0,590353403,14,6.714286,168,1.166667,7.833333,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...
1,439064864,10,6.3,170,0.588235,3.705882,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...
2,345350499,10,8.4,181,0.552486,4.640884,The Mists of Avalon,MARION ZIMMER BRADLEY,1987,Del Rey,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...


#### Filter the results and recommend books

In [119]:
# Filter book_recs_df

print(f"book_recs_df.shape before filtering = {book_recs_df.shape}")

book_recs_df = book_recs_df[~book_recs_df['isbn'].isin(books_user_like_df['isbn'])]

# Create a new 'mod_title' column in both book_recs_df and books_user_like_df 
book_recs_df['mod_title'] = book_recs_df['title'].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()
book_recs_df['mod_title'] = book_recs_df['mod_title'].str.replace("\s+", " ", regex=True)

books_user_like_df['mod_title'] = books_user_like_df['title'].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()
books_user_like_df['mod_title'] = books_user_like_df['mod_title'].str.replace("\s+", " ", regex=True)

book_recs_df = book_recs_df[~book_recs_df['mod_title'].isin(books_user_like_df['mod_title'])]

print(f"book_recs_df.shape after filtering = {book_recs_df.shape}")

book_recs_df.shape before filtering = (46, 13)
book_recs_df.shape after filtering = (42, 14)


In [125]:
top_books_recs_df = book_recs_df.drop_duplicates('title').sort_values(by='book_score', ascending=False)
top_books_recs_df.head(10)

Unnamed: 0,isbn,ratings_no,avg_rating,total_ratings_no,adjusted_ratings_no,book_score,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l,mod_title
3,0439136350,10,7.1,197,0.507614,3.604061,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,http://images.amazon.com/images/P/0439136350.0...,http://images.amazon.com/images/P/0439136350.0...,harry potter and the prisoner of azkaban book 3
38,0451458621,3,5.333333,20,0.45,2.4,Priestess Of Avalon,Marion Zimmer Bradley,2002,Roc,http://images.amazon.com/images/P/0451458621.0...,http://images.amazon.com/images/P/0451458621.0...,http://images.amazon.com/images/P/0451458621.0...,priestess of avalon
15,0618002219,4,8.75,70,0.228571,2.0,The Hobbit: or There and Back Again,J.R.R. Tolkien,1999,Houghton Mifflin Company,http://images.amazon.com/images/P/0618002219.0...,http://images.amazon.com/images/P/0618002219.0...,http://images.amazon.com/images/P/0618002219.0...,the hobbit or there and back again
6,0439139600,7,6.428571,193,0.253886,1.632124,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2002,Scholastic Paperbacks,http://images.amazon.com/images/P/0439139600.0...,http://images.amazon.com/images/P/0439139600.0...,http://images.amazon.com/images/P/0439139600.0...,harry potter and the goblet of fire book 4
14,0451454243,4,6.25,65,0.246154,1.538462,The Forest House,Marion Zimmer Bradley,1995,Roc,http://images.amazon.com/images/P/0451454243.0...,http://images.amazon.com/images/P/0451454243.0...,http://images.amazon.com/images/P/0451454243.0...,the forest house
33,0618002227,3,8.333333,63,0.142857,1.190476,The Fellowship of the Ring (The Lord of the Ri...,J. R. R. Tolkien,1999,Houghton Mifflin Company,http://images.amazon.com/images/P/0618002227.0...,http://images.amazon.com/images/P/0618002227.0...,http://images.amazon.com/images/P/0618002227.0...,the fellowship of the ring the lord of the rin...
5,043935806X,7,6.857143,334,0.146707,1.005988,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,http://images.amazon.com/images/P/043935806X.0...,http://images.amazon.com/images/P/043935806X.0...,http://images.amazon.com/images/P/043935806X.0...,harry potter and the order of the phoenix book 5
32,0385425074,3,3.333333,33,0.272727,0.909091,Backlash: The Undeclared War Against American ...,Susan Faludi,1992,Anchor,http://images.amazon.com/images/P/0385425074.0...,http://images.amazon.com/images/P/0385425074.0...,http://images.amazon.com/images/P/0385425074.0...,backlash the undeclared war against american w...
41,1573225517,3,9.0,105,0.085714,0.771429,High Fidelity,Nick Hornby,1996,Riverhead Books,http://images.amazon.com/images/P/1573225517.0...,http://images.amazon.com/images/P/1573225517.0...,http://images.amazon.com/images/P/1573225517.0...,high fidelity
26,0345325818,3,4.0,63,0.142857,0.571429,The Silmarillion,J.R.R. TOLKIEN,1985,Del Rey,http://images.amazon.com/images/P/0345325818.0...,http://images.amazon.com/images/P/0345325818.0...,http://images.amazon.com/images/P/0345325818.0...,the silmarillion


#### Improve the display of the books

In [133]:
top_books_recs_df.head(5).style.format({'image-url-s': show_image, 'image-url-m': make_clickable})

Unnamed: 0,isbn,ratings_no,avg_rating,total_ratings_no,adjusted_ratings_no,book_score,title,author,publication_year,publisher,image-url-s,image-url-m,image-url-l,mod_title
3,439136350,10,7.1,197,0.507614,3.604061,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,,Amazon Image,http://images.amazon.com/images/P/0439136350.01.LZZZZZZZ.jpg,harry potter and the prisoner of azkaban book 3
38,451458621,3,5.333333,20,0.45,2.4,Priestess Of Avalon,Marion Zimmer Bradley,2002,Roc,,Amazon Image,http://images.amazon.com/images/P/0451458621.01.LZZZZZZZ.jpg,priestess of avalon
15,618002219,4,8.75,70,0.228571,2.0,The Hobbit: or There and Back Again,J.R.R. Tolkien,1999,Houghton Mifflin Company,,Amazon Image,http://images.amazon.com/images/P/0618002219.01.LZZZZZZZ.jpg,the hobbit or there and back again
6,439139600,7,6.428571,193,0.253886,1.632124,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2002,Scholastic Paperbacks,,Amazon Image,http://images.amazon.com/images/P/0439139600.01.LZZZZZZZ.jpg,harry potter and the goblet of fire book 4
14,451454243,4,6.25,65,0.246154,1.538462,The Forest House,Marion Zimmer Bradley,1995,Roc,,Amazon Image,http://images.amazon.com/images/P/0451454243.01.LZZZZZZZ.jpg,the forest house
