In [33]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from IPython.display import display, Image
warnings.filterwarnings("ignore")

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [3]:
books = pd.read_csv("data/old/Books.csv")
users = pd.read_csv("data/old/Users.csv")
ratings = pd.read_csv('data/old/Ratings.csv')

In [4]:
print(books.shape)
print(ratings.shape)
print(users.shape)

(271360, 8)
(1048575, 3)
(278858, 3)


In [6]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [7]:
ratings_with_book_titles = ratings.merge(books,on='ISBN')

In [8]:
ratings_with_book_titles.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [10]:
ratings_with_book_titles.drop(columns=["ISBN","Image-URL-S","Image-URL-M"],axis=1,inplace=True)

KeyError: "['Location'] not found in axis"

In [11]:
ratings_with_book_titles

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
2,6543,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
3,8680,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
4,10314,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
...,...,...,...,...,...,...,...
941108,250764,0,HIDDEN FIRES,JANETTE RADCLIFFE,1978,Dell,http://images.amazon.com/images/P/0440106575.0...
941109,250764,0,"Cheyenne (Fortunes West, No 2)",A.R. Riefe,1988,New Amer Library (Mm),http://images.amazon.com/images/P/0451157516.0...
941110,250764,0,Glamorous Movie Stars of the Thirties: Paper D...,Tom Tierney,1982,Dover Publications,http://images.amazon.com/images/P/048623715X.0...
941111,250764,0,Schiaparelli Fashion Review: Paper Dolls in Fu...,Tom Tierney,1988,Dover Publications,http://images.amazon.com/images/P/0486256588.0...


### Collaborative Filtering

In [12]:
# users with more than 180 book ratings
rate_threshold = 180

# Count ratings per user
num_ratings_per_user = ratings_with_book_titles.groupby('User-ID')['Book-Rating'].count()

# Filter users with more than the minimum threshold
user_ids = num_ratings_per_user[num_ratings_per_user >rate_threshold].index

In [14]:
# Filter ratings from user_ids
user_ratings =ratings_with_book_titles[ratings_with_book_titles['User-ID'].isin(user_ids)]

In [15]:
min_rate_count_thresh=50
rating_counts= user_ratings.groupby('Book-Title').count()['Book-Rating']
popular_books = rating_counts[rating_counts >=min_rate_count_thresh].index

In [16]:
final_ratings = user_ratings[user_ratings['Book-Title'].isin(popular_books)]

In [17]:
pivot_table = final_ratings.pivot_table(index='Book-Title',columns='User-ID'
                          ,values='Book-Rating')
pivot_table

User-ID,254,2033,2276,2766,2977,3363,4017,4385,6251,6323,...,249862,249894,250184,250405,250764,277427,277478,277639,278188,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,,10.0,,,,,,,,...,,,,,,,,0.0,,
4 Blondes,,,,,,,,,0.0,,...,,,,,,,,,,
A Bend in the Road,0.0,,,7.0,,,,,,,...,,,,,,,,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,,7.0,,,,,,...,,,,,,,,,,
You Belong To Me,,,,,,,,,,0.0,...,,,0.0,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,,0.0,,,0.0,,...,,,,,,,,,,
Zoya,,,,,,,,,,,...,,,,,0.0,,,,,


In [18]:
pivot_table.fillna(0,inplace=True)
pivot_table

User-ID,254,2033,2276,2766,2977,3363,4017,4385,6251,6323,...,249862,249894,250184,250405,250764,277427,277478,277639,278188,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### cosine_similarity 
matrix as input, each rows represent a data point and  columns represent a feature. 
So in my data, rows represent users,  columns represent book titles. 
Function calculates the cosine similarity between every pair of users in the matrix, measuring angle between two vectors;
a 1 score shows perfect similarity, 
and 0 shows perfect dissimilarity. 
Output is square matrix where each element (i, j) represents the cosine similarity score between user i and user j.

Use this matrix to recommend items to users based on their similarity to other users who have rated those books highly.
For example, find the user with the highest cosine similarity to a particular user and recommend the items that the similar user rated highly.

In [19]:
from sklearn.metrics.pairwise import cosine_similarity 

In [20]:
similarity_score = cosine_similarity(pivot_table)

In [26]:
from sklearn.metrics.pairwise import cosine_similarity 

similarity_score = cosine_similarity(pivot_table)

def recommend(book_name):
    index = np.where(pivot_table.index==book_name)[0][0]
    similar_books = sorted(list(enumerate(similarity_score[index])),key=lambda x:x[1], reverse=True)[1:6]
    
    data = []
    
    for i in similar_books:
        item = []
        temp_df = books[books['Book-Title'] == pivot_table.index[i[0]]]
        item.append(temp_df['Book-Title'].values[0]) 
        item.append(temp_df['Book-Author'].values[0])
        
        similarity = i[1]  # similarity score
        item.append(similarity)  # Adding similarity score
        
        data.append(item)
    return data

In [28]:
recommended_books = recommend("The Alienist")
for book in recommended_books:
    print(f"Book Title: {book[0]}")
    print(f"Author: {book[1]}")
    print(f"Similarity Score: {book[2]}")
    print()


Book Title: The Poisonwood Bible
Author: Barbara Kingsolver
Similarity Score: 0.30163647403280186

Book Title: The Angel of Darkness
Author: Caleb Carr
Similarity Score: 0.2955209461735067

Book Title: The Cradle Will Fall
Author: Mary Higgins Clark
Similarity Score: 0.28569389653568555

Book Title: The Shipping News : A Novel
Author: Annie Proulx
Similarity Score: 0.27750180972638644

Book Title: Secret History
Author: DONNA TARTT
Similarity Score: 0.25964676550264176



RMSE: 3.5036


3.5036455509735864