In [1]:
"""
Uses cosine similarity.
Filters only books where at least 100 reviews are present.
Only where reviewers have given at least 200 reviews.
"""


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [2]:
books_df = pd.read_csv("../data/Books.csv")
users_df = pd.read_csv("../data/Users.csv")
ratings_df = pd.read_csv("../data/ratings.csv")

  books_df = pd.read_csv("../data/Books.csv")


In [3]:
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
# Clean up
# 1. Find the user ID of those who have rater either too few books (<5) and too many books (>200)
# 2. Remove those users from the ratings_df

valid_transaction_df = ratings_df.groupby('User-ID').filter(lambda x: len(x) > 5 and len(x) < 200)
valid_transaction_df.groupby('User-ID').size().sort_values(ascending=False)

User-ID
240403    199
203017    199
193458    199
2033      198
267061    198
         ... 
95420       6
95156       6
231313      6
95146       6
233397      6
Length: 18812, dtype: int64

In [5]:
ratings_with_book_titles = ratings_df.merge(books_df,on='ISBN')
ratings_with_book_titles.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [6]:
ratings_with_book_titles.drop(columns=["ISBN","Image-URL-S","Image-URL-M"],axis=1,inplace=True)

In [7]:
complete_df = ratings_with_book_titles.merge(users_df.drop("Age", axis=1), on="User-ID")
complete_df.head()

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,Location
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,"tyler, texas, usa"
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,"cincinnati, ohio, usa"
2,2313,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,"cincinnati, ohio, usa"
3,2313,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,http://images.amazon.com/images/P/0679745580.0...,"cincinnati, ohio, usa"
4,2313,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,http://images.amazon.com/images/P/0060173289.0...,"cincinnati, ohio, usa"


In [8]:
complete_df['Location'] = complete_df['Location'].str.split(',').str[-1].str.strip()
complete_df.head()

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,Location
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,usa
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,usa
2,2313,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,usa
3,2313,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,http://images.amazon.com/images/P/0679745580.0...,usa
4,2313,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,http://images.amazon.com/images/P/0060173289.0...,usa


In [9]:
min_user_reviewers = 200
user_ratings = complete_df.groupby('User-ID').count()
filtered_users = user_ratings[user_ratings > min_user_reviewers].index
complete_df = complete_df[complete_df['User-ID'].isin(filtered_users)]


In [10]:
min_ratings_count_threshold=100
rating_counts= complete_df.groupby('Book-Title').count()['Book-Rating']
popular_books = rating_counts[rating_counts >= min_ratings_count_threshold].index

In [11]:
final_ratings =  complete_df[complete_df['Book-Title'].isin(popular_books)]
print(f"Number of ratings: {len(final_ratings)}")
final_ratings.head()

Number of ratings: 183799


Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,Location
2,2313,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,usa
4,2313,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,http://images.amazon.com/images/P/0060173289.0...,usa
6,2313,5,The Bonesetter's Daughter,Amy Tan,2001,Putnam Publishing Group,http://images.amazon.com/images/P/0399146431.0...,usa
7,2313,9,The Princess Bride: S Morgenstern's Classic Ta...,WILLIAM GOLDMAN,1987,Del Rey,http://images.amazon.com/images/P/0345348036.0...,usa
9,2313,0,The Sparrow,MARY DORIA RUSSELL,1997,Fawcett Books,http://images.amazon.com/images/P/0449912558.0...,usa


In [12]:
pt = final_ratings.pivot_table(index='Book-Title',columns='User-ID'
                          ,values='Book-Rating')
pt.head()

User-ID,8,9,14,16,17,26,32,39,42,44,...,278819,278820,278824,278828,278832,278836,278843,278844,278846,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,,,,,,,,,,,...,,,,,,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
24 Hours,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,,,,,,,,,,...,,,,,,,,,,
4 Blondes,,,,,,,,,,,...,,,,,,,,,,


In [13]:
pt.fillna(0,inplace=True)
pt

User-ID,8,9,14,16,17,26,32,39,42,44,...,278819,278820,278824,278828,278832,278836,278843,278844,278846,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24 Hours,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Define the rating scale
reader = Reader(rating_scale=(0, 10))

# Load the data into Surprise's dataset format
data = Dataset.load_from_df(complete_df[['User-ID', 'Book-Title', 'Book-Rating']], reader)

# Split the dataset into training and testing sets
train_set, test_set = train_test_split(data, test_size=0.20, random_state=42)

# Define the SVD algorithm
model = SVD()

# Train the algorithm on the training set
model.fit(train_set)

# Make predictions on the test set
predictions = model.test(test_set)

# Evaluate the model
accuracy.rmse(predictions)


RMSE: 3.5208


3.5208088777408277

In [15]:
def recommend_books(user_id, n=10):
    # List all unique book titles
    all_books = complete_df['Book-Title'].unique()

    # Remove books already rated by the user
    rated_books = complete_df[complete_df['User-ID'] == user_id]['Book-Title'].values
    books_to_predict = [book for book in all_books if book not in rated_books]

    # Predict ratings for remaining books
    predictions = []
    for book in books_to_predict:
        pred = model.predict(user_id, book)
        predictions.append((book, pred.est))

    # Sort predictions by estimated rating
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Get top N recommendations
    top_n = predictions[:n]

    return top_n

In [16]:
user_id = 271705
recommended_books = recommend_books(user_id)
print(f"Top 10 recommended books for user {user_id}:")
for i, (title, _) in enumerate(recommended_books, start=1):
    print(f"{i}. {title}")

Top 10 recommended books for user 271705:
1. Ahab's Wife: Or, The Star-Gazer: A Novel
2. The Color of Magic
3. The Dive From Clausen's Pier (Alex Awards)
4. Q Is for Quarry
5. The Honk and Holler Opening Soon
6. Night Sins
7. A Prayer for Owen Meany
8. Loves Music, Loves to Dance
9. Nickel and Dimed: On (Not) Getting By in America
10. Stupid White Men ...and Other Sorry Excuses for the State of the Nation!


In [17]:
def get_user_history(user_id):
    return complete_df[complete_df['User-ID'] == user_id]['Book-Title'].values

In [18]:
get_user_history(user_id)

array(['A Wrinkle In Time',
       'Politically Correct Bedtime Stories: Modern Tales for Our Life and Times',
       "One Flew over the Cuckoo's Nest (Penguin Classics)",
       'The Da Vinci Code', 'The Joy Luck Club',
       "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
       'To Kill a Mockingbird',
       'Reading Lolita in Tehran: A Memoir in Books',
       'The Divine Secrets of the Ya-Ya Sisterhood: A Novel',
       'Girl in Hyacinth Blue', 'While I Was Gone',
       'Girl with a Pearl Earring', 'Chocolat',
       'As mulheres deviam vir com livro de instruÃ§Ãµes (Campo da literatura)',
       'Little Altars Everywhere: A Novel',
       'The Red Tent (Bestselling Backlist)', 'A Walk to Remember',
       'Bel Canto: A Novel', 'Oliver Twist (Penguin Classics)',
       'The Secret Life of Bees', 'Lucky : A Memoir',
       "White Oleander : A Novel (Oprah's Book Club)",
       "The Dress Lodger (Ballantine Reader's Circle)",
       'Middlesex: A Novel',
     

In [22]:

def explore_user(user_id):
    print(get_user_history(user_id))
    print("===== Recommendations ========")
    recommended_books = recommend_books(user_id)
    print(f"Top 10 recommended books for user {user_id}:")
    for i, (title, _) in enumerate(recommended_books, start=1):
        print(f"{i}. {title}")


In [23]:
user_id=276808
explore_user(user_id)

['Richard Brautigan : A Confederate General from Big Sur, Dreaming of Babylon, and the Hawkline Monster (Three Books in the Manner of Their Original ed)']
Top 10 recommended books for user 276808:
1. Free
2. The Little Prince
3. A Light in the Attic
4. The Curious Incident of the Dog in the Night-Time (Vintage Contemporaries)
5. The Phantom Tollbooth
6. Interpreter of Maladies
7. Tuesdays with Morrie: An Old Man, a Young Man, and Life's Greatest Lesson
8. BAG OF BONES : A NOVEL
9. Falling Up
10. Anne Frank: The Diary of a Young Girl


In [24]:
user_ids = [276822, 276847, 276856]
for user_id in user_ids:
    explore_user(user_id)
    print("====================================="*2)

['The Boy Next Door' 'Skin and Other Stories (Now in Speak!)'
 'Growing Wings' 'The Riddle of Scheherazade: And Other Amazing Puzzles'
 'Hoot (Newbery Honor Book)'
 "A Kid's Guide to How to Save the Planet (Camelot world)"
 'The Last Book in the Universe' 'The Contest' 'A String in the Harp'
 'The Sandy Bottom Orchestra' 'Artemis Fowl (Artemis Fowl, Book 1)'
 'The Number Devil: A Mathematical Adventure' 'Surviving Sam'
 'Random Acts of Kindness']
Top 10 recommended books for user 276822:
1. A Prayer for Owen Meany
2. Harry Potter and the Sorcerer's Stone (Book 1)
3. Seabiscuit: An American Legend
4. Free
5. Falling Up
6. The Secret Life of Bees
7. YESTERDAY, I CRIED : Celebrating the Lessons of Living and Loving
8. Interpreter of Maladies
9. The Jungle (Bantam Classics)
10. Wicked: The Life and Times of the Wicked Witch of the West
['Along Came a Spider (Alex Cross Novels)' 'Schlafes Bruder'
 'Der Stein der Kelten.' 'Nordermoor' 'Nur der Tod ist ohne Makel.'
 'Der Kleine Hobbit'
 'Die 