In [1]:
"""
Uses cosine similarity.
Filters only books where at least 100 reviews are present.
Only where reviewers have given at least 200 reviews.
"""


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [2]:
books_df = pd.read_csv("../data/Books.csv")
users_df = pd.read_csv("../data/Users.csv")
ratings_df = pd.read_csv("../data/ratings.csv")

  books_df = pd.read_csv("../data/Books.csv")


In [3]:
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
# Clean up
# 1. Find the user ID of those who have rater either too few books (<5) and too many books (>200)
# 2. Remove those users from the ratings_df

valid_transaction_df = ratings_df.groupby('User-ID').filter(lambda x: len(x) > 5 and len(x) < 200)
valid_transaction_df.groupby('User-ID').size().sort_values(ascending=False)

User-ID
240403    199
203017    199
193458    199
2033      198
267061    198
         ... 
95420       6
95156       6
231313      6
95146       6
233397      6
Length: 18812, dtype: int64

In [5]:
ratings_with_book_titles = ratings_df.merge(books_df,on='ISBN')
ratings_with_book_titles.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [6]:
ratings_with_book_titles.drop(columns=["ISBN","Image-URL-S","Image-URL-M"],axis=1,inplace=True)

In [7]:
complete_df = ratings_with_book_titles.merge(users_df.drop("Age", axis=1), on="User-ID")
complete_df.head()

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,Location
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,"tyler, texas, usa"
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,"cincinnati, ohio, usa"
2,2313,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,"cincinnati, ohio, usa"
3,2313,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,http://images.amazon.com/images/P/0679745580.0...,"cincinnati, ohio, usa"
4,2313,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,http://images.amazon.com/images/P/0060173289.0...,"cincinnati, ohio, usa"


In [8]:
complete_df['Location'] = complete_df['Location'].str.split(',').str[-1].str.strip()
complete_df.head()

Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,Location
0,276725,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,usa
1,2313,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,usa
2,2313,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,usa
3,2313,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,http://images.amazon.com/images/P/0679745580.0...,usa
4,2313,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,http://images.amazon.com/images/P/0060173289.0...,usa


In [9]:
min_user_reviewers = 200
user_ratings = complete_df.groupby('User-ID').count()
filtered_users = user_ratings[user_ratings > min_user_reviewers].index
complete_df = complete_df[complete_df['User-ID'].isin(filtered_users)]


In [10]:
min_ratings_count_threshold=100
rating_counts= complete_df.groupby('Book-Title').count()['Book-Rating']
popular_books = rating_counts[rating_counts >= min_ratings_count_threshold].index

In [45]:
final_ratings =  complete_df[complete_df['Book-Title'].isin(popular_books)]
print(f"Number of ratings: {len(final_ratings)}")
final_ratings.head()

Number of ratings: 183799


Unnamed: 0,User-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L,Location
2,2313,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,usa
4,2313,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,http://images.amazon.com/images/P/0060173289.0...,usa
6,2313,5,The Bonesetter's Daughter,Amy Tan,2001,Putnam Publishing Group,http://images.amazon.com/images/P/0399146431.0...,usa
7,2313,9,The Princess Bride: S Morgenstern's Classic Ta...,WILLIAM GOLDMAN,1987,Del Rey,http://images.amazon.com/images/P/0345348036.0...,usa
9,2313,0,The Sparrow,MARY DORIA RUSSELL,1997,Fawcett Books,http://images.amazon.com/images/P/0449912558.0...,usa


In [25]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Define the rating scale
reader = Reader(rating_scale=(0, 10))

# Load the data into Surprise's dataset format
data = Dataset.load_from_df(final_ratings[['User-ID', 'Book-Title', 'Book-Rating']], reader)

# Split the dataset into training and testing sets
train_set, test_set = train_test_split(data, test_size=0.20, random_state=42)

# Define the SVD algorithm
model = SVD()

# Train the algorithm on the training set
model.fit(train_set)

# Make predictions on the test set
predictions = model.test(test_set)

# Evaluate the model
accuracy.rmse(predictions)


RMSE: 3.8444


3.8444398543679816

In [27]:
def recommend_books(user_id, n=10):
    # List all unique book titles
    all_books = final_ratings['Book-Title'].unique()

    # Remove books already rated by the user
    rated_books = final_ratings[final_ratings['User-ID'] == user_id]['Book-Title'].values
    books_to_predict = [book for book in all_books if book not in rated_books]

    # Predict ratings for remaining books
    predictions = []
    for book in books_to_predict:
        pred = model.predict(user_id, book)
        predictions.append((book, pred.est))

    # Sort predictions by estimated rating
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Get top N recommendations
    top_n = predictions[:n]

    return top_n

In [47]:

import random
def recommend_books_fabricated_profile(ratings:dict,  n=10):
    # List all unique book titles
    all_books = final_ratings['Book-Title'].unique()

    # Remove books already rated by the user
    # rated_books = final_ratings[final_ratings['User-ID'] == user_id]['Book-Title'].values

    rated_books = ratings.keys()

    books_to_predict = [book for book in all_books if book not in rated_books]

    # Predict ratings for remaining books
    predictions = []
    random_user_id = random.randint(2000, 1000000)
    for book in books_to_predict:
        # add history to model.
        for rated_book, rating in ratings.items():
            final_ratings = final_ratings.append({'User-ID': random_user_id, 'Book-Title': rated_book, 'Book-Rating': rating}, ignore_index=True)

        # retrain model
        data = Dataset.load_from_df(final_ratings[['User-ID', 'Book-Title', 'Book-Rating']], reader)
        model = SVD()
        model.fit(data.build_full_trainset())
        pred = model.predict(random_user_id, book)
        predictions.append((book, pred.est))

    # Sort predictions by estimated rating
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Get top N recommendations
    top_n = predictions[:n]

    return top_n

In [48]:
# Recommend fabricated where user has rated only: Harry Potter and the Sorcerer's Stone as 10.

recommend_books_fabricated_profile({"Harry Potter and the Sorcerer's Stone":0})

UnboundLocalError: local variable 'final_ratings' referenced before assignment

In [33]:
def get_user_history(user_id):
    return final_ratings[final_ratings['User-ID'] == user_id]['Book-Title'].values

In [34]:

def explore_user(user_id):
    print(get_user_history(user_id))
    print("===== Recommendations ========")
    recommended_books = recommend_books(user_id)
    print(f"Top 10 recommended books for user {user_id}:")
    for i, (title, _) in enumerate(recommended_books, start=1):
        print(f"{i}. {title}")


In [35]:
user_id=276808
explore_user(user_id)

[]
Top 10 recommended books for user 276808:
1. The Little Prince
2. Harry Potter and the Sorcerer's Stone (Book 1)
3. Ender's Game (Ender Wiggins Saga (Paperback))
4. Harry Potter and the Prisoner of Azkaban (Book 3)
5. Harry Potter and the Goblet of Fire (Book 4)
6. The Perks of Being a Wallflower
7. Anne Frank: The Diary of a Young Girl
8. Harry Potter and the Order of the Phoenix (Book 5)
9. Ishmael: An Adventure of the Mind and Spirit
10. The Stand: Complete and Uncut


In [36]:
user_ids = [276822, 276847, 276856]
for user_id in user_ids:
    explore_user(user_id)
    print("====================================="*2)

['The Boy Next Door' 'Artemis Fowl (Artemis Fowl, Book 1)']
Top 10 recommended books for user 276822:
1. Harry Potter and the Sorcerer's Stone (Book 1)
2. Harry Potter and the Prisoner of Azkaban (Book 3)
3. The Da Vinci Code
4. Hard Eight : A Stephanie Plum Novel (A Stephanie Plum Novel)
5. The Little Prince
6. The Curious Incident of the Dog in the Night-Time (Today Show Book Club #13)
7. Pride and Prejudice
8. The Bean Trees
9. Tuesdays with Morrie: An Old Man, a Young Man, and Life's Greatest Lesson
10. The Fellowship of the Ring (The Lord of the Rings, Part 1)
['Along Came a Spider (Alex Cross Novels)']
Top 10 recommended books for user 276847:
1. The Return of the King (The Lord of the Rings, Part 3)
2. Angus, Thongs and Full-Frontal Snogging: Confessions of Georgia Nicolson
3. A Year in Provence
4. Chicken Soup for the Woman's Soul (Chicken Soup for the Soul Series (Paper))
5. American Gods
6. Interview with the Vampire
7. Balzac and the Little Chinese Seamstress : A Novel
8. Ha

In [37]:
# Explore all users available interests.
all_users = final_ratings['User-ID'].unique()
for user_id in all_users[:10]:
    explore_user(user_id)
    print("====================================="*2)
    


["Ender's Game (Ender Wiggins Saga (Paperback))"
 'Divine Secrets of the Ya-Ya Sisterhood : A Novel'
 "The Bonesetter's Daughter"
 "The Princess Bride: S Morgenstern's Classic Tale of True Love and High Adventure"
 'The Sparrow' 'Midwives: A Novel' 'The Hundred Secret Senses'
 "Song of Solomon (Oprah's Book Club (Paperback))"]
Top 10 recommended books for user 2313:
1. The Lovely Bones: A Novel
2. Harry Potter and the Order of the Phoenix (Book 5)
3. The Tao of Pooh
4. Brave New World
5. Big Stone Gap: A Novel (Ballantine Reader's Circle)
6. The Little Prince
7. The Hitchhiker's Guide to the Galaxy
8. Animal Farm
9. Little Altars Everywhere: A Novel
10. Choke
['The Lovely Bones: A Novel' 'The Da Vinci Code' 'Wild Animus'
 'Four To Score (A Stephanie Plum Novel)'
 'Roses Are Red (Alex Cross Novels)' 'Violets Are Blue'
 "The Sweet Potato Queens' Book of Love" 'Fight Club' 'Fahrenheit 451'
 '1st to Die: A Novel' 'The Red Tent (Bestselling Backlist)'
 'The Passion of Artemisia'
 'One for t