In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
import numpy as np
import random
import pickle 

def scrape_goodreads_ratings(user_id, max_pages=10):
    """
    Scrape a user's star ratings from Goodreads.
    
    Args:
    - user_id (str): Goodreads user ID or profile suffix.
    - max_pages (int): Maximum number of pages to scrape (each page contains ~30 books).
    
    Returns:
    - pd.DataFrame: A DataFrame containing book titles and ratings.
    """
    base_url = f"https://www.goodreads.com/review/list/{user_id}?shelf=read"
    headers = {"User-Agent": "Mozilla/5.0"}
    books = []

    for page in range(1, max_pages + 1):
        url = f"{base_url}&page={page}"
        response = requests.get(url, headers=headers)
        
        if response.status_code != 200:
            print(f"Failed to fetch page {page}. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        
        # Find all book entries in the table
        rows = soup.find_all("tr", class_="bookalike review")
        if not rows:
            print("No more data found.")
            break

        for row in rows:
            try:
                title = row.find("td", class_="field title").a.text.strip()
                rating_element = row.find("td", class_="field rating")
                rating = rating_element.find("span", class_="staticStars").get("title", "No rating")
                stars = map_rating(rating)
                books.append({"Title": title, "Rating": stars, "User_id": user_id})
#                 print(title, rating, stars)
            except AttributeError:
                # Handle rows with missing data
                continue

        print(f"Page {page} scraped successfully.")
        time.sleep(random.uniform(1, 5))  # Be kind to the server and avoid being blocked

    # Return data as a pandas DataFrame
    return pd.DataFrame(books)



In [2]:
def map_rating(phrase):
    rating_map = {
        "liked it": 3,
        "really liked it": 4,
        "it was ok": 2, 
        "it was amazing": 5, 
        "did not like it": 1,
    }
    
    return rating_map.get(phrase, "Invalid rating")  # Default to "Invalid rating" if the phrase isn't in the dictionary


In [3]:
# if __name__ == "__main__":
# #     user_id = "6688207"  # Replace with the Goodreads user ID or profile suffix
# #     for user_id in tqdm(['30181442', '75009563', '11345366', '110912303', '113964939', '11215896', '53701594', '4622890', '93628736', '176180116']):
# #     for user_id in tqdm(['2974095', '4622890', '28953843', '16174645', '4159922', '4125660', '54886546', '16912659', '260116', '4685500', '21865425']):
# #     for user_id in tqdm(['53701594', '27709782', '7566229', '16652861', '30817744', '56259255', '4125660', '60964126', 
# #                          '176167767', '28510930', '1029975', '131020767', '28862120', '88713906', '160141433', '41097916', 
# #                          '20809863', '69519261', '24017481', '7376365', '75941333', '13571407', '106618742', '17792052',
# #                          '3534528', '130656897', '7474475', '4125412', '6336365', '6026811', '3438047']):
#     for user_id in ['169695556']:
#         print("User_id = ", user_id)
#         max_pages = 30  # Adjust based on expected data
#         ratings_data = scrape_goodreads_ratings(user_id, max_pages)

#         if not ratings_data.empty:
# #             print(ratings_data.head())
# #             ratings_data.to_csv("goodreads_ratings.csv", index=False)
#             ratings_data.to_csv('goodreads_ratings.csv', mode='a', header=False, index=False)
#             print("Data saved to goodreads_ratings.csv.")
#         else:
#             print("No data retrieved.")


In [4]:
df = pd.read_csv('goodreads_ratings_series.csv')
print(df.shape)
df = df.drop_duplicates()
# Print the entire DataFrame
print(df)

duplicate_count = df['Title'].duplicated().sum()
print("Number of books with at least two people rating it:", duplicate_count)
duplicate_counts_per_value = df['Title'].value_counts()
print(duplicate_counts_per_value)
print("Number of unique books: ", df['Title'].nunique())
num_users = df['User_id'].nunique()
user_ids = list(df['User_id'].unique())
print("number of users is: ", num_users)
print("user_ids = ", user_ids)

  df = pd.read_csv('goodreads_ratings_series.csv')


(223442, 6)
                                                    Title          Rating  \
0                                       I Am Watching You               3   
1       Three to Get Deadly\n        (Stephanie Plum, #3)               3   
2       Before the Coffee Gets Cold\n        (Before t...               4   
3       Dark Sacred Night\n        (Renée Ballard, #2;...               4   
4         Two for the Dough\n        (Stephanie Plum, #2)               4   
...                                                   ...             ...   
223437  Rise of the Evening Star\n        (Fablehaven,...               5   
223438  Harry Potter and the Sorcerer's Stone\n       ...               5   
223439       Perencanaan dan Pembangunan Sistem Informasi  Invalid rating   
223440                                         Two By Two               5   
223441   Something Borrowed\n        (Darcy & Rachel, #1)               5   

          User_id Series  First  Suggest  
0       169695558  F

In [5]:
# # Get a list of top titles in order
# top_titles = duplicate_counts_per_value.index.tolist()
# top_100 = top_titles[:100]

# for title in top_100:
#     print(title)
    
# with open("top_100.pkl", "wb") as file:
#     pickle.dump(top_100, file)

In [6]:
# threshold = 5#num_users * 0.1
# pop_titles = list(duplicate_counts_per_value[duplicate_counts_per_value > threshold].index)
# my_titles = df.loc[df["User_id"] == 169695558, "Title"].tolist()
# # print(my_titles)

# print("pop titles len = ", len(pop_titles))
# print(pop_titles)
# print("my titles len = ", len(my_titles))
# titles = list(set(pop_titles))# + my_titles))

# # #remove Harry Potter titles:
# # titles = [s for s in titles if "Harry Potter" not in s]

# num_titles = len(titles)


# print(titles)
# # print(titles)
# print("num_titles =", num_titles)

# # ratings = np.full((num_users, num_titles), None)
# ratings = np.zeros((num_users, num_titles))

# for index, row in df.iterrows():
#     if row['Title'] in titles:
#         try:
#             ratings[user_ids.index(row['User_id']), titles.index(row["Title"])] = int(row["Rating"])
# #             print("found ", row["Title"])
#         except:
#             pass
        
# print("ratings size = ", ratings.shape)
# ratings = ratings[~np.all(ratings == 0, axis=1)]
# print("ratings size = ", ratings.shape)
# # Save the list to a file
# with open("titles.pkl", "wb") as file:
#     pickle.dump(titles, file)

In [7]:
# # print(df['Suggest'])

# suggest = list(df['Suggest'])
# print(suggest)

# with open("suggest.pkl", "wb") as file:
#     suggest.dump(suggest, file)

In [8]:
# ratings_df = pd.DataFrame(ratings)
# print(ratings_df.shape)
# #delete users that don't have any of these ratings
# # ratings_df = ratings_df.loc[~(ratings_df == 0).all(axis=1)]
# ratings_df = ratings_df.loc[(ratings_df != 0).sum(axis=1) >= 4] #need at least 4 entries to stay
# print(ratings_df.shape)

# # Calculate percentage of non-zero elements
# percentage_nonzero = (np.count_nonzero(ratings_df) / ratings_df.size) * 100
# print("percentage_nonzero =", round(percentage_nonzero, 1), '%')

# # Save the list to a file
# with open("ratings_df.pkl", "wb") as file:
#     pickle.dump(ratings_df, file)


In [9]:
with open("titles.pkl", "rb") as file:
    titles = pickle.load(file)
    
with open("top_100.pkl", "rb") as file:
    top_100 = pickle.load(file)
    
with open("suggest.pkl", "rb") as file:
    suggest = pickle.load(file)
    
num_titles = len(titles)

# Load the list from the file
with open("ratings_df.pkl", "rb") as file:
    ratings_df = pickle.load(file)
    ratings = ratings_df.to_numpy()


In [10]:
# Convert the filled ratings data into a numpy array
ratings_matrix = ratings_df.values

# Initialize KNN (using user-based KNN)
import math
knn = NearestNeighbors(n_neighbors=50, metric='cosine')  # Using cosine similarity  math.ceil(num_users/10)
knn.fit(ratings_matrix)

with open("knn_model.pkl", "wb") as file:
    pickle.dump(knn, file)

# Example: Predict rating for User 0 on Item 2
user_id = 0  # Index of user in the matrix
item_id = 2  # Index of item to predict rating for

# Get the nearest neighbors for user 0 (excluding the user itself)
distances, indices = knn.kneighbors([ratings_matrix[user_id]])

pred_ratings_list = np.array([])
rankings_list = np.array([])
for item_id in range(num_titles):
    # Get the ratings for the neighbors on item 2
    neighbor_ratings = np.array([ratings_matrix[i, item_id] for i in indices[0] if not np.isnan(ratings_matrix[i, item_id])])

    predicted_rating = np.mean(neighbor_ratings[np.nonzero(neighbor_ratings)])
    rankings = np.sum(neighbor_ratings[np.nonzero(neighbor_ratings)])

#     print(predicted_rating)

    pred_ratings_list = np.append(pred_ratings_list, predicted_rating)
    rankings_list = np.append(rankings_list, rankings)
#     print(f"Predicted rating for User {user_id} on Item {item_id}: {predicted_rating}")
    
best_book_rating = np.max(pred_ratings_list)
best_book_idx = np.argmax(pred_ratings_list)

sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
n = 1
for i, idx in enumerate(sorted_indices): 
    if (ratings[0, idx] > 0) or (np.isnan(pred_ratings_list[idx])):
        pass
    else:
        print("#", (n) , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))
        n+=1

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Top books are:
# 1 Xenocide
        (Ender's Saga, #3) Rating: 5.0
# 2 The Last Lecture Rating: 5.0
# 3 The Silver Star Rating: 5.0
# 4 Dracula Rating: 5.0
# 5 Little House on the Prairie
        (Little House, #3) Rating: 5.0
# 6 Destiny of the Republic: A Tale of Madness, Medicine and the Murder of a President Rating: 5.0
# 7 Fablehaven
        (Fablehaven, #1) Rating: 5.0
# 8 The Westing Game Rating: 5.0
# 9 The Mouse and the Motorcycle
        (Ralph S. Mouse, #1) Rating: 5.0
# 10 The Queen of Nothing
        (The Folk of the Air, #3) Rating: 5.0
# 11 The Alloy of Law
        (Mistborn, #4) Rating: 5.0
# 12 The Lucky One Rating: 5.0
# 13 The Silver Linings Playbook Rating: 5.0
# 14 Beneath a Scarlet Sky Rating: 5.0
# 15 Big Little Lies Rating: 5.0
# 16 The Marriage Bargain
        (Marriage to a Billionaire, #1) Rating: 5.0
# 17 The Nest Rating: 5.0
# 18 The Storied Life of A.J. Fikry Rating: 5.0
# 19 Before I Fall Rating: 5.0
# 20 The Lost Colony
        (Artemis Fowl, #5) Rating:

# 334 Red Rising
        (Red Rising Saga, #1) Rating: 4.0
# 335 Memoirs of a Geisha Rating: 4.0
# 336 Untamed Rating: 4.0
# 337 The Measure Rating: 4.0
# 338 Before I Go to Sleep Rating: 4.0
# 339 My Sister’s Keeper Rating: 4.0
# 340 Tripwire
        (Jack Reacher, #3) Rating: 4.0
# 341 Black Beauty Rating: 4.0
# 342 And Then There Were None Rating: 4.0
# 343 Plain Truth Rating: 4.0
# 344 Cell Rating: 4.0
# 345 Circus of the Damned
        (Anita Blake, Vampire Hunter, #3) Rating: 4.0
# 346 Dragons of Autumn Twilight
        (Dragonlance: Chronicles, #1) Rating: 4.0
# 347 Violeta Rating: 4.0
# 348 Lullaby Rating: 4.0
# 349 The Eternity Code
        (Artemis Fowl, #3) Rating: 4.0
# 350 Mansfield Park Rating: 4.0
# 351 I Am Number Four
        (Lorien Legacies, #1) Rating: 4.0
# 352 Let's Explore Diabetes with Owls: Essays, Etc. Rating: 4.0
# 353 Dead in the Family
        (Sookie Stackhouse, #10) Rating: 4.0
# 354 The Kitchen House Rating: 4.0
# 355 Steve Jobs Rating: 4.0
# 356 Marked


        (Sisterhood, #1) Rating: 3.5
# 625 Life of Pi Rating: 3.5
# 626 The Silent Patient Rating: 3.5
# 627 Love in the Time of Cholera Rating: 3.5
# 628 A Light in the Attic Rating: 3.5
# 629 Nineteen Minutes Rating: 3.5
# 630 A Walk to Remember Rating: 3.4
# 631 Wuthering Heights Rating: 3.4
# 632 Matched
        (Matched, #1) Rating: 3.4
# 633 Of Mice and Men Rating: 3.4
# 634 Fearless Fourteen
        (Stephanie Plum, #14) Rating: 3.3
# 635 Something Blue
        (Darcy & Rachel, #2) Rating: 3.3
# 636 Finger Lickin' Fifteen
        (Stephanie Plum, #15) Rating: 3.3
# 637 Anna Karenina Rating: 3.3
# 638 Wicked: The Life and Times of the Wicked Witch of the West
        (The Wicked Years, #1) Rating: 3.3
# 639 Atlas Shrugged Rating: 3.3
# 640 Beloved
        (Beloved Trilogy, #1) Rating: 3.3
# 641 Forever in Blue: The Fourth Summer of the Sisterhood
        (Sisterhood, #4) Rating: 3.3
# 642 Dark Places Rating: 3.3
# 643 Ten Big Ones
        (Stephanie Plum, #10) Rating: 3.3
# 644 E

In [11]:
#make knn for the similar user part
# Initialize KNN (using user-based KNN)
# import math
knn_30 = NearestNeighbors(n_neighbors=50, metric='cosine')  # Using cosine similarity
knn_30.fit(ratings_matrix)

with open("knn_model_30.pkl", "wb") as file:
    pickle.dump(knn_30, file)

In [12]:
rankings_list

best_book_rating = np.max(rankings_list)
best_book_idx = np.argmax(rankings_list)

sorted_indices = np.argsort(rankings_list)[::-1]
print("Top books are:")
for i, idx in enumerate(sorted_indices): 
    print("#", (i+1) , titles[idx], "Rating:", round(pred_ratings_list[idx], 1), ". Ranking:", rankings_list[idx])

Top books are:
# 1 Harry Potter and the Prisoner of Azkaban
        (Harry Potter, #3) Rating: 4.8 . Ranking: 221.0
# 2 Harry Potter and the Deathly Hallows
        (Harry Potter, #7) Rating: 4.8 . Ranking: 213.0
# 3 Harry Potter and the Half-Blood Prince
        (Harry Potter, #6) Rating: 4.6 . Ranking: 212.0
# 4 Harry Potter and the Chamber of Secrets
        (Harry Potter, #2) Rating: 4.6 . Ranking: 212.0
# 5 Harry Potter and the Goblet of Fire
        (Harry Potter, #4) Rating: 4.8 . Ranking: 210.0
# 6 Harry Potter and the Order of the Phoenix
        (Harry Potter, #5) Rating: 4.6 . Ranking: 207.0
# 7 Harry Potter and the Sorcerer's Stone
        (Harry Potter, #1) Rating: 4.6 . Ranking: 195.0
# 8 The Hunger Games
        (The Hunger Games, #1) Rating: 4.7 . Ranking: 180.0
# 9 Catching Fire
        (The Hunger Games, #2) Rating: 4.5 . Ranking: 150.0
# 10 Mockingjay
        (The Hunger Games, #3) Rating: 4.4 . Ranking: 141.0
# 11 Twilight
        (The Twilight Saga, #1) Rating: 3.9

# 158 Bridge to Terabithia Rating: 4.3 . Ranking: 13.0
# 159 Dead and Gone
        (Sookie Stackhouse, #9) Rating: 4.3 . Ranking: 13.0
# 160 Thirteen Reasons Why Rating: 4.3 . Ranking: 13.0
# 161 The Pillars of the Earth
        (Kingsbridge, #1) Rating: 4.3 . Ranking: 13.0
# 162 Speak Rating: 4.3 . Ranking: 13.0
# 163 A Dance with Dragons
        (A Song of Ice and Fire, #5) Rating: 4.3 . Ranking: 13.0
# 164 The Princess Bride Rating: 4.3 . Ranking: 13.0
# 165 Fifty Shades Freed
        (Fifty Shades, #3) Rating: 3.2 . Ranking: 13.0
# 166 Scarlet
        (The Lunar Chronicles, #2) Rating: 4.3 . Ranking: 13.0
# 167 The Merchant of Death
        (Pendragon, #1) Rating: 4.3 . Ranking: 13.0
# 168 The Invention of Hugo Cabret Rating: 4.3 . Ranking: 13.0
# 169 Gathering Blue
        (The Giver, #2) Rating: 4.3 . Ranking: 13.0
# 170 The Handmaidâ€™s Tale
        (The Handmaid's Tale, #1) Rating: 4.3 . Ranking: 13.0
# 171 Sense and Sensibility Rating: 3.2 . Ranking: 13.0
# 172 A Court of Wing

# 338 Bossypants Rating: 3.5 . Ranking: 7.0
# 339 The One
        (The Selection, #3) Rating: 3.5 . Ranking: 7.0
# 340 Love in the Time of Cholera Rating: 3.5 . Ranking: 7.0
# 341 The Time Machine Rating: 3.5 . Ranking: 7.0
# 342 The Picture of Dorian Gray Rating: 3.5 . Ranking: 7.0
# 343 Forever... Rating: 2.3 . Ranking: 7.0
# 344 I Am Watching You Rating: 3.5 . Ranking: 7.0
# 345 American Assassin
        (Mitch Rapp, #1) Rating: 3.5 . Ranking: 7.0
# 346 Fever 1793 Rating: 3.5 . Ranking: 7.0
# 347 Much Ado About Nothing Rating: 3.5 . Ranking: 7.0
# 348 A Light in the Attic Rating: 3.5 . Ranking: 7.0
# 349 Wintergirls Rating: 3.5 . Ranking: 7.0
# 350 Smokin' Seventeen
        (Stephanie Plum, #17) Rating: 3.0 . Ranking: 6.0
# 351 The Husband's Secret Rating: 3.0 . Ranking: 6.0
# 352 Oliver Twist Rating: 3.0 . Ranking: 6.0
# 353 Extras
        (Uglies, #4) Rating: 3.0 . Ranking: 6.0
# 354 Don't Let the Pigeon Drive the Bus! Rating: 3.0 . Ranking: 6.0
# 355 Flowers for Algernon Rating: 

        (The Giver, #4) Rating: 4.0 . Ranking: 4.0
# 550 A Long Petal of the Sea Rating: 4.0 . Ranking: 4.0
# 551 Tripwire
        (Jack Reacher, #3) Rating: 4.0 . Ranking: 4.0
# 552 Shanghai Girls
        (Shanghai Girls, #1) Rating: 4.0 . Ranking: 4.0
# 553 The Trespasser
        (Dublin Murder Squad, #6) Rating: 4.0 . Ranking: 4.0
# 554 Tell Me Lies Rating: 4.0 . Ranking: 4.0
# 555 Rodrick Rules
        (Diary of a Wimpy Kid, #2) Rating: 4.0 . Ranking: 4.0
# 556 3rd Degree
        (Women's Murder Club, #3) Rating: 4.0 . Ranking: 4.0
# 557 Let's Explore Diabetes with Owls: Essays, Etc. Rating: 4.0 . Ranking: 4.0
# 558 The Shell Seekers Rating: 4.0 . Ranking: 4.0
# 559 The Night Circus Rating: 4.0 . Ranking: 4.0
# 560 The Reader Rating: 4.0 . Ranking: 4.0
# 561 Madame Bovary Rating: 4.0 . Ranking: 4.0
# 562 Unwind
        (Unwind, #1) Rating: 4.0 . Ranking: 4.0
# 563 1st to Die
        (Women's Murder Club, #1) Rating: 4.0 . Ranking: 4.0
# 564 The Push Rating: 4.0 . Ranking: 4.0
# 565

# 771 Make Way for Ducklings Rating: 3.0 . Ranking: 3.0
# 772 A Widow for One Year Rating: 3.0 . Ranking: 3.0
# 773 How to Win Friends & Influence People Rating: 3.0 . Ranking: 3.0
# 774 The Restaurant at the End of the Universe
        (The Hitchhiker's Guide to the Galaxy, #2) Rating: 3.0 . Ranking: 3.0
# 775 The Vacationers Rating: 3.0 . Ranking: 3.0
# 776 The Gift of the Magi Rating: 3.0 . Ranking: 3.0
# 777 Pet Sematary Rating: 3.0 . Ranking: 3.0
# 778 The Snow Child Rating: 3.0 . Ranking: 3.0
# 779 Jonathan Livingston Seagull Rating: 3.0 . Ranking: 3.0
# 780 Diary Rating: 3.0 . Ranking: 3.0
# 781 Prep Rating: 3.0 . Ranking: 3.0
# 782 They Both Die at the End
        (Death-Cast, #1) Rating: 3.0 . Ranking: 3.0
# 783 Sold Rating: 3.0 . Ranking: 3.0
# 784 The Wind in the Willows Rating: 3.0 . Ranking: 3.0
# 785 The Phantom Tollbooth Rating: 3.0 . Ranking: 3.0
# 786 The Last Juror Rating: 3.0 . Ranking: 3.0
# 787 American Gods Rating: 3.0 . Ranking: 3.0
# 788 Waiting for Godot Rating

        (Harry Hole, #7) Rating: nan . Ranking: 0.0
# 997 Britt-Marie Was Here Rating: nan . Ranking: 0.0
# 998 A Novel Love Story Rating: nan . Ranking: 0.0
# 999 The Virgin Suicides Rating: nan . Ranking: 0.0
# 1000 The Life and Times of the Thunderbolt Kid Rating: nan . Ranking: 0.0
# 1001 The People We Keep Rating: nan . Ranking: 0.0
# 1002 The Namesake Rating: nan . Ranking: 0.0
# 1003 The Inheritance Games
        (The Inheritance Games, #1) Rating: nan . Ranking: 0.0
# 1004 Lord of Chaos
        (The Wheel of Time, #6) Rating: nan . Ranking: 0.0
# 1005 The Moon Is a Harsh Mistress Rating: nan . Ranking: 0.0
# 1006 Needful Things Rating: nan . Ranking: 0.0
# 1007 Darkfever
        (Fever, #1) Rating: nan . Ranking: 0.0
# 1008 Still Alice Rating: nan . Ranking: 0.0
# 1009 The Coworker Rating: nan . Ranking: 0.0
# 1010 Leviathan
        (Leviathan, #1) Rating: nan . Ranking: 0.0
# 1011 The Innocent
        (Will Robie, #1) Rating: nan . Ranking: 0.0
# 1012 Outliers: The Story of Su

        (Shatter Me, #2) Rating: nan . Ranking: 0.0
# 1180 Things We Never Got Over
        (Knockemout, #1) Rating: nan . Ranking: 0.0
# 1181 Final Girls Rating: nan . Ranking: 0.0
# 1182 Home Before Dark Rating: nan . Ranking: 0.0
# 1183 The Sirens of Titan Rating: nan . Ranking: 0.0
# 1184 Y: The Last Man, Vol. 1: Unmanned Rating: nan . Ranking: 0.0
# 1185 The Complete Anne of Green Gables
        (Anne of Green Gables, #1-8) Rating: nan . Ranking: 0.0
# 1186 Eileen Rating: nan . Ranking: 0.0
# 1187 The Long Dark Tea-Time of the Soul
        (Dirk Gently, #2) Rating: nan . Ranking: 0.0
# 1188 Calvin and Hobbes
        (Calvin and Hobbes, #1) Rating: nan . Ranking: 0.0
# 1189 Between Shades of Gray Rating: nan . Ranking: 0.0
# 1190 Julius Caesar Rating: nan . Ranking: 0.0
# 1191 The Red Badge of Courage Rating: nan . Ranking: 0.0
# 1192 The Complete Grimm's Fairy Tales Rating: nan . Ranking: 0.0
# 1193 The Sandman, Vol. 2: The Doll's House Rating: nan . Ranking: 0.0
# 1194 The Way of

# 1393 Dolores Claiborne Rating: nan . Ranking: 0.0
# 1394 Batman: Year One Rating: nan . Ranking: 0.0
# 1395 The Illustrated Man Rating: nan . Ranking: 0.0
# 1396 Garden Spells
        (Waverley Family, #1) Rating: nan . Ranking: 0.0
# 1397 The Beach House Rating: nan . Ranking: 0.0
# 1398 Memory Man
        (Amos Decker, #1) Rating: nan . Ranking: 0.0
# 1399 The Postmistress Rating: nan . Ranking: 0.0
# 1400 Ghost Story
        (The Dresden Files, #13) Rating: nan . Ranking: 0.0
# 1401 Hyperbole and a Half: Unfortunate Situations, Flawed Coping Mechanisms, Mayhem, and Other Things That Happened Rating: nan . Ranking: 0.0
# 1402 Ghostwritten Rating: nan . Ranking: 0.0
# 1403 Cryptonomicon Rating: nan . Ranking: 0.0
# 1404 Troublemaker: Surviving Hollywood and Scientology Rating: nan . Ranking: 0.0
# 1405 In Defense of Food: An Eater's Manifesto Rating: nan . Ranking: 0.0
# 1406 The Liars' Club Rating: nan . Ranking: 0.0
# 1407 Even Cowgirls Get the Blues Rating: nan . Ranking: 0.0
# 1

# 1615 Foundation
        (Foundation, #1) Rating: nan . Ranking: 0.0
# 1616 Never Lie Rating: nan . Ranking: 0.0
# 1617 The Boyfriend Rating: nan . Ranking: 0.0
# 1618 The Murder of Roger Ackroyd
        (Hercule Poirot, #4) Rating: nan . Ranking: 0.0
# 1619 Eleanor & Park Rating: nan . Ranking: 0.0
# 1620 A Great and Terrible Beauty
        (Gemma Doyle, #1) Rating: nan . Ranking: 0.0
# 1621 The Dharma Bums Rating: nan . Ranking: 0.0
# 1622 The Time Keeper Rating: nan . Ranking: 0.0
# 1623 Becoming Rating: nan . Ranking: 0.0
# 1624 The Boleyn Inheritance
        (The Plantagenet and Tudor Novels, #10) Rating: nan . Ranking: 0.0
# 1625 Worth Dying For
        (Jack Reacher, #15) Rating: nan . Ranking: 0.0
# 1626 A Doll's House Rating: nan . Ranking: 0.0
# 1627 Cross
        (Alex Cross, #12) Rating: nan . Ranking: 0.0
# 1628 Stay Rating: nan . Ranking: 0.0
# 1629 The Nightingale Rating: nan . Ranking: 0.0
# 1630 Black and Blue Rating: nan . Ranking: 0.0
# 1631 The Last House on Needle

# 1841 Second Glance Rating: nan . Ranking: 0.0
# 1842 The Particular Sadness of Lemon Cake Rating: nan . Ranking: 0.0
# 1843 Pawn of Prophecy
        (The Belgariad, #1) Rating: nan . Ranking: 0.0
# 1844 The Drowning Woman Rating: nan . Ranking: 0.0
# 1845 My Name Is Lucy Barton
        (Amgash, #1) Rating: nan . Ranking: 0.0
# 1846 The God of Small Things Rating: nan . Ranking: 0.0
# 1847 Swan Song
        (Nantucket, #4) Rating: nan . Ranking: 0.0
# 1848 Zero Days Rating: nan . Ranking: 0.0
# 1849 Summer Knight
        (The Dresden Files, #4) Rating: nan . Ranking: 0.0
# 1850 State of Fear Rating: nan . Ranking: 0.0
# 1851 21 Lessons for the 21st Century Rating: nan . Ranking: 0.0
# 1852 Run Rating: nan . Ranking: 0.0
# 1853 Entwined with You
        (Crossfire, #3) Rating: nan . Ranking: 0.0
# 1854 Travels with Charley: In Search of America Rating: nan . Ranking: 0.0
# 1855 Beautiful Ruins Rating: nan . Ranking: 0.0
# 1856 About a Boy Rating: nan . Ranking: 0.0
# 1857 Veronika Deci

# 2068 The Idiot Rating: nan . Ranking: 0.0
# 2069 Brooklyn
        (Eilis Lacey, #1) Rating: nan . Ranking: 0.0
# 2070 A History of Wild Places Rating: nan . Ranking: 0.0
# 2071 My Brilliant Friend
        (My Brilliant Friend, #1) Rating: nan . Ranking: 0.0
# 2072 The King of Torts Rating: nan . Ranking: 0.0
# 2073 What to Expect When You're Expecting Rating: nan . Ranking: 0.0
# 2074 Lover Awakened
        (Black Dagger Brotherhood, #3) Rating: nan . Ranking: 0.0
# 2075 The Accidental Tourist Rating: nan . Ranking: 0.0
# 2076 Ramona the Pest
        (Ramona, #2) Rating: nan . Ranking: 0.0
# 2077 Going Postal
        (Discworld, #33; Moist von Lipwig, #1) Rating: nan . Ranking: 0.0
# 2078 Every Summer After Rating: nan . Ranking: 0.0
# 2079 The Guardians Rating: nan . Ranking: 0.0
# 2080 The Women Rating: nan . Ranking: 0.0
# 2081 Franny and Zooey Rating: nan . Ranking: 0.0
# 2082 I Found You Rating: nan . Ranking: 0.0
# 2083 The Silmarillion Rating: nan . Ranking: 0.0
# 2084 I'm Thi

# 2281 A Painted House Rating: nan . Ranking: 0.0
# 2282 The Fifth Season
        (The Broken Earth, #1) Rating: nan . Ranking: 0.0
# 2283 A Promised Land Rating: nan . Ranking: 0.0
# 2284 The Death Cure
        (The Maze Runner, #3) Rating: nan . Ranking: 0.0
# 2285 The Invisible Life of Addie LaRue Rating: nan . Ranking: 0.0
# 2286 The Soulmate Rating: nan . Ranking: 0.0
# 2287 The Boston Girl Rating: nan . Ranking: 0.0
# 2288 Siddhartha Rating: nan . Ranking: 0.0
# 2289 The Overnight Guest Rating: nan . Ranking: 0.0
# 2290 Stories I Only Tell My Friends Rating: nan . Ranking: 0.0
# 2291 Carrie Rating: nan . Ranking: 0.0
# 2292 The Switch Rating: nan . Ranking: 0.0
# 2293 Lincoln in the Bardo Rating: nan . Ranking: 0.0
# 2294 The Club Rating: nan . Ranking: 0.0
# 2295 Maybe You Should Talk to Someone Rating: nan . Ranking: 0.0
# 2296 The Physick Book of Deliverance Dane
        (The Physick Book, #1) Rating: nan . Ranking: 0.0
# 2297 The Client Rating: nan . Ranking: 0.0
# 2298 The S

# 2509 In My Dreams I Hold a Knife Rating: nan . Ranking: 0.0
# 2510 Career of Evil
        (Cormoran Strike, #3) Rating: nan . Ranking: 0.0
# 2511 A Portrait of the Artist as a Young Man Rating: nan . Ranking: 0.0
# 2512 Mystic River Rating: nan . Ranking: 0.0
# 2513 A Streetcar Named Desire Rating: nan . Ranking: 0.0
# 2514 Silas Marner Rating: nan . Ranking: 0.0
# 2515 Once Upon a River Rating: nan . Ranking: 0.0
# 2516 Naked in Death
        (In Death, #1) Rating: nan . Ranking: 0.0
# 2517 The Prince and the Pauper Rating: nan . Ranking: 0.0
# 2518 Too Late Rating: nan . Ranking: 0.0
# 2519 Midwives Rating: nan . Ranking: 0.0
# 2520 The Dark Tower
        (The Dark Tower, #7) Rating: nan . Ranking: 0.0
# 2521 Still Me
        (Me Before You, #3) Rating: nan . Ranking: 0.0
# 2522 The Wrong Family Rating: nan . Ranking: 0.0
# 2523 Out of the Dust Rating: nan . Ranking: 0.0
# 2524 Slade House Rating: nan . Ranking: 0.0
# 2525 Pandora
        (New Tales of the Vampires, #1) Rating: nan

In [13]:
#give a list sorted out with books you've already read:
# sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
list_num = 1
for idx in sorted_indices: 
#     print("ratings_matrix[user_id, idx]= ", ratings_matrix[user_id, idx])
    if (ratings_matrix[user_id, idx] > 0) or (np.isnan(pred_ratings_list[idx])):
        continue
    print("#", list_num , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))
    list_num += 1

Top books are:
# 1 Harry Potter and the Sorcerer's Stone
        (Harry Potter, #1) Rating: 4.6
# 2 The da Vinci Code
        (Robert Langdon, #2) Rating: 4.4
# 3 Eclipse
        (The Twilight Saga, #3) Rating: 4.1
# 4 Breaking Dawn
        (The Twilight Saga, #4) Rating: 4.0
# 5 Pride and Prejudice Rating: 4.3
# 6 The Lion, the Witch and the Wardrobe
        (Chronicles of Narnia, #1) Rating: 4.2
# 7 The Great Gatsby Rating: 3.6
# 8 The Kite Runner Rating: 4.5
# 9 The Hobbit, or There and Back Again
        (The Lord of the Rings, #0) Rating: 4.6
# 10 The Lovely Bones Rating: 3.7
# 11 The Help Rating: 4.8
# 12 The Book Thief Rating: 4.7
# 13 Number the Stars Rating: 4.4
# 14 The Sea of Monsters
        (Percy Jackson and the Olympians, #2) Rating: 3.8
# 15 The Host
        (The Host, #1) Rating: 4.6
# 16 Of Mice and Men Rating: 3.4
# 17 The Diary of a Young Girl Rating: 4.6
# 18 The Fellowship of the Ring
        (The Lord of the Rings, #1) Rating: 4.0
# 19 1984 Rating: 4.0
# 20 The C

# 251 The Absolutely True Diary of a Part-Time Indian Rating: 4.0
# 252 Charlie and the Chocolate Factory
        (Charlie Bucket, #1) Rating: 4.0
# 253 Steve Jobs Rating: 4.0
# 254 Deadlocked
        (Sookie Stackhouse, #12) Rating: 4.0
# 255 Eat, Pray, Love Rating: 2.7
# 256 Harry Potter and the Cursed Child: Parts One and Two
        (Harry Potter, #8) Rating: 2.7
# 257 A Good Girl's Guide to Murder
        (A Good Girl's Guide to Murder, #1) Rating: 4.0
# 258 Bridget Jones: The Edge of Reason
        (Bridget Jones, #2) Rating: 4.0
# 259 A Midsummer Night’s Dream Rating: 4.0
# 260 Leaving Time Rating: 4.0
# 261 Running with Scissors Rating: 2.7
# 262 A Little Princess Rating: 4.0
# 263 Circe Rating: 4.0
# 264 In Cold Blood Rating: 4.0
# 265 Cress
        (The Lunar Chronicles, #3) Rating: 4.0
# 266 Dear John Rating: 4.0
# 267 Fight Club Rating: 2.7
# 268 Horton Hears a Who! Rating: 3.5
# 269 The Secret Garden Rating: 3.5
# 270 Nineteen Minutes Rating: 3.5
# 271 A Child Called "It"


        (Tracy Crosswhite, #1) Rating: 4.0
# 546 Inferno Rating: 4.0
# 547 Behind the Beautiful Forevers: Life, Death, and Hope in a Mumbai Undercity Rating: 4.0
# 548 On Beauty Rating: 4.0
# 549 Will Grayson, Will Grayson Rating: 4.0
# 550 Invisible Monsters Rating: 4.0
# 551 Dewey: The Small-Town Library Cat Who Touched the World Rating: 4.0
# 552 The Black Cauldron
        (The Chronicles of Prydain, #2) Rating: 4.0
# 553 The Life We Bury
        (Joe Talbert, #1; Detective Max Rupert, #1) Rating: 4.0
# 554 Into the Water Rating: 4.0
# 555 I'll Be Gone in the Dark: One Woman's Obsessive Search for the Golden State Killer Rating: 4.0
# 556 The 6th Target
        (Women's Murder Club, #6) Rating: 4.0
# 557 Americanah Rating: 4.0
# 558 Running Blind
        (Jack Reacher, #4) Rating: 4.0
# 559 Coraline Rating: 4.0
# 560 The House on Mango Street Rating: 4.0
# 561 A Million Little Pieces Rating: 4.0
# 562 People of the Book Rating: 4.0
# 563 On Writing: A Memoir of the Craft Rating: 4.0

In [14]:
idx = 8
this_ratings = ratings_matrix[indices[0,idx]]
print(this_ratings)

my_ratings = ratings_matrix[indices[0,0]]
print(my_ratings)

for i, rating in enumerate(this_ratings):
    if rating > 0 and my_ratings[i]>0:
        print("-", titles[i], ", their Rating:", rating, " My Rating:", my_ratings[i])

# print(distances, indices[-1,-1])

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
- Jane Eyre , their Rating: 3.0  My Rating: 1.0
- Twilight
        (The Twilight Saga, #1) , their Rating: 2.0  My Rating: 3.0
- Harry Potter and the Chamber of Secrets
        (Harry Potter, #2) , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Order of the Phoenix
        (Harry Potter, #5) , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Half-Blood Prince
        (Harry Potter, #6) , their Rating: 5.0  My Rating: 5.0
- The Hunger Games
        (The Hunger Games, #1) , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Prisoner of Azkaban
        (Harry Potter, #3) , their Rating: 5.0  My Rating: 5.0
- To Kill a Mockingbird , their Rating: 3.0  My Rating: 5.0
- Harry Potter and the Deathly Hallows
        (Harry Potter, #7) , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Goblet of Fire
        (Harry Potter, #4) , their Rating: 5.0  My Rating: 5.0
- The Giver
        (The Giver, #1) , their Rating: 4.

In [15]:
#find most similar books using cosine similarity
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(ratings_matrix.T)

similarity_df = pd.DataFrame(similarity_matrix, index=titles, columns=titles)

# Function to get k nearest neighbors for a movie
def get_similar_book(book_name, k=3):
    similar_book = similarity_df[book_name].sort_values(ascending=False)[1:k+1]
    return similar_book

book_name = 'First Lie Wins'
print("\nTop 5 similar book to", book_name, ":")
print(get_similar_book(book_name, k=5))


Top 5 similar book to First Lie Wins :
Listen for the Lie             0.583548
She's Not Sorry                0.576746
Darling Girls                  0.554097
The Last One at the Wedding    0.544700
The Fury                       0.544629
Name: First Lie Wins, dtype: float64


In [16]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

# Example user rating data (rows = users, columns = items)
ratings_df = pd.DataFrame(ratings)

# Step 1: Handle missing values using imputation (we will fill missing values with the mean rating)
imputer = SimpleImputer(strategy='mean')
ratings_filled = imputer.fit_transform(ratings_df)

# Step 2: Apply KMeans clustering
# We will use 2 clusters as an example
kmeans = KMeans(n_clusters=int(num_users/10), random_state=42)
clusters = kmeans.fit_predict(ratings_filled)

# Step 3: Add the cluster labels to the original DataFrame
ratings_df['Cluster'] = clusters

# # Print the user ratings with cluster assignments
# print("\nUser Ratings with Clusters:")
# print(ratings_df)

# # Step 4: Print the cluster centers (the centroid of each cluster)
# print("\nCluster Centers (Centroids):")
# print(kmeans.cluster_centers_)


user_id = 0
# print(clusters)

cluster_this_user = clusters[user_id]
# print(cluster_this_user)

pred_ratings_list = kmeans.cluster_centers_[user_id]
# print(pred_ratings_list)
# for i in range(len())

#give a list sorted out with books you've already read:
sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
list_num = 1
for idx in sorted_indices: 
#     print("ratings_matrix[user_id, idx]= ", ratings_matrix[user_id, idx])
    if (ratings_matrix[user_id, idx] > 0) or (np.isnan(pred_ratings_list[idx])) :
        continue
    print("#", list_num , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))
    list_num += 1



Top books are:
# 1 Harry Potter and the Sorcerer's Stone
        (Harry Potter, #1) Rating: 3.7
# 2 Pride and Prejudice Rating: 3.1
# 3 Little Women Rating: 3.0
# 4 Memoirs of a Geisha Rating: 2.4
# 5 Where the Sidewalk Ends Rating: 2.4
# 6 Lord of the Flies Rating: 2.2
# 7 The Diary of a Young Girl Rating: 2.1
# 8 Of Mice and Men Rating: 2.0
# 9 The Kite Runner Rating: 1.9
# 10 The Great Gatsby Rating: 1.8
# 11 The da Vinci Code
        (Robert Langdon, #2) Rating: 1.7
# 12 The Hobbit, or There and Back Again
        (The Lord of the Rings, #0) Rating: 1.7
# 13 Gone with the Wind Rating: 1.6
# 14 The Adventures of Huckleberry Finn Rating: 1.6
# 15 The Fellowship of the Ring
        (The Lord of the Rings, #1) Rating: 1.5
# 16 The Lovely Bones Rating: 1.3
# 17 The Notebook
        (The Notebook, #1) Rating: 1.2
# 18 The Odyssey Rating: 1.1
# 19 Jurassic Park
        (Jurassic Park, #1) Rating: 1.0
# 20 Sense and Sensibility Rating: 1.0
# 21 The Devil Wears Prada
        (The Devil Wear

# 268 Fourth Wing
        (The Empyrean, #1) Rating: 0.1
# 269 The Maze Runner
        (The Maze Runner, #1) Rating: 0.1
# 270 Clockwork Princess
        (The Infernal Devices, #3) Rating: 0.1
# 271 The Blind Assassin Rating: 0.1
# 272 The True Story of the 3 Little Pigs Rating: 0.1
# 273 It's Not Summer Without You
        (Summer, #2) Rating: 0.1
# 274 One Day in the Life of Ivan Denisovich Rating: 0.1
# 275 On the Banks of Plum Creek
        (Little House, #4) Rating: 0.1
# 276 Lies My Teacher Told Me: Everything Your American History Textbook Got Wrong Rating: 0.1
# 277 Steve Jobs Rating: 0.1
# 278 Roll of Thunder, Hear My Cry
        (Logans, #4) Rating: 0.1
# 279 The Westing Game Rating: 0.1
# 280 The Little Prince Rating: 0.1
# 281 Bossypants Rating: 0.1
# 282 The Likeness
        (Dublin Murder Squad, #2) Rating: 0.1
# 283 The Guernsey Literary and Potato Peel Pie Society Rating: 0.1
# 284 Explosive Eighteen
        (Stephanie Plum, #18) Rating: 0.1
# 285 Diary of a Wimpy Kid
 

# 525 The Signature of All Things Rating: 0.0
# 526 Ask Again, Yes Rating: 0.0
# 527 Life After Life
        (Todd Family, #1) Rating: 0.0
# 528 Pineapple Street Rating: 0.0
# 529 White Teeth Rating: 0.0
# 530 If You Tell: A True Story of Murder, Family Secrets, and the Unbreakable Bond of Sisterhood Rating: 0.0
# 531 Lover Unbound
        (Black Dagger Brotherhood, #5) Rating: 0.0
# 532 The Nest Rating: 0.0
# 533 This Time Tomorrow Rating: 0.0
# 534 The Henna Artist
        (The Jaipur Trilogy, #1) Rating: 0.0
# 535 Not a Happy Family Rating: 0.0
# 536 Between Shades of Gray Rating: 0.0
# 537 Death on the Nile
        (Hercule Poirot, #18) Rating: 0.0
# 538 Jonathan Strange & Mr Norrell Rating: 0.0
# 539 The Silver Linings Playbook Rating: 0.0
# 540 Weyward Rating: 0.0
# 541 The Life We Bury
        (Joe Talbert, #1; Detective Max Rupert, #1) Rating: 0.0
# 542 The Night Swim
        (Rachel Krall, #1) Rating: 0.0
# 543 The Girl with the Louding Voice Rating: 0.0
# 544 A Million Little

# 817 Confessions on the 7:45 Rating: 0.0
# 818 Fun Home: A Family Tragicomic Rating: 0.0
# 819 The Girl with All the Gifts
        (The Girl With All the Gifts, #1) Rating: 0.0
# 820 Ishmael: An Adventure of the Mind and Spirit
        (Ishmael, #1) Rating: 0.0
# 821 Disgrace Rating: 0.0
# 822 Confessions Rating: 0.0
# 823 Gone
        (Gone, #1) Rating: 0.0
# 824 The World Is Flat: A Brief History of the Twenty-first Century Rating: 0.0
# 825 After I Do Rating: 0.0
# 826 Pippi Longstocking
        (Pippi LÃ¥ngstrump, #1) Rating: 0.0
# 827 Lover Mine
        (Black Dagger Brotherhood, #8) Rating: 0.0
# 828 The Circle
        (The Circle, #1) Rating: 0.0
# 829 The Christie Affair Rating: 0.0
# 830 Heart Bones Rating: 0.0
# 831 Luckiest Girl Alive Rating: 0.0
# 832 The Last Wish
        (The Witcher, #0.5) Rating: 0.0
# 833 The Wishsong of Shannara
        (The Original Shannara Trilogy, #3) Rating: 0.0
# 834 Rich People Problems
        (Crazy Rich Asians, #3) Rating: 0.0
# 835 The Ult

# 1081 Yellow Wife Rating: 0.0
# 1082 The Lunatic Cafe
        (Anita Blake, Vampire Hunter, #4) Rating: 0.0
# 1083 The Stationery Shop Rating: 0.0
# 1084 Library of Souls
        (Miss Peregrine's Peculiar Children, #3) Rating: 0.0
# 1085 The Last Word Rating: 0.0
# 1086 The Stranger in the Woods: The Extraordinary Story of the Last True Hermit Rating: 0.0
# 1087 Guards! Guards!
        (Discworld, #8; City Watch, #1) Rating: 0.0
# 1088 The Judge's List
        (The Whistler, #2) Rating: 0.0
# 1089 The Mystery Guest
        (Molly the Maid, #2) Rating: 0.0
# 1090 The Titan’s Curse
        (Percy Jackson and the Olympians, #3) Rating: 0.0
# 1091 Eats, Shoots & Leaves: The Zero Tolerance Approach to Punctuation Rating: 0.0
# 1092 The Wonder Rating: 0.0
# 1093 The Girl Who Loved Tom Gordon Rating: 0.0
# 1094 Joyland Rating: 0.0
# 1095 The Patron Saint of Liars Rating: 0.0
# 1096 How to Sell a Haunted House Rating: 0.0
# 1097 The Bromance Book Club
        (Bromance Book Club, #1) Rating:

        (Red Queen, #1) Rating: 0.0
# 1386 Hamnet Rating: 0.0
# 1387 The Invention of Hugo Cabret Rating: 0.0
# 1388 Red, White & Royal Blue Rating: 0.0
# 1389 The Dog Stars Rating: 0.0
# 1390 The Gathering Storm
        (The Wheel of Time, #12) Rating: 0.0
# 1391 Even Cowgirls Get the Blues Rating: 0.0
# 1392 The Illustrated Man Rating: 0.0
# 1393 The Boy in the Striped Pajamas Rating: 0.0
# 1394 Five Feet Apart Rating: 0.0
# 1395 The Remains of the Day Rating: 0.0
# 1396 Greenwich Park Rating: 0.0
# 1397 The Last of the Mohicans
        (The Leatherstocking Tales, #2) Rating: 0.0
# 1398 White Fang Rating: 0.0
# 1399 Legendary
        (Caraval, #2) Rating: 0.0
# 1400 Still Life with Woodpecker Rating: 0.0
# 1401 The Opal Deception
        (Artemis Fowl, #4) Rating: 0.0
# 1402 Crime and Punishment Rating: 0.0
# 1403 The Only Survivors Rating: 0.0
# 1404 Testimony Rating: 0.0
# 1405 Thirteen Reasons Why Rating: 0.0
# 1406 The Wind in the Willows Rating: 0.0
# 1407 Magician: Apprentice
 

# 1696 Rendezvous with Rama
        (Rama, #1) Rating: 0.0
# 1697 The Pull of the Stars Rating: 0.0
# 1698 The Hunchback of Notre-Dame Rating: 0.0
# 1699 Fear Nothing
        (Moonlight Bay, #1) Rating: 0.0
# 1700 Catâ€™s Cradle Rating: 0.0
# 1701 The Black Swan: The Impact of the Highly Improbable Rating: 0.0
# 1702 Let's Explore Diabetes with Owls: Essays, Etc. Rating: 0.0
# 1703 Along for the Ride Rating: 0.0
# 1704 Northanger Abbey Rating: 0.0
# 1705 Orange Is the New Black Rating: 0.0
# 1706 The Sandman, Vol. 2: The Doll's House Rating: 0.0
# 1707 Frog and Toad Are Friends
        (Frog and Toad, #1) Rating: 0.0
# 1708 It Rating: 0.0
# 1709 Wyrd Sisters
        (Discworld, #6; Witches, #2) Rating: 0.0
# 1710 Faust, First Part Rating: 0.0
# 1711 Hounded
        (The Iron Druid Chronicles, #1) Rating: -0.0
# 1712 Vanity Fair Rating: -0.0
# 1713 The House at Pooh Corner
        (Winnie-the-Pooh, #2) Rating: -0.0
# 1714 The One and Only Ivan
        (The One and Only #1) Rating: -0.0


        (Chief Inspector Armand Gamache, #2) Rating: -0.0
# 2013 A Bear Called Paddington
        (Paddington, #1) Rating: -0.0
# 2014 Shantaram Rating: -0.0
# 2015 The Five-Star Weekend Rating: -0.0
# 2016 The Great Divorce Rating: -0.0
# 2017 Born to Run: A Hidden Tribe, Superathletes, and the Greatest Race the World Has Never Seen Rating: -0.0
# 2018 The Great Believers Rating: -0.0
# 2019 In the Heart of the Sea: The Tragedy of the Whaleship Essex Rating: -0.0
# 2020 The Book of Longings Rating: -0.0
# 2021 The Collected Poems Rating: -0.0
# 2022 In a Dark, Dark Wood Rating: -0.0
# 2023 The Plague Rating: -0.0
# 2024 Circus of the Damned
        (Anita Blake, Vampire Hunter, #3) Rating: -0.0
# 2025 Sula Rating: -0.0
# 2026 The Jungle Books Rating: -0.0
# 2027 My Grandmother Asked Me to Tell You She's Sorry Rating: -0.0
# 2028 A Widow for One Year Rating: -0.0
# 2029 The Bone Clocks Rating: -0.0
# 2030 Klara and the Sun Rating: -0.0
# 2031 Blue Moon
        (Anita Blake, Vampire Hun

# 2330 The Inheritance Games
        (The Inheritance Games, #1) Rating: -0.0
# 2331 The Magicians
        (The Magicians, #1) Rating: -0.0
# 2332 Sea of Tranquility Rating: -0.0
# 2333 Evvie Drake Starts Over Rating: -0.0
# 2334 The Once and Future King Rating: -0.0
# 2335 Foundation
        (Foundation, #1) Rating: -0.0
# 2336 Apples Never Fall Rating: -0.0
# 2337 The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life Rating: -0.0
# 2338 Who Moved My Cheese? An Amazing Way to Deal with Change in Your Work and in Your Life... Rating: -0.0
# 2339 Dune Messiah
        (Dune Chronicles, #2) Rating: -0.0
# 2340 Proven Guilty
        (The Dresden Files, #8) Rating: -0.0
# 2341 Darkly Dreaming Dexter
        (Dexter, #1) Rating: -0.0
# 2342 The Son of Neptune
        (The Heroes of Olympus, #2) Rating: -0.0
# 2343 Small Things Like These Rating: -0.0
# 2344 We Need to Talk About Kevin Rating: -0.0
# 2345 The Black Echo
        (Harry Bosch, #1; Harry Bosch Un

In [30]:
#doing masked autoencoder
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset, random_split

# Mask for observed values (1 for observed, 0 for missing)
ratings_torch = torch.tensor(ratings).float()
mask = (ratings_torch != 0).float()
mask_tensor = torch.tensor(mask)


#Define autoencoder
class SparseAutoencoder(nn.Module):
    def __init__(self, num_items, latent_dim):
        super(SparseAutoencoder, self).__init__()
        hidden1 = latent_dim*2
        self.encoder1 = nn.Linear(num_items, hidden1)
        self.encoder2 = nn.Linear(hidden1, latent_dim)
        self.decoder1 = nn.Linear(latent_dim, hidden1)
        self.decoder2 = nn.Linear(hidden1, num_items)
        
    def forward(self, x):
        x = torch.relu(self.encoder1(x))
        x = torch.relu(self.encoder2(x))
        x = torch.relu(self.decoder1(x))
        x = self.decoder2(x)
        # Scale sigmoid output to [1, 5]
        return 1 + 4 * torch.sigmoid(x)

    
#initialize the model
num_users, num_items = ratings_torch.shape
np.save("num_users.npy", np.array(num_users))
np.save("num_items.npy", np.array(num_items))
latent_dim = int(num_items/8) # Number of latent features

model = SparseAutoencoder(num_items, latent_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Use MSE loss but only consider observed values
def masked_mse_loss(reconstructed, original, mask):
    loss = ((reconstructed - original) ** 2) * mask
    return loss.sum() / mask.sum()

#break up data into train and val
dataset = TensorDataset(ratings_torch, mask_tensor) #keeping the mask
print("ratings_torch shape =", ratings_torch.shape)
print(len(dataset))
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print("len(train_loader) = ", len(train_loader))
print("len(val_loader) = ", len(val_loader))

#train the model
epochs = 5000
best_loss = 10e10
counter = 0
for epoch in range(epochs):
    train_loss = 0.0
    for inputs, mask in train_loader:
        model.train()
        optimizer.zero_grad()

        # Forward pass
        reconstructed = model(inputs)
        loss = masked_mse_loss(reconstructed, inputs, mask)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss /= len(train_loader)
    
    if (epoch + 1) % 10 == 0:
#         print(f"Epoch {epoch + 1}/{epochs}, Loss: {train_loss}")
        #check validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, mask in val_loader:
                outputs = model(inputs)
                loss = masked_mse_loss(outputs, inputs, mask)
                val_loss += loss.item()
                

        val_loss /= len(val_loader)
        

        print(f"Epoch {epoch + 1} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")
        if val_loss < best_loss: #if improve then save
            #save model 
            torch.save(model.state_dict(), "2model{}.pkl".format(latent_dim))
            best_loss = val_loss
            print("Model saved to model{}.pkl.".format(latent_dim))
            counter = 0
        else:
            counter += 10
            
    if counter > 200:
        print("Done training because of no improvement.")
        break
        
            


  mask_tensor = torch.tensor(mask)


ratings_torch shape = torch.Size([1398, 2687])
1398
len(train_loader) =  35
len(val_loader) =  9
Epoch 10 - Train Loss: 0.9704 - Val Loss: 1.1040
Model saved to model335.pkl.
Epoch 20 - Train Loss: 0.8614 - Val Loss: 1.0344
Model saved to model335.pkl.
Epoch 30 - Train Loss: 0.8432 - Val Loss: 1.0376
Epoch 40 - Train Loss: 0.8272 - Val Loss: 1.0616
Epoch 50 - Train Loss: 0.8427 - Val Loss: 1.0371
Epoch 60 - Train Loss: 0.8463 - Val Loss: 1.0069
Model saved to model335.pkl.
Epoch 70 - Train Loss: 0.8807 - Val Loss: 1.0507
Epoch 80 - Train Loss: 0.8758 - Val Loss: 1.0201
Epoch 90 - Train Loss: 0.9166 - Val Loss: 1.0080
Epoch 100 - Train Loss: 0.8726 - Val Loss: 1.0230
Epoch 110 - Train Loss: 0.8715 - Val Loss: 1.0060
Model saved to model335.pkl.
Epoch 120 - Train Loss: 0.8782 - Val Loss: 1.0060
Epoch 130 - Train Loss: 0.8747 - Val Loss: 1.0059
Model saved to model335.pkl.
Epoch 140 - Train Loss: 0.9060 - Val Loss: 1.0167
Epoch 150 - Train Loss: 0.8929 - Val Loss: 0.9787
Model saved to mo

In [18]:
num_users, num_items

(1398, 2687)

In [19]:
print(sum(sum(mask)))
print(mask.shape)
print(mask.shape[0] * mask.shape[1])
print(sum(sum(mask)) / (mask.shape[0] * mask.shape[1]))



tensor(1244.)
torch.Size([24, 2687])
64488
tensor(0.0193)


In [20]:
#Evaulating the model
model.eval()
with torch.no_grad():
    reconstructed = model(ratings_torch)

# Fill missing values in the original matrix
filled_data = ratings_torch.clone()
filled_data[mask_tensor == 0] = reconstructed[mask_tensor == 0]

print("Original Data:\n", ratings_torch)
print("Reconstructed Data:\n", reconstructed)
print("Filled Data:\n", filled_data)


Original Data:
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Reconstructed Data:
 tensor([[3.5418, 3.9119, 3.8005,  ..., 3.3622, 3.4526, 3.6786],
        [3.5418, 3.9119, 3.8005,  ..., 3.3622, 3.4526, 3.6786],
        [1.0000, 4.2027, 5.0000,  ..., 4.9995, 1.0000, 5.0000],
        ...,
        [3.1340, 3.8067, 4.0334,  ..., 3.3121, 2.9549, 4.1466],
        [3.6083, 4.4923, 4.6938,  ..., 4.2762, 3.4196, 3.7561],
        [3.8707, 4.9992, 4.9981,  ..., 4.7592, 4.9859, 1.4525]])
Filled Data:
 tensor([[3.5418, 3.9119, 3.8005,  ..., 3.3622, 3.4526, 3.6786],
        [3.5418, 3.9119, 3.8005,  ..., 3.3622, 3.4526, 3.6786],
        [1.0000, 4.2027, 5.0000,  ..., 4.9995, 1.0000, 5.0000],
        ...,
        [3.1340, 3.8067, 4.0334,  ..., 3.3121, 2.9549, 4.1466],
        [3.6083, 4.4923, 4.6

In [21]:
print(reconstructed[0].numpy())
print(ratings_torch[0].numpy())

print((reconstructed[0].numpy()-ratings_torch[0].numpy())/ratings_torch[0].numpy())

[3.5417907 3.9118884 3.8004873 ... 3.362156  3.4525716 3.6785946]
[0. 0. 0. ... 0. 0. 0.]
[inf inf inf ... inf inf inf]


  print((reconstructed[0].numpy()-ratings_torch[0].numpy())/ratings_torch[0].numpy())


In [22]:
pred_ratings_list = reconstructed[0].detach().numpy()

#give a list sorted out with books you've already read:
sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
list_num = 1
for idx in sorted_indices: 
#     print("ratings_matrix[user_id, idx]= ", ratings_matrix[user_id, idx])
    if  (ratings_matrix[user_id, idx] > 0) or(np.isnan(pred_ratings_list[idx])) :
        continue
    print("#", list_num , titles[idx], " - Predicted Rating:", round(pred_ratings_list[idx], 1))
    list_num += 1

Top books are:
# 1 The Rainbow Fish  - Predicted Rating: 5.0
# 2 Unbelievable
        (Pretty Little Liars, #4)  - Predicted Rating: 5.0
# 3 Flawless
        (Pretty Little Liars, #2)  - Predicted Rating: 5.0
# 4 Perfect
        (Pretty Little Liars, #3)  - Predicted Rating: 5.0
# 5 Our Souls at Night  - Predicted Rating: 5.0
# 6 A Wanted Man
        (Jack Reacher, #17)  - Predicted Rating: 5.0
# 7 The Ultimate Hitchhiker’s Guide to the Galaxy
        (Hitchhiker's Guide to the Galaxy, #1-5)  - Predicted Rating: 4.8
# 8 Harry Potter Series Box Set
        (Harry Potter, #1-7)  - Predicted Rating: 4.8
# 9 The Complete Stories  - Predicted Rating: 4.8
# 10 The Complete Maus  - Predicted Rating: 4.8
# 11 The Deal
        (Off-Campus, #1)  - Predicted Rating: 4.8
# 12 Losing Hope
        (Hopeless, #2)  - Predicted Rating: 4.8
# 13 Words of Radiance
        (The Stormlight Archive, #2)  - Predicted Rating: 4.8
# 14 Kushiel's Dart
        (PhÃ¨dre's Trilogy, #1)  - Predicted Rating: 4.8
# 1

# 176 The Psychopath Test: A Journey Through the Madness Industry  - Predicted Rating: 4.4
# 177 Purity  - Predicted Rating: 4.4
# 178 Enderâ€™s Game
        (Ender's Saga, #1)  - Predicted Rating: 4.4
# 179 Unearthly
        (Unearthly, #1)  - Predicted Rating: 4.4
# 180 The Rose Code  - Predicted Rating: 4.4
# 181 Cilka's Journey
        (The Tattooist of Auschwitz, #2)  - Predicted Rating: 4.4
# 182 Ham on Rye  - Predicted Rating: 4.4
# 183 Ruthless Vows
        (Letters of Enchantment, #2)  - Predicted Rating: 4.4
# 184 The Four Winds  - Predicted Rating: 4.4
# 185 Half Broke Horses  - Predicted Rating: 4.4
# 186 Me  - Predicted Rating: 4.4
# 187 The Prophet  - Predicted Rating: 4.4
# 188 The Exception to the Rule
        (The Improbable Meet-Cute, #1)  - Predicted Rating: 4.4
# 189 Still Life  - Predicted Rating: 4.4
# 190 The Diamond Eye  - Predicted Rating: 4.4
# 191 Olive, Again
        (Olive Kitteridge, #2)  - Predicted Rating: 4.4
# 192 Murder on the Orient Express  - Predic

# 330 The Moon Is a Harsh Mistress  - Predicted Rating: 4.3
# 331 The Wager: A Tale of Shipwreck, Mutiny and Murder  - Predicted Rating: 4.3
# 332 Endurance: Shackleton's Incredible Voyage  - Predicted Rating: 4.3
# 333 Once There Were Wolves  - Predicted Rating: 4.3
# 334 Between the World and Me  - Predicted Rating: 4.3
# 335 Ender's Shadow
        (The Shadow Series, #1)  - Predicted Rating: 4.3
# 336 Razorblade Tears  - Predicted Rating: 4.3
# 337 The Martian Chronicles  - Predicted Rating: 4.3
# 338 Reamde  - Predicted Rating: 4.3
# 339 The Violin Conspiracy  - Predicted Rating: 4.3
# 340 Towers of Midnight
        (The Wheel of Time, #13)  - Predicted Rating: 4.3
# 341 The Sandman, Vol. 3: Dream Country  - Predicted Rating: 4.3
# 342 Yellow Wife  - Predicted Rating: 4.3
# 343 Still Missing  - Predicted Rating: 4.3
# 344 Night Road  - Predicted Rating: 4.3
# 345 The Red Tent  - Predicted Rating: 4.3
# 346 The Song of Achilles  - Predicted Rating: 4.3
# 347 The Stars My Destination

# 485 News of the World  - Predicted Rating: 4.2
# 486 The Alienist
        (Dr. Laszlo Kreizler, #1)  - Predicted Rating: 4.2
# 487 Just for the Summer
        (Part of Your World, #3)  - Predicted Rating: 4.2
# 488 Only the Beautiful  - Predicted Rating: 4.2
# 489 Charlotte’s Web  - Predicted Rating: 4.2
# 490 Half of a Yellow Sun  - Predicted Rating: 4.2
# 491 The Liars' Club  - Predicted Rating: 4.2
# 492 This is Where I Leave You  - Predicted Rating: 4.2
# 493 The Exiles  - Predicted Rating: 4.2
# 494 Girl, Woman, Other  - Predicted Rating: 4.2
# 495 Pippi Longstocking
        (Pippi LÃ¥ngstrump, #1)  - Predicted Rating: 4.2
# 496 In a Sunburned Country  - Predicted Rating: 4.2
# 497 Malibu Rising  - Predicted Rating: 4.2
# 498 Firefly Lane
        (Firefly Lane, #1)  - Predicted Rating: 4.2
# 499 It Happened One Summer
        (Bellinger Sisters, #1)  - Predicted Rating: 4.2
# 500 Demian  - Predicted Rating: 4.2
# 501 Tell No One  - Predicted Rating: 4.2
# 502 Educated  - Predict

# 716 Number the Stars  - Predicted Rating: 4.1
# 717 Batman: The Dark Knight Returns  - Predicted Rating: 4.1
# 718 The Dragon Reborn
        (The Wheel of Time, #3)  - Predicted Rating: 4.1
# 719 The Bullet That Missed
        (Thursday Murder Club, #3)  - Predicted Rating: 4.1
# 720 Leaves of Grass  - Predicted Rating: 4.1
# 721 The House of the Spirits  - Predicted Rating: 4.1
# 722 The Sirens of Titan  - Predicted Rating: 4.1
# 723 A Tale for the Time Being  - Predicted Rating: 4.1
# 724 The Thirteenth Tale  - Predicted Rating: 4.1
# 725 Isaac's Storm: A Man, a Time, and the Deadliest Hurricane in History  - Predicted Rating: 4.1
# 726 Leviathan
        (Leviathan, #1)  - Predicted Rating: 4.1
# 727 Tom Lake  - Predicted Rating: 4.1
# 728 Going Postal
        (Discworld, #33; Moist von Lipwig, #1)  - Predicted Rating: 4.1
# 729 Red Rising
        (Red Rising Saga, #1)  - Predicted Rating: 4.1
# 730 The Storied Life of A.J. Fikry  - Predicted Rating: 4.1
# 731 The Ocean at the End 

# 936 Merrick
        (The Vampire Chronicles, #7)  - Predicted Rating: 4.0
# 937 The Good Girl  - Predicted Rating: 4.0
# 938 Leviathan Wakes
        (The Expanse, #1)  - Predicted Rating: 4.0
# 939 A Wrinkle in Time
        (A Wrinkle in Time Quintet, #1)  - Predicted Rating: 4.0
# 940 The Sun Also Rises  - Predicted Rating: 4.0
# 941 House of Sky and Breath
        (Crescent City, #2)  - Predicted Rating: 4.0
# 942 The Golden Couple  - Predicted Rating: 4.0
# 943 For Your Own Good  - Predicted Rating: 4.0
# 944 She's Not Sorry  - Predicted Rating: 4.0
# 945 The Chronicles of Narnia
        (The Chronicles of Narnia, #1-7)  - Predicted Rating: 4.0
# 946 The Hard Way
        (Jack Reacher, #10)  - Predicted Rating: 4.0
# 947 House of Earth and Blood
        (Crescent City, #1)  - Predicted Rating: 4.0
# 948 The School of Essential Ingredients  - Predicted Rating: 4.0
# 949 The Girl Who Played with Fire
        (Millennium, #2)  - Predicted Rating: 4.0
# 950 The Aeneid  - Predicted Rat

# 1146 Yes Please  - Predicted Rating: 3.9
# 1147 The Great Divorce  - Predicted Rating: 3.9
# 1148 A Midsummer Nightâ€™s Dream  - Predicted Rating: 3.9
# 1149 Tell Me Lies  - Predicted Rating: 3.9
# 1150 Once Upon a River  - Predicted Rating: 3.9
# 1151 Wizard's First Rule
        (Sword of Truth, #1)  - Predicted Rating: 3.9
# 1152 The Locked Door  - Predicted Rating: 3.9
# 1153 The Five Love Languages: How to Express Heartfelt Commitment to Your Mate  - Predicted Rating: 3.9
# 1154 Passion
        (Fallen, #3)  - Predicted Rating: 3.9
# 1155 The Merchant of Venice  - Predicted Rating: 3.9
# 1156 The Great Hunt
        (The Wheel of Time, #2)  - Predicted Rating: 3.9
# 1157 City of Heavenly Fire
        (The Mortal Instruments, #6)  - Predicted Rating: 3.9
# 1158 A Short History of Nearly Everything  - Predicted Rating: 3.9
# 1159 Sheâ€™s Come Undone  - Predicted Rating: 3.9
# 1160 The Art of Fielding  - Predicted Rating: 3.9
# 1161 Empire of Storms
        (Throne of Glass, #5)  - P

# 1394 Stiff: The Curious Lives of Human Cadavers  - Predicted Rating: 3.8
# 1395 Seabiscuit: An American Legend  - Predicted Rating: 3.8
# 1396 State of Wonder  - Predicted Rating: 3.8
# 1397 Turn Coat
        (The Dresden Files, #11)  - Predicted Rating: 3.8
# 1398 The Evening and the Morning
        (Kingsbridge, #0)  - Predicted Rating: 3.8
# 1399 A Good Neighborhood  - Predicted Rating: 3.8
# 1400 How to Sell a Haunted House  - Predicted Rating: 3.8
# 1401 The Time Machine  - Predicted Rating: 3.8
# 1402 Moon Called
        (Mercy Thompson, #1)  - Predicted Rating: 3.8
# 1403 Persuasion  - Predicted Rating: 3.8
# 1404 Testimony  - Predicted Rating: 3.8
# 1405 Marley and Me: Life and Love With the Worldâ€™s Worst Dog  - Predicted Rating: 3.8
# 1406 Speaker for the Dead
        (Ender's Saga, #2)  - Predicted Rating: 3.8
# 1407 Ignite Me
        (Shatter Me, #3)  - Predicted Rating: 3.8
# 1408 The Metamorphosis  - Predicted Rating: 3.8
# 1409 Specials
        (Uglies, #3)  - Predict

# 1674 Sharp Objects  - Predicted Rating: 3.7
# 1675 Outline  - Predicted Rating: 3.7
# 1676 City of Ashes
        (The Mortal Instruments, #2)  - Predicted Rating: 3.7
# 1677 The Radium Girls: The Dark Story of Americaâ€™s Shining Women  - Predicted Rating: 3.7
# 1678 The Tipping Point: How Little Things Can Make a Big Difference  - Predicted Rating: 3.7
# 1679 A Visit from the Goon Squad  - Predicted Rating: 3.7
# 1680 The Cricket in Times Square
        (Chester Cricket and His Friends, #1)  - Predicted Rating: 3.7
# 1681 The Little Engine That Could  - Predicted Rating: 3.7
# 1682 Rage  - Predicted Rating: 3.7
# 1683 The Book of Cold Cases  - Predicted Rating: 3.7
# 1684 Happiness for Beginners  - Predicted Rating: 3.7
# 1685 The Rum Diary  - Predicted Rating: 3.7
# 1686 The Celestine Prophecy
        (Celestine Prophecy, #1)  - Predicted Rating: 3.7
# 1687 Pretty Girls  - Predicted Rating: 3.7
# 1688 The Life We Bury
        (Joe Talbert, #1; Detective Max Rupert, #1)  - Predicted

# 1977 â€™Salemâ€™s Lot  - Predicted Rating: 3.6
# 1978 Brooklyn
        (Eilis Lacey, #1)  - Predicted Rating: 3.6
# 1979 Click, Clack, Moo: Cows That Type  - Predicted Rating: 3.6
# 1980 The Woods  - Predicted Rating: 3.6
# 1981 Test of the Twins
        (Dragonlance: Legends, #3)  - Predicted Rating: 3.6
# 1982 The Tao of Pooh  - Predicted Rating: 3.6
# 1983 Time of the Twins
        (Dragonlance: Legends, #1)  - Predicted Rating: 3.6
# 1984 War of the Twins
        (Dragonlance: Legends, #2)  - Predicted Rating: 3.6
# 1985 The Guest List  - Predicted Rating: 3.6
# 1986 The Woman in Black  - Predicted Rating: 3.6
# 1987 Breaking Dawn
        (The Twilight Saga, #4)  - Predicted Rating: 3.6
# 1988 The Friend Zone
        (The Friend Zone, #1)  - Predicted Rating: 3.6
# 1989 The BFG  - Predicted Rating: 3.6
# 1990 The Kiss Quotient
        (The Kiss Quotient, #1)  - Predicted Rating: 3.6
# 1991 Turtles All the Way Down  - Predicted Rating: 3.6
# 1992 The Vile Village
        (A Series

# 2232 The Zombie Survival Guide: Complete Protection from the Living Dead  - Predicted Rating: 3.4
# 2233 The Woman in Cabin 10  - Predicted Rating: 3.4
# 2234 Marked
        (House of Night, #1)  - Predicted Rating: 3.4
# 2235 The Last Kingdom
        (The Saxon Stories, #1)  - Predicted Rating: 3.4
# 2236 Gone
        (Gone, #1)  - Predicted Rating: 3.4
# 2237 The Orphan's Tale  - Predicted Rating: 3.4
# 2238 The Affair
        (Jack Reacher, #16)  - Predicted Rating: 3.4
# 2239 This Summer Will Be Different  - Predicted Rating: 3.4
# 2240 The Midnight Line
        (Jack Reacher, #22)  - Predicted Rating: 3.4
# 2241 Mexican Gothic  - Predicted Rating: 3.4
# 2242 Fablehaven
        (Fablehaven, #1)  - Predicted Rating: 3.4
# 2243 Memory Man
        (Amos Decker, #1)  - Predicted Rating: 3.4
# 2244 Blue Bloods
        (Blue Bloods, #1)  - Predicted Rating: 3.4
# 2245 What Happened to the Bennetts  - Predicted Rating: 3.4
# 2246 Bag of Bones  - Predicted Rating: 3.4
# 2247 The Stinky C

        (Three Sisters Island, #1)  - Predicted Rating: 3.0
# 2525 Masquerade
        (Blue Bloods, #2)  - Predicted Rating: 3.0
# 2526 Bloody Bones
        (Anita Blake, Vampire Hunter, #5)  - Predicted Rating: 3.0
# 2527 Entwined with You
        (Crossfire, #3)  - Predicted Rating: 3.0
# 2528 The 6th Target
        (Women's Murder Club, #6)  - Predicted Rating: 3.0
# 2529 Violets Are Blue
        (Alex Cross, #7)  - Predicted Rating: 3.0
# 2530 Untamed
        (House of Night, #4)  - Predicted Rating: 3.0
# 2531 Hunted
        (House of Night, #5)  - Predicted Rating: 3.0
# 2532 Forever...  - Predicted Rating: 3.0
# 2533 Behind Closed Doors  - Predicted Rating: 3.0
# 2534 4th of July
        (Women's Murder Club, #4)  - Predicted Rating: 3.0
# 2535 Hour Game
        (Sean King & Michelle Maxwell, #2)  - Predicted Rating: 3.0
# 2536 On Beauty  - Predicted Rating: 3.0
# 2537 The Life-Changing Magic of Tidying Up: The Japanese Art of Decluttering and Organizing  - Predicted Rating: 3.0

In [23]:
#making weighted loss matrix
percents = np.array([ 2.0839861,   6.38564535, 22.8939068,  37.94135873, 30.69510302])
each_weights = 100/percents
print(each_weights)
print(each_weights.sum())

print(each_weights * percents)

weights_array = np.zeros(ratings_torch.shape)
for i in tqdm(range(len(ratings_torch))):
    for j in range(len(ratings_torch[0])):
        for num in [1, 2, 3, 4, 5]:
            if ratings_torch[i,j] == num:
                weights_array[i,j] = each_weights[num-1]
weights_tensor = torch.tensor(weights_array)

[47.98496497 15.66012431  4.36797445  2.63564625  3.25784865]
73.90655863743766
[100. 100. 100. 100. 100.]


100%|███████████████████████████████████████████████████████████████████████████████| 1398/1398 [29:46<00:00,  1.28s/it]


In [24]:
weights[0]

NameError: name 'weights' is not defined

In [None]:
#doing masked autoencoder with weighted loss
latent_dim = 100 # Number of latent features

model = SparseAutoencoder(num_items, latent_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Use MSE loss with weights but only consider observed values
def masked_mse_loss_diff(reconstructed, original, mask, weights):
    loss = (((reconstructed - original) ** 2) * mask)
    weighted_loss = loss * weights
    return weighted_loss.sum() / mask.sum()/100

#break up data into train and val
print("ratings_torch shape = ", ratings_torch.shape)
print("mask_tensor shape = ", mask_tensor.shape)
print("weights shape = ", weights_tensor.shape)

dataset = TensorDataset(ratings_torch, mask_tensor, weights_tensor) #keeping the mask
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


#train the model
epochs = 5000
best_loss = 10e10
counter = 0
for epoch in range(epochs):
    train_loss = 0.0
    for inputs, mask, this_weight in train_loader:
        model.train()
        optimizer.zero_grad()

        # Forward pass
        reconstructed = model(inputs)
        loss = masked_mse_loss_diff(reconstructed, inputs, mask, this_weight)
#         loss_not_weighted = masked_mse_loss(reconstructed, inputs, mask)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss /= len(train_loader)
    
    if (epoch + 1) % 10 == 0:
#         print(f"Epoch {epoch + 1}/{epochs}, Loss: {train_loss}")
        #check validation
        model.eval()
        val_loss = 0.0
        val_loss_not_weighted = 0.0
        with torch.no_grad():
            for inputs, mask, this_weight in val_loader:
                outputs = model(inputs)
                loss = masked_mse_loss_diff(outputs, inputs, mask, this_weight)
                loss_not_weighted = masked_mse_loss(outputs, inputs, mask)
                val_loss += loss.item()
                val_loss_not_weighted += loss_not_weighted.item()
                

        val_loss /= len(val_loader)
        

        print(f"Epoch {epoch + 1} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f} - Val not weighted: {val_loss_not_weighted:.4f}" )
        if val_loss < best_loss: #if improve then save
            #save model 
            torch.save(model.state_dict(), "model_weighted{}.pkl".format(latent_dim))
            best_loss = val_loss
            print("Model saved to model_weighted{}.pkl.".format(latent_dim))
            counter = 0
        else:
            counter += 10
            
    if counter > 200:
        print("Done training because of no improvement.")
        break
        
            


In [None]:
dfghj
import torch
from sklearn.model_selection import KFold

# Mask for observed values (1 for observed, 0 for missing)
ratings_torch = torch.tensor(ratings).float()
mask = (ratings_torch != 0).float()
print(mask)


#Define autoencoder
class SparseAutoencoder(nn.Module):
    def __init__(self, num_items, latent_dim):
        super(SparseAutoencoder, self).__init__()
        self.encoder = nn.Linear(num_items, latent_dim)
        self.decoder = nn.Linear(latent_dim, num_items)
        
    def forward(self, x):
        encoded = torch.relu(self.encoder(x))
        decoded = self.decoder(encoded)
        # Scale sigmoid output to [1, 5]
        return 1 + 4 * torch.sigmoid(decoded)
        return decoded

    
#initialize the model
num_users, num_items = ratings_torch.shape
np.save("num_users.npy", np.array(num_users))
np.save("num_items.npy", np.array(num_items))

for latent_dim in [2, 5, 10, 20, 40, 50, 75, 100]:
    print("latent_dim = ", latent_dim)
# latent_dim = 20  # Number of latent features

    model = SparseAutoencoder(num_items, latent_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Define your model, loss function, and optimizer
    # Assuming model, ratings_torch, mask, and optimizer are already defined

    epochs = 1000
    k_folds = 5  # Number of folds for cross-validation
    kf = KFold(n_splits=k_folds, shuffle=True)

    # Store the losses for each fold
    fold_losses = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(ratings_torch)):
#         print(f"\nFold {fold + 1}/{k_folds}")

        # Split the data into training and validation sets
        train_ratings = ratings_torch[train_idx]
        val_ratings = ratings_torch[val_idx]
        train_mask = mask[train_idx]
        val_mask = mask[val_idx]

        # Initialize a new model for each fold
        model = SparseAutoencoder(num_items, latent_dim)
    #     optimizer = optim.Adam(model.parameters(), lr=0.01)

        # Re-initialize optimizer for each fold
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # Train the model
        for epoch in range(epochs):
            model.train()
            optimizer.zero_grad()

            # Forward pass for training
            reconstructed = model(train_ratings)
            loss = masked_mse_loss(reconstructed, train_ratings, train_mask)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

#             if (epoch + 1) % 1000 == 0:
#                 print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}")

        # Evaluate the model on the validation set
        model.eval()
        with torch.no_grad():
            reconstructed_val = model(val_ratings)
            val_loss = masked_mse_loss(reconstructed_val, val_ratings, val_mask)

        print(f"Validation Loss for Fold {fold + 1}: {val_loss.item():.4f}")

        # Store the validation loss for this fold
        fold_losses.append(val_loss.item())

    # Print the average validation loss after all folds
    print(f"\nAverage Validation Loss across all folds: {sum(fold_losses)/k_folds:.4f}")


In [None]:
fgh

In [None]:
ratings_matrix[user_id]

In [None]:
# Convert the filled ratings data into a numpy array
ratings_matrix = ratings_df.values

# Initialize KNN (using user-based KNN)
import math
knn = NearestNeighbors(n_neighbors=math.ceil(num_users/10), metric='cosine')  # Using cosine similarity
knn.fit(ratings_matrix)

# Example: Predict rating for User 0 on Item 2
user_id = 0  # Index of user in the matrix
item_id = 9  # Index of item to predict rating for

# Get the nearest neighbors for user 0 (excluding the user itself)
distances, indices = knn.kneighbors([ratings_matrix[user_id]])

pred_ratings_list = np.array([])
rankings_list = np.array([])
for item_id in range(num_titles):
    # Get the ratings for the neighbors on item 2
    neighbor_ratings = np.array([ratings_matrix[i, item_id] for i in indices[0] if not np.isnan(ratings_matrix[i, item_id])])
    print(ratings_matrix[:, item_id])
    print(neighbor_ratings)
    ghjk
    
    predicted_rating = np.mean(neighbor_ratings[np.nonzero(neighbor_ratings)])
    rankings = np.sum(neighbor_ratings[np.nonzero(neighbor_ratings)])

#     print(predicted_rating)

    pred_ratings_list = np.append(pred_ratings_list, predicted_rating)
    rankings_list = np.append(rankings_list, rankings)
#     print(f"Predicted rating for User {user_id} on Item {item_id}: {predicted_rating}")
    
best_book_rating = np.max(pred_ratings_list)
best_book_idx = np.argmax(pred_ratings_list)

sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
for i, idx in enumerate(sorted_indices): 
    print("#", (i+1) , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))

In [None]:
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Create a sparse matrix (CSR format)
A = np.array([[1.0, 0, 0], [0, 2, 3], [4, 0, 6], [0, 0, 0]])
# print(A)
sparse_matrix = sp.csr_matrix(ratings_matrix)

# Perform SVD on the sparse matrix
# k is the number of singular values to compute (you can choose a value smaller than min(m, n))
U, S, VT = svds(sparse_matrix, k=500)

# Output the matrices
print("U (Left Singular Vectors):\n", U)
print("\nS (Singular Values):\n", S)
print("\nVT (Right Singular Vectors - Transposed):\n", VT)

# Reconstruct the matrix from U, S, VT
S_full = np.diag(S)  # Convert singular values to a diagonal matrix
A_reconstructed = np.dot(U, np.dot(S_full, VT))

print("\nReconstructed Matrix A:\n", A_reconstructed)


In [None]:
my_diff = (ratings_matrix[0]- A_reconstructed[0])
print(ratings_matrix.shape)
plt.plot(my_diff, '.')

# for i in range(len(ratings_matrix[0])):
#     if ratings_matrix[0, i] > 0:
#         print(ratings_matrix[0, i], A_reconstructed[0, i], titles[i])
        
for i in range(len(ratings_matrix[0])):
    if ratings_matrix[0, i] == 0 and A_reconstructed[0, i] > 0:
        print(ratings_matrix[0, i], A_reconstructed[0, i], titles[i])


In [None]:
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Create a custom colormap with white for 0 and red for non-zero
cmap = mcolors.ListedColormap(['white', 'red'])
bounds = [0, 0.1, 1]  # Set bounds for 0 (white) and non-zero (red)
norm = mcolors.BoundaryNorm(bounds, cmap.N)

# plt.imshow(ratings_matrix - A_reconstructed, cmap=cmap, norm=norm)
plt.plot(ratings_matrix - A_reconstructed)
plt.show()

In [None]:
ratings_matrix - A_reconstructed

In [None]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.75, min_samples=2, metric='cosine')
labels = dbscan.fit_predict(ratings_matrix)


In [None]:
print(list(set(labels)))
print(labels.shape)

In [None]:
idx_in_group = np.arange(len(labels))
filtered_users = [idx_in_group for idx_in_group, label in zip(idx_in_group, labels) if label == 0]
print(filtered_users)

In [None]:
from sklearn.cluster import SpectralClustering
from scipy.sparse import csr_matrix
import numpy as np

# Example sparse data
# X = np.random.rand(100, 2)
X_sparse = csr_matrix(ratings_matrix)

n_clusters = 50
# Apply Spectral Clustering
spectral = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors')
labels = spectral.fit_predict(X_sparse)

print(labels)
print(list(set(labels)))
print(labels.shape)

In [None]:
# idx_in_group = np.arange(len(labels))
# filtered_users = [idx_in_group for idx_in_group, label in zip(idx_in_group, labels) if label == 0]
# print(filtered_users)

group_averages = []

for group in range(n_clusters):
    # Find indices of users in the current group
    group_users = np.where(labels == group)[0]
    
    # Extract the rows for users in this group
    group_data = ratings_matrix[group_users]
    
    print("Number of perople in group = ", group_data.shape[0])
    
    pred_ratings_list = np.array([])
    for item_id in range(num_titles):
        data = group_data[:,item_id]
        predicted_rating = np.mean(data[np.nonzero(data)])
#                 rankings = np.sum(neighbor_ratings[np.nonzero(neighbor_ratings)])

        pred_ratings_list = np.append(pred_ratings_list, predicted_rating)
    # Compute the average for each column (item) for this group
#     group_avg = np.mean(group_data[np.nonzero(group_data)], axis=0)
    
    # Append the average for this group
    group_averages.append(pred_ratings_list)

# Convert the list of group averages to a numpy array for easy viewing
group_averages = np.array(group_averages)

# Display the average for each item in each group
print("Average preferences for each item by group:")
print(group_averages.shape)

In [None]:
group = labels[0]
print("my group = ", group)
sorted_indices = np.argsort(group_averages[group])[::-1]
print(sorted_indices)
for i in sorted_indices:
    if (ratings_matrix[0, i] > 0) or (np.isnan(group_averages[group, i])):
        pass
    else:
        print(titles[i], round(group_averages[group,i], 1))