In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
import numpy as np
import random
import pickle 

def scrape_goodreads_ratings(user_id, max_pages=10):
    """
    Scrape a user's star ratings from Goodreads.
    
    Args:
    - user_id (str): Goodreads user ID or profile suffix.
    - max_pages (int): Maximum number of pages to scrape (each page contains ~30 books).
    
    Returns:
    - pd.DataFrame: A DataFrame containing book titles and ratings.
    """
    base_url = f"https://www.goodreads.com/review/list/{user_id}?shelf=read"
    headers = {"User-Agent": "Mozilla/5.0"}
    books = []

    for page in range(1, max_pages + 1):
        url = f"{base_url}&page={page}"
        response = requests.get(url, headers=headers)
        
        if response.status_code != 200:
            print(f"Failed to fetch page {page}. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        
        # Find all book entries in the table
        rows = soup.find_all("tr", class_="bookalike review")
        if not rows:
            print("No more data found.")
            break

        for row in rows:
            try:
                title = row.find("td", class_="field title").a.text.strip()
                rating_element = row.find("td", class_="field rating")
                rating = rating_element.find("span", class_="staticStars").get("title", "No rating")
                stars = map_rating(rating)
                books.append({"Title": title, "Rating": stars, "User_id": user_id})
#                 print(title, rating, stars)
            except AttributeError:
                # Handle rows with missing data
                continue

        print(f"Page {page} scraped successfully.")
        time.sleep(random.uniform(1, 5))  # Be kind to the server and avoid being blocked

    # Return data as a pandas DataFrame
    return pd.DataFrame(books)



In [2]:
def map_rating(phrase):
    rating_map = {
        "liked it": 3,
        "really liked it": 4,
        "it was ok": 2, 
        "it was amazing": 5, 
        "did not like it": 1,
    }
    
    return rating_map.get(phrase, "Invalid rating")  # Default to "Invalid rating" if the phrase isn't in the dictionary


In [3]:
# if __name__ == "__main__":
# #     user_id = "6688207"  # Replace with the Goodreads user ID or profile suffix
# #     for user_id in tqdm(['30181442', '75009563', '11345366', '110912303', '113964939', '11215896', '53701594', '4622890', '93628736', '176180116']):
# #     for user_id in tqdm(['2974095', '4622890', '28953843', '16174645', '4159922', '4125660', '54886546', '16912659', '260116', '4685500', '21865425']):
# #     for user_id in tqdm(['53701594', '27709782', '7566229', '16652861', '30817744', '56259255', '4125660', '60964126', 
# #                          '176167767', '28510930', '1029975', '131020767', '28862120', '88713906', '160141433', '41097916', 
# #                          '20809863', '69519261', '24017481', '7376365', '75941333', '13571407', '106618742', '17792052',
# #                          '3534528', '130656897', '7474475', '4125412', '6336365', '6026811', '3438047']):
#     for user_id in ['169695556']:
#         print("User_id = ", user_id)
#         max_pages = 30  # Adjust based on expected data
#         ratings_data = scrape_goodreads_ratings(user_id, max_pages)

#         if not ratings_data.empty:
# #             print(ratings_data.head())
# #             ratings_data.to_csv("goodreads_ratings.csv", index=False)
#             ratings_data.to_csv('goodreads_ratings.csv', mode='a', header=False, index=False)
#             print("Data saved to goodreads_ratings.csv.")
#         else:
#             print("No data retrieved.")


In [4]:
df = pd.read_csv('goodreads_ratings_series.csv')
print(df.shape)
df = df.drop_duplicates()
# Print the entire DataFrame
print(df)

duplicate_count = df['Title'].duplicated().sum()
print("Number of books with at least two people rating it:", duplicate_count)
duplicate_counts_per_value = df['Title'].value_counts()
print(duplicate_counts_per_value)
print("Number of unique books: ", df['Title'].nunique())
num_users = df['User_id'].nunique()
user_ids = list(df['User_id'].unique())
print("number of users is: ", num_users)
print("user_ids = ", user_ids)

  df = pd.read_csv('goodreads_ratings_series.csv')


(284940, 6)
                                                    Title Rating    User_id  \
0                                       I Am Watching You      3  169695558   
1       Three to Get Deadly\n        (Stephanie Plum, #3)      3  169695558   
2       Before the Coffee Gets Cold\n        (Before t...      4  169695558   
3       Dark Sacred Night\n        (Renée Ballard, #2;...      4  169695558   
4         Two for the Dough\n        (Stephanie Plum, #2)      4  169695558   
...                                                   ...    ...        ...   
284935        Heir of Fire\n        (Throne of Glass, #3)      5  127541816   
284936           Six of Crows\n        (Six of Crows, #1)      5  127541816   
284937  A Court of Mist and Fury\n        (A Court of ...      5  127541816   
284938  The Cruel Prince\n        (The Folk of the Air...      5  127541816   
284939  Harry Potter and the Sorcerer's Stone\n       ...      5  127541816   

       Series  First  Suggest  
0      

In [5]:
# # Get a list of top titles in order
# top_titles = duplicate_counts_per_value.index.tolist()
# top_100 = top_titles[:100]

# for title in top_100:
#     print(title)
    
# with open("top_100.pkl", "wb") as file:
#     pickle.dump(top_100, file)

In [6]:
# threshold = 5#num_users * 0.1
# pop_titles = list(duplicate_counts_per_value[duplicate_counts_per_value > threshold].index)
# my_titles = df.loc[df["User_id"] == 169695558, "Title"].tolist()
# # print(my_titles)

# print("pop titles len = ", len(pop_titles))
# print(pop_titles)
# print("my titles len = ", len(my_titles))
# titles = list(set(pop_titles))# + my_titles))

# # #remove Harry Potter titles:
# # titles = [s for s in titles if "Harry Potter" not in s]

# num_titles = len(titles)


# print(titles)
# # print(titles)
# print("num_titles =", num_titles)

# # ratings = np.full((num_users, num_titles), None)
# ratings = np.zeros((num_users, num_titles))

# for index, row in df.iterrows():
#     if row['Title'] in titles:
#         try:
#             ratings[user_ids.index(row['User_id']), titles.index(row["Title"])] = int(row["Rating"])
# #             print("found ", row["Title"])
#         except:
#             pass
        
# print("ratings size = ", ratings.shape)
# ratings = ratings[~np.all(ratings == 0, axis=1)]
# print("ratings size = ", ratings.shape)
# # Save the list to a file
# with open("titles.pkl", "wb") as file:
#     pickle.dump(titles, file)

In [7]:
# # print(df['Suggest'])

# suggest = list(df['Suggest'])
# print(suggest)

# with open("suggest.pkl", "wb") as file:
#     suggest.dump(suggest, file)

In [8]:
# ratings_df = pd.DataFrame(ratings)
# print(ratings_df.shape)
# #delete users that don't have any of these ratings
# # ratings_df = ratings_df.loc[~(ratings_df == 0).all(axis=1)]
# ratings_df = ratings_df.loc[(ratings_df != 0).sum(axis=1) >= 4] #need at least 4 entries to stay
# print(ratings_df.shape)

# # Calculate percentage of non-zero elements
# percentage_nonzero = (np.count_nonzero(ratings_df) / ratings_df.size) * 100
# print("percentage_nonzero =", round(percentage_nonzero, 1), '%')

# # Save the list to a file
# with open("ratings_df.pkl", "wb") as file:
#     pickle.dump(ratings_df, file)


In [9]:
with open("titles.pkl", "rb") as file:
    titles = pickle.load(file)
    
with open("top_100.pkl", "rb") as file:
    top_100 = pickle.load(file)
    
with open("suggest.pkl", "rb") as file:
    suggest = pickle.load(file)
    
num_titles = len(titles)

# Load the list from the file
with open("ratings_df.pkl", "rb") as file:
    ratings_df = pickle.load(file)
    ratings = ratings_df.to_numpy()


In [10]:
# Convert the filled ratings data into a numpy array
ratings_matrix = ratings_df.values

# Initialize KNN (using user-based KNN)
import math
knn = NearestNeighbors(n_neighbors=50, metric='cosine')  # Using cosine similarity  math.ceil(num_users/10)
knn.fit(ratings_matrix)

with open("knn_model.pkl", "wb") as file:
    pickle.dump(knn, file)

# Example: Predict rating for User 0 on Item 2
user_id = 0  # Index of user in the matrix
item_id = 2  # Index of item to predict rating for

# Get the nearest neighbors for user 0 (excluding the user itself)
distances, indices = knn.kneighbors([ratings_matrix[user_id]])

pred_ratings_list = np.array([])
rankings_list = np.array([])
for item_id in range(num_titles):
    # Get the ratings for the neighbors on item 2
    neighbor_ratings = np.array([ratings_matrix[i, item_id] for i in indices[0] if not np.isnan(ratings_matrix[i, item_id])])

    predicted_rating = np.mean(neighbor_ratings[np.nonzero(neighbor_ratings)])
    rankings = np.sum(neighbor_ratings[np.nonzero(neighbor_ratings)])

#     print(predicted_rating)

    pred_ratings_list = np.append(pred_ratings_list, predicted_rating)
    rankings_list = np.append(rankings_list, rankings)
#     print(f"Predicted rating for User {user_id} on Item {item_id}: {predicted_rating}")
    
best_book_rating = np.max(pred_ratings_list)
best_book_idx = np.argmax(pred_ratings_list)

sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
n = 1
for i, idx in enumerate(sorted_indices): 
    if (ratings[0, idx] > 0) or (np.isnan(pred_ratings_list[idx])):
        pass
    else:
        print("#", (n) , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))
        n+=1

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Top books are:
# 1 Homegoing Rating: 5.0
# 2 Love on the Brain Rating: 5.0
# 3 The Last Battle
        (Chronicles of Narnia, #7) Rating: 5.0
# 4 Bride Rating: 5.0
# 5 The Tale of Despereaux Rating: 5.0
# 6 Rebecca Rating: 5.0
# 7 Rise of the Evening Star
        (Fablehaven, #2) Rating: 5.0
# 8 Beneath a Scarlet Sky Rating: 5.0
# 9 November 9 Rating: 5.0
# 10 Xenocide
        (Ender's Saga, #3) Rating: 5.0
# 11 City of Heavenly Fire
        (The Mortal Instruments, #6) Rating: 5.0
# 12 The Marriage Bargain
        (Marriage to a Billionaire, #1) Rating: 5.0
# 13 All the Light We Cannot See Rating: 5.0
# 14 Inkspell
        (Inkworld, #2) Rating: 5.0
# 15 Seriously... I'm Kidding Rating: 5.0
# 16 The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics Rating: 5.0
# 17 Beautiful Chaos
        (Caster Chronicles, #3) Rating: 5.0
# 18 The Invention of Hugo Cabret Rating: 5.0
# 19 This Lullaby Rating: 5.0
# 20 Roll of Thunder, Hear My Cry
        (Log

# 381 A Passage to India Rating: 4.0
# 382 Check & Mate Rating: 4.0
# 383 Sometimes I Lie Rating: 4.0
# 384 The Screwtape Letters Rating: 4.0
# 385 I Capture the Castle Rating: 4.0
# 386 Change of Heart Rating: 4.0
# 387 The Tempest Rating: 4.0
# 388 Oliver Twist Rating: 4.0
# 389 Tomorrow, and Tomorrow, and Tomorrow Rating: 4.0
# 390 If You Give a Moose a Muffin Rating: 4.0
# 391 Extremely Loud & Incredibly Close Rating: 4.0
# 392 The Essential Calvin and Hobbes: A Calvin and Hobbes Treasury Rating: 4.0
# 393 Confessions of a Shopaholic
        (Shopaholic, #1) Rating: 4.0
# 394 The Witches Rating: 4.0
# 395 Dead Witch Walking
        (The Hollows, #1) Rating: 4.0
# 396 Doctor Sleep
        (The Shining, #2) Rating: 4.0
# 397 Midnight in the Garden of Good and Evil Rating: 4.0
# 398 Hamlet: Screenplay, Introduction And Film Diary Rating: 4.0
# 399 Let's Explore Diabetes with Owls: Essays, Etc. Rating: 4.0
# 400 The World According to Garp Rating: 4.0
# 401 Circe Rating: 4.0
# 402 On W

# 704 Station Eleven Rating: 3.0
# 705 Nothing to See Here Rating: 3.0
# 706 Beautiful Boy: A Father's Journey Through His Son's Addiction Rating: 3.0
# 707 Old Yeller Rating: 3.0
# 708 Forever in Blue: The Fourth Summer of the Sisterhood
        (Sisterhood, #4) Rating: 3.0
# 709 Murder on the Orient Express Rating: 3.0
# 710 Seabiscuit: An American Legend Rating: 3.0
# 711 Tess of the Dâ€™Urbervilles Rating: 3.0
# 712 Behind Closed Doors Rating: 3.0
# 713 James and the Giant Peach Rating: 3.0
# 714 Before I Fall Rating: 3.0
# 715 Stellaluna Rating: 3.0
# 716 The Notebook
        (The Notebook, #1) Rating: 2.8
# 717 Fight Club Rating: 2.7
# 718 Plum Lovin'
        (Stephanie Plum, #12.5) Rating: 2.7
# 719 Fifty Shades of Grey
        (Fifty Shades, #1) Rating: 2.7
# 720 Something Borrowed
        (Darcy & Rachel, #1) Rating: 2.7
# 721 The Odyssey Rating: 2.6
# 722 Matched
        (Matched, #1) Rating: 2.5
# 723 The Shack Rating: 2.5
# 724 Go the Fuck to Sleep Rating: 2.5
# 725 The Two

In [11]:
#make knn for the similar user part
# Initialize KNN (using user-based KNN)
# import math
knn_30 = NearestNeighbors(n_neighbors=50, metric='cosine')  # Using cosine similarity
knn_30.fit(ratings_matrix)

with open("knn_model_30.pkl", "wb") as file:
    pickle.dump(knn_30, file)

In [12]:
rankings_list

best_book_rating = np.max(rankings_list)
best_book_idx = np.argmax(rankings_list)

sorted_indices = np.argsort(rankings_list)[::-1]
print("Top books are:")
for i, idx in enumerate(sorted_indices): 
    print("#", (i+1) , titles[idx], "Rating:", round(pred_ratings_list[idx], 1), ". Ranking:", rankings_list[idx])

Top books are:
# 1 Harry Potter and the Goblet of Fire
        (Harry Potter, #4) Rating: 4.7 . Ranking: 218.0
# 2 Harry Potter and the Deathly Hallows
        (Harry Potter, #7) Rating: 4.8 . Ranking: 216.0
# 3 Harry Potter and the Prisoner of Azkaban
        (Harry Potter, #3) Rating: 4.7 . Ranking: 213.0
# 4 Harry Potter and the Half-Blood Prince
        (Harry Potter, #6) Rating: 4.7 . Ranking: 206.0
# 5 Harry Potter and the Order of the Phoenix
        (Harry Potter, #5) Rating: 4.7 . Ranking: 205.0
# 6 Harry Potter and the Chamber of Secrets
        (Harry Potter, #2) Rating: 4.6 . Ranking: 201.0
# 7 Harry Potter and the Sorcerer's Stone
        (Harry Potter, #1) Rating: 4.7 . Ranking: 193.0
# 8 The Hunger Games
        (The Hunger Games, #1) Rating: 4.7 . Ranking: 177.0
# 9 Catching Fire
        (The Hunger Games, #2) Rating: 4.7 . Ranking: 149.0
# 10 Mockingjay
        (The Hunger Games, #3) Rating: 4.4 . Ranking: 136.0
# 11 Twilight
        (The Twilight Saga, #1) Rating: 4.0

# 258 The Screwtape Letters Rating: 4.0 . Ranking: 8.0
# 259 The Hiding Place: The Triumphant True Story of Corrie Ten Boom Rating: 4.0 . Ranking: 8.0
# 260 A Good Girl's Guide to Murder
        (A Good Girl's Guide to Murder, #1) Rating: 4.0 . Ranking: 8.0
# 261 Something Borrowed
        (Darcy & Rachel, #1) Rating: 2.7 . Ranking: 8.0
# 262 Storm Front
        (The Dresden Files, #1) Rating: 4.0 . Ranking: 8.0
# 263 The Short Second Life of Bree Tanner
        (The Twilight Saga, #3.5) Rating: 4.0 . Ranking: 8.0
# 264 Steve Jobs Rating: 4.0 . Ranking: 8.0
# 265 Uglies
        (Uglies, #1) Rating: 4.0 . Ranking: 8.0
# 266 Atonement Rating: 4.0 . Ranking: 8.0
# 267 The Red Pyramid
        (The Kane Chronicles, #1) Rating: 4.0 . Ranking: 8.0
# 268 Fight Club Rating: 2.7 . Ranking: 8.0
# 269 A Midsummer Night’s Dream Rating: 4.0 . Ranking: 8.0
# 270 Hamlet: Screenplay, Introduction And Film Diary Rating: 4.0 . Ranking: 8.0
# 271 Circe Rating: 4.0 . Ranking: 8.0
# 272 It Ends with Us
    

        (The Iron Fey, #3) Rating: 4.0 . Ranking: 4.0
# 503 Madame Bovary Rating: 4.0 . Ranking: 4.0
# 504 Hush, Hush
        (Hush, Hush, #1) Rating: 4.0 . Ranking: 4.0
# 505 I Capture the Castle Rating: 4.0 . Ranking: 4.0
# 506 Daisy Jones & The Six Rating: 4.0 . Ranking: 4.0
# 507 The Tale of Peter Rabbit
        (World of Beatrix Potter, #1) Rating: 4.0 . Ranking: 4.0
# 508 Island of the Blue Dolphins Rating: 2.0 . Ranking: 4.0
# 509 The World According to Garp Rating: 4.0 . Ranking: 4.0
# 510 Tiny Beautiful Things: Advice on Love and Life from Dear Sugar Rating: 4.0 . Ranking: 4.0
# 511 Northanger Abbey Rating: 4.0 . Ranking: 4.0
# 512 The Power of Habit: Why We Do What We Do in Life and Business Rating: 4.0 . Ranking: 4.0
# 513 Before the Coffee Gets Cold
        (Before the Coffee Gets Cold, #1) Rating: 4.0 . Ranking: 4.0
# 514 Sh*t My Dad Says Rating: 4.0 . Ranking: 4.0
# 515 The Forgotten Garden Rating: 4.0 . Ranking: 4.0
# 516 The Iliad Rating: 2.0 . Ranking: 4.0
# 517 The Re

# 747 Fried Green Tomatoes at the Whistle Stop Cafe Rating: 3.0 . Ranking: 3.0
# 748 Dead Poets Society Rating: 3.0 . Ranking: 3.0
# 749 Gulliverâ€™s Travels Rating: 3.0 . Ranking: 3.0
# 750 The Girl Who Kicked the Hornet’s Nest
        (Millennium, #3) Rating: 3.0 . Ranking: 3.0
# 751 Klara and the Sun Rating: 3.0 . Ranking: 3.0
# 752 Dear John Rating: 3.0 . Ranking: 3.0
# 753 Tuck Everlasting Rating: 3.0 . Ranking: 3.0
# 754 Othello Rating: 3.0 . Ranking: 3.0
# 755 Before I Fall Rating: 3.0 . Ranking: 3.0
# 756 Bird Box
        (Bird Box, #1) Rating: 3.0 . Ranking: 3.0
# 757 Dark Places Rating: 3.0 . Ranking: 3.0
# 758 Click, Clack, Moo: Cows That Type Rating: 3.0 . Ranking: 3.0
# 759 Jonathan Livingston Seagull Rating: 3.0 . Ranking: 3.0
# 760 Friends, Lovers, and the Big Terrible Thing Rating: 3.0 . Ranking: 3.0
# 761 The Mayor of Casterbridge Rating: 3.0 . Ranking: 3.0
# 762 Winter
        (The Lunar Chronicles, #4) Rating: 3.0 . Ranking: 3.0
# 763 Lean In: Women, Work, and the Wi

# 982 The Kitchen House Rating: nan . Ranking: 0.0
# 983 Later Rating: nan . Ranking: 0.0
# 984 The Happy Ever After Playlist
        (The Friend Zone, #2) Rating: nan . Ranking: 0.0
# 985 The Lost City of Z: A Tale of Deadly Obsession in the Amazon Rating: nan . Ranking: 0.0
# 986 A Great and Terrible Beauty
        (Gemma Doyle, #1) Rating: nan . Ranking: 0.0
# 987 Moloka'i
        (Moloka'i, #1) Rating: nan . Ranking: 0.0
# 988 Over the Top: A Raw Journey to Self-Love Rating: nan . Ranking: 0.0
# 989 Tales of a Fourth Grade Nothing
        (Fudge, #1) Rating: nan . Ranking: 0.0
# 990 Postmortem
        (Kay Scarpetta, #1) Rating: nan . Ranking: 0.0
# 991 Prep Rating: nan . Ranking: 0.0
# 992 The Fine Print
        (Dreamland Billionaires, #1) Rating: nan . Ranking: 0.0
# 993 Modern Romance Rating: nan . Ranking: 0.0
# 994 Hotel on the Corner of Bitter and Sweet Rating: nan . Ranking: 0.0
# 995 What Lies in the Woods Rating: nan . Ranking: 0.0
# 996 2001: A Space Odyssey
        (Spa

        (A Series of Unfortunate Events, #4) Rating: nan . Ranking: 0.0
# 1225 Brooklyn
        (Eilis Lacey, #1) Rating: nan . Ranking: 0.0
# 1226 Bring Me Back Rating: nan . Ranking: 0.0
# 1227 Obsidian Butterfly
        (Anita Blake, Vampire Hunter, #9) Rating: nan . Ranking: 0.0
# 1228 The Orphan Master's Son Rating: nan . Ranking: 0.0
# 1229 Narcissus in Chains
        (Anita Blake, Vampire Hunter, #10) Rating: nan . Ranking: 0.0
# 1230 I Have Some Questions for You Rating: nan . Ranking: 0.0
# 1231 What Happens in Paradise
        (Paradise, #2) Rating: nan . Ranking: 0.0
# 1232 Stay Rating: nan . Ranking: 0.0
# 1233 Like Water for Chocolate Rating: nan . Ranking: 0.0
# 1234 Twenty Years Later Rating: nan . Ranking: 0.0
# 1235 The New Drawing on the Right Side of the Brain Rating: nan . Ranking: 0.0
# 1236 A Curse for True Love
        (Once Upon a Broken Heart, #3) Rating: nan . Ranking: 0.0
# 1237 Ham on Rye Rating: nan . Ranking: 0.0
# 1238 Schindlerâ€™s List Rating: nan . Ran

# 1460 Illusions: The Adventures of a Reluctant Messiah Rating: nan . Ranking: 0.0
# 1461 As You Like It Rating: nan . Ranking: 0.0
# 1462 The Dry
        (Aaron Falk, #1) Rating: nan . Ranking: 0.0
# 1463 The Slow Regard of Silent Things
        (The Kingkiller Chronicle, #2.5) Rating: nan . Ranking: 0.0
# 1464 I Am Legend and Other Stories Rating: nan . Ranking: 0.0
# 1465 Cosmos Rating: nan . Ranking: 0.0
# 1466 To Kill a Kingdom
        (Hundred Kingdoms, #1) Rating: nan . Ranking: 0.0
# 1467 Salem Falls Rating: nan . Ranking: 0.0
# 1468 Brown Bear, Brown Bear, What Do You See? Rating: nan . Ranking: 0.0
# 1469 The Storyteller Rating: nan . Ranking: 0.0
# 1470 For a Few Demons More
        (The Hollows, #5) Rating: nan . Ranking: 0.0
# 1471 The Housemaid Is Watching
        (The Housemaid, #3) Rating: nan . Ranking: 0.0
# 1472 Homecoming Rating: nan . Ranking: 0.0
# 1473 A Heartbreaking Work of Staggering Genius Rating: nan . Ranking: 0.0
# 1474 And Every Morning the Way Home Gets 

# 1683 The Book of Lost Friends Rating: nan . Ranking: 0.0
# 1684 Every Day
        (Every Day, #1) Rating: nan . Ranking: 0.0
# 1685 Olive Kitteridge
        (Olive Kitteridge, #1) Rating: nan . Ranking: 0.0
# 1686 First Comes Love Rating: nan . Ranking: 0.0
# 1687 SuperFreakonomics: Global Cooling, Patriotic Prostitutes And Why Suicide Bombers Should Buy Life Insurance Rating: nan . Ranking: 0.0
# 1688 Curious George Rating: nan . Ranking: 0.0
# 1689 The Mystery of Mrs. Christie Rating: nan . Ranking: 0.0
# 1690 The Sleepwalker's Guide to Dancing Rating: nan . Ranking: 0.0
# 1691 The Eye of the World
        (The Wheel of Time, #1) Rating: nan . Ranking: 0.0
# 1692 The Red Badge of Courage Rating: nan . Ranking: 0.0
# 1693 One by One Rating: nan . Ranking: 0.0
# 1694 Howlâ€™s Moving Castle
        (Howlâ€™s Moving Castle, #1) Rating: nan . Ranking: 0.0
# 1695 Assassin's Quest
        (Farseer Trilogy, #3) Rating: nan . Ranking: 0.0
# 1696 Calibanâ€™s War
        (The Expanse, #2) Rat

# 1936 The Witch of Blackbird Pond Rating: nan . Ranking: 0.0
# 1937 Before We Were Yours Rating: nan . Ranking: 0.0
# 1938 The Perfect Marriage
        (Perfect, #1) Rating: nan . Ranking: 0.0
# 1939 The Nature of Disappearing Rating: nan . Ranking: 0.0
# 1940 Fox in Socks Rating: nan . Ranking: 0.0
# 1941 In the Time of the Butterflies Rating: nan . Ranking: 0.0
# 1942 One of Us Is Dead Rating: nan . Ranking: 0.0
# 1943 I Let You Go Rating: nan . Ranking: 0.0
# 1944 Eric
        (Discworld, #9; Rincewind, #4) Rating: nan . Ranking: 0.0
# 1945 The Swiss Family Robinson Rating: nan . Ranking: 0.0
# 1946 This Is How You Lose Her Rating: nan . Ranking: 0.0
# 1947 Entwined with You
        (Crossfire, #3) Rating: nan . Ranking: 0.0
# 1948 Legends & Lattes
        (Legends & Lattes, #1) Rating: nan . Ranking: 0.0
# 1949 Something Wilder Rating: nan . Ranking: 0.0
# 1950 Reflected in You
        (Crossfire, #2) Rating: nan . Ranking: 0.0
# 1951 Mystic River Rating: nan . Ranking: 0.0
# 1952

# 2189 The Wind-Up Bird Chronicle Rating: nan . Ranking: 0.0
# 2190 Parable of the Talents
        (Earthseed, #2) Rating: nan . Ranking: 0.0
# 2191 Neil Gaiman's Neverwhere Rating: nan . Ranking: 0.0
# 2192 Purple Hibiscus Rating: nan . Ranking: 0.0
# 2193 The Pull of the Stars Rating: nan . Ranking: 0.0
# 2194 The Serpent and the Wings of Night
        (Crowns of Nyaxia, #1) Rating: nan . Ranking: 0.0
# 2195 The Breakdown Rating: nan . Ranking: 0.0
# 2196 Warbreaker Rating: nan . Ranking: 0.0
# 2197 The Andromeda Strain
        (Andromeda, #1) Rating: nan . Ranking: 0.0
# 2198 A Dog's Purpose
        (A Dog's Purpose, #1) Rating: nan . Ranking: 0.0
# 2199 Little Earthquakes Rating: nan . Ranking: 0.0
# 2200 Strange the Dreamer
        (Strange the Dreamer, #1) Rating: nan . Ranking: 0.0
# 2201 Breakfast at Tiffanyâ€™s and Three Stories Rating: nan . Ranking: 0.0
# 2202 The Stationery Shop Rating: nan . Ranking: 0.0
# 2203 What If?: Serious Scientific Answers to Absurd Hypothetical Qu

# 2434 Catâ€™s Cradle Rating: nan . Ranking: 0.0
# 2435 The Silent Sister
        (Riley MacPherson, #1) Rating: nan . Ranking: 0.0
# 2436 Time of the Twins
        (Dragonlance: Legends, #1) Rating: nan . Ranking: 0.0
# 2437 Seveneves Rating: nan . Ranking: 0.0
# 2438 State of Fear Rating: nan . Ranking: 0.0
# 2439 Red Seas Under Red Skies
        (Gentleman Bastard, #2) Rating: nan . Ranking: 0.0
# 2440 The Woman in Black Rating: nan . Ranking: 0.0
# 2441 Going Postal
        (Discworld, #33; Moist von Lipwig, #1) Rating: nan . Ranking: 0.0
# 2442 Hideaway Rating: nan . Ranking: 0.0
# 2443 Oona Out of Order Rating: nan . Ranking: 0.0
# 2444 21 Lessons for the 21st Century Rating: nan . Ranking: 0.0
# 2445 Life After Life
        (Todd Family, #1) Rating: nan . Ranking: 0.0
# 2446 Chocolat
        (Chocolat, #1) Rating: nan . Ranking: 0.0
# 2447 Frog and Toad Are Friends
        (Frog and Toad, #1) Rating: nan . Ranking: 0.0
# 2448 The Strange Case of Dr. Jekyll and Mr. Hyde and Other

# 2673 The Storyteller's Secret Rating: nan . Ranking: 0.0
# 2674 State of Terror Rating: nan . Ranking: 0.0
# 2675 The Illustrated Man Rating: nan . Ranking: 0.0
# 2676 The Hangman's Daughter
        (The Hangman's Daughter, #1) Rating: nan . Ranking: 0.0
# 2677 The Pilgrim's Progress Rating: nan . Ranking: 0.0
# 2678 Lost in a Good Book
        (Thursday Next, #2) Rating: nan . Ranking: 0.0
# 2679 Duma Key Rating: nan . Ranking: 0.0
# 2680 Snow Crash Rating: nan . Ranking: 0.0
# 2681 Calvin and Hobbes
        (Calvin and Hobbes, #1) Rating: nan . Ranking: 0.0
# 2682 Every Heart a Doorway
        (Wayward Children, #1) Rating: nan . Ranking: 0.0
# 2683 Dead Beat
        (The Dresden Files, #7) Rating: nan . Ranking: 0.0
# 2684 Die Trying
        (Jack Reacher, #2) Rating: nan . Ranking: 0.0
# 2685 Body of Evidence
        (Kay Scarpetta, #2) Rating: nan . Ranking: 0.0
# 2686 Vicious
        (Villains, #1) Rating: nan . Ranking: 0.0
# 2687 Other Birds Rating: nan . Ranking: 0.0
# 2688 

        (Hyperion Cantos, #1) Rating: nan . Ranking: 0.0
# 2914 A Darker Shade of Magic
        (Shades of Magic, #1) Rating: nan . Ranking: 0.0
# 2915 The Dead Romantics Rating: nan . Ranking: 0.0
# 2916 Lincoln in the Bardo Rating: nan . Ranking: 0.0
# 2917 Blindsighted
        (Grant County, #1) Rating: nan . Ranking: 0.0
# 2918 Shadow Kiss
        (Vampire Academy, #3) Rating: nan . Ranking: 0.0
# 2919 The Jungle Book
        (Jungle Book, #1) Rating: nan . Ranking: 0.0
# 2920 The Last Mrs. Parrish
        (Mrs. Parrish, #1) Rating: nan . Ranking: 0.0
# 2921 One Day Rating: nan . Ranking: 0.0
# 2922 Rendezvous with Rama
        (Rama, #1) Rating: nan . Ranking: 0.0
# 2923 The Outsider Rating: nan . Ranking: 0.0
# 2924 Belladonna
        (Belladonna, #1) Rating: nan . Ranking: 0.0
# 2925 Small Favor
        (The Dresden Files, #10) Rating: nan . Ranking: 0.0
# 2926 Ruthless Vows
        (Letters of Enchantment, #2) Rating: nan . Ranking: 0.0
# 2927 The Way of Shadows
        (Night 

In [13]:
#give a list sorted out with books you've already read:
# sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
list_num = 1
for idx in sorted_indices: 
#     print("ratings_matrix[user_id, idx]= ", ratings_matrix[user_id, idx])
    if (ratings_matrix[user_id, idx] > 0) or (np.isnan(pred_ratings_list[idx])):
        continue
    print("#", list_num , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))
    list_num += 1

Top books are:
# 1 Harry Potter and the Sorcerer's Stone
        (Harry Potter, #1) Rating: 4.7
# 2 Eclipse
        (The Twilight Saga, #3) Rating: 4.1
# 3 Breaking Dawn
        (The Twilight Saga, #4) Rating: 4.0
# 4 The da Vinci Code
        (Robert Langdon, #2) Rating: 4.4
# 5 The Great Gatsby Rating: 3.6
# 6 Pride and Prejudice Rating: 4.4
# 7 The Lion, the Witch and the Wardrobe
        (Chronicles of Narnia, #1) Rating: 4.2
# 8 The Hobbit, or There and Back Again
        (The Lord of the Rings, #0) Rating: 4.5
# 9 A Game of Thrones
        (A Song of Ice and Fire, #1) Rating: 4.9
# 10 The Kite Runner Rating: 4.8
# 11 1984 Rating: 4.2
# 12 Where the Sidewalk Ends Rating: 3.8
# 13 The Book Thief Rating: 4.7
# 14 The Sea of Monsters
        (Percy Jackson and the Olympians, #2) Rating: 4.1
# 15 Angels & Demons
        (Robert Langdon, #1) Rating: 4.1
# 16 A Wrinkle in Time
        (A Wrinkle in Time Quintet, #1) Rating: 4.6
# 17 The Help Rating: 4.6
# 18 The Lovely Bones Rating: 3.4

# 177 Love You Forever Rating: 4.5
# 178 The Stinky Cheese Man and Other Fairly Stupid Tales Rating: 3.0
# 179 The Tell-Tale Heart and Other Writings Rating: 4.5
# 180 Maus I: A Survivor's Tale: My Father Bleeds History
        (Maus, #1) Rating: 4.5
# 181 The Shining
        (The Shining, #1) Rating: 4.5
# 182 The Velveteen Rabbit Rating: 4.5
# 183 The Adventures of Huckleberry Finn Rating: 3.0
# 184 The Girl Who Kicked the Hornetâ€™s Nest
        (Millennium, #3) Rating: 4.5
# 185 The Little House Collection
        (Little House, #1-9) Rating: 4.5
# 186 A Time to Kill
        (Jake Brigance, #1) Rating: 4.5
# 187 The Alchemyst
        (The Secrets of the Immortal Nicholas Flamel, #1) Rating: 4.5
# 188 The Alchemist Rating: 3.0
# 189 Dead Ever After
        (Sookie Stackhouse, #13) Rating: 4.5
# 190 Plum Lovin'
        (Stephanie Plum, #12.5) Rating: 2.7
# 191 A Christmas Carol Rating: 4.0
# 192 Dead in the Family
        (Sookie Stackhouse, #10) Rating: 4.0
# 193 Gone Girl Rating: 4

        (Unhoneymooners, #1) Rating: 4.0
# 482 The Cricket in Times Square
        (Chester Cricket and His Friends, #1) Rating: 4.0
# 483 Let's Explore Diabetes with Owls: Essays, Etc. Rating: 4.0
# 484 The Silkworm
        (Cormoran Strike, #2) Rating: 4.0
# 485 Shiver
        (The Wolves of Mercy Falls, #1) Rating: 4.0
# 486 The Black Cauldron
        (The Chronicles of Prydain, #2) Rating: 4.0
# 487 Midnight in the Garden of Good and Evil Rating: 4.0
# 488 Watership Down
        (Watership Down, #1) Rating: 4.0
# 489 Fear Nothing
        (Moonlight Bay, #1) Rating: 4.0
# 490 A Million Little Pieces Rating: 4.0
# 491 The Innocent Rating: 4.0
# 492 Bossypants Rating: 4.0
# 493 The Rooster Bar Rating: 4.0
# 494 We Have Always Lived in the Castle Rating: 4.0
# 495 The Metamorphosis Rating: 4.0
# 496 Nights in Rodanthe Rating: 4.0
# 497 A Wind in the Door
        (Time Quintet, #2) Rating: 4.0
# 498 Americanah Rating: 4.0
# 499 If You Ask Me Rating: 4.0
# 500 Sometimes I Lie Rating: 4.0

In [14]:
idx = 8
this_ratings = ratings_matrix[indices[0,idx]]
print(this_ratings)

my_ratings = ratings_matrix[indices[0,0]]
print(my_ratings)

for i, rating in enumerate(this_ratings):
    if rating > 0 and my_ratings[i]>0:
        print("-", titles[i], ", their Rating:", rating, " My Rating:", my_ratings[i])

# print(distances, indices[-1,-1])

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
- A Breath of Snow and Ashes
        (Outlander, #6) , their Rating: 4.0  My Rating: 5.0
- An Echo in the Bone
        (Outlander, #7) , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Deathly Hallows
        (Harry Potter, #7) , their Rating: 5.0  My Rating: 5.0
- Catching Fire
        (The Hunger Games, #2) , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Prisoner of Azkaban
        (Harry Potter, #3) , their Rating: 5.0  My Rating: 5.0
- To Kill a Mockingbird , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Goblet of Fire
        (Harry Potter, #4) , their Rating: 5.0  My Rating: 5.0
- Divergent
        (Divergent, #1) , their Rating: 4.0  My Rating: 5.0
- Allegiant
        (Divergent, #3) , their Rating: 3.0  My Rating: 5.0
- Harry Potter and the Order of the Phoenix
        (Harry Potter, #5) , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Chamber of Secrets
        (Harry Potter, #2) , their R

In [15]:
#find most similar books using cosine similarity
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(ratings_matrix.T)

similarity_df = pd.DataFrame(similarity_matrix, index=titles, columns=titles)

# Function to get k nearest neighbors for a movie
def get_similar_book(book_name, k=3):
    similar_book = similarity_df[book_name].sort_values(ascending=False)[1:k+1]
    return similar_book

book_name = 'First Lie Wins'
print("\nTop 5 similar book to", book_name, ":")
print(get_similar_book(book_name, k=5))


Top 5 similar book to First Lie Wins :
She's Not Sorry         0.551181
Listen for the Lie      0.534251
Darling Girls           0.533345
The Fury                0.506713
None of This Is True    0.488660
Name: First Lie Wins, dtype: float64


In [16]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

# Example user rating data (rows = users, columns = items)
ratings_df = pd.DataFrame(ratings)

# Step 1: Handle missing values using imputation (we will fill missing values with the mean rating)
imputer = SimpleImputer(strategy='mean')
ratings_filled = imputer.fit_transform(ratings_df)

# Step 2: Apply KMeans clustering
# We will use 2 clusters as an example
kmeans = KMeans(n_clusters=int(num_users/10), random_state=42)
clusters = kmeans.fit_predict(ratings_filled)

# Step 3: Add the cluster labels to the original DataFrame
ratings_df['Cluster'] = clusters

# # Print the user ratings with cluster assignments
# print("\nUser Ratings with Clusters:")
# print(ratings_df)

# # Step 4: Print the cluster centers (the centroid of each cluster)
# print("\nCluster Centers (Centroids):")
# print(kmeans.cluster_centers_)


user_id = 0
# print(clusters)

cluster_this_user = clusters[user_id]
# print(cluster_this_user)

pred_ratings_list = kmeans.cluster_centers_[user_id]
# print(pred_ratings_list)
# for i in range(len())

#give a list sorted out with books you've already read:
sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
list_num = 1
for idx in sorted_indices: 
#     print("ratings_matrix[user_id, idx]= ", ratings_matrix[user_id, idx])
    if (ratings_matrix[user_id, idx] > 0) or (np.isnan(pred_ratings_list[idx])) :
        continue
    print("#", list_num , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))
    list_num += 1



Top books are:
# 1 The Art of War Rating: 4.0
# 2 The Road Less Traveled: A New Psychology of Love, Traditional Values and Spiritual Growth Rating: 2.5
# 3 Dreams from My Father: A Story of Race and Inheritance Rating: 2.5
# 4 Freakonomics: A Rogue Economist Explores the Hidden Side of Everything Rating: 2.5
# 5 Not a Happy Family Rating: 2.5
# 6 Things We Left Behind
        (Knockemout, #3) Rating: 2.5
# 7 The Wife Between Us Rating: 2.5
# 8 Think and Grow Rich Rating: 2.5
# 9 The Storyteller: Tales of Life and Music Rating: 2.5
# 10 Cutting for Stone Rating: 2.5
# 11 Pineapple Street Rating: 2.5
# 12 The House of Eve Rating: 2.5
# 13 Things We Hide from the Light
        (Knockemout, #2) Rating: 2.5
# 14 The Handmaidâ€™s Tale
        (The Handmaid's Tale, #1) Rating: 2.5
# 15 Did You Hear About Kitty Karr? Rating: 2.5
# 16 Carrie Soto Is Back Rating: 2.5
# 17 Wuthering Heights Rating: 2.5
# 18 Regretting You Rating: 2.5
# 19 The Golden Couple Rating: 2.5
# 20 The Diary of a Young Gi

        (Joe Talbert, #1; Detective Max Rupert, #1) Rating: 0.0
# 242 Nothing to Lose
        (Jack Reacher, #12) Rating: 0.0
# 243 Violeta Rating: 0.0
# 244 Blood Song
        (Raven's Shadow, #1) Rating: 0.0
# 245 Hour Game
        (Sean King & Michelle Maxwell, #2) Rating: 0.0
# 246 Sing You Home Rating: 0.0
# 247 The Mists of Avalon
        (Avalon, #1) Rating: 0.0
# 248 Kiss the Girls
        (Alex Cross, #2) Rating: 0.0
# 249 Redwall
        (Redwall, #1) Rating: 0.0
# 250 The Diamond Eye Rating: 0.0
# 251 The Children Act Rating: 0.0
# 252 The Tale of the Body Thief
        (The Vampire Chronicles, #4) Rating: 0.0
# 253 Girl, Woman, Other Rating: 0.0
# 254 King Lear Rating: 0.0
# 255 The Pelican Brief Rating: 0.0
# 256 Guilty Pleasures
        (Anita Blake, Vampire Hunter, #1) Rating: 0.0
# 257 The Ugly Truth
        (Diary of a Wimpy Kid, #5) Rating: 0.0
# 258 Isaac's Storm: A Man, a Time, and the Deadliest Hurricane in History Rating: 0.0
# 259 The Slippery Slope
        (A Se

# 470 Black and Blue Rating: 0.0
# 471 Eligible: A Modern Retelling of Pride & Prejudice Rating: 0.0
# 472 The Secret Keeper of Jaipur
        (The Jaipur Trilogy, #2) Rating: 0.0
# 473 Dead to the World
        (Sookie Stackhouse, #4) Rating: 0.0
# 474 Artemis Rating: 0.0
# 475 The Wives Rating: 0.0
# 476 The Starless Sea Rating: 0.0
# 477 Inkheart
        (Inkworld, #1) Rating: 0.0
# 478 The Marriage Plot Rating: 0.0
# 479 Skeleton Crew Rating: 0.0
# 480 Ghost Story
        (The Dresden Files, #13) Rating: 0.0
# 481 Mountains Beyond Mountains: The Quest of Dr. Paul Farmer, a Man Who Would Cure the World Rating: 0.0
# 482 The Cricket in Times Square
        (Chester Cricket and His Friends, #1) Rating: 0.0
# 483 Blankets Rating: 0.0
# 484 The Magicianâ€™s Nephew
        (Chronicles of Narnia, #6) Rating: 0.0
# 485 On the Road Rating: 0.0
# 486 The Sweetness at the Bottom of the Pie
        (Flavia de Luce, #1) Rating: 0.0
# 487 A Caress of Twilight
        (Merry Gentry, #2) Rating: 0

# 754 The Shack Rating: 0.0
# 755 Matilda Rating: 0.0
# 756 The Evening and the Morning
        (Kingsbridge, #0) Rating: 0.0
# 757 Daisy Darker Rating: 0.0
# 758 Hawaii Rating: 0.0
# 759 The Clockmaker's Daughter Rating: 0.0
# 760 The Wonder Rating: 0.0
# 761 â€™Salemâ€™s Lot Rating: 0.0
# 762 The Hypnotist's Love Story Rating: 0.0
# 763 What Happened to the Bennetts Rating: 0.0
# 764 Into the Wilderness
        (Wilderness, #1) Rating: 0.0
# 765 Infidel Rating: 0.0
# 766 The Locked Door Rating: 0.0
# 767 Steve Jobs Rating: 0.0
# 768 Keep It in the Family Rating: 0.0
# 769 Destined
        (House of Night, #9) Rating: 0.0
# 770 The Night Watchman Rating: 0.0
# 771 Shuggie Bain Rating: 0.0
# 772 Blood Rites
        (The Dresden Files, #6) Rating: 0.0
# 773 Big Lies in a Small Town Rating: 0.0
# 774 The Innocent
        (Will Robie, #1) Rating: 0.0
# 775 Angels Flight
        (Harry Bosch, #6; Harry Bosch Universe, #8) Rating: 0.0
# 776 City of Heavenly Fire
        (The Mortal Instrume

# 935 Paper Towns Rating: 0.0
# 936 The Exorcist Rating: 0.0
# 937 Dolores Claiborne Rating: 0.0
# 938 A Flicker in the Dark Rating: 0.0
# 939 Fear and Loathing in Las Vegas Rating: 0.0
# 940 Revival Rating: 0.0
# 941 The Summons Rating: 0.0
# 942 Finders Keepers
        (Bill Hodges Trilogy, #2) Rating: 0.0
# 943 The Unbecoming of Mara Dyer
        (Mara Dyer, #1) Rating: 0.0
# 944 Hard Eight
        (Stephanie Plum, #8) Rating: 0.0
# 945 The Once and Future Witches Rating: 0.0
# 946 Do Androids Dream of Electric Sheep? Rating: 0.0
# 947 Sharp Objects Rating: 0.0
# 948 28 Summers Rating: 0.0
# 949 The Devil in the White City: Murder, Magic, and Madness at the Fair That Changed America Rating: 0.0
# 950 The Book Woman's Daughter
        (The Book Woman of Troublesome Creek, #2) Rating: 0.0
# 951 The Man in the High Castle Rating: 0.0
# 952 Saturday Rating: 0.0
# 953 Brida Rating: 0.0
# 954 The Girls Rating: 0.0
# 955 Fortunately, the Milk Rating: 0.0
# 956 Goodnight Beautiful Rating: 0

# 1249 First Comes Love Rating: 0.0
# 1250 Olive Kitteridge
        (Olive Kitteridge, #1) Rating: 0.0
# 1251 Every Day
        (Every Day, #1) Rating: 0.0
# 1252 The Book of Lost Friends Rating: 0.0
# 1253 The Violin Conspiracy Rating: 0.0
# 1254 Burnt Offerings
        (Anita Blake, Vampire Hunter, #7) Rating: 0.0
# 1255 The Bluest Eye Rating: 0.0
# 1256 The Guest List Rating: 0.0
# 1257 The Last Juror Rating: 0.0
# 1258 Death Comes to Pemberley Rating: 0.0
# 1259 Gilead
        (Gilead, #1) Rating: 0.0
# 1260 Gone Tomorrow
        (Jack Reacher, #13) Rating: 0.0
# 1261 The Soulmate Equation Rating: 0.0
# 1262 How the Grinch Stole Christmas! Rating: 0.0
# 1263 Killing Lincoln: The Shocking Assassination that Changed America Forever Rating: 0.0
# 1264 The Polar Express Rating: 0.0
# 1265 Flight Behavior Rating: 0.0
# 1266 When No One Is Watching Rating: 0.0
# 1267 The Street Lawyer Rating: 0.0
# 1268 The One Hundred Years of Lenni and Margot Rating: 0.0
# 1269 The Last Olympian
      

# 1594 The Secret History Rating: 0.0
# 1595 The Quiet Tenant Rating: 0.0
# 1596 The Little House Collection
        (Little House, #1-9) Rating: 0.0
# 1597 The House Across the Lake Rating: 0.0
# 1598 Big Little Lies Rating: 0.0
# 1599 The Runaway Jury Rating: 0.0
# 1600 The Time Machine Rating: 0.0
# 1601 The Good Daughter
        (Good Daughter, #1) Rating: 0.0
# 1602 We'll Always Have Summer
        (Summer #3) Rating: 0.0
# 1603 The Tales of Beedle the Bard
        (Hogwarts Library, #3) Rating: 0.0
# 1604 Is Everyone Hanging Out Without Me? Rating: 0.0
# 1605 City of Ashes
        (The Mortal Instruments, #2) Rating: 0.0
# 1606 The Only Woman in the Room Rating: 0.0
# 1607 Rebecca Rating: 0.0
# 1608 The Sanatorium
        (Detective Elin Warner, #1) Rating: 0.0
# 1609 Suzanne's Diary for Nicholas Rating: 0.0
# 1610 Message in a Bottle Rating: 0.0
# 1611 The Selection
        (The Selection, #1) Rating: 0.0
# 1612 The House Girl Rating: 0.0
# 1613 Proven Guilty
        (The Dresde

# 1774 Visions of Sugar Plums
        (Stephanie Plum, #8.5) Rating: 0.0
# 1775 A Passage to India Rating: 0.0
# 1776 After Anna Rating: 0.0
# 1777 Where We Belong Rating: 0.0
# 1778 The Hour I First Believed Rating: 0.0
# 1779 The Cruelest Month
        (Chief Inspector Armand Gamache, #3) Rating: 0.0
# 1780 Brisingr
        (The Inheritance Cycle, #3) Rating: 0.0
# 1781 Misery Rating: 0.0
# 1782 Kisscut
        (Grant County, #2) Rating: 0.0
# 1783 Practice Makes Perfect
        (When in Rome, #2) Rating: 0.0
# 1784 Divine Rivals
        (Letters of Enchantment, #1) Rating: 0.0
# 1785 Persuader
        (Jack Reacher, #7) Rating: 0.0
# 1786 Anne of Windy Poplars
        (Anne of Green Gables, #4) Rating: 0.0
# 1787 The Forest of Vanishing Stars Rating: 0.0
# 1788 Angela’s Ashes
        (Frank McCourt, #1) Rating: 0.0
# 1789 Snuff Rating: 0.0
# 1790 White Teeth Rating: 0.0
# 1791 Without Merit Rating: 0.0
# 1792 The Heir
        (The Selection, #4) Rating: 0.0
# 1793 Shopaholic & Baby


# 1968 The Husbands Rating: 0.0
# 1969 Blood of the Fold
        (Sword of Truth, #3) Rating: 0.0
# 1970 The Bell Jar Rating: 0.0
# 1971 Julie and Julia: 365 Days, 524 Recipes, 1 Tiny Apartment Kitchen Rating: 0.0
# 1972 Hyperbole and a Half: Unfortunate Situations, Flawed Coping Mechanisms, Mayhem, and Other Things That Happened Rating: 0.0
# 1973 Tithe
        (Modern Faerie Tales, #1) Rating: 0.0
# 1974 Anne of the Island
        (Anne of Green Gables, #3) Rating: 0.0
# 1975 The Metamorphosis Rating: 0.0
# 1976 Wall and Piece Rating: 0.0
# 1977 Born Standing Up: A Comic's Life Rating: 0.0
# 1978 The Secret Keeper Rating: 0.0
# 1979 Fever 1793 Rating: 0.0
# 1980 Once Upon a Broken Heart
        (Once Upon a Broken Heart, #1) Rating: 0.0
# 1981 On Chesil Beach Rating: 0.0
# 1982 A Curse So Dark and Lonely
        (Cursebreakers, #1) Rating: 0.0
# 1983 We Solve Murders
        (We Solve Murders, #1) Rating: 0.0
# 1984 Big Summer Rating: 0.0
# 1985 The House of Hades
        (The Heroes

# 2173 Water for Elephants Rating: 0.0
# 2174 Astrophysics for People in a Hurry Rating: 0.0
# 2175 The Complete Anne of Green Gables
        (Anne of Green Gables, #1-8) Rating: 0.0
# 2176 The Importance of Being Earnest Rating: 0.0
# 2177 The Reader Rating: 0.0
# 2178 Ten Big Ones
        (Stephanie Plum, #10) Rating: 0.0
# 2179 We Were the Mulvaneys Rating: 0.0
# 2180 The Love of my Life Rating: 0.0
# 2181 Hex Hall
        (Hex Hall, #1) Rating: 0.0
# 2182 Men at Arms
        (Discworld, #15; City Watch, #2) Rating: 0.0
# 2183 The King of Torts Rating: 0.0
# 2184 The Physick Book of Deliverance Dane
        (The Physick Book, #1) Rating: 0.0
# 2185 The Warrior Heir
        (The Heir Chronicles, #1) Rating: 0.0
# 2186 The Miraculous Journey of Edward Tulane Rating: 0.0
# 2187 The Goldfinch Rating: 0.0
# 2188 The Left Hand of Darkness Rating: 0.0
# 2189 The Bone Clocks Rating: 0.0
# 2190 Once There Were Wolves Rating: 0.0
# 2191 News of the World Rating: 0.0
# 2192 Brother Odd
       

# 2385 The Shadows Rating: 0.0
# 2386 Gone
        (Gone, #1) Rating: 0.0
# 2387 The Collected Regrets of Clover Rating: 0.0
# 2388 After Dark Rating: 0.0
# 2389 Ramona the Pest
        (Ramona, #2) Rating: 0.0
# 2390 The Phantom of the Opera Rating: 0.0
# 2391 11/22/63 Rating: 0.0
# 2392 Heir to the Empire
        (Star Wars: The Thrawn Trilogy, #1) Rating: 0.0
# 2393 Mort
        (Discworld, #4; Death, #1) Rating: 0.0
# 2394 American Gods Rating: 0.0
# 2395 Choke Rating: 0.0
# 2396 The Snow Child Rating: 0.0
# 2397 Paradise Lost Rating: 0.0
# 2398 A Room with a View Rating: 0.0
# 2399 From Potter's Field
        (Kay Scarpetta, #6) Rating: 0.0
# 2400 Winter Garden Rating: 0.0
# 2401 Beyond the Shadows
        (Night Angel, #3) Rating: 0.0
# 2402 61 Hours
        (Jack Reacher, #14) Rating: 0.0
# 2403 City of Thieves Rating: 0.0
# 2404 Gulp: Adventures on the Alimentary Canal Rating: 0.0
# 2405 Flowers in the Attic
        (Dollanganger, #1) Rating: 0.0
# 2406 The It Girl Rating: 0.0


# 2616 Anansi Boys Rating: 0.0
# 2617 The Color of Magic
        (Discworld, #1; Rincewind, #1) Rating: 0.0
# 2618 House of Earth and Blood
        (Crescent City, #1) Rating: 0.0
# 2619 Champion
        (Legend, #3) Rating: 0.0
# 2620 The Invisible Man Rating: 0.0
# 2621 Beartown
        (Beartown, #1) Rating: 0.0
# 2622 The Heart Is a Lonely Hunter Rating: 0.0
# 2623 My Sister's Grave
        (Tracy Crosswhite, #1) Rating: 0.0
# 2624 The Bear and the Nightingale
        (The Winternight Trilogy, #1) Rating: 0.0
# 2625 Secrets of the Dragon Sanctuary
        (Fablehaven, #4) Rating: 0.0
# 2626 The Sugar Queen Rating: 0.0
# 2627 A Promised Land Rating: 0.0
# 2628 Last Argument of Kings
        (The First Law, #3) Rating: 0.0
# 2629 The Last Devil to Die
        (Thursday Murder Club, #4) Rating: 0.0
# 2630 The Body Keeps the Score: Brain, Mind, and Body in the Healing of Trauma Rating: 0.0
# 2631 Look Me in the Eye Rating: 0.0
# 2632 Equal Rites
        (Discworld, #3; Witches, #1) Rat

# 2843 The Keeper of Happy Endings Rating: 0.0
# 2844 The Night Tiger Rating: 0.0
# 2845 Scar Tissue Rating: 0.0
# 2846 Code Name Verity
        (Code Name Verity, #1) Rating: 0.0
# 2847 Takedown Twenty
        (Stephanie Plum, #20) Rating: 0.0
# 2848 The Moon Is a Harsh Mistress Rating: 0.0
# 2849 My Lovely Wife Rating: 0.0
# 2850 Son
        (The Giver, #4) Rating: 0.0
# 2851 Hooked
        (Never After, #1) Rating: 0.0
# 2852 In the Unlikely Event Rating: 0.0
# 2853 The Longest Ride Rating: 0.0
# 2854 Fallen
        (Fallen, #1) Rating: 0.0
# 2855 I'm Thinking of Ending Things Rating: 0.0
# 2856 The Little Book of Hygge: The Danish Way to Live Well Rating: 0.0
# 2857 The Mysterious Affair at Styles
        (Hercule Poirot, #1) Rating: 0.0
# 2858 Raise High the Roof Beam, Carpenters & Seymour: An Introduction Rating: 0.0
# 2859 On the Banks of Plum Creek
        (Little House, #4) Rating: 0.0
# 2860 Camino Island
        (Camino Island, #1) Rating: 0.0
# 2861 When We Believed in Merm

In [17]:
#doing masked autoencoder
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset, random_split

# Mask for observed values (1 for observed, 0 for missing)
ratings_torch = torch.tensor(ratings).float()
mask = (ratings_torch != 0).float()
mask_tensor = torch.tensor(mask)


#Define autoencoder
class SparseAutoencoder(nn.Module):
    def __init__(self, num_items, latent_dim):
        super(SparseAutoencoder, self).__init__()
        hidden1 = latent_dim*2
        self.encoder1 = nn.Linear(num_items, hidden1)
        self.encoder2 = nn.Linear(hidden1, latent_dim)
        self.decoder1 = nn.Linear(latent_dim, hidden1)
        self.decoder2 = nn.Linear(hidden1, num_items)
        
    def forward(self, x):
        x = torch.relu(self.encoder1(x))
        x = torch.relu(self.encoder2(x))
        x = torch.relu(self.decoder1(x))
        x = self.decoder2(x)
        # Scale sigmoid output to [1, 5]
        return 1 + 4 * torch.sigmoid(x)

    
#initialize the model
num_users, num_items = ratings_torch.shape
np.save("num_users.npy", np.array(num_users))
np.save("num_items.npy", np.array(num_items))
latent_dim = int(num_items/8) # Number of latent features

model = SparseAutoencoder(num_items, latent_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Use MSE loss but only consider observed values
def masked_mse_loss(reconstructed, original, mask):
    loss = ((reconstructed - original) ** 2) * mask
    return loss.sum() / mask.sum()

#break up data into train and val
dataset = TensorDataset(ratings_torch, mask_tensor) #keeping the mask
print("ratings_torch shape =", ratings_torch.shape)
print(len(dataset))
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

print("len(train_loader) = ", len(train_loader))
print("len(val_loader) = ", len(val_loader))

#train the model
epochs = 5000
best_loss = 1.0#10e10
counter = 0
for epoch in range(epochs):
    train_loss = 0.0
    for inputs, mask in train_loader:
        model.train()
        optimizer.zero_grad()

        # Forward pass
        reconstructed = model(inputs)
        loss = masked_mse_loss(reconstructed, inputs, mask)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss /= len(train_loader)
    
    if (epoch + 1) % 10 == 0:
#         print(f"Epoch {epoch + 1}/{epochs}, Loss: {train_loss}")
        #check validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, mask in val_loader:
                outputs = model(inputs)
                loss = masked_mse_loss(outputs, inputs, mask)
                val_loss += loss.item()
                

        val_loss /= len(val_loader)
        

        print(f"Epoch {epoch + 1} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")
        if val_loss < best_loss: #if improve then save
            #save model 
            torch.save(model.state_dict(), "2model{}.pkl".format(latent_dim))
            best_loss = val_loss
            print("Model saved to model{}.pkl.".format(latent_dim))
            counter = 0
        else:
            counter += 10
            
    if counter > 200:
        print("Done training because of no improvement.")
        break
        
            


  mask_tensor = torch.tensor(mask)


ratings_torch shape = torch.Size([1864, 3023])
1864
len(train_loader) =  12
len(val_loader) =  3
Epoch 10 - Train Loss: 0.4743 - Val Loss: 1.1338
Epoch 20 - Train Loss: 0.3134 - Val Loss: 1.1573
Epoch 30 - Train Loss: 0.2040 - Val Loss: 1.1840
Epoch 40 - Train Loss: 0.1747 - Val Loss: 1.1833
Epoch 50 - Train Loss: 0.1251 - Val Loss: 1.2068
Epoch 60 - Train Loss: 0.0887 - Val Loss: 1.2098
Epoch 70 - Train Loss: 0.1340 - Val Loss: 1.2145
Epoch 80 - Train Loss: 0.4356 - Val Loss: 1.2085
Epoch 90 - Train Loss: 0.8634 - Val Loss: 1.1684
Epoch 100 - Train Loss: 0.8101 - Val Loss: 1.0639
Epoch 110 - Train Loss: 0.7981 - Val Loss: 1.0459
Epoch 120 - Train Loss: 0.7755 - Val Loss: 1.0508
Epoch 130 - Train Loss: 0.7584 - Val Loss: 1.0383
Epoch 140 - Train Loss: 0.7474 - Val Loss: 1.0387
Epoch 150 - Train Loss: 0.7400 - Val Loss: 1.0437
Epoch 160 - Train Loss: 0.7395 - Val Loss: 1.0485
Epoch 170 - Train Loss: 0.7325 - Val Loss: 1.0279
Epoch 180 - Train Loss: 0.7510 - Val Loss: 1.0324
Epoch 190 - 

In [18]:
num_users, num_items

(1864, 3023)

In [19]:
print(sum(sum(mask)))
print(mask.shape)
print(mask.shape[0] * mask.shape[1])
print(sum(sum(mask)) / (mask.shape[0] * mask.shape[1]))



tensor(5906.)
torch.Size([117, 3023])
353691
tensor(0.0167)


In [20]:
#Evaulating the model
model.eval()
with torch.no_grad():
    reconstructed = model(ratings_torch)

# Fill missing values in the original matrix
filled_data = ratings_torch.clone()
filled_data[mask_tensor == 0] = reconstructed[mask_tensor == 0]

print("Original Data:\n", ratings_torch)
print("Reconstructed Data:\n", reconstructed)
print("Filled Data:\n", filled_data)


Original Data:
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 3.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Reconstructed Data:
 tensor([[3.8964, 3.9854, 4.5331,  ..., 3.3886, 3.8845, 3.5519],
        [3.8964, 3.9854, 4.5331,  ..., 3.3886, 3.8845, 3.5519],
        [3.8964, 3.9854, 4.5331,  ..., 3.3886, 3.8845, 3.5519],
        ...,
        [3.8964, 3.9854, 4.5331,  ..., 3.3886, 3.8845, 3.5519],
        [3.8964, 3.9854, 4.5331,  ..., 3.3886, 3.8845, 3.5519],
        [3.8964, 3.9854, 4.5331,  ..., 3.3886, 3.8845, 3.5519]])
Filled Data:
 tensor([[3.8964, 3.9854, 4.5331,  ..., 3.3886, 3.8845, 3.5519],
        [3.8964, 3.9854, 4.5331,  ..., 3.3886, 3.8845, 3.5519],
        [3.8964, 3.9854, 4.5331,  ..., 3.3886, 3.8845, 3.0000],
        ...,
        [3.8964, 3.9854, 4.5331,  ..., 3.3886, 3.8845, 3.5519],
        [3.8964, 3.9854, 4.5

In [21]:
print(reconstructed[0].numpy())
print(ratings_torch[0].numpy())

print((reconstructed[0].numpy()-ratings_torch[0].numpy())/ratings_torch[0].numpy())

[3.8963966 3.985393  4.533129  ... 3.3885612 3.884497  3.551901 ]
[0. 0. 0. ... 0. 0. 0.]
[inf inf inf ... inf inf inf]


  print((reconstructed[0].numpy()-ratings_torch[0].numpy())/ratings_torch[0].numpy())


In [22]:
pred_ratings_list = reconstructed[0].detach().numpy()

#give a list sorted out with books you've already read:
sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
list_num = 1
for idx in sorted_indices: 
#     print("ratings_matrix[user_id, idx]= ", ratings_matrix[user_id, idx])
    if  (ratings_matrix[user_id, idx] > 0) or(np.isnan(pred_ratings_list[idx])) :
        continue
    print("#", list_num , titles[idx], " - Predicted Rating:", round(pred_ratings_list[idx], 1))
    list_num += 1

Top books are:
# 1 Band of Brothers: E Company, 506th Regiment, 101st Airborne from Normandy to Hitler's Eagle's Nest  - Predicted Rating: 4.9
# 2 Crossing to Safety  - Predicted Rating: 4.8
# 3 The House of Hades
        (The Heroes of Olympus, #4)  - Predicted Rating: 4.8
# 4 Lone Survivor: The Eyewitness Account of Operation Redwing and the Lost Heroes of SEAL Team 10  - Predicted Rating: 4.8
# 5 Ways of Seeing  - Predicted Rating: 4.8
# 6 The Ultimate Hitchhiker’s Guide to the Galaxy
        (Hitchhiker's Guide to the Galaxy, #1-5)  - Predicted Rating: 4.8
# 7 The Stationery Shop  - Predicted Rating: 4.8
# 8 The Ballad of Never After
        (Once Upon a Broken Heart, #2)  - Predicted Rating: 4.8
# 9 Bone: The Complete Edition  - Predicted Rating: 4.7
# 10 Aristotle and Dante Discover the Secrets of the Universe
        (Aristotle and Dante, #1)  - Predicted Rating: 4.7
# 11 The Essential Calvin and Hobbes: A Calvin and Hobbes Treasury  - Predicted Rating: 4.7
# 12 Say Nothing: A T

# 220 Ham on Rye  - Predicted Rating: 4.3
# 221 Invisible Cities  - Predicted Rating: 4.3
# 222 Know My Name  - Predicted Rating: 4.3
# 223 Between Shades of Gray  - Predicted Rating: 4.3
# 224 Migrations  - Predicted Rating: 4.3
# 225 The Great Santini  - Predicted Rating: 4.3
# 226 Don't Let the Pigeon Drive the Bus!  - Predicted Rating: 4.3
# 227 Academ's Fury
        (Codex Alera, #2)  - Predicted Rating: 4.3
# 228 Two Twisted Crowns
        (The Shepherd King, #2)  - Predicted Rating: 4.3
# 229 Always and Forever, Lara Jean
        (To All the Boys I've Loved Before, #3)  - Predicted Rating: 4.3
# 230 White Night
        (The Dresden Files, #9)  - Predicted Rating: 4.3
# 231 Tiny Beautiful Things: Advice on Love and Life from Dear Sugar  - Predicted Rating: 4.3
# 232 Empire of Storms
        (Throne of Glass, #5)  - Predicted Rating: 4.3
# 233 Hyperbole and a Half: Unfortunate Situations, Flawed Coping Mechanisms, Mayhem, and Other Things That Happened  - Predicted Rating: 4.3
# 2

        (The Broken Earth, #1)  - Predicted Rating: 4.2
# 408 A Clash of Kings
        (A Song of Ice and Fire, #2)  - Predicted Rating: 4.2
# 409 Mad Honey  - Predicted Rating: 4.2
# 410 Harry Potter and the Sorcerer's Stone
        (Harry Potter, #1)  - Predicted Rating: 4.2
# 411 The Art of Happiness  - Predicted Rating: 4.2
# 412 The Storied Life of A.J. Fikry  - Predicted Rating: 4.2
# 413 The Light We Carry: Overcoming in Uncertain Times  - Predicted Rating: 4.2
# 414 The Berry Pickers  - Predicted Rating: 4.2
# 415 The Tea Girl of Hummingbird Lane  - Predicted Rating: 4.2
# 416 Anne of Avonlea
        (Anne of Green Gables, #2)  - Predicted Rating: 4.2
# 417 Just Mercy  - Predicted Rating: 4.2
# 418 Crying in H Mart  - Predicted Rating: 4.2
# 419 The Dark Tower
        (The Dark Tower, #7)  - Predicted Rating: 4.2
# 420 Gone Tomorrow
        (Jack Reacher, #13)  - Predicted Rating: 4.2
# 421 Americanah  - Predicted Rating: 4.2
# 422 What If?: Serious Scientific Answers to Absurd

# 681 One Day in the Life of Ivan Denisovich  - Predicted Rating: 4.1
# 682 Preacher, Volume 1: Gone to Texas  - Predicted Rating: 4.1
# 683 West With Giraffes  - Predicted Rating: 4.1
# 684 Northanger Abbey  - Predicted Rating: 4.1
# 685 The Seven Year Slip  - Predicted Rating: 4.1
# 686 Going Postal
        (Discworld, #33; Moist von Lipwig, #1)  - Predicted Rating: 4.1
# 687 Persuasion  - Predicted Rating: 4.1
# 688 The Island of Missing Trees  - Predicted Rating: 4.1
# 689 The Martian  - Predicted Rating: 4.1
# 690 Pope Joan  - Predicted Rating: 4.1
# 691 Mrs. Dalloway  - Predicted Rating: 4.1
# 692 The Story of Babar
        (Babar, #1)  - Predicted Rating: 4.1
# 693 Ninth House
        (Alex Stern, #1)  - Predicted Rating: 4.1
# 694 Handle with Care  - Predicted Rating: 4.1
# 695 King of Wrath
        (Kings of Sin, #1)  - Predicted Rating: 4.1
# 696 Funny Story  - Predicted Rating: 4.1
# 697 Circling the Sun  - Predicted Rating: 4.1
# 698 Batman: The Dark Knight Returns  - Predi

# 957 Daughter of Mine  - Predicted Rating: 4.0
# 958 Every Heart a Doorway
        (Wayward Children, #1)  - Predicted Rating: 4.0
# 959 The Forever War
        (The Forever War, #1)  - Predicted Rating: 4.0
# 960 Warbreaker  - Predicted Rating: 4.0
# 961 The Road Less Traveled: A New Psychology of Love, Traditional Values and Spiritual Growth  - Predicted Rating: 4.0
# 962 The Adventures of Sherlock Holmes
        (Sherlock Holmes, #3)  - Predicted Rating: 4.0
# 963 Oh William!  - Predicted Rating: 4.0
# 964 Raise High the Roof Beam, Carpenters & Seymour: An Introduction  - Predicted Rating: 4.0
# 965 Around the World in Eighty Days  - Predicted Rating: 4.0
# 966 City of Lost Souls
        (The Mortal Instruments, #5)  - Predicted Rating: 4.0
# 967 Men at Arms
        (Discworld, #15; City Watch, #2)  - Predicted Rating: 4.0
# 968 Presumed Innocent
        (Kindle County Legal Thriller, #1)  - Predicted Rating: 4.0
# 969 Dune
        (Dune, #1)  - Predicted Rating: 4.0
# 970 High Fid

# 1232 Predictably Irrational: The Hidden Forces That Shape Our Decisions  - Predicted Rating: 3.9
# 1233 The Talisman
        (The Talisman, #1)  - Predicted Rating: 3.9
# 1234 Magician: Apprentice
        (The Riftwar Saga, #1)  - Predicted Rating: 3.9
# 1235 City of Heavenly Fire
        (The Mortal Instruments, #6)  - Predicted Rating: 3.9
# 1236 A Curse So Dark and Lonely
        (Cursebreakers, #1)  - Predicted Rating: 3.9
# 1237 Brooklyn
        (Eilis Lacey, #1)  - Predicted Rating: 3.9
# 1238 Slade House  - Predicted Rating: 3.9
# 1239 The Dharma Bums  - Predicted Rating: 3.9
# 1240 Oona Out of Order  - Predicted Rating: 3.9
# 1241 The Light We Lost  - Predicted Rating: 3.9
# 1242 In Five Years  - Predicted Rating: 3.9
# 1243 Anna and the French Kiss
        (Anna and the French Kiss, #1)  - Predicted Rating: 3.9
# 1244 The Last Flight  - Predicted Rating: 3.9
# 1245 The Chronicles of Narnia
        (The Chronicles of Narnia, #1-7)  - Predicted Rating: 3.9
# 1246 The Housemaid

        (House of Earth, #1)  - Predicted Rating: 3.8
# 1539 Interview with the Vampire
        (The Vampire Chronicles, #1)  - Predicted Rating: 3.8
# 1540 Oh, the Places You’ll Go!  - Predicted Rating: 3.8
# 1541 Magic Bites
        (Kate Daniels, #1)  - Predicted Rating: 3.8
# 1542 So Long, and Thanks for All the Fish
        (Hitchhiker's Guide to the Galaxy, #4)  - Predicted Rating: 3.8
# 1543 All the Pretty Horses
        (The Border Trilogy, #1)  - Predicted Rating: 3.8
# 1544 The Only Woman in the Room  - Predicted Rating: 3.8
# 1545 The Whisper Man  - Predicted Rating: 3.8
# 1546 The Vile Village
        (A Series of Unfortunate Events, #7)  - Predicted Rating: 3.8
# 1547 Tinkers  - Predicted Rating: 3.8
# 1548 Gone Tonight  - Predicted Rating: 3.8
# 1549 Bag of Bones  - Predicted Rating: 3.8
# 1550 Conversations with Friends  - Predicted Rating: 3.8
# 1551 Heart Bones  - Predicted Rating: 3.8
# 1552 Night Embrace
        (Dark-Hunter, #2)  - Predicted Rating: 3.8
# 1553 Kitch

# 1862 The Pelican Brief  - Predicted Rating: 3.7
# 1863 The Witches  - Predicted Rating: 3.7
# 1864 Eileen  - Predicted Rating: 3.7
# 1865 Broken  - Predicted Rating: 3.7
# 1866 Rich People Problems
        (Crazy Rich Asians, #3)  - Predicted Rating: 3.7
# 1867 Thinner  - Predicted Rating: 3.7
# 1868 Doctor Sleep
        (The Shining, #2)  - Predicted Rating: 3.7
# 1869 Caleb's Crossing  - Predicted Rating: 3.7
# 1870 The Dog Stars  - Predicted Rating: 3.7
# 1871 Fool Moon
        (The Dresden Files, #2)  - Predicted Rating: 3.7
# 1872 Done and Dusted
        (Rebel Blue Ranch, #1)  - Predicted Rating: 3.7
# 1873 Anne's House of Dreams
        (Anne of Green Gables, #5)  - Predicted Rating: 3.7
# 1874 I Am Legend and Other Stories  - Predicted Rating: 3.7
# 1875 Oryx and Crake
        (MaddAddam, #1)  - Predicted Rating: 3.7
# 1876 Lean Mean Thirteen
        (Stephanie Plum, #13)  - Predicted Rating: 3.7
# 1877 Side Jobs
        (The Dresden Files, #12.5)  - Predicted Rating: 3.7
# 1

# 2173 Moonwalking with Einstein: The Art and Science of Remembering Everything  - Predicted Rating: 3.6
# 2174 Witches Abroad
        (Discworld, #12; Witches, #3)  - Predicted Rating: 3.6
# 2175 The Paris Wife  - Predicted Rating: 3.6
# 2176 Tripwire
        (Jack Reacher, #3)  - Predicted Rating: 3.6
# 2177 Lone Wolf  - Predicted Rating: 3.6
# 2178 The Summoning
        (Darkest Powers, #1)  - Predicted Rating: 3.6
# 2179 The Republic of Thieves
        (Gentleman Bastard, #3)  - Predicted Rating: 3.6
# 2180 The Vegetarian  - Predicted Rating: 3.6
# 2181 We Were Liars  - Predicted Rating: 3.6
# 2182 Frostbite
        (Vampire Academy, #2)  - Predicted Rating: 3.6
# 2183 Obsidian Butterfly
        (Anita Blake, Vampire Hunter, #9)  - Predicted Rating: 3.6
# 2184 Sybil: The Classic True Story of a Woman Possessed by Sixteen Personalities  - Predicted Rating: 3.6
# 2185 Tender Is the Flesh  - Predicted Rating: 3.6
# 2186 Speak  - Predicted Rating: 3.6
# 2187 The Tao of Pooh  - Predicte

# 2516 Wyrd Sisters
        (Discworld, #6; Witches, #2)  - Predicted Rating: 3.4
# 2517 The Hour I First Believed  - Predicted Rating: 3.4
# 2518 The Duke and I
        (Bridgertons, #1)  - Predicted Rating: 3.4
# 2519 Notorious Nineteen
        (Stephanie Plum, #19)  - Predicted Rating: 3.4
# 2520 The Husbands  - Predicted Rating: 3.4
# 2521 Dorothy Must Die
        (Dorothy Must Die, #1)  - Predicted Rating: 3.4
# 2522 The Last Kingdom
        (The Saxon Stories, #1)  - Predicted Rating: 3.4
# 2523 Frindle  - Predicted Rating: 3.4
# 2524 13 Little Blue Envelopes
        (Little Blue Envelope, #1)  - Predicted Rating: 3.4
# 2525 The Closers
        (Harry Bosch, #11; Harry Bosch Universe, #15)  - Predicted Rating: 3.4
# 2526 The Affair
        (Jack Reacher, #16)  - Predicted Rating: 3.4
# 2527 Snow  - Predicted Rating: 3.4
# 2528 After You
        (Me Before You, #2)  - Predicted Rating: 3.4
# 2529 Tithe
        (Modern Faerie Tales, #1)  - Predicted Rating: 3.4
# 2530 The Sanatoriu

# 2749 Rich Dad, Poor Dad  - Predicted Rating: 3.2
# 2750 The Other Mrs.  - Predicted Rating: 3.2
# 2751 Diary of a Wimpy Kid
        (Diary of a Wimpy Kid, #1)  - Predicted Rating: 3.2
# 2752 The Reversal
        (The Lincoln Lawyer, #3; Harry Bosch Universe, #22)  - Predicted Rating: 3.2
# 2753 Robinson Crusoe  - Predicted Rating: 3.2
# 2754 Force of Nature
        (Aaron Falk, #2)  - Predicted Rating: 3.2
# 2755 Luster  - Predicted Rating: 3.2
# 2756 Heaven is for Real: A Little Boy's Astounding Story of His Trip to Heaven and Back  - Predicted Rating: 3.2
# 2757 An Abundance of Katherines  - Predicted Rating: 3.2
# 2758 The Marriage Bargain
        (Marriage to a Billionaire, #1)  - Predicted Rating: 3.2
# 2759 Unnatural Exposure
        (Kay Scarpetta, #8)  - Predicted Rating: 3.2
# 2760 The Friend Zone
        (The Friend Zone, #1)  - Predicted Rating: 3.2
# 2761 Attachments  - Predicted Rating: 3.2
# 2762 The Women in the Castle  - Predicted Rating: 3.2
# 2763 Betrayed
        (

In [23]:
#making weighted loss matrix
percents = np.array([ 2.0839861,   6.38564535, 22.8939068,  37.94135873, 30.69510302])
each_weights = 100/percents
print(each_weights)
print(each_weights.sum())

print(each_weights * percents)

weights_array = np.zeros(ratings_torch.shape)
for i in tqdm(range(len(ratings_torch))):
    for j in range(len(ratings_torch[0])):
        for num in [1, 2, 3, 4, 5]:
            if ratings_torch[i,j] == num:
                weights_array[i,j] = each_weights[num-1]
weights_tensor = torch.tensor(weights_array)

[47.98496497 15.66012431  4.36797445  2.63564625  3.25784865]
73.90655863743766
[100. 100. 100. 100. 100.]


100%|███████████████████████████████████████████████████████████████████████████████| 1864/1864 [04:50<00:00,  6.42it/s]


In [24]:
weights[0]

NameError: name 'weights' is not defined

In [None]:
#doing masked autoencoder with weighted loss
latent_dim = 100 # Number of latent features

model = SparseAutoencoder(num_items, latent_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Use MSE loss with weights but only consider observed values
def masked_mse_loss_diff(reconstructed, original, mask, weights):
    loss = (((reconstructed - original) ** 2) * mask)
    weighted_loss = loss * weights
    return weighted_loss.sum() / mask.sum()/100

#break up data into train and val
print("ratings_torch shape = ", ratings_torch.shape)
print("mask_tensor shape = ", mask_tensor.shape)
print("weights shape = ", weights_tensor.shape)

dataset = TensorDataset(ratings_torch, mask_tensor, weights_tensor) #keeping the mask
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


#train the model
epochs = 5000
best_loss = 10e10
counter = 0
for epoch in range(epochs):
    train_loss = 0.0
    for inputs, mask, this_weight in train_loader:
        model.train()
        optimizer.zero_grad()

        # Forward pass
        reconstructed = model(inputs)
        loss = masked_mse_loss_diff(reconstructed, inputs, mask, this_weight)
#         loss_not_weighted = masked_mse_loss(reconstructed, inputs, mask)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss /= len(train_loader)
    
    if (epoch + 1) % 10 == 0:
#         print(f"Epoch {epoch + 1}/{epochs}, Loss: {train_loss}")
        #check validation
        model.eval()
        val_loss = 0.0
        val_loss_not_weighted = 0.0
        with torch.no_grad():
            for inputs, mask, this_weight in val_loader:
                outputs = model(inputs)
                loss = masked_mse_loss_diff(outputs, inputs, mask, this_weight)
                loss_not_weighted = masked_mse_loss(outputs, inputs, mask)
                val_loss += loss.item()
                val_loss_not_weighted += loss_not_weighted.item()
                

        val_loss /= len(val_loader)
        

        print(f"Epoch {epoch + 1} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f} - Val not weighted: {val_loss_not_weighted:.4f}" )
        if val_loss < best_loss: #if improve then save
            #save model 
            torch.save(model.state_dict(), "model_weighted{}.pkl".format(latent_dim))
            best_loss = val_loss
            print("Model saved to model_weighted{}.pkl.".format(latent_dim))
            counter = 0
        else:
            counter += 10
            
    if counter > 200:
        print("Done training because of no improvement.")
        break
        
            


In [None]:
dfghj
import torch
from sklearn.model_selection import KFold

# Mask for observed values (1 for observed, 0 for missing)
ratings_torch = torch.tensor(ratings).float()
mask = (ratings_torch != 0).float()
print(mask)


#Define autoencoder
class SparseAutoencoder(nn.Module):
    def __init__(self, num_items, latent_dim):
        super(SparseAutoencoder, self).__init__()
        self.encoder = nn.Linear(num_items, latent_dim)
        self.decoder = nn.Linear(latent_dim, num_items)
        
    def forward(self, x):
        encoded = torch.relu(self.encoder(x))
        decoded = self.decoder(encoded)
        # Scale sigmoid output to [1, 5]
        return 1 + 4 * torch.sigmoid(decoded)
        return decoded

    
#initialize the model
num_users, num_items = ratings_torch.shape
np.save("num_users.npy", np.array(num_users))
np.save("num_items.npy", np.array(num_items))

for latent_dim in [2, 5, 10, 20, 40, 50, 75, 100]:
    print("latent_dim = ", latent_dim)
# latent_dim = 20  # Number of latent features

    model = SparseAutoencoder(num_items, latent_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Define your model, loss function, and optimizer
    # Assuming model, ratings_torch, mask, and optimizer are already defined

    epochs = 1000
    k_folds = 5  # Number of folds for cross-validation
    kf = KFold(n_splits=k_folds, shuffle=True)

    # Store the losses for each fold
    fold_losses = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(ratings_torch)):
#         print(f"\nFold {fold + 1}/{k_folds}")

        # Split the data into training and validation sets
        train_ratings = ratings_torch[train_idx]
        val_ratings = ratings_torch[val_idx]
        train_mask = mask[train_idx]
        val_mask = mask[val_idx]

        # Initialize a new model for each fold
        model = SparseAutoencoder(num_items, latent_dim)
    #     optimizer = optim.Adam(model.parameters(), lr=0.01)

        # Re-initialize optimizer for each fold
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # Train the model
        for epoch in range(epochs):
            model.train()
            optimizer.zero_grad()

            # Forward pass for training
            reconstructed = model(train_ratings)
            loss = masked_mse_loss(reconstructed, train_ratings, train_mask)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

#             if (epoch + 1) % 1000 == 0:
#                 print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}")

        # Evaluate the model on the validation set
        model.eval()
        with torch.no_grad():
            reconstructed_val = model(val_ratings)
            val_loss = masked_mse_loss(reconstructed_val, val_ratings, val_mask)

        print(f"Validation Loss for Fold {fold + 1}: {val_loss.item():.4f}")

        # Store the validation loss for this fold
        fold_losses.append(val_loss.item())

    # Print the average validation loss after all folds
    print(f"\nAverage Validation Loss across all folds: {sum(fold_losses)/k_folds:.4f}")


In [None]:
fgh

In [None]:
ratings_matrix[user_id]

In [None]:
# Convert the filled ratings data into a numpy array
ratings_matrix = ratings_df.values

# Initialize KNN (using user-based KNN)
import math
knn = NearestNeighbors(n_neighbors=math.ceil(num_users/10), metric='cosine')  # Using cosine similarity
knn.fit(ratings_matrix)

# Example: Predict rating for User 0 on Item 2
user_id = 0  # Index of user in the matrix
item_id = 9  # Index of item to predict rating for

# Get the nearest neighbors for user 0 (excluding the user itself)
distances, indices = knn.kneighbors([ratings_matrix[user_id]])

pred_ratings_list = np.array([])
rankings_list = np.array([])
for item_id in range(num_titles):
    # Get the ratings for the neighbors on item 2
    neighbor_ratings = np.array([ratings_matrix[i, item_id] for i in indices[0] if not np.isnan(ratings_matrix[i, item_id])])
    print(ratings_matrix[:, item_id])
    print(neighbor_ratings)
    ghjk
    
    predicted_rating = np.mean(neighbor_ratings[np.nonzero(neighbor_ratings)])
    rankings = np.sum(neighbor_ratings[np.nonzero(neighbor_ratings)])

#     print(predicted_rating)

    pred_ratings_list = np.append(pred_ratings_list, predicted_rating)
    rankings_list = np.append(rankings_list, rankings)
#     print(f"Predicted rating for User {user_id} on Item {item_id}: {predicted_rating}")
    
best_book_rating = np.max(pred_ratings_list)
best_book_idx = np.argmax(pred_ratings_list)

sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
for i, idx in enumerate(sorted_indices): 
    print("#", (i+1) , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))

In [None]:
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Create a sparse matrix (CSR format)
A = np.array([[1.0, 0, 0], [0, 2, 3], [4, 0, 6], [0, 0, 0]])
# print(A)
sparse_matrix = sp.csr_matrix(ratings_matrix)

# Perform SVD on the sparse matrix
# k is the number of singular values to compute (you can choose a value smaller than min(m, n))
U, S, VT = svds(sparse_matrix, k=500)

# Output the matrices
print("U (Left Singular Vectors):\n", U)
print("\nS (Singular Values):\n", S)
print("\nVT (Right Singular Vectors - Transposed):\n", VT)

# Reconstruct the matrix from U, S, VT
S_full = np.diag(S)  # Convert singular values to a diagonal matrix
A_reconstructed = np.dot(U, np.dot(S_full, VT))

print("\nReconstructed Matrix A:\n", A_reconstructed)


In [None]:
my_diff = (ratings_matrix[0]- A_reconstructed[0])
print(ratings_matrix.shape)
plt.plot(my_diff, '.')

# for i in range(len(ratings_matrix[0])):
#     if ratings_matrix[0, i] > 0:
#         print(ratings_matrix[0, i], A_reconstructed[0, i], titles[i])
        
for i in range(len(ratings_matrix[0])):
    if ratings_matrix[0, i] == 0 and A_reconstructed[0, i] > 0:
        print(ratings_matrix[0, i], A_reconstructed[0, i], titles[i])


In [None]:
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Create a custom colormap with white for 0 and red for non-zero
cmap = mcolors.ListedColormap(['white', 'red'])
bounds = [0, 0.1, 1]  # Set bounds for 0 (white) and non-zero (red)
norm = mcolors.BoundaryNorm(bounds, cmap.N)

# plt.imshow(ratings_matrix - A_reconstructed, cmap=cmap, norm=norm)
plt.plot(ratings_matrix - A_reconstructed)
plt.show()

In [None]:
ratings_matrix - A_reconstructed

In [None]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.75, min_samples=2, metric='cosine')
labels = dbscan.fit_predict(ratings_matrix)


In [None]:
print(list(set(labels)))
print(labels.shape)

In [None]:
idx_in_group = np.arange(len(labels))
filtered_users = [idx_in_group for idx_in_group, label in zip(idx_in_group, labels) if label == 0]
print(filtered_users)

In [None]:
from sklearn.cluster import SpectralClustering
from scipy.sparse import csr_matrix
import numpy as np

# Example sparse data
# X = np.random.rand(100, 2)
X_sparse = csr_matrix(ratings_matrix)

n_clusters = 50
# Apply Spectral Clustering
spectral = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors')
labels = spectral.fit_predict(X_sparse)

print(labels)
print(list(set(labels)))
print(labels.shape)

In [None]:
# idx_in_group = np.arange(len(labels))
# filtered_users = [idx_in_group for idx_in_group, label in zip(idx_in_group, labels) if label == 0]
# print(filtered_users)

group_averages = []

for group in range(n_clusters):
    # Find indices of users in the current group
    group_users = np.where(labels == group)[0]
    
    # Extract the rows for users in this group
    group_data = ratings_matrix[group_users]
    
    print("Number of perople in group = ", group_data.shape[0])
    
    pred_ratings_list = np.array([])
    for item_id in range(num_titles):
        data = group_data[:,item_id]
        predicted_rating = np.mean(data[np.nonzero(data)])
#                 rankings = np.sum(neighbor_ratings[np.nonzero(neighbor_ratings)])

        pred_ratings_list = np.append(pred_ratings_list, predicted_rating)
    # Compute the average for each column (item) for this group
#     group_avg = np.mean(group_data[np.nonzero(group_data)], axis=0)
    
    # Append the average for this group
    group_averages.append(pred_ratings_list)

# Convert the list of group averages to a numpy array for easy viewing
group_averages = np.array(group_averages)

# Display the average for each item in each group
print("Average preferences for each item by group:")
print(group_averages.shape)

In [None]:
group = labels[0]
print("my group = ", group)
sorted_indices = np.argsort(group_averages[group])[::-1]
print(sorted_indices)
for i in sorted_indices:
    if (ratings_matrix[0, i] > 0) or (np.isnan(group_averages[group, i])):
        pass
    else:
        print(titles[i], round(group_averages[group,i], 1))