In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
import numpy as np
import random
import pickle 

def scrape_goodreads_ratings(user_id, max_pages=10):
    """
    Scrape a user's star ratings from Goodreads.
    
    Args:
    - user_id (str): Goodreads user ID or profile suffix.
    - max_pages (int): Maximum number of pages to scrape (each page contains ~30 books).
    
    Returns:
    - pd.DataFrame: A DataFrame containing book titles and ratings.
    """
    base_url = f"https://www.goodreads.com/review/list/{user_id}?shelf=read"
    headers = {"User-Agent": "Mozilla/5.0"}
    books = []

    for page in range(1, max_pages + 1):
        url = f"{base_url}&page={page}"
        response = requests.get(url, headers=headers)
        
        if response.status_code != 200:
            print(f"Failed to fetch page {page}. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        
        # Find all book entries in the table
        rows = soup.find_all("tr", class_="bookalike review")
        if not rows:
            print("No more data found.")
            break

        for row in rows:
            try:
                title = row.find("td", class_="field title").a.text.strip()
                rating_element = row.find("td", class_="field rating")
                rating = rating_element.find("span", class_="staticStars").get("title", "No rating")
                stars = map_rating(rating)
                books.append({"Title": title, "Rating": stars, "User_id": user_id})
#                 print(title, rating, stars)
            except AttributeError:
                # Handle rows with missing data
                continue

        print(f"Page {page} scraped successfully.")
        time.sleep(random.uniform(1, 5))  # Be kind to the server and avoid being blocked

    # Return data as a pandas DataFrame
    return pd.DataFrame(books)



In [2]:
def map_rating(phrase):
    rating_map = {
        "liked it": 3,
        "really liked it": 4,
        "it was ok": 2, 
        "it was amazing": 5, 
        "did not like it": 1,
    }
    
    return rating_map.get(phrase, "Invalid rating")  # Default to "Invalid rating" if the phrase isn't in the dictionary


In [3]:
# if __name__ == "__main__":
# #     user_id = "6688207"  # Replace with the Goodreads user ID or profile suffix
# #     for user_id in tqdm(['30181442', '75009563', '11345366', '110912303', '113964939', '11215896', '53701594', '4622890', '93628736', '176180116']):
# #     for user_id in tqdm(['2974095', '4622890', '28953843', '16174645', '4159922', '4125660', '54886546', '16912659', '260116', '4685500', '21865425']):
# #     for user_id in tqdm(['53701594', '27709782', '7566229', '16652861', '30817744', '56259255', '4125660', '60964126', 
# #                          '176167767', '28510930', '1029975', '131020767', '28862120', '88713906', '160141433', '41097916', 
# #                          '20809863', '69519261', '24017481', '7376365', '75941333', '13571407', '106618742', '17792052',
# #                          '3534528', '130656897', '7474475', '4125412', '6336365', '6026811', '3438047']):
#     for user_id in ['169695556']:
#         print("User_id = ", user_id)
#         max_pages = 30  # Adjust based on expected data
#         ratings_data = scrape_goodreads_ratings(user_id, max_pages)

#         if not ratings_data.empty:
# #             print(ratings_data.head())
# #             ratings_data.to_csv("goodreads_ratings.csv", index=False)
#             ratings_data.to_csv('goodreads_ratings.csv', mode='a', header=False, index=False)
#             print("Data saved to goodreads_ratings.csv.")
#         else:
#             print("No data retrieved.")


In [4]:
df = pd.read_csv('goodreads_ratings_series.csv')
print(df.shape)
df = df.drop_duplicates()
# Print the entire DataFrame
print(df)

duplicate_count = df['Title'].duplicated().sum()
print("Number of books with at least two people rating it:", duplicate_count)
duplicate_counts_per_value = df['Title'].value_counts()
print(duplicate_counts_per_value)
print("Number of unique books: ", df['Title'].nunique())
num_users = df['User_id'].nunique()
user_ids = list(df['User_id'].unique())
print("number of users is: ", num_users)
print("user_ids = ", user_ids)

  df = pd.read_csv('goodreads_ratings_series.csv')


(271813, 6)
                                                    Title Rating    User_id  \
0                                       I Am Watching You      3  169695558   
1       Three to Get Deadly\n        (Stephanie Plum, #3)      3  169695558   
2       Before the Coffee Gets Cold\n        (Before t...      4  169695558   
3       Dark Sacred Night\n        (Renée Ballard, #2;...      4  169695558   
4         Two for the Dough\n        (Stephanie Plum, #2)      4  169695558   
...                                                   ...    ...        ...   
271808                                    Five Feet Apart      4  133546120   
271809         Me Before You\n        (Me Before You, #1)      5  133546120   
271810                                     Eleanor & Park      4  133546120   
271811  Anna and the French Kiss\n        (Anna and th...      5  133546120   
271812                             The Fault in Our Stars      4  133546120   

       Series  First  Suggest  
0      

In [5]:
# # Get a list of top titles in order
# top_titles = duplicate_counts_per_value.index.tolist()
# top_100 = top_titles[:100]

# for title in top_100:
#     print(title)
    
# with open("top_100.pkl", "wb") as file:
#     pickle.dump(top_100, file)

In [6]:
# threshold = 5#num_users * 0.1
# pop_titles = list(duplicate_counts_per_value[duplicate_counts_per_value > threshold].index)
# my_titles = df.loc[df["User_id"] == 169695558, "Title"].tolist()
# # print(my_titles)

# print("pop titles len = ", len(pop_titles))
# print(pop_titles)
# print("my titles len = ", len(my_titles))
# titles = list(set(pop_titles))# + my_titles))

# # #remove Harry Potter titles:
# # titles = [s for s in titles if "Harry Potter" not in s]

# num_titles = len(titles)


# print(titles)
# # print(titles)
# print("num_titles =", num_titles)

# # ratings = np.full((num_users, num_titles), None)
# ratings = np.zeros((num_users, num_titles))

# for index, row in df.iterrows():
#     if row['Title'] in titles:
#         try:
#             ratings[user_ids.index(row['User_id']), titles.index(row["Title"])] = int(row["Rating"])
# #             print("found ", row["Title"])
#         except:
#             pass
        
# print("ratings size = ", ratings.shape)
# ratings = ratings[~np.all(ratings == 0, axis=1)]
# print("ratings size = ", ratings.shape)
# # Save the list to a file
# with open("titles.pkl", "wb") as file:
#     pickle.dump(titles, file)

In [7]:
# # print(df['Suggest'])

# suggest = list(df['Suggest'])
# print(suggest)

# with open("suggest.pkl", "wb") as file:
#     suggest.dump(suggest, file)

In [8]:
# ratings_df = pd.DataFrame(ratings)
# print(ratings_df.shape)
# #delete users that don't have any of these ratings
# # ratings_df = ratings_df.loc[~(ratings_df == 0).all(axis=1)]
# ratings_df = ratings_df.loc[(ratings_df != 0).sum(axis=1) >= 4] #need at least 4 entries to stay
# print(ratings_df.shape)

# # Calculate percentage of non-zero elements
# percentage_nonzero = (np.count_nonzero(ratings_df) / ratings_df.size) * 100
# print("percentage_nonzero =", round(percentage_nonzero, 1), '%')

# # Save the list to a file
# with open("ratings_df.pkl", "wb") as file:
#     pickle.dump(ratings_df, file)


In [9]:
with open("titles.pkl", "rb") as file:
    titles = pickle.load(file)
    
with open("top_100.pkl", "rb") as file:
    top_100 = pickle.load(file)
    
with open("suggest.pkl", "rb") as file:
    suggest = pickle.load(file)
    
num_titles = len(titles)

# Load the list from the file
with open("ratings_df.pkl", "rb") as file:
    ratings_df = pickle.load(file)
    ratings = ratings_df.to_numpy()


In [10]:
# Convert the filled ratings data into a numpy array
ratings_matrix = ratings_df.values

# Initialize KNN (using user-based KNN)
import math
knn = NearestNeighbors(n_neighbors=50, metric='cosine')  # Using cosine similarity  math.ceil(num_users/10)
knn.fit(ratings_matrix)

with open("knn_model.pkl", "wb") as file:
    pickle.dump(knn, file)

# Example: Predict rating for User 0 on Item 2
user_id = 0  # Index of user in the matrix
item_id = 2  # Index of item to predict rating for

# Get the nearest neighbors for user 0 (excluding the user itself)
distances, indices = knn.kneighbors([ratings_matrix[user_id]])

pred_ratings_list = np.array([])
rankings_list = np.array([])
for item_id in range(num_titles):
    # Get the ratings for the neighbors on item 2
    neighbor_ratings = np.array([ratings_matrix[i, item_id] for i in indices[0] if not np.isnan(ratings_matrix[i, item_id])])

    predicted_rating = np.mean(neighbor_ratings[np.nonzero(neighbor_ratings)])
    rankings = np.sum(neighbor_ratings[np.nonzero(neighbor_ratings)])

#     print(predicted_rating)

    pred_ratings_list = np.append(pred_ratings_list, predicted_rating)
    rankings_list = np.append(rankings_list, rankings)
#     print(f"Predicted rating for User {user_id} on Item {item_id}: {predicted_rating}")
    
best_book_rating = np.max(pred_ratings_list)
best_book_idx = np.argmax(pred_ratings_list)

sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
n = 1
for i, idx in enumerate(sorted_indices): 
    if (ratings[0, idx] > 0) or (np.isnan(pred_ratings_list[idx])):
        pass
    else:
        print("#", (n) , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))
        n+=1

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Top books are:
# 1 Magyk
        (Septimus Heap, #1) Rating: 5.0
# 2 Love, Theoretically Rating: 5.0
# 3 Seriously... I'm Kidding Rating: 5.0
# 4 A Short History of Nearly Everything Rating: 5.0
# 5 All Together Dead
        (Sookie Stackhouse, #7) Rating: 5.0
# 6 A Bear Called Paddington
        (Paddington, #1) Rating: 5.0
# 7 The Camel Club
        (The Camel Club, #1) Rating: 5.0
# 8 The Burning Bridge
        (Ranger's Apprentice, #2) Rating: 5.0
# 9 Vampire Academy
        (Vampire Academy, #1) Rating: 5.0
# 10 The Witch of Blackbird Pond Rating: 5.0
# 11 Gregor the Overlander
        (Underland Chronicles, #1) Rating: 5.0
# 12 My Horizontal Life: A Collection of One-Night Stands Rating: 5.0
# 13 Divine Secrets of the Ya-Ya Sisterhood Rating: 5.0
# 14 The Merchant of Death
        (Pendragon, #1) Rating: 5.0
# 15 Cujo Rating: 5.0
# 16 A Room with a View Rating: 5.0
# 17 The Mouse and the Motorcycle
        (Ralph S. Mouse, #1) Rating: 5.0
# 18 Winter
        (The Lunar Chronicles

        (Time Quintet, #3) Rating: 4.0
# 350 The Black Cauldron
        (The Chronicles of Prydain, #2) Rating: 4.0
# 351 J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings Rating: 4.0
# 352 Three Cups of Tea: One Man's Mission to Promote Peace ... One School at a Time Rating: 4.0
# 353 The Tao of Pooh Rating: 4.0
# 354 The Year of Living Biblically: One Man's Humble Quest to Follow the Bible as Literally as Possible Rating: 4.0
# 355 Tiny Beautiful Things: Advice on Love and Life from Dear Sugar Rating: 4.0
# 356 Steve Jobs Rating: 4.0
# 357 The Memory Keeper's Daughter Rating: 4.0
# 358 Shiver
        (The Wolves of Mercy Falls, #1) Rating: 4.0
# 359 Matilda Rating: 4.0
# 360 Book Lovers Rating: 4.0
# 361 The Paris Wife Rating: 4.0
# 362 Unwind
        (Unwind, #1) Rating: 4.0
# 363 Rodrick Rules
        (Diary of a Wimpy Kid, #2) Rating: 4.0
# 364 Black Beauty Rating: 4.0
# 365 The Island of Sea Women Rating: 4.0
# 366 The Hound of the Baskervilles
        (Sherlo

# 683 The Bell Jar Rating: 3.0
# 684 Snow Falling on Cedars Rating: 3.0
# 685 In Her Shoes Rating: 3.0
# 686 Murder on the Orient Express
        (Hercule Poirot, #10) Rating: 3.0
# 687 Dark Places Rating: 3.0
# 688 The Girls Rating: 3.0
# 689 Darkly Dreaming Dexter
        (Dexter, #1) Rating: 3.0
# 690 Visions of Sugar Plums
        (Stephanie Plum, #8.5) Rating: 3.0
# 691 How the King of Elfhame Learned to Hate Stories
        (The Folk of the Air, #3.5) Rating: 3.0
# 692 The Adventures of Huckleberry Finn Rating: 3.0
# 693 Dead Poets Society Rating: 3.0
# 694 I Am the Messenger Rating: 3.0
# 695 The Raven Boys
        (The Raven Cycle, #1) Rating: 3.0
# 696 The Restaurant at the End of the Universe
        (The Hitchhiker's Guide to the Galaxy, #2) Rating: 3.0
# 697 The Devil Wears Prada
        (The Devil Wears Prada, #1) Rating: 3.0
# 698 Girls in Pants: The Third Summer of the Sisterhood
        (Sisterhood, #3) Rating: 3.0
# 699 From Potter's Field
        (Kay Scarpetta, #6) R

In [11]:
#make knn for the similar user part
# Initialize KNN (using user-based KNN)
# import math
knn_30 = NearestNeighbors(n_neighbors=50, metric='cosine')  # Using cosine similarity
knn_30.fit(ratings_matrix)

with open("knn_model_30.pkl", "wb") as file:
    pickle.dump(knn_30, file)

In [12]:
rankings_list

best_book_rating = np.max(rankings_list)
best_book_idx = np.argmax(rankings_list)

sorted_indices = np.argsort(rankings_list)[::-1]
print("Top books are:")
for i, idx in enumerate(sorted_indices): 
    print("#", (i+1) , titles[idx], "Rating:", round(pred_ratings_list[idx], 1), ". Ranking:", rankings_list[idx])

Top books are:
# 1 Harry Potter and the Prisoner of Azkaban
        (Harry Potter, #3) Rating: 4.8 . Ranking: 220.0
# 2 Harry Potter and the Goblet of Fire
        (Harry Potter, #4) Rating: 4.8 . Ranking: 218.0
# 3 Harry Potter and the Deathly Hallows
        (Harry Potter, #7) Rating: 4.8 . Ranking: 218.0
# 4 Harry Potter and the Half-Blood Prince
        (Harry Potter, #6) Rating: 4.7 . Ranking: 212.0
# 5 Harry Potter and the Chamber of Secrets
        (Harry Potter, #2) Rating: 4.7 . Ranking: 211.0
# 6 Harry Potter and the Order of the Phoenix
        (Harry Potter, #5) Rating: 4.7 . Ranking: 208.0
# 7 Harry Potter and the Sorcerer's Stone
        (Harry Potter, #1) Rating: 4.7 . Ranking: 203.0
# 8 The Hunger Games
        (The Hunger Games, #1) Rating: 4.7 . Ranking: 178.0
# 9 Catching Fire
        (The Hunger Games, #2) Rating: 4.7 . Ranking: 150.0
# 10 Mockingjay
        (The Hunger Games, #3) Rating: 4.5 . Ranking: 143.0
# 11 Twilight
        (The Twilight Saga, #1) Rating: 3.9

        (Sookie Stackhouse, #7) Rating: 5.0 . Ranking: 10.0
# 215 Anna Karenina Rating: 3.3 . Ranking: 10.0
# 216 Atlas Shrugged Rating: 3.3 . Ranking: 10.0
# 217 Inheritance
        (The Inheritance Cycle, #4) Rating: 5.0 . Ranking: 10.0
# 218 Ready Player One
        (Ready Player One, #1) Rating: 5.0 . Ranking: 10.0
# 219 From Dead to Worse
        (Sookie Stackhouse, #8) Rating: 5.0 . Ranking: 10.0
# 220 The Sisterhood of the Traveling Pants
        (Sisterhood, #1) Rating: 3.3 . Ranking: 10.0
# 221 Dead as a Doornail
        (Sookie Stackhouse, #5) Rating: 5.0 . Ranking: 10.0
# 222 Beloved
        (Beloved Trilogy, #1) Rating: 3.3 . Ranking: 10.0
# 223 When You Reach Me Rating: 5.0 . Ranking: 10.0
# 224 Into the Wild Rating: 3.3 . Ranking: 10.0
# 225 The Mark of Athena
        (The Heroes of Olympus, #3) Rating: 5.0 . Ranking: 10.0
# 226 To All the Boys I've Loved Before
        (To All the Boys I've Loved Before, #1) Rating: 5.0 . Ranking: 10.0
# 227 Aristotle and Dante Discover 

# 419 Where the Crawdads Sing Rating: 5.0 . Ranking: 5.0
# 420 The Last Wish
        (The Witcher, #0.5) Rating: 5.0 . Ranking: 5.0
# 421 Anne of Windy Poplars
        (Anne of Green Gables, #4) Rating: 5.0 . Ranking: 5.0
# 422 Corduroy Rating: 5.0 . Ranking: 5.0
# 423 Divine Rivals
        (Letters of Enchantment, #1) Rating: 5.0 . Ranking: 5.0
# 424 Christine Rating: 5.0 . Ranking: 5.0
# 425 Requiem
        (Delirium, #3) Rating: 5.0 . Ranking: 5.0
# 426 Destiny of the Republic: A Tale of Madness, Medicine and the Murder of a President Rating: 5.0 . Ranking: 5.0
# 427 Torment
        (Fallen, #2) Rating: 5.0 . Ranking: 5.0
# 428 Freedom Rating: 5.0 . Ranking: 5.0
# 429 The Wise Manâ€™s Fear
        (The Kingkiller Chronicle, #2) Rating: 5.0 . Ranking: 5.0
# 430 Go the Fuck to Sleep Rating: 2.5 . Ranking: 5.0
# 431 A Room with a View Rating: 5.0 . Ranking: 5.0
# 432 The Song of Achilles Rating: 5.0 . Ranking: 5.0
# 433 The Name of the Wind
        (The Kingkiller Chronicle, #1) Rating

        (Stephanie Plum, #20) Rating: 4.0 . Ranking: 4.0
# 646 Rodrick Rules
        (Diary of a Wimpy Kid, #2) Rating: 4.0 . Ranking: 4.0
# 647 Twenties Girl Rating: 4.0 . Ranking: 4.0
# 648 American Born Chinese Rating: 4.0 . Ranking: 4.0
# 649 Lady Chatterley's Lover Rating: 4.0 . Ranking: 4.0
# 650 The Dream Thieves
        (The Raven Cycle, #2) Rating: 4.0 . Ranking: 4.0
# 651 J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings Rating: 4.0 . Ranking: 4.0
# 652 Doctor Sleep
        (The Shining, #2) Rating: 4.0 . Ranking: 4.0
# 653 All Creatures Great and Small
        (All Creatures Great and Small, #1-2) Rating: 4.0 . Ranking: 4.0
# 654 The Raven King
        (The Raven Cycle, #4) Rating: 4.0 . Ranking: 4.0
# 655 A Swiftly Tilting Planet
        (Time Quintet, #3) Rating: 4.0 . Ranking: 4.0
# 656 The Ugly Truth
        (Diary of a Wimpy Kid, #5) Rating: 4.0 . Ranking: 4.0
# 657 A Million Little Pieces Rating: 4.0 . Ranking: 4.0
# 658 Caraval
        (Caraval, #1

        (The Forest of Hands and Teeth, #1) Rating: 3.0 . Ranking: 3.0
# 876 Dr. Jekyll and Mr. Hyde Rating: 3.0 . Ranking: 3.0
# 877 The Circle
        (The Circle, #1) Rating: 2.0 . Ranking: 2.0
# 878 Happy Place Rating: 2.0 . Ranking: 2.0
# 879 A Spool of Blue Thread Rating: 2.0 . Ranking: 2.0
# 880 Next Rating: 2.0 . Ranking: 2.0
# 881 Sophieâ€™s World Rating: 2.0 . Ranking: 2.0
# 882 A Peopleâ€™s History of the United States: 1492 - Present Rating: 2.0 . Ranking: 2.0
# 883 After You
        (Me Before You, #2) Rating: 2.0 . Ranking: 2.0
# 884 The Return of the King
        (The Lord of the Rings, #3) Rating: 2.0 . Ranking: 2.0
# 885 Choke Rating: 2.0 . Ranking: 2.0
# 886 The Girls' Guide to Hunting and Fishing Rating: 2.0 . Ranking: 2.0
# 887 The Husband's Secret Rating: 2.0 . Ranking: 2.0
# 888 Parable of the Sower
        (Earthseed, #1) Rating: 2.0 . Ranking: 2.0
# 889 The Knife of Never Letting Go
        (Chaos Walking, #1) Rating: 2.0 . Ranking: 2.0
# 890 The Girl Who Loved 

# 1114 Weyward Rating: nan . Ranking: 0.0
# 1115 Later Rating: nan . Ranking: 0.0
# 1116 The Satanic Verses Rating: nan . Ranking: 0.0
# 1117 Sula Rating: nan . Ranking: 0.0
# 1118 Under the Tuscan Sun: At Home in Italy Rating: nan . Ranking: 0.0
# 1119 The Searcher Rating: nan . Ranking: 0.0
# 1120 The Tattooist of Auschwitz Rating: nan . Ranking: 0.0
# 1121 The Last Word Rating: nan . Ranking: 0.0
# 1122 The Whisper Man Rating: nan . Ranking: 0.0
# 1123 The Dutch House Rating: nan . Ranking: 0.0
# 1124 Alexander and the Terrible, Horrible, No Good, Very Bad Day Rating: nan . Ranking: 0.0
# 1125 Wrong Place Wrong Time Rating: nan . Ranking: 0.0
# 1126 Tao Te Ching Rating: nan . Ranking: 0.0
# 1127 The Golem and the Jinni
        (The Golem and the Jinni, #1) Rating: nan . Ranking: 0.0
# 1128 The Invention of Wings Rating: nan . Ranking: 0.0
# 1129 Giovanniâ€™s Room Rating: nan . Ranking: 0.0
# 1130 The Racketeer Rating: nan . Ranking: 0.0
# 1131 Don't Sweat the Small Stuff ... and It'

# 1286 Howlâ€™s Moving Castle
        (Howlâ€™s Moving Castle, #1) Rating: nan . Ranking: 0.0
# 1287 The Eye of the World
        (The Wheel of Time, #1) Rating: nan . Ranking: 0.0
# 1288 Secluded Cabin Sleeps Six Rating: nan . Ranking: 0.0
# 1289 The Brass Verdict
        (The Lincoln Lawyer, #2; Harry Bosch Universe, #19) Rating: nan . Ranking: 0.0
# 1290 Say Nothing: A True Story of Murder and Memory in Northern Ireland Rating: nan . Ranking: 0.0
# 1291 The Whistler
        (The Whistler, #1) Rating: nan . Ranking: 0.0
# 1292 Furies of Calderon
        (Codex Alera, #1) Rating: nan . Ranking: 0.0
# 1293 One True Loves Rating: nan . Ranking: 0.0
# 1294 Angle of Repose Rating: nan . Ranking: 0.0
# 1295 As I Lay Dying Rating: nan . Ranking: 0.0
# 1296 The Sleepwalker's Guide to Dancing Rating: nan . Ranking: 0.0
# 1297 The Bonfire of the Vanities Rating: nan . Ranking: 0.0
# 1298 His & Hers Rating: nan . Ranking: 0.0
# 1299 Native Son Rating: nan . Ranking: 0.0
# 1300 SuperFreakonomics

        (Will Robie, #1) Rating: nan . Ranking: 0.0
# 1510 The School of Essential Ingredients Rating: nan . Ranking: 0.0
# 1511 Team of Rivals: The Political Genius of Abraham Lincoln Rating: nan . Ranking: 0.0
# 1512 Survivor Rating: nan . Ranking: 0.0
# 1513 A Portrait of the Artist as a Young Man Rating: nan . Ranking: 0.0
# 1514 A Fine Balance Rating: nan . Ranking: 0.0
# 1515 Yellowface Rating: nan . Ranking: 0.0
# 1516 The Last Flight Rating: nan . Ranking: 0.0
# 1517 Know My Name Rating: nan . Ranking: 0.0
# 1518 King of Scars
        (King of Scars, #1) Rating: nan . Ranking: 0.0
# 1519 Flawless
        (Chestnut Springs, #1) Rating: nan . Ranking: 0.0
# 1520 The New Jim Crow: Mass Incarceration in the Age of Colorblindness Rating: nan . Ranking: 0.0
# 1521 The Complete Fairy Tales Rating: nan . Ranking: 0.0
# 1522 The Polar Express Rating: nan . Ranking: 0.0
# 1523 Twisted Lies
        (Twisted, #4) Rating: nan . Ranking: 0.0
# 1524 Just the Nicest Couple Rating: nan . Rankin

# 1754 Exodus Rating: nan . Ranking: 0.0
# 1755 The Road Rating: nan . Ranking: 0.0
# 1756 Reckless Girls Rating: nan . Ranking: 0.0
# 1757 Golden Son
        (Red Rising Saga, #2) Rating: nan . Ranking: 0.0
# 1758 Endurance: Shackleton's Incredible Voyage Rating: nan . Ranking: 0.0
# 1759 The Complete Maus Rating: nan . Ranking: 0.0
# 1760 Brideshead Revisited Rating: nan . Ranking: 0.0
# 1761 The Dream Daughter Rating: nan . Ranking: 0.0
# 1762 Too Late Rating: nan . Ranking: 0.0
# 1763 Before They Are Hanged
        (The First Law, #2) Rating: nan . Ranking: 0.0
# 1764 The Door Rating: nan . Ranking: 0.0
# 1765 Dragons of Autumn Twilight
        (Dragonlance: Chronicles, #1) Rating: nan . Ranking: 0.0
# 1766 Cold Days
        (The Dresden Files, #14) Rating: nan . Ranking: 0.0
# 1767 A Separate Peace Rating: nan . Ranking: 0.0
# 1768 Frostbite
        (Vampire Academy, #2) Rating: nan . Ranking: 0.0
# 1769 The Arctic Incident
        (Artemis Fowl, #2) Rating: nan . Ranking: 0.0
# 1

# 1996 Silas Marner Rating: nan . Ranking: 0.0
# 1997 Jurassic Park
        (Jurassic Park, #1) Rating: nan . Ranking: 0.0
# 1998 Romantic Comedy Rating: nan . Ranking: 0.0
# 1999 Just Kids Rating: nan . Ranking: 0.0
# 2000 Still Me
        (Me Before You, #3) Rating: nan . Ranking: 0.0
# 2001 Ninth House
        (Alex Stern, #1) Rating: nan . Ranking: 0.0
# 2002 Migrations Rating: nan . Ranking: 0.0
# 2003 The Amazing Adventures of Kavalier & Clay Rating: nan . Ranking: 0.0
# 2004 I've Got Your Number Rating: nan . Ranking: 0.0
# 2005 King of Wrath
        (Kings of Sin, #1) Rating: nan . Ranking: 0.0
# 2006 Landline Rating: nan . Ranking: 0.0
# 2007 Every Which Way But Dead
        (The Hollows, #3) Rating: nan . Ranking: 0.0
# 2008 The Haunting of Hill House Rating: nan . Ranking: 0.0
# 2009 Before We Were Yours Rating: nan . Ranking: 0.0
# 2010 Paradise Lost Rating: nan . Ranking: 0.0
# 2011 The A.B.C. Murders
        (Hercule Poirot, #13) Rating: nan . Ranking: 0.0
# 2012 Me and E

        (Ender's Saga, #1) Rating: nan . Ranking: 0.0
# 2239 The Mermaid Chair Rating: nan . Ranking: 0.0
# 2240 The Sandcastle Girls Rating: nan . Ranking: 0.0
# 2241 Beowulf Rating: nan . Ranking: 0.0
# 2242 Anna and the French Kiss
        (Anna and the French Kiss, #1) Rating: nan . Ranking: 0.0
# 2243 Circus of the Damned
        (Anita Blake, Vampire Hunter, #3) Rating: nan . Ranking: 0.0
# 2244 The Zombie Survival Guide: Complete Protection from the Living Dead Rating: nan . Ranking: 0.0
# 2245 The Golden Couple Rating: nan . Ranking: 0.0
# 2246 The Woman in the Library Rating: nan . Ranking: 0.0
# 2247 Antigone
        (The Theban Plays, #3) Rating: nan . Ranking: 0.0
# 2248 The Sun Is Also a Star Rating: nan . Ranking: 0.0
# 2249 Pippi Longstocking
        (Pippi LÃ¥ngstrump, #1) Rating: nan . Ranking: 0.0
# 2250 The Year of Magical Thinking Rating: nan . Ranking: 0.0
# 2251 The Light Between Oceans Rating: nan . Ranking: 0.0
# 2252 The Joy of Cooking Rating: nan . Ranking: 0.

# 2480 The Magicians
        (The Magicians, #1) Rating: nan . Ranking: 0.0
# 2481 Flawless
        (Pretty Little Liars, #2) Rating: nan . Ranking: 0.0
# 2482 I'll Give You the Sun Rating: nan . Ranking: 0.0
# 2483 The Hours Rating: nan . Ranking: 0.0
# 2484 The Queen's Fool
        (The Plantagenet and Tudor Novels, #12) Rating: nan . Ranking: 0.0
# 2485 Manhattan Beach Rating: nan . Ranking: 0.0
# 2486 The Snow Child Rating: nan . Ranking: 0.0
# 2487 Perfect Chemistry
        (Perfect Chemistry, #1) Rating: nan . Ranking: 0.0
# 2488 The Long Dark Tea-Time of the Soul
        (Dirk Gently, #2) Rating: nan . Ranking: 0.0
# 2489 Valentine Rating: nan . Ranking: 0.0
# 2490 The Reptile Room
        (A Series of Unfortunate Events, #2) Rating: nan . Ranking: 0.0
# 2491 Hello Stranger Rating: nan . Ranking: 0.0
# 2492 Danse Macabre
        (Anita Blake, Vampire Hunter, #14) Rating: nan . Ranking: 0.0
# 2493 The Gifts of Imperfection Rating: nan . Ranking: 0.0
# 2494 Convenience Store Woman

        (The Guncle, #1) Rating: nan . Ranking: 0.0
# 2743 Soulless
        (Parasol Protectorate, #1) Rating: nan . Ranking: 0.0
# 2745 Two Twisted Crowns
        (The Shepherd King, #2) Rating: nan . Ranking: 0.0
# 2746 Into Thin Air: A Personal Account of the Mt. Everest Disaster Rating: nan . Ranking: 0.0
# 2747 The Death Cure
        (The Maze Runner, #3) Rating: nan . Ranking: 0.0
# 2748 Only If You're Lucky Rating: nan . Ranking: 0.0
# 2749 After I Do Rating: nan . Ranking: 0.0
# 2750 The Windup Girl Rating: nan . Ranking: 0.0
# 2751 If We Were Villains Rating: nan . Ranking: 0.0
# 2752 The Final Girl Support Group Rating: nan . Ranking: 0.0
# 2753 The Snowman
        (Harry Hole, #7) Rating: nan . Ranking: 0.0
# 2754 Hollow City
        (Miss Peregrine's Peculiar Children, #2) Rating: nan . Ranking: 0.0
# 2755 Beach Read Rating: nan . Ranking: 0.0
# 2756 The Well of Ascension
        (Mistborn, #2) Rating: nan . Ranking: 0.0
# 2757 Mercy Rating: nan . Ranking: 0.0
# 2758 Unnatu

# 2981 The Moon Is a Harsh Mistress Rating: nan . Ranking: 0.0
# 2982 My Lovely Wife Rating: nan . Ranking: 0.0
# 2983 Hooked
        (Never After, #1) Rating: nan . Ranking: 0.0
# 2984 The Radium Girls: The Dark Story of Americaâ€™s Shining Women Rating: nan . Ranking: 0.0
# 2985 Ruin and Rising
        (The Shadow and Bone Trilogy, #3) Rating: nan . Ranking: 0.0
# 2986 The Time Keeper Rating: nan . Ranking: 0.0
# 2987 Then Came You Rating: nan . Ranking: 0.0
# 2988 The Rose Code Rating: nan . Ranking: 0.0
# 2989 A Psalm for the Wild-Built
        (Monk & Robot, #1) Rating: nan . Ranking: 0.0
# 2990 King's Cage
        (Red Queen, #3) Rating: nan . Ranking: 0.0
# 2991 Blood Promise
        (Vampire Academy, #4) Rating: nan . Ranking: 0.0
# 2992 The Psychopath Test: A Journey Through the Madness Industry Rating: nan . Ranking: 0.0
# 2993 The Black Dahlia
        (L.A. Quartet, #1) Rating: nan . Ranking: 0.0
# 2994 True Colors Rating: nan . Ranking: 0.0
# 2995 The Reckoning Rating: nan 

In [13]:
#give a list sorted out with books you've already read:
# sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
list_num = 1
for idx in sorted_indices: 
#     print("ratings_matrix[user_id, idx]= ", ratings_matrix[user_id, idx])
    if (ratings_matrix[user_id, idx] > 0) or (np.isnan(pred_ratings_list[idx])):
        continue
    print("#", list_num , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))
    list_num += 1

Top books are:
# 1 Harry Potter and the Sorcerer's Stone
        (Harry Potter, #1) Rating: 4.7
# 2 Eclipse
        (The Twilight Saga, #3) Rating: 4.1
# 3 The da Vinci Code
        (Robert Langdon, #2) Rating: 4.4
# 4 Breaking Dawn
        (The Twilight Saga, #4) Rating: 3.9
# 5 Pride and Prejudice Rating: 4.2
# 6 The Hobbit, or There and Back Again
        (The Lord of the Rings, #0) Rating: 4.5
# 7 The Kite Runner Rating: 4.7
# 8 The Great Gatsby Rating: 3.6
# 9 The Help Rating: 4.7
# 10 The Lion, the Witch and the Wardrobe
        (Chronicles of Narnia, #1) Rating: 4.1
# 11 A Game of Thrones
        (A Song of Ice and Fire, #1) Rating: 4.9
# 12 Angels & Demons
        (Robert Langdon, #1) Rating: 4.1
# 13 The Book Thief Rating: 4.6
# 14 1984 Rating: 4.1
# 15 The Lovely Bones Rating: 3.5
# 16 Where the Sidewalk Ends Rating: 3.9
# 17 The Sea of Monsters
        (Percy Jackson and the Olympians, #2) Rating: 4.2
# 18 Of Mice and Men Rating: 3.3
# 19 The Diary of a Young Girl Rating: 4.

# 206 Don't Let the Pigeon Drive the Bus! Rating: 3.0
# 207 Love You Forever Rating: 4.5
# 208 The Tell-Tale Heart and Other Writings Rating: 4.5
# 209 Life of Pi Rating: 3.0
# 210 Room Rating: 4.5
# 211 It Rating: 4.5
# 212 Legend
        (Legend, #1) Rating: 4.5
# 213 When You Are Engulfed in Flames Rating: 4.5
# 214 The Girl Who Kicked the Hornetâ€™s Nest
        (Millennium, #3) Rating: 4.5
# 215 On the Banks of Plum Creek
        (Little House, #4) Rating: 4.5
# 216 A Time to Kill
        (Jake Brigance, #1) Rating: 4.5
# 217 The Throne of Fire
        (The Kane Chronicles, #2) Rating: 4.0
# 218 We Were Liars Rating: 4.0
# 219 Dead in the Family
        (Sookie Stackhouse, #10) Rating: 4.0
# 220 The Alchemist Rating: 2.7
# 221 A Good Girl's Guide to Murder
        (A Good Girl's Guide to Murder, #1) Rating: 4.0
# 222 The Screwtape Letters Rating: 4.0
# 223 Fast Food Nation: The Dark Side of the All-American Meal Rating: 4.0
# 224 Bridget Jones: The Edge of Reason
        (Bridget 

        (Last Survivors, #1) Rating: 4.0
# 502 Monster Rating: 4.0
# 503 Alice in Wonderland Rating: 4.0
# 504 If You Give a Moose a Muffin Rating: 4.0
# 505 I Capture the Castle Rating: 4.0
# 506 Brave New World and Brave New World Revisited Rating: 4.0
# 507 4th of July
        (Women's Murder Club, #4) Rating: 4.0
# 508 On Beauty Rating: 4.0
# 509 World War Z: An Oral History of the Zombie War Rating: 4.0
# 510 Lullaby Rating: 4.0
# 511 My Ãntonia Rating: 4.0
# 512 Iron Flame
        (The Empyrean, #2) Rating: 4.0
# 513 Quiet: The Power of Introverts in a World That Can't Stop Talking Rating: 4.0
# 514 Madame Bovary Rating: 4.0
# 515 The Eternity Code
        (Artemis Fowl, #3) Rating: 4.0
# 516 Heartstopper: Volume Two
        (Heartstopper, #2) Rating: 4.0
# 517 The Night Circus Rating: 4.0
# 518 The Innocent Rating: 4.0
# 519 Brown Bear, Brown Bear, What Do You See? Rating: 4.0
# 520 Watership Down
        (Watership Down, #1) Rating: 4.0
# 521 Just Mercy Rating: 4.0
# 522 The T

        (Abhorsen, #2) Rating: 2.0
# 820 Slaughterhouse-Five Rating: 2.0
# 821 Lucky Rating: 2.0
# 822 The Tipping Point: How Little Things Can Make a Big Difference Rating: 2.0
# 823 Beautiful Disaster
        (Beautiful, #1) Rating: 2.0
# 824 On the Road Rating: 2.0
# 825 The White Queen
        (The Plantagenet and Tudor Novels, #2) Rating: 2.0
# 826 Fangirl Rating: 2.0
# 827 The Bridges of Madison County Rating: 2.0
# 828 The Ballad of Songbirds and Snakes
        (The Hunger Games, #0) Rating: 2.0
# 829 1776 Rating: 2.0
# 830 Cutting for Stone Rating: 1.0
# 831 Things Fall Apart
        (The African Trilogy, #1) Rating: 1.0
# 832 Twilight: The Complete Illustrated Movie Companion Rating: 1.0
# 833 I Know Why the Caged Bird Sings
        (Maya Angelou's Autobiography, #1) Rating: 1.0
# 834 Killing Lincoln: The Shocking Assassination that Changed America Forever Rating: 1.0
# 835 Catch-22 Rating: 1.0
# 836 The Once and Future King Rating: 1.0
# 837 The Last Song Rating: 1.0
# 838 Sn

In [14]:
idx = 8
this_ratings = ratings_matrix[indices[0,idx]]
print(this_ratings)

my_ratings = ratings_matrix[indices[0,0]]
print(my_ratings)

for i, rating in enumerate(this_ratings):
    if rating > 0 and my_ratings[i]>0:
        print("-", titles[i], ", their Rating:", rating, " My Rating:", my_ratings[i])

# print(distances, indices[-1,-1])

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
- A Breath of Snow and Ashes
        (Outlander, #6) , their Rating: 4.0  My Rating: 5.0
- An Echo in the Bone
        (Outlander, #7) , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Deathly Hallows
        (Harry Potter, #7) , their Rating: 5.0  My Rating: 5.0
- Catching Fire
        (The Hunger Games, #2) , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Prisoner of Azkaban
        (Harry Potter, #3) , their Rating: 5.0  My Rating: 5.0
- To Kill a Mockingbird , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Goblet of Fire
        (Harry Potter, #4) , their Rating: 5.0  My Rating: 5.0
- Divergent
        (Divergent, #1) , their Rating: 4.0  My Rating: 5.0
- Allegiant
        (Divergent, #3) , their Rating: 3.0  My Rating: 5.0
- Harry Potter and the Order of the Phoenix
        (Harry Potter, #5) , their Rating: 5.0  My Rating: 5.0
- Harry Potter and the Chamber of Secrets
        (Harry Potter, #2) , their R

In [15]:
#find most similar books using cosine similarity
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(ratings_matrix.T)

similarity_df = pd.DataFrame(similarity_matrix, index=titles, columns=titles)

# Function to get k nearest neighbors for a movie
def get_similar_book(book_name, k=3):
    similar_book = similarity_df[book_name].sort_values(ascending=False)[1:k+1]
    return similar_book

book_name = 'First Lie Wins'
print("\nTop 5 similar book to", book_name, ":")
print(get_similar_book(book_name, k=5))


Top 5 similar book to First Lie Wins :
She's Not Sorry         0.557165
Listen for the Lie      0.554626
Darling Girls           0.538319
The Fury                0.512215
None of This Is True    0.504321
Name: First Lie Wins, dtype: float64


In [16]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

# Example user rating data (rows = users, columns = items)
ratings_df = pd.DataFrame(ratings)

# Step 1: Handle missing values using imputation (we will fill missing values with the mean rating)
imputer = SimpleImputer(strategy='mean')
ratings_filled = imputer.fit_transform(ratings_df)

# Step 2: Apply KMeans clustering
# We will use 2 clusters as an example
kmeans = KMeans(n_clusters=int(num_users/10), random_state=42)
clusters = kmeans.fit_predict(ratings_filled)

# Step 3: Add the cluster labels to the original DataFrame
ratings_df['Cluster'] = clusters

# # Print the user ratings with cluster assignments
# print("\nUser Ratings with Clusters:")
# print(ratings_df)

# # Step 4: Print the cluster centers (the centroid of each cluster)
# print("\nCluster Centers (Centroids):")
# print(kmeans.cluster_centers_)


user_id = 0
# print(clusters)

cluster_this_user = clusters[user_id]
# print(cluster_this_user)

pred_ratings_list = kmeans.cluster_centers_[user_id]
# print(pred_ratings_list)
# for i in range(len())

#give a list sorted out with books you've already read:
sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
list_num = 1
for idx in sorted_indices: 
#     print("ratings_matrix[user_id, idx]= ", ratings_matrix[user_id, idx])
    if (ratings_matrix[user_id, idx] > 0) or (np.isnan(pred_ratings_list[idx])) :
        continue
    print("#", list_num , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))
    list_num += 1



Top books are:
# 1 Fourth Wing
        (The Empyrean, #1) Rating: 5.0
# 2 The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life Rating: 5.0
# 3 Becoming Rating: 5.0
# 4 Lord of the Flies Rating: 5.0
# 5 The Only One Left Rating: 5.0
# 6 Happy Place Rating: 5.0
# 7 People We Meet on Vacation Rating: 5.0
# 8 The Ballad of Songbirds and Snakes
        (The Hunger Games, #0) Rating: 5.0
# 9 Book Lovers Rating: 4.0
# 10 Beach Read Rating: 4.0
# 11 Just for the Summer Rating: 4.0
# 12 The Seven Year Slip Rating: 4.0
# 13 Iron Flame
        (The Empyrean, #2) Rating: 4.0
# 14 Funny Story Rating: 4.0
# 15 The Last Flight Rating: 3.0
# 16 A Court of Frost and Starlight
        (A Court of Thorns and Roses, #3.5) Rating: 3.0
# 17 Paper Towns Rating: 3.0
# 18 The Song of Achilles Rating: 3.0
# 19 A Wrinkle in Time
        (Time Quintet, #1) Rating: 3.0
# 20 Terms and Conditions
        (Dreamland Billionaires, #2) Rating: 2.0
# 21 Cause of Death
        (Kay Scarp

# 247 The Princess Diaries
        (The Princess Diaries, #1) Rating: 0.0
# 248 It's Not Summer Without You
        (Summer, #2) Rating: 0.0
# 249 Seven Days in June Rating: 0.0
# 250 One Shot
        (Jack Reacher, #9) Rating: 0.0
# 251 Maybe You Should Talk to Someone Rating: 0.0
# 252 Lady Tan’s Circle of Women Rating: 0.0
# 253 When She Woke Rating: 0.0
# 254 Tell Me Lies Rating: 0.0
# 255 Jonathan Strange & Mr Norrell Rating: 0.0
# 256 Everything I Never Told You Rating: 0.0
# 257 Four to Score
        (Stephanie Plum, #4) Rating: 0.0
# 258 The Girl Who Kicked the Hornet’s Nest
        (Millennium, #3) Rating: 0.0
# 259 Big Lies in a Small Town Rating: 0.0
# 260 West With Giraffes Rating: 0.0
# 261 Angels Flight
        (Harry Bosch, #6; Harry Bosch Universe, #8) Rating: 0.0
# 262 The Elfstones of Shannara
        (The Original Shannara Trilogy, #2) Rating: 0.0
# 263 Eclipse
        (The Twilight Saga, #3) Rating: 0.0
# 264 Mostly Harmless
        (Hitchhiker's Guide to the Galaxy

        (Discworld, #13) Rating: 0.0
# 532 The Storyteller's Secret Rating: 0.0
# 533 Little Dorrit Rating: 0.0
# 534 State of Terror Rating: 0.0
# 535 The Illustrated Man Rating: 0.0
# 536 Aristotle and Dante Discover the Secrets of the Universe
        (Aristotle and Dante, #1) Rating: 0.0
# 537 Beautiful Darkness
        (Caster Chronicles, #2) Rating: 0.0
# 538 Foundation
        (Foundation, #1) Rating: 0.0
# 539 The Dark Half Rating: 0.0
# 540 Tomorrow, and Tomorrow, and Tomorrow Rating: 0.0
# 541 Steal Like an Artist: 10 Things Nobody Told You About Being Creative Rating: 0.0
# 542 The Omnivore's Dilemma: A Natural History of Four Meals Rating: 0.0
# 543 The Lincoln Highway Rating: 0.0
# 544 Smokin' Seventeen
        (Stephanie Plum, #17) Rating: 0.0
# 545 The Hero of Ages
        (Mistborn, #3) Rating: 0.0
# 546 The Midnight Line
        (Jack Reacher, #22) Rating: 0.0
# 547 The Wife Between Us Rating: 0.0
# 548 Band of Brothers: E Company, 506th Regiment, 101st Airborne from N

        (The Powerless Trilogy, #1) Rating: 0.0
# 817 All Together Dead
        (Sookie Stackhouse, #7) Rating: 0.0
# 818 Where the Forest Meets the Stars Rating: 0.0
# 819 Sing, Unburied, Sing Rating: 0.0
# 820 Dead in the Family
        (Sookie Stackhouse, #10) Rating: 0.0
# 821 Truly Madly Guilty Rating: 0.0
# 822 10th Anniversary
        (Women's Murder Club, #10) Rating: 0.0
# 823 Daemon
        (Daemon, #1) Rating: 0.0
# 824 The Portrait of a Lady Rating: 0.0
# 825 Mother May I Rating: 0.0
# 826 Mudbound Rating: 0.0
# 827 The Girls' Guide to Hunting and Fishing Rating: 0.0
# 828 What Lies in the Woods Rating: 0.0
# 829 2001: A Space Odyssey
        (Space Odyssey, #1) Rating: 0.0
# 830 Dragonflight
        (Dragonriders of Pern, #1) Rating: 0.0
# 831 Dead Until Dark
        (Sookie Stackhouse, #1) Rating: 0.0
# 832 State of Wonder Rating: 0.0
# 833 End of Watch
        (Bill Hodges Trilogy, #3) Rating: 0.0
# 834 What I Talk About When I Talk About Running Rating: 0.0
# 835 The Sa

# 1125 The Titan’s Curse
        (Percy Jackson and the Olympians, #3) Rating: 0.0
# 1126 The House of Eve Rating: 0.0
# 1127 Ordinary Grace Rating: 0.0
# 1128 The Eyes of the Dragon Rating: 0.0
# 1129 The Closers
        (Harry Bosch, #11; Harry Bosch Universe, #15) Rating: 0.0
# 1130 Gerald's Game Rating: 0.0
# 1131 The Risk
        (Mindf*ck, #1) Rating: 0.0
# 1132 Stone of Tears
        (Sword of Truth, #2) Rating: 0.0
# 1133 Life As We Knew It
        (Last Survivors, #1) Rating: 0.0
# 1134 In an Instant Rating: 0.0
# 1135 The Berry Pickers Rating: 0.0
# 1136 Stuart Little Rating: 0.0
# 1137 Shadow Puppets
        (The Shadow Series, #3) Rating: 0.0
# 1138 Just the Nicest Couple Rating: 0.0
# 1139 All the Bright Places Rating: 0.0
# 1140 Upgrade Rating: 0.0
# 1141 Y: The Last Man, Vol. 1: Unmanned Rating: 0.0
# 1142 I Capture the Castle Rating: 0.0
# 1143 Love You Forever Rating: 0.0
# 1144 Tinkers Rating: 0.0
# 1145 Triptych
        (Will Trent, #1) Rating: 0.0
# 1146 The Last Ki

# 1529 The Moonstone Rating: 0.0
# 1530 The Rom-Commers Rating: 0.0
# 1531 The Queen of the Damned
        (The Vampire Chronicles, #3) Rating: 0.0
# 1532 At the Water's Edge Rating: 0.0
# 1533 This is Going to Hurt: Secret Diaries of a Junior Doctor Rating: 0.0
# 1534 Lucky Rating: 0.0
# 1535 Silver Borne
        (Mercy Thompson, #5) Rating: 0.0
# 1536 Tuesdays with Morrie Rating: 0.0
# 1537 Barefoot Rating: 0.0
# 1538 The Villa Rating: 0.0
# 1539 The Deep Rating: 0.0
# 1540 Dead Witch Walking
        (The Hollows, #1) Rating: 0.0
# 1541 Pines
        (Wayward Pines, #1) Rating: 0.0
# 1542 My Sisterâ€™s Keeper Rating: 0.0
# 1543 True Believer
        (Jeremy Marsh & Lexie Darnell, #1) Rating: 0.0
# 1544 Life, the Universe and Everything
        (The Hitchhiker's Guide to the Galaxy, #3) Rating: 0.0
# 1545 Who Moved My Cheese? An Amazing Way to Deal with Change in Your Work and in Your Life... Rating: 0.0
# 1546 Desperation Rating: 0.0
# 1547 Exit Strategy
        (The Murderbot Diarie

# 1829 A Farewell to Arms Rating: 0.0
# 1830 The Love of My Afterlife Rating: 0.0
# 1831 Mother-Daughter Murder Night Rating: 0.0
# 1832 The Judge's List
        (The Whistler, #2) Rating: 0.0
# 1833 The Communist Manifesto Rating: 0.0
# 1834 Where the Red Fern Grows Rating: 0.0
# 1835 Needful Things Rating: 0.0
# 1836 Finger Lickin' Fifteen
        (Stephanie Plum, #15) Rating: 0.0
# 1837 Half Broke Horses Rating: 0.0
# 1838 The Lorax Rating: 0.0
# 1839 Hard-Boiled Wonderland and the End of the World Rating: 0.0
# 1840 Open House Rating: 0.0
# 1841 The Secret Place
        (Dublin Murder Squad, #5) Rating: 0.0
# 1842 The Life and Times of the Thunderbolt Kid Rating: 0.0
# 1843 Revelations
        (Blue Bloods, #3) Rating: 0.0
# 1844 Lola and the Boy Next Door
        (Anna and the French Kiss, #2) Rating: 0.0
# 1845 The Casual Vacancy Rating: 0.0
# 1846 The Mist Rating: 0.0
# 1847 The God of Small Things Rating: 0.0
# 1848 Watchers Rating: 0.0
# 1849 Not a Happy Family Rating: 0.0
# 1

# 2148 Thinking, Fast and Slow Rating: 0.0
# 2149 Timeline Rating: 0.0
# 2150 Every Last Fear Rating: 0.0
# 2151 Brave New World and Brave New World Revisited Rating: 0.0
# 2152 The World Is Flat: A Brief History of the Twenty-first Century Rating: 0.0
# 2153 Legendborn
        (The Legendborn Cycle, #1) Rating: 0.0
# 2154 A Talent for Murder
        (Henry Kimball/Lily Kintner, #3) Rating: 0.0
# 2155 The King of Torts Rating: 0.0
# 2156 The White Queen
        (The Plantagenet and Tudor Novels, #2) Rating: 0.0
# 2157 Out of the Dust Rating: 0.0
# 2158 Jude the Obscure Rating: 0.0
# 2159 Push Rating: 0.0
# 2160 Brother Odd
        (Odd Thomas, #3) Rating: 0.0
# 2161 Once There Were Wolves Rating: 0.0
# 2162 In the Woods
        (Dublin Murder Squad, #1) Rating: 0.0
# 2163 The Bone Clocks Rating: 0.0
# 2164 The Nest Rating: 0.0
# 2165 Leave the World Behind Rating: 0.0
# 2166 Water for Elephants Rating: 0.0
# 2167 The Great Divorce Rating: 0.0
# 2168 Astrophysics for People in a Hurry R

# 2473 The Silver Chair
        (Chronicles of Narnia, #4) Rating: 0.0
# 2474 Nightwatching Rating: 0.0
# 2475 The Power of Now: A Guide to Spiritual Enlightenment Rating: 0.0
# 2476 By Any Other Name Rating: 0.0
# 2477 Bloodsucking Fiends
        (A Love Story, #1) Rating: 0.0
# 2478 Empire of Storms
        (Throne of Glass, #5) Rating: 0.0
# 2479 The Art of Fielding Rating: 0.0
# 2480 Light in August Rating: 0.0
# 2481 Sense and Sensibility Rating: 0.0
# 2482 Medea Rating: 0.0
# 2483 Eye of the Needle Rating: 0.0
# 2484 Simple Genius
        (Sean King & Michelle Maxwell, #3) Rating: 0.0
# 2485 Danse Macabre
        (Anita Blake, Vampire Hunter, #14) Rating: 0.0
# 2486 The Gifts of Imperfection Rating: 0.0
# 2487 Convenience Store Woman Rating: 0.0
# 2488 Trace
        (Kay Scarpetta, #13) Rating: 0.0
# 2489 Manâ€™s Search for Meaning Rating: 0.0
# 2490 Heart Bones Rating: 0.0
# 2491 Ghosts Rating: 0.0
# 2492 Valentine Rating: 0.0
# 2493 Ugly Love Rating: 0.0
# 2494 The Scarlet Pimp

# 2757 Last Argument of Kings
        (The First Law, #3) Rating: 0.0
# 2758 The Body Keeps the Score: Brain, Mind, and Body in the Healing of Trauma Rating: 0.0
# 2759 Haunting Adeline
        (Cat and Mouse, #1) Rating: 0.0
# 2760 Long Island
        (Eilis Lacey, #2) Rating: 0.0
# 2761 The Prophet Rating: 0.0
# 2762 Sojourn
        (Forgotten Realms: The Dark Elf Trilogy, #3; Legend of Drizzt, #3) Rating: 0.0
# 2763 The Dead Zone Rating: 0.0
# 2764 The Gathering Rating: 0.0
# 2765 Beastly
        (Beastly, #1) Rating: 0.0
# 2766 Bone Crossed
        (Mercy Thompson, #4) Rating: 0.0
# 2767 A Wanted Man
        (Jack Reacher, #17) Rating: 0.0
# 2768 The Husband's Secret Rating: 0.0
# 2769 Kushiel's Dart
        (PhÃ¨dre's Trilogy, #1) Rating: 0.0
# 2770 Hollow City
        (Miss Peregrine's Peculiar Children, #2) Rating: 0.0
# 2771 The Ashes & the Star-Cursed King
        (Crowns of Nyaxia, #2) Rating: 0.0
# 2772 Persuasion Rating: 0.0
# 2773 The Thirteenth Tale Rating: 0.0
# 2774 The

In [None]:
#doing masked autoencoder
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset, random_split

# Mask for observed values (1 for observed, 0 for missing)
ratings_torch = torch.tensor(ratings).float()
mask = (ratings_torch != 0).float()
mask_tensor = torch.tensor(mask)


#Define autoencoder
class SparseAutoencoder(nn.Module):
    def __init__(self, num_items, latent_dim):
        super(SparseAutoencoder, self).__init__()
        hidden1 = latent_dim*2
        self.encoder1 = nn.Linear(num_items, hidden1)
        self.encoder2 = nn.Linear(hidden1, latent_dim)
        self.decoder1 = nn.Linear(latent_dim, hidden1)
        self.decoder2 = nn.Linear(hidden1, num_items)
        
    def forward(self, x):
        x = torch.relu(self.encoder1(x))
        x = torch.relu(self.encoder2(x))
        x = torch.relu(self.decoder1(x))
        x = self.decoder2(x)
        # Scale sigmoid output to [1, 5]
        return 1 + 4 * torch.sigmoid(x)

    
#initialize the model
num_users, num_items = ratings_torch.shape
np.save("num_users.npy", np.array(num_users))
np.save("num_items.npy", np.array(num_items))
latent_dim = int(num_items/8) # Number of latent features

model = SparseAutoencoder(num_items, latent_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Use MSE loss but only consider observed values
def masked_mse_loss(reconstructed, original, mask):
    loss = ((reconstructed - original) ** 2) * mask
    return loss.sum() / mask.sum()

#break up data into train and val
dataset = TensorDataset(ratings_torch, mask_tensor) #keeping the mask
print("ratings_torch shape =", ratings_torch.shape)
print(len(dataset))
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print("len(train_loader) = ", len(train_loader))
print("len(val_loader) = ", len(val_loader))

#train the model
epochs = 5000
best_loss = 10e10
counter = 0
for epoch in range(epochs):
    train_loss = 0.0
    for inputs, mask in train_loader:
        model.train()
        optimizer.zero_grad()

        # Forward pass
        reconstructed = model(inputs)
        loss = masked_mse_loss(reconstructed, inputs, mask)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss /= len(train_loader)
    
    if (epoch + 1) % 10 == 0:
#         print(f"Epoch {epoch + 1}/{epochs}, Loss: {train_loss}")
        #check validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, mask in val_loader:
                outputs = model(inputs)
                loss = masked_mse_loss(outputs, inputs, mask)
                val_loss += loss.item()
                

        val_loss /= len(val_loader)
        

        print(f"Epoch {epoch + 1} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")
        if val_loss < best_loss: #if improve then save
            #save model 
            torch.save(model.state_dict(), "2model{}.pkl".format(latent_dim))
            best_loss = val_loss
            print("Model saved to model{}.pkl.".format(latent_dim))
            counter = 0
        else:
            counter += 10
            
    if counter > 200:
        print("Done training because of no improvement.")
        break
        
            


  mask_tensor = torch.tensor(mask)


ratings_torch shape = torch.Size([1742, 3023])
1742
len(train_loader) =  44
len(val_loader) =  11
Epoch 10 - Train Loss: 0.8806 - Val Loss: 1.0247
Model saved to model377.pkl.
Epoch 20 - Train Loss: 0.8102 - Val Loss: 0.9798
Model saved to model377.pkl.
Epoch 30 - Train Loss: 0.8070 - Val Loss: 0.9693
Model saved to model377.pkl.
Epoch 40 - Train Loss: 0.8184 - Val Loss: 0.9664
Model saved to model377.pkl.
Epoch 50 - Train Loss: 0.7994 - Val Loss: 0.9694
Epoch 60 - Train Loss: 0.7973 - Val Loss: 0.9719
Epoch 70 - Train Loss: 0.7976 - Val Loss: 0.9736
Epoch 80 - Train Loss: 0.7872 - Val Loss: 0.9717
Epoch 90 - Train Loss: 0.8694 - Val Loss: 1.0036
Epoch 100 - Train Loss: 0.8799 - Val Loss: 0.9691
Epoch 110 - Train Loss: 0.8911 - Val Loss: 0.9843
Epoch 120 - Train Loss: 0.8619 - Val Loss: 0.9844
Epoch 130 - Train Loss: 0.8747 - Val Loss: 0.9725


In [None]:
num_users, num_items

In [None]:
print(sum(sum(mask)))
print(mask.shape)
print(mask.shape[0] * mask.shape[1])
print(sum(sum(mask)) / (mask.shape[0] * mask.shape[1]))



In [None]:
#Evaulating the model
model.eval()
with torch.no_grad():
    reconstructed = model(ratings_torch)

# Fill missing values in the original matrix
filled_data = ratings_torch.clone()
filled_data[mask_tensor == 0] = reconstructed[mask_tensor == 0]

print("Original Data:\n", ratings_torch)
print("Reconstructed Data:\n", reconstructed)
print("Filled Data:\n", filled_data)


In [None]:
print(reconstructed[0].numpy())
print(ratings_torch[0].numpy())

print((reconstructed[0].numpy()-ratings_torch[0].numpy())/ratings_torch[0].numpy())

In [None]:
pred_ratings_list = reconstructed[0].detach().numpy()

#give a list sorted out with books you've already read:
sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
list_num = 1
for idx in sorted_indices: 
#     print("ratings_matrix[user_id, idx]= ", ratings_matrix[user_id, idx])
    if  (ratings_matrix[user_id, idx] > 0) or(np.isnan(pred_ratings_list[idx])) :
        continue
    print("#", list_num , titles[idx], " - Predicted Rating:", round(pred_ratings_list[idx], 1))
    list_num += 1

In [None]:
#making weighted loss matrix
percents = np.array([ 2.0839861,   6.38564535, 22.8939068,  37.94135873, 30.69510302])
each_weights = 100/percents
print(each_weights)
print(each_weights.sum())

print(each_weights * percents)

weights_array = np.zeros(ratings_torch.shape)
for i in tqdm(range(len(ratings_torch))):
    for j in range(len(ratings_torch[0])):
        for num in [1, 2, 3, 4, 5]:
            if ratings_torch[i,j] == num:
                weights_array[i,j] = each_weights[num-1]
weights_tensor = torch.tensor(weights_array)

In [None]:
weights[0]

In [None]:
#doing masked autoencoder with weighted loss
latent_dim = 100 # Number of latent features

model = SparseAutoencoder(num_items, latent_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Use MSE loss with weights but only consider observed values
def masked_mse_loss_diff(reconstructed, original, mask, weights):
    loss = (((reconstructed - original) ** 2) * mask)
    weighted_loss = loss * weights
    return weighted_loss.sum() / mask.sum()/100

#break up data into train and val
print("ratings_torch shape = ", ratings_torch.shape)
print("mask_tensor shape = ", mask_tensor.shape)
print("weights shape = ", weights_tensor.shape)

dataset = TensorDataset(ratings_torch, mask_tensor, weights_tensor) #keeping the mask
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


#train the model
epochs = 5000
best_loss = 10e10
counter = 0
for epoch in range(epochs):
    train_loss = 0.0
    for inputs, mask, this_weight in train_loader:
        model.train()
        optimizer.zero_grad()

        # Forward pass
        reconstructed = model(inputs)
        loss = masked_mse_loss_diff(reconstructed, inputs, mask, this_weight)
#         loss_not_weighted = masked_mse_loss(reconstructed, inputs, mask)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()

    train_loss /= len(train_loader)
    
    if (epoch + 1) % 10 == 0:
#         print(f"Epoch {epoch + 1}/{epochs}, Loss: {train_loss}")
        #check validation
        model.eval()
        val_loss = 0.0
        val_loss_not_weighted = 0.0
        with torch.no_grad():
            for inputs, mask, this_weight in val_loader:
                outputs = model(inputs)
                loss = masked_mse_loss_diff(outputs, inputs, mask, this_weight)
                loss_not_weighted = masked_mse_loss(outputs, inputs, mask)
                val_loss += loss.item()
                val_loss_not_weighted += loss_not_weighted.item()
                

        val_loss /= len(val_loader)
        

        print(f"Epoch {epoch + 1} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f} - Val not weighted: {val_loss_not_weighted:.4f}" )
        if val_loss < best_loss: #if improve then save
            #save model 
            torch.save(model.state_dict(), "model_weighted{}.pkl".format(latent_dim))
            best_loss = val_loss
            print("Model saved to model_weighted{}.pkl.".format(latent_dim))
            counter = 0
        else:
            counter += 10
            
    if counter > 200:
        print("Done training because of no improvement.")
        break
        
            


In [None]:
dfghj
import torch
from sklearn.model_selection import KFold

# Mask for observed values (1 for observed, 0 for missing)
ratings_torch = torch.tensor(ratings).float()
mask = (ratings_torch != 0).float()
print(mask)


#Define autoencoder
class SparseAutoencoder(nn.Module):
    def __init__(self, num_items, latent_dim):
        super(SparseAutoencoder, self).__init__()
        self.encoder = nn.Linear(num_items, latent_dim)
        self.decoder = nn.Linear(latent_dim, num_items)
        
    def forward(self, x):
        encoded = torch.relu(self.encoder(x))
        decoded = self.decoder(encoded)
        # Scale sigmoid output to [1, 5]
        return 1 + 4 * torch.sigmoid(decoded)
        return decoded

    
#initialize the model
num_users, num_items = ratings_torch.shape
np.save("num_users.npy", np.array(num_users))
np.save("num_items.npy", np.array(num_items))

for latent_dim in [2, 5, 10, 20, 40, 50, 75, 100]:
    print("latent_dim = ", latent_dim)
# latent_dim = 20  # Number of latent features

    model = SparseAutoencoder(num_items, latent_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Define your model, loss function, and optimizer
    # Assuming model, ratings_torch, mask, and optimizer are already defined

    epochs = 1000
    k_folds = 5  # Number of folds for cross-validation
    kf = KFold(n_splits=k_folds, shuffle=True)

    # Store the losses for each fold
    fold_losses = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(ratings_torch)):
#         print(f"\nFold {fold + 1}/{k_folds}")

        # Split the data into training and validation sets
        train_ratings = ratings_torch[train_idx]
        val_ratings = ratings_torch[val_idx]
        train_mask = mask[train_idx]
        val_mask = mask[val_idx]

        # Initialize a new model for each fold
        model = SparseAutoencoder(num_items, latent_dim)
    #     optimizer = optim.Adam(model.parameters(), lr=0.01)

        # Re-initialize optimizer for each fold
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # Train the model
        for epoch in range(epochs):
            model.train()
            optimizer.zero_grad()

            # Forward pass for training
            reconstructed = model(train_ratings)
            loss = masked_mse_loss(reconstructed, train_ratings, train_mask)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

#             if (epoch + 1) % 1000 == 0:
#                 print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}")

        # Evaluate the model on the validation set
        model.eval()
        with torch.no_grad():
            reconstructed_val = model(val_ratings)
            val_loss = masked_mse_loss(reconstructed_val, val_ratings, val_mask)

        print(f"Validation Loss for Fold {fold + 1}: {val_loss.item():.4f}")

        # Store the validation loss for this fold
        fold_losses.append(val_loss.item())

    # Print the average validation loss after all folds
    print(f"\nAverage Validation Loss across all folds: {sum(fold_losses)/k_folds:.4f}")


In [None]:
fgh

In [None]:
ratings_matrix[user_id]

In [None]:
# Convert the filled ratings data into a numpy array
ratings_matrix = ratings_df.values

# Initialize KNN (using user-based KNN)
import math
knn = NearestNeighbors(n_neighbors=math.ceil(num_users/10), metric='cosine')  # Using cosine similarity
knn.fit(ratings_matrix)

# Example: Predict rating for User 0 on Item 2
user_id = 0  # Index of user in the matrix
item_id = 9  # Index of item to predict rating for

# Get the nearest neighbors for user 0 (excluding the user itself)
distances, indices = knn.kneighbors([ratings_matrix[user_id]])

pred_ratings_list = np.array([])
rankings_list = np.array([])
for item_id in range(num_titles):
    # Get the ratings for the neighbors on item 2
    neighbor_ratings = np.array([ratings_matrix[i, item_id] for i in indices[0] if not np.isnan(ratings_matrix[i, item_id])])
    print(ratings_matrix[:, item_id])
    print(neighbor_ratings)
    ghjk
    
    predicted_rating = np.mean(neighbor_ratings[np.nonzero(neighbor_ratings)])
    rankings = np.sum(neighbor_ratings[np.nonzero(neighbor_ratings)])

#     print(predicted_rating)

    pred_ratings_list = np.append(pred_ratings_list, predicted_rating)
    rankings_list = np.append(rankings_list, rankings)
#     print(f"Predicted rating for User {user_id} on Item {item_id}: {predicted_rating}")
    
best_book_rating = np.max(pred_ratings_list)
best_book_idx = np.argmax(pred_ratings_list)

sorted_indices = np.argsort(pred_ratings_list)[::-1]
print("Top books are:")
for i, idx in enumerate(sorted_indices): 
    print("#", (i+1) , titles[idx], "Rating:", round(pred_ratings_list[idx], 1))

In [None]:
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Create a sparse matrix (CSR format)
A = np.array([[1.0, 0, 0], [0, 2, 3], [4, 0, 6], [0, 0, 0]])
# print(A)
sparse_matrix = sp.csr_matrix(ratings_matrix)

# Perform SVD on the sparse matrix
# k is the number of singular values to compute (you can choose a value smaller than min(m, n))
U, S, VT = svds(sparse_matrix, k=500)

# Output the matrices
print("U (Left Singular Vectors):\n", U)
print("\nS (Singular Values):\n", S)
print("\nVT (Right Singular Vectors - Transposed):\n", VT)

# Reconstruct the matrix from U, S, VT
S_full = np.diag(S)  # Convert singular values to a diagonal matrix
A_reconstructed = np.dot(U, np.dot(S_full, VT))

print("\nReconstructed Matrix A:\n", A_reconstructed)


In [None]:
my_diff = (ratings_matrix[0]- A_reconstructed[0])
print(ratings_matrix.shape)
plt.plot(my_diff, '.')

# for i in range(len(ratings_matrix[0])):
#     if ratings_matrix[0, i] > 0:
#         print(ratings_matrix[0, i], A_reconstructed[0, i], titles[i])
        
for i in range(len(ratings_matrix[0])):
    if ratings_matrix[0, i] == 0 and A_reconstructed[0, i] > 0:
        print(ratings_matrix[0, i], A_reconstructed[0, i], titles[i])


In [None]:
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Create a custom colormap with white for 0 and red for non-zero
cmap = mcolors.ListedColormap(['white', 'red'])
bounds = [0, 0.1, 1]  # Set bounds for 0 (white) and non-zero (red)
norm = mcolors.BoundaryNorm(bounds, cmap.N)

# plt.imshow(ratings_matrix - A_reconstructed, cmap=cmap, norm=norm)
plt.plot(ratings_matrix - A_reconstructed)
plt.show()

In [None]:
ratings_matrix - A_reconstructed

In [None]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.75, min_samples=2, metric='cosine')
labels = dbscan.fit_predict(ratings_matrix)


In [None]:
print(list(set(labels)))
print(labels.shape)

In [None]:
idx_in_group = np.arange(len(labels))
filtered_users = [idx_in_group for idx_in_group, label in zip(idx_in_group, labels) if label == 0]
print(filtered_users)

In [None]:
from sklearn.cluster import SpectralClustering
from scipy.sparse import csr_matrix
import numpy as np

# Example sparse data
# X = np.random.rand(100, 2)
X_sparse = csr_matrix(ratings_matrix)

n_clusters = 50
# Apply Spectral Clustering
spectral = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors')
labels = spectral.fit_predict(X_sparse)

print(labels)
print(list(set(labels)))
print(labels.shape)

In [None]:
# idx_in_group = np.arange(len(labels))
# filtered_users = [idx_in_group for idx_in_group, label in zip(idx_in_group, labels) if label == 0]
# print(filtered_users)

group_averages = []

for group in range(n_clusters):
    # Find indices of users in the current group
    group_users = np.where(labels == group)[0]
    
    # Extract the rows for users in this group
    group_data = ratings_matrix[group_users]
    
    print("Number of perople in group = ", group_data.shape[0])
    
    pred_ratings_list = np.array([])
    for item_id in range(num_titles):
        data = group_data[:,item_id]
        predicted_rating = np.mean(data[np.nonzero(data)])
#                 rankings = np.sum(neighbor_ratings[np.nonzero(neighbor_ratings)])

        pred_ratings_list = np.append(pred_ratings_list, predicted_rating)
    # Compute the average for each column (item) for this group
#     group_avg = np.mean(group_data[np.nonzero(group_data)], axis=0)
    
    # Append the average for this group
    group_averages.append(pred_ratings_list)

# Convert the list of group averages to a numpy array for easy viewing
group_averages = np.array(group_averages)

# Display the average for each item in each group
print("Average preferences for each item by group:")
print(group_averages.shape)

In [None]:
group = labels[0]
print("my group = ", group)
sorted_indices = np.argsort(group_averages[group])[::-1]
print(sorted_indices)
for i in sorted_indices:
    if (ratings_matrix[0, i] > 0) or (np.isnan(group_averages[group, i])):
        pass
    else:
        print(titles[i], round(group_averages[group,i], 1))