# Filtering and encoding users preferences

to avoid encoding a single user's preference in a sparse vector of 10000 elements, we save only the preferences of the 3 most rated clusters (3000)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
df_books = pd.read_csv("books_autorec.csv")
df_ratings = pd.read_csv("ratings_autorec.csv")

Create a dictionary to serch user's ratings by user id

In [3]:
# merge to add cluster in ratings
df_ratings_with_clusters = df_ratings.merge(
    df_books[['goodreads_book_id', 'cluster']], 
    left_on='book_id', 
    right_on='goodreads_book_id', 
    how='left'
)

# create dictionary with user and ratings
sparse_users = {}
for user_id, group in df_ratings_with_clusters.groupby('user_id'):
    books_ratings_clusters = group[['book_id', 'rating', 'cluster']].values.tolist()
    sparse_users[user_id] = books_ratings_clusters

In [26]:
len(sparse_users[88])  # Display first 20 ratings for user 88

128

In [7]:
print("Size of the clusters:")
df_books['cluster'].value_counts().sort_index()

Size of the clusters:


cluster
0     1023
1      656
2     1104
3      708
4      109
5      477
6     1530
7     1299
8      545
9      512
10     295
11     537
12     458
13     659
14      88
Name: count, dtype: int64

In [9]:
def find_clusters(user, max_clusters=3):
    """giver a user, this dunction find the most common clusters of books they rated"""
    
    # list of clusters for a user
    list_of_clusters = []
    for book_id, rating, cluster in sparse_users[user]:
        list_of_clusters.append(cluster)
        
    # find most common clusters
    counter = Counter(list_of_clusters)
    list_of_clusters = [item for item, count in counter.most_common()]
    
    if len(list_of_clusters) < max_clusters:
        return list_of_clusters
    else:
        return list_of_clusters[:max_clusters]


# --- ALTERNATIVA CON IL RATIO ---

cluster_counts = df_books['cluster'].value_counts().sort_index()

def find_clusters_ratio(user, max_clusters=3):

    # list of clusters for a user
    list_of_clusters = []
    for book_id, rat_ing, cluster in sparse_users[user]:
        list_of_clusters.append(cluster)
        # calculate ratio of ratings per cluster for this user
        cluster_rating_counts = Counter(list_of_clusters)
        cluster_ratios = {}
        for cluster, count in cluster_rating_counts.items():
            total_books_in_cluster = cluster_counts[cluster]
            cluster_ratios[cluster] = count / total_books_in_cluster if total_books_in_cluster > 0 else 0

        # sort clusters by ratio descending
        list_of_clusters = [cluster for cluster, ratio in sorted(cluster_ratios.items(), key=lambda x: x[1], reverse=True)]
    
    if len(list_of_clusters) < max_clusters:
        return list_of_clusters
    else:
        return list_of_clusters[:max_clusters]

In [25]:
print(find_clusters_ratio(88, 5)) # show the 5 most rated clusters of user 88
len(find_clusters(88,30))

[12, 5, 8, 1, 3]


9

In [13]:
def embed(user):
    """Evaluates the embedding of a user preferences"""
    
    selected_clusters = find_clusters_ratio(user)
    
    user_embedding = []
    
    for idx, row in df_books.iterrows():
        book_id = row['goodreads_book_id']
        cluster = row['cluster']
        ratings = sparse_users[user]
        
        if cluster in selected_clusters:
            rated = next((rating for b_id, rating, c in ratings if b_id == book_id), None)
            if rated is not None:
                user_embedding.append([book_id, rated])
            else:
                user_embedding.append([book_id, 0])  
                
    return user_embedding  

In [22]:
embed(60)

[[2657, 0],
 [4671, 0],
 [5107, 0],
 [1885, 0],
 [5470, 0],
 [7613, 0],
 [7624, 0],
 [890, 0],
 [930, 0],
 [1934, 3],
 [10210, 0],
 [4214, 0],
 [30119, 0],
 [5129, 0],
 [2956, 0],
 [24178, 0],
 [6185, 0],
 [4981, 0],
 [18405, 0],
 [18490, 0],
 [14935, 0],
 [157993, 0],
 [1953, 0],
 [370493, 4],
 [1202, 0],
 [2998, 0],
 [320, 0],
 [5297, 0],
 [17245, 0],
 [7244, 4],
 [19543, 0],
 [7126, 0],
 [24280, 5],
 [168668, 0],
 [2187, 0],
 [38447, 0],
 [7604, 0],
 [2612, 0],
 [332613, 0],
 [2165, 0],
 [4395, 2],
 [8127, 5],
 [12296, 0],
 [23772, 0],
 [6310, 0],
 [2623, 0],
 [49552, 0],
 [6969, 0],
 [15823480, 0],
 [227463, 5],
 [1812457, 0],
 [7144, 0],
 [6514, 0],
 [52036, 0],
 [168642, 0],
 [39988, 0],
 [3228917, 0],
 [153747, 0],
 [11486, 1],
 [5326, 0],
 [40102, 0],
 [6867, 0],
 [2156, 0],
 [9712, 0],
 [36072, 0],
 [233093, 0],
 [4865, 0],
 [3876, 0],
 [6334, 0],
 [12873, 0],
 [6952, 0],
 [22034, 0],
 [30118, 0],
 [70401, 0],
 [295, 3],
 [4948, 0],
 [135479, 0],
 [2493, 0],
 [7933292, 0],
 [4