# Filtering and encoding users preferences

to avoid encoding a single user's preference in a sparse vector of 10000 elements, we save only the preferences of the 3 most rated clusters (3000)

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [11]:
df_books = pd.read_csv("books_autorec.csv")
df_ratings = pd.read_csv("ratings_autorec.csv")

Create a dictionary to serch user's ratings by user id

In [12]:
# merge to add cluster in ratings
df_ratings_with_clusters = df_ratings.merge(
    df_books[['goodreads_book_id', 'cluster']], 
    left_on='book_id', 
    right_on='goodreads_book_id', 
    how='left'
)

# create dictionary with user and ratings
sparse_users = {}
for user_id, group in df_ratings_with_clusters.groupby('user_id'):
    books_ratings_clusters = group[['book_id', 'rating', 'cluster']].values.tolist()
    sparse_users[user_id] = books_ratings_clusters

In [13]:
sparse_users[88][:20]  # Display first 20 ratings for user 88

[[2657, 4, 5],
 [48855, 3, 8],
 [1885, 4, 5],
 [7613, 4, 5],
 [5107, 3, 5],
 [4381, 4, 6],
 [7624, 3, 5],
 [3, 4, 1],
 [18135, 3, 6],
 [5907, 3, 0],
 [3636, 4, 1],
 [11127, 3, 1],
 [77203, 2, 2],
 [1420, 4, 6],
 [11486, 5, 5],
 [168668, 2, 5],
 [41865, 1, 3],
 [4981, 3, 5],
 [6900, 2, 6],
 [14050, 2, 2]]

In [14]:
print("Size of the clusters:")
df_books['cluster'].value_counts().sort_index()

Size of the clusters:


cluster
0     1023
1      656
2     1104
3      708
4      109
5      477
6     1530
7     1299
8      545
9      512
10     295
11     537
12     458
13     659
14      88
Name: count, dtype: int64

In [20]:
def find_clusters(user, max_clusters=3):
    """giver a user, this dunction find the most common clusters of books they rated"""
    
    # list of clusters for a user
    list_of_clusters = []
    for book_id, rating, cluster in sparse_users[user]:
        list_of_clusters.append(cluster)
        
    # find most common clusters
    counter = Counter(list_of_clusters)
    list_of_clusters = [item for item, count in counter.most_common()]
    
    if len(list_of_clusters) < max_clusters:
        return list_of_clusters
    else:
        return list_of_clusters[:max_clusters]


# --- ALTERNATIVA CON IL RATIO ---

cluster_counts = df_books['cluster'].value_counts().sort_index()

def find_clusters_ratio(user, max_clusters=3):

    # list of clusters for a user
    list_of_clusters = []
    for book_id, rat_ing, cluster in sparse_users[user]:
        list_of_clusters.append(cluster)
        # calculate ratio of ratings per cluster for this user
        cluster_rating_counts = Counter(list_of_clusters)
        cluster_ratios = {}
        for cluster, count in cluster_rating_counts.items():
            total_books_in_cluster = cluster_counts[cluster]
            cluster_ratios[cluster] = count / total_books_in_cluster if total_books_in_cluster > 0 else 0

        # sort clusters by ratio descending
        list_of_clusters = [cluster for cluster, ratio in sorted(cluster_ratios.items(), key=lambda x: x[1], reverse=True)]
    
    if len(list_of_clusters) < max_clusters:
        return list_of_clusters
    else:
        return list_of_clusters[:max_clusters]

In [31]:
print(find_clusters_ratio(88, 5)) # show the 5 most rated clusters of user 88

[12, 5, 8, 1, 3]


In [32]:
def embed(user):
    """Evaluates the embedding of a user preferences"""
    
    selected_clusters = find_clusters_ratio(user)
    
    user_embedding = []
    
    for idx, row in df_books.iterrows():
        book_id = row['goodreads_book_id']
        cluster = row['cluster']
        ratings = sparse_users[user]
        
        if cluster in selected_clusters:
            rated = next((rating for b_id, rating, c in ratings if b_id == book_id), None)
            if rated is not None:
                user_embedding.append([book_id, rated])
            else:
                user_embedding.append([book_id, 0])  
                
    return user_embedding  

In [33]:
len(embed(88))

1480