# Filtering and encoding users' preferences

To avoid encoding a single user's preferences as a sparse vector of 10000 elements, we keep only the preferences for the 3 most-rated clusters.

## Weighted sampling

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
# Load the two input tables from the working directory.
# Later cells read the 'goodreads_book_id' and 'cluster' columns of df_books
# and the 'user_id', 'book_id' and 'rating' columns of df_ratings.
df_books = pd.read_csv("books_autorec.csv")
df_ratings = pd.read_csv("ratings_autorec.csv")

Create a dictionary to look up a user's ratings by user id

In [None]:
# Attach each rated book's cluster to its rating row. A left join keeps every
# rating; a rating whose book is not listed in df_books ends up with a missing
# cluster.
df_ratings_with_clusters = df_ratings.merge(
    df_books[['goodreads_book_id', 'cluster']],
    how='left',
    left_on='book_id',
    right_on='goodreads_book_id',
)

# Map user_id -> list of [book_id, rating, cluster] rows for that user.
sparse_users = {
    user_id: user_rows[['book_id', 'rating', 'cluster']].values.tolist()
    for user_id, user_rows in df_ratings_with_clusters.groupby('user_id')
}

In [None]:
# Each entry is a [book_id, rating, cluster] list.
sparse_users[88][:20]  # Display first 20 ratings for user 88

In [None]:
# Number of books in each cluster, ordered by ascending cluster label.
cluster_sizes = df_books['cluster'].value_counts().sort_index().values
print(cluster_sizes)

In [None]:
def find_clusters_ratio(user):
    """Summarise how a user's ratings are distributed across book clusters.

    Args:
        user: Key into ``sparse_users``; its value is a list of
            ``[book_id, rating, cluster]`` entries.

    Returns:
        A list of ``[cluster, ratio, count]`` rows, where ``count`` is the
        number of the user's ratings in that cluster and ``ratio`` is
        ``count`` divided by the user's total number of ratings. Rows are
        ordered by ``ratio``, largest first; clusters with equal ratios keep
        the order in which they first appear in the user's ratings.

    Raises:
        KeyError: If ``user`` is not present in ``sparse_users``.
    """
    user_entries = sparse_users[user]
    n_ratings = len(user_entries)

    # Count the ratings that fall into each cluster.
    ratings_per_cluster = Counter(cluster for _, _, cluster in user_entries)

    rows = [
        [cluster, n_in_cluster / n_ratings, n_in_cluster]
        for cluster, n_in_cluster in ratings_per_cluster.items()
    ]
    # The sort is stable, so ties preserve first-appearance order.
    rows.sort(key=lambda row: row[1], reverse=True)
    return rows

In [None]:
# Cluster distribution for user 88: [cluster, ratio, count] rows, largest ratio first.
e88 = find_clusters_ratio(88)

In [None]:
def sample_from_cluster(n, cluster, embedding):
    """Append up to ``n`` random, not-yet-present books from a cluster.

    Books are drawn uniformly, without replacement, from the rows of
    ``df_books`` whose ``cluster`` equals ``cluster`` and whose
    ``goodreads_book_id`` does not already appear as the first element of an
    entry in ``embedding``. Each drawn book is appended as ``[book_id, 0]``.

    If the cluster holds fewer than ``n`` eligible books (including an empty
    or unknown cluster), only the available ones are added, so the call
    always terminates.

    Args:
        n: Number of books requested; values <= 0 add nothing.
        cluster: Cluster label to draw from.
        embedding: List of ``[book_id, rating]`` entries; modified in place.

    Returns:
        The same ``embedding`` list, for convenience.
    """
    if n <= 0:
        return embedding

    # Build the exclusion set once instead of rescanning the list per draw.
    taken_ids = {entry[0] for entry in embedding}

    cluster_ids = df_books.loc[df_books['cluster'] == cluster, 'goodreads_book_id']
    # Drop duplicates so a single draw without replacement cannot return the
    # same book twice.
    candidates = cluster_ids[~cluster_ids.isin(list(taken_ids))].drop_duplicates()

    n_draws = min(n, len(candidates))
    if n_draws > 0:
        embedding.extend(
            [book_id, 0] for book_id in candidates.sample(n_draws).tolist()
        )
    return embedding

In [None]:
def embed(user, size=300):
    """Build a list of ``[book_id, rating]`` pairs for a user, padded to ``size``.

    The list starts with every book the user actually rated. The remaining
    slots are filled with books appended by ``sample_from_cluster`` (rating
    0), allocated to the user's clusters in proportion to the share of the
    user's ratings in each cluster. Because each share is rounded down, any
    slots still missing are then filled starting from the user's most-rated
    cluster.

    Args:
        user: Key into ``sparse_users``.
        size: Target number of entries (default 300).

    Returns:
        The list of ``[book_id, rating]`` entries. It is longer than ``size``
        when the user has more than ``size`` ratings (real ratings are never
        dropped), and may be shorter when the user's clusters cannot supply
        enough unseen books.

    Raises:
        KeyError: If ``user`` is not present in ``sparse_users``.
    """
    # Start from the user's real ratings.
    embedding = [[book_id, rating] for book_id, rating, _ in sparse_users[user]]
    user_clusters = find_clusters_ratio(user)

    tot_to_sample = size - len(embedding)

    # Proportional allocation; int() rounds each share down.
    for cluster, ratio, _ in user_clusters:
        n = int(tot_to_sample * ratio)
        sample_from_cluster(n, cluster, embedding)

    # Top up what rounding left over, most-rated cluster first. If a cluster
    # adds fewer books than requested, the next cluster is tried, so this
    # loop cannot spin forever on one exhausted cluster.
    for cluster, _, _ in user_clusters:
        missing = size - len(embedding)
        if missing <= 0:
            break
        sample_from_cluster(missing, cluster, embedding)

    return embedding

In [None]:
# Build the padded [book_id, rating] list for user 88 and inspect its length and contents.
em88 = embed(88)
print(len(em88))
print(em88)