# Filtering and encoding user preferences

To avoid encoding a single user's preferences as a sparse vector of 10,000 elements, we save only the preferences of the 3 most-rated clusters.

## Weighted sampling

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [3]:
# Load the books table and the ratings table from their CSV exports.
df_books, df_ratings = (
    pd.read_csv(csv_path)
    for csv_path in ("books_autorec.csv", "ratings_autorec.csv")
)

Create a dictionary to look up each user's ratings by user id.

In [4]:
# Attach each rated book's cluster to its rating row (left join, so every
# rating row is kept).
df_ratings_with_clusters = pd.merge(
    df_ratings,
    df_books[['goodreads_book_id', 'cluster']],
    how='left',
    left_on='book_id',
    right_on='goodreads_book_id',
)

# Map user_id -> list of [book_id, rating, cluster] rows for that user.
sparse_users = {
    user_id: user_rows[['book_id', 'rating', 'cluster']].values.tolist()
    for user_id, user_rows in df_ratings_with_clusters.groupby('user_id')
}

In [9]:
# Each entry is a [book_id, rating, cluster] triple.
sparse_users[88][:20]  # Display first 20 ratings for user 88

[[2657, 4, 0],
 [48855, 3, 4],
 [1885, 4, 0],
 [7613, 4, 0],
 [5107, 3, 8],
 [4381, 4, 1],
 [7624, 3, 0],
 [3, 4, 7],
 [18135, 3, 8],
 [5907, 3, 10],
 [3636, 4, 5],
 [11127, 3, 5],
 [77203, 2, 1],
 [1420, 4, 8],
 [11486, 5, 1],
 [168668, 2, 8],
 [41865, 1, 12],
 [4981, 3, 0],
 [6900, 2, 8],
 [14050, 2, 1]]

In [17]:
# Count the books in every cluster and order the counts by cluster id.
books_per_cluster = df_books['cluster'].value_counts().sort_index()
cluster_sizes = books_per_cluster.values
print(cluster_sizes)

[ 409 1275  547 1273  475  853  404  790  195  931 1114  307  670  514
  243]


In [24]:
def find_clusters_ratio(user):
    """Summarise how a user's ratings are spread over the book clusters.

    Reads the user's ``[book_id, rating, cluster]`` rows from the
    module-level ``sparse_users`` dictionary.

    Args:
        user: Key of the user in ``sparse_users``.

    Returns:
        A list of ``[cluster, ratio, count]`` rows, one per cluster the user
        rated, where ``count`` is the number of ratings in that cluster and
        ``ratio`` is ``count`` divided by the user's total number of ratings.
        Rows are ordered by descending ratio; ties keep the order in which
        the clusters first appear in the user's ratings.
    """
    ratings_per_cluster = Counter(
        cluster for _book_id, _rating, cluster in sparse_users[user]
    )
    n_ratings = sum(ratings_per_cluster.values())

    # most_common() is a stable descending sort by count, which yields the
    # same order as sorting by count / n_ratings.
    return [
        [cluster, n_in_cluster / n_ratings, n_in_cluster]
        for cluster, n_in_cluster in ratings_per_cluster.most_common()
    ]

In [26]:
# Cluster breakdown for user 88: [cluster, ratio, count] rows, highest ratio first.
e88 = find_clusters_ratio(88)

In [50]:
def sample_from_cluster(n, cluster, embedding):
    """Append up to ``n`` random books of a cluster that are not yet present.

    Every drawn book is appended to ``embedding`` in place as
    ``[goodreads_book_id, 0]``; the 0 fills the rating slot (presumably
    meaning "not rated" -- confirm against the consumer of the embedding).

    Args:
        n: Integer number of books to add. Values <= 0 add nothing.
        cluster: Cluster id; books are drawn from the rows of the
            module-level ``df_books`` whose ``cluster`` column equals it.
        embedding: List of ``[book_id, rating]`` pairs, mutated in place.
            A book whose id already appears in it is never drawn.

    Returns:
        The same ``embedding`` list. Fewer than ``n`` entries are appended
        when the cluster does not contain enough unused books.
    """
    if n <= 0:
        return embedding

    cluster_ids = df_books.loc[df_books['cluster'] == cluster, 'goodreads_book_id']

    # Build the membership lookup once instead of rescanning the whole
    # embedding on every single draw.
    already_present = {entry[0] for entry in embedding}
    candidates = cluster_ids[~cluster_ids.isin(already_present)].drop_duplicates()

    # Cap the request at what is available: a draw-and-reject loop can never
    # finish once the cluster has run out of unused books.
    n_draws = min(n, len(candidates))
    if n_draws > 0:
        drawn = candidates.sample(n_draws)  # uniform, without replacement
        embedding.extend([book_id, 0] for book_id in drawn.tolist())
    return embedding

In [None]:
def embed(user, size=300):
    """Build a ``[book_id, rating]`` list for a user, padded to ``size`` rows.

    The list starts with the user's own ratings (read from the module-level
    ``sparse_users``). The remaining slots are filled through
    ``sample_from_cluster``, which appends ``[book_id, 0]`` rows; each
    cluster the user rated receives a share of the missing slots
    proportional to its ratio from ``find_clusters_ratio``.

    Args:
        user: Key of the user in ``sparse_users``.
        size: Target number of rows. Defaults to 300.

    Returns:
        The list of ``[book_id, rating]`` pairs. It is NOT truncated when the
        user already has more than ``size`` ratings, and it can be shorter
        than ``size`` if no cluster is able to contribute another book.
    """
    embedding = [[book_id, rating] for book_id, rating, _cluster in sparse_users[user]]
    user_clusters = find_clusters_ratio(user)

    tot_to_sample = size - len(embedding)

    for cluster, ratio, _ in user_clusters:
        # int() truncates, so the per-cluster quotas may sum to less than
        # tot_to_sample; the loop below makes up the difference.
        n = int(tot_to_sample * ratio)
        sample_from_cluster(n, cluster, embedding)

    # Top up one book at a time, preferring the most-rated cluster and
    # falling back to the next ones when a call adds nothing.
    while len(embedding) < size:
        added = False
        for cluster, _, _ in user_clusters:
            before = len(embedding)
            sample_from_cluster(1, cluster, embedding)
            if len(embedding) > before:
                added = True
                break
        if not added:
            # No cluster could contribute (or the user has no clusters):
            # stop instead of looping forever or indexing an empty list.
            break

    return embedding

In [54]:
# Encode user 88, then print the encoding's length followed by its
# [book_id, rating] pairs.
em88 = embed(88)
print(len(em88))
print(em88)

295
[[2657, 4], [48855, 3], [1885, 4], [7613, 4], [5107, 3], [4381, 4], [7624, 3], [3, 4], [18135, 3], [5907, 3], [3636, 4], [11127, 3], [77203, 2], [1420, 4], [11486, 5], [168668, 2], [41865, 1], [4981, 3], [6900, 2], [14050, 2], [114345, 3], [7244, 5], [5, 4], [8127, 3], [7604, 5], [136251, 4], [890, 3], [16981, 5], [6, 5], [1, 4], [1869, 4], [343, 3], [38447, 3], [231804, 3], [2, 4], [9791, 3], [8852, 4], [133518, 2], [12232938, 2], [2187, 4], [5148, 5], [39988, 5], [391729, 3], [7733, 4], [19380, 4], [70401, 2], [9717, 3], [1845, 1], [43641, 2], [4900, 1], [51606, 3], [227571, 1], [5043, 5], [76778, 3], [37781, 5], [6514, 4], [14706, 3], [58345, 4], [9777, 4], [187181, 2], [32261, 2], [4953, 4], [8921, 3], [101299, 1], [10890, 2], [11762, 5], [13872, 4], [116236, 3], [3867, 5], [14836, 3], [19089, 4], [14249, 5], [15881, 4], [7728, 3], [227265, 5], [77262, 4], [6759, 4], [14250, 4], [77554, 1], [14201, 4], [7073, 2], [30868, 4], [25078, 4], [5015, 1], [157993, 2], [18545, 3], [1299