# Filtering

we want to represent each user preferences with a vector 

import libraries and data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df_books = pd.read_csv("books_autorec.csv")
df_booktags = pd.read_csv("book_tags.csv")
df_tags = pd.read_csv("tags.csv")
df_ratings = pd.read_csv("ratings_autorec.csv")

save a dictionary user -> list of ratings

In [3]:
sparse_users = {}

for user_id, group in df_ratings.groupby('user_id'):
    books_ratings = group[['book_id', 'rating']].values.tolist()
    sparse_users[user_id] = books_ratings

In [4]:
from collections import defaultdict

class ClusterBasedEncoder:
    def __init__(self, sparse_ratings, df_books, books_per_cluster=100):
        self.sparse_ratings = sparse_ratings
        self.df_books = df_books.set_index('goodreads_book_id')
        self.books_per_cluster = books_per_cluster
        
        self.book_to_cluster = dict(zip(df_books['goodreads_book_id'], df_books['cluster']))
        self.cluster_to_books = defaultdict(list)
        
        for book_id, cluster in self.book_to_cluster.items():
            self.cluster_to_books[cluster].append(book_id)
        
        self._select_cluster_representatives()
    
    def _calculate_book_score(self, book_id, local_popularity=0):
        if book_id not in self.df_books.index:
            return 0
        
        book_data = self.df_books.loc[book_id]
        
        global_popularity = book_data.get('ratings_count', 0)
        work_popularity = book_data.get('work_ratings_count', 0) 
        avg_rating = book_data.get('average_rating', 0)
        
        global_pop_score = np.log1p(global_popularity) / 20
        
        local_pop_score = np.log1p(local_popularity) / 10
        
        quality_score = (avg_rating - 3.0) / 2.0 if avg_rating > 0 else 0
        quality_score = max(0, quality_score)
        
        work_pop_score = np.log1p(work_popularity) / 25 if work_popularity > 0 else 0
        
        final_score = (
            0.3 * global_pop_score +
            0.2 * local_pop_score +
            0.3 * quality_score +
            0.2 * work_pop_score
        )
        
        return final_score
    
    def _select_cluster_representatives(self):
        local_popularity = defaultdict(int)
        for user_ratings in self.sparse_ratings.values():
            for book_id, rating in user_ratings:
                local_popularity[book_id] += 1
        
        self.cluster_representatives = {}
        
        for cluster, books in self.cluster_to_books.items():
            book_scores = []
            for book_id in books:
                local_pop = local_popularity.get(book_id, 0)
                score = self._calculate_book_score(book_id, local_pop)
                book_scores.append((book_id, score))
            
            book_scores.sort(key=lambda x: x[1], reverse=True)
            
            selected_books = self._diversified_selection(book_scores, cluster)
            self.cluster_representatives[cluster] = selected_books[:self.books_per_cluster]
    
    def _diversified_selection(self, book_scores, cluster):
        selected = []
        
        high_score_count = max(1, int(self.books_per_cluster * 0.2))
        selected.extend([book_id for book_id, score in book_scores[:high_score_count]])
        
        remaining_books = book_scores[high_score_count:]
        
        rating_buckets = defaultdict(list)
        for book_id, score in remaining_books:
            if book_id in self.df_books.index:
                avg_rating = self.df_books.loc[book_id].get('average_rating', 0)
                rating_bucket = int(avg_rating) if avg_rating > 0 else 3
                rating_buckets[rating_bucket].append((book_id, score))
        
        remaining_slots = self.books_per_cluster - len(selected)
        books_per_bucket = max(1, remaining_slots // len(rating_buckets)) if rating_buckets else 0
        
        for rating_range in sorted(rating_buckets.keys(), reverse=True):
            bucket_books = sorted(rating_buckets[rating_range], key=lambda x: x[1], reverse=True)
            selected.extend([book_id for book_id, score in bucket_books[:books_per_bucket]])
            
            if len(selected) >= self.books_per_cluster:
                break
        
        return selected[:self.books_per_cluster]
    
    def encode_user(self, user_id):
        active_clusters = set()
        user_ratings_dict = {}
        
        if user_id in self.sparse_ratings:
            for book_id, rating in self.sparse_ratings[user_id]:
                user_ratings_dict[book_id] = rating
                if book_id in self.book_to_cluster:
                    active_clusters.add(self.book_to_cluster[book_id])
        
        if not active_clusters:
            active_clusters = set(self.cluster_representatives.keys())
        
        user_vector = []
        book_candidates = []
        
        for cluster in sorted(active_clusters):
            cluster_books = self.cluster_representatives[cluster]
            cluster_vector = np.zeros(len(cluster_books))
            
            for i, book_id in enumerate(cluster_books):
                if book_id in user_ratings_dict:
                    cluster_vector[i] = user_ratings_dict[book_id]
            
            user_vector.extend(cluster_vector)
            book_candidates.extend(cluster_books)
        
        return np.array(user_vector), book_candidates
    
    def get_cluster_stats(self):
        stats = {}
        for cluster, books in self.cluster_representatives.items():
            cluster_books_data = self.df_books.loc[books]
            stats[cluster] = {
                'count': len(books),
                'avg_rating_range': f"{cluster_books_data['average_rating'].min():.1f}-{cluster_books_data['average_rating'].max():.1f}",
                'avg_ratings_count': cluster_books_data['ratings_count'].mean(),
                'top_books': [(book_id, self.df_books.loc[book_id]['average_rating']) 
                             for book_id in books[:5]]
            }
        return stats

In [5]:
encoder = ClusterBasedEncoder(sparse_users, df_books, books_per_cluster=100)
stats = encoder.get_cluster_stats()
for cluster, info in stats.items():
    print(f"Cluster {cluster}: {info['count']} libri, "
          f"rating {info['avg_rating_range']}, "
          f"popolarità media: {info['avg_ratings_count']:.0f}")

Cluster 7: 100 libri, rating 3.6-4.7, popolarità media: 506825
Cluster 12: 100 libri, rating 3.5-4.6, popolarità media: 232476
Cluster 0: 100 libri, rating 3.6-4.5, popolarità media: 394047
Cluster 10: 73 libri, rating 2.8-4.8, popolarità media: 341760
Cluster 8: 100 libri, rating 3.7-4.5, popolarità media: 275897
Cluster 3: 100 libri, rating 3.6-4.5, popolarità media: 235884
Cluster 1: 75 libri, rating 2.5-4.6, popolarità media: 266917
Cluster 4: 100 libri, rating 3.5-4.6, popolarità media: 173942
Cluster 9: 81 libri, rating 2.7-4.5, popolarità media: 249263
Cluster 2: 100 libri, rating 3.7-4.6, popolarità media: 88221
Cluster 5: 100 libri, rating 3.7-4.5, popolarità media: 282880
Cluster 13: 100 libri, rating 3.6-4.5, popolarità media: 74448
Cluster 11: 100 libri, rating 3.5-4.3, popolarità media: 71973
Cluster 14: 100 libri, rating 3.7-4.8, popolarità media: 45270
Cluster 6: 100 libri, rating 3.7-4.8, popolarità media: 55805


In [6]:
def print_detailed_user_encoding(encoder, user_id):
    
    print(f"\n=== ENCODING {user_id} ===")
    
    user_vector, book_candidates = encoder.encode_user(user_id)
    
    recensiti = np.count_nonzero(user_vector)
    
    current_idx = 0
    
    for cluster in sorted(encoder.cluster_representatives.keys()):
        cluster_books = encoder.cluster_representatives[cluster]
        cluster_size = len(cluster_books)
        cluster_vector = user_vector[current_idx:current_idx + cluster_size]
        
        cluster_recensiti = np.count_nonzero(cluster_vector)
        if cluster_recensiti > 0:
            cluster_ratings = cluster_vector[cluster_vector > 0]
            print(f"  Cluster {cluster}: {cluster_recensiti}/{cluster_size} libri, "
                  f"rating medio: {cluster_ratings.mean():.2f}")
        else:
            print(f"  Cluster {cluster}: 0/{cluster_size} libri")
        
        current_idx += cluster_size
    
    if recensiti > 0 and hasattr(encoder, 'df_books'):
        print(f"\nLibri recensiti con dettagli:")
        count = 0
        for i, rating in enumerate(user_vector):
            if rating > 0 and count < 5:
                book_id = book_candidates[i]
                if book_id in encoder.df_books.index:
                    book_info = encoder.df_books.loc[book_id]
                    title = book_info.get('title', 'N/A')
                    avg_rating = book_info.get('average_rating', 'N/A')
                    cluster = encoder.book_to_cluster.get(book_id, 'N/A')
                    print(f"  Rating {rating}: [{cluster}] {title[:50]} (avg: {avg_rating})")
                else:
                    print(f"  Rating {rating}: Book ID {book_id}")
                count += 1


sample_user = list(sparse_users.keys())[0]
print(f"test con utente: {sample_user}")
print_detailed_user_encoding(encoder, sample_user)

test con utente: 1

=== ENCODING 1 ===
  Cluster 0: 24/100 libri, rating medio: 3.71
  Cluster 1: 10/75 libri, rating medio: 3.50
  Cluster 2: 5/100 libri, rating medio: 3.20
  Cluster 3: 8/100 libri, rating medio: 3.38
  Cluster 4: 5/100 libri, rating medio: 3.80
  Cluster 5: 17/100 libri, rating medio: 3.41
  Cluster 6: 16/100 libri, rating medio: 3.44
  Cluster 7: 0/100 libri
  Cluster 8: 1/100 libri, rating medio: 4.00
  Cluster 9: 0/81 libri
  Cluster 10: 0/73 libri
  Cluster 11: 0/100 libri
  Cluster 12: 0/100 libri
  Cluster 13: 0/100 libri
  Cluster 14: 0/100 libri

Libri recensiti con dettagli:
  Rating 5.0: [0] To Kill a Mockingbird (avg: 4.25)
  Rating 4.0: [0] Pride and Prejudice (avg: 4.24)
  Rating 4.0: [0] 1984 (avg: 4.14)
  Rating 4.0: [0] Memoirs of a Geisha (avg: 4.08)
  Rating 4.0: [0] Jane Eyre (avg: 4.1)
