**Imports**

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import zlib
import pickle
import os
import matplotlib.pyplot as plt
from itertools import combinations
import sys

compressed_sizes = {}


**Load dataset**

In [2]:
def load_dataset(file_path):
    df = pd.read_csv(file_path, sep='\t', header=0, usecols=[0, 1, 2, 3])  # Ignore extra columns
    return df

**Dictionary with pre-computed values**

In [3]:
def compute_compressed_sizes(user_ratings):
    
    for user, ratings in user_ratings.items():
        u_string = "".join(f"{k}:{v}" for k, v in sorted(ratings.items()))
        compressed_sizes[u_string] = len(zlib.compress(u_string.encode()))
    
    return compressed_sizes


**Similarities**

In [4]:
def linear_similarity(ratings_u, ratings_v):
    common_items = set(ratings_u.keys()).intersection(set(ratings_v.keys()))
    if not common_items:
        return 0  # If no common items, similarity is 0

    diff_sum = sum(abs(ratings_u[i] - ratings_v[i]) for i in common_items)
    ls_value = 1 - (diff_sum / len(common_items))

    return max(0, ls_value)  # Ensure similarity is non-negative

# Compression Similarity with precomputed sizes
def compression_similarity(ratings_u, ratings_v):
    u_string = "".join(f"{k}:{v}" for k, v in sorted(ratings_u.items()))
    v_string = "".join(f"{k}:{v}" for k, v in sorted(ratings_v.items()))
    c_uv = len(zlib.compress((u_string + v_string).encode()))
    c_u = compressed_sizes[u_string]
    c_v = compressed_sizes[v_string]
    return 1 - (c_uv - min(c_u, c_v)) / max(c_u, c_v)

# Kolmogorov Similarity with precomputed sizes
def kolmogorov_similarity(ratings_u, ratings_v):
    u_string = "".join(f"{k}:{v}" for k, v in sorted(ratings_u.items()))
    v_string = "".join(f"{k}:{v}" for k, v in sorted(ratings_v.items()))
    c_u = compressed_sizes[u_string]
    c_v = compressed_sizes[v_string]
    return 1 / (1 + abs(c_u - c_v))


**Simlarity Matrix and graph**

In [5]:
# Compute user similarity matrix and construct graph (with precomputed sizes)
def compute_similarity_matrix(user_ratings, similarity_measure, compressed_sizes):
    similarity_graph = nx.Graph()
    for (u, v) in combinations(user_ratings.keys(), 2):
        sim = similarity_measure(user_ratings[u], user_ratings[v])
        if sim > 0.8:
            similarity_graph.add_edge(u, v, weight=sim)
    return similarity_graph

# Detect user clusters from similarity graph
def detect_groups(similarity_graph):
    return list(nx.connected_components(similarity_graph))

**Reputation-based intra-clustering**

In [None]:
def compute_cluster_ratings(df, user_groups):
    """ Compute initial ratings per cluster based only on the items rated by users in the cluster. """
    cluster_item_ratings = {}

    for cluster_idx, user_set in enumerate(user_groups, start=1):
        # Filter ratings for users in the current cluster
        cluster_df = df[df["user_id"].isin(user_set)]
        
        # Compute average rating per item (ignoring items not rated in this cluster)
        item_avg_ratings = cluster_df.groupby("item_id")["normalized_rating"].mean().to_dict()

        # Store in dictionary
        cluster_item_ratings[cluster_idx] = item_avg_ratings

    return cluster_item_ratings


def fill_missing_ratings(cluster_item_ratings, total_items=17000):
    """Fill missing ratings for the largest clusters by borrowing from other clusters."""
    # Step 1: Sort clusters by number of rated items (largest first)
    sorted_clusters = sorted(cluster_item_ratings.keys(), key=lambda c: len(cluster_item_ratings[c]), reverse=True)
    top_clusters = sorted_clusters[:3]  # Select the 3 largest clusters
    
    for cluster in top_clusters:
        missing_items = set(range(1, total_items + 1)) - set(cluster_item_ratings[cluster].keys())  # Items missing
        
        for item in missing_items:
            for donor_cluster in sorted_clusters:  # Try to borrow from other clusters
                if item in cluster_item_ratings[donor_cluster]:
                    if item == 30:
                        print("CESARALINA")
                    cluster_item_ratings[cluster][item] = cluster_item_ratings[donor_cluster][item]
                    break  # Stop looking once we find the rating
        
        # Sort the cluster dictionary by item ID
        cluster_item_ratings[cluster] = dict(sorted(cluster_item_ratings[cluster].items()))

        print(f"CLUSTER {cluster}: {len(cluster_item_ratings[cluster])} items rated   ", cluster_item_ratings[cluster])

    return cluster_item_ratings


**Main function**

In [7]:

# Main execution
file_path = "/home/martim/Desktop/tese/datasets/book_crossing/book_ratings_normalized.dat"
df = load_dataset(file_path)

# Prepare user ratings
user_ratings = {user: dict(zip(group["item_id"], group["normalized_rating"])) for user, group in df.groupby("user_id")}

# Compute or load compressed sizes
compressed_sizes = compute_compressed_sizes(user_ratings)
print("comprimido")

comprimido


**Clustering**

In [None]:
# Choose similarity measure: compression_similarity or kolmogorov_similarity
similarity_graph = compute_similarity_matrix(user_ratings, kolmogorov_similarity, compressed_sizes)

# Detect user clusters from similarity graph
user_groups = list(nx.connected_components(similarity_graph))

# Print the number of detected clusters
print(f"User groups: {(user_groups)}")



User groups: [{1, 802, 225, 961, 337, 945, 1393, 2769, 598, 2201}, {2, 531, 1683, 1301, 2324, 1056, 1070, 2611, 313, 1595, 1084, 2627, 715, 2252, 2259, 1372, 490, 1004, 497, 2038}, {5, 2582, 2844, 2465, 2343, 1336, 2752, 1346, 1990, 1997, 727, 1762, 1522, 2546, 1908, 502, 121, 2170, 763, 124}, {514, 1799, 9, 2825, 2444, 532, 1940, 1817, 2206, 546, 1448, 2358, 60, 2878, 190, 1742, 351, 1506, 1273}, {1059, 1764, 1285, 11, 2734, 879, 1775, 415}, {544, 2531, 14, 499, 2774, 2429}, {1414, 1415, 2215, 2602, 18, 924, 2750, 2143}, {2881, 2018, 835, 1062, 2345, 2729, 957, 2579, 20, 629, 1396, 2804, 2943, 477, 94, 255}, {1929, 2836, 533, 22, 2329, 1945, 814, 2350, 2487, 439, 1080, 444, 191, 2375, 202, 2785, 2149, 105, 2411, 1644, 1518, 2808}, {608, 545, 676, 2116, 2507, 1582, 2735, 2515, 1462, 24, 377, 1082, 732}, {804, 843, 753, 2803, 2040, 25, 1148, 733}, {705, 386, 2081, 708, 1768, 396, 1561, 367, 2766, 1010, 1907, 1909, 1527, 600, 441, 26, 1437, 702}, {2630, 647, 1385, 622, 1456, 689, 1968, 6

**Ranking computation**


In [None]:
# Assuming user_groups is already defined
cluster_item_ratings = compute_cluster_ratings(df, user_groups)  # Step 1: Compute initial ratings
    
# Fill missing ratings
filled_ratings = fill_missing_ratings(cluster_item_ratings)

print(filled_ratings[4])
# Extract item rankings for items rated by users in the largest cluster
ranking_values = [
    rankings[item]
    for item in rankings.keys()
    if any(user in largest_cluster_users for user in df[df["item_id"] == item]["user_id"])
]

# Define bins from 0.1 to 1.0 (inclusive) with 0.1 increments
bins = np.arange(0.1, 1.1, 0.1)

# Create histogram
plt.figure(figsize=(8, 5))
hist, bin_edges, patches = plt.hist(ranking_values, bins=bins, color='skyblue', edgecolor='black', alpha=0.7)

# Annotate each bar with its count
for patch, count in zip(patches, hist):
    height = patch.get_height()
    if height > 0:
        plt.text(patch.get_x() + patch.get_width() / 2, height, int(count),
                 ha='center', va='bottom', fontsize=10, fontweight='bold')

# Labels and title
plt.xlabel("Ranking Score")
plt.ylabel("Frequency")
plt.title("Ranking Distribution in the Largest Cluster")
plt.xticks(bins)  # Ensure bins are correctly labeled
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()


1190
CLUSTER 2: 13117 items rated    {1: 0.6, 2: 1.0, 3: 1.0, 4: 0.7, 5: 1.0, 6: 1.0, 7: 1.0, 8: 0.6, 9: 1.0, 10: 1.0, 11: 1.0, 13: 0.7, 14: 1.0, 15: 1.0, 16: 0.9, 17: 0.9, 18: 0.8, 19: 0.7, 20: 0.7, 21: 0.4, 22: 0.7, 23: 0.5, 24: 1.0, 25: 0.8, 27: 0.5, 28: 0.8, 29: 0.7, 31: 1.0, 32: 0.9, 34: 0.7, 35: 0.7, 36: 0.6, 37: 1.0, 38: 0.8, 39: 0.65, 40: 0.8, 41: 0.6, 42: 0.9, 43: 0.8, 44: 0.85, 45: 0.8, 46: 0.8, 48: 0.9, 49: 0.6499999999999999, 50: 0.7, 51: 0.9, 52: 0.8, 53: 1.0, 54: 1.0, 55: 0.9, 56: 1.0, 57: 0.8, 58: 0.7, 59: 0.8, 60: 0.7, 62: 0.8, 63: 0.7, 64: 0.9, 65: 1.0, 66: 0.7, 67: 0.9, 68: 0.8, 69: 0.8, 70: 1.0, 73: 1.0, 74: 0.5, 75: 0.9, 76: 0.7, 77: 0.6, 78: 0.8, 79: 1.0, 80: 0.7, 81: 0.7, 82: 0.8, 83: 0.7, 84: 0.8, 85: 0.7, 87: 0.6, 88: 0.4, 89: 0.8, 90: 0.5, 91: 0.7, 92: 0.7, 93: 1.0, 94: 0.9, 95: 0.9, 96: 1.0, 97: 0.8, 98: 0.7, 99: 0.8, 101: 0.5, 104: 0.7, 105: 0.85, 106: 0.9, 107: 1.0, 108: 0.9, 109: 0.7, 110: 0.8, 111: 1.0, 112: 1.0, 113: 0.8, 114: 0.9, 115: 0.9, 116: 1.0, 117

NameError: name 'rankings' is not defined