In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.io import mmread
from sklearn.decomposition import TruncatedSVD

In [None]:
# Relative locations of the two extracted input files
interaction_matrix_path = '../data/extracted/user_product_interaction_sparse.mtx'
user_clusters_path = '../data/extracted/user_clusters.csv'

# Read the user-product interaction matrix (Matrix Market file) and convert it to CSR
sparse_matrix = mmread(interaction_matrix_path).tocsr()

# Read the clustering results into a DataFrame
clustering_results = pd.read_csv(user_clusters_path)

In [None]:
# Count the users in each cluster, ordered by cluster ID
cluster_counts = clustering_results['cluster_id'].value_counts().sort_index()

# One bar per cluster, drawn on an explicit Axes so later calls target this figure
fig, ax = plt.subplots(figsize=(16, 8))
cluster_counts.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title('Number of Users in Each Cluster', fontsize=20)
ax.set_xlabel('Cluster ID', fontsize=15)
ax.set_ylabel('Number of Users', fontsize=15)

# Show only every `tick_step`-th label for better readability.
# The pandas bar plot attaches a fixed list of tick labels (one per bar), so
# moving the ticks without passing new labels would pair the k-th remaining
# tick with the k-th cluster ID instead of the ID of the bar it sits under.
# Supplying the labels explicitly keeps each tick matched to its own cluster.
tick_step = 100
tick_positions = list(range(0, len(cluster_counts), tick_step))
ax.set_xticks(tick_positions)
ax.set_xticklabels(cluster_counts.index[::tick_step])

ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Reduce the interaction matrix to two components with Truncated SVD
# (random_state is fixed so repeated runs give the same projection)
svd_2d = TruncatedSVD(random_state=42, n_components=2)
reduced_2d = svd_2d.fit_transform(sparse_matrix)

# Attach each reduced coordinate to the clustering results as its own column.
# The columns carry a 'pca_' prefix even though they come from Truncated SVD.
for component_idx, column_name in enumerate(('pca_1', 'pca_2')):
    clustering_results[column_name] = reduced_2d[:, component_idx]

In [None]:
# Scatter every user in the 2D projection, coloured by cluster ID.
# The 'pca_1' / 'pca_2' columns hold the Truncated SVD coordinates added to
# `clustering_results` earlier in the notebook; the column names are kept as-is,
# but the figure text names the actual technique (Truncated SVD, not PCA).
fig, ax = plt.subplots(figsize=(16, 10))
sns.scatterplot(
    x='pca_1', y='pca_2',
    hue='cluster_id',
    palette='viridis',
    data=clustering_results,
    s=10,
    alpha=0.7,
    legend=False,  # Disable the legend
    ax=ax,
)
ax.set_title('User Clustering Visualization (2D Truncated SVD)')
ax.set_xlabel('SVD Component 1')
ax.set_ylabel('SVD Component 2')
fig.tight_layout()
plt.show()