In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from umap.umap_ import UMAP
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans, AgglomerativeClustering

from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [None]:
# Load the exported group collection and keep only groups that have more
# than 3 stored users.
groups = pd.read_json('babynamesDB_groups.json')
groups = groups.loc[groups['num_users_stored'].gt(3)]

# IDs of the retained groups.
group_ids = groups['_id'].tolist()

In [None]:
# Build one feature row per user: three activity counts followed by a
# 0/1 indicator column for every retained group the user belongs to.
users_raw = pd.read_json('babynamesDB_users.json').assign(
    # Membership count taken BEFORE the group filter below is applied.
    num_groups=lambda frame: frame['groups'].map(len)
)

scalar_cols = ['num_comments_stored', 'num_posts_stored', 'num_groups']

# One row per (user, group) membership, restricted to the groups in `group_ids`.
# Users with no retained group disappear here (as they did before).
memberships = users_raw[['_id', *scalar_cols, 'groups']].explode('groups')
memberships = memberships[memberships['groups'].isin(group_ids)].reset_index(drop=True)

group_dummies = pd.get_dummies(memberships['groups'], dtype=float)

# The per-user counts are repeated on every exploded row, so summing them
# (as a blanket groupby().sum() would) multiplies each count by the number of
# retained memberships. Take them once per user and sum only the indicators.
# fillna(0) keeps the old "missing count becomes 0" outcome that sum() produced,
# so downstream PCA never receives NaN.
user_counts = memberships.groupby('_id')[scalar_cols].first().fillna(0)
user_groups = group_dummies.groupby(memberships['_id']).sum()

# Both frames are indexed by the same sorted user ids, so this is a pure
# column-wise join: counts first, then one column per group id.
users = pd.concat([user_counts, user_groups], axis=1)

In [None]:
# Dimensionality Reduction

# Single seed for every stochastic step in this cell so that
# Restart & Run All reproduces the same sample and the same embeddings.
EMBEDDING_SEED = 42

# NOTE(review): PCA is fitted on the raw `users` columns; if the count columns
# have a much larger scale than the 0/1 group indicators they will dominate the
# components. Consider standardising first -- confirm against the data.
# random_state only matters when PCA selects a randomized/arpack solver.
pca = PCA(n_components=2, random_state=EMBEDDING_SEED)
pca_result = pca.fit_transform(users)

plt.figure(figsize=(6, 6))
plt.scatter(pca_result[:,0], pca_result[:,1])
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('PCA for user embeddings')
plt.show()

# umap for 2 components
# UMAP is fitted on a 10% subsample of the users. The sample is kept in
# `users_sample` so that row i of `umap_result` can be traced back to
# `users_sample.index[i]`; `umap_result` therefore has fewer rows than `pca_result`.
users_sample = users.sample(frac=0.1, random_state=EMBEDDING_SEED)

# Fixing random_state makes UMAP reproducible at the cost of running single-threaded.
umap = UMAP(n_components=2, random_state=EMBEDDING_SEED)
umap_result = umap.fit_transform(users_sample)

sns.pairplot(pd.DataFrame(umap_result), height=2.5)

In [None]:
# Clustering

N_CLUSTERS = 10
# Fixes the KMeans centroid initialisation so the coloured plots are reproducible.
CLUSTER_SEED = 42


def _plot_clusters(embedding, labels, embedding_name, algorithm_name):
    """Scatter a 2-D embedding with points coloured by cluster label.

    Parameters
    ----------
    embedding : array of shape (n_samples, 2)
        Coordinates to plot; column 0 goes on x, column 1 on y.
    labels : array of shape (n_samples,)
        Cluster label per row of ``embedding``.
    embedding_name : str
        Short name of the projection, used in axis labels and title.
    algorithm_name : str
        Name of the clustering algorithm, used in the title.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(embedding[:, 0], embedding[:, 1], c=labels)
    ax.set_xlabel(f'{embedding_name} 1')
    ax.set_ylabel(f'{embedding_name} 2')
    ax.set_title(f'{embedding_name} for user embeddings with {algorithm_name} Clustering')
    plt.show()


kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=CLUSTER_SEED)
agglo = AgglomerativeClustering(n_clusters=N_CLUSTERS)

# Refitting an estimator overwrites its `labels_`, so every label array is
# stored here under (algorithm, embedding) instead of being lost.
cluster_labels = {}

# Order matters: each estimator is fitted on PCA first and on UMAP last, so
# after this cell `kmeans.labels_` / `agglo.labels_` describe `umap_result` only.
for algorithm_name, model in (('KMeans', kmeans), ('Agglomerative', agglo)):
    for embedding_name, embedding in (('PCA', pca_result), ('UMAP', umap_result)):
        labels = model.fit_predict(embedding)
        cluster_labels[(algorithm_name, embedding_name)] = labels
        _plot_clusters(embedding, labels, embedding_name, algorithm_name)


In [None]:
# Clustering quality metrics
#
# These are internal validity indices (no ground-truth labels are involved):
#   - silhouette: higher is better
#   - Davies-Bouldin: lower is better
#   - Calinski-Harabasz: higher is better
#
# Every score must pair an embedding with labels fitted on that same embedding.
# `kmeans` and `agglo` are shared across embeddings and `labels_` only reflects
# the most recent fit, so each estimator is refitted on the embedding being
# scored. Scoring `pca_result` against labels from the smaller `umap_result`
# sample would fail on mismatched row counts.


def _print_cluster_scores(model, algorithm_name, embedding, embedding_name):
    """Fit ``model`` on ``embedding`` and print its three validity scores.

    Parameters
    ----------
    model : estimator exposing ``fit_predict``
        Clustering estimator; it is refitted in place on ``embedding``.
    algorithm_name : str
        Name shown in the printed line.
    embedding : array of shape (n_samples, n_features)
        Data that is both clustered and scored.
    embedding_name : str
        Name of the projection shown in the printed line.
    """
    labels = model.fit_predict(embedding)
    prefix = f'{embedding_name} {algorithm_name}'
    print(f'{prefix} Silhouette Score: ', silhouette_score(embedding, labels))
    print(f'{prefix} Davies Bouldin Score: ', davies_bouldin_score(embedding, labels))
    print(f'{prefix} Calinski Harabasz Score: ', calinski_harabasz_score(embedding, labels))


# The second estimator is AgglomerativeClustering, so it is reported under
# that name rather than "Spectral".
# PCA is scored before UMAP so both estimators end up fitted on `umap_result`.
for algorithm_name, model in (('KMeans', kmeans), ('Agglomerative', agglo)):
    for embedding_name, embedding in (('PCA', pca_result), ('UMAP', umap_result)):
        _print_cluster_scores(model, algorithm_name, embedding, embedding_name)
        print()