In [1]:
import numpy as np 
import pandas as pd 
import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP 

from sklearn.cluster import MiniBatchKMeans, DBSCAN, AgglomerativeClustering
from hdbscan import HDBSCAN

### Utils

In [23]:
def plot_2d(df: pd.DataFrame):
	"""Build a 2-D scatter plot of projected points colored by cluster.

	Args:
		df: Frame that provides the columns ``pc1`` and ``pc2`` (plot axes)
			and ``cluster`` (color grouping).

	Returns:
		The figure produced by ``px.scatter``; it is not shown explicitly here.
	"""
	return px.scatter(
		df,
		x="pc1",
		y="pc2",
		color="cluster",
		opacity=0.5,
		color_discrete_sequence=px.colors.qualitative.Plotly,
	)

def plot_3d(df: pd.DataFrame):
	"""Build a 3-D scatter plot of projected points colored by cluster.

	Args:
		df: Frame that provides the columns ``pc1``, ``pc2`` and ``pc3``
			(plot axes) and ``cluster`` (color grouping).

	Returns:
		The figure produced by ``px.scatter_3d``; it is not shown explicitly here.
	"""
	return px.scatter_3d(
		df,
		x="pc1",
		y="pc2",
		z="pc3",
		color="cluster",
		opacity=0.5,
		color_discrete_sequence=px.colors.qualitative.Plotly,
	)

#### Config

In [4]:
from enum import Enum 

class DATASET(Enum):
	"""Datasets that can be selected in this notebook.

	Each member's value is the leading part of the file names used when
	reading the parquet data and the precomputed projection.
	"""
	# NOTE(review): the "_l" / "_s" suffixes presumably denote two model
	# variants used to produce the data — confirm with the data producer.
	CELEBA_L = "celeba_buffalo_l"
	CELEBA_S = "celeba_buffalo_s"

class DATA_CATEGORY(Enum):
	"""Kind of data stored for a dataset.

	The member's value is the second part of the data file names, placed
	after the dataset name and a double underscore.
	"""
	FEATURES = "features"
	EMBEDDINGS = "embeddings"

class PROJECTION(Enum):
	"""Precomputed low-dimensional projections available for plotting.

	The member's value is the suffix of the precomputed ``.npy`` file name.
	"""
	UMAP = "umap"
	# NOTE(review): the suffix presumably records a t-SNE exaggeration
	# setting of 12 used when the file was generated — confirm.
	TSNE = "tsne_exaggeration_12"

In [5]:
# Embeddings table; this is what the clustering algorithms are fitted on.
dataset = pd.read_parquet(f"../data/{DATASET.CELEBA_L.value}__{DATA_CATEGORY.EMBEDDINGS.value}.gzip")

# Precomputed three-column projection of the same dataset, used for plotting.
# The array is wrapped straight into a DataFrame whose column names match
# what plot_2d / plot_3d read.
data_projected = pd.DataFrame(
	np.load(f"../precomputed/{DATASET.CELEBA_L.value}__{DATA_CATEGORY.EMBEDDINGS.value}_{PROJECTION.TSNE.value}.npy"),
	columns=["pc1", "pc2", "pc3"],
)

data_projected.head()

Unnamed: 0,pc1,pc2,pc3
0,12.262559,-3.68187,31.886032
1,12.058534,12.351152,25.459198
2,20.065763,-15.260885,-27.705442
3,-5.68637,-23.74382,-16.418594
4,19.74921,-21.318367,0.073007


#### KMeans

In [7]:
# Cluster the full embeddings table into three groups with a fixed seed.
kmeans = MiniBatchKMeans(n_clusters=3, n_init="auto", random_state=42)
kmeans.fit(dataset)

# Labels are stored as strings so the plots color them as discrete groups.
labels = kmeans.labels_.astype(str)

# Attach the labels to the projection so the plot helpers can color by them.
data_projected["cluster"] = labels

In [8]:
# 3-D view of the full projection, colored by the current "cluster" column.
plot_3d(data_projected)

In [9]:
# 2-D view (pc1 vs pc2) of the full projection, colored by the "cluster" column.
plot_2d(data_projected)

#### HDBSCAN

In [10]:
# Number of rows drawn for the HDBSCAN run.
N_SAMPLES = 5000

# A seeded generator keeps the sample identical across kernel restarts
# (the seed matches the random_state used for MiniBatchKMeans).
indices = np.random.default_rng(42).permutation(dataset.shape[0])

# `indices` holds row positions, so both frames are sliced positionally with
# .iloc; this keeps the embeddings and their projection aligned even if
# `dataset` does not carry a default 0..n-1 index.
dataset_sample = dataset.iloc[indices[:N_SAMPLES], :]
# .copy() gives an independent frame, so adding a "cluster" column later
# does not touch `data_projected` or raise a chained-assignment warning.
data_projected_sample = data_projected.iloc[indices[:N_SAMPLES], :].copy()

In [11]:
# Run HDBSCAN with its default parameters on the sampled embeddings.
hdb = HDBSCAN()
hdb.fit(dataset_sample)

# Labels are stored as strings so the plots color them as discrete groups.
labels = hdb.labels_.astype(str)

# Attach the labels to the matching projected sample for plotting.
data_projected_sample["cluster"] = labels

In [12]:
# 3-D view of the sampled projection, colored by the HDBSCAN labels.
plot_3d(data_projected_sample)

In [13]:
# 2-D view (pc1 vs pc2) of the sampled projection, colored by the HDBSCAN labels.
plot_2d(data_projected_sample)

#### Agglomerative Clustering

In [14]:
# Number of rows drawn for the agglomerative clustering run.
N_SAMPLES = 10000

# A seeded generator keeps the sample identical across kernel restarts
# (the seed matches the random_state used for MiniBatchKMeans).
indices = np.random.default_rng(42).permutation(dataset.shape[0])

# `indices` holds row positions, so both frames are sliced positionally with
# .iloc; this keeps the embeddings and their projection aligned even if
# `dataset` does not carry a default 0..n-1 index.
dataset_sample = dataset.iloc[indices[:N_SAMPLES], :]
# .copy() gives an independent frame, so adding a "cluster" column later
# does not touch `data_projected` or raise a chained-assignment warning.
data_projected_sample = data_projected.iloc[indices[:N_SAMPLES], :].copy()

In [26]:
# Agglomerative clustering of the sampled embeddings into ten groups.
agglo = AgglomerativeClustering(n_clusters=10)
agglo.fit(dataset_sample)

# Labels are stored as strings so the plots color them as discrete groups.
labels = agglo.labels_.astype(str)

# Attach the labels to the matching projected sample for plotting.
data_projected_sample["cluster"] = labels

In [28]:
# 3-D view of the sampled projection, colored by the agglomerative labels.
plot_3d(data_projected_sample)

In [27]:
# 2-D view (pc1 vs pc2) of the sampled projection, colored by the agglomerative labels.
plot_2d(data_projected_sample)